Skip to content

Commit

Permalink
simplify topk API
Browse files Browse the repository at this point in the history
  • Loading branch information
kelindar committed Feb 27, 2024
1 parent 9fe3ccc commit 869e83b
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 16 deletions.
19 changes: 8 additions & 11 deletions topk.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ package approx
import (
"sort"
"sync"
"unsafe"

"github.com/axiomhq/hyperloglog"
"github.com/zeebo/xxh3"
Expand All @@ -18,7 +17,7 @@ import (
// TopValue represents a value and its associated count.
type TopValue struct {
hash uint64 `json:"-"` // The hash of the value
Value []byte `json:"value"` // The associated value
Value string `json:"value"` // The associated value
Count uint32 `json:"count"` // The count of the value
}

Expand Down Expand Up @@ -46,14 +45,9 @@ func NewTopK(k uint) (*TopK, error) {
}, nil
}

// UpdateString adds the string value to the Count-Min Sketch and updates the top-k heap.
func (t *TopK) UpdateString(value string) {
t.Update(unsafe.Slice(unsafe.StringData(value), len(value)))
}

// Update adds the binary value to Count-Min Sketch and updates the top-k elements.
func (t *TopK) Update(value []byte) {
hash := xxh3.Hash(value)
func (t *TopK) Update(value string) {
hash := xxh3.HashString(value)
if updated := t.cms.UpdateHash(hash); !updated {
return // Estimate hasn't changed, skip
}
Expand All @@ -66,7 +60,7 @@ func (t *TopK) Update(value []byte) {
// tryInsert adds the data to the top-k heap. If the data is already an element,
// the frequency is updated. If the heap already has k elements, the element
// with the minimum frequency is removed.
func (t *TopK) tryInsert(value []byte, hash uint64, count uint32) {
func (t *TopK) tryInsert(value string, hash uint64, count uint32) {
t.mu.Lock()
defer t.mu.Unlock()

Expand Down Expand Up @@ -94,8 +88,11 @@ func (t *TopK) tryInsert(value []byte, hash uint64, count uint32) {
t.heap.Pop()
}

// Copy the string in case the caller reuses the buffer
clone := string(append([]byte(nil), value...))

// Add element to top-k and update min count
t.heap.Push(TopValue{Value: value, hash: hash, Count: count})
t.heap.Push(TopValue{Value: clone, hash: hash, Count: count})
}

// Values returns the top-k elements from lowest to highest frequency.
Expand Down
35 changes: 30 additions & 5 deletions topk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package approx

import (
"encoding/json"
"fmt"
"math/rand"
"strconv"
Expand All @@ -29,7 +30,7 @@ func BenchmarkTopK(b *testing.B) {
b.Run(fmt.Sprintf("k=%d", k), func(b *testing.B) {
b.ResetTimer()
for n := 0; n < b.N; n++ {
topk.UpdateString(data[n%cardinality])
topk.Update(data[n%cardinality])
}
})
}
Expand All @@ -45,7 +46,7 @@ func TestTopK(t *testing.T) {
assert.NoError(t, err)

for _, v := range deck(cardinality) {
topk.UpdateString(v)
topk.Update(v)
}

elements := topk.Values()
Expand All @@ -68,7 +69,7 @@ func TestTopK_Simple(t *testing.T) {

// Add 10 elements to the topk
for _, v := range deck(10) {
topk.UpdateString(v)
topk.Update(v)
}

elements := topk.Values()
Expand All @@ -89,7 +90,7 @@ func TestTopK_Reset(t *testing.T) {
// Check for multiple resets
for i := 0; i < 10; i++ {
for _, v := range deck(10) {
topk.UpdateString(v)
topk.Update(v)
}

// Reset the topk
Expand All @@ -111,7 +112,7 @@ func TestTopK_Race(t *testing.T) {
go func() {
assert.NotPanics(t, func() {
for _, v := range deck(10) {
topk.UpdateString(v)
topk.Update(v)
}

topk.Values()
Expand All @@ -121,6 +122,30 @@ func TestTopK_Race(t *testing.T) {
}
}

func TestTopK_JSON(t *testing.T) {
topk, err := NewTopK(5)
assert.NoError(t, err)

// Add 10 elements to the topk
for _, v := range deck(10) {
topk.Update(v)
}

values := topk.Values()
assert.Len(t, values, 5)

// Marshal the output
encoded, err := json.Marshal(values)
assert.NoError(t, err)
assert.JSONEq(t, `[
{"value":"5","count":5},
{"value":"6","count":6},
{"value":"7","count":7},
{"value":"8","count":8},
{"value":"9","count":9}
]`, string(encoded))
}

// Generate a random set of values
func deck(n int) []string {
values := make([]string, 0, n)
Expand Down

0 comments on commit 869e83b

Please sign in to comment.