mirror of
https://github.com/bloeys/nset.git
synced 2025-12-29 06:28:19 +00:00
Continue+Docs
This commit is contained in:
BIN
.res/bench-10-million.png
Executable file
BIN
.res/bench-10-million.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 90 KiB |
BIN
.res/bench-100.png
Executable file
BIN
.res/bench-100.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 88 KiB |
104
README.md
104
README.md
@ -1 +1,103 @@
|
||||
# nlookup
|
||||
# NSet
|
||||
|
||||
NSet is a super fast and memory efficient set implementation built for unsigned integers up to and including uint32.
|
||||
|
||||
By 'set' we mean something like a hash map, but instead of key/value pairs there are only keys.
|
||||
You can do the normal operations of add, check if item exists, and delete, but you can also do things like union sets and
|
||||
get intersections.
|
||||
|
||||
**Contents**:
|
||||
|
||||
- [NSet](#nset)
|
||||
- [When to use NSet](#when-to-use-nset)
|
||||
- [Usage](#usage)
|
||||
- [Benchmarks](#benchmarks)
|
||||
|
||||
## When to use NSet
|
||||
|
||||
Maybe you want a set implementation? Then this is one, but there are other reasons.
|
||||
|
||||
If you are using your hash maps/arrays like sets, or do a lot of checks to see if items exist in your hash maps, then NSet might make sense.
|
||||
In such cases NSet makes sense because it is both faster and more memory efficient. You can see more about this in the Benchmarks section.
|
||||
|
||||
Here are some examples where you might want to consider NSet:
|
||||
|
||||
``` go
|
||||
//You might be using maps mostly for checking if things exist:
|
||||
|
||||
//This map is being used like a set. Some people might also do: make(map[uint32]bool, 0)
|
||||
mapOfIds := make(map[uint32]struct{}, 0)
|
||||
|
||||
//Fill map here...
|
||||
|
||||
someId := uint32(54312)
|
||||
if _, ok := mapOfIds[someId]; ok {
|
||||
//Do something
|
||||
} else {
|
||||
//Something else
|
||||
}
|
||||
```
|
||||
|
||||
```go
|
||||
//You might be searching arrays a lot
|
||||
func ExistsInArray(myArray []int, item int) bool {
|
||||
|
||||
for i := 0; i < len(myArray); i++ {
|
||||
if myArray[i] == item {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
To install run `go get github.com/bloeys/nset`
|
||||
|
||||
Then usage is very simple:
|
||||
```go
|
||||
|
||||
mySet := nset.NewNSet[uint32]()
|
||||
|
||||
mySet.Add(0)
|
||||
mySet.Add(300)
|
||||
mySet.Add(256)
|
||||
mySet.Add(4)
|
||||
|
||||
if mySet.Contains(5) {
|
||||
panic("Oops I don't want 5!")
|
||||
}
|
||||
|
||||
mySet.Remove(4)
|
||||
|
||||
```
|
||||
|
||||
## Benchmarks
|
||||
|
||||
NSet is faster than the built-in Go hash map in all operations (add, check, delete) by `1.6x to 64x` depending on the operation and data size.
|
||||
|
||||
Benchmark with 100 elements:
|
||||
|
||||

|
||||
|
||||
Benchmark with 10,000,000 elements:
|
||||
|
||||

|
||||
|
||||
As can be seen from the benchmarks, NSet has almost no change in its performance even with 10 million elements, while the
|
||||
hash map slows down a lot as the size grows. NSet practically doesn't allocate at all. But it should be noted that
|
||||
allocation can happen when adding a number bigger than all previously entered numbers.
|
||||
|
||||
Benchmarks that have 'Rand' in their name use randomized access patterns, which can cause cache misses.
|
||||
To make sure the test is fair the seed is the same for both Go Map and NSet. Here both suffer slowdowns but NSet remains faster.
|
||||
|
||||
Benchmarks that have `Presized` in their name use a data structure that was fully allocated before usage, like:
|
||||
|
||||
```go
|
||||
//This map already has space for ~100 elements and so doesn't need to resize, which is costly
|
||||
myMap := make(map[uint16]struct{}, 100)
|
||||
```
|
||||
|
||||
Map benefits from sizing while NSet isn't affected, but in both cases NSet remains faster.
|
||||
|
||||
2
go.mod
2
go.mod
@ -1,3 +1,3 @@
|
||||
module github.com/bloeys/nlookup
|
||||
module github.com/bloeys/nset
|
||||
|
||||
go 1.18
|
||||
|
||||
51
nlookup.go
51
nlookup.go
@ -1,36 +1,41 @@
|
||||
package nlookup
|
||||
package nset
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var _ fmt.Stringer = &NLookup[uint]{}
|
||||
var _ fmt.Stringer = &NSet[uint8]{}
|
||||
|
||||
type StorageType uint64
|
||||
|
||||
const StorageTypeBits = 64
|
||||
|
||||
//IntsIf is limited to uint32 because we can store ALL 4 Billion uint32 numbers
|
||||
//in 512MB with NSet (instead of the normal 16GB for an array of all uint32s).
|
||||
//But if we allowed uint64 (or int/uint, since they can be 64-bit), a user could add a single huge 64-bit number and force an allocation far larger than the machine's RAM, crashing the program.
|
||||
type IntsIf interface {
|
||||
uint | uint8 | uint16 | uint32 | uint64
|
||||
uint8 | uint16 | uint32
|
||||
}
|
||||
|
||||
type NLookup[T IntsIf] struct {
|
||||
type NSet[T IntsIf] struct {
|
||||
Data []StorageType
|
||||
StorageUnitCount uint64
|
||||
}
|
||||
|
||||
func (n *NLookup[T]) Add(x T) {
|
||||
func (n *NSet[T]) Add(x T) {
|
||||
|
||||
unitIndex := n.GetStorageUnitIndex(x)
|
||||
if unitIndex >= n.Size() {
|
||||
storageUnitsToAdd := unitIndex - n.Size() + 1
|
||||
n.Data = append(n.Data, make([]StorageType, storageUnitsToAdd)...)
|
||||
n.StorageUnitCount += storageUnitsToAdd
|
||||
}
|
||||
|
||||
n.Data[unitIndex] |= 1 << (x % StorageTypeBits)
|
||||
}
|
||||
|
||||
//Remove is meant to delete x from the set by updating the bit for x in its storage unit.
func (n *NLookup[T]) Remove(x T) {
|
||||
func (n *NSet[T]) Remove(x T) {
|
||||
|
||||
unitIndex := n.GetStorageUnitIndex(x)
|
||||
if unitIndex >= n.Size() {
|
||||
@ -40,11 +45,11 @@ func (n *NLookup[T]) Remove(x T) {
|
||||
//NOTE(review): ^= toggles the bit rather than clearing it, so calling Remove on a value whose bit is
//not currently set would set it (i.e. add x), unless the lines elided by this diff hunk guard against that.
//Clearing with &^= would make Remove a no-op for absent values - confirm the intended behavior.
n.Data[unitIndex] ^= 1 << (x % StorageTypeBits)
|
||||
}
|
||||
|
||||
func (n *NLookup[T]) Contains(x T) bool {
|
||||
func (n *NSet[T]) Contains(x T) bool {
|
||||
return n.isSet(x)
|
||||
}
|
||||
|
||||
func (n *NLookup[T]) ContainsAny(values ...T) bool {
|
||||
func (n *NSet[T]) ContainsAny(values ...T) bool {
|
||||
|
||||
for _, x := range values {
|
||||
if n.isSet(x) {
|
||||
@ -55,7 +60,7 @@ func (n *NLookup[T]) ContainsAny(values ...T) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (n *NLookup[T]) ContainsAll(values ...T) bool {
|
||||
func (n *NSet[T]) ContainsAll(values ...T) bool {
|
||||
|
||||
for _, x := range values {
|
||||
if !n.isSet(x) {
|
||||
@ -66,30 +71,30 @@ func (n *NLookup[T]) ContainsAll(values ...T) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (n *NLookup[T]) isSet(x T) bool {
|
||||
func (n *NSet[T]) isSet(x T) bool {
|
||||
unitIndex := n.GetStorageUnitIndex(x)
|
||||
return unitIndex < n.Size() && n.Data[unitIndex]&(1<<(x%StorageTypeBits)) != 0
|
||||
}
|
||||
|
||||
func (n *NLookup[T]) GetStorageUnitIndex(x T) uint64 {
|
||||
func (n *NSet[T]) GetStorageUnitIndex(x T) uint64 {
|
||||
return uint64(x) / StorageTypeBits
|
||||
}
|
||||
|
||||
func (n *NLookup[T]) GetStorageUnit(x T) StorageType {
|
||||
func (n *NSet[T]) GetStorageUnit(x T) StorageType {
|
||||
return n.Data[x/StorageTypeBits]
|
||||
}
|
||||
|
||||
//Size returns len(n.Data)
|
||||
func (n *NLookup[T]) Size() uint64 {
|
||||
return uint64(len(n.Data))
|
||||
//Size returns the number of storage units
|
||||
func (n *NSet[T]) Size() uint64 {
|
||||
return n.StorageUnitCount
|
||||
}
|
||||
|
||||
func (n *NLookup[T]) ElementCap() uint64 {
|
||||
func (n *NSet[T]) ElementCap() uint64 {
|
||||
return uint64(len(n.Data) * StorageTypeBits)
|
||||
}
|
||||
|
||||
//String returns a string of the storage as bytes separated by spaces. A comma is between each storage unit
|
||||
func (n *NLookup[T]) String() string {
|
||||
func (n *NSet[T]) String() string {
|
||||
|
||||
b := strings.Builder{}
|
||||
b.Grow(len(n.Data)*StorageTypeBits + len(n.Data)*2)
|
||||
@ -115,17 +120,19 @@ func (n *NLookup[T]) String() string {
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func NewNLookup[T IntsIf]() NLookup[T] {
|
||||
func NewNSet[T IntsIf]() NSet[T] {
|
||||
|
||||
return NLookup[T]{
|
||||
return NSet[T]{
|
||||
Data: make([]StorageType, 1),
|
||||
StorageUnitCount: 1,
|
||||
}
|
||||
}
|
||||
|
||||
//NewNLookupWithMax creates a nlookup that already has capacity to hold till at least largestNum without resizing.
|
||||
//NewNSetWithMax creates a set that already has capacity to hold values up to at least largestNum without resizing.
|
||||
//Note that this is NOT the count of elements you want to store; instead you pass the largest value you want to store. You can store larger values as well.
|
||||
func NewNLookupWithMax[T IntsIf](largestNum T) NLookup[T] {
|
||||
return NLookup[T]{
|
||||
func NewNSetWithMax[T IntsIf](largestNum T) NSet[T] {
|
||||
return NSet[T]{
|
||||
Data: make([]StorageType, largestNum/StorageTypeBits+1),
|
||||
StorageUnitCount: uint64(largestNum/StorageTypeBits + 1),
|
||||
}
|
||||
}
|
||||
|
||||
264
nlookup_test.go
264
nlookup_test.go
@ -1,14 +1,24 @@
|
||||
package nlookup_test
|
||||
package nset_test
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"testing"
|
||||
|
||||
"github.com/bloeys/nlookup"
|
||||
"github.com/bloeys/nset"
|
||||
)
|
||||
|
||||
func TestNLookup(t *testing.T) {
|
||||
const (
|
||||
maxBenchSize = 10_000_000
|
||||
RandSeed = 9_812_938_704
|
||||
)
|
||||
|
||||
n := nlookup.NewNLookup[uint]()
|
||||
var (
|
||||
dump int
|
||||
)
|
||||
|
||||
func TestNSet(t *testing.T) {
|
||||
|
||||
n := nset.NewNSet[uint32]()
|
||||
IsEq(t, 1, cap(n.Data))
|
||||
|
||||
n.Add(0)
|
||||
@ -22,7 +32,7 @@ func TestNLookup(t *testing.T) {
|
||||
n.Remove(1)
|
||||
AllTrue(t, n.Contains(0), n.Contains(63), !n.Contains(1))
|
||||
|
||||
n = nlookup.NewNLookupWithMax[uint](100)
|
||||
n = nset.NewNSetWithMax[uint32](100)
|
||||
IsEq(t, 2, cap(n.Data))
|
||||
}
|
||||
|
||||
@ -46,3 +56,247 @@ func IsEq[T comparable](t *testing.T, expected, val T) bool {
|
||||
t.Errorf("Expected '%v' but got '%v'\n", expected, val)
|
||||
return false
|
||||
}
|
||||
|
||||
func BenchmarkNSetAdd(b *testing.B) {
|
||||
|
||||
n := nset.NewNSet[uint32]()
|
||||
|
||||
for i := uint32(0); i < uint32(b.N); i++ {
|
||||
n.Add(i % maxBenchSize)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMapAdd(b *testing.B) {
|
||||
|
||||
hMap := map[uint32]struct{}{}
|
||||
|
||||
for i := uint32(0); i < uint32(b.N); i++ {
|
||||
hMap[i%maxBenchSize] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNSetAddRand(b *testing.B) {
|
||||
|
||||
n := nset.NewNSet[uint32]()
|
||||
|
||||
rand.Seed(RandSeed)
|
||||
for i := 0; i < b.N; i++ {
|
||||
n.Add(rand.Uint32() % maxBenchSize)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMapAddRand(b *testing.B) {
|
||||
|
||||
hMap := map[uint32]struct{}{}
|
||||
|
||||
rand.Seed(RandSeed)
|
||||
for i := 0; i < b.N; i++ {
|
||||
hMap[rand.Uint32()%maxBenchSize] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNSetAddPresized(b *testing.B) {
|
||||
|
||||
n := nset.NewNSetWithMax[uint32](maxBenchSize - 1)
|
||||
|
||||
for i := uint32(0); i < uint32(b.N); i++ {
|
||||
n.Add(i % maxBenchSize)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMapAddPresized(b *testing.B) {
|
||||
|
||||
hMap := make(map[uint32]struct{}, maxBenchSize-1)
|
||||
|
||||
for i := uint32(0); i < uint32(b.N); i++ {
|
||||
hMap[i%maxBenchSize] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNSetAddPresizedRand(b *testing.B) {
|
||||
|
||||
n := nset.NewNSetWithMax[uint32](maxBenchSize - 1)
|
||||
|
||||
rand.Seed(RandSeed)
|
||||
for i := 0; i < b.N; i++ {
|
||||
n.Add(rand.Uint32() % maxBenchSize)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMapAddPresizedRand(b *testing.B) {
|
||||
|
||||
hMap := make(map[uint32]struct{}, maxBenchSize-1)
|
||||
|
||||
rand.Seed(RandSeed)
|
||||
for i := 0; i < b.N; i++ {
|
||||
hMap[rand.Uint32()%maxBenchSize] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNSetContains(b *testing.B) {
|
||||
|
||||
//Init
|
||||
b.StopTimer()
|
||||
n := nset.NewNSet[uint32]()
|
||||
|
||||
for i := uint32(0); i < maxBenchSize; i++ {
|
||||
n.Add(i)
|
||||
}
|
||||
b.StartTimer()
|
||||
|
||||
//Work
|
||||
found := 0
|
||||
for i := uint32(0); i < uint32(b.N); i++ {
|
||||
if n.Contains(i) {
|
||||
found++
|
||||
}
|
||||
}
|
||||
|
||||
dump = found
|
||||
}
|
||||
|
||||
func BenchmarkMapContains(b *testing.B) {
|
||||
|
||||
//Init
|
||||
b.StopTimer()
|
||||
hMap := map[uint32]struct{}{}
|
||||
|
||||
for i := uint32(0); i < maxBenchSize; i++ {
|
||||
hMap[i] = struct{}{}
|
||||
}
|
||||
b.StartTimer()
|
||||
|
||||
//Work
|
||||
found := 0
|
||||
for i := uint32(0); i < uint32(b.N); i++ {
|
||||
if _, ok := hMap[i]; ok {
|
||||
found++
|
||||
}
|
||||
}
|
||||
|
||||
dump = found
|
||||
}
|
||||
|
||||
func BenchmarkNSetContainsRand(b *testing.B) {
|
||||
|
||||
//Init
|
||||
b.StopTimer()
|
||||
n := nset.NewNSet[uint32]()
|
||||
|
||||
for i := uint32(0); i < maxBenchSize; i++ {
|
||||
n.Add(i)
|
||||
}
|
||||
b.StartTimer()
|
||||
|
||||
//Work
|
||||
found := 0
|
||||
rand.Seed(RandSeed)
|
||||
for i := 0; i < b.N; i++ {
|
||||
|
||||
randVal := rand.Uint32()
|
||||
if n.Contains(randVal) {
|
||||
found++
|
||||
}
|
||||
}
|
||||
|
||||
dump = found
|
||||
}
|
||||
|
||||
func BenchmarkMapContainsRand(b *testing.B) {
|
||||
|
||||
//Init
|
||||
b.StopTimer()
|
||||
hMap := map[uint32]struct{}{}
|
||||
|
||||
for i := uint32(0); i < maxBenchSize; i++ {
|
||||
hMap[i] = struct{}{}
|
||||
}
|
||||
b.StartTimer()
|
||||
|
||||
//Work
|
||||
found := 0
|
||||
rand.Seed(RandSeed)
|
||||
for i := 0; i < b.N; i++ {
|
||||
|
||||
randVal := rand.Uint32()
|
||||
if _, ok := hMap[randVal]; ok {
|
||||
found++
|
||||
}
|
||||
}
|
||||
|
||||
dump = found
|
||||
}
|
||||
|
||||
func BenchmarkNSetDelete(b *testing.B) {
|
||||
|
||||
//Init
|
||||
b.StopTimer()
|
||||
n := nset.NewNSet[uint32]()
|
||||
|
||||
for i := uint32(0); i < maxBenchSize; i++ {
|
||||
n.Add(i)
|
||||
}
|
||||
b.StartTimer()
|
||||
|
||||
//Work
|
||||
for i := uint32(0); i < uint32(b.N); i++ {
|
||||
n.Remove(i)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMapDelete(b *testing.B) {
|
||||
|
||||
//Init
|
||||
b.StopTimer()
|
||||
hMap := map[uint32]struct{}{}
|
||||
|
||||
for i := uint32(0); i < maxBenchSize; i++ {
|
||||
hMap[i] = struct{}{}
|
||||
}
|
||||
b.StartTimer()
|
||||
|
||||
//Work
|
||||
for i := uint32(0); i < uint32(b.N); i++ {
|
||||
delete(hMap, i)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNSetDeleteRand(b *testing.B) {
|
||||
|
||||
//Init
|
||||
b.StopTimer()
|
||||
n := nset.NewNSet[uint32]()
|
||||
|
||||
for i := uint32(0); i < maxBenchSize; i++ {
|
||||
n.Add(i)
|
||||
}
|
||||
b.StartTimer()
|
||||
|
||||
//Work
|
||||
rand.Seed(RandSeed)
|
||||
for i := 0; i < b.N; i++ {
|
||||
|
||||
randVal := rand.Uint32()
|
||||
n.Remove(randVal)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMapDeleteRand(b *testing.B) {
|
||||
|
||||
//Init
|
||||
b.StopTimer()
|
||||
hMap := map[uint32]struct{}{}
|
||||
|
||||
for i := uint32(0); i < maxBenchSize; i++ {
|
||||
hMap[i] = struct{}{}
|
||||
}
|
||||
b.StartTimer()
|
||||
|
||||
//Work
|
||||
rand.Seed(RandSeed)
|
||||
for i := 0; i < b.N; i++ {
|
||||
|
||||
randVal := rand.Uint32()
|
||||
delete(hMap, randVal)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user