Continue+Docs

2025-12-29 06:28:19 +00:00 · 2022-06-10 05:09:42 +04:00
parent b5be85fe03
commit 1f1e066620
6 changed files with 395 additions and 32 deletions
--- a/.res/bench-10-million.png
+++ b/.res/bench-10-million.png
--- a/.res/bench-100.png
+++ b/.res/bench-100.png
--- a/README.md
+++ b/README.md
@ -1 +1,103 @@
-# nlookup
+# NSet
+
+NSet is a super fast and memory efficient set implementation built for unsigned integers up to and including uint32.
+
+By 'set' we mean something like a hash map, but instead of key/value pairs there are only keys.
+You can do the normal operations of add, check if item exists, and delete, but you can also do things like union sets and
+get intersections.
+
+**Contents**:
+
+- [NSet](#nset)
+  - [When to use NSet](#when-to-use-nset)
+  - [Usage](#usage)
+  - [Benchmarks](#benchmarks)
+
+## When to use NSet
+
+The obvious case is that you need a set implementation, but there are other reasons to use NSet as well.
+
+If you are using your hash maps/arrays like sets, or doing a lot of checks to see if items exist in your hash maps, then NSet might make sense.
+In such cases NSet is a good fit because it is both faster and more memory efficient. You can see more about this in the Benchmarks section.
+
+Here are some examples where you might want to consider NSet:
+
+``` go
+//You might be using maps mostly for checking if things exist:
+
+//This map is being used like a set. Some people might also do: make(map[uint32]bool, 0)
+mapOfIds := make(map[uint32]struct{}, 0)
+
+//Fill map here...
+
+someId := uint32(54312)
+if _, ok := mapOfIds[someId]; ok {
+    //Do something
+} else {
+    //Something else
+}
+```
+
+```go
+//You might be searching arrays a lot
+func ExistsInArray(myArray []int, item int) bool {
+
+    for i := 0; i < len(myArray); i++ {
+        if myArray[i] == item {
+            return true
+        }
+    }
+
+    return false
+}
+```
+
+## Usage
+
+To install run `go get github.com/bloeys/nset`
+
+Then usage is very simple:
+```go
+
+mySet := nset.NewNSet[uint32]()
+
+mySet.Add(0)
+mySet.Add(300)
+mySet.Add(256)
+mySet.Add(4)
+
+if mySet.Contains(5) {
+    panic("Oops I don't want 5!")
+}
+
+mySet.Remove(4)
+
+```
+
+## Benchmarks
+
+NSet is faster than the built-in Go hash map in all operations (add, check, delete) by `1.6x to 64x` depending on the operation and data size.
+
+Benchmark with 100 elements:
+
+![Benchmark of 100 elements](./.res/bench-100.png)
+
+Benchmark with 10,000,000 elements:
+
+![Benchmark of 10,000,000 elements](./.res/bench-10-million.png)
+
+As can be seen from the benchmarks, NSet has almost no change in its performance even with 10 million elements, while the
+hash map slows down a lot as the size grows. NSet practically doesn't allocate at all. But it should be noted that
+allocation can happen when adding a number bigger than all previously entered numbers.
+
+Benchmarks that have 'Rand' in them mean that access patterns are randomized, which can cause CPU cache misses.
+To make sure the test is fair the seed is the same for both Go Map and NSet. Here both suffer slowdowns but NSet remains faster.
+
+Benchmarks that have `Presized` in them mean that the data structure was fully allocated before usage, like:
+
+```go
+//This map already has space for ~100 elements and so doesn't need to resize, which is costly
+myMap := make(map[uint16]struct{}, 100)
+```
+
+The map benefits from presizing while NSet isn't affected, but in both cases NSet remains faster.
--- a/go.mod
+++ b/go.mod
@ -1,3 +1,3 @@
-module github.com/bloeys/nlookup
+module github.com/bloeys/nset

 go 1.18
--- a/nlookup.go
+++ b/nlookup.go
@ -1,36 +1,41 @@
-package nlookup
+package nset

 import (
 	"fmt"
 	"strings"
 )

-var _ fmt.Stringer = &NLookup[uint]{}
+var _ fmt.Stringer = &NSet[uint8]{}

 type StorageType uint64

 const StorageTypeBits = 64

+//IntsIf is limited to uint32 because we can store ALL ~4.3 billion uint32 numbers
+//in 512MB with NSet (one bit per number, instead of the normal 16GB for an array of all uint32s).
+//But if we allowed uint64 (or int, since int can be 64-bit), a user could add a single big 64-bit number and make the set try to allocate far more RAM than the machine has, crashing the program.
 type IntsIf interface {
-	uint | uint8 | uint16 | uint32 | uint64
+	uint8 | uint16 | uint32
 }

-type NLookup[T IntsIf] struct {
-	Data []StorageType
+type NSet[T IntsIf] struct {
+	Data             []StorageType
+	StorageUnitCount uint64
 }

-func (n *NLookup[T]) Add(x T) {
+func (n *NSet[T]) Add(x T) {

 	unitIndex := n.GetStorageUnitIndex(x)
 	if unitIndex >= n.Size() {
 		storageUnitsToAdd := unitIndex - n.Size() + 1
 		n.Data = append(n.Data, make([]StorageType, storageUnitsToAdd)...)
+		n.StorageUnitCount += storageUnitsToAdd
 	}

 	n.Data[unitIndex] |= 1 << (x % StorageTypeBits)
 }

-func (n *NLookup[T]) Remove(x T) {
+func (n *NSet[T]) Remove(x T) {

 	unitIndex := n.GetStorageUnitIndex(x)
 	if unitIndex >= n.Size() {
@ -40,11 +45,11 @@ func (n *NLookup[T]) Remove(x T) {
 	n.Data[unitIndex] ^= 1 << (x % StorageTypeBits)
 }

-func (n *NLookup[T]) Contains(x T) bool {
+func (n *NSet[T]) Contains(x T) bool {
 	return n.isSet(x)
 }

-func (n *NLookup[T]) ContainsAny(values ...T) bool {
+func (n *NSet[T]) ContainsAny(values ...T) bool {

 	for _, x := range values {
 		if n.isSet(x) {
@ -55,7 +60,7 @@ func (n *NLookup[T]) ContainsAny(values ...T) bool {
 	return false
 }

-func (n *NLookup[T]) ContainsAll(values ...T) bool {
+func (n *NSet[T]) ContainsAll(values ...T) bool {

 	for _, x := range values {
 		if !n.isSet(x) {
@ -66,30 +71,30 @@ func (n *NLookup[T]) ContainsAll(values ...T) bool {
 	return true
 }

-func (n *NLookup[T]) isSet(x T) bool {
+func (n *NSet[T]) isSet(x T) bool {
 	unitIndex := n.GetStorageUnitIndex(x)
 	return unitIndex < n.Size() && n.Data[unitIndex]&(1<<(x%StorageTypeBits)) != 0
 }

-func (n *NLookup[T]) GetStorageUnitIndex(x T) uint64 {
+func (n *NSet[T]) GetStorageUnitIndex(x T) uint64 {
 	return uint64(x) / StorageTypeBits
 }

-func (n *NLookup[T]) GetStorageUnit(x T) StorageType {
+func (n *NSet[T]) GetStorageUnit(x T) StorageType {
 	return n.Data[x/StorageTypeBits]
 }

-//Size returns len(n.Data)
-func (n *NLookup[T]) Size() uint64 {
-	return uint64(len(n.Data))
+//Size returns the number of storage units
+func (n *NSet[T]) Size() uint64 {
+	return n.StorageUnitCount
 }

-func (n *NLookup[T]) ElementCap() uint64 {
+func (n *NSet[T]) ElementCap() uint64 {
 	return uint64(len(n.Data) * StorageTypeBits)
 }

 //String returns a string of the storage as bytes separated by spaces. A comma is between each storage unit
-func (n *NLookup[T]) String() string {
+func (n *NSet[T]) String() string {

 	b := strings.Builder{}
 	b.Grow(len(n.Data)*StorageTypeBits + len(n.Data)*2)
@ -115,17 +120,19 @@ func (n *NLookup[T]) String() string {
 	return b.String()
 }

-func NewNLookup[T IntsIf]() NLookup[T] {
+func NewNSet[T IntsIf]() NSet[T] {

-	return NLookup[T]{
-		Data: make([]StorageType, 1),
+	return NSet[T]{
+		Data:             make([]StorageType, 1),
+		StorageUnitCount: 1,
 	}
 }

-//NewNLookupWithMax creates a nlookup that already has capacity to hold till at least largestNum without resizing.
+//NewNSetWithMax creates a set that already has capacity to hold values up to at least largestNum without resizing.
 //Note that this is NOT the count of elements you want to store, instead you input the largest value you want to store. You can store larger values as well.
-func NewNLookupWithMax[T IntsIf](largestNum T) NLookup[T] {
-	return NLookup[T]{
-		Data: make([]StorageType, largestNum/StorageTypeBits+1),
+func NewNSetWithMax[T IntsIf](largestNum T) NSet[T] {
+	return NSet[T]{
+		Data:             make([]StorageType, largestNum/StorageTypeBits+1),
+		StorageUnitCount: uint64(largestNum/StorageTypeBits + 1),
 	}
 }
--- a/nlookup_test.go
+++ b/nlookup_test.go
@ -1,14 +1,24 @@
-package nlookup_test
+package nset_test

 import (
+	"math/rand"
 	"testing"

-	"github.com/bloeys/nlookup"
+	"github.com/bloeys/nset"
 )

-func TestNLookup(t *testing.T) {
+const (
+	maxBenchSize = 10_000_000
+	RandSeed     = 9_812_938_704
+)

-	n := nlookup.NewNLookup[uint]()
+var (
+	dump int
+)
+
+func TestNSet(t *testing.T) {
+
+	n := nset.NewNSet[uint32]()
 	IsEq(t, 1, cap(n.Data))

 	n.Add(0)
@ -22,7 +32,7 @@ func TestNLookup(t *testing.T) {
 	n.Remove(1)
 	AllTrue(t, n.Contains(0), n.Contains(63), !n.Contains(1))

-	n = nlookup.NewNLookupWithMax[uint](100)
+	n = nset.NewNSetWithMax[uint32](100)
 	IsEq(t, 2, cap(n.Data))
 }

@ -46,3 +56,247 @@ func IsEq[T comparable](t *testing.T, expected, val T) bool {
 	t.Errorf("Expected '%v' but got '%v'\n", expected, val)
 	return false
 }
+
+func BenchmarkNSetAdd(b *testing.B) {
+
+	n := nset.NewNSet[uint32]()
+
+	for i := uint32(0); i < uint32(b.N); i++ {
+		n.Add(i % maxBenchSize)
+	}
+}
+
+func BenchmarkMapAdd(b *testing.B) {
+
+	hMap := map[uint32]struct{}{}
+
+	for i := uint32(0); i < uint32(b.N); i++ {
+		hMap[i%maxBenchSize] = struct{}{}
+	}
+}
+
+func BenchmarkNSetAddRand(b *testing.B) {
+
+	n := nset.NewNSet[uint32]()
+
+	rand.Seed(RandSeed)
+	for i := 0; i < b.N; i++ {
+		n.Add(rand.Uint32() % maxBenchSize)
+	}
+}
+
+func BenchmarkMapAddRand(b *testing.B) {
+
+	hMap := map[uint32]struct{}{}
+
+	rand.Seed(RandSeed)
+	for i := 0; i < b.N; i++ {
+		hMap[rand.Uint32()%maxBenchSize] = struct{}{}
+	}
+}
+
+func BenchmarkNSetAddPresized(b *testing.B) {
+
+	n := nset.NewNSetWithMax[uint32](maxBenchSize - 1)
+
+	for i := uint32(0); i < uint32(b.N); i++ {
+		n.Add(i % maxBenchSize)
+	}
+}
+
+func BenchmarkMapAddPresized(b *testing.B) {
+
+	hMap := make(map[uint32]struct{}, maxBenchSize-1)
+
+	for i := uint32(0); i < uint32(b.N); i++ {
+		hMap[i%maxBenchSize] = struct{}{}
+	}
+}
+
+func BenchmarkNSetAddPresizedRand(b *testing.B) {
+
+	n := nset.NewNSetWithMax[uint32](maxBenchSize - 1)
+
+	rand.Seed(RandSeed)
+	for i := 0; i < b.N; i++ {
+		n.Add(rand.Uint32() % maxBenchSize)
+	}
+}
+
+func BenchmarkMapAddPresizedRand(b *testing.B) {
+
+	hMap := make(map[uint32]struct{}, maxBenchSize-1)
+
+	rand.Seed(RandSeed)
+	for i := 0; i < b.N; i++ {
+		hMap[rand.Uint32()%maxBenchSize] = struct{}{}
+	}
+}
+
+func BenchmarkNSetContains(b *testing.B) {
+
+	//Init
+	b.StopTimer()
+	n := nset.NewNSet[uint32]()
+
+	for i := uint32(0); i < maxBenchSize; i++ {
+		n.Add(i)
+	}
+	b.StartTimer()
+
+	//Work
+	found := 0
+	for i := uint32(0); i < uint32(b.N); i++ {
+		if n.Contains(i) {
+			found++
+		}
+	}
+
+	dump = found
+}
+
+func BenchmarkMapContains(b *testing.B) {
+
+	//Init
+	b.StopTimer()
+	hMap := map[uint32]struct{}{}
+
+	for i := uint32(0); i < maxBenchSize; i++ {
+		hMap[i] = struct{}{}
+	}
+	b.StartTimer()
+
+	//Work
+	found := 0
+	for i := uint32(0); i < uint32(b.N); i++ {
+		if _, ok := hMap[i]; ok {
+			found++
+		}
+	}
+
+	dump = found
+}
+
+func BenchmarkNSetContainsRand(b *testing.B) {
+
+	//Init
+	b.StopTimer()
+	n := nset.NewNSet[uint32]()
+
+	for i := uint32(0); i < maxBenchSize; i++ {
+		n.Add(i)
+	}
+	b.StartTimer()
+
+	//Work
+	found := 0
+	rand.Seed(RandSeed)
+	for i := 0; i < b.N; i++ {
+
+		randVal := rand.Uint32()
+		if n.Contains(randVal) {
+			found++
+		}
+	}
+
+	dump = found
+}
+
+func BenchmarkMapContainsRand(b *testing.B) {
+
+	//Init
+	b.StopTimer()
+	hMap := map[uint32]struct{}{}
+
+	for i := uint32(0); i < maxBenchSize; i++ {
+		hMap[i] = struct{}{}
+	}
+	b.StartTimer()
+
+	//Work
+	found := 0
+	rand.Seed(RandSeed)
+	for i := 0; i < b.N; i++ {
+
+		randVal := rand.Uint32()
+		if _, ok := hMap[randVal]; ok {
+			found++
+		}
+	}
+
+	dump = found
+}
+
+func BenchmarkNSetDelete(b *testing.B) {
+
+	//Init
+	b.StopTimer()
+	n := nset.NewNSet[uint32]()
+
+	for i := uint32(0); i < maxBenchSize; i++ {
+		n.Add(i)
+	}
+	b.StartTimer()
+
+	//Work
+	for i := uint32(0); i < uint32(b.N); i++ {
+		n.Remove(i)
+	}
+}
+
+func BenchmarkMapDelete(b *testing.B) {
+
+	//Init
+	b.StopTimer()
+	hMap := map[uint32]struct{}{}
+
+	for i := uint32(0); i < maxBenchSize; i++ {
+		hMap[i] = struct{}{}
+	}
+	b.StartTimer()
+
+	//Work
+	for i := uint32(0); i < uint32(b.N); i++ {
+		delete(hMap, i)
+	}
+}
+
+func BenchmarkNSetDeleteRand(b *testing.B) {
+
+	//Init
+	b.StopTimer()
+	n := nset.NewNSet[uint32]()
+
+	for i := uint32(0); i < maxBenchSize; i++ {
+		n.Add(i)
+	}
+	b.StartTimer()
+
+	//Work
+	rand.Seed(RandSeed)
+	for i := 0; i < b.N; i++ {
+
+		randVal := rand.Uint32()
+		n.Remove(randVal)
+	}
+}
+
+func BenchmarkMapDeleteRand(b *testing.B) {
+
+	//Init
+	b.StopTimer()
+	hMap := map[uint32]struct{}{}
+
+	for i := uint32(0); i < maxBenchSize; i++ {
+		hMap[i] = struct{}{}
+	}
+	b.StartTimer()
+
+	//Work
+	rand.Seed(RandSeed)
+	for i := 0; i < b.N; i++ {
+
+		randVal := rand.Uint32()
+		delete(hMap, randVal)
+	}
+}