Continue+Docs

This commit is contained in:
bloeys
2022-06-10 05:09:42 +04:00
parent b5be85fe03
commit 1f1e066620
6 changed files with 395 additions and 32 deletions

BIN
.res/bench-10-million.png Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

BIN
.res/bench-100.png Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 88 KiB

104
README.md
View File

@ -1 +1,103 @@
# nlookup
# NSet
NSet is a super fast and memory efficient set implementation built for unsigned integers up to and including uint32.
By 'set' we mean something like a hash map, but instead of key/value pairs there are only keys.
You can do the normal operations of add, check if item exists, and delete, but you can also do things like union sets and
get intersections.
**Contents**:
- [NSet](#nset)
- [When to use NSet](#when-to-use-nset)
- [Usage](#usage)
- [Benchmarks](#benchmarks)
## When to use NSet
Maybe you want a set implementation? Then this is one, but there are other reasons.
If you are using your hash maps/arrays like sets, or do a lot of checks to see if items exist in your hash maps, then NSet might make sense.
In such cases NSet makes sense because it is both faster and more memory efficient. You can see more about this in the Benchmarks section.
Here are some examples where you might want to consider NSet:
``` go
//You might be using maps mostly for checking if things exist:
//This map is being used like a set. Some people might also do: make(map[uint32]bool, 0)
mapOfIds := make(map[uint32]struct{}, 0)
//Fill map here...
someId := uint32(54312)
if _, ok := mapOfIds[someId]; ok {
//Do something
} else {
//Something else
}
```
```go
//You might be searching arrays a lot
func ExistsInArray(myArray []int, item int) bool {
for i := 0; i < len(myArray); i++ {
if myArray[i] == item {
return true
}
}
return false
}
```
## Usage
To install run `go get github.com/bloeys/nset`
Then usage is very simple:
```go
mySet := nset.NewNSet[uint32]()
mySet.Add(0)
mySet.Add(300)
mySet.Add(256)
mySet.Add(4)
if mySet.Contains(5) {
panic("Oops I don't want 5!")
}
mySet.Remove(4)
```
## Benchmarks
NSet is faster than the built-in Go hash map in all operations (add, check, delete) by `1.6x to 64x` depending on the operation and data size.
Benchmark with 100 elements:
![Benchmark of 100 elements](./.res/bench-100.png)
Benchmark with 10,000,000 elements:
![Benchmark of 10,000,000 elements](./.res/bench-10-million.png)
As can be seen from the benchmarks, NSet has almost no change in its performance even with 10 million elements, while the
hash map slows down a lot as the size grows. NSet practically doesn't allocate at all. But it should be noted that
allocation can happen when adding a number bigger than all previously entered numbers.
Benchmarks that have 'Rand' in them mean that access patterns are randomized, which can cause cache misses.
To make sure the test is fair the seed is the same for both Go Map and NSet. Here both suffer slowdowns but NSet remains faster.
Benchmarks that have `Presized` in them mean that the data structure was fully allocated before usage, like:
```go
//This map already has space for ~100 elements and so doesn't need to resize, which is costly
myMap := make(map[uint16]struct{}, 100)
```
Map benefits from sizing while NSet isn't affected, but in both cases NSet remains faster.

2
go.mod
View File

@ -1,3 +1,3 @@
module github.com/bloeys/nlookup
module github.com/bloeys/nset
go 1.18

View File

@ -1,36 +1,41 @@
package nlookup
package nset
import (
"fmt"
"strings"
)
var _ fmt.Stringer = &NLookup[uint]{}
var _ fmt.Stringer = &NSet[uint8]{}
type StorageType uint64
const StorageTypeBits = 64
//IntsIf is limited to uint32 because we can store ALL 4 billion uint32 numbers
//in 512MB with NSet (one bit per number, instead of the normal 16GB for an array of all uint32s).
//But if we allowed uint64 (or int, since int can be 64-bit), users could easily add a big 64-bit number, use more RAM than maybe Google has, and crash.
type IntsIf interface {
uint | uint8 | uint16 | uint32 | uint64
uint8 | uint16 | uint32
}
type NLookup[T IntsIf] struct {
Data []StorageType
//NSet is a set of unsigned integers stored as a bitmap: value x is represented by
//bit x%StorageTypeBits of the storage unit at index x/StorageTypeBits in Data.
type NSet[T IntsIf] struct {
//Data holds the storage units; each unit carries StorageTypeBits membership bits
Data []StorageType
//StorageUnitCount is the number of storage units. The constructors and Add keep it
//equal to the number of units they allocate in Data, and Size returns it.
StorageUnitCount uint64
}
func (n *NLookup[T]) Add(x T) {
//Add inserts x into the set. When x maps to a storage unit beyond those currently
//allocated, the storage is first extended by exactly as many units as are needed.
func (n *NSet[T]) Add(x T) {
	idx := n.GetStorageUnitIndex(x)
	if have := n.Size(); idx >= have {
		//Number of extra units required so that idx becomes a valid index
		missing := idx - have + 1
		n.Data = append(n.Data, make([]StorageType, missing)...)
		n.StorageUnitCount += missing
	}

	//Set the bit that represents x inside its storage unit
	n.Data[idx] |= 1 << (x % StorageTypeBits)
}
func (n *NLookup[T]) Remove(x T) {
func (n *NSet[T]) Remove(x T) {
unitIndex := n.GetStorageUnitIndex(x)
if unitIndex >= n.Size() {
@ -40,11 +45,11 @@ func (n *NLookup[T]) Remove(x T) {
//Clear the bit with AND-NOT. XOR would toggle it, which inserts x when it was not in the set.
n.Data[unitIndex] &^= 1 << (x % StorageTypeBits)
}
func (n *NLookup[T]) Contains(x T) bool {
//Contains reports whether x is in the set
func (n *NSet[T]) Contains(x T) bool {
	unit := n.GetStorageUnitIndex(x)
	if unit >= n.Size() {
		//x lies beyond the allocated storage, so it was never added
		return false
	}

	return n.Data[unit]&(1<<(x%StorageTypeBits)) != 0
}
func (n *NLookup[T]) ContainsAny(values ...T) bool {
func (n *NSet[T]) ContainsAny(values ...T) bool {
for _, x := range values {
if n.isSet(x) {
@ -55,7 +60,7 @@ func (n *NLookup[T]) ContainsAny(values ...T) bool {
return false
}
func (n *NLookup[T]) ContainsAll(values ...T) bool {
func (n *NSet[T]) ContainsAll(values ...T) bool {
for _, x := range values {
if !n.isSet(x) {
@ -66,30 +71,30 @@ func (n *NLookup[T]) ContainsAll(values ...T) bool {
return true
}
func (n *NLookup[T]) isSet(x T) bool {
//isSet reports whether the bit representing x is set. Values whose storage unit
//lies beyond the allocated storage are reported as not set.
func (n *NSet[T]) isSet(x T) bool {
	idx := n.GetStorageUnitIndex(x)
	if idx >= n.Size() {
		return false
	}

	//Single-bit mask selecting x's position inside its storage unit
	mask := StorageType(1) << (x % StorageTypeBits)
	return n.Data[idx]&mask != 0
}
func (n *NLookup[T]) GetStorageUnitIndex(x T) uint64 {
//GetStorageUnitIndex returns the index of the storage unit that holds the bit for x
func (n *NSet[T]) GetStorageUnitIndex(x T) uint64 {
	//Widen first so the division is done in uint64 regardless of T
	wide := uint64(x)
	return wide / StorageTypeBits
}
func (n *NLookup[T]) GetStorageUnit(x T) StorageType {
//GetStorageUnit returns the whole storage unit that holds the bit for x.
//Data is indexed directly with no bounds check, so an x beyond the allocated storage panics.
func (n *NSet[T]) GetStorageUnit(x T) StorageType {
	idx := x / StorageTypeBits
	return n.Data[idx]
}
//Size returns len(n.Data)
func (n *NLookup[T]) Size() uint64 {
return uint64(len(n.Data))
//Size reports how many storage units the set tracks (the value of StorageUnitCount)
func (n *NSet[T]) Size() uint64 {
	units := n.StorageUnitCount
	return units
}
func (n *NLookup[T]) ElementCap() uint64 {
//ElementCap returns how many distinct values the allocated storage can represent,
//i.e. the number of storage units in Data multiplied by StorageTypeBits.
func (n *NSet[T]) ElementCap() uint64 {
	//Widen to uint64 before multiplying: len() returns an int, which is 32 bits wide on some
	//platforms, and a set spanning the whole uint32 range has 2^26 units, so len*64 would overflow there
	return uint64(len(n.Data)) * StorageTypeBits
}
//String returns a string of the storage as bytes separated by spaces. A comma is between each storage unit
func (n *NLookup[T]) String() string {
func (n *NSet[T]) String() string {
b := strings.Builder{}
b.Grow(len(n.Data)*StorageTypeBits + len(n.Data)*2)
@ -115,17 +120,19 @@ func (n *NLookup[T]) String() string {
return b.String()
}
func NewNLookup[T IntsIf]() NLookup[T] {
func NewNSet[T IntsIf]() NSet[T] {
return NLookup[T]{
Data: make([]StorageType, 1),
return NSet[T]{
Data: make([]StorageType, 1),
StorageUnitCount: 1,
}
}
//NewNLookupWithMax creates a nlookup that already has capacity to hold till at least largestNum without resizing.
//NewNSetWithMax creates a set that already has enough capacity to hold values up to at least largestNum without resizing.
//Note that this is NOT the count of elements you want to store; instead, you pass the largest value you want to store. Larger values can still be added later.
func NewNLookupWithMax[T IntsIf](largestNum T) NLookup[T] {
return NLookup[T]{
Data: make([]StorageType, largestNum/StorageTypeBits+1),
func NewNSetWithMax[T IntsIf](largestNum T) NSet[T] {
	//One unit per StorageTypeBits values, plus one so that largestNum itself is addressable
	units := largestNum/StorageTypeBits + 1

	set := NSet[T]{
		Data:             make([]StorageType, units),
		StorageUnitCount: uint64(units),
	}
	return set
}

View File

@ -1,14 +1,24 @@
package nlookup_test
package nset_test
import (
"math/rand"
"testing"
"github.com/bloeys/nlookup"
"github.com/bloeys/nset"
)
func TestNLookup(t *testing.T) {
//Shared benchmark parameters
const (
//maxBenchSize is the number of elements the benchmarks populate, and the modulus the Add benchmarks wrap their values at
maxBenchSize = 10_000_000
//RandSeed is passed to rand.Seed by the 'Rand' benchmarks so that NSet and map runs draw the same sequence
RandSeed = 9_812_938_704
)
n := nlookup.NewNLookup[uint]()
var (
//dump receives the 'found' counters of the Contains benchmarks; presumably this gives the
//loops an observable result so the compiler does not optimise them away
dump int
)
func TestNSet(t *testing.T) {
n := nset.NewNSet[uint32]()
IsEq(t, 1, cap(n.Data))
n.Add(0)
@ -22,7 +32,7 @@ func TestNLookup(t *testing.T) {
n.Remove(1)
AllTrue(t, n.Contains(0), n.Contains(63), !n.Contains(1))
n = nlookup.NewNLookupWithMax[uint](100)
n = nset.NewNSetWithMax[uint32](100)
IsEq(t, 2, cap(n.Data))
}
@ -46,3 +56,247 @@ func IsEq[T comparable](t *testing.T, expected, val T) bool {
t.Errorf("Expected '%v' but got '%v'\n", expected, val)
return false
}
//BenchmarkNSetAdd measures adding sequential values to an NSet created with its default size.
//Values wrap at maxBenchSize.
func BenchmarkNSetAdd(b *testing.B) {
	set := nset.NewNSet[uint32]()

	total := uint32(b.N)
	var v uint32
	for ; v < total; v++ {
		set.Add(v % maxBenchSize)
	}
}
//BenchmarkMapAdd measures inserting sequential keys into a built-in map used as a set.
//Keys wrap at maxBenchSize.
func BenchmarkMapAdd(b *testing.B) {
	seen := make(map[uint32]struct{})

	total := uint32(b.N)
	var key uint32
	for ; key < total; key++ {
		seen[key%maxBenchSize] = struct{}{}
	}
}
//BenchmarkNSetAddRand measures adding pseudo-random values below maxBenchSize to an NSet
//created with its default size. The generator is seeded with RandSeed.
func BenchmarkNSetAddRand(b *testing.B) {
	set := nset.NewNSet[uint32]()
	rand.Seed(RandSeed)

	for remaining := b.N; remaining > 0; remaining-- {
		v := rand.Uint32() % maxBenchSize
		set.Add(v)
	}
}
//BenchmarkMapAddRand measures inserting pseudo-random keys below maxBenchSize into a built-in
//map used as a set. The generator is seeded with RandSeed.
func BenchmarkMapAddRand(b *testing.B) {
	seen := make(map[uint32]struct{})
	rand.Seed(RandSeed)

	for remaining := b.N; remaining > 0; remaining-- {
		key := rand.Uint32() % maxBenchSize
		seen[key] = struct{}{}
	}
}
//BenchmarkNSetAddPresized measures adding sequential values (wrapping at maxBenchSize) to an
//NSet created with NewNSetWithMax(maxBenchSize - 1).
func BenchmarkNSetAddPresized(b *testing.B) {
	set := nset.NewNSetWithMax[uint32](maxBenchSize - 1)

	total := uint32(b.N)
	var v uint32
	for ; v < total; v++ {
		set.Add(v % maxBenchSize)
	}
}
//BenchmarkMapAddPresized measures inserting sequential keys (wrapping at maxBenchSize) into a
//built-in map created with a size hint of maxBenchSize-1.
func BenchmarkMapAddPresized(b *testing.B) {
	seen := make(map[uint32]struct{}, maxBenchSize-1)

	total := uint32(b.N)
	var key uint32
	for ; key < total; key++ {
		seen[key%maxBenchSize] = struct{}{}
	}
}
//BenchmarkNSetAddPresizedRand measures adding pseudo-random values below maxBenchSize to an
//NSet created with NewNSetWithMax(maxBenchSize - 1). The generator is seeded with RandSeed.
func BenchmarkNSetAddPresizedRand(b *testing.B) {
	set := nset.NewNSetWithMax[uint32](maxBenchSize - 1)
	rand.Seed(RandSeed)

	for remaining := b.N; remaining > 0; remaining-- {
		v := rand.Uint32() % maxBenchSize
		set.Add(v)
	}
}
//BenchmarkMapAddPresizedRand measures inserting pseudo-random keys below maxBenchSize into a
//built-in map created with a size hint of maxBenchSize-1. The generator is seeded with RandSeed.
func BenchmarkMapAddPresizedRand(b *testing.B) {
	seen := make(map[uint32]struct{}, maxBenchSize-1)
	rand.Seed(RandSeed)

	for remaining := b.N; remaining > 0; remaining-- {
		key := rand.Uint32() % maxBenchSize
		seen[key] = struct{}{}
	}
}
//BenchmarkNSetContains measures sequential membership checks against an NSet that holds
//every value in [0, maxBenchSize).
func BenchmarkNSetContains(b *testing.B) {

	//Init: populate the set, then discard the time spent doing so
	n := nset.NewNSet[uint32]()
	for i := uint32(0); i < maxBenchSize; i++ {
		n.Add(i)
	}
	b.ResetTimer()

	//Work: wrap at maxBenchSize (as the Add benchmarks do) so that every probe falls inside the
	//populated range; otherwise a large b.N mostly measures lookups of values that were never added
	found := 0
	for i := uint32(0); i < uint32(b.N); i++ {
		if n.Contains(i % maxBenchSize) {
			found++
		}
	}

	//Publish the count to a package-level variable so the loop has an observable result
	dump = found
}
//BenchmarkMapContains measures sequential membership checks against a built-in map that holds
//every key in [0, maxBenchSize).
func BenchmarkMapContains(b *testing.B) {

	//Init: populate the map, then discard the time spent doing so
	hMap := map[uint32]struct{}{}
	for i := uint32(0); i < maxBenchSize; i++ {
		hMap[i] = struct{}{}
	}
	b.ResetTimer()

	//Work: wrap at maxBenchSize (as the Add benchmarks do) so that every probe falls inside the
	//populated range; otherwise a large b.N mostly measures lookups of keys that were never added
	found := 0
	for i := uint32(0); i < uint32(b.N); i++ {
		if _, ok := hMap[i%maxBenchSize]; ok {
			found++
		}
	}

	//Publish the count to a package-level variable so the loop has an observable result
	dump = found
}
//BenchmarkNSetContainsRand measures membership checks in pseudo-random order against an NSet
//that holds every value in [0, maxBenchSize). The generator is seeded with RandSeed.
func BenchmarkNSetContainsRand(b *testing.B) {

	//Init: populate the set, then discard the time spent doing so
	n := nset.NewNSet[uint32]()
	for i := uint32(0); i < maxBenchSize; i++ {
		n.Add(i)
	}
	b.ResetTimer()

	//Work
	found := 0
	rand.Seed(RandSeed)
	for i := 0; i < b.N; i++ {

		//Reduce modulo maxBenchSize (as the AddRand benchmarks do). A raw rand.Uint32() spans the
		//whole uint32 range, so nearly every probe would fall outside the populated range
		randVal := rand.Uint32() % maxBenchSize
		if n.Contains(randVal) {
			found++
		}
	}

	//Publish the count to a package-level variable so the loop has an observable result
	dump = found
}
//BenchmarkMapContainsRand measures membership checks in pseudo-random order against a built-in
//map that holds every key in [0, maxBenchSize). The generator is seeded with RandSeed.
func BenchmarkMapContainsRand(b *testing.B) {

	//Init: populate the map, then discard the time spent doing so
	hMap := map[uint32]struct{}{}
	for i := uint32(0); i < maxBenchSize; i++ {
		hMap[i] = struct{}{}
	}
	b.ResetTimer()

	//Work
	found := 0
	rand.Seed(RandSeed)
	for i := 0; i < b.N; i++ {

		//Reduce modulo maxBenchSize (as the AddRand benchmarks do). A raw rand.Uint32() spans the
		//whole uint32 range, so nearly every probe would be for a key that was never added
		randVal := rand.Uint32() % maxBenchSize
		if _, ok := hMap[randVal]; ok {
			found++
		}
	}

	//Publish the count to a package-level variable so the loop has an observable result
	dump = found
}
//BenchmarkNSetDelete measures removing sequential values from an NSet that was filled with
//every value in [0, maxBenchSize). Filling happens while the timer is stopped.
func BenchmarkNSetDelete(b *testing.B) {

	//Setup (untimed)
	b.StopTimer()
	set := nset.NewNSet[uint32]()
	var v uint32
	for ; v < maxBenchSize; v++ {
		set.Add(v)
	}
	b.StartTimer()

	//Timed section; once the counter reaches maxBenchSize the values were never added
	total := uint32(b.N)
	for v = 0; v < total; v++ {
		set.Remove(v)
	}
}
//BenchmarkMapDelete measures deleting sequential keys from a built-in map that was filled with
//every key in [0, maxBenchSize). Filling happens while the timer is stopped.
func BenchmarkMapDelete(b *testing.B) {

	//Setup (untimed)
	b.StopTimer()
	seen := make(map[uint32]struct{})
	var key uint32
	for ; key < maxBenchSize; key++ {
		seen[key] = struct{}{}
	}
	b.StartTimer()

	//Timed section; once the counter reaches maxBenchSize the keys were never added
	total := uint32(b.N)
	for key = 0; key < total; key++ {
		delete(seen, key)
	}
}
//BenchmarkNSetDeleteRand measures removing values in pseudo-random order from an NSet that was
//filled with every value in [0, maxBenchSize). The generator is seeded with RandSeed.
func BenchmarkNSetDeleteRand(b *testing.B) {

	//Init: populate the set, then discard the time spent doing so
	n := nset.NewNSet[uint32]()
	for i := uint32(0); i < maxBenchSize; i++ {
		n.Add(i)
	}
	b.ResetTimer()

	//Work
	rand.Seed(RandSeed)
	for i := 0; i < b.N; i++ {

		//Reduce modulo maxBenchSize (as the AddRand benchmarks do). A raw rand.Uint32() spans the
		//whole uint32 range, so nearly every removal would target a value outside the populated range
		randVal := rand.Uint32() % maxBenchSize
		n.Remove(randVal)
	}
}
//BenchmarkMapDeleteRand measures deleting keys in pseudo-random order from a built-in map that
//was filled with every key in [0, maxBenchSize). The generator is seeded with RandSeed.
func BenchmarkMapDeleteRand(b *testing.B) {

	//Init: populate the map, then discard the time spent doing so
	hMap := map[uint32]struct{}{}
	for i := uint32(0); i < maxBenchSize; i++ {
		hMap[i] = struct{}{}
	}
	b.ResetTimer()

	//Work
	rand.Seed(RandSeed)
	for i := 0; i < b.N; i++ {

		//Reduce modulo maxBenchSize (as the AddRand benchmarks do). A raw rand.Uint32() spans the
		//whole uint32 range, so nearly every delete would target a key that was never added
		randVal := rand.Uint32() % maxBenchSize
		delete(hMap, randVal)
	}
}