-
-
Save boydgreenfield/45c8b7b29c4ed53fa45e to your computer and use it in GitHub Desktop.
Bloom filter / bitarray testing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import memfiles # Note tests will fail for mmap-backed files unless you bother to use patched private/memfiles (below) | |
| #import private/memfiles | |
| from os import nil | |
| from strutils import `%`, formatFloat, ffDecimal, toBin | |
| import unsigned | |
| from math import random, randomize | |
| from times import nil | |
| # Type declarations | |
| type | |
| TBitScalar* = uint | |
| type | |
| EBitarray = object of EBase | |
| TBitarrayKind = enum inmem, mmap | |
| TFlexArray {.unchecked.} = array[0..0, TBitScalar] | |
| TBitarray* = ref object | |
| size_elements: int | |
| size_bits*: int | |
| size_specified*: int | |
| bitarray: ptr TFlexArray | |
| case kind: TBitarrayKind | |
| of inmem: | |
| nil | |
| of mmap: | |
| mm_filehandle: TMemFile | |
| const ONE = TBitScalar(1) | |
| proc finalize_bitarray(a: TBitarray) = | |
| if not a.bitarray.isNil: | |
| case a.kind | |
| of inmem: | |
| dealloc(a.bitarray) | |
| a.bitarray = nil | |
| of mmap: | |
| a.mm_filehandle.close() | |
| proc create_bitarray*(size: int): TBitarray = | |
| ## Creates an in-memory bitarray using a specified input size. | |
| ## Note that this will round up to the nearest byte. | |
| let n_elements = size div (sizeof(TBitScalar) * 8) | |
| let n_bits = n_elements * (sizeof(TBitScalar) * 8) | |
| new(result, finalize_bitarray) | |
| result.kind = inmem | |
| result.bitarray = cast[ptr TFlexArray](alloc0(n_elements * sizeof(TBitScalar))) | |
| result.size_elements = n_elements | |
| result.size_bits = n_bits | |
| result.size_specified = size | |
| proc create_bitarray*(file: string, size: int = -1): TBitarray = | |
| ## Creates an mmap-backed bitarray. If the specified file exists | |
| ## it will be opened, but an exception will be raised if the size | |
| ## is specified and does not match. If the file does not exist | |
| ## it will be created. | |
| let n_elements = size div (sizeof(char) * 8) | |
| let n_bits = n_elements * (sizeof(char) * 8) | |
| var mm_file: TMemFile | |
| if os.existsFile(file): | |
| mm_file = open(file, mode = fmReadWrite, mappedSize = -1) | |
| if size != -1 and mm_file.size != n_elements: | |
| raise newException(EBitarray, "Existing mmap file does not have the specified size $1" % $size) | |
| else: | |
| if size == -1: | |
| raise newException(EBitarray, "No existing mmap file. Must specify size to create one.") | |
| mm_file = open(file, mode = fmReadWrite, newFileSize = n_elements) | |
| new(result, finalize_bitarray) | |
| result.kind = mmap | |
| result.bitarray = cast[ptr TFlexArray](mm_file.mem) | |
| result.size_elements = n_elements | |
| result.size_bits = n_bits | |
| result.size_specified = size | |
| result.mm_filehandle = mm_file | |
| proc `[]=`*(ba: var TBitarray, index: int, val: bool) {.inline.} = | |
| ## Sets the bit at an index to be either 0 (false) or 1 (true) | |
| if index >= ba.size_bits or index < 0: | |
| raise newException(EBitarray, "Specified index is too large.") | |
| let i_element = index div (sizeof(TBitScalar) * 8) | |
| let i_offset = TBitScalar(index mod (sizeof(TBitScalar) * 8)) | |
| if val: | |
| ba.bitarray[i_element] = (ba.bitarray[i_element] or (ONE shl i_offset)) | |
| else: | |
| ba.bitarray[i_element] = (ba.bitarray[i_element] and ((not ONE) shl i_offset)) | |
| proc `[]`*(ba: var TBitarray, index: int): bool {.inline.} = | |
| ## Gets the bit at an index element (returns a bool) | |
| if index >= ba.size_bits or index < 0: | |
| raise newException(EBitarray, "Specified index is too large.") | |
| let i_element = index div (sizeof(TBitScalar) * 8) | |
| let i_offset = TBitScalar(index mod (sizeof(TBitScalar) * 8)) | |
| result = bool((ba.bitarray[i_element] shr i_offset) and ONE) | |
| proc `[]`*(ba: var TBitarray, index: TSlice): TBitScalar {.inline.} = | |
| ## Get the bits for a slice of the bitarray. Supports slice sizes | |
| ## up the maximum element size (64 bits by default) | |
| if index.b >= ba.size_bits or index.a < 0: | |
| raise newException(EBitarray, "Specified index is too large.") | |
| if (index.b - index.a) > (sizeof(TBitScalar) * 8): | |
| raise newException(EBitarray, "Only slices up to $1 bits are supported." % $(sizeof(TBitScalar) * 8)) | |
| let i_element_a = index.a div (sizeof(TBitScalar) * 8) | |
| let i_offset_a = TBitScalar(index.a mod (sizeof(TBitScalar) * 8)) | |
| let i_element_b = index.b div (sizeof(TBitScalar) * 8) | |
| let i_offset_b = TBitScalar(sizeof(TBitScalar) * 8) - i_offset_a | |
| var result = ba.bitarray[i_element_a] shr i_offset_a | |
| if i_element_a != i_element_b: # Combine two slices | |
| let slice_b = ba.bitarray[i_element_b] shl i_offset_b | |
| result = result or slice_b | |
| return result # Fails if this isn't included? | |
| proc `[]=`*(ba: var TBitarray, index: TSlice, val: TBitScalar) {.inline.} = | |
| ## Set the bits for a slice of the bitarray. Supports slice sizes | |
| ## up to the maximum element size (64 bits by default) | |
| ## Note: This inserts using a bitwise-or, it will *not* overwrite previously | |
| ## set true values! | |
| if index.b >= ba.size_bits or index.a < 0: | |
| raise newException(EBitarray, "Specified index is too large.") | |
| if (index.b - index.a) > (sizeof(TBitScalar) * 8): | |
| raise newException(EBitarray, "Only slices up to $1 bits are supported." % $(sizeof(TBitScalar) * 8)) | |
| # TODO(nbg): Make a macro for handling this and also the if/else in-memory piece | |
| let i_element_a = index.a div (sizeof(TBitScalar) * 8) | |
| let i_offset_a = TBitScalar(index.a mod (sizeof(TBitScalar) * 8)) | |
| let i_element_b = index.b div (sizeof(TBitScalar) * 8) | |
| let i_offset_b = TBitScalar(sizeof(TBitScalar) * 8) - i_offset_a | |
| let insert_a = val shl i_offset_a | |
| ba.bitarray[i_element_a] = ba.bitarray[i_element_a] or insert_a | |
| if i_element_a != i_element_b: | |
| let insert_b = val shr i_offset_b | |
| ba.bitarray[i_element_b] = ba.bitarray[i_element_b] or insert_b | |
| proc `$`*(ba: TBitarray): string = | |
| ## Print the number of bits and elements in the bitarray (elements are currently defined as 8-bit chars) | |
| result = ("Bitarray with $1 bits and $2 unique elements. In-memory?: $3." % | |
| [$ba.size_bits, $ba.size_elements, $ba.kind]) | |
| when isMainModule: | |
| echo("Testing bitarray.nim code.") | |
| let n_tests: int = int(1e6) | |
| let n_bits: int = int(2e9) # ~240MB, i.e., much larger than L3 cache | |
| var bitarray = create_bitarray(n_bits) | |
| bitarray[0] = true | |
| bitarray[1] = true | |
| bitarray[2] = true | |
| var bitarray_b = create_bitarray("/tmp/ba.mmap", size=n_bits) | |
| bitarray_b.bitarray[3] = 4 | |
| # # Test range lookups/inserts | |
| bitarray[65] = true | |
| doAssert bitarray[65] | |
| doAssert bitarray[2..66] == TBitScalar(-9223372036854775807) # Lexer error prevents using 9223372036854775809'u64 directly... ugh | |
| bitarray[131] = true | |
| bitarray[194] = true | |
| assert bitarray[2..66] == bitarray[131..194] | |
| let slice_value = bitarray[131..194] | |
| bitarray[270..333] = slice_value | |
| bitarray[400..463] = TBitScalar(-9223372036854775807) | |
| assert bitarray[131..194] == bitarray[270..333] | |
| assert bitarray[131..194] == bitarray[400..463] | |
| # Seed RNG | |
| randomize(2882) # Seed the RNG | |
| var n_test_positions = newSeq[int](n_tests) | |
| for i in 0..(n_tests - 1): | |
| n_test_positions[i] = random(n_bits) | |
| # Timing tests | |
| var start_time, end_time: float | |
| start_time = times.cpuTime() | |
| for i in 0..(n_tests - 1): | |
| bitarray[n_test_positions[i]] = true | |
| end_time = times.cpuTime() | |
| echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to insert ", n_tests, " items (in-memory).") | |
| start_time = times.cpuTime() | |
| for i in 0..(n_tests - 1): | |
| bitarray_b[n_test_positions[i]] = true | |
| end_time = times.cpuTime() | |
| echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to insert ", n_tests, " items (mmap-backed).") | |
| var bit_value: bool | |
| start_time = times.cpuTime() | |
| for i in 0..(n_tests - 1): | |
| doAssert bitarray[n_test_positions[i]] | |
| end_time = times.cpuTime() | |
| echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to lookup ", n_tests, " items (in-memory).") | |
| start_time = times.cpuTime() | |
| for i in 0..(n_tests - 1): | |
| doAssert bitarray[n_test_positions[i]] | |
| end_time = times.cpuTime() | |
| echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to lookup ", n_tests, " items (mmap-backed).") | |
| echo("All tests successfully completed.") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import bitarray | |
| import murmur | |
| import strutils | |
| import times | |
| from math import random, randomize | |
| import unsigned | |
| type | |
| TBloomFilter = object | |
| bitarray: TBitarray | |
| n_hashes: int | |
| n_bits_per_item: int | |
| n_bits: int | |
| proc create_bloom_filter*(n_elements: int, n_bits_per_item: int = 12, n_hashes: int = 6): TBloomFilter = | |
| ## Generate a Bloom filter, yay so simple! | |
| let n_bits = n_elements * n_bits_per_item | |
| result = TBloomFilter( | |
| bitarray: create_bitarray(n_bits), | |
| n_hashes: n_hashes, | |
| n_bits_per_item: n_bits_per_item, | |
| n_bits: n_bits | |
| ) | |
| {.push overflowChecks: off.} | |
| proc hash(bf: TBloomFilter, item: string): seq[int] = | |
| var hashes: TMurmurHashes = murmur_hash(item, 0) | |
| newSeq(result, bf.n_hashes) | |
| for i in 0..(bf.n_hashes - 1): | |
| result[i] = abs(hashes[0] + hashes[1] * i) mod (bf.n_bits - (sizeof(TBitScalar) * 8)) | |
| return result | |
| {.pop.} | |
| proc insert*(bf: var TBloomFilter, item: string) = | |
| ## Put the string there | |
| let hashes = hash(bf, item) | |
| let insert_pattern: TBitScalar = TBitScalar(1) shl TBitScalar(hashes[0] mod (8 * sizeof(TBitScalar))) | |
| for h in hashes: | |
| bf.bitarray[h..(h + sizeof(TBitScalar) * 8)] = insert_pattern | |
| proc lookup*(bf: var TBloomFilter, item: string): bool = | |
| ## Is the string there? | |
| let hashes = hash(bf, item) | |
| let pattern = TBitScalar(1) shl TBitScalar(hashes[0] mod (8 * sizeof(TBitScalar))) | |
| var lookup: TBitScalar = pattern | |
| for h in hashes: | |
| lookup = lookup and bf.bitarray[h..(h + sizeof(TBitScalar) * 8)] | |
| result = (lookup == pattern) | |
| when isMainModule: | |
| echo "Quick working Bloom filter example." | |
| let n_tests = int(2e7) | |
| var bf = create_bloom_filter(n_elements = n_tests, n_bits_per_item = 16, n_hashes = 7) | |
| bf.insert("Here we go!") | |
| assert bf.lookup("Here we go!") | |
| assert (not bf.lookup("I'm not here.")) | |
| let test_string_len = 50 | |
| let sample_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" | |
| var k_test_elements = newSeq[string](n_tests) | |
| for i in 0..(n_tests - 1): | |
| var new_string = "" | |
| for j in 0..(test_string_len): | |
| new_string.add(sample_chars[random(51)]) | |
| k_test_elements[i] = new_string | |
| let start_time = cpuTime() | |
| for i in 0..(n_tests - 1): | |
| bf.insert(k_test_elements[i]) | |
| let end_time = cpuTime() | |
| echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to insert ", n_tests, " items.") | |
| let start_time_b = cpuTime() | |
| for i in 0..(n_tests - 1): | |
| doAssert bf.lookup(k_test_elements[i]) | |
| let end_time_b = cpuTime() | |
| echo("Took ", formatFloat(end_time_b - start_time_b, format = ffDecimal, precision = 4), " seconds to lookup ", n_tests, " items.") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Patched, place in private/memfiles.nim | |
| # | |
| # | |
| # Nimrod's Runtime Library | |
| # (c) Copyright 2014 Nimrod Contributors | |
| # | |
| # See the file "copying.txt", included in this | |
| # distribution, for details about the copyright. | |
| # | |
| ## :Authors: Zahary Karadjov, Andreas Rumpf | |
| ## | |
| ## This module provides support for `memory mapped files`:idx: | |
| ## (Posix's `mmap`:idx:) on the different operating systems. | |
| when defined(windows): | |
| import winlean | |
| elif defined(posix): | |
| import posix | |
| else: | |
| {.error: "the memfiles module is not supported on your operating system!".} | |
| import os | |
| type | |
| TMemFile* = object {.pure.} ## represents a memory mapped file | |
| mem*: pointer ## a pointer to the memory mapped file. The pointer | |
| ## can be used directly to change the contents of the | |
| ## file, if it was opened with write access. | |
| size*: int ## size of the memory mapped file | |
| when defined(windows): | |
| fHandle: int | |
| mapHandle: int | |
| else: | |
| handle: cint | |
| proc mapMem*(m: var TMemFile, mode: TFileMode = fmRead, | |
| mappedSize = -1, offset = 0): pointer = | |
| var readonly = mode == fmRead | |
| when defined(windows): | |
| result = mapViewOfFileEx( | |
| m.mapHandle, | |
| if readonly: FILE_MAP_READ else: FILE_MAP_WRITE, | |
| int32(offset shr 32), | |
| int32(offset and 0xffffffff), | |
| if mappedSize == -1: 0 else: mappedSize, | |
| nil) | |
| if result == nil: | |
| osError(osLastError()) | |
| else: | |
| assert mappedSize > 0 | |
| result = mmap( | |
| nil, | |
| mappedSize, | |
| if readonly: PROT_READ else: PROT_READ or PROT_WRITE, | |
| if readonly: MAP_PRIVATE else: MAP_SHARED, | |
| m.handle, offset) | |
| if result == cast[pointer](MAP_FAILED): | |
| osError(osLastError()) | |
| proc unmapMem*(f: var TMemFile, p: pointer, size: int) = | |
| ## unmaps the memory region ``(p, <p+size)`` of the mapped file `f`. | |
| ## All changes are written back to the file system, if `f` was opened | |
| ## with write access. ``size`` must be of exactly the size that was requested | |
| ## via ``mapMem``. | |
| when defined(windows): | |
| if unmapViewOfFile(p) == 0: osError(osLastError()) | |
| else: | |
| if munmap(p, size) != 0: osError(osLastError()) | |
| proc open*(filename: string, mode: TFileMode = fmRead, | |
| mappedSize = -1, offset = 0, newFileSize = -1): TMemFile = | |
| ## opens a memory mapped file. If this fails, ``EOS`` is raised. | |
| ## `newFileSize` can only be set if the file does not exist and is opened | |
| ## with write access (e.g., with fmReadWrite). `mappedSize` and `offset` | |
| ## can be used to map only a slice of the file. Example: | |
| ## | |
| ## .. code-block:: nimrod | |
| ## var | |
| ## mm, mm_full, mm_half: TMemFile | |
| ## | |
| ## mm = memfiles.open("/tmp/test.mmap", mode = fmWrite, newFileSize = 1024) # Create a new file | |
| ## mm.close() | |
| ## | |
| ## # Read the whole file, would fail if newFileSize was set | |
| ## mm_full = memfiles.open("/tmp/test.mmap", mode = fmReadWrite, mappedSize = -1) | |
| ## | |
| ## # Read the first 512 bytes | |
| ## mm_half = memfiles.open("/tmp/test.mmap", mode = fmReadWrite, mappedSize = 512) | |
| # The file can be resized only when write mode is used: | |
| assert newFileSize == -1 or mode != fmRead | |
| var readonly = mode == fmRead | |
| template rollback = | |
| result.mem = nil | |
| result.size = 0 | |
| when defined(windows): | |
| template fail(errCode: TOSErrorCode, msg: expr) = | |
| rollback() | |
| if result.fHandle != 0: discard closeHandle(result.fHandle) | |
| if result.mapHandle != 0: discard closeHandle(result.mapHandle) | |
| osError(errCode) | |
| # return false | |
| #raise newException(EIO, msg) | |
| template callCreateFile(winApiProc, filename: expr): expr = | |
| winApiProc( | |
| filename, | |
| if readonly: GENERIC_READ else: GENERIC_ALL, | |
| FILE_SHARE_READ, | |
| nil, | |
| if newFileSize != -1: CREATE_ALWAYS else: OPEN_EXISTING, | |
| if readonly: FILE_ATTRIBUTE_READONLY else: FILE_ATTRIBUTE_TEMPORARY, | |
| 0) | |
| when useWinUnicode: | |
| result.fHandle = callCreateFile(createFileW, newWideCString(filename)) | |
| else: | |
| result.fHandle = callCreateFile(createFileA, filename) | |
| if result.fHandle == INVALID_HANDLE_VALUE: | |
| fail(osLastError(), "error opening file") | |
| if newFileSize != -1: | |
| var | |
| sizeHigh = int32(newFileSize shr 32) | |
| sizeLow = int32(newFileSize and 0xffffffff) | |
| var status = setFilePointer(result.fHandle, sizeLow, addr(sizeHigh), | |
| FILE_BEGIN) | |
| let lastErr = osLastError() | |
| if (status == INVALID_SET_FILE_POINTER and lastErr.int32 != NO_ERROR) or | |
| (setEndOfFile(result.fHandle) == 0): | |
| fail(lastErr, "error setting file size") | |
| # since the strings are always 'nil', we simply always call | |
| # CreateFileMappingW which should be slightly faster anyway: | |
| result.mapHandle = createFileMappingW( | |
| result.fHandle, nil, | |
| if readonly: PAGE_READONLY else: PAGE_READWRITE, | |
| 0, 0, nil) | |
| if result.mapHandle == 0: | |
| fail(osLastError(), "error creating mapping") | |
| result.mem = mapViewOfFileEx( | |
| result.mapHandle, | |
| if readonly: FILE_MAP_READ else: FILE_MAP_WRITE, | |
| int32(offset shr 32), | |
| int32(offset and 0xffffffff), | |
| if mappedSize == -1: 0 else: mappedSize, | |
| nil) | |
| if result.mem == nil: | |
| fail(osLastError(), "error mapping view") | |
| var hi, low: int32 | |
| low = getFileSize(result.fHandle, addr(hi)) | |
| if low == INVALID_FILE_SIZE: | |
| fail(osLastError(), "error getting file size") | |
| else: | |
| var fileSize = (int64(hi) shr 32) or low | |
| if mappedSize != -1: result.size = min(fileSize, mappedSize).int | |
| else: result.size = fileSize.int | |
| else: | |
| template fail(errCode: TOSErrorCode, msg: expr) = | |
| rollback() | |
| if result.handle != 0: discard close(result.handle) | |
| osError(errCode) | |
| var flags = if readonly: O_RDONLY else: O_RDWR | |
| if newFileSize != -1: | |
| flags = flags or O_CREAT or O_TRUNC | |
| # Grant user r/w permission for new files | |
| var permissions_mode = S_IRUSR or S_IWUSR | |
| result.handle = open(filename, flags, permissions_mode) | |
| else: | |
| result.handle = open(filename, flags) | |
| if result.handle == -1: | |
| # XXX: errno is supposed to be set here | |
| # Is there an exception that wraps it? | |
| fail(osLastError(), "error opening file") | |
| if newFileSize != -1: | |
| if ftruncate(result.handle, newFileSize) == -1: | |
| fail(osLastError(), "error setting file size") | |
| if mappedSize != -1: | |
| result.size = mappedSize | |
| else: | |
| var stat: TStat | |
| if fstat(result.handle, stat) != -1: | |
| # XXX: Hmm, this could be unsafe | |
| # Why is mmap taking int anyway? | |
| result.size = int(stat.st_size) | |
| else: | |
| fail(osLastError(), "error getting file size") | |
| result.mem = mmap( | |
| nil, | |
| result.size, | |
| if readonly: PROT_READ else: PROT_READ or PROT_WRITE, | |
| if readonly: MAP_PRIVATE else: MAP_SHARED, | |
| result.handle, | |
| offset) | |
| if result.mem == cast[pointer](MAP_FAILED): | |
| fail(osLastError(), "file mapping failed") | |
| proc close*(f: var TMemFile) = | |
| ## closes the memory mapped file `f`. All changes are written back to the | |
| ## file system, if `f` was opened with write access. | |
| var error = false | |
| var lastErr: TOSErrorCode | |
| when defined(windows): | |
| if f.fHandle != INVALID_HANDLE_VALUE: | |
| error = unmapViewOfFile(f.mem) == 0 | |
| lastErr = osLastError() | |
| error = (closeHandle(f.mapHandle) == 0) or error | |
| error = (closeHandle(f.fHandle) == 0) or error | |
| else: | |
| if f.handle != 0: | |
| error = munmap(f.mem, f.size) != 0 | |
| lastErr = osLastError() | |
| error = (close(f.handle) != 0) or error | |
| f.size = 0 | |
| f.mem = nil | |
| when defined(windows): | |
| f.fHandle = 0 | |
| f.mapHandle = 0 | |
| else: | |
| f.handle = 0 | |
| if error: osError(lastErr) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment