Skip to content

Instantly share code, notes, and snippets.

@boydgreenfield
Created May 13, 2014 01:09
Show Gist options
  • Save boydgreenfield/45c8b7b29c4ed53fa45e to your computer and use it in GitHub Desktop.
Save boydgreenfield/45c8b7b29c4ed53fa45e to your computer and use it in GitHub Desktop.
Bloom filter / bitarray testing
import memfiles # Note tests will fail for mmap-backed files unless you bother to use patched private/memfiles (below)
#import private/memfiles
from os import nil
from strutils import `%`, formatFloat, ffDecimal, toBin
import unsigned
from math import random, randomize
from times import nil
# Type declarations
type
TBitScalar* = uint
type
EBitarray = object of EBase
TBitarrayKind = enum inmem, mmap
TFlexArray {.unchecked.} = array[0..0, TBitScalar]
TBitarray* = ref object
size_elements: int
size_bits*: int
size_specified*: int
bitarray: ptr TFlexArray
case kind: TBitarrayKind
of inmem:
nil
of mmap:
mm_filehandle: TMemFile
const ONE = TBitScalar(1)
proc finalize_bitarray(a: TBitarray) =
if not a.bitarray.isNil:
case a.kind
of inmem:
dealloc(a.bitarray)
a.bitarray = nil
of mmap:
a.mm_filehandle.close()
proc create_bitarray*(size: int): TBitarray =
## Creates an in-memory bitarray using a specified input size.
## Note that this will round up to the nearest byte.
let n_elements = size div (sizeof(TBitScalar) * 8)
let n_bits = n_elements * (sizeof(TBitScalar) * 8)
new(result, finalize_bitarray)
result.kind = inmem
result.bitarray = cast[ptr TFlexArray](alloc0(n_elements * sizeof(TBitScalar)))
result.size_elements = n_elements
result.size_bits = n_bits
result.size_specified = size
proc create_bitarray*(file: string, size: int = -1): TBitarray =
## Creates an mmap-backed bitarray. If the specified file exists
## it will be opened, but an exception will be raised if the size
## is specified and does not match. If the file does not exist
## it will be created.
let n_elements = size div (sizeof(char) * 8)
let n_bits = n_elements * (sizeof(char) * 8)
var mm_file: TMemFile
if os.existsFile(file):
mm_file = open(file, mode = fmReadWrite, mappedSize = -1)
if size != -1 and mm_file.size != n_elements:
raise newException(EBitarray, "Existing mmap file does not have the specified size $1" % $size)
else:
if size == -1:
raise newException(EBitarray, "No existing mmap file. Must specify size to create one.")
mm_file = open(file, mode = fmReadWrite, newFileSize = n_elements)
new(result, finalize_bitarray)
result.kind = mmap
result.bitarray = cast[ptr TFlexArray](mm_file.mem)
result.size_elements = n_elements
result.size_bits = n_bits
result.size_specified = size
result.mm_filehandle = mm_file
proc `[]=`*(ba: var TBitarray, index: int, val: bool) {.inline.} =
## Sets the bit at an index to be either 0 (false) or 1 (true)
if index >= ba.size_bits or index < 0:
raise newException(EBitarray, "Specified index is too large.")
let i_element = index div (sizeof(TBitScalar) * 8)
let i_offset = TBitScalar(index mod (sizeof(TBitScalar) * 8))
if val:
ba.bitarray[i_element] = (ba.bitarray[i_element] or (ONE shl i_offset))
else:
ba.bitarray[i_element] = (ba.bitarray[i_element] and ((not ONE) shl i_offset))
proc `[]`*(ba: var TBitarray, index: int): bool {.inline.} =
## Gets the bit at an index element (returns a bool)
if index >= ba.size_bits or index < 0:
raise newException(EBitarray, "Specified index is too large.")
let i_element = index div (sizeof(TBitScalar) * 8)
let i_offset = TBitScalar(index mod (sizeof(TBitScalar) * 8))
result = bool((ba.bitarray[i_element] shr i_offset) and ONE)
proc `[]`*(ba: var TBitarray, index: TSlice): TBitScalar {.inline.} =
## Get the bits for a slice of the bitarray. Supports slice sizes
## up the maximum element size (64 bits by default)
if index.b >= ba.size_bits or index.a < 0:
raise newException(EBitarray, "Specified index is too large.")
if (index.b - index.a) > (sizeof(TBitScalar) * 8):
raise newException(EBitarray, "Only slices up to $1 bits are supported." % $(sizeof(TBitScalar) * 8))
let i_element_a = index.a div (sizeof(TBitScalar) * 8)
let i_offset_a = TBitScalar(index.a mod (sizeof(TBitScalar) * 8))
let i_element_b = index.b div (sizeof(TBitScalar) * 8)
let i_offset_b = TBitScalar(sizeof(TBitScalar) * 8) - i_offset_a
var result = ba.bitarray[i_element_a] shr i_offset_a
if i_element_a != i_element_b: # Combine two slices
let slice_b = ba.bitarray[i_element_b] shl i_offset_b
result = result or slice_b
return result # Fails if this isn't included?
proc `[]=`*(ba: var TBitarray, index: TSlice, val: TBitScalar) {.inline.} =
## Set the bits for a slice of the bitarray. Supports slice sizes
## up to the maximum element size (64 bits by default)
## Note: This inserts using a bitwise-or, it will *not* overwrite previously
## set true values!
if index.b >= ba.size_bits or index.a < 0:
raise newException(EBitarray, "Specified index is too large.")
if (index.b - index.a) > (sizeof(TBitScalar) * 8):
raise newException(EBitarray, "Only slices up to $1 bits are supported." % $(sizeof(TBitScalar) * 8))
# TODO(nbg): Make a macro for handling this and also the if/else in-memory piece
let i_element_a = index.a div (sizeof(TBitScalar) * 8)
let i_offset_a = TBitScalar(index.a mod (sizeof(TBitScalar) * 8))
let i_element_b = index.b div (sizeof(TBitScalar) * 8)
let i_offset_b = TBitScalar(sizeof(TBitScalar) * 8) - i_offset_a
let insert_a = val shl i_offset_a
ba.bitarray[i_element_a] = ba.bitarray[i_element_a] or insert_a
if i_element_a != i_element_b:
let insert_b = val shr i_offset_b
ba.bitarray[i_element_b] = ba.bitarray[i_element_b] or insert_b
proc `$`*(ba: TBitarray): string =
## Print the number of bits and elements in the bitarray (elements are currently defined as 8-bit chars)
result = ("Bitarray with $1 bits and $2 unique elements. In-memory?: $3." %
[$ba.size_bits, $ba.size_elements, $ba.kind])
when isMainModule:
echo("Testing bitarray.nim code.")
let n_tests: int = int(1e6)
let n_bits: int = int(2e9) # ~240MB, i.e., much larger than L3 cache
var bitarray = create_bitarray(n_bits)
bitarray[0] = true
bitarray[1] = true
bitarray[2] = true
var bitarray_b = create_bitarray("/tmp/ba.mmap", size=n_bits)
bitarray_b.bitarray[3] = 4
# # Test range lookups/inserts
bitarray[65] = true
doAssert bitarray[65]
doAssert bitarray[2..66] == TBitScalar(-9223372036854775807) # Lexer error prevents using 9223372036854775809'u64 directly... ugh
bitarray[131] = true
bitarray[194] = true
assert bitarray[2..66] == bitarray[131..194]
let slice_value = bitarray[131..194]
bitarray[270..333] = slice_value
bitarray[400..463] = TBitScalar(-9223372036854775807)
assert bitarray[131..194] == bitarray[270..333]
assert bitarray[131..194] == bitarray[400..463]
# Seed RNG
randomize(2882) # Seed the RNG
var n_test_positions = newSeq[int](n_tests)
for i in 0..(n_tests - 1):
n_test_positions[i] = random(n_bits)
# Timing tests
var start_time, end_time: float
start_time = times.cpuTime()
for i in 0..(n_tests - 1):
bitarray[n_test_positions[i]] = true
end_time = times.cpuTime()
echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to insert ", n_tests, " items (in-memory).")
start_time = times.cpuTime()
for i in 0..(n_tests - 1):
bitarray_b[n_test_positions[i]] = true
end_time = times.cpuTime()
echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to insert ", n_tests, " items (mmap-backed).")
var bit_value: bool
start_time = times.cpuTime()
for i in 0..(n_tests - 1):
doAssert bitarray[n_test_positions[i]]
end_time = times.cpuTime()
echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to lookup ", n_tests, " items (in-memory).")
start_time = times.cpuTime()
for i in 0..(n_tests - 1):
doAssert bitarray[n_test_positions[i]]
end_time = times.cpuTime()
echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to lookup ", n_tests, " items (mmap-backed).")
echo("All tests successfully completed.")
import bitarray
import murmur
import strutils
import times
from math import random, randomize
import unsigned
type
TBloomFilter = object
bitarray: TBitarray
n_hashes: int
n_bits_per_item: int
n_bits: int
proc create_bloom_filter*(n_elements: int, n_bits_per_item: int = 12, n_hashes: int = 6): TBloomFilter =
## Generate a Bloom filter, yay so simple!
let n_bits = n_elements * n_bits_per_item
result = TBloomFilter(
bitarray: create_bitarray(n_bits),
n_hashes: n_hashes,
n_bits_per_item: n_bits_per_item,
n_bits: n_bits
)
{.push overflowChecks: off.}
proc hash(bf: TBloomFilter, item: string): seq[int] =
var hashes: TMurmurHashes = murmur_hash(item, 0)
newSeq(result, bf.n_hashes)
for i in 0..(bf.n_hashes - 1):
result[i] = abs(hashes[0] + hashes[1] * i) mod (bf.n_bits - (sizeof(TBitScalar) * 8))
return result
{.pop.}
proc insert*(bf: var TBloomFilter, item: string) =
## Put the string there
let hashes = hash(bf, item)
let insert_pattern: TBitScalar = TBitScalar(1) shl TBitScalar(hashes[0] mod (8 * sizeof(TBitScalar)))
for h in hashes:
bf.bitarray[h..(h + sizeof(TBitScalar) * 8)] = insert_pattern
proc lookup*(bf: var TBloomFilter, item: string): bool =
## Is the string there?
let hashes = hash(bf, item)
let pattern = TBitScalar(1) shl TBitScalar(hashes[0] mod (8 * sizeof(TBitScalar)))
var lookup: TBitScalar = pattern
for h in hashes:
lookup = lookup and bf.bitarray[h..(h + sizeof(TBitScalar) * 8)]
result = (lookup == pattern)
when isMainModule:
echo "Quick working Bloom filter example."
let n_tests = int(2e7)
var bf = create_bloom_filter(n_elements = n_tests, n_bits_per_item = 16, n_hashes = 7)
bf.insert("Here we go!")
assert bf.lookup("Here we go!")
assert (not bf.lookup("I'm not here."))
let test_string_len = 50
let sample_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
var k_test_elements = newSeq[string](n_tests)
for i in 0..(n_tests - 1):
var new_string = ""
for j in 0..(test_string_len):
new_string.add(sample_chars[random(51)])
k_test_elements[i] = new_string
let start_time = cpuTime()
for i in 0..(n_tests - 1):
bf.insert(k_test_elements[i])
let end_time = cpuTime()
echo("Took ", formatFloat(end_time - start_time, format = ffDecimal, precision = 4), " seconds to insert ", n_tests, " items.")
let start_time_b = cpuTime()
for i in 0..(n_tests - 1):
doAssert bf.lookup(k_test_elements[i])
let end_time_b = cpuTime()
echo("Took ", formatFloat(end_time_b - start_time_b, format = ffDecimal, precision = 4), " seconds to lookup ", n_tests, " items.")
# Patched, place in private/memfiles.nim
#
#
# Nimrod's Runtime Library
# (c) Copyright 2014 Nimrod Contributors
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## :Authors: Zahary Karadjov, Andreas Rumpf
##
## This module provides support for `memory mapped files`:idx:
## (Posix's `mmap`:idx:) on the different operating systems.
when defined(windows):
import winlean
elif defined(posix):
import posix
else:
{.error: "the memfiles module is not supported on your operating system!".}
import os
type
TMemFile* = object {.pure.} ## represents a memory mapped file
mem*: pointer ## a pointer to the memory mapped file. The pointer
## can be used directly to change the contents of the
## file, if it was opened with write access.
size*: int ## size of the memory mapped file
when defined(windows):
fHandle: int
mapHandle: int
else:
handle: cint
proc mapMem*(m: var TMemFile, mode: TFileMode = fmRead,
mappedSize = -1, offset = 0): pointer =
var readonly = mode == fmRead
when defined(windows):
result = mapViewOfFileEx(
m.mapHandle,
if readonly: FILE_MAP_READ else: FILE_MAP_WRITE,
int32(offset shr 32),
int32(offset and 0xffffffff),
if mappedSize == -1: 0 else: mappedSize,
nil)
if result == nil:
osError(osLastError())
else:
assert mappedSize > 0
result = mmap(
nil,
mappedSize,
if readonly: PROT_READ else: PROT_READ or PROT_WRITE,
if readonly: MAP_PRIVATE else: MAP_SHARED,
m.handle, offset)
if result == cast[pointer](MAP_FAILED):
osError(osLastError())
proc unmapMem*(f: var TMemFile, p: pointer, size: int) =
## unmaps the memory region ``(p, <p+size)`` of the mapped file `f`.
## All changes are written back to the file system, if `f` was opened
## with write access. ``size`` must be of exactly the size that was requested
## via ``mapMem``.
when defined(windows):
if unmapViewOfFile(p) == 0: osError(osLastError())
else:
if munmap(p, size) != 0: osError(osLastError())
proc open*(filename: string, mode: TFileMode = fmRead,
mappedSize = -1, offset = 0, newFileSize = -1): TMemFile =
## opens a memory mapped file. If this fails, ``EOS`` is raised.
## `newFileSize` can only be set if the file does not exist and is opened
## with write access (e.g., with fmReadWrite). `mappedSize` and `offset`
## can be used to map only a slice of the file. Example:
##
## .. code-block:: nimrod
## var
## mm, mm_full, mm_half: TMemFile
##
## mm = memfiles.open("/tmp/test.mmap", mode = fmWrite, newFileSize = 1024) # Create a new file
## mm.close()
##
## # Read the whole file, would fail if newFileSize was set
## mm_full = memfiles.open("/tmp/test.mmap", mode = fmReadWrite, mappedSize = -1)
##
## # Read the first 512 bytes
## mm_half = memfiles.open("/tmp/test.mmap", mode = fmReadWrite, mappedSize = 512)
# The file can be resized only when write mode is used:
assert newFileSize == -1 or mode != fmRead
var readonly = mode == fmRead
template rollback =
result.mem = nil
result.size = 0
when defined(windows):
template fail(errCode: TOSErrorCode, msg: expr) =
rollback()
if result.fHandle != 0: discard closeHandle(result.fHandle)
if result.mapHandle != 0: discard closeHandle(result.mapHandle)
osError(errCode)
# return false
#raise newException(EIO, msg)
template callCreateFile(winApiProc, filename: expr): expr =
winApiProc(
filename,
if readonly: GENERIC_READ else: GENERIC_ALL,
FILE_SHARE_READ,
nil,
if newFileSize != -1: CREATE_ALWAYS else: OPEN_EXISTING,
if readonly: FILE_ATTRIBUTE_READONLY else: FILE_ATTRIBUTE_TEMPORARY,
0)
when useWinUnicode:
result.fHandle = callCreateFile(createFileW, newWideCString(filename))
else:
result.fHandle = callCreateFile(createFileA, filename)
if result.fHandle == INVALID_HANDLE_VALUE:
fail(osLastError(), "error opening file")
if newFileSize != -1:
var
sizeHigh = int32(newFileSize shr 32)
sizeLow = int32(newFileSize and 0xffffffff)
var status = setFilePointer(result.fHandle, sizeLow, addr(sizeHigh),
FILE_BEGIN)
let lastErr = osLastError()
if (status == INVALID_SET_FILE_POINTER and lastErr.int32 != NO_ERROR) or
(setEndOfFile(result.fHandle) == 0):
fail(lastErr, "error setting file size")
# since the strings are always 'nil', we simply always call
# CreateFileMappingW which should be slightly faster anyway:
result.mapHandle = createFileMappingW(
result.fHandle, nil,
if readonly: PAGE_READONLY else: PAGE_READWRITE,
0, 0, nil)
if result.mapHandle == 0:
fail(osLastError(), "error creating mapping")
result.mem = mapViewOfFileEx(
result.mapHandle,
if readonly: FILE_MAP_READ else: FILE_MAP_WRITE,
int32(offset shr 32),
int32(offset and 0xffffffff),
if mappedSize == -1: 0 else: mappedSize,
nil)
if result.mem == nil:
fail(osLastError(), "error mapping view")
var hi, low: int32
low = getFileSize(result.fHandle, addr(hi))
if low == INVALID_FILE_SIZE:
fail(osLastError(), "error getting file size")
else:
var fileSize = (int64(hi) shr 32) or low
if mappedSize != -1: result.size = min(fileSize, mappedSize).int
else: result.size = fileSize.int
else:
template fail(errCode: TOSErrorCode, msg: expr) =
rollback()
if result.handle != 0: discard close(result.handle)
osError(errCode)
var flags = if readonly: O_RDONLY else: O_RDWR
if newFileSize != -1:
flags = flags or O_CREAT or O_TRUNC
# Grant user r/w permission for new files
var permissions_mode = S_IRUSR or S_IWUSR
result.handle = open(filename, flags, permissions_mode)
else:
result.handle = open(filename, flags)
if result.handle == -1:
# XXX: errno is supposed to be set here
# Is there an exception that wraps it?
fail(osLastError(), "error opening file")
if newFileSize != -1:
if ftruncate(result.handle, newFileSize) == -1:
fail(osLastError(), "error setting file size")
if mappedSize != -1:
result.size = mappedSize
else:
var stat: TStat
if fstat(result.handle, stat) != -1:
# XXX: Hmm, this could be unsafe
# Why is mmap taking int anyway?
result.size = int(stat.st_size)
else:
fail(osLastError(), "error getting file size")
result.mem = mmap(
nil,
result.size,
if readonly: PROT_READ else: PROT_READ or PROT_WRITE,
if readonly: MAP_PRIVATE else: MAP_SHARED,
result.handle,
offset)
if result.mem == cast[pointer](MAP_FAILED):
fail(osLastError(), "file mapping failed")
proc close*(f: var TMemFile) =
## closes the memory mapped file `f`. All changes are written back to the
## file system, if `f` was opened with write access.
var error = false
var lastErr: TOSErrorCode
when defined(windows):
if f.fHandle != INVALID_HANDLE_VALUE:
error = unmapViewOfFile(f.mem) == 0
lastErr = osLastError()
error = (closeHandle(f.mapHandle) == 0) or error
error = (closeHandle(f.fHandle) == 0) or error
else:
if f.handle != 0:
error = munmap(f.mem, f.size) != 0
lastErr = osLastError()
error = (close(f.handle) != 0) or error
f.size = 0
f.mem = nil
when defined(windows):
f.fHandle = 0
f.mapHandle = 0
else:
f.handle = 0
if error: osError(lastErr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment