Created
August 10, 2015 03:08
-
-
Save kaja47/b0f408dbfac7c3e199d1 to your computer and use it in GitHub Desktop.
StringIntDictionary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Idiosyncratic dictionary mapping strings to ints specialized for english | |
* words and other short alphanumeric strings. Long strigns are packed into | |
* one continuous char array, short alphanumeric strings are inlined into the | |
* structure that normally points into the array of long strings. This has the | |
* effect that strings shorter than 9 characters need only 12 bytes per mapping | |
* and lookup causes only one cache miss. | |
* | |
* Warning: Might contains subtle and not-so-soubtle errors. | |
*/ | |
class StringIntDictionary { | |
protected val segmentLength = 3 | |
protected val loadFactor = 0.45 | |
protected var capacity = 1024 | |
protected var occupied = 0 | |
protected var maxOccupied = capacity * loadFactor toInt | |
protected var charsTop = 0 | |
var assoc = new Array[Int](capacity * segmentLength) | |
var chars = new Array[Char](capacity * 8) | |
protected val defaultValue = 0 | |
def put(str: CharSequence, value: Int): Unit = { | |
var pos = findPos(str) | |
if (stringLength(assoc, pos) > 0) setPayload(assoc, pos, value) // overwrite | |
else { | |
if (occupied > maxOccupied) grow() | |
if (isInlined(str)) { | |
var word = 0L | |
var i = 0 | |
while (i < str.length) { | |
word = setInlinedChar(assoc, word, i, encode(str.charAt(i))) | |
i += 1 | |
} | |
setInlinedWord(assoc, pos, word) | |
setStringLength(assoc, pos, str.length) | |
} else { | |
val offset = putString(str) | |
setStringOffset(assoc, pos, offset) | |
setStringLength(assoc, pos, str.length) | |
} | |
setPayload(assoc, pos, value) | |
occupied += 1 | |
} | |
} | |
def get(str: CharSequence) = getOrDefault(str, defaultValue) | |
def getOrDefault(str: CharSequence, defaultValue: Int): Int = { | |
val pos = findPos(str) | |
if (stringLength(assoc, pos) > 0) payload(assoc, pos) else defaultValue | |
} | |
def contains(str: CharSequence): Boolean = | |
stringLength(assoc, findPos(str)) > 0 | |
def size = occupied | |
protected def stringHashCode(str: CharSequence) = { | |
var h = 0 | |
var i = 0 | |
while (i < str.length) { | |
h = 31 * h + str.charAt(i) | |
i += 1 | |
} | |
h | |
} | |
/** must produce same result as stringHashCode */ | |
protected def charArrHashCode(arr: Array[Char], offset: Int, length: Int) = { | |
var h = 0 | |
var i = 0 | |
while (i < length) { | |
h = 31 * h + arr(offset + i) | |
i += 1 | |
} | |
h | |
} | |
/** must produce same result as stringHashCode */ | |
protected def inlinedHashCode(assoc: Array[Int], pos: Int) = { | |
val len = stringLength(assoc, pos) | |
val word = inlinedWord(assoc, pos) | |
var h = 0 | |
var i = 0 | |
while (i < len) { | |
h = 31 * h + decode(inlinedChar(assoc, word, i)) | |
i += 1 | |
} | |
h | |
} | |
var _probe = 0L | |
protected def findPos(str: CharSequence) = { | |
require(str.length != 0 && str.length < 256, "string must be non empty and shorter than 256 chars") | |
val hash = stringHashCode(str) | |
var i = (hash & (capacity - 1)) | |
var pos = i * segmentLength | |
while (!equalOrEmpty(pos, str)) { | |
i = (i + 1) & (capacity - 1) | |
pos = i * segmentLength | |
_probe += 1 | |
} | |
pos | |
} | |
// pos refers to the first element in segment | |
protected def stringLength(assoc: Array[Int], pos: Int) = assoc(pos+1) & 0xff | |
protected def stringOffset(assoc: Array[Int], pos: Int) = assoc(pos) | |
protected def inlinedWord(assoc: Array[Int], pos: Int): Long = (assoc(pos).toLong & 0xffffffffL) | ((assoc(pos+1).toLong & 0xffffff00L) << 24) | |
protected def inlinedChar(assoc: Array[Int], word: Long, i: Int): Int = ((word >>> (6 * i)) & ((1 << 6) - 1)).toInt | |
protected def payload(assoc: Array[Int], pos: Int) = assoc(pos+2) | |
protected def setStringLength(assoc: Array[Int], pos: Int, length: Int) = assoc(pos+1) = (assoc(pos+1) & 0xffffff00) | length | |
protected def setStringOffset(assoc: Array[Int], pos: Int, offset: Int) = assoc(pos) = offset | |
protected def setInlinedWord(assoc: Array[Int], pos: Int, word: Long) = { | |
assoc(pos) = (word & 0xffffffff).toInt | |
assoc(pos+1) = (word >>> 24).toInt | |
} | |
private def setInlinedChar(assoc: Array[Int], word: Long, i: Int, value: Int): Long = { | |
require(value < 64) | |
word | (value.toLong << (6 * i)) | |
} | |
protected def setPayload(assoc: Array[Int], pos: Int, value: Int) = assoc(pos+2) = value | |
protected def equalOrEmpty(pos: Int, str: CharSequence): Boolean = { | |
val len = stringLength(assoc, pos) | |
if (len == 0) return true | |
if (len != str.length) return false | |
if (isInlined(str)) { | |
val word = inlinedWord(assoc, pos) | |
var i = 0 | |
while (i < len) { | |
if (decode(inlinedChar(assoc, word, i)) != str.charAt(i)) return false | |
i += 1 | |
} | |
} else { | |
val off = stringOffset(assoc, pos) | |
var i = 0 | |
while (i < len) { | |
if (chars(off+i) != str.charAt(i)) return false | |
i += 1 | |
} | |
} | |
true | |
} | |
protected def isInlined(str: CharSequence): Boolean = { | |
if (str.length > 9) return false | |
var i = 0 | |
while (i < str.length) { | |
val ch = str.charAt(i) | |
val ok = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') | |
if (!ok) return false | |
i += 1 | |
} | |
return true | |
} | |
protected val encodeTable = { | |
val arr = new Array[Byte](128) | |
var ch = 'a'.toInt | |
var i = 0 | |
ch = '0' ; while (ch <= '9') { arr(ch) = i.toByte ; ch += 1 ; i += 1 } | |
ch = 'A' ; while (ch <= 'Z') { arr(ch) = i.toByte ; ch += 1 ; i += 1 } | |
ch = 'a' ; while (ch <= 'z') { arr(ch) = i.toByte ; ch += 1 ; i += 1 } | |
arr | |
} | |
protected def encode(ch: Char): Int = encodeTable(ch).toInt | |
protected def decode(x: Int): Char = { | |
if (x < 10) return (x + '0').toChar | |
if (x < 36) return (x - 10 + 'A').toChar | |
/*if (x <= 62)*/ return (x - 36 + 'a').toChar | |
} | |
protected def grow() = { | |
val oldAssoc = assoc | |
val oldCapacity = capacity | |
this.assoc = new Array[Int](assoc.length * 2) | |
this.capacity = capacity * 2 | |
this.maxOccupied = capacity * loadFactor toInt | |
var oldPos = 0 | |
while (oldPos < (oldCapacity * segmentLength)) { | |
val len = stringLength(oldAssoc, oldPos) | |
if (len > 0) { | |
val hash = if (len <= 9) inlinedHashCode(oldAssoc, oldPos) else charArrHashCode(chars, stringOffset(oldAssoc, oldPos), len) | |
var i = (hash & (capacity - 1)) | |
var pos = i * segmentLength | |
while (stringLength(assoc, pos) != 0) { // search for space in newly allocated array | |
i = (i + 1) & (capacity - 1) | |
pos = i * segmentLength | |
} | |
assoc(pos) = oldAssoc(oldPos) | |
assoc(pos+1) = oldAssoc(oldPos+1) | |
assoc(pos+2) = oldAssoc(oldPos+2) | |
} | |
oldPos += segmentLength | |
} | |
} | |
/** puts string into `chars` array and returns it's position */ | |
protected def putString(str: CharSequence): Int = { | |
while (charsTop + str.length > chars.length) { | |
growChars() | |
} | |
var i = 0 | |
while (i < str.length) { | |
chars(charsTop+i) = str.charAt(i) | |
i += 1 | |
} | |
val pos = charsTop | |
charsTop += str.length | |
pos | |
} | |
protected def growChars(): Unit = { | |
val newChars = new Array[Char](chars.length * 2) | |
System.arraycopy(chars, 0, newChars, 0, chars.length) | |
chars = newChars | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment