Skip to content

Instantly share code, notes, and snippets.

@kaja47
Created August 10, 2015 03:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaja47/b0f408dbfac7c3e199d1 to your computer and use it in GitHub Desktop.
Save kaja47/b0f408dbfac7c3e199d1 to your computer and use it in GitHub Desktop.
StringIntDictionary
/** Idiosyncratic dictionary mapping strings to ints specialized for english
* words and other short alphanumeric strings. Long strigns are packed into
* one continuous char array, short alphanumeric strings are inlined into the
* structure that normally points into the array of long strings. This has the
* effect that strings shorter than 9 characters need only 12 bytes per mapping
* and lookup causes only one cache miss.
*
* Warning: Might contains subtle and not-so-soubtle errors.
*/
class StringIntDictionary {
protected val segmentLength = 3
protected val loadFactor = 0.45
protected var capacity = 1024
protected var occupied = 0
protected var maxOccupied = capacity * loadFactor toInt
protected var charsTop = 0
var assoc = new Array[Int](capacity * segmentLength)
var chars = new Array[Char](capacity * 8)
protected val defaultValue = 0
def put(str: CharSequence, value: Int): Unit = {
var pos = findPos(str)
if (stringLength(assoc, pos) > 0) setPayload(assoc, pos, value) // overwrite
else {
if (occupied > maxOccupied) grow()
if (isInlined(str)) {
var word = 0L
var i = 0
while (i < str.length) {
word = setInlinedChar(assoc, word, i, encode(str.charAt(i)))
i += 1
}
setInlinedWord(assoc, pos, word)
setStringLength(assoc, pos, str.length)
} else {
val offset = putString(str)
setStringOffset(assoc, pos, offset)
setStringLength(assoc, pos, str.length)
}
setPayload(assoc, pos, value)
occupied += 1
}
}
def get(str: CharSequence) = getOrDefault(str, defaultValue)
def getOrDefault(str: CharSequence, defaultValue: Int): Int = {
val pos = findPos(str)
if (stringLength(assoc, pos) > 0) payload(assoc, pos) else defaultValue
}
def contains(str: CharSequence): Boolean =
stringLength(assoc, findPos(str)) > 0
def size = occupied
protected def stringHashCode(str: CharSequence) = {
var h = 0
var i = 0
while (i < str.length) {
h = 31 * h + str.charAt(i)
i += 1
}
h
}
/** must produce same result as stringHashCode */
protected def charArrHashCode(arr: Array[Char], offset: Int, length: Int) = {
var h = 0
var i = 0
while (i < length) {
h = 31 * h + arr(offset + i)
i += 1
}
h
}
/** must produce same result as stringHashCode */
protected def inlinedHashCode(assoc: Array[Int], pos: Int) = {
val len = stringLength(assoc, pos)
val word = inlinedWord(assoc, pos)
var h = 0
var i = 0
while (i < len) {
h = 31 * h + decode(inlinedChar(assoc, word, i))
i += 1
}
h
}
var _probe = 0L
protected def findPos(str: CharSequence) = {
require(str.length != 0 && str.length < 256, "string must be non empty and shorter than 256 chars")
val hash = stringHashCode(str)
var i = (hash & (capacity - 1))
var pos = i * segmentLength
while (!equalOrEmpty(pos, str)) {
i = (i + 1) & (capacity - 1)
pos = i * segmentLength
_probe += 1
}
pos
}
// pos refers to the first element in segment
protected def stringLength(assoc: Array[Int], pos: Int) = assoc(pos+1) & 0xff
protected def stringOffset(assoc: Array[Int], pos: Int) = assoc(pos)
protected def inlinedWord(assoc: Array[Int], pos: Int): Long = (assoc(pos).toLong & 0xffffffffL) | ((assoc(pos+1).toLong & 0xffffff00L) << 24)
protected def inlinedChar(assoc: Array[Int], word: Long, i: Int): Int = ((word >>> (6 * i)) & ((1 << 6) - 1)).toInt
protected def payload(assoc: Array[Int], pos: Int) = assoc(pos+2)
protected def setStringLength(assoc: Array[Int], pos: Int, length: Int) = assoc(pos+1) = (assoc(pos+1) & 0xffffff00) | length
protected def setStringOffset(assoc: Array[Int], pos: Int, offset: Int) = assoc(pos) = offset
protected def setInlinedWord(assoc: Array[Int], pos: Int, word: Long) = {
assoc(pos) = (word & 0xffffffff).toInt
assoc(pos+1) = (word >>> 24).toInt
}
private def setInlinedChar(assoc: Array[Int], word: Long, i: Int, value: Int): Long = {
require(value < 64)
word | (value.toLong << (6 * i))
}
protected def setPayload(assoc: Array[Int], pos: Int, value: Int) = assoc(pos+2) = value
protected def equalOrEmpty(pos: Int, str: CharSequence): Boolean = {
val len = stringLength(assoc, pos)
if (len == 0) return true
if (len != str.length) return false
if (isInlined(str)) {
val word = inlinedWord(assoc, pos)
var i = 0
while (i < len) {
if (decode(inlinedChar(assoc, word, i)) != str.charAt(i)) return false
i += 1
}
} else {
val off = stringOffset(assoc, pos)
var i = 0
while (i < len) {
if (chars(off+i) != str.charAt(i)) return false
i += 1
}
}
true
}
protected def isInlined(str: CharSequence): Boolean = {
if (str.length > 9) return false
var i = 0
while (i < str.length) {
val ch = str.charAt(i)
val ok = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9')
if (!ok) return false
i += 1
}
return true
}
protected val encodeTable = {
val arr = new Array[Byte](128)
var ch = 'a'.toInt
var i = 0
ch = '0' ; while (ch <= '9') { arr(ch) = i.toByte ; ch += 1 ; i += 1 }
ch = 'A' ; while (ch <= 'Z') { arr(ch) = i.toByte ; ch += 1 ; i += 1 }
ch = 'a' ; while (ch <= 'z') { arr(ch) = i.toByte ; ch += 1 ; i += 1 }
arr
}
protected def encode(ch: Char): Int = encodeTable(ch).toInt
protected def decode(x: Int): Char = {
if (x < 10) return (x + '0').toChar
if (x < 36) return (x - 10 + 'A').toChar
/*if (x <= 62)*/ return (x - 36 + 'a').toChar
}
protected def grow() = {
val oldAssoc = assoc
val oldCapacity = capacity
this.assoc = new Array[Int](assoc.length * 2)
this.capacity = capacity * 2
this.maxOccupied = capacity * loadFactor toInt
var oldPos = 0
while (oldPos < (oldCapacity * segmentLength)) {
val len = stringLength(oldAssoc, oldPos)
if (len > 0) {
val hash = if (len <= 9) inlinedHashCode(oldAssoc, oldPos) else charArrHashCode(chars, stringOffset(oldAssoc, oldPos), len)
var i = (hash & (capacity - 1))
var pos = i * segmentLength
while (stringLength(assoc, pos) != 0) { // search for space in newly allocated array
i = (i + 1) & (capacity - 1)
pos = i * segmentLength
}
assoc(pos) = oldAssoc(oldPos)
assoc(pos+1) = oldAssoc(oldPos+1)
assoc(pos+2) = oldAssoc(oldPos+2)
}
oldPos += segmentLength
}
}
/** puts string into `chars` array and returns it's position */
protected def putString(str: CharSequence): Int = {
while (charsTop + str.length > chars.length) {
growChars()
}
var i = 0
while (i < str.length) {
chars(charsTop+i) = str.charAt(i)
i += 1
}
val pos = charsTop
charsTop += str.length
pos
}
protected def growChars(): Unit = {
val newChars = new Array[Char](chars.length * 2)
System.arraycopy(chars, 0, newChars, 0, chars.length)
chars = newChars
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment