score LCS
#
# score, provided for compatibility
# for speed, one should prepare the query once and then reuse it
#
score = (string, query) ->
  scorePrepared(string, prepare(query))
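
# Illustrative one-shot usage (not part of the original gist; the input strings
# and the value are just an example traced from the code below):
#
#   score("Foo Bar.txt", "foo bar")
#   # each matched query token adds sz_score * lcs_len^2 + 0.5 * prefix,
#   # here roughly 3.125 + 1 + 3.125 = 7.25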
#
# Match an item against a prepared query
#
exports.scorePrepared = scorePrepared = (string, preparedQuery) ->

  # Allow a small amount of error.
  # A wrong uppercase will count as a match + an error.
  minScore = 0.6

  queryTokens = preparedQuery.tokens
  queryMaps = preparedQuery.maps
  queryLen = queryTokens.length

  strTokens = tokenize(string)
  # (Note that tokenize(string) could be applied in advance too)

  qscore = 0
  qtoken = queryTokens[0]
  qmap = queryMaps[0]
  qmatch = 0

  for token in strTokens

    if token.length == 1 && qtoken.length == 1
      sc = if (token == qtoken) then 1 else 0
      # To give a higher score to special characters, modify this branch
    else
      sc = score_map(qtoken, token, qmap)

    if sc > minScore * qtoken.length
      qscore += sc
      qmatch++
      return qscore if qmatch == queryLen
      qtoken = queryTokens[qmatch]
      qmap = queryMaps[qmatch]

  # return
  if qmatch < queryLen then 0 else qscore
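
# Illustrative reuse pattern (not part of the original gist): prepare the query
# once and score it against many candidate strings; `candidates` is a
# placeholder array of strings.
#
#   query = prepare("foo bar")
#   matches = (s for s in candidates when scorePrepared(s, query) > 0)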
#
# prepare a query:
# first tokenize it, then compute a reverse index of each token's char positions
#
exports.prepare = prepare = (query) ->
  tokens = tokenize(query)
  {
    tokens: tokens
    maps: tokens.map(bitVector)
  }
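
# For example (illustrative), prepare("Foo bar") yields roughly:
#   tokens: ["f^oo", " ", "bar"]
#   maps:   [{f: 1, "^": 2, o: 12}, {" ": 1}, {b: 1, a: 2, r: 4}]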
# first normalize word case, then
# split into words using whitespace or the \ / _ - . separators
# the capturing group keeps the separators in the output array on modern browsers and node
tokenize = (str) ->
  normalize(str).split(/([\s\\\/\_\-\.])/g)
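
# e.g. tokenize("src/FooBar.txt") => ["src", "/", "f^oob^ar", ".", "txt"]
# (separators are kept as their own single-character tokens)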
# Convert uppercase "A" to "a^" for any A-Z.
# This allows a correct uppercase match to count as two,
# while a wrong uppercase counts as one match + one error.
#
# If needed, change the uppercase marker "^" to suit the application
normalize = (token) ->
  token.replace(/([A-Z])/g, '$1^').toLowerCase()
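
# e.g. normalize("FooBar") => "f^oob^ar"
# a query typed "Foo" also normalizes to "f^oo" and matches the "^" marker,
# while "foo" misses it: one extra match for correct case, one error for wrong case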
#
# Build a reverse index:
# for each character, what are its positions in the string?
# the positions of a char are recorded as the positions of bits set to 1
#
bitVector = (str) ->
  len = str.length
  len = 32 if len > 32
  map = {}
  i = -1
  while ++i < len
    char = str[i]
    if char of map
      map[char] |= 1 << i
    else
      map[char] = 1 << i
  map
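
# e.g. bitVector("aba") => { a: 0b101, b: 0b010 }  (i.e. {a: 5, b: 2})
# each key is a character of the string, each set bit one of its positions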
#
# compute the length of LCS(a,b),
# then normalize for the size of both strings and apply a prefix bonus
# aMap is bitVector(a)
#
score_map = (a, b, aMap) ->

  m = a.length
  n = b.length
  bonus_prefix = 0.5 # add per character

  # smallest string
  k = if m < n then m else n
  if k == 0
    return 0

  # normalize score against length of both inputs
  sz_score = (m + n) / (2.0 * m * n)
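
  # (for two identical strings of length L this gives sz_score * L^2 = L,
  #  so a perfect token match scores its own length plus the prefix bonus)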
  # compute the common prefix
  prefix = 0
  if a == b
    prefix = k
  else
    while a[prefix] == b[prefix] and ++prefix < k
      continue

  # shortest string consumed?
  if prefix == k
    lcs_len = prefix
    return sz_score * lcs_len * lcs_len + bonus_prefix * prefix

  # Prepare
  mask = (1 << m) - 1
  S = mask
  j = prefix - 1

  # "hot loop" over each character of b
  # Compute the LCS, see (Hyyrö 2004)
  # here "S" is the "V'" of their paper
  while ++j < n
    c = b[j]
    if c of aMap
      U = S & aMap[c]
      S = (S + U) | (S - U)

  # Remove matches already accounted for in the prefix region.
  mask &= ~((1 << prefix) - 1)

  # lcs_len is the number of 0 bits in S (at positions lower than m):
  # invert S, then mask it
  S = ~S & mask

  # then do a "popcount" operation on 32 bits
  S = S - ((S >> 1) & 0x55555555)
  S = (S & 0x33333333) + ((S >> 2) & 0x33333333)
  lcs_len = ((S + (S >> 4)) & 0x0F0F0F0F) * 0x01010101 >> 24

  # the prefix is part of the LCS
  lcs_len += prefix

  # return the score
  sz_score * lcs_len * lcs_len + bonus_prefix * prefix
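
# Worked example (illustrative, traced by hand from the code above):
#
#   score_map("abc", "axbxc", bitVector("abc"))
#   # m = 3, n = 5, common prefix = 1 ("a")
#   # LCS("abc", "axbxc") is "abc", so lcs_len = 3
#   # sz_score = (3 + 5) / (2 * 3 * 5) = 4/15
#   # score = (4/15) * 3 * 3 + 0.5 * 1 = 2.4 + 0.5 = 2.9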