Last active
August 29, 2015 14:24
-
-
Save jeancroy/a6a7f59db14de85b2b97 to your computer and use it in GitHub Desktop.
score LCS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# score, provided for compatibility | |
# for speed one should really prepare the query once then reuse it | |
# | |
score = (string,query) -> | |
scorePrepared(string,prepare(query)) | |
# | |
# Match an item against a prepared query | |
# | |
exports.scorePrepared = scorePrepared = (string, preparedQuery) -> | |
# Allow a small amount of error. | |
# Wrong uppercase will count as a match + an error. | |
minScore = 0.6 | |
queryTokens = preparedQuery.tokens | |
queryMaps = preparedQuery.maps | |
queryLen = queryTokens.length | |
strTokens = tokenize(string) | |
# (Note that tokenize(string) could be applied in advance too ) | |
qscore = 0 | |
qtoken = queryTokens[0] | |
qmap = queryMaps[0] | |
qmatch = 0 | |
for token in strTokens | |
if token.length == 1 && qtoken.length == 1 | |
sc = if (token==qtoken) then 1 else 0 | |
#To give higher score to special character modify this branch | |
else | |
sc = score_map(qtoken, token, qmap) | |
if sc > minScore * qtoken.length | |
qscore += sc | |
qmatch++ | |
return qscore if qmatch == queryLen | |
qtoken = queryTokens[qmatch] | |
qmap = queryMaps[qmatch] | |
#return | |
if qmatch < queryLen then 0 else qscore | |
# | |
# prepare a query, | |
# first tokenize then compute an reverse index of it's char position for each token | |
# | |
exports.prepare = prepare = (query) -> | |
tokens = tokenize(query) | |
{ | |
tokens: tokens | |
maps: tokens.map(bitVector) | |
} | |
# first normalize word case then | |
# split in words using whitespaces or \ / _ - . separator | |
# (capturing group) should keep tokens in output array on modern browsers and node | |
tokenize = (str) -> | |
normalize(str).split(/([\s\\\/\_\-\.])/g) | |
# Convert uppercase "A" to "a^" for any A-Z | |
# This allow correct uppercase match to count as two. | |
# Also false uppercase count as an one match + one error. | |
# | |
# If needed change uppercase marker "^" to suit application | |
normalize = (token) -> | |
token.replace(/([A-Z])/g, '$1^').toLowerCase() | |
# | |
# Build a reverse index: | |
# for each character what are it's position in string ? | |
# positions of char in the string is recorded as positions of bits set to 1 | |
# | |
bitVector = (str) -> | |
len = str.length | |
len = 32 if len > 32 | |
map = {} | |
i = -1 | |
while ++i < len | |
char = str[i] | |
if char of map | |
map[char] |= 1 << i | |
else | |
map[char] = 1 << i | |
map | |
# | |
# compute length of LCS(a,b) | |
# then normalize for size of string and apply prefix bonus | |
# aMap is bitVector(a) | |
# | |
score_map = (a, b, aMap) -> | |
m = a.length | |
n = b.length | |
bonus_prefix = 0.5 #add per character | |
#smallest string | |
k = if m < n then m else n | |
if k == 0 | |
return 0 | |
#normalize score against length of both inputs | |
sz_score = (m + n) / (2.0 * m * n) | |
#compute common prefix | |
prefix = 0 | |
if a == b | |
prefix = k | |
else | |
while a[prefix] == b[prefix] and ++prefix < k | |
continue | |
#shortest string consumed ? | |
if prefix == k | |
lcs_len = prefix | |
return sz_score * lcs_len * lcs_len + bonus_prefix * prefix | |
# Prepare | |
mask = (1 << m) - 1 | |
S = mask | |
j = prefix - 1 | |
# "hot loop" for each character of b | |
# Compute LCS, see (Hyyrö 2004) | |
# here " S " is the " V' " of their paper | |
while ++j < n | |
c = b[j] | |
if c of aMap | |
U = S & aMap[c] | |
S = (S + U) | (S - U) | |
# Remove match already accounted in prefix region. | |
mask &= ~((1 << prefix) - 1) | |
# lcs_len is number of 0 in S (at position lower than m) | |
# inverse S, mask it | |
S = ~S & mask | |
#Then do "popcount" operation on 32bit | |
S = S - (S >> 1 & 0x55555555) | |
S = (S & 0x33333333) + (S >> 2 & 0x33333333) | |
lcs_len = (S + (S >> 4) & 0x0F0F0F0F) * 0x01010101 >> 24 | |
#Prefix is part of lcs | |
lcs_len += prefix | |
#return score | |
sz_score * lcs_len * lcs_len + bonus_prefix * prefix |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment