jeancroy/gist:a6a7f59db14de85b2b97

## gistfile1.coffee
#
# score, provided for compatibility
# for speed one should really prepare the query once then reuse it
#

score = (string,query) ->
  scorePrepared(string,prepare(query))

#
# Match an item against a prepared query
#

exports.scorePrepared = scorePrepared =  (string, preparedQuery) ->

# Allow a small amount of error.
# Wrong uppercase will count as a match + an error.
  minScore = 0.6

  queryTokens = preparedQuery.tokens
  queryMaps = preparedQuery.maps
  queryLen = queryTokens.length

  strTokens = tokenize(string)
  # (Note that tokenize(string) could be applied in advance too )

  qscore = 0
  qtoken = queryTokens[0]
  qmap = queryMaps[0]
  qmatch = 0

  for token in strTokens

    if token.length == 1 && qtoken.length == 1
      sc = if (token==qtoken) then 1 else 0
      #To give higher score to special character modify this branch
    else
      sc = score_map(qtoken, token, qmap)

    if sc > minScore * qtoken.length
      qscore += sc
      qmatch++
      return qscore if qmatch == queryLen
      qtoken = queryTokens[qmatch]
      qmap = queryMaps[qmatch]

  #return
  if qmatch < queryLen then 0 else qscore

#
# prepare a query,
# first tokenize then compute an reverse index of it's char position for each token
#

exports.prepare = prepare = (query) ->
  tokens = tokenize(query)
  {
  tokens: tokens
  maps: tokens.map(bitVector)
  }


# first normalize word case then
# split in words using whitespaces or \ / _ - . separator
# (capturing group) should keep tokens in output array on modern browsers and node

tokenize = (str) ->
  normalize(str).split(/([\s\\\/\_\-\.])/g)

# Convert uppercase "A" to "a^" for any A-Z
# This allow correct uppercase match to count as two.
# Also false uppercase count as an one match + one error.
#
# If needed change uppercase marker "^" to suit application
normalize = (token) ->
  token.replace(/([A-Z])/g, '$1^').toLowerCase()

#
# Build a reverse index:
# for each character what are it's position in string ?
# positions of char in the string is recorded as positions of bits set to 1
#

bitVector = (str) ->
  len = str.length
  len = 32 if len > 32
  map = {}
  i = -1
  while ++i < len
    char = str[i]
    if char of map
      map[char] |= 1 << i
    else
      map[char] = 1 << i
  map

#
# compute length of LCS(a,b)
# then normalize for size of string and apply prefix bonus
# aMap is bitVector(a)
#

score_map = (a, b, aMap) ->
  m = a.length
  n = b.length
  bonus_prefix = 0.5 #add per character

  #smallest string
  k = if m < n then m else n
  if k == 0
    return 0

  #normalize score against length of both inputs
  sz_score = (m + n) / (2.0 * m * n)

  #compute common prefix
  prefix = 0
  if a == b
    prefix = k
  else
    while a[prefix] == b[prefix] and ++prefix < k
      continue

  #shortest string consumed ?
  if prefix == k
    lcs_len = prefix
    return sz_score * lcs_len * lcs_len + bonus_prefix * prefix

  # Prepare
  mask = (1 << m) - 1
  S = mask
  j = prefix - 1

  # "hot loop" for each character of b
  # Compute LCS, see (Hyyrö 2004)
  # here " S " is the " V' " of their paper
  while ++j < n
    c = b[j]
    if c of aMap
      U = S & aMap[c]
      S = (S + U) | (S - U)

  # Remove match already accounted in prefix region.
  mask &= ~((1 << prefix) - 1)

  # lcs_len is number of 0 in S (at position lower than m)
  # inverse S, mask it
  S = ~S & mask

  #Then do "popcount" operation on 32bit
  S = S - (S >> 1 & 0x55555555)
  S = (S & 0x33333333) + (S >> 2 & 0x33333333)
  lcs_len = (S + (S >> 4) & 0x0F0F0F0F) * 0x01010101 >> 24

  #Prefix is part of lcs
  lcs_len += prefix

  #return score
  sz_score * lcs_len * lcs_len + bonus_prefix * prefix
	#
	# score, provided for compatibility
	# for speed one should really prepare the query once then reuse it
	#

	score = (string,query) ->
	scorePrepared(string,prepare(query))

	#
	# Match an item against a prepared query
	#

	exports.scorePrepared = scorePrepared = (string, preparedQuery) ->

	# Allow a small amount of error.
	# Wrong uppercase will count as a match + an error.
	minScore = 0.6

	queryTokens = preparedQuery.tokens
	queryMaps = preparedQuery.maps
	queryLen = queryTokens.length

	strTokens = tokenize(string)
	# (Note that tokenize(string) could be applied in advance too )

	qscore = 0
	qtoken = queryTokens[0]
	qmap = queryMaps[0]
	qmatch = 0

	for token in strTokens

	if token.length == 1 && qtoken.length == 1
	sc = if (token==qtoken) then 1 else 0
	#To give higher score to special character modify this branch
	else
	sc = score_map(qtoken, token, qmap)

	if sc > minScore * qtoken.length
	qscore += sc
	qmatch++
	return qscore if qmatch == queryLen
	qtoken = queryTokens[qmatch]
	qmap = queryMaps[qmatch]

	#return
	if qmatch < queryLen then 0 else qscore

	#
	# prepare a query,
	# first tokenize then compute an reverse index of it's char position for each token
	#

	exports.prepare = prepare = (query) ->
	tokens = tokenize(query)
	{
	tokens: tokens
	maps: tokens.map(bitVector)
	}


	# first normalize word case then
	# split in words using whitespaces or \ / _ - . separator
	# (capturing group) should keep tokens in output array on modern browsers and node

	tokenize = (str) ->
	normalize(str).split(/([\s\\\/\_\-\.])/g)

	# Convert uppercase "A" to "a^" for any A-Z
	# This allow correct uppercase match to count as two.
	# Also false uppercase count as an one match + one error.
	#
	# If needed change uppercase marker "^" to suit application
	normalize = (token) ->
	token.replace(/([A-Z])/g, '$1^').toLowerCase()

	#
	# Build a reverse index:
	# for each character what are it's position in string ?
	# positions of char in the string is recorded as positions of bits set to 1
	#

	bitVector = (str) ->
	len = str.length
	len = 32 if len > 32
	map = {}
	i = -1
	while ++i < len
	char = str[i]
	if char of map
	map[char] \|= 1 << i
	else
	map[char] = 1 << i
	map

	#
	# compute length of LCS(a,b)
	# then normalize for size of string and apply prefix bonus
	# aMap is bitVector(a)
	#

	score_map = (a, b, aMap) ->
	m = a.length
	n = b.length
	bonus_prefix = 0.5 #add per character

	#smallest string
	k = if m < n then m else n
	if k == 0
	return 0

	#normalize score against length of both inputs
	sz_score = (m + n) / (2.0 * m * n)

	#compute common prefix
	prefix = 0
	if a == b
	prefix = k
	else
	while a[prefix] == b[prefix] and ++prefix < k
	continue

	#shortest string consumed ?
	if prefix == k
	lcs_len = prefix
	return sz_score * lcs_len * lcs_len + bonus_prefix * prefix

	# Prepare
	mask = (1 << m) - 1
	S = mask
	j = prefix - 1

	# "hot loop" for each character of b
	# Compute LCS, see (Hyyrö 2004)
	# here " S " is the " V' " of their paper
	while ++j < n
	c = b[j]
	if c of aMap
	U = S & aMap[c]
	S = (S + U) \| (S - U)

	# Remove match already accounted in prefix region.
	mask &= ~((1 << prefix) - 1)

	# lcs_len is number of 0 in S (at position lower than m)
	# inverse S, mask it
	S = ~S & mask

	#Then do "popcount" operation on 32bit
	S = S - (S >> 1 & 0x55555555)
	S = (S & 0x33333333) + (S >> 2 & 0x33333333)
	lcs_len = (S + (S >> 4) & 0x0F0F0F0F) * 0x01010101 >> 24

	#Prefix is part of lcs
	lcs_len += prefix

	#return score
	sz_score * lcs_len * lcs_len + bonus_prefix * prefix