rsirres/levensthein.nim

## levensthein.nim
# http://julesjacobs.github.io/2015/06/17/disqus-levenshtein-simple-and-fast.html

import sugar
import sequtils
import tables
import strutils
import hashes

type
    SparseLevenshteinAutomaton = object
        max_edits: int
        s: string


    State = tuple[indices: seq[int], values: seq[int]]


proc hash(x: State): Hash =
    ## Piggyback on the already available string hash proc.
    ##
    ## Without this proc nothing works!
    result = x.indices.hash !& x.values.hash
    result = !$result

proc start(self: SparseLevenshteinAutomaton):State =
    return (newSeq[int](self.max_edits), newSeq[int](self.max_edits))

proc step(self: SparseLevenshteinAutomaton, n: State, c:char): State =
    var new_indices: seq[int]
    var new_values: seq[int]

    if len(n.indices) != 0 and n.indices[0] == 0 and n.values[0] < self.max_edits:
        new_indices = @[0]
        new_values = @[n.values[0] + 1]
    else:
        new_indices = @[]
        new_values = @[]


    for j, i in n.indices:
        if i == len(self.s): break
        var cost = if self.s[i] == c: 0 else: 1
        var val = n.values[j] + cost
        if len(new_indices) != 0 and new_indices[^1] == i:
            val = min(val, new_values[^1] + 1)
        if j+1 < len(n.indices) and n.indices[j+1] == i+1:
            val = min(val, n.values[j+1] + 1)
        if val <= self.max_edits:
            new_indices.add(i+1)
            new_values.add(val)

    return (new_indices, new_values)


proc is_match(self: SparseLevenshteinAutomaton, n: State): bool =
    return len(n.indices) != 0 and n.indices[^1] == len(self.s)

proc can_match(self: SparseLevenshteinAutomaton, n: tuple[indices: seq[int], values: seq[int]]):bool =
    return len(n.indices) != 0

proc transitions(self: SparseLevenshteinAutomaton, n: tuple[indices: seq[int], values: seq[int]]): seq[char] =
    return deduplicate( lc[self.s[i] | (i <- n.indices, i < len(self.s)), char] )


var counter = @[0] # list is a hack for mutable lexical scoping
var states = initTable[State, int]()
var trans:seq[(int, int, char)] = @[]
var matching: seq[int] = @[]

let lev = SparseLevenshteinAutomaton(s:"woof", max_edits:2)

# proc explore(state: State) : int =
#     let key = state # lists can't be hashed in Python so convert to a tuple
#     if key in states: return states[key]
#     var i = counter[0]
#     counter[0] += 1
#     states[key] = i
#     if lev.is_match(state): matching.add(i)
#     for c in concat(lev.transitions(state), @['*']):
#         var newstate = lev.step(state, c)
#         var j = explore(newstate)
#         trans.add((i, j, c))
#     return i

# var i = explore(lev.start())

var state = lev.start()
let s = "woofies"

for c in s:
    state = lev.step(state, c)
    echo lev.can_match(state) # True, "w" can match "bannana" with distance 1
	# http://julesjacobs.github.io/2015/06/17/disqus-levenshtein-simple-and-fast.html

	import sugar
	import sequtils
	import tables
	import strutils
	import hashes

	type
	SparseLevenshteinAutomaton = object
	max_edits: int
	s: string


	State = tuple[indices: seq[int], values: seq[int]]


	proc hash(x: State): Hash =
	## Piggyback on the already available string hash proc.
	##
	## Without this proc nothing works!
	result = x.indices.hash !& x.values.hash
	result = !$result

	proc start(self: SparseLevenshteinAutomaton):State =
	return (newSeq[int](self.max_edits), newSeq[int](self.max_edits))

	proc step(self: SparseLevenshteinAutomaton, n: State, c:char): State =
	var new_indices: seq[int]
	var new_values: seq[int]

	if len(n.indices) != 0 and n.indices[0] == 0 and n.values[0] < self.max_edits:
	new_indices = @[0]
	new_values = @[n.values[0] + 1]
	else:
	new_indices = @[]
	new_values = @[]


	for j, i in n.indices:
	if i == len(self.s): break
	var cost = if self.s[i] == c: 0 else: 1
	var val = n.values[j] + cost
	if len(new_indices) != 0 and new_indices[^1] == i:
	val = min(val, new_values[^1] + 1)
	if j+1 < len(n.indices) and n.indices[j+1] == i+1:
	val = min(val, n.values[j+1] + 1)
	if val <= self.max_edits:
	new_indices.add(i+1)
	new_values.add(val)

	return (new_indices, new_values)


	proc is_match(self: SparseLevenshteinAutomaton, n: State): bool =
	return len(n.indices) != 0 and n.indices[^1] == len(self.s)

	proc can_match(self: SparseLevenshteinAutomaton, n: tuple[indices: seq[int], values: seq[int]]):bool =
	return len(n.indices) != 0

	proc transitions(self: SparseLevenshteinAutomaton, n: tuple[indices: seq[int], values: seq[int]]): seq[char] =
	return deduplicate( lc[self.s[i] \| (i <- n.indices, i < len(self.s)), char] )



	var counter = @[0] # list is a hack for mutable lexical scoping
	var states = initTable[State, int]()
	var trans:seq[(int, int, char)] = @[]
	var matching: seq[int] = @[]

	let lev = SparseLevenshteinAutomaton(s:"woof", max_edits:2)

	# proc explore(state: State) : int =
	# let key = state # lists can't be hashed in Python so convert to a tuple
	# if key in states: return states[key]
	# var i = counter[0]
	# counter[0] += 1
	# states[key] = i
	# if lev.is_match(state): matching.add(i)
	# for c in concat(lev.transitions(state), @['*']):
	# var newstate = lev.step(state, c)
	# var j = explore(newstate)
	# trans.add((i, j, c))
	# return i

	# var i = explore(lev.start())

	var state = lev.start()
	let s = "woofies"

	for c in s:
	state = lev.step(state, c)
	echo lev.can_match(state) # True, "w" can match "bannana" with distance 1