Skip to content

Instantly share code, notes, and snippets.

@matthieugomez
Last active October 26, 2015 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matthieugomez/935bf1b60f1d2d415525 to your computer and use it in GitHub Desktop.
Save matthieugomez/935bf1b60f1d2d415525 to your computer and use it in GitHub Desktop.
# Define iterator on q-grams
type QGramIterator{S <: AbstractString, T <: Integer}
s::S
q::T
end
function Base.start(qgram::QGramIterator)
len = length(qgram.s)
(1, len < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
end
function Base.next(qgram::QGramIterator, state)
istart, iend = state
element = SubString(qgram.s, istart, iend)
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
return element, nextstate
end
function Base.done(qgram::QGramIterator, state)
istart, idend = state
done(qgram.s, idend)
end
Base.length(qgram::QGramIterator) = length(qgram.s) - qgram.q + 1
Base.eltype(qgram::QGramIterator) = SubString{typeof(qgram.s)}
function Base.Set(s::QGramIterator)
set = Set{eltype(s)}()
sizehint!(set, length(s))
for x in s
push!(set, x)
end
return set
end
# Define Jaccard distance
function jaccard(s1::AbstractString, s2::AbstractString, q::Integer)
len1, len2 = length(s1), length(s2)
len1 > len2 && jaccard(s2, s1, q)
q1 = QGramIterator(s1, q)
set1 = Set(q1)
q2 = QGramIterator(s2, q)
set2 = Set(q2)
numerator = 0
for x in set1
if x in set2
numerator += 1
end
end
return 1.0 - numerator / (length(set1) + length(set2) - numerator)
end
# Performance
x = map(randstring, rand(5:25,100_000))
y = map(randstring, rand(5:25,100_000))
function f(x, y, q::Integer)
d = Array(Float64, length(x))
@inbounds for i in 1:length(x)
d[i] = jaccard(x[i], y[i], q)
end
end
@time f(x, y, 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment