Skip to content

Instantly share code, notes, and snippets.

@kescobo
Last active March 16, 2017 16:36
Show Gist options
  • Save kescobo/7341eed840b56ba90d5661da2d745cac to your computer and use it in GitHub Desktop.
Save kescobo/7341eed840b56ba90d5661da2d745cac to your computer and use it in GitHub Desktop.
Better Jaccard Distance
function jaccarddistance(sketch1::MinHashSketch, sketch2::MinHashSketch)
d = length(setdiff(sketch1.sketch, sketch2.sketch))
l = length(sketch1)
return (l-d) / (l+d)
end
function newjd(sketch1::MinHashSketch, sketch2::MinHashSketch)
matches = 0
sketchlen = length(sketch1)
i = 1
j = 1
while i <= sketchlen && j <= sketchlen
if sketch1.sketch[i] == sketch2.sketch[j]
matches += 1
i += 1
j += 1
elseif sketch1.sketch[i] < sketch2.sketch[j]
while sketch1.sketch[i] < sketch2.sketch[j]
i += 1
end
elseif sketch2.sketch[j] < sketch1.sketch[i]
while sketch2.sketch[j] < sketch1.sketch[i]
j += 1
end
end
end
matches == sketchlen ? return 1.0 : return matches / (2 * sketchlen - matches)
end
newjd(sk1, sk2) == jaccarddistance(sk1, sk2) # true
# Old JD on sketches with k = 12, s = 500 for two ~ 4MB genomes
# BenchmarkTools.Trial:
# memory estimate: 26.02 KiB
# allocs estimate: 26
# --------------
# minimum time: 37.162 μs (0.00% GC)
# median time: 40.514 μs (0.00% GC)
# mean time: 49.695 μs (7.43% GC)
# maximum time: 5.304 ms (98.24% GC)
# --------------
# samples: 10000
# evals/sample: 1
# time tolerance: 5.00%
# memory tolerance: 1.00%
# New JD on sketches with k = 12, s = 500 for two ~ 4MB genomes
# BenchmarkTools.Trial:
# memory estimate: 16 bytes
# allocs estimate: 1
# --------------
# minimum time: 1.089 μs (0.00% GC)
# median time: 1.140 μs (0.00% GC)
# mean time: 1.246 μs (0.00% GC)
# maximum time: 7.864 μs (0.00% GC)
# --------------
# samples: 10000
# evals/sample: 10
# time tolerance: 5.00%
# memory tolerance: 1.00%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment