@jscrane
Created March 10, 2015 12:42
Finding Similar Sentences
#!/usr/bin/env python
import sys

# whole lines are hashed as-is into a map where values are
# lists of indices.
def hash_whole(hashes, idx, s):
    h = hash(s)
    if h not in hashes:
        hashes[h] = []
    hashes[h].append(idx)
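# example (hypothetical values): after hashing 'A B' at indices 1 and 2,
# hashes == {hash('A B'): [1, 2]}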
# from each longer line is constructed a shorter one with the word at
# each position removed. this shorter line is hashed into a top-level
# map with the key being the hash of the shorter string. the value of
# this key is a map keyed by the index of the deleted word, whose value
# is a list of the original lines which now match when the word at that
# position is removed.
def hash_part(hashes, idx, didx, s):
    h = hash(s)
    if h not in hashes:
        hashes[h] = {}
    if didx not in hashes[h]:
        hashes[h][didx] = []
    hashes[h][didx].append(idx)
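# example (hypothetical values): hash_part(hashes, 3, 1, 'A B') records that
# line 3 matches 'A B' once its word at position 1 is deleted:
# hashes == {hash('A B'): {1: [3]}}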
# record an unordered pair of line indices; returns True only if unseen
def add_pair(seen, a, b):
    if a < b:
        p = (a, b)
    else:
        p = (b, a)
    if p in seen:
        return False
    seen.add(p)
    return True
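# example: add_pair(seen, 4, 2) stores (2, 4) and returns True;
# a later add_pair(seen, 2, 4) finds it and returns False.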
# lines: all lines read from input
# prev_counts: a map of hashed strings to counts from the previous
# round, for comparison with lines in this round with one word removed
# indices: indices of the lines of the current length
def count_similar(lines, prev_counts, indices):
    # count exact pairs among lines[indices];
    # hash collisions are not a problem
    # (at least with the given dataset)
    whole = {}
    for i in indices:
        hash_whole(whole, i, lines[i])
    counts = {}
    dups = {}
    pairs = 0
    for h, l in whole.items():
        n = len(l)
        counts[h] = n
        dups[l[0]] = n
        pairs = pairs + n * (n - 1) // 2
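        # e.g., a line occurring n = 3 times contributes 3 * 2 // 2 = 3 exact pairs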
    # count lines with a word deleted which match whole lines
    # hashed in the previous round
    partial = {}
    for i in dups:
        words = lines[i].split()
        for j in range(len(words)):
            hash_part(partial, i, j, ' '.join([w for x, w in enumerate(words) if x != j]))
    # example:
    # 0 A
    # 1 AB
    # 2 AB
    # 3 AX
    # 4 AY
    # pairs are: 0-1, 0-2, 0-3, 0-4, 1-3, 1-4, 2-3, 2-4, 3-4 and 1-2
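    # i.e., every one of the C(5,2) = 10 possible pairs above is similar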
    lrsim = 0
    dsim = 0
    for h, hdel in partial.items():
        seen = set()
        sr = set()
        if h not in prev_counts:
            for di, li in hdel.items():
                # nd counts pairs between dups hashed here and uniques,
                # e.g., 1-3, 1-4, 2-3, 2-4
                nd = 0
                n = len(li)
                for i in range(n - 1):
                    r = li[i]
                    for j in range(i + 1, n):
                        s = li[j]
                        if add_pair(seen, r, s):
                            nd = nd + dups[r] * dups[s]
                dsim = dsim + nd
        else:
            for di, li in hdel.items():
                # nr counts pairs against the previous round,
                # e.g., 0-1, 0-2, 0-3, 0-4
                nr = 0
                # nd counts pairs between dups hashed here and uniques,
                # e.g., 1-3, 1-4, 2-3, 2-4
                nd = 0
                n = len(li)
                for i in range(n):
                    r = li[i]
                    if r not in sr:
                        sr.add(r)
                        nr = nr + dups[r]
                    for j in range(i + 1, n):
                        s = li[j]
                        if add_pair(seen, r, s):
                            nd = nd + dups[r] * dups[s]
                lrsim = lrsim + prev_counts[h] * nr
                dsim = dsim + nd
    return (pairs + lrsim + dsim, counts)
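# worked example (hypothetical data): with lines = ['A B', 'A B', 'A X'],
# count_similar(lines, {}, [0, 1, 2]) returns (3, counts): pair 0-1 is an
# exact match, and pairs 0-2 and 1-2 match once the differing word is deleted.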
# read all lines and calculate their word-lengths
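# input format (inferred from the parsing below): one sentence per line,
# prefixed by its integer index, e.g.:
# 0 the quick brown fox
# 1 the quick brown fox jumps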
lines = []
lengths = {}
with open(sys.argv[1], 'r') as file:
    for line in file:
        s = line.split()
        index = int(s[0])
        lines.append(' '.join(s[1:]))
        n = len(s) - 1
        if n not in lengths:
            lengths[n] = []
        lengths[n].append(index)
lens = sorted(lengths)
n = len(lens)
# lengths maps word-length to the indices of lines of that length;
# lens is the sorted list of word-lengths.
# consider lines paired by adjacent length, e.g., (10, 11), (11, 12), etc.
counts = {}
s = 0
last = -1
for i in range(n):
    a = lens[i]
    sa = 0
    if a - last > 1:
        counts = {}
    (sa, counts) = count_similar(lines, counts, lengths[a])
    s = s + sa
    print(a, sa, s)
    last = a
print(s)
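# usage (script name is arbitrary): python similar.py sentences.txt
# prints, for each word-length, that length, the pairs found in its round,
# and a running total; the final line is the total number of similar pairs.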
@xeoncross

Can you explain what approach you are using here?
