Created
March 10, 2015 12:42
-
-
Save jscrane/a74c447729a0a3677593 to your computer and use it in GitHub Desktop.
Finding Similar Sentences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import string | |
# Whole lines are hashed as-is into a map whose values are
# lists of the indices of lines sharing that hash.
def hash_whole(hashes, idx, s):
    """Record line index idx in hashes under the hash of string s."""
    hashes.setdefault(hash(s), []).append(idx)
# From each longer line a shorter one is constructed with the word at
# each position removed. This shorter line is hashed into a top-level
# map with the key being the hash of the shorter string. The value of
# this key is a map keyed by the index of the deleted word, whose value
# is a list of the original lines which now match when a word at that
# position is removed.
def hash_part(hashes, idx, didx, s):
    """Record line idx under hash(s), grouped by deleted-word position didx."""
    by_position = hashes.setdefault(hash(s), {})
    by_position.setdefault(didx, []).append(idx)
def add_pair(seen, a, b):
    """Add the unordered pair {a, b} to seen.

    Returns True when the pair was not already present, False otherwise.
    """
    # Normalize so (a, b) and (b, a) map to the same key.
    key = (min(a, b), max(a, b))
    if key in seen:
        return False
    seen.add(key)
    return True
# lines: all lines read from input
# prev_counts: a map of hashed strings to counts from the previous
# round for comparison with lines in this round with one word removed
# indices: list indices of lines of current length
def count_similar(lines, prev_counts, indices):
    """Count pairs of similar lines among lines[indices].

    Two lines are similar when they are identical, or become identical
    after deleting one word from one (or both, at the same position) of
    them. Returns (pair_count, counts) where counts maps the hash of
    each distinct whole line seen this round to its multiplicity, for
    use as prev_counts in the next (longer) round.

    Ported to Python 3: dict.iteritems() -> .items()/.values(), and
    floor division (//) so the pair count stays an integer; both forms
    behave identically under Python 2 as well.
    """
    # Count exact pairs among lines[indices]; lines are compared by
    # hash only (no collision handling -- acceptable for the dataset).
    whole = {}
    for i in indices:
        hash_whole(whole, i, lines[i])
    counts = {}
    dups = {}
    pairs = 0
    for h, idx_list in whole.items():
        n = len(idx_list)
        counts[h] = n
        # One representative index per group of identical lines,
        # mapped to the group's multiplicity.
        dups[idx_list[0]] = n
        # n identical lines form n*(n-1)/2 exact pairs.
        pairs = pairs + n * (n - 1) // 2
    # Hash each representative line with one word deleted, to match
    # against whole lines hashed in the previous round.
    partial = {}
    for i in dups:
        words = lines[i].split()
        for j in range(len(words)):
            hash_part(partial, i, j, ' '.join([w for x, w in enumerate(words) if x != j]))
    # example:
    # 0 A
    # 1 AB
    # 2 AB
    # 3 AX
    # 4 AY
    # pairs are: 0-1, 0-2, 0-3, 0-4, 1-3, 1-4, 2-3, 2-4, 3-4 and 1-2
    lrsim = 0
    dsim = 0
    for h, hdel in partial.items():
        seen = set()
        sr = set()
        if not h in prev_counts:
            # No shorter line matches: only count pairs between the
            # current round's lines that collide after a deletion.
            for li in hdel.values():
                # nd counts pairs between dups hashed here and uniques,
                # e.g., 1-3, 1-4, 2-3, 2-4
                nd = 0
                n = len(li)
                for i in range(n - 1):
                    r = li[i]
                    for j in range(i + 1, n):
                        s = li[j]
                        if add_pair(seen, r, s):
                            nd = nd + dups[r] * dups[s]
                dsim = dsim + nd
        else:
            for li in hdel.values():
                # nr counts 0-1, 0-2, 0-3, 0-4
                nr = 0
                # nd counts pairs between dups hashed here and uniques,
                # e.g., 1-3, 1-4, 2-3, 2-4
                nd = 0
                n = len(li)
                for i in range(n):
                    r = li[i]
                    if not r in sr:
                        # Count each representative once per hash group
                        # toward pairs with the shorter matching line.
                        sr.add(r)
                        nr = nr + dups[r]
                    for j in range(i + 1, n):
                        s = li[j]
                        if add_pair(seen, r, s):
                            nd = nd + dups[r] * dups[s]
                lrsim = lrsim + prev_counts[h] * nr
                dsim = dsim + nd
    return (pairs + lrsim + dsim, counts)
# read all lines and bucket their indices by word-length
lines = []
lengths = {}
# NOTE(review): `lines` is appended sequentially while `index` comes from
# the leading integer on each input line -- this assumes the dataset's
# indices match file order; confirm against the input format.
with open(sys.argv[1], 'r') as f:  # renamed from `file`: avoid shadowing the builtin
    for line in f:
        s = line.split()
        index = int(s[0])
        lines.append(' '.join(s[1:]))
        # word-length of the line (excluding the leading index token)
        lengths.setdefault(len(s) - 1, []).append(index)
# lengths is a map of word-length to indices of lines of that length;
# consider lines paired by adjacent length, e.g., (10, 11), (11, 12), etc.
counts = {}
s = 0
last = -1
for a in sorted(lengths):
    if a - last > 1:
        # gap in lengths: the previous round's whole-line hashes can
        # never match this round's one-word-deleted lines
        counts = {}
    (sa, counts) = count_similar(lines, counts, lengths[a])
    s = s + sa
    print(a, sa, s)
    last = a
print(s)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Can you explain what approach you are using here?