@jscrane
Created March 10, 2015 12:42
Finding Similar Sentences
#!/usr/bin/env python
import sys

# whole lines are hashed as-is into a map where values are
# lists of indices.
def hash_whole(hashes, idx, s):
    h = hash(s)
    if h not in hashes:
        hashes[h] = []
    hashes[h].append(idx)
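# example (hypothetical values): after hashing 'A B' at indices 1 and 2,
# hashes == {hash('A B'): [1, 2]}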
# from each longer line is constructed a shorter one with the word at
# each position removed. this shorter line is hashed into a top-level
# map with the key being the hash of the shorter string. the value of
# this key is a map keyed by the index of the deleted word, whose value
# is a list of the original lines which now match when the word at that
# position is removed.
def hash_part(hashes, idx, didx, s):
    h = hash(s)
    if h not in hashes:
        hashes[h] = {}
    if didx not in hashes[h]:
        hashes[h][didx] = []
    hashes[h][didx].append(idx)
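# example (hypothetical values): hash_part(hashes, 3, 1, 'A B') records that
# line 3 matches 'A B' once its word at position 1 is deleted:
# hashes == {hash('A B'): {1: [3]}}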
# record an unordered pair of line indices; returns True only if unseen
def add_pair(seen, a, b):
    if a < b:
        p = (a, b)
    else:
        p = (b, a)
    if p in seen:
        return False
    seen.add(p)
    return True
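# example: add_pair(seen, 4, 2) stores (2, 4) and returns True;
# a later add_pair(seen, 2, 4) finds it and returns False.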
# lines: all lines read from input
# prev_counts: a map of hashed strings to counts from the previous
# round, for comparison with lines in this round with one word removed
# indices: indices of the lines of the current length
def count_similar(lines, prev_counts, indices):
    # count exact pairs among lines[indices];
    # hash collisions are not a problem
    # (at least with the given dataset)
    whole = {}
    for i in indices:
        hash_whole(whole, i, lines[i])
    counts = {}
    dups = {}
    pairs = 0
    for h, l in whole.items():
        n = len(l)
        counts[h] = n
        dups[l[0]] = n
        pairs = pairs + n * (n - 1) // 2
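        # e.g., a line occurring n = 3 times contributes 3 * 2 // 2 = 3 exact pairs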
    # count lines with a word deleted which match whole lines
    # hashed in the previous round
    partial = {}
    for i in dups:
        words = lines[i].split()
        for j in range(len(words)):
            hash_part(partial, i, j, ' '.join([w for x, w in enumerate(words) if x != j]))
    # example:
    # 0 A
    # 1 AB
    # 2 AB
    # 3 AX
    # 4 AY
    # pairs are: 0-1, 0-2, 0-3, 0-4, 1-3, 1-4, 2-3, 2-4, 3-4 and 1-2
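    # i.e., every one of the C(5,2) = 10 possible pairs above is similar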
    lrsim = 0
    dsim = 0
    for h, hdel in partial.items():
        seen = set()
        sr = set()
        if h not in prev_counts:
            for di, li in hdel.items():
                # nd counts pairs between dups hashed here and uniques,
                # e.g., 1-3, 1-4, 2-3, 2-4
                nd = 0
                n = len(li)
                for i in range(n - 1):
                    r = li[i]
                    for j in range(i + 1, n):
                        s = li[j]
                        if add_pair(seen, r, s):
                            nd = nd + dups[r] * dups[s]
                dsim = dsim + nd
        else:
            for di, li in hdel.items():
                # nr counts pairs against the previous round,
                # e.g., 0-1, 0-2, 0-3, 0-4
                nr = 0
                # nd counts pairs between dups hashed here and uniques,
                # e.g., 1-3, 1-4, 2-3, 2-4
                nd = 0
                n = len(li)
                for i in range(n):
                    r = li[i]
                    if r not in sr:
                        sr.add(r)
                        nr = nr + dups[r]
                    for j in range(i + 1, n):
                        s = li[j]
                        if add_pair(seen, r, s):
                            nd = nd + dups[r] * dups[s]
                lrsim = lrsim + prev_counts[h] * nr
                dsim = dsim + nd
    return (pairs + lrsim + dsim, counts)
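# worked example (hypothetical data): with lines = ['A B', 'A B', 'A X'],
# count_similar(lines, {}, [0, 1, 2]) returns (3, counts): pair 0-1 is an
# exact match, and pairs 0-2 and 1-2 match once the differing word is deleted.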
# read all lines and calculate their word-lengths
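# input format (inferred from the parsing below): one sentence per line,
# prefixed by its integer index, e.g.:
# 0 the quick brown fox
# 1 the quick brown fox jumps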
lines = []
lengths = {}
with open(sys.argv[1], 'r') as file:
    for line in file:
        s = line.split()
        index = int(s[0])
        lines.append(' '.join(s[1:]))
        n = len(s) - 1
        if n not in lengths:
            lengths[n] = []
        lengths[n].append(index)
lens = sorted(lengths)
n = len(lens)
# lengths maps word-length to the indices of lines of that length;
# lens is the sorted list of word-lengths.
# consider lines paired by adjacent length, e.g., (10, 11), (11, 12), etc.
counts = {}
s = 0
last = -1
for i in range(n):
    a = lens[i]
    sa = 0
    if a - last > 1:
        counts = {}
    (sa, counts) = count_similar(lines, counts, lengths[a])
    s = s + sa
    print(a, sa, s)
    last = a
print(s)
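# usage (script name is arbitrary): python similar.py sentences.txt
# prints, for each word-length, that length, the pairs found in its round,
# and a running total; the final line is the total number of similar pairs.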
@xeoncross

Can you explain what approach you are using here?
