Skip to content

Instantly share code, notes, and snippets.

@nova77
Created October 28, 2010 22:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nova77/652480 to your computer and use it in GitHub Desktop.
Save nova77/652480 to your computer and use it in GitHub Desktop.
import glom
import sys
from itertools import izip
def taketwo(arr):
    """Group the items of ``arr`` into consecutive, non-overlapping pairs.

    Returns a lazy iterator of 2-tuples: (arr[0], arr[1]), (arr[2], arr[3]),
    and so on.  If ``arr`` has an odd number of items the last one is
    dropped, because no partner is available to complete its pair.
    """
    # Passing the *same* iterator twice makes each output tuple consume two
    # successive items from the input.
    source = iter(arr)
    return izip(source, source)
def preload(ta_path):
    """Load the tagged article vectors that ``trim`` searches against.

    Every non-blank line of the file at ``ta_path`` is split on single
    spaces into three parts: a first field that is ignored, the article id,
    and the remainder.  The remainder is split on whitespace and read as
    alternating ``int`` / ``float`` tokens (presumably feature index and
    weight -- confirm against the producer of the file).

    Returns a dict mapping article id (str) to a list of ``(int, float)``
    tuples.  If an id occurs more than once, the last occurrence wins.

    Raises ValueError if a non-blank line has fewer than three
    space-separated fields or a token cannot be converted.
    """
    test_articles_set = {}
    # The context manager closes the file even if a line fails to parse.
    with open(ta_path) as ta:
        for line in ta:
            # A blank (e.g. trailing) line carries no article; skip it
            # rather than failing on the three-way unpack below.
            if not line.strip():
                continue
            _, said, stext = line.split(' ', 2)
            test_articles_set[said] = [
                (int(i), float(w)) for i, w in taketwo(stext.split())
            ]
    return test_articles_set
def trim(ta_path, radius):
    """Copy to stdout only the stdin article lines that match a tagged article.

    Lines are read from ``sys.stdin`` in the same layout that ``preload``
    parses (ignored field, article id, then alternating int/float tokens).
    A line is written verbatim to ``sys.stdout`` when either

    * its article id is one of the ids loaded from ``ta_path``, or
    * ``Glom.point_sim`` between at least one tagged vector and the line's
      vector is ``>= radius``.

    ta_path -- path of the tagged article vectors file (handed to preload)
    radius  -- similarity threshold; also passed to the ``glom.Glom``
               constructor

    Blank lines are skipped.  Other malformed lines raise ValueError.
    """
    # NOTE(review): the meaning of the second Glom argument (50000) is not
    # visible from this file -- kept as in the original.
    g = glom.Glom(radius, 50000)
    test_articles_set = preload(ta_path)
    tagged_vectors = list(test_articles_set.values())

    # iter(readline, '') reads line by line until EOF without the read-ahead
    # buffering of file iteration, so output keeps pace with piped input.
    for line in iter(sys.stdin.readline, ''):
        if not line.strip():
            continue
        _, said, stext = line.split(' ', 2)

        # A tagged article always passes; the dict lookup avoids comparing
        # the id against every key and computing similarities needlessly.
        # (Assumes point_sim has no side effects -- confirm in glom.)
        if said not in test_articles_set:
            sv = [(int(i), float(w)) for i, w in taketwo(stext.split())]
            # Otherwise keep the line only if at least one tagged vector is
            # within the radius; any() stops at the first hit.
            if not any(g.point_sim(test_points, sv) >= radius
                       for test_points in tagged_vectors):
                continue

        # The line still carries its own newline, so write it unchanged.
        sys.stdout.write(line)
# Script entry point: expects the tagged-vectors path and the radius as
# command-line arguments; the full vector list arrives on stdin and the
# trimmed list leaves on stdout.
if __name__ == "__main__":
    if len(sys.argv) >= 3:
        trim(sys.argv[1], float(sys.argv[2]))
    else:
        # Too few arguments: explain the invocation on stderr and exit
        # with a non-zero status.
        prog = sys.argv[0]
        print >> sys.stderr, "usage:", prog, "<clustered article vectors> <radius> < full_vectors > trimmed_vectors"
        print >> sys.stderr, "example:", prog, "vectors_time_sorted_tagged.txt 0.22 < vectors_time_sorted.txt > vectors_trimmed.txt"
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment