Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Dieterbe/842243 to your computer and use it in GitHub Desktop.
Save Dieterbe/842243 to your computer and use it in GitHub Desktop.
Attempt to make SimilarityABC.__getitem__ faster.
def __getitem__(self, doc):
    """Return the similarities of `doc` to the documents in the corpus.

    If `self.normalize` is set, `doc` is first passed through
    `matutils.unitVec`. The similarities are then obtained from
    `self.getSimilarities(doc)`.

    Returns:
        If `self.numBest` is None, the result of `getSimilarities`
        unchanged. Otherwise a list of at most `self.numBest`
        `(docNo, score)` tuples, highest score first, where `docNo` is the
        position of the score in the `getSimilarities` result. Entries with
        a score <= 0 are never included. Entries with equal scores are
        returned in increasing `docNo` order.

    `self.numBest` is expected to be None or a non-negative integer.
    """
    # Function-scope import: the file's top-level import block is not
    # visible from here, so the dependency is kept local to the method.
    import heapq

    if self.normalize:
        doc = matutils.unitVec(doc)
    allSims = self.getSimilarities(doc)

    # Return either all similarities, or only the numBest most similar,
    # depending on the settings from the constructor.
    if self.numBest is None:
        return allSims

    # Lazily pair every strictly positive score with its document number.
    positive = (
        (docNo, sim) for docNo, sim in enumerate(allSims) if sim > 0
    )

    # nlargest keeps a bounded heap of the best candidates and returns them
    # already sorted best-first. Unlike a hand-maintained "current lowest"
    # sentinel, it makes no assumption that scores stay below 1.
    return heapq.nlargest(self.numBest, positive, key=lambda item: item[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment