Created
February 24, 2011 14:51
-
-
Save Dieterbe/842243 to your computer and use it in GitHub Desktop.
Attempt to make SimilarityABC.__getitem__ faster when only the numBest most similar documents are requested.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def __getitem__(self, doc):
    """Return the similarities of `doc` to the documents in the corpus.

    If `self.normalize` is set, `doc` is first passed through
    `matutils.unitVec`. The scores themselves come from
    `self.getSimilarities(doc)`.

    Returns:
        The unmodified result of `getSimilarities` when `self.numBest` is
        None. Otherwise a list of at most `self.numBest` `(docNo, score)`
        tuples, restricted to strictly positive scores and ordered from the
        highest score to the lowest.
    """
    import heapq
    import logging
    import operator
    import time

    started = time.time()
    if self.normalize:
        doc = matutils.unitVec(doc)
    allSims = self.getSimilarities(doc)
    simsDone = time.time()

    # No limit configured: hand back every similarity untouched.
    if self.numBest is None:
        return allSims

    # Pairs of (docNo, score); non-positive scores are never reported.
    positive = (pair for pair in enumerate(allSims) if pair[1] > 0)

    # heapq.nlargest keeps only numBest candidates in a heap and returns them
    # already sorted by descending score. Unlike a hand-maintained "lowest"
    # sentinel it makes no assumption about the score range (e.g. <= 1).
    best = heapq.nlargest(self.numBest, positive, key=operator.itemgetter(1))
    selected = time.time()

    # Timing instrumentation goes to the debug log rather than stdout so that
    # indexing does not pollute the output of callers.
    logging.getLogger(__name__).debug(
        "SIM: %f SORT: %f", simsDone - started, selected - simsDone
    )
    return best
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment