Skip to content

Instantly share code, notes, and snippets.

@shanehou
Last active August 29, 2015 13:57
Show Gist options
  • Save shanehou/9808065 to your computer and use it in GitHub Desktop.
Save shanehou/9808065 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2
#coding=utf-8
#encoding=utf-8
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from collections import OrderedDict
from gensim import corpora, models, similarities
class PassageCorpus(object):
    """Re-iterable stream of tokenized passages read from a text file.

    Every line of the file is treated as one passage. Iterating yields the
    whitespace-separated tokens of each line as a list (an empty list for a
    blank line), in file order. The file is reopened on each iteration, so
    the corpus can be traversed several times without keeping every passage
    in memory.
    """

    def __init__(self, filteredFilePath="filtered.txt"):
        """Remember which file to stream passages from.

        Args:
            filteredFilePath: Path of the passage file, one passage per line.
                Defaults to ``"filtered.txt"`` in the working directory.
        """
        self.filteredFilePath = filteredFilePath

    def __iter__(self):
        """Yield the token list of each line of the passage file."""
        # The context manager closes the handle once iteration finishes (or
        # the generator is discarded) instead of leaking one per traversal.
        with open(self.filteredFilePath) as passageFile:
            for line in passageFile:
                yield line.split()
# Number of most-similar passages reported for every query passage.
TOP_N = 100

# First pass over the passage file: build the token <-> id vocabulary.
pCorpus = PassageCorpus()
corpusDict = corpora.Dictionary(pCorpus)

# Second pass: turn every passage into a sparse bag-of-words vector, then
# fit a TF-IDF model on those vectors and wrap them as a TF-IDF corpus.
bagOfWords = [corpusDict.doc2bow(tokens) for tokens in pCorpus]
tfidfModel = models.TfidfModel(bagOfWords)
tfidf = tfidfModel[bagOfWords]

# The similarity index depends only on the whole corpus, so build it once
# here rather than once per query passage inside the loop.
index = similarities.MatrixSimilarity(tfidf, num_features=len(corpusDict))

# Output format: one line per passage, "<passage no>,<similar no>,...",
# with 1-based passage numbers ordered by decreasing similarity.
with open("output.txt", 'w') as output:
    # Iterating `tfidf` yields the TF-IDF vector of each passage in file
    # order, so the passages do not have to be re-read and re-vectorized.
    for pNo, pTfidf in enumerate(tfidf, 1):
        sims = index[pTfidf]
        sortedSims = sorted(enumerate(sims), key=lambda t: t[1], reverse=True)
        # Exclude the query passage by its own index instead of assuming it
        # is ranked first: duplicate passages, all-zero vectors, or float
        # rounding can place another passage ahead of it.
        similarNos = [str(i + 1) for i, _ in sortedSims if i + 1 != pNo][:TOP_N]
        output.write(','.join([str(pNo)] + similarNos) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment