Skip to content

Instantly share code, notes, and snippets.

@shanehou
Last active August 29, 2015 13:57
Show Gist options
  • Save shanehou/9761431 to your computer and use it in GitHub Desktop.
Save shanehou/9761431 to your computer and use it in GitHub Desktop.
199801_clear_new.txt是你的源文件去掉所有空行的版本,给你发过的。_list.py用list实现了,满足你的要求……
#!/usr/bin/env python2
#coding=utf-8
#encoding=utf-8
from __future__ import division
from collections import Counter
from math import log, sqrt
from time import clock
def benchmark(func):
    """Decorator that reports how long each call to ``func`` takes.

    The wrapper prints ``<function name>:`` before the call and the elapsed
    ``clock()`` time once the call returns, then hands back the result of
    ``func`` unchanged.  Positional and keyword arguments are forwarded as-is.
    """
    # Local import keeps this decorator self-contained.
    from functools import wraps

    @wraps(func)  # preserve __name__ / __doc__ of the decorated function
    def wrapper(*args, **kwargs):
        t = clock()
        # Under Python 2 the trailing comma suppresses the newline, so the
        # elapsed time printed below lands on the same line as the name.
        print(func.__name__ + ":"),
        result = func(*args, **kwargs)
        print(clock() - t)
        return result
    return wrapper
class TfIdf(object):
    """TF-IDF document-similarity model built on plain lists and dicts.

    The methods form a pipeline: ``readCorpus`` -> ``filterStopwords`` ->
    ``calcTermFreq`` / ``calcInverseDocFreq`` -> ``calcWeight`` ->
    ``calcCosineSimilarity``.  A document is identified by its position in
    the list returned by ``readCorpus``.  Every method is timed by the
    ``benchmark`` decorator.
    """

    @benchmark
    def readCorpus(self, corpusFilePath):
        """Read a corpus file and group its lines into documents.

        Each line is split on single spaces.  The first 15 characters of the
        first token form the document id and the remaining tokens are the
        document's terms.  Lines sharing a document id are concatenated in
        file order.  Blank lines are ignored.

        Returns:
            A list of term lists, ordered by document id.  If the file cannot
            be read, the error is printed and an empty list is returned.
        """
        try:
            # ``with`` closes the file on every path, including a failed read.
            with open(corpusFilePath, 'r') as corpusFile:
                corpus = [line.rstrip() for line in corpusFile]
        except IOError as err:
            print("I/O error: {0}".format(err))
            return []
        corpusDict = dict()
        for doc in corpus:
            if not doc:
                # A blank line would otherwise create a document with id ''.
                continue
            docTermsList = doc.split(' ')
            docId = docTermsList[0][0:15]
            corpusDict.setdefault(docId, []).extend(docTermsList[1:])
        return [corpusDict[docId] for docId in sorted(corpusDict)]

    @benchmark
    def filterStopwords(self, stopwordsFilePath, corpusList):
        """Remove every stopword from each document of ``corpusList``.

        The stopword file is read one stopword per line (trailing whitespace
        stripped).  Documents are filtered in place and ``corpusList`` itself
        is returned.  If the stopword file cannot be read, the error is
        printed and the corpus is returned unfiltered.
        """
        try:
            with open(stopwordsFilePath, 'r') as sFile:
                stopwords = set(line.rstrip() for line in sFile)
        except IOError as err:
            print("I/O error: {0}".format(err))
            return corpusList
        for doc in corpusList:
            # Rebuild the document instead of calling remove() while iterating
            # it: removing during iteration skips the element that follows
            # each removal.  Slice assignment keeps the same list object.
            doc[:] = [term for term in doc if term not in stopwords]
        return corpusList

    @benchmark
    def calcTermFreq(self, corpusList):
        """Return one ``{term: occurrences / len(doc)}`` dict per document.

        Relies on the module-level ``from __future__ import division`` for
        true division under Python 2.
        """
        tfList = []
        for doc in corpusList:
            docLength = len(doc)
            # Counter tallies all terms in one pass over the document.
            termCounts = Counter(doc)
            tfList.append({term: count / docLength
                           for term, count in termCounts.items()})
        return tfList

    @benchmark
    def calcInverseDocFreq(self, corpusList):
        """Return one ``{term: idf}`` dict per document.

        ``idf(term) = log2(N / df(term) + 1)`` where ``N`` is the number of
        documents and ``df(term)`` the number of documents containing the
        term.  Each returned dict holds the terms of the matching document.
        """
        docFreq = Counter()
        for doc in corpusList:
            docFreq.update(set(doc))
        corpusCount = len(corpusList)
        # The idf of a term is the same in every document: compute it once.
        idf = {term: log((corpusCount / df + 1), 2)
               for term, df in docFreq.items()}
        return [{term: idf[term] for term in set(doc)} for doc in corpusList]

    @benchmark
    def calcWeight(self, tfList, idfList):
        """Multiply term frequency by idf, document by document.

        ``tfList`` and ``idfList`` are paired positionally; each idf dict must
        contain every term of the matching tf dict.
        """
        return [{term: tf * idfDoc[term] for term, tf in tfDoc.items()}
                for tfDoc, idfDoc in zip(tfList, idfList)]

    @benchmark
    def calcCosineSimilarity(self, simMatrix, weightList, qryId, query, numberOfResult):
        """Rank all documents by cosine similarity to ``query``.

        Args:
            simMatrix: dict used as a cache of already computed similarities,
                keyed by ``(smaller index, larger index)``; read and updated.
            weightList: list of ``{term: weight}`` dicts, one per document.
            qryId: index of ``query`` within ``weightList``.
            query: the ``{term: weight}`` dict of the query document.
            numberOfResult: maximum number of document numbers to return.

        Returns:
            1-based document numbers as strings, most similar first.
        """
        # Both values depend only on the query, so compute them once.
        qryMagnitude = sqrt(sum(x * x for x in query.values()))
        queryTerms = set(query)
        simList = []
        for refId, reference in enumerate(weightList):
            if qryId > refId:
                cacheKey = (refId, qryId)
            else:
                cacheKey = (qryId, refId)
            if qryId > refId and cacheKey in simMatrix:
                # Similarity is symmetric: reuse the value stored when the
                # lower-indexed document was the query.
                similarity = simMatrix[cacheKey]
            else:
                # Also reached on a cache miss, so queries may be issued in
                # any order.
                intersection = set(reference) & queryTerms
                dotProduct = sum([reference[x] * query[x] for x in intersection])
                refMagnitude = sqrt(sum(x * x for x in reference.values()))
                magnitude = refMagnitude * qryMagnitude
                # An all-zero or empty vector has no direction: score it 0.
                similarity = dotProduct / magnitude if magnitude else 0.0
                simMatrix[cacheKey] = similarity
            simList.append((refId, similarity))
        ranked = sorted(simList, key=lambda t: t[1], reverse=True)
        return [str(docId + 1) for docId, sim in ranked[:numberOfResult]]
if __name__ == "__main__":
    # Script entry point: build TF-IDF weights for the corpus, then write one
    # line per document listing the numbers of its 100 most similar documents.
    start = clock()
    tfidfModel = TfIdf()
    corpusList = tfidfModel.readCorpus("199801_clear_new.txt")
    corpusList = tfidfModel.filterStopwords("stopwords.txt", corpusList)
    tfList = tfidfModel.calcTermFreq(corpusList)
    idfList = tfidfModel.calcInverseDocFreq(corpusList)
    weightList = tfidfModel.calcWeight(tfList, idfList)
    # Shared across queries so each pair of documents is scored only once.
    simMatrix = dict()
    # ``with`` guarantees the output file is flushed and closed, even if a
    # similarity computation raises part-way through.
    with open("output_list.txt", 'w') as output:
        for qryId, query in enumerate(weightList):
            simList = tfidfModel.calcCosineSimilarity(simMatrix, weightList, qryId, query, 100)
            output.write(','.join(simList))
            output.write('\n')
    # Total wall/CPU time as reported by clock().
    print(clock() - start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment