Last active
August 29, 2015 13:57
-
-
Save shanehou/9761431 to your computer and use it in GitHub Desktop.
199801_clear_new.txt是你的源文件去掉所有空行的版本,给你发过的。_list.py用list实现了,满足你的要求……
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
#coding=utf-8 | |
#encoding=utf-8 | |
from __future__ import division | |
from collections import Counter | |
from math import log, sqrt | |
from time import clock | |
def benchmark(func):
    """Decorator: print the decorated function's name and wall-clock run
    time after every call.

    The wrapped function's return value is passed through unchanged.
    """
    # Function-local imports keep the rest of this Python 2 file untouched.
    # timeit.default_timer works on both Python 2 and 3 (time.clock was
    # removed in Python 3.8); functools.wraps preserves __name__/__doc__,
    # which the original wrapper lost.
    from functools import wraps
    from timeit import default_timer

    @wraps(func)
    def wrapper(*args, **kwargs):
        t = default_timer()
        result = func(*args, **kwargs)
        # Single print; the original `print(name),` trailing-comma idiom is
        # Python-2-only (in Python 3 it builds a throwaway tuple).
        print("{0}: {1}".format(func.__name__, default_timer() - t))
        return result
    return wrapper
class TfIdf(object):
    """Build TF-IDF weights for a whitespace-tokenized corpus and rank
    documents by cosine similarity.

    The corpus file is expected to contain one line per fragment: a first
    token whose leading 15 characters are the document id, followed by
    space-separated terms.  Fragments sharing an id are merged into one
    document.
    """

    @benchmark
    def readCorpus(self, corpusFilePath):
        """Read the corpus file and return a list of term lists, one per
        document, ordered by document id.

        Returns an empty list (after printing the error) when the file
        cannot be opened.
        """
        try:
            # `with` guarantees the handle is closed; the original's
            # `finally: corpusFile.close()` raised NameError when open()
            # itself failed, and `corpus` was then undefined below.
            with open(corpusFilePath, 'r') as corpusFile:
                corpus = [line.rstrip() for line in corpusFile]
        except IOError as err:
            print("I/O error: {0}".format(err))
            return []
        corpusDict = {}
        for doc in corpus:
            docTermsList = doc.split(' ')
            docId = docTermsList[0][0:15]
            corpusDict.setdefault(docId, []).extend(docTermsList[1:])
        # items() (not the Python-2-only iteritems()) runs under both
        # Python 2 and 3; ids are unique, so sorting the pairs sorts by id.
        return [doc for docId, doc in sorted(corpusDict.items())]

    @benchmark
    def filterStopwords(self, stopwordsFilePath, corpusList):
        """Strip stopwords (one per line in stopwordsFilePath) from every
        document of corpusList, in place, and return corpusList.
        """
        try:
            # A set makes each membership test O(1) instead of scanning a list.
            with open(stopwordsFilePath, 'r') as sFile:
                stopwords = set(line.rstrip() for line in sFile)
        except IOError as err:
            print("I/O error: {0}".format(err))
            return corpusList
        for doc in corpusList:
            # Rebuild each document instead of calling doc.remove() while
            # iterating it -- the original skipped the element following
            # every removal, so consecutive stopwords were left behind.
            doc[:] = [term for term in doc if term not in stopwords]
        return corpusList

    @benchmark
    def calcTermFreq(self, corpusList):
        """Return one {term: frequency} dict per document, where frequency
        is the term's count divided by the document length.
        """
        tfList = []
        for doc in corpusList:
            total = len(doc)
            if total == 0:
                # Guard: the original raised ZeroDivisionError on empty docs.
                tfList.append({})
                continue
            # One O(n) Counter pass replaces doc.count() per term (O(n^2)).
            tfList.append({term: count / total
                           for term, count in Counter(doc).items()})
        return tfList

    @benchmark
    def calcInverseDocFreq(self, corpusList):
        """Return one {term: idf} dict per document.

        NOTE(review): the formula is log2(N/df + 1), not the textbook
        log2(N/(df + 1)) -- kept as-is to preserve the original output;
        confirm which was intended.
        """
        docFreq = Counter()
        for doc in corpusList:
            # set(doc): each document counts a term's presence at most once.
            docFreq.update(set(doc))
        corpusCount = len(corpusList)
        return [{term: log(corpusCount / docFreq[term] + 1, 2)
                 for term in set(doc)}
                for doc in corpusList]

    @benchmark
    def calcWeight(self, tfList, idfList):
        """Multiply parallel tf and idf dicts into per-document
        {term: tf*idf} weight dicts.
        """
        return [{term: tfDoc[term] * idfDoc[term] for term in tfDoc}
                for tfDoc, idfDoc in zip(tfList, idfList)]

    @benchmark
    def calcCosineSimilarity(self, simMatrix, weightList, qryId, query, numberOfResult):
        """Return the 1-based, stringified ids of the numberOfResult
        documents most cosine-similar to `query`.

        simMatrix caches pair similarities across calls keyed on
        (smallerId, largerId): for refId < qryId the value stored by an
        earlier call is reused instead of recomputed.
        """
        # The query magnitude is loop-invariant; hoist it out of the loop.
        qryMagnitude = sqrt(sum(w * w for w in query.values()))
        simList = []
        for refId, reference in enumerate(weightList):
            if qryId > refId:
                simList.append((refId, simMatrix[(refId, qryId)]))
                continue
            shared = set(reference) & set(query)
            dotProduct = sum(reference[t] * query[t] for t in shared)
            refMagnitude = sqrt(sum(w * w for w in reference.values()))
            magnitude = refMagnitude * qryMagnitude
            # Explicit conditional instead of the fragile `and/or` idiom.
            similarity = dotProduct / magnitude if magnitude else 0.0
            simMatrix[(qryId, refId)] = similarity
            simList.append((refId, similarity))
        simList.sort(key=lambda pair: pair[1], reverse=True)
        return [str(docId + 1) for docId, sim in simList][:numberOfResult]
if __name__ == "__main__":
    start = clock()
    tfidfModel = TfIdf()
    corpusList = tfidfModel.readCorpus("199801_clear_new.txt")
    corpusList = tfidfModel.filterStopwords("stopwords.txt", corpusList)
    tfList = tfidfModel.calcTermFreq(corpusList)
    idfList = tfidfModel.calcInverseDocFreq(corpusList)
    weightList = tfidfModel.calcWeight(tfList, idfList)
    # Shared cache of pair similarities: (smallerId, largerId) -> cosine.
    simMatrix = dict()
    # `with` guarantees output_list.txt is flushed and closed even if a
    # query fails; the original never closed the handle.
    with open("output_list.txt", 'w') as output:
        for qryId, query in enumerate(weightList):
            simList = tfidfModel.calcCosineSimilarity(
                simMatrix, weightList, qryId, query, 100)
            output.write(','.join(simList))
            output.write('\n')
    # print() works under Python 2 and 3; the original bare
    # `print clock()-start` statement is a Python 3 syntax error.
    print(clock() - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment