Created
February 19, 2010 21:19
-
-
Save jkal/309224 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# containment.py
# Use the w-shingling technique to compute text containment.
#
# {ikalantzis,vrachnis}@ceid.upatras.gr
#
import glob
import hashlib
import operator
import os
import subprocess
import sys
import time

import lt09.config as config
class Containment(object):
    """Measure shingle overlap between tokenized documents in a directory.

    Every ``*.tokens`` file found in the input directory is read line by
    line (each stripped line is treated as one token), cut into overlapping
    w-shingles, and each shingle is hashed. The hashes of the largest
    documents are then intersected with those of every other document.
    """

    def __init__(self, inputdir, w=None):
        """Collect the token files of *inputdir* and derive their ids.

        inputdir -- directory that holds the ``*.tokens`` files.
        w        -- shingle width; when None, ``config.W`` is looked up each
                    time shingles are computed.
        """
        # Sorted so that processing order (and tie-breaking between equally
        # sized documents) does not depend on the file system.
        self.files = sorted(glob.glob('%s/*.tokens' % inputdir))
        self.docids = [self.docid(path) for path in self.files]
        self.w = w
        # Format: { docid: hashlist }
        # hashlist: list with the hashes of all shingles of that document
        self.dict = {}

    def docid(self, filename):
        """Find document id by filename.

        The id is the part of the file's base name between the last ``_``
        and the first following ``.`` (``dir/doc_42.tokens`` -> ``42``).
        Only the base name is inspected, so dots or underscores in the
        directory part of the path cannot leak into the id.
        """
        basename = os.path.basename(filename)
        return basename.split('_')[-1].split('.')[0]

    def hash(self):
        """Read every file, shingle its tokens and store the shingle hashes.

        Fills ``self.dict`` with ``{docid: [sha224 hexdigest, ...]}``.
        Files are read as bytes so the same code hashes identical values on
        Python 2 and Python 3 without guessing a text encoding.
        """
        sys.stdout.write('Extracting w-shingles and calculating hashes... ')
        sys.stdout.flush()
        for path, docid in zip(self.files, self.docids):
            with open(path, 'rb') as handle:
                tokens = [line.rstrip() for line in handle]
            # Tokens are joined with a newline, which cannot occur inside a
            # token, so different token sequences never concatenate to the
            # same byte string (e.g. ['ab', 'c'] versus ['a', 'bc']).
            self.dict[docid] = [
                hashlib.sha224(b'\n'.join(shingle)).hexdigest()
                for shingle in self.shingle(tokens)
            ]
        print('Done.')

    def shingle(self, txtlist, w=None):
        """Compute and return a list of w-shingles from the given list.

        txtlist -- sequence of tokens.
        w       -- shingle width; defaults to the width given to the
                   constructor, and to ``config.W`` when that is None too.

        Returns every window of exactly ``w`` consecutive tokens, in order.
        A non-empty input shorter than ``w`` yields a single shingle that
        holds the whole input, so short documents are not dropped; an empty
        input yields an empty list. Raises ValueError when ``w`` is < 1.
        """
        if w is None:
            w = self.w if self.w is not None else config.W
        if w < 1:
            raise ValueError('shingle width must be >= 1, got %r' % (w,))
        count = len(txtlist)
        if count <= w:
            return [txtlist[:]] if count else []
        # Stop at count - w so that no partial window shorter than w is
        # emitted at the end of the document.
        return [txtlist[start:start + w] for start in range(count - w + 1)]

    def topfiles(self, n=5):
        """Find and return the ids of the *n* largest documents.

        Size is the number of shingle hashes stored for a document, so
        ``hash()`` must have filled ``self.dict`` beforehand. The selected
        ``(docid, size)`` pairs are also printed.
        """
        sizes = [(docid, len(hashes)) for docid, hashes in self.dict.items()]
        top = sorted(sizes, key=operator.itemgetter(1), reverse=True)[:n]
        print('Top files: %s' % (top,))
        return [docid for docid, _ in top]

    def calculate(self):
        """Hash the collection and report containment for the top documents.

        For each of the largest documents, sums the number of distinct
        shingle hashes it shares with every other document, prints that
        total, and returns all totals as ``{docid: total}``.
        """
        self.hash()
        totals = {}
        for topdocid in self.topfiles():
            # Build the set once per top document, not once per comparison.
            top_hashes = set(self.dict[topdocid])
            containment = 0
            for docid, hashlist in self.dict.items():
                if docid != topdocid:
                    containment += len(top_hashes.intersection(hashlist))
            print('Total containment of %s is %s' % (topdocid, containment))
            totals[topdocid] = containment
        return totals
if __name__ == '__main__':
    # Psyco is an optional accelerator: use it when it can be imported and
    # carry on without it otherwise.
    try:
        import psyco
    except ImportError:
        print("Psyco not available.")
    else:
        psyco.full()
        print("Psyco is available. Fasten your seatbelts.")
    # The collection directory may be given as the first command-line
    # argument; without one the original default location is used.
    inputdir = sys.argv[1] if len(sys.argv) > 1 else "./wikipedia"
    Containment(inputdir).calculate()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment