Skip to content

Instantly share code, notes, and snippets.

@jkal
Created February 19, 2010 21:19
Show Gist options
  • Save jkal/309224 to your computer and use it in GitHub Desktop.
Save jkal/309224 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# containment.py
# use shingling technique to compute text containment
#
# {ikalantzis,vrachnis}@ceid.upatras.gr
#
import glob
import hashlib
import operator
import os
import subprocess
import sys
import time

import lt09.config as config
class Containment(object):
    """Estimate text containment between token files using w-shingling.

    Every ``*.tokens`` file in the input directory is read with each line
    treated as one token, cut into overlapping windows of consecutive
    tokens (shingles), and every shingle is hashed. ``calculate`` then
    reports, for the largest documents, how many of their distinct shingle
    hashes also occur in the other documents.
    """

    def __init__(self, inputdir):
        """Collect the ``*.tokens`` files found directly inside *inputdir*."""
        # Sorted so the processing order does not depend on the arbitrary
        # order in which glob returns matches.
        self.files = sorted(glob.glob('%s/*.tokens' % inputdir))
        self.docids = [self.docid(path) for path in self.files]
        # Format: { docid:hashlist }
        # hashlist: list with hashes of all shingles
        self.dict = {}

    def docid(self, filename):
        """Find document id by filename.

        The id is the part of the file's base name between its last
        underscore and the following dot, e.g. ``dir/wiki_00123.tokens``
        gives ``00123``. A name without an underscore yields everything
        before its first dot.
        """
        # Work on the base name only: dots or underscores in the directory
        # part (such as a leading "./") must not leak into the id.
        name = os.path.basename(filename)
        return name.split('_')[-1].split('.')[0]

    def hash(self):
        """
        Open each file, compute its shingles, hash every shingle and put
        the results in ``self.dict``.

        After the call ``self.dict`` maps each docid to a list of SHA-224
        hex digests, one per shingle, in shingle order.
        """
        sys.stdout.write('Extracting w-shingles and calculating hashes... ')
        sys.stdout.flush()
        for path, docid in zip(self.files, self.docids):
            # Binary mode: the hash function consumes bytes, so no decoding
            # is needed; the context manager closes the handle promptly.
            with open(path, 'rb') as handle:
                tokens = [line.rstrip() for line in handle]
            # Tokens are joined with a newline, which cannot occur inside a
            # token, so different token sequences such as ("ab", "c") and
            # ("a", "bc") no longer produce the same digest.
            self.dict[docid] = [
                hashlib.sha224(b'\n'.join(shingle)).hexdigest()
                for shingle in self.shingle(tokens)
            ]
        sys.stdout.write('Done.\n')

    def shingle(self, txtlist, w=None):
        """Compute and return a list of w-shingles from the given list.

        A w-shingle is a window of ``w`` consecutive items of *txtlist*;
        successive windows advance by one item.

        txtlist -- list of tokens.
        w       -- window width; ``config.W`` is used when omitted.

        Only full-width windows are produced, so ``n`` items give
        ``n - w + 1`` shingles. A non-empty input shorter than ``w`` is
        returned as one short shingle so the document is still
        represented; an empty input gives ``[]``.

        Raises ValueError when the width is smaller than 1.
        """
        if w is None:
            w = config.W
        if w < 1:
            raise ValueError('shingle width must be >= 1, got %r' % (w,))
        if not txtlist:
            return []
        # At least one window, even when the input is shorter than w.
        count = max(len(txtlist) - w + 1, 1)
        return [txtlist[start:start + w] for start in range(count)]

    def topfiles(self, n=5):
        """Find and return the ids of the *n* largest files (default 5).

        Size is the number of shingle hashes stored in ``self.dict`` for a
        document, so ``hash`` has to run first. The ids are returned from
        largest to smallest; fewer than *n* come back when the collection
        is smaller. The selected ``(docid, size)`` pairs are also printed.
        """
        sizes = [(docid, len(hashes)) for docid, hashes in self.dict.items()]
        sizes.sort(key=operator.itemgetter(1), reverse=True)
        top = sizes[:n]
        print('Top files: %s' % (top,))
        return [docid for docid, _ in top]

    def calculate(self):
        """Hash the collection and report containment of the largest files.

        For each document returned by ``topfiles``, sums over every other
        document the number of distinct shingle hashes the two have in
        common, and prints that total.

        Returns a list of ``(docid, total)`` pairs in the printed order.
        """
        self.hash()
        totals = []
        for topdocid in self.topfiles():
            # Build the reference set once per top document rather than
            # once per comparison.
            tophashes = set(self.dict[topdocid])
            containment = sum(
                len(tophashes.intersection(hashlist))
                for docid, hashlist in self.dict.items()
                if docid != topdocid
            )
            print('Total containment of %s is %s' % (topdocid, containment))
            totals.append((topdocid, containment))
        return totals
if __name__ == '__main__':
    # Psyco is an optional accelerator: enable it when it can be imported,
    # otherwise carry on without it. Either way, say which path was taken.
    try:
        import psyco
        psyco.full()
        status = "Psyco is available. Fasten your seatbelts."
    except ImportError:
        status = "Psyco not available."
    print(status)

    # Run the containment report over the token files in ./wikipedia.
    collection = Containment("./wikipedia")
    collection.calculate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment