Skip to content

Instantly share code, notes, and snippets.

@sgammon
Created March 26, 2011 03:16
Show Gist options
  • Save sgammon/887995 to your computer and use it in GitHub Desktop.
Save sgammon/887995 to your computer and use it in GitHub Desktop.
from sampletext import finn, stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.punkt import PunktLanguageVars
import string
# Build a small word index over the sample text `finn`.
#
# Keys are (stem, word) tuples; each value is a dict holding:
#   'count'      - how many times that exact word occurred, and
#   'substrings' - the word's proper prefixes of length >= 3, longest first
#                  (the full word itself is not included).
index_mappings = {}

# Construct the tokenizer and stemmer once instead of once per word.
_tokenizer = PunktLanguageVars()
_stemmer = PorterStemmer()

for token in _tokenizer.word_tokenize(finn.lower()):
    # Tokens may still carry periods, so strip them before validating
    # and indexing the word.
    word = token.replace('.', '')

    # NOTE(review): the stopword test is applied to the raw token (periods
    # still attached), exactly as in the original one-liner, so a token such
    # as "the." is not recognised as a stopword. Confirm whether that is
    # intended before changing it.
    if token in stopwords or not word.isalnum():
        continue

    k = (_stemmer.stem(word), word)
    if k in index_mappings:
        index_mappings[k]['count'] += 1
    else:
        index_mappings[k] = {
            'count': 1,
            # Prefix lengths len(word)-1 down to 3; empty for short words.
            'substrings': [word[:p] for p in reversed(range(3, len(word)))],
        }

# Dump the index, one "(stem, word): {...}" entry per line. The parenthesised
# single-argument form prints identically on Python 2 and Python 3.
for k, v in index_mappings.items():
    print(str(k) + ': ' + str(v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment