Skip to content

Instantly share code, notes, and snippets.

@lekevicius
Created March 14, 2013 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lekevicius/5161577 to your computer and use it in GitHub Desktop.
Save lekevicius/5161577 to your computer and use it in GitHub Desktop.
Creates a TF-IDF-weighted corpus with one document per movie year, built from the script files in years/*.txt.
from pattern.vector import *
import glob
import os
from string import *
import operator
# Documents collected so far; create_document() appends to this list and the
# corpus is later built from it.
documents = []


def create_document(script, name):
    """Build a Document from *script* and register it in ``documents``.

    The new document is appended to the module-level ``documents`` list and
    is also returned, so a caller can inspect it (for example its keywords
    or terms) without reaching into the global.

    Args:
        script: Full text to build the document from.
        name: Value passed to Document as its ``name`` (the year, in this
            file's usage).

    Returns:
        The Document that was created and appended.
    """
    document = Document(
        script,
        # Keep only purely alphabetic words longer than one character.
        filter=lambda w: w.isalpha() and len(w) > 1,
        punctuation='[]():;,.!?\n\r\t\f ',
        top=None,  # Filter words not in the top most frequent.
        threshold=0,  # Filter words whose count falls below threshold.
        stemmer=LEMMA,  # STEMMER | LEMMA | function | None.
        exclude=[],  # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        name=name,
        type=None,
    )
    documents.append(document)
    return document
# Build one document per script file; the file's base name up to the first
# dot (e.g. "1999" for years/1999.txt) becomes the document name.
# sorted() makes the document order reproducible: glob returns names in
# arbitrary, platform-dependent order.
for script_file in sorted(glob.glob('years/*.txt')):
    # os.path handles the platform's path separator, unlike splitting on '/'.
    year = os.path.basename(script_file).split('.')[0]
    # The context manager closes the file even if read() raises.
    with open(script_file, 'r') as f:
        script = f.read()
    create_document(script, year)

corpus = Corpus(documents=documents, weight=TFIDF)
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(corpus.documents)
corpus.save('years.corpus')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment