Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Creates corpus for all movie years.
from pattern.vector import *
import glob
from string import *
import operator
# Module-level accumulator: create_document() appends every Document it builds here.
documents = []


def create_document(script, name):
    """Build a Document from a script's text and append it to ``documents``.

    Args:
        script: Full text of the script to tokenize and count.
        name: Name to attach to the resulting document.

    Returns:
        None. The new document is stored in the module-level ``documents`` list.
    """
    def keep_word(w):
        # Keep only purely alphabetic words that are longer than one character.
        return w.isalpha() and len(w) > 1

    options = {
        'filter': keep_word,
        'punctuation': '[]():;,.!?\n\r\t\f ',
        'top': None,          # Filter words not in the top most frequent.
        'threshold': 0,       # Filter words whose count falls below threshold.
        'stemmer': LEMMA,     # STEMMER | LEMMA | function | None.
        'exclude': [],        # Filter words in the exclude list.
        'stopwords': False,   # Include stop words?
        'name': name,
        'type': None,
    }
    # When debugging, document.keywords(top=10) and document.terms can be
    # inspected on the object built below.
    documents.append(Document(script, **options))
import os  # needed below to derive the document name from each file path

# Build one document per script file found under years/.
# glob returns matches in arbitrary order, so sort them to keep the order of
# documents in the corpus reproducible between runs.
for script_file in sorted(glob.glob('years/*.txt')):
    # The document name is the file name up to its first '.', e.g. a path like
    # 'years/1999.txt' yields '1999'. os.path.basename copes with whichever
    # path separator the platform's glob produces, unlike splitting on '/'.
    year = os.path.basename(script_file).split('.')[0]
    # 'with' guarantees the file is closed even if reading raises.
    with open(script_file, 'r') as f:
        script = f.read()
    create_document(script, year)

# Weight the collected documents with TF-IDF, show them, and persist the corpus.
corpus = Corpus(documents=documents, weight=TFIDF)
print(corpus.documents)
corpus.save('years.corpus')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment