Created
March 14, 2013 14:04
-
-
Save lekevicius/5161577 to your computer and use it in GitHub Desktop.
Creates a corpus for all movie years.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pattern.vector import *
import glob
import os
from string import *
import operator
# Module-level accumulator: create_document() appends every Document it
# builds to this list.
documents = []


def create_document(script, name):
    """Build a Document from `script` and append it to `documents`.

    Args:
        script: Full text to turn into a document.
        name: Name stored on the resulting Document.

    Returns:
        None. The new Document is appended to the module-level
        `documents` list.
    """
    options = dict(
        # Keep only purely alphabetic words longer than one character.
        filter=lambda word: word.isalpha() and len(word) > 1,
        punctuation='[]():;,.!?\n\r\t\f ',
        top=None,         # Filter words not in the top most frequent.
        threshold=0,      # Filter words whose count falls below threshold.
        stemmer=LEMMA,    # STEMMER | LEMMA | function | None.
        exclude=[],       # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        name=name,
        type=None,
    )
    documents.append(Document(script, **options))
# Build one Document per script file under years/. The file's base name
# without its extension becomes the document name ('years/1994.txt' -> '1994').
# sorted() makes the document order independent of the filesystem's listing
# order, so repeated runs produce the same corpus layout.
for script_file in sorted(glob.glob('years/*.txt')):
    # basename/splitext instead of splitting on '/' and '.': works with any
    # path separator and does not truncate names that contain extra dots.
    year = os.path.splitext(os.path.basename(script_file))[0]
    # The context manager closes the file even if read() raises.
    with open(script_file, 'r') as f:
        script = f.read()
    create_document(script, year)

# Combine all collected documents into one corpus, using TFIDF term weighting.
corpus = Corpus(documents=documents, weight=TFIDF)
# Single-argument print(...) is valid under both Python 2 and Python 3.
print(corpus.documents)
corpus.save('years.corpus')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment