# lekevicius/create_corpus.py

## create_corpus.py
# NOTE: the existing imports keep their original order on purpose -- with two
# wildcard imports, the later one decides which same-named export wins.
from pattern.vector import *
import glob
import os
from string import *
import operator

# Module-level accumulator: create_document() appends every Document it builds
# here, and the list is handed to Corpus(...) at the bottom of the script.
documents = []

def create_document(script, name):
	"""Build a Document from `script` and append it to `documents`.

	Args:
		script: Text content handed to `Document` as its first argument.
		name: Value passed through as the document's `name` keyword.

	Returns:
		The newly created Document (also appended to the module-level
		`documents` list as a side effect).
	"""
	document = Document(
		script,
		# Keep only purely alphabetic words longer than one character.
		filter=lambda w: w.isalpha() and len(w) > 1,
		punctuation='[]():;,.!?\n\r\t\f ',
		top=None,         # Filter words not in the top most frequent.
		threshold=0,      # Filter words whose count falls below threshold.
		stemmer=LEMMA,    # STEMMER | LEMMA | function | None.
		exclude=[],       # Filter words in the exclude list.
		stopwords=False,  # Include stop words?
		name=name,
		type=None,
	)
	documents.append(document)
	return document

# Build one Document per text file matching years/*.txt. glob makes no
# ordering promise, so sort the paths to keep the document order reproducible.
for script_file in sorted(glob.glob('years/*.txt')):
	# Document name = file name up to its first dot (e.g. "1999" for
	# years/1999.txt). os.path.basename copes with the platform's path
	# separator, which a hard-coded split on '/' does not.
	year = os.path.basename(script_file).split('.')[0]

	# The context manager closes the file even if read() raises.
	with open(script_file, 'r') as f:
		script = f.read()

	create_document(script, year)


# Assemble the collected documents into a corpus using the TFIDF weight
# constant, show them, and save the corpus to 'years.corpus'.
corpus = Corpus(documents=documents, weight=TFIDF)
# Single-argument parenthesised form behaves the same on Python 2 and 3.
print(corpus.documents)
corpus.save('years.corpus')
	from pattern.vector import *
	import glob
	from string import *
	import operator

	# Module-level accumulator: create_document() appends every Document it
	# builds here, and the list is later handed to Corpus(...).
	documents = []

	def create_document(script, name):
		"""Build a Document from `script` and append it to `documents`.

		Args:
			script: Text content handed to `Document` as its first argument.
			name: Value passed through as the document's `name` keyword.

		Returns:
			The newly created Document (also appended to the `documents`
			list as a side effect).
		"""
		document = Document(
			script,
			# Keep only purely alphabetic words longer than one character.
			filter=lambda w: w.isalpha() and len(w) > 1,
			punctuation='[]():;,.!?\n\r\t\f ',
			top=None,         # Filter words not in the top most frequent.
			threshold=0,      # Filter words whose count falls below threshold.
			stemmer=LEMMA,    # STEMMER | LEMMA | function | None.
			exclude=[],       # Filter words in the exclude list.
			stopwords=False,  # Include stop words?
			name=name,
			type=None,
		)
		documents.append(document)
		return document

	# Build one Document per text file matching years/*.txt. glob makes no
	# ordering promise, so sort the paths to keep the order reproducible.
	for script_file in sorted(glob.glob('years/*.txt')):
		# Document name = file name up to its first dot (e.g. "1999" for
		# years/1999.txt). os.path.basename copes with the platform's path
		# separator, which a hard-coded split on '/' does not.
		year = os.path.basename(script_file).split('.')[0]

		# The context manager closes the file even if read() raises.
		with open(script_file, 'r') as f:
			script = f.read()

		create_document(script, year)


	# Assemble the collected documents into a corpus using the TFIDF weight
	# constant, show them, and save the corpus to 'years.corpus'.
	corpus = Corpus(documents=documents, weight=TFIDF)
	# Single-argument parenthesised form behaves the same on Python 2 and 3.
	print(corpus.documents)
	corpus.save('years.corpus')