Represent the Reuters-21578 collection as TF-IDF vectors using NLTK and scikit-learn.
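# Setup note (not in the original gist): the script assumes the NLTK Reuters
# corpus, the Punkt tokenizer models, and the English stop word list are
# installed. nltk.download() skips anything already present, so re-running is safe.
import nltk
nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')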
from nltk import word_tokenize
from nltk.corpus import reuters, stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

cachedStopWords = stopwords.words("english")
def tokenize(text):
    min_length = 3
    # Lowercase, tokenize, and drop stop words
    words = [word.lower() for word in word_tokenize(text)]
    words = [word for word in words if word not in cachedStopWords]
    # Stem with a single reused stemmer instead of a new instance per token
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in words]
    # Keep only alphabetic tokens of at least min_length characters
    p = re.compile('[a-zA-Z]+')
    return [token for token in tokens
            if p.match(token) and len(token) >= min_length]
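# Example (illustrative; exact stems depend on NLTK's Porter implementation):
#   tokenize("Traders were buying copper futures")
#   would return something like ['trader', 'buy', 'copper', 'futur'],
#   with the stop word "were" removed before stemming.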
# Return the fitted vectorizer, without transforming the documents
def tf_idf(docs):
    tfidf = TfidfVectorizer(tokenizer=tokenize, min_df=3, max_df=0.90,
                            max_features=1000, use_idf=True, sublinear_tf=True)
    tfidf.fit(docs)
    return tfidf
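# For reference: min_df=3 drops terms seen in fewer than 3 training documents,
# max_df=0.90 drops terms seen in more than 90% of them, max_features keeps the
# 1000 most frequent remaining terms, and sublinear_tf uses 1 + log(tf) in
# place of the raw term frequency.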
def feature_values(doc, representer):
    # Transform one document and pair each non-zero feature with its tf-idf weight
    doc_representation = representer.transform([doc])
    # Note: get_feature_names() was renamed get_feature_names_out() in newer scikit-learn
    features = representer.get_feature_names()
    return [(features[index], doc_representation[0, index])
            for index in doc_representation.nonzero()[1]]
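# Hypothetical helper (not in the original gist): sort the (term, weight) pairs
# by descending tf-idf and keep the n strongest terms for easier inspection.
def top_features(doc, representer, n=10):
    return sorted(feature_values(doc, representer),
                  key=lambda pair: pair[1], reverse=True)[:n]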
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")
    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")
    # Documents in a category
    category_docs = reuters.fileids("acq")
    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(document_id)
    print(document_words)
    # Raw document
    print(reuters.raw(document_id))
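# With the ApteMod split that NLTK ships, this should report 10788 documents
# (7769 train, 3019 test) across 90 categories.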
def main():
    # Split the raw documents into the standard train/test partitions
    train_docs = []
    test_docs = []
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))
    # Fit the vectorizer on the training documents only, then weight the test set
    representer = tf_idf(train_docs)
    for doc in test_docs:
        print(feature_values(doc, representer))

if __name__ == '__main__':
    main()
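# Note: main() prints the full (term, weight) list for every test document,
# which is extremely verbose; top_features(doc, representer) above gives a
# more readable summary per document.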