# andreasvc/bowclassify.py

## bowclassify.py
"""A baseline Bag-of-Words text classification.

Usage: python3 bowclassify.py <train.txt> <test.txt> [--svm] [--tfidf] [--bigrams]
train.txt and test.txt should contain one "document" per line,
first token should be the label.
The default is to use regularized Logistic Regression and relative frequencies.
Pass --svm to use Linear SVM instead.
Pass --tfidf to use tf-idf instead of relative frequencies.
Pass --bigrams to use bigrams instead of unigrams.
"""
import sys
import getopt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix


def readcorpus(corpusfile):
	"""Read a labelled corpus with one document per line.

	Each non-blank line consists of a label (the first whitespace-delimited
	token) followed by the document text. Blank lines are skipped.

	:param corpusfile: path of a UTF-8 encoded text file.
	:returns: a tuple ``(documents, labels)`` of two parallel lists of
		strings, in file order.
	:raises ValueError: if a line contains a label but no document text;
		the message names the file and the 1-based line number.
	"""
	documents = []
	labels = []
	with open(corpusfile, encoding='utf8') as inp:
		for lineno, line in enumerate(inp, 1):
			line = line.strip()
			if not line:
				# Tolerate empty lines (e.g., a trailing newline at the end
				# of the file) instead of failing on tuple unpacking.
				continue
			fields = line.split(None, 1)
			if len(fields) != 2:
				# Same exception type as the bare unpacking error, but with
				# enough context to locate the offending line.
				raise ValueError(
						'%s:%d: expected "<label> <document>", '
						'found only a label: %r'
						% (corpusfile, lineno, fields[0]))
			label, doc = fields
			documents.append(doc)
			labels.append(label)
	return documents, labels


def main():
	"""Command line entry point: train on one corpus, evaluate on another.

	Expects exactly two positional arguments (the train and the test file)
	plus the optional flags --svm, --tfidf and --bigrams. On a malformed
	command line, the error and the module docstring are printed and the
	function returns without doing any further work. Otherwise a confusion
	matrix and a classification report for the test set are printed.
	"""
	# Parse the command line; only the presence of a flag matters.
	try:
		parsed, positional = getopt.gnu_getopt(
				sys.argv[1:], '', ['svm', 'tfidf', 'bigrams'])
		flags = {name for name, _ in parsed}
		trainfile, testfile = positional
	except (getopt.GetoptError, IndexError, ValueError) as err:
		print(err)
		print(__doc__)
		return

	# Load both corpora as parallel lists of documents and labels.
	traindocs, trainlabels = readcorpus(trainfile)
	testdocs, testlabels = readcorpus(testfile)

	# Feature extraction: lowercased word n-grams, idf weighting on request.
	if '--bigrams' in flags:
		ngrams = (2, 2)
	else:
		ngrams = (1, 1)
	vectorizer = TfidfVectorizer(
			use_idf='--tfidf' in flags,
			ngram_range=ngrams,
			lowercase=True,
			max_features=100000,
			binary=False)

	# Model selection.
	if '--svm' not in flags:
		# Default: logistic regression; the CV variant picks the
		# regularization parameter itself using cross validation.
		model = LogisticRegressionCV(
				cv=3,
				class_weight='balanced',
				max_iter=100)
	else:
		# LinearSVC needs the regularization parameter C spelled out.
		model = LinearSVC(C=1.0)

	# Chain feature extraction and model, then fit on the training data.
	pipeline = Pipeline([('vec', vectorizer), ('clf', model)])
	pipeline.fit(traindocs, trainlabels)

	# Predict labels for the held-out documents and report the results.
	predicted = pipeline.predict(testdocs)
	print('confusion matrix:\n', confusion_matrix(testlabels, predicted))
	print(classification_report(testlabels, predicted))

# Run the command line interface only when this file is executed as a
# script, so that importing the module has no side effects.
if __name__ == '__main__':
	main()