lambdabaa/gist:0fd274b532199d4794fd55b0f34022d1

## gistfile1.txt
st-ari1:lib ari$ ./train.py -h
usage: train.py [-h] [--debug DEBUG] [--doubleSampleSpam DOUBLESAMPLESPAM]
                [--rank RANK] [--rare2unknown RARE2UNKNOWN]
                [--stemmer STEMMER] [--tfidf TFIDF]

Train a svm based spam classifier.

optional arguments:
  -h, --help            show this help message and exit
  --debug DEBUG         whether or not to print debug messages
  --doubleSampleSpam DOUBLESAMPLESPAM
                        whether or not to double sample spam
  --rank RANK           svd rank
  --rare2unknown RARE2UNKNOWN
                        whether or not to replace some rare words with
                        <unknown>
  --stemmer STEMMER     one of porter, snowball, or none
  --tfidf TFIDF         whether or not to weight terms w/ tfidf
st-ari1:lib ari$ time ./train.py --debug 1 --rank 10
Found 3107 ham, 1265 spam
Building term-document matrix with 5073 documents!
Computing svd of term-document matrix with rank 10...
Embedding training documents...
Training SVM classifier...
Scoring classifier on 564 examples!
Error rate: 0.101064

real    1m7.126s
user    1m4.516s
sys     0m2.449s
st-ari1:lib ari$ time ./train.py --debug 0 --rank 100
Found 3107 ham, 1265 spam
Building term-document matrix with 5073 documents!
Computing svd of term-document matrix with rank 100...
Embedding training documents...
Training SVM classifier...
Scoring classifier on 564 examples!
Error rate: 0.030142

real    1m55.680s
user    1m37.925s
sys     0m17.601s
st-ari1:lib ari$ ./predict.py --rank 10 > submission.csv
st-ari1:lib ari$ head submission.csv
email_id,labels
1,1
2,1
3,0
4,0
5,1
6,0
7,0
8,1
9,0
	st-ari1:lib ari$ ./train.py -h
	usage: train.py [-h] [--debug DEBUG] [--doubleSampleSpam DOUBLESAMPLESPAM]
	                [--rank RANK] [--rare2unknown RARE2UNKNOWN]
	                [--stemmer STEMMER] [--tfidf TFIDF]

	Train a svm based spam classifier.

	optional arguments:
	  -h, --help            show this help message and exit
	  --debug DEBUG         whether or not to print debug messages
	  --doubleSampleSpam DOUBLESAMPLESPAM
	                        whether or not to double sample spam
	  --rank RANK           svd rank
	  --rare2unknown RARE2UNKNOWN
	                        whether or not to replace some rare words with
	                        <unknown>
	  --stemmer STEMMER     one of porter, snowball, or none
	  --tfidf TFIDF         whether or not to weight terms w/ tfidf
	st-ari1:lib ari$ time ./train.py --debug 1 --rank 10
	Found 3107 ham, 1265 spam
	Building term-document matrix with 5073 documents!
	Computing svd of term-document matrix with rank 10...
	Embedding training documents...
	Training SVM classifier...
	Scoring classifier on 564 examples!
	Error rate: 0.101064

	real 1m7.126s
	user 1m4.516s
	sys 0m2.449s
	st-ari1:lib ari$ time ./train.py --debug 0 --rank 100
	Found 3107 ham, 1265 spam
	Building term-document matrix with 5073 documents!
	Computing svd of term-document matrix with rank 100...
	Embedding training documents...
	Training SVM classifier...
	Scoring classifier on 564 examples!
	Error rate: 0.030142

	real 1m55.680s
	user 1m37.925s
	sys 0m17.601s
	st-ari1:lib ari$ ./predict.py --rank 10 > submission.csv
	st-ari1:lib ari$ head submission.csv
	email_id,labels
	1,1
	2,1
	3,0
	4,0
	5,1
	6,0
	7,0
	8,1
	9,0