The TF-IDF and LinearSVC setup is copied verbatim from the scikit-learn text-analysis tutorial and applied to about 5,000 columns gathered from 11 NYT columnists — for example, the Maureen Dowd columns listed at /column/maureen-dowd.
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Each subdirectory of data_folder holds one columnist's articles;
# load_files uses the directory names as the class labels.
data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle=True)

# Hold out 25% of the columns for evaluation.
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)

# TF-IDF features (terms must appear in at least 3 documents and at most
# 95% of them) feeding a linear SVM.
sh_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000)),
])
sh_pipeline.fit(sh_docs_train, sh_y_train)
sh_y_predicted = sh_pipeline.predict(sh_docs_test)

# print the results
print(metrics.classification_report(sh_y_test, sh_y_predicted,
                                    target_names=sh_dataset.target_names))
Initial results:
precision recall f1-score support
charles-m-blow 0.99 0.94 0.96 81
david-brooks 0.98 0.98 0.98 169
frank-bruni 1.00 0.98 0.99 64
gail-collins 0.99 0.98 0.98 167
joe-nocera 0.95 0.95 0.95 76
maureen-dowd 0.95 0.98 0.96 125
nicholas-kristof 0.93 0.96 0.95 134
paul-krugman 0.98 0.99 0.98 157
roger-cohen 0.99 0.99 0.99 115
ross-douthat 1.00 0.94 0.97 49
thomas-l-friedman 0.98 0.98 0.98 126
avg / total 0.97 0.97 0.97 1263
import numpy as np

# BUG FIX: this snippet referred to `pipeline` and `dataset`, but the
# fitted objects from the run above are named `sh_pipeline` and
# `sh_dataset`, so it raised a NameError as written.
clf = sh_pipeline.named_steps['clf']
vect = sh_pipeline.named_steps['vect']
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
# favor of get_feature_names_out() — keep the old name to match the
# sklearn version this post was written against.
feature_names = vect.get_feature_names()
class_labels = sh_dataset.target_names

# For each columnist, print the 20 vocabulary terms with the largest SVM
# coefficients; argsort is ascending, so the last 20 indices are the
# highest-weighted features for that class.
for i, class_label in enumerate(class_labels):
    top20 = np.argsort(clf.coef_[i])[-20:]
    print("%s: %s" % (class_label,
                      " ".join(feature_names[j] for j in top20)))
Results:
charles-m-blow: zimmerman sequester week pew thankful gallup trayvon wednesday those pointed officer president continued nearly report furthermore poll must released according
david-brooks: moral series each these few speech then self cooper he culture lewinsky percent will past kerry people sort they are
frank-bruni: ones less monday there just he zelizer whose wasn evangelical isn colorado its many or last re them gay which
gail-collins: idea since perhaps giuliani all been guy ginsburg actually totally quiz who definitely was presidential going nobody pretty everybody really
joe-nocera: luke course money caro executive thus which article though indeed gun athletes retirement detainees joe football its company instance had
maureen-dowd: noting rice mushy put up poppy wrote old who christmas adding replied cheney tuesday hillary white even president said washington
nicholas-kristof: jesus isn notes my girls often united sudan then moldova one mr sometimes year found partly also yet may likewise
paul-krugman: thing which investors mainly aren isn answer even bad large claim administration example financial declared insurance fact what however mr
roger-cohen: french from century where obama course holbrooke minister perhaps land cannot words adderall before must states me has united london
ross-douthat: christian promise though post internet last critics liberals liberalism rather sweeping religious might instance instead kind well daniels liberal era
thomas-l-friedman: therefore will simon how watson putin just sandel arab more their anymore need regime israel our energy america added today
Let's just do Naive Bayes and a plain old bag of words that keeps only the words appearing in at least 50% of the documents (min_df=0.5):
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline

# Same per-columnist directory layout as the TF-IDF run above.
data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle=False)
print("n_samples: %d" % len(dataset.data))

# Hold out 25% of the columns for evaluation.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

# Plain bag of words: min_df=0.5 keeps only terms that occur in at least
# half the documents — essentially just very common words.
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.5)),
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names))
The precision metrics, followed by the most informative features — not super accurate, yet surprisingly accurate considering the features are almost entirely stopwords:
precision recall f1-score support
charles-m-blow 0.59 0.58 0.58 78
david-brooks 0.78 0.61 0.68 199
frank-bruni 0.71 0.63 0.67 75
gail-collins 0.77 0.74 0.76 158
joe-nocera 0.64 0.63 0.63 70
maureen-dowd 0.57 0.74 0.65 121
nicholas-kristof 0.84 0.75 0.79 115
paul-krugman 0.76 0.81 0.78 153
roger-cohen 0.60 0.73 0.66 112
ross-douthat 0.71 0.59 0.64 61
thomas-l-friedman 0.69 0.77 0.73 121
avg / total 0.71 0.70 0.70 1263
charles-m-blow: they we have with but be was are on this as for it is that in to and of the
david-brooks: as be with this you on have for he but they are it is that in and of to the
frank-bruni: at we be they but was is his as with on for it he in that to of and the
gail-collins: with have his we this who be you on he was it for is that and in of to the
joe-nocera: but his be has with had they on as for was he it is and in that of to the
maureen-dowd: at not be you who for as with was is his on it he that in of and to the
nicholas-kristof: by be have he was we with are on as but it for is that in of and to the
paul-krugman: with has they was this are be have as on but for it is in and that of to the
roger-cohen: an this be but he was not as has with on for it that is in and to of the
ross-douthat: was by are have this more with be on as but is it for that in to of and the
thomas-l-friedman: they you this not are be have but on with we for it is that in of to and the