The TF-IDF and LinearSVC setup is copied verbatim from the scikit-learn text-analysis tutorial and applied to about 5,000 columns gathered from 11 NYT columnists — for example, the Maureen Dowd columns listed at /column/maureen-dowd.
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Each subdirectory of data_folder holds one columnist's articles;
# load_files uses the directory names as the class labels.
data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle=True)

# Hold out 25% of the columns for evaluation.
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)

# TF-IDF features (terms must appear in at least 3 documents and at most
# 95% of them) feeding a linear SVM.
sh_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000)),
])
sh_pipeline.fit(sh_docs_train, sh_y_train)
sh_y_predicted = sh_pipeline.predict(sh_docs_test)

# print the results
print(metrics.classification_report(sh_y_test, sh_y_predicted,
                                    target_names=sh_dataset.target_names))
Initial results:
precision recall f1-score support
charles-m-blow 0.99 0.94 0.96 81
david-brooks 0.98 0.98 0.98 169
frank-bruni 1.00 0.98 0.99 64
gail-collins 0.99 0.98 0.98 167
joe-nocera 0.95 0.95 0.95 76
maureen-dowd 0.95 0.98 0.96 125
nicholas-kristof 0.93 0.96 0.95 134
paul-krugman 0.98 0.99 0.98 157
roger-cohen 0.99 0.99 0.99 115
ross-douthat 1.00 0.94 0.97 49
thomas-l-friedman 0.98 0.98 0.98 126
avg / total 0.97 0.97 0.97 1263
import numpy as np

# BUG FIX: this snippet referred to `pipeline` and `dataset`, but the
# fitted objects from the run above are named `sh_pipeline` and
# `sh_dataset`, so it raised a NameError as written.
clf = sh_pipeline.named_steps['clf']
vect = sh_pipeline.named_steps['vect']
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
# favor of get_feature_names_out() — keep the old name to match the
# sklearn version this post was written against.
feature_names = vect.get_feature_names()
class_labels = sh_dataset.target_names

# For each columnist, print the 20 vocabulary terms with the largest SVM
# coefficients; argsort is ascending, so the last 20 indices are the
# highest-weighted features for that class.
for i, class_label in enumerate(class_labels):
    top20 = np.argsort(clf.coef_[i])[-20:]
    print("%s: %s" % (class_label,
                      " ".join(feature_names[j] for j in top20)))
Results:
charles-m-blow: zimmerman sequester week pew thankful gallup trayvon wednesday those pointed officer president continued nearly report furthermore poll must released according
david-brooks: moral series each these few speech then self cooper he culture lewinsky percent will past kerry people sort they are
frank-bruni: ones less monday there just he zelizer whose wasn evangelical isn colorado its many or last re them gay which
gail-collins: idea since perhaps giuliani all been guy ginsburg actually totally quiz who definitely was presidential going nobody pretty everybody really
joe-nocera: luke course money caro executive thus which article though indeed gun athletes retirement detainees joe football its company instance had
maureen-dowd: noting rice mushy put up poppy wrote old who christmas adding replied cheney tuesday hillary white even president said washington
nicholas-kristof: jesus isn notes my girls often united sudan then moldova one mr sometimes year found partly also yet may likewise
paul-krugman: thing which investors mainly aren isn answer even bad large claim administration example financial declared insurance fact what however mr
roger-cohen: french from century where obama course holbrooke minister perhaps land cannot words adderall before must states me has united london
ross-douthat: christian promise though post internet last critics liberals liberalism rather sweeping religious might instance instead kind well daniels liberal era
thomas-l-friedman: therefore will simon how watson putin just sandel arab more their anymore need regime israel our energy america added today
Let's just do Naive Bayes and a plain old bag of words that keeps only the words appearing in at least 50% of the documents (min_df=0.5):
import sys
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline

# Same per-columnist directory layout as the TF-IDF run above.
data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle=False)
print("n_samples: %d" % len(dataset.data))

# Hold out 25% of the columns for evaluation.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

# Plain bag of words: min_df=0.5 keeps only terms that occur in at least
# half the documents — essentially just very common words.
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.5)),
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names))
The precision metrics, followed by the most informative features — not super accurate, yet surprisingly accurate considering the features are almost entirely stopwords:
precision recall f1-score support
charles-m-blow 0.59 0.58 0.58 78
david-brooks 0.78 0.61 0.68 199
frank-bruni 0.71 0.63 0.67 75
gail-collins 0.77 0.74 0.76 158
joe-nocera 0.64 0.63 0.63 70
maureen-dowd 0.57 0.74 0.65 121
nicholas-kristof 0.84 0.75 0.79 115
paul-krugman 0.76 0.81 0.78 153
roger-cohen 0.60 0.73 0.66 112
ross-douthat 0.71 0.59 0.64 61
thomas-l-friedman 0.69 0.77 0.73 121
avg / total 0.71 0.70 0.70 1263
charles-m-blow: they we have with but be was are on this as for it is that in to and of the
david-brooks: as be with this you on have for he but they are it is that in and of to the
frank-bruni: at we be they but was is his as with on for it he in that to of and the
gail-collins: with have his we this who be you on he was it for is that and in of to the
joe-nocera: but his be has with had they on as for was he it is and in that of to the
maureen-dowd: at not be you who for as with was is his on it he that in of and to the
nicholas-kristof: by be have he was we with are on as but it for is that in of and to the
paul-krugman: with has they was this are be have as on but for it is in and that of to the
roger-cohen: an this be but he was not as has with on for it that is in and to of the
ross-douthat: was by are have this more with be on as but is it for that in to of and the
thomas-l-friedman: they you this not are be have but on with we for it is that in of to and the