Skip to content

Instantly share code, notes, and snippets.

@aboSamoor
Created July 25, 2013 20:52
Show Gist options
  • Save aboSamoor/6083650 to your computer and use it in GitHub Desktop.
Save aboSamoor/6083650 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "Features-Copy0"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"\n",
"\n",
"# Restrict the corpus to these four newsgroups.\n",
"categories = [\n",
"    'alt.atheism',\n",
"    'soc.religion.christian',\n",
"    'comp.graphics',\n",
"    'sci.med'\n",
"]\n",
"\n",
"# Both splits use the same category filter and the same shuffle seed.\n",
"twenty_train = fetch_20newsgroups(\n",
"    subset='train',\n",
"    categories=categories,\n",
"    shuffle=True,\n",
"    random_state=42\n",
")\n",
"twenty_test = fetch_20newsgroups(\n",
"    subset='test',\n",
"    categories=categories,\n",
"    shuffle=True,\n",
"    random_state=42\n",
")\n",
"\n",
"# Raw token counts: the vectorizer is fitted on the training documents only\n",
"# and then applied to both splits.\n",
"# NOTE: despite the Y_ prefix, the Y_test_* variables hold matrices built from\n",
"# twenty_test.data (test-set features), not labels.\n",
"count_vect = CountVectorizer()\n",
"count_vect.fit(twenty_train.data)\n",
"X_train_counts = count_vect.transform(twenty_train.data)\n",
"Y_test_counts = count_vect.transform(twenty_test.data)\n",
"\n",
"# Transformer with use_idf=False, fitted on the training counts.\n",
"tf_transformer = TfidfTransformer(use_idf=False)\n",
"tf_transformer.fit(X_train_counts)\n",
"X_train_tf = tf_transformer.transform(X_train_counts)\n",
"Y_test_tf = tf_transformer.transform(Y_test_counts)\n",
"\n",
"# Transformer with use_idf=True, fitted on the same training counts.\n",
"X_tfidf_ = TfidfTransformer(use_idf=True)\n",
"X_tfidf_.fit(X_train_counts)\n",
"X_train_tfidf = X_tfidf_.transform(X_train_counts)\n",
"Y_test_tfidf = X_tfidf_.transform(Y_test_counts)\n",
"\n",
"# Shape of the training count matrix.\n",
"print(X_train_counts.shape)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(2257, 18494)\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn import metrics\n",
"\n",
"# Number of largest-magnitude coefficients inspected per class.\n",
"N_TOP_FEATURES = 31\n",
"\n",
"# Inverse of the vectorizer vocabulary (column index -> term).\n",
"# Not read again in this cell; kept in case it is used interactively.\n",
"reversed_vocab = {v:k for k,v in count_vect.vocabulary_.iteritems()}\n",
"\n",
"# Train on the training tf-idf matrix and predict on the test tf-idf matrix.\n",
"clf = SGDClassifier(loss='log', penalty='l1', alpha=1e-5, n_iter=5)\n",
"clf.fit(X_train_tfidf, twenty_train.target)\n",
"predicted = clf.predict(Y_test_tfidf)\n",
"\n",
"\n",
"print metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names)\n",
"\n",
"# For each class, list the features with the largest absolute coefficients.\n",
"feature_names = np.asarray(count_vect.get_feature_names())\n",
"for i, category in enumerate(twenty_train.target_names):\n",
"    # With exactly two classes the second index is skipped; presumably\n",
"    # clf.coef_ then has a single row -- confirm against the sklearn docs.\n",
"    if len(twenty_train.target_names) == 2 and i == 1:\n",
"        continue\n",
"    class_coef = clf.coef_[i]\n",
"    # argsort is ascending, so reverse it to get the largest magnitudes first.\n",
"    indices = np.argsort(abs(class_coef))[::-1][:N_TOP_FEATURES]\n",
"    weights = class_coef[indices]\n",
"    print\n",
"    print \"*\" * 40\n",
"    print category\n",
"    for weight, index in zip(weights, indices):\n",
"        # Zero coefficients are not printed but still count toward the limit.\n",
"        if weight != 0:\n",
"            print feature_names[index], weight"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" precision recall f1-score support\n",
"\n",
" alt.atheism 0.93 0.84 0.88 319\n",
" comp.graphics 0.92 0.96 0.94 389\n",
" sci.med 0.95 0.92 0.93 396\n",
"soc.religion.christian 0.89 0.95 0.92 398\n",
"\n",
" avg / total 0.92 0.92 0.92 1502\n",
"\n",
"\n",
"****************************************\n",
"alt.atheism\n",
"atheism 16.7760541382\n",
"keith 12.550337349\n",
"rutgers -10.9520680884\n",
"islamic 10.5068309563\n",
"atheists 10.4078644957\n",
"rushdie 9.57782636361\n",
"morality 9.42468112523\n",
"moral 9.21028931502\n",
"umd 9.1962134749\n",
"church -9.15715192289\n",
"mathew 8.83429779925\n",
"christ -8.45800782681\n",
"thanks -8.42171306628\n",
"kmr4 8.22796487656\n",
"so 7.86312428944\n",
"christians -7.85892815499\n",
"wingate 7.62471260906\n",
"clh -7.58749581812\n",
"matthew 7.50076415621\n",
"satan 7.44317351953\n",
"may -7.43875109544\n",
"psuvm 7.37657804559\n",
"evil 7.36177472187\n",
"okcforum 7.28853265604\n",
"rights 7.10098361628\n",
"msg -7.05539802231\n",
"mangoe 7.01350500577\n",
"lippard 6.98419228055\n",
"hiv -6.89484933605\n",
"islam 6.8939270716\n",
"liar 6.87284221869\n",
"\n",
"****************************************\n",
"comp.graphics\n",
"graphics 19.6033446295\n",
"tiff 13.2556855151\n",
"points 12.3648095777\n",
"image 12.03755074\n",
"images 11.2724491309\n",
"files 10.5224717397\n",
"virtual 9.79310808774\n",
"sphere 9.74141168702\n",
"software 9.43607720443\n",
"3d 9.43030449818\n",
"video 9.39755744296\n",
"3do 9.15621005795\n",
"keyboard -8.97735885939\n",
"42 8.91233821226\n",
"my -8.62256224259\n",
"file 8.57653671527\n",
"code 8.06541542424\n",
"animation 7.87315370552\n",
"computer 7.82563892902\n",
"of -7.79528550243\n",
"windows 7.77685841745\n",
"color 7.74937871162\n",
"cview 7.74195414079\n",
"fractal 7.6256809391\n",
"version 7.60649467542\n",
"polygon 7.59768039088\n",
"god -7.35574337651\n",
"card 7.21948844075\n",
"people -7.1898247233\n",
"pov 7.08769076909\n",
"renderman 7.0465853062\n",
"\n",
"****************************************\n",
"sci.med\n",
"god"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" -14.4090759082\n",
"graphics -13.3393730028\n",
"msg 12.8837182415\n",
"doctor 12.521083914\n",
"health 11.6132848839\n",
"pitt 11.1759190223\n",
"treatment 10.9239130307\n",
"cancer 10.4735048784\n",
"disease 9.96999306506\n",
"photography 9.81282821773\n",
"medicine 9.32067461869\n",
"keyboard 9.01804427801\n",
"homeopathy 8.9226642377\n",
"medical 8.91824116311\n",
"pain 8.58087053796\n",
"christian -8.53992157548\n",
"christians -8.39324456306\n",
"information 7.6581907484\n",
"syndrome 7.65628518432\n",
"images -7.65426706103\n",
"lyme 7.50497315246\n",
"med 7.45695710359\n",
"jesus -7.43655963838\n",
"files -7.41462695908\n",
"church -7.3512966304\n",
"counselor 7.0693139276\n",
"video -7.05398794756\n",
"religion -6.96308676999\n",
"gordon 6.90038326594\n",
"hernia 6.88176522169\n",
"diabetes 6.86070243644\n",
"\n",
"****************************************\n",
"soc.religion.christian\n",
"church 17.4615613363\n",
"rutgers 17.3452667461\n",
"christians 16.4093744448\n",
"host -15.5884150929\n",
"nntp -15.0761008507\n",
"clh 14.6944153319\n",
"christian 14.2831937151\n",
"atheism -13.8817598606\n",
"athos 13.283608912\n",
"christ 13.0571310916\n",
"posting -12.9538003512\n",
"heaven 11.0083933286\n",
"may 9.56282761613\n",
"god 9.51992552803\n",
"graphics -9.17919322856\n",
"christianity 9.08095740539\n",
"bassili 9.01522081702\n",
"authority 8.848689572\n",
"sin 8.76008305028\n",
"easter 8.71452678808\n",
"keith -8.63029348337\n",
"hell 8.48633449604\n",
"arrogance 8.35121496492\n",
"scripture 8.08100002546\n",
"article -7.99843785333\n",
"matthew -7.97344367532\n",
"apr 7.93406529413\n",
"catholic 7.85597441708\n",
"black 7.803324629\n",
"jayne 7.48732000941\n",
"geneva 7.32659892131\n"
]
}
],
"prompt_number": 78
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment