Skip to content

Instantly share code, notes, and snippets.

@mickaellegal
Last active January 3, 2016 21:49
Show Gist options
  • Save mickaellegal/8524156 to your computer and use it in GitHub Desktop.
Save mickaellegal/8524156 to your computer and use it in GitHub Desktop.
iPython Notebook: Blog article classifier
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"1. Loading the data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Import the libraries needed\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn import cross_validation\n",
"import pandas as pd\n",
"import json"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Load the data \n",
"data = json.load(open(\"articles.json\"))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Explore the dataset\n",
"print len(data)\n",
"print data[0].keys() # Shows all the keys for one article"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1000\n",
"[u'type_of_material', u'content', u'document_type', u'byline', u'web_url', u'headline', u'lead_paragraph', u'abstract', u'multimedia', u'snippet', u'news_desk', u'word_count', u'blog', u'source', u'html', u'section_name', u'subsection_name', u'keywords', u'_id', u'pub_date', u'print_page']\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"2. Vectorizing the content of the articles"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Convert the articles to a matrix of tf-idf features\n",
"# We get rid of the common words and limit the matrix to 500 features (i.e. words of the articles)\n",
"\n",
"vec = TfidfVectorizer(max_features=500, smooth_idf=True, stop_words='english')\n",
"\n",
"train_nyt_tfidf = vec.fit_transform(map(lambda x: \"\".join(x['content']).encode('ascii', 'ignore').strip(), data))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Extracting the labels of articles\n",
"labels = map(lambda x: x['section_name'], data)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"3. Building the model & Cross-validation"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create and train a classifier\n",
"nbayes = MultinomialNB(fit_prior=False)\n",
"\n",
"# Holdout: 80% as training set - 20% as test set\n",
"X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_nyt_tfidf, labels, test_size = .2)\n",
"\n",
"# Fit the model\n",
"nbayes.fit(X_train, y_train)\n",
"\n",
"# Return the accuracy of the model \n",
"nbayes.score(X_test, y_test)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 35,
"text": [
"0.625"
]
}
],
"prompt_number": 35
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Let's examine the results and see if the model does a good job predicting the labels in the test set. \n",
"preds = nbayes.predict(X_test)\n",
"\n",
"print pd.crosstab(y_test, preds)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"col_0 Arts Books Business Day Corrections \\\n",
"row_0 \n",
"Arts 15 1 0 0 \n",
"Books 0 8 0 0 \n",
"Booming 1 0 0 0 \n",
"Business Day 1 0 11 0 \n",
"Corrections 0 0 0 1 \n",
"Dining & Wine 0 0 0 0 \n",
"Education 0 0 0 0 \n",
"Fashion & Style 2 0 0 0 \n",
"Great Homes and Destinations 1 0 0 0 \n",
"Health 0 0 1 0 \n",
"Home & Garden 2 0 0 0 \n",
"Magazine 0 1 0 1 \n",
"Movies 1 0 0 0 \n",
"N.Y. / Region 0 0 0 0 \n",
"Opinion 0 0 3 0 \n",
"Paid Death Notices 1 0 0 0 \n",
"Real Estate 0 0 1 0 \n",
"Science 0 2 1 0 \n",
"Sports 0 0 0 0 \n",
"Technology 1 0 1 0 \n",
"Theater 4 0 0 0 \n",
"Travel 2 0 0 0 \n",
"U.S. 0 0 2 0 \n",
"World 0 0 4 0 \n",
"Your Money 0 0 0 0 \n",
"\n",
"col_0 Fashion & Style Health Movies N.Y. / Region \\\n",
"row_0 \n",
"Arts 0 0 0 0 \n",
"Books 0 0 0 0 \n",
"Booming 1 0 0 0 \n",
"Business Day 1 0 0 1 \n",
"Corrections 0 0 0 0 \n",
"Dining & Wine 3 1 0 0 \n",
"Education 0 0 0 1 \n",
"Fashion & Style 5 0 0 0 \n",
"Great Homes and Destinations 0 0 0 0 \n",
"Health 0 1 0 0 \n",
"Home & Garden 1 0 0 0 \n",
"Magazine 0 0 0 0 \n",
"Movies 1 0 2 0 \n",
"N.Y. / Region 2 0 0 11 \n",
"Opinion 0 0 0 0 \n",
"Paid Death Notices 0 0 0 0 \n",
"Real Estate 0 0 0 2 \n",
"Science 0 1 0 0 \n",
"Sports 1 0 0 1 \n",
"Technology 1 0 0 0 \n",
"Theater 0 0 0 0 \n",
"Travel 1 0 0 0 \n",
"U.S. 0 0 0 1 \n",
"World 0 0 0 1 \n",
"Your Money 1 0 0 0 \n",
"\n",
"col_0 Opinion Real Estate Science Sports \\\n",
"row_0 \n",
"Arts 0 0 0 0 \n",
"Books 0 0 0 0 \n",
"Booming 0 0 0 0 \n",
"Business Day 0 0 0 1 \n",
"Corrections 0 0 0 0 \n",
"Dining & Wine 0 0 0 0 \n",
"Education 0 0 0 0 \n",
"Fashion & Style 0 0 0 0 \n",
"Great Homes and Destinations 0 0 0 0 \n",
"Health 0 0 0 0 \n",
"Home & Garden 0 0 0 0 \n",
"Magazine 0 0 0 0 \n",
"Movies 0 0 0 0 \n",
"N.Y. / Region 0 0 0 0 \n",
"Opinion 13 0 1 0 \n",
"Paid Death Notices 0 0 0 0 \n",
"Real Estate 0 1 0 0 \n",
"Science 0 0 0 0 \n",
"Sports 0 0 0 32 \n",
"Technology 0 0 0 0 \n",
"Theater 0 0 0 0 \n",
"Travel 0 0 0 0 \n",
"U.S. 1 0 0 0 \n",
"World 0 0 0 0 \n",
"Your Money 0 0 0 0 \n",
"\n",
"col_0 Technology U.S. World \n",
"row_0 \n",
"Arts 0 0 0 \n",
"Books 0 0 0 \n",
"Booming 0 0 0 \n",
"Business Day 0 2 0 \n",
"Corrections 0 0 0 \n",
"Dining & Wine 1 0 0 \n",
"Education 0 0 0 \n",
"Fashion & Style 0 0 0 \n",
"Great Homes and Destinations 0 0 0 \n",
"Health 0 0 0 \n",
"Home & Garden 0 0 0 \n",
"Magazine 0 0 0 \n",
"Movies 0 0 0 \n",
"N.Y. / Region 0 1 0 \n",
"Opinion 0 5 2 \n",
"Paid Death Notices 0 0 0 \n",
"Real Estate 0 0 0 \n",
"Science 0 0 0 \n",
"Sports 0 0 0 \n",
"Technology 0 1 0 \n",
"Theater 0 0 0 \n",
"Travel 0 0 0 \n",
"U.S. 0 9 3 \n",
"World 0 1 16 \n",
"Your Money 0 0 0 \n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/Users/mickaellegal/Documents/VirtualEnv/ENV/lib/python2.7/site-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.\n",
"\n",
" warnings.warn(d.msg, DeprecationWarning)\n",
"/Users/mickaellegal/Documents/VirtualEnv/ENV/lib/python2.7/site-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.\n",
"\n",
" warnings.warn(d.msg, DeprecationWarning)\n"
]
}
],
"prompt_number": 36
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"4. Deploying the model on yhat"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from yhat import Yhat, BaseModel\n",
"\n",
"# Create a class for the model\n",
"class NYTimesClassifier(BaseModel):\n",
" \n",
" def require(self):\n",
" pass\n",
" \n",
" def transform(self, raw_data):\n",
" return self.vec.transform(raw_data['content'])\n",
" \n",
" def predict(self, data):\n",
" return self.nbayes.predict(data)\n",
"\n",
"# Instantiate an object of the class NYTimesClassifier \n",
"myModel = NYTimesClassifier(vec=vec, nbayes=nbayes)\n",
"\n",
"# Fill in your yhat username and apikey\n",
"yh = Yhat(\"your_username\", \"your_apikey\")\n",
"\n",
"# Deploy the model on yhat\n",
"yh.deploy(\"NYTimesClassifier\", myModel)\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 37
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Testing the model on the local machine"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data_test = dict()\n",
"\n",
"data_test = {'content': [\"It\u2019s not too late.\\\n",
"\u201cSherlock\u201d is returning to PBS on Sunday for a third season, and for those who haven\u2019t watched it, it is worth starting now, and even catching up with the first two seasons (at PBS.org or Netflix.)\\\n",
"It would be unfair to say that this BBC production is the best Sherlock Holmes ever: There have been so many, and so many great ones. But it is certainly the right one for right now.\\\n",
"There is no easy explanation for why this Arthur Conan Doyle character has such a lasting hold on the public imagination; possibly only Dracula has had as many incarnations. And that may be a clue to the detective\u2019s enduring popularity.\\\n",
"Vampires, after all, supposedly symbolize uncontrolled desire and repressed sexuality. It could be that more than almost any other sleuth, Sherlock Holmes represents logic and the unapologetic triumph of reason over emotion. And especially in this age of ambivalence and subjectivity, a purely cerebral hero is particularly welcome.\\\n",
"\u201cSherlock\u201d stars Benedict Cumberbatch and was created by Steven Moffat and Mark Gatiss (\u201cDr. Who\u201d), and they, of course, depict the hero as freakishly smart and oddly talented. This Sherlock is also lissome, spirited and briskly energetic; most important, the famous detective isn\u2019t turned inside out to suit current, navel-gazing fashions.\\\n",
"Sherlock jokes about his chilly British upbringing with his older brother, Mycroft (played by Mr. Gatiss), but his psychic deprivations and sexual orientation go largely unexplored. When Sherlock tells Watson (Martin Freeman) \u201cthe game is on,\u201d it\u2019s an invitation to help him solve the case, not explore his innermost feelings. (Though it would be funny if Sherlock said it while turning on the World Cup finals.)\\\n",
"Particularly on television, it\u2019s almost impossible to find another interesting crime solver who isn\u2019t driven by childhood wounds or crippling psychological flaws or fixations. People are always trying to humanize Dracula; they too often try to do the same to Sherlock Holmes.\"]}\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print 'Results on my local machine'\n",
"transformed = myModel.transform(data_test)\n",
"\n",
"print myModel.predict(transformed)[0]\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Results on my local machine\n",
"Arts\n"
]
}
],
"prompt_number": 39
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Testing the app on the yhat server"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"yhat_pred = yh.raw_predict(\"NYTimesClassifier\", 1, data_test)\n",
"\n",
"print yhat_pred['prediction'][0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Arts\n"
]
}
],
"prompt_number": 33
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment