Skip to content

Instantly share code, notes, and snippets.

@mshuffett
Created May 23, 2014 23:21
Show Gist options
  • Save mshuffett/4daff37a849bfb6a232d to your computer and use it in GitHub Desktop.
Save mshuffett/4daff37a849bfb6a232d to your computer and use it in GitHub Desktop.
IPython Notebook Using Wikipedia API
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import division\n",
"import requests\n",
"import json\n",
"import sys"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class Query(object):\n",
"    '''Builds and runs MediaWiki API queries against en.wikipedia.org.\n",
"\n",
"    The default url arguments are action=query, prop=revisions,\n",
"    rvprop=ids, rvlimit=500 and format=json.\n",
"    '''\n",
"    QUERY_BASE = 'http://en.wikipedia.org/w/api.php'\n",
"\n",
"    def __init__(self, **kwargs):\n",
"        '''Takes keyword arguments for url arguments.\n",
"\n",
"        They are merged over the defaults, so a keyword may add an\n",
"        argument (e.g. titles) or replace a default (e.g. rvlimit).\n",
"        '''\n",
"        self.args = {\n",
"            'action': 'query',\n",
"            'prop': 'revisions',\n",
"            'format': 'json',\n",
"            'rvprop': 'ids',\n",
"            'rvlimit': '500',\n",
"        }\n",
"        self.args.update(kwargs)\n",
"\n",
"    def build(self, **kwargs):\n",
"        '''Returns the request url as a string.\n",
"\n",
"        kwargs are extra url arguments for this one call; they override\n",
"        self.args without modifying it.\n",
"        '''\n",
"        args = dict(self.args, **kwargs)\n",
"        # Let requests encode the query string: joining raw key=value pairs\n",
"        # breaks on values containing spaces, '&', '=', '#' or non-ASCII text.\n",
"        request = requests.Request('GET', type(self).QUERY_BASE, params=args)\n",
"        return request.prepare().url\n",
"\n",
"    def get(self, **kwargs):\n",
"        '''Runs the query and returns the decoded JSON response.\n",
"\n",
"        The response is also stored in self.result. Raises\n",
"        requests.HTTPError if the server replies with an error status.\n",
"        '''\n",
"        r = requests.get(self.build(**kwargs))\n",
"        # Fail with the HTTP status rather than a confusing JSON decode error.\n",
"        r.raise_for_status()\n",
"        self.result = json.loads(r.text)\n",
"        return self.result\n",
"\n",
"    def result_generator(self):\n",
"        '''Yields each response of the query, following continuation.\n",
"\n",
"        Handles both the 'continue' object of current MediaWiki releases\n",
"        and the older 'query-continue' object.\n",
"        '''\n",
"        result_json = self.get()\n",
"        yield result_json\n",
"\n",
"        while True:\n",
"            if 'continue' in result_json:\n",
"                # Every key of the 'continue' object has to be sent back.\n",
"                # str() keeps the keys valid as keyword names on Python 2,\n",
"                # where json decodes them to unicode.\n",
"                extra = dict((str(key), value) for key, value in result_json['continue'].items())\n",
"            elif 'query-continue' in result_json:\n",
"                extra = {'rvcontinue': result_json['query-continue']['revisions']['rvcontinue']}\n",
"            else:\n",
"                return\n",
"            result_json = self.get(**extra)\n",
"            yield result_json\n",
" \n",
" \n",
"class Page(object):\n",
"    '''A Wikipedia page, identified by its title, whose revisions can be counted.'''\n",
"\n",
"    def __init__(self, title):\n",
"        '''Stores the title stripped and with spaces replaced by underscores,\n",
"        and prepares the revision query for it.\n",
"        '''\n",
"        self.title = title.strip().replace(' ', '_')\n",
"        # Query with the normalised title so self.title and the request agree.\n",
"        self.query = Query(titles=self.title)\n",
"\n",
"    def get_num_revisions(self):\n",
"        '''Returns the number of revisions summed over every response batch.\n",
"\n",
"        A page entry without a 'revisions' list counts as 0 instead of\n",
"        raising KeyError. A response without 'query' or 'pages' still\n",
"        raises KeyError, so API errors are not silently counted as 0.\n",
"        '''\n",
"        revision_count = 0\n",
"\n",
"        for result_json in self.query.result_generator():\n",
"            # Iterate the values instead of indexing .values()[0], which\n",
"            # only works where dict.values() returns a list (Python 2).\n",
"            for page in result_json['query']['pages'].values():\n",
"                revision_count += len(page.get('revisions', []))\n",
"\n",
"        return revision_count"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 111
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# The 17 flood-related Wikipedia page titles whose revisions are counted.\n",
"page_titles = [\n",
"    'Tropical Storm Allison',\n",
"    'Hurricane Katrina',\n",
"    'Northeast U.S. flooding of October 2005',\n",
"    'Ka Loko Reservoir',\n",
"    'Mid-Atlantic United States flood of 2006',\n",
"    '2007 Midwest flooding',\n",
"    'Great Coastal Gale of 2007',\n",
"    'Spring 2008 Midwest floods',\n",
"    'June 2008 Midwest floods',\n",
"    '2008 Tanana Valley flood',\n",
"    '2009 Red River flood',\n",
"    'Deep South Flood of 2009',\n",
"    'May 2010 Tennessee flooding',\n",
"    'September 2010 Minnesota/Wisconsin Flood',\n",
"    '2011 Mississippi River floods',\n",
"    'Tropical Storm Lee (2011)',\n",
"    '2013 Colorado floods',\n",
"]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 107
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p = Page(page_titles[0])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 108
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p.query.build()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 109,
"text": [
"'http://en.wikipedia.org/w/api.php?titles=Tropical Storm Allison&rvlimit=500&format=json&action=query&rvprop=ids&prop=revisions'"
]
}
],
"prompt_number": 109
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p.get_num_revisions()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 110,
"text": [
"591"
]
}
],
"prompt_number": 110
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"del title"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def do_work(title):\n",
"    '''Prints and returns the revision count of the page named title.\n",
"\n",
"    The title is printed next to the count because the pool consumes\n",
"    titles with imap_unordered, so a bare number cannot be matched to\n",
"    its page.\n",
"    '''\n",
"    count = Page(title).get_num_revisions()\n",
"    # A single parenthesised argument prints identically with the Python 2\n",
"    # print statement and the Python 3 print function.\n",
"    print('{0}: {1}'.format(title, count))\n",
"    return count"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from multiprocessing import Pool\n",
"p = Pool(10)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"num_tasks = len(page_titles)\n",
"for i, _ in enumerate(p.imap_unordered(do_work, page_titles), 1):\n",
" sys.stderr.write('\\rdone {0:%}'.format(i/num_tasks))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"\r",
"done 5.882353%\r",
"done 11.764706%\r",
"done 17.647059%\r",
"done 23.529412%\r",
"done 29.411765%\r",
"done 35.294118%\r",
"done 41.176471%"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"\r",
"done 47.058824%\r",
"done 52.941176%\r",
"done 58.823529%\r",
"done 64.705882%\r",
"done 70.588235%"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"\r",
"done 76.470588%\r",
"done 82.352941%\r",
"done 88.235294%"
]
}
],
"prompt_number": "*"
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"do_work(page_titles[0])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment