Created
February 10, 2015 16:42
-
-
Save elgehelge/0a69f0fb3385b8f51993 to your computer and use it in GitHub Desktop.
Efficiently query all articles from a category on Wikipedia (using Python)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests

# Fetch every article in a Wikipedia category through the MediaWiki API.
# Results arrive in batches; each response that has more data carries a
# 'continue' object whose values must be sent back with the next request.
parameters = {'action': 'query',
              'generator': 'categorymembers',
              'gcmtitle': 'Category:Marvel_Comics_superheroes',
              'prop': 'revisions',
              'rvprop': 'content',
              # The API starts returning dictionaries without the 'revisions'
              # key when requesting more than 50.
              # NOTE(review): presumably the API caps how many revision
              # contents it returns per request and continues the rest via a
              # separate 'rvcontinue' token -- confirm against the API docs.
              'gcmlimit': 50,
              'format': 'json',
              'redirects': '',
              'continue': '',
              'gcmcontinue': ''}

# Maps page id -> page dictionary as returned under response['query']['pages'].
all_pages = {}

while True:
    # HTTPS avoids an extra redirect per request; the timeout keeps a stalled
    # connection from hanging the script forever.
    response = requests.get('https://en.wikipedia.org/w/api.php',
                            params=parameters,
                            timeout=30).json()
    if 'error' in response:
        # Surface the API's own error payload to the caller.
        raise RuntimeError(response['error'])
    if 'warnings' in response:
        print(response['warnings'])

    # Store this batch BEFORE checking for the end of pagination, otherwise
    # the final batch (which has no 'continue' key) would be dropped.
    # An empty category yields no 'query' key at all, hence the defaults.
    all_pages.update(response.get('query', {}).get('pages', {}))

    if 'continue' not in response:
        # break out of the loop when we reach the end of pagination
        break

    # Feed the whole continuation object back into the next request instead
    # of hard-coding which continuation keys the server may return.
    continuation = response['continue']
    next_continue = continuation.get('gcmcontinue')
    print(next_continue)
    parameters.update(continuation)

# now all_pages contains all the articles
len(all_pages)  # 870 (at the moment of writing this Gist)
# list(...) is required because dict views are not indexable on Python 3.
list(all_pages.values())[0]['title']  # u'Warren Worthington III'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment