Skip to content

Instantly share code, notes, and snippets.

@markus-beuckelmann
Created December 6, 2013 19:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markus-beuckelmann/7830768 to your computer and use it in GitHub Desktop.
Get a list of all articles in a Wikipedia (sub-)category.
def wikipedia_articles_in_category(baseurl, subcategories = False, prefix = ''):
    '''Return a list of (title, url, category) tuples for every article
    listed on a Wikipedia category page.

    baseurl       -- URL of the category page, e.g.
                     "https://en.wikipedia.org/wiki/Category:Physics"
    subcategories -- if True, also recurse into subcategories
    prefix        -- category-path prefix prepended to this page's category
                     name (used internally during recursion)

    Raises requests.HTTPError if the page cannot be fetched.
    '''
    from lxml import html
    from requests import get

    page = get(baseurl)
    # Fail loudly on HTTP errors instead of scraping an error page.
    page.raise_for_status()
    tree = html.fromstring(page.text)

    # Category name: the page heading with the leading "Category:" stripped.
    heading = ''.join(tree.xpath('//h1[@id="firstHeading"]/span/text()'))
    category = prefix + heading.split(':')[-1]
    separator = '::'
    # Scheme + host of the wiki (e.g. "https://en.wikipedia.org"), hoisted
    # once instead of being recomputed for every link.
    root = '/'.join(baseurl.split('/')[:-2])

    articles = []
    # Extract all article links on this page.
    for element in tree.xpath('//div[@id="mw-pages"]//a[contains(@href, "/wiki/")]'):
        url = root + element.attrib['href']
        articles.append((element.attrib['title'], url, category))
    # Follow the "next page" pagination link, if present, to collect the
    # remaining articles of this category.
    for element in tree.xpath('//div[@id="mw-pages"]/a[contains(@href, "pagefrom=")]'):
        nexturl = root + element.attrib['href']
        articles += wikipedia_articles_in_category(nexturl, subcategories = subcategories, prefix = prefix)
        break  # at most one continuation link is needed
    # Recurse into subcategories, extending the category path with "::".
    if subcategories:
        for element in tree.xpath('//div[@id="mw-subcategories"]//a[contains(@href, "/wiki/")]'):
            suburl = root + element.attrib['href']
            articles += wikipedia_articles_in_category(suburl, subcategories = subcategories, prefix = category + separator)
    return articles
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment