Created
December 6, 2013 19:31
-
-
Save markus-beuckelmann/7830768 to your computer and use it in GitHub Desktop.
Get a list of all articles in a Wikipedia (sub-)category.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def wikipedia_articles_in_category(baseurl, subcategories = False, prefix = ''):
    ''' Recursively collect all articles listed in a Wikipedia (sub-)category.

    Parameters:
        baseurl       -- full URL of a Wikipedia category page,
                         e.g. 'https://en.wikipedia.org/wiki/Category:Physics'
        subcategories -- if True, also descend into subcategories
        prefix        -- category-path prefix prepended to this page's category
                         name (used internally when recursing into subcategories)

    Returns a list of (title, url, category) tuples, where category is the
    '::'-joined path of category names leading to the article.

    Raises requests.HTTPError if any category page cannot be fetched.
    '''
    from lxml import html
    from requests import get

    page = get(baseurl)
    # Fail loudly on HTTP errors instead of silently scraping an error page.
    page.raise_for_status()
    tree = html.fromstring(page.text)

    # Site root (e.g. 'https://en.wikipedia.org'), used to absolutize the
    # relative '/wiki/...' hrefs. Hoisted out of the loops: invariant per page.
    siteroot = '/'.join(baseurl.split('/')[:-2])

    # Category name is the page heading with the namespace ('Category:') stripped.
    category = prefix + ''.join(tree.xpath('//h1[@id="firstHeading"]/span/text()')).split(':')[-1]
    separator = '::'  # joins nested category names into a path
    articles = []

    # Extract all article links on this category page.
    for element in tree.xpath('//div[@id="mw-pages"]//a[contains(@href, "/wiki/")]'):
        url, title = element.attrib['href'], element.attrib['title']
        articles.append((title, siteroot + url, category))

    # Follow the first pagination ('next page') link, if present.
    for element in tree.xpath('//div[@id="mw-pages"]/a[contains(@href, "pagefrom=")]'):
        nexturl = siteroot + element.attrib['href']
        articles += wikipedia_articles_in_category(nexturl, subcategories = subcategories, prefix = prefix)
        break

    # Recurse into subcategories, extending the category path with this category.
    if subcategories:
        for element in tree.xpath('//div[@id="mw-subcategories"]//a[contains(@href, "/wiki/")]'):
            suburl = siteroot + element.attrib['href']
            articles += wikipedia_articles_in_category(suburl, subcategories = subcategories, prefix = category + separator)

    return articles
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment