Skip to content

Instantly share code, notes, and snippets.

@pudo
Created May 18, 2015 17:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pudo/82f1c5ed5ad5fac3994b to your computer and use it in GitHub Desktop.
Save pudo/82f1c5ed5ad5fac3994b to your computer and use it in GitHub Desktop.
Scrape Wikipedia category pages
import json
import re
import mwclient
import unicodecsv
# Shared mwclient handle for English Wikipedia. It is created once at import
# time and used by scrape_category() to look up the category to crawl.
site = mwclient.Site('en.wikipedia.org')
# Matches a parenthesised suffix at the very end of a title, e.g. the
# "(politician)" in "John Doe (politician)".  The match is greedy: it starts at
# the first "(" in the string.  A raw string is used so that "\(" is not
# treated as an (invalid) string escape sequence.
disam = re.compile(r'\(.*\)$')
def get_pages(cat, _seen=None):
    """Yield every article reachable from ``cat``, descending into subcategories.

    ``cat`` is any iterable of page-like objects exposing ``namespace`` and
    ``name``.  Members in namespace 0 (articles) are yielded; members in
    namespace 14 (categories) are crawled recursively; everything else is
    ignored.

    Category graphs may contain cycles, and the same subcategory may be
    reachable through several parents.  Each category is therefore crawled at
    most once, tracked by name in ``_seen``.  Callers normally leave ``_seen``
    unset; it exists so the recursion can share one set.

    An article that belongs to several different categories is still yielded
    once per category it appears in.
    """
    if _seen is None:
        _seen = set()
        # Register the starting category so a cycle leading back to it does
        # not crawl it a second time.  Plain iterables have no name.
        root_name = getattr(cat, 'name', None)
        if root_name is not None:
            _seen.add(root_name)
    for p in cat:
        if p.namespace == 0:
            yield p
        elif p.namespace == 14:
            if p.name in _seen:
                continue
            _seen.add(p.name)
            for pp in get_pages(p, _seen):
                yield pp
def filter_page(page):
    """Return True when ``page`` should be included in the output.

    A page is rejected when its ``page_title`` is empty/falsy or when the
    title starts with ``'List of'``.
    """
    title = page.page_title
    return bool(title) and not title.startswith('List of')
def clean_title(title):
    """Return ``title`` without its trailing parenthetical and outer whitespace.

    The module-level ``disam`` pattern removes the parenthesised suffix; the
    remainder is then stripped of leading and trailing whitespace.
    """
    return disam.sub('', title).strip()
def page_url(page):
    """Build the ``http://<host>/wiki/<title>`` URL for ``page``.

    The title part is ``page.name`` passed through ``page.normalize_title``;
    the host is taken from ``page.site.host``.
    """
    normalized = page.normalize_title(page.name)
    host = page.site.host
    return 'http://%s/wiki/%s' % (host, normalized)
def scrape_category(name):
    """Crawl the Wikipedia category ``name`` and write its aliases to a CSV.

    The output file is ``<name>.csv`` in the current directory, with the
    columns ``label``, ``entity``, ``entity_url``, ``categories`` and
    ``backlinks``.  For every article accepted by ``filter_page`` one row is
    written per distinct alias, where the aliases are:

    * the article's own title,
    * the titles of its language links, and
    * the titles of backlinking pages that redirect to the article.

    Aliases are passed through ``clean_title`` and de-duplicated
    case-insensitively per article.  ``categories`` and ``backlinks`` hold
    JSON-encoded lists of page titles.  Each written row is also printed,
    without those two bulky columns, as a progress indicator.
    """
    columns = ['label', 'entity', 'entity_url', 'categories', 'backlinks']
    # The context manager closes the file even if an API call raises part-way
    # through the crawl.  Binary mode is what unicodecsv expects.
    with open('%s.csv' % name, 'wb') as fh:
        writer = unicodecsv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        cat = site.categories.get(name)
        for page in get_pages(cat):
            if not filter_page(page):
                continue
            data = {
                'entity': page.page_title,
                'entity_url': page_url(page),
                'categories': json.dumps(
                    [c.page_title for c in page.categories()]),
            }
            aliases = [page.page_title]
            aliases.extend([t for (_lang, t) in page.langlinks()])
            # Materialise the backlinks once: they are needed both for the
            # JSON column and for the redirect scan below.
            backlinks = list(page.backlinks())
            data['backlinks'] = json.dumps([c.page_title for c in backlinks])
            for bl in backlinks:
                # A backlink that is a redirect to this page is an
                # alternative name for it.
                link = bl.redirects_to()
                if link is not None and link.page_title == page.page_title:
                    aliases.append(bl.page_title)

            seen = set()
            for alias in aliases:
                alias = clean_title(alias)
                alias_norm = alias.lower()
                if alias_norm in seen:
                    continue
                seen.add(alias_norm)
                row = dict(data)
                row['label'] = alias
                writer.writerow(row)
                # Drop the long JSON columns before echoing the row.
                row.pop('categories')
                row.pop('backlinks')
                # Parenthesised single argument: same output on Python 2,
                # valid syntax on Python 3.
                print(row)
if __name__ == '__main__':
    import sys

    # Category names may be passed on the command line; with no arguments
    # the script scrapes the category it was originally written for.
    for category in sys.argv[1:] or ['Ugandan_politicians']:
        scrape_category(category)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment