@wclenhardt
Forked from wassname/scrape_Wikipedia_tables.py
Last active August 5, 2017 17:40
Scrape tables from Wikipedia using Python. Handles cells spanning multiple rows and/or columns. Writes a CSV file for each table.
# -*- coding: utf-8 -*-
"""
Scrape tables from a Wikipedia page using Python. Handles cells that span
multiple rows and/or columns. Writes a CSV file for each table.
"""
# 20170805 - Forked this to use as part of a larger project to create a 'quiz'
# to learn geographic trivia. Made some updates to the urllib syntax.
import csv
import os
import urllib.request

from bs4 import BeautifulSoup

wiki = "https://en.wikipedia.org/wiki/List_of_national_capitals_in_alphabetical_order"
header = {'User-Agent': 'Mozilla/5.0'}  # needed to prevent a 403 error from Wikipedia
req = urllib.request.Request(wiki, headers=header)
page = urllib.request.urlopen(req)
soup = BeautifulSoup(page, 'html.parser')

tables = soup.find_all("table", {"class": "wikitable"})
# show a preview of each table
for table in tables:
    print("###############")
    print(table.text[:100])
for tn, table in enumerate(tables):
    rows = table.find_all("tr")
    nrows = len(rows)
    # ncols must account for colspans, or wide cells would overflow the grid
    ncols = max(sum(int(c.get('colspan', 1)) for c in r.find_all(['th', 'td']))
                for r in rows)

    # pre-initialise an nrows x ncols grid of empty strings
    data = [['' for _ in range(ncols)] for _ in range(nrows)]
    # process html
    for i, row in enumerate(rows):
        j = 0
        for cell in row.find_all(["td", "th"]):
            # skip grid positions already filled by rowspan cells from rows above,
            # otherwise cells after a rowspan land in the wrong column
            while j < ncols and data[i][j]:
                j += 1
            # lots of cells span cols and rows, so copy the text into every
            # grid position the cell covers
            cspan = int(cell.get('colspan', 1))
            rspan = int(cell.get('rowspan', 1))
            text = ' '.join(cell.get_text().split())  # collapse internal whitespace
            for k in range(rspan):
                for l in range(cspan):
                    if i + k < nrows and j + l < ncols:
                        data[i + k][j + l] = text
            j += cspan
    # write data out; the csv module quotes cells that contain commas
    # (e.g. "Washington, D.C."), which a plain ','.join would break
    page_name = os.path.split(wiki)[1]
    fname = 'output_{}_t{}.csv'.format(page_name, tn)
    with open(fname, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for row_data in data:
            print(','.join(row_data))
            writer.writerow(row_data)
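
For comparison, a minimal sketch of the same task using pandas (this is not part of the original script, and it assumes pandas and html5lib are installed, since the "bs4" flavor parses with html5lib). pandas.read_html also expands cells that span rows or columns:

import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_national_capitals_in_alphabetical_order"
# one DataFrame per matching <table>; rowspan/colspan cells are expanded
# automatically; the bs4 flavor matches tables whose class list merely
# contains "wikitable" (e.g. "wikitable sortable")
dfs = pd.read_html(url, attrs={"class": "wikitable"}, flavor="bs4")
for tn, df in enumerate(dfs):
    df.to_csv('output_pandas_t{}.csv'.format(tn), index=False)

If Wikipedia rejects pandas' default user agent with a 403, fetch the page with the urllib Request shown above and pass the decoded HTML string to read_html instead of the URL.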