-
-
Save wclenhardt/ad257676685f61393bb182ed7213cefc to your computer and use it in GitHub Desktop.
Scrape a table from wikipedia using python. Allows for cells spanning multiple rows and/or columns. Outputs csv files for each table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Scrape tables from a Wikipedia page and write each one out as CSV.

Cells spanning multiple rows and/or columns are expanded so that every
grid position a cell covers receives that cell's text.  One CSV file is
written per "wikitable" found on the page.
"""
# 20170805 - Forked this to use as part of a larger project to create a
# 'quiz' to learn geographic trivia. Made some updates to the urllib syntax.
import csv
import os
import urllib.request, urllib.parse, urllib.error

WIKI = "https://en.wikipedia.org/wiki/List_of_national_capitals_in_alphabetical_order"


def table_to_grid(table):
    """Convert an HTML <table> element into a rectangular grid of strings.

    Parameters
    ----------
    table : bs4.Tag (anything exposing ``find_all``) for a <table> element.

    Returns
    -------
    list[list[str]] -- one inner list per <tr>, padded to uniform width.
        A cell with rowspan/colspan has its text copied into every grid
        position it covers.

    The original code placed each cell at its index within its own <tr>,
    ignoring columns already claimed by rowspans from earlier rows, and
    concatenated colliding texts with ``+=``.  Here occupied columns are
    skipped instead, so spanned tables come out aligned.
    """
    grid = []  # None marks a grid position not yet claimed by any cell
    for r, row in enumerate(table.find_all("tr")):
        while len(grid) <= r:
            grid.append([])
        col = 0
        for cell in row.find_all(["td", "th"]):
            # Advance past columns already claimed by a rowspan from above.
            while col < len(grid[r]) and grid[r][col] is not None:
                col += 1
            cspan = int(cell.get("colspan", 1))
            rspan = int(cell.get("rowspan", 1))
            # Collapse newlines/runs of whitespace so each cell is one line.
            text = " ".join(cell.text.split())
            for dr in range(rspan):
                while len(grid) <= r + dr:
                    grid.append([])
                target = grid[r + dr]
                while len(target) < col + cspan:
                    target.append(None)
                for dc in range(cspan):
                    target[col + dc] = text
            col += cspan
    # Normalize: unclaimed positions become '' and all rows share one width.
    width = max((len(row) for row in grid), default=0)
    return [[c if c is not None else "" for c in row] + [""] * (width - len(row))
            for row in grid]


def main():
    """Fetch WIKI, parse every wikitable, write output_<page>_t<n>.csv files."""
    # Imported here so table_to_grid stays importable without bs4 installed.
    from bs4 import BeautifulSoup

    # Wikipedia returns 403 for urllib's default User-Agent.
    req = urllib.request.Request(WIKI, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req) as page:
        # Explicit parser avoids bs4's "no parser specified" warning and
        # parser-dependent output.
        soup = BeautifulSoup(page, "html.parser")

    tables = soup.find_all("table", {"class": "wikitable"})
    # Show a short preview of each table found.
    for table in tables:
        print("###############")
        print(table.text[:100])

    page_name = os.path.split(WIKI)[1]
    for tn, table in enumerate(tables):
        fname = "output_{}_t{}.csv".format(page_name, tn)
        # csv.writer quotes embedded commas/newlines, which the old
        # ','.join approach silently corrupted.
        with open(fname, "w", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(table_to_grid(table))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment