Create a gist now

Instantly share code, notes, and snippets.

@jonesbp /rivers.py Secret
Created Feb 18, 2017

What would you like to do?
Scraping a table from a Wikipedia page
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Page to scrape; also the base against which relative hrefs are resolved.
PAGE_URL = 'https://en.wikipedia.org/wiki/List_of_rivers_of_Europe'
# The Longest Rivers list is the second table on the page (0-based index 1).
LONGEST_RIVERS_TABLE_INDEX = 1
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def markdown_link(label: str, href: str, base_url: str = PAGE_URL) -> str:
    """Return a Markdown link to *href*, resolved against *base_url*.

    Wikipedia mostly uses site-relative URLs (``/wiki/Volga``), but a cell
    can also hold absolute, protocol-relative or fragment-only links.
    ``urljoin`` resolves all of these, whereas blindly prefixing the host
    would corrupt every form except the site-relative one.
    """
    return "[{}]({})".format(label, urljoin(base_url, href))


def cell_to_markdown(cell, base_url: str = PAGE_URL) -> str:
    """Render the direct children of a table cell as one Markdown string.

    Args:
        cell: A parsed element exposing ``.contents`` (e.g. a ``<td>`` tag).
        base_url: URL used to resolve relative link targets.

    Returns:
        The concatenation of the cell's children: text nodes are copied
        verbatim, ``<a>`` elements with an ``href`` become Markdown links,
        and any other element contributes only its text.
    """
    parts = []
    for node in cell.contents:
        # Text nodes are str instances; copy them through untouched.
        if isinstance(node, str):
            parts.append(node)
            continue
        # Only a real anchor with a target is a link. Other elements
        # (<b>, <span>, <sup>, ...) have no href and must not be rendered
        # as links pointing at a "...None" URL.
        href = node.get('href') if node.name == 'a' else None
        if href:
            parts.append(markdown_link(node.get_text(), href, base_url))
        else:
            parts.append(node.get_text())
    return "".join(parts)


def table_to_markdown(table, base_url: str = PAGE_URL) -> str:
    """Return a Markdown bullet list built from the first cell of each row.

    The first ``<tr>`` of *table* is treated as the header and skipped.
    Each following sibling row contributes one ``"- ...\\n"`` line rendered
    from its first ``<td>``; rows without a ``<td>`` are skipped. An empty
    string is returned when the table has no rows at all.
    """
    header_row = table.find('tr')
    if header_row is None:
        return ""
    lines = []
    for row in header_row.find_next_siblings('tr'):
        first_cell = row.find('td')
        if first_cell is None:
            # Header-only rows (all <th>) have nothing to list.
            continue
        lines.append("- {}\n".format(cell_to_markdown(first_cell, base_url)))
    return "".join(lines)


def main() -> None:
    """Fetch the rivers page and print its Longest Rivers table as Markdown."""
    response = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error rather than parsing an error page.
    response.raise_for_status()
    # Give the text of the Wikipedia page to BeautifulSoup and have it
    # parse the HTML into a structured object.
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    if len(tables) <= LONGEST_RIVERS_TABLE_INDEX:
        raise SystemExit(
            "Expected more than {} table(s) on {}, found {}".format(
                LONGEST_RIVERS_TABLE_INDEX, PAGE_URL, len(tables)
            )
        )
    print(table_to_markdown(tables[LONGEST_RIVERS_TABLE_INDEX]))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment