-
-
Save jonesbp/f9ffa3ec93bb8c8ec655a7403ea96b66 to your computer and use it in GitHub Desktop.
Scraping a table from a Wikipedia page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the "Longest Rivers" table from the Wikipedia list of European
# rivers and print each river as a Markdown bullet item, converting the
# in-cell wiki links to Markdown links with absolute URLs.
from bs4 import BeautifulSoup
import requests

# Fetch the page. Fail fast on HTTP errors instead of parsing an error
# page, and use a timeout so a dead connection cannot hang forever.
r = requests.get('https://en.wikipedia.org/wiki/List_of_rivers_of_Europe',
                 timeout=30)
r.raise_for_status()

# Give the text of the Wikipedia page to BeautifulSoup and have it
# parse the HTML into a structured object
soup = BeautifulSoup(r.text, 'html.parser')

# One "- …\n" entry per river; joined once at the end (repeated string
# concatenation in a loop is quadratic).
output_lines = []

# The Longest Rivers list is the second table on the page
long_rivers_table = soup.find_all('table')[1]

# Skip the header row and step through the remaining rows.
for row in long_rivers_table.find_all('tr')[1:]:
    row_parts = []
    first_cell = row.find('td')
    # Step through the contents of the first cell. Each node is either
    # a plain-text string or a link element.
    for el in first_cell.contents:
        if isinstance(el, str):  # plain text node
            row_parts.append(el)
        else:  # a link element
            # Wikipedia uses relative URLs so I need to put the right
            # stem on them for them to work as absolute URLs.
            href = "https://en.wikipedia.org{}".format(el.get('href'))
            label = el.get_text()
            # Append a Markdown-formatted link
            row_parts.append("[{}]({})".format(label, href))
    output_lines.append("- {}\n".format("".join(row_parts)))

print("".join(output_lines))
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment