Skip to content

Instantly share code, notes, and snippets.

@BMU-Verlag
Last active Feb 11, 2020
Embed
What would you like to do?
import requests
from bs4 import BeautifulSoup
import re
from collections import namedtuple
# Result of parsing one list entry: `name` is the text in front of the
# parenthesised years, `age_range` is the raw "birth-death" year string
# (the dash may be a hyphen or an en dash).
LineContent = namedtuple('LineContent', 'name age_range')
def get_url(url, timeout=10):
    """Download *url* and return its body if the response is an HTML page.

    Args:
        url: Address to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``response.content`` when the status code is 200 and the
        Content-Type header mentions "html"; otherwise ``None``.
        ``None`` is also returned when the request raises, after the
        error has been printed.
    """
    try:
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(url, timeout=timeout)
        # A missing header is treated as "not HTML" rather than a KeyError.
        content_type = response.headers.get('Content-Type', '').lower()
        if response.status_code == 200 and 'html' in content_type:
            return response.content
    except Exception as e:
        # Best effort: report the problem and fall through to None.
        print('error when accessing', url, '\n', e)
    return None
def extract_data(response_content):
    """Parse the downloaded page and report every matching list entry.

    The HTML is parsed with BeautifulSoup's 'html.parser', the entries
    that carry a year range are collected, and each of them is handed to
    print_line_if_age_under_60.
    """
    soup = BeautifulSoup(response_content, 'html.parser')
    for entry in get_relevant_lines(soup):
        print_line_if_age_under_60(entry)
def print_line_if_age_under_60(line):
    """Print the person named in *line* if they died before turning 60.

    Any exception raised while parsing or printing is caught and reported
    together with the offending line, so one bad entry does not abort the
    whole run.
    """
    try:
        person = parse_line(line)
        years_lived = get_age(person)
        if years_lived >= 60:
            return
        print(person.name, 'died at', years_lived)
    except Exception as error:
        print('could not parse line:', line, error)
# Matches text containing a parenthesised four-digit year range such as
# "(1802–1829)" or "(1802-1829)". `.` does not match newlines, so the range
# has to appear on the first line of the text being matched.
_YEAR_RANGE_LINE = re.compile(r'.*\(\d{4}(?:–|-)\d{4}\)')


def get_relevant_lines(raw_html):
    """Return the text of every <li> element that carries a year range.

    Args:
        raw_html: Parsed document offering ``select('li')``; each selected
            element must expose its text via a ``text`` attribute.

    Returns:
        A list with the text of each element, in document order, whose
        text matches the "(YYYY-YYYY)" pattern (hyphen or en dash).
    """
    # Read `.text` only once per element, then filter on the pattern.
    texts = (element.text for element in raw_html.select('li'))
    return [text for text in texts if _YEAR_RANGE_LINE.match(text)]
# Finds a parenthesised four-digit year range such as "(1802–1829)" and
# captures the part between the parentheses.
_PAREN_YEAR_RANGE = re.compile(r'\((\d{4}[–-]\d{4})\)')


def parse_line(line):
    """Split a list entry into the name and the raw year range.

    Args:
        line: Text such as "Niels Abel (1802–1829), Norwegian".

    Returns:
        LineContent(name, age_range). ``name`` is everything before the
        opening parenthesis of the year range, returned unmodified
        (trailing whitespace included); ``age_range`` is the text between
        the parentheses.

    Raises:
        IndexError: If *line* contains no "(" at all.
    """
    match = _PAREN_YEAR_RANGE.search(line)
    if match is not None:
        # Anchor on the year range itself so that earlier parentheses in
        # the name, e.g. "John (Jack) Smith (1800–1850)", are kept as part
        # of the name instead of being mistaken for the years.
        return LineContent(line[:match.start()], match.group(1))

    # No recognisable year range: fall back to splitting on the first
    # parenthesis pair.
    contents = line.split('(')  # ["name", "year-year), suffix"]
    name = contents[0]
    remaining_contents = contents[1].split(')')  # ["year-year", "suffix"]
    age_range = remaining_contents[0]
    return LineContent(name, age_range)
def get_age(parsed_line):
    """Return the difference between death year and birth year.

    Args:
        parsed_line: Object whose ``age_range`` attribute is a string of
            the form "birth-death", separated by a hyphen or an en dash.

    Returns:
        ``int(death) - int(birth)``. Only years are compared, so this is
        the age at death to within one year.

    Raises:
        ValueError: If no separator is present or a year is not an integer.
    """
    age_range = parsed_line.age_range
    # A hyphen takes precedence; otherwise the range is split on an en dash.
    separator = '-' if '-' in age_range else '–'
    years = age_range.split(separator)
    if len(years) < 2:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError('expected "birth-death" years, got %r' % (age_range,))
    return int(years[1]) - int(years[0])
if __name__ == '__main__':
    # Script entry point: download the list page and print the matching
    # entries. The URL must not contain whitespace (a stray space inside
    # "mathematicians" was removed), otherwise a different page is requested.
    response_content = get_url('https://en.wikipedia.org/wiki/List_of_mathematicians_born_in_the_19th_century')
    # get_url signals failure with None; compare by identity.
    if response_content is not None:
        extract_data(response_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment