Last active
February 11, 2020 08:50
-
-
Save BMU-Verlag/1de76b9d012b0dd7def8e34af4ee9bb2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import re | |
from collections import namedtuple | |
# Parsed form of one list entry: ``name`` is the text in front of the
# parentheses, ``age_range`` the raw "year-year" text found inside them.
LineContent = namedtuple(typename='LineContent', field_names='name age_range')
def get_url(url, timeout=10):
    """Download *url* and return its body if the server delivered HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The raw response body (``response.content``) when the status code is
        200 and the ``Content-Type`` header mentions ``html``; ``None`` in
        every other case, including request failures (which are reported on
        stdout).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print('error when accessing', url, '\n', e)
        return None

    # A missing Content-Type header counts as "not HTML" instead of raising
    # KeyError, which the former ``headers[...]`` lookup did.
    content_type = response.headers.get('Content-Type', '').lower()
    if response.status_code == 200 and 'html' in content_type:
        return response.content
    return None
def extract_data(response_content):
    """Parse a downloaded page and report its qualifying list entries.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, the
    entries chosen by ``get_relevant_lines`` are then handed one by one to
    ``print_line_if_age_under_60``, which does the actual printing.
    """
    soup = BeautifulSoup(response_content, 'html.parser')
    for entry in get_relevant_lines(soup):
        print_line_if_age_under_60(entry)
def print_line_if_age_under_60(line):
    """Print name and age for *line* when the computed age is below 60.

    The line is parsed with ``parse_line`` and the age derived with
    ``get_age``. Any exception raised along the way is caught and reported
    on stdout together with the offending line, so a single malformed entry
    never aborts the caller's loop.
    """
    try:
        person = parse_line(line)
        years_lived = get_age(person)
        if years_lived >= 60:
            return
        print(person.name, 'died at', years_lived)
    except Exception as error:
        print('could not parse line:', line, error)
# An entry is relevant when its first line carries a "(YYYY-YYYY)" group,
# written with either an ASCII hyphen or an en dash between the two years.
# ``match`` anchors at the start and ``.`` does not cross newlines, hence
# "first line".
_LIFESPAN_LINE = re.compile(r'.*\(\d{4}[–-]\d{4}\)')


def get_relevant_lines(raw_html):
    """Return the text of every ``<li>`` element that shows a year span.

    Args:
        raw_html: Parsed document offering ``select('li')``; each selected
            element must expose its text through a ``text`` attribute.

    Returns:
        A list with the ``text`` of each matching element, in document
        order. The list is empty when nothing matches.
    """
    texts = (element.text for element in raw_html.select('li'))
    return [text for text in texts if _LIFESPAN_LINE.match(text)]
def parse_line(line):
    """Split a list entry into the person's name and the raw year span.

    Args:
        line: Text such as ``'Niels Abel (1802–1829), Norwegian'``.

    Returns:
        ``LineContent(name, age_range)``: ``name`` is the text in front of
        the year group with surrounding whitespace removed, ``age_range``
        is the text inside the parentheses.

    Raises:
        IndexError: If the line contains no ``(`` at all.
    """
    # Anchor on the "(YYYY-YYYY)" group itself, so an earlier pair of
    # parentheses inside the name, e.g. 'John (Jack) Doe (1800-1850)', is
    # not mistaken for the year span.
    span = re.search(r'\((\d{4}[–-]\d{4})\)', line)
    if span is not None:
        return LineContent(line[:span.start()].strip(), span.group(1))

    # No well-formed year group: fall back to the first parenthesised part.
    contents = line.split('(')  # ["name", "year-year), extra"]
    return LineContent(contents[0].strip(), contents[1].split(')')[0])
def get_age(parsed_line):
    """Return the difference between the two years in ``age_range``.

    Args:
        parsed_line: Object whose ``age_range`` attribute looks like
            ``'<first year>-<second year>'``. The separator may be an ASCII
            hyphen, an en dash or an em dash.

    Returns:
        Second year minus first year, as an ``int``.

    Raises:
        IndexError: If ``age_range`` contains no separator.
        ValueError: If either part is not an integer.
    """
    # One split covers all dash variants; the '-' comes first inside the
    # character class so it is taken literally.
    years = re.split('[-–—]', parsed_line.age_range)
    return int(years[1]) - int(years[0])
if __name__ == '__main__':
    # Fetch the Wikipedia list and print each entry whose computed age is
    # under 60. The page title must be written without whitespace; the
    # former literal had a stray space inside "mathematicians".
    response_content = get_url(
        'https://en.wikipedia.org/wiki/'
        'List_of_mathematicians_born_in_the_19th_century'
    )
    if response_content is not None:
        extract_data(response_content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment