Last active
April 12, 2019 17:36
-
-
Save macloo/8171487509147e8d2abd8c523250f166 to your computer and use it in GitHub Desktop.
Example of stripping and splitting text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the <div class="alternate"> box of one UNESCO World Heritage List
# page and print six values taken from its inner divs: country, lat/long,
# date of inscription, criteria, property size and reference number.
# This version relies on the inner divs appearing at fixed positions 0-5.
from bs4 import BeautifulSoup
import requests

url = 'http://whc.unesco.org/en/list/937'

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as if it
# were the real document.
html = requests.get(url, timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, 'html.parser')

# <div class="alternate">
box = soup.find("div", {"class": "alternate"})
if box is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line.
    raise ValueError('No <div class="alternate"> found at ' + url)

divs = box.find_all('div')
if len(divs) < 6:
    # The code below indexes divs[0] through divs[5].
    raise ValueError(
        'Expected at least 6 divs inside the box at ' + url
        + ', found ' + str(len(divs))
    )

# The first two divs hold plain text with no "label:" prefix.
country = divs[0].get_text().strip(' \n\r\t')
latlong = divs[1].get_text().strip(' \n\r\t')

# The remaining divs look like "Label: value". Splitting once on the first
# colon keeps a value intact even if it contains a colon itself; the strip
# removes the space that follows the colon.

# Date of Inscription
inscrip = divs[2].get_text().split(':', 1)
inscrip_date = inscrip[1].strip(' \n\r\t')

# Criteria: (x)
crit = divs[3].get_text().split(':', 1)
criteria = crit[1].strip(' \n\r\t')

# Property : 360,000 ha
prop = divs[4].get_text().split(':', 1)
property_val = prop[1].strip(' \n\r\t')

# Ref: 937
ref = divs[5].get_text().split(':', 1)
ref_val = ref[1].strip(' \n\r\t')

box_data = [country, latlong, inscrip_date, criteria, property_val, ref_val]
for item in box_data:
    print(item)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a dictionary describing one World Heritage location from the
# <div class="alternate"> box, without relying on fixed div positions.
# Each inner div is classified as a country (it contains an <img>), a
# "no colon" item (lat/long, optionally preceded by regions) or a
# "Label: value" item.
# NOTE: this section uses `soup` and `url`, which must already be defined
# before it runs.

# create new empty dictionary to hold this location
world_her_loc = {}

# <div class="alternate">
box = soup.find("div", {"class": "alternate"})
if box is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line.
    raise ValueError('No <div class="alternate"> found at ' + url)
divs = box.find_all('div')

countries = []
no_colon = []
has_colon = []

for div in divs:
    # any div that contains an img will be a country, so -
    # grab all those and process them at the end
    if div.find('img'):
        countries.append(div)
    elif ':' not in div.get_text().strip(' \n\r\t'):
        no_colon.append(div)
    else:
        has_colon.append(div)

# deal with countries
countries_clean = [c.get_text().strip(' \n\r\t') for c in countries]
# a single country is stored as a string, anything else as a list
if len(countries_clean) == 1:
    country = countries_clean[0]
else:
    country = countries_clean
world_her_loc['country'] = country

# deal with no_colon items
if len(no_colon) == 1:
    latlong = no_colon[0].get_text().strip(' \n\r\t')
    world_her_loc['latlong'] = latlong
elif len(no_colon) == 2:
    # with two items, the first is the regions text and the second lat/long
    regions = no_colon[0].get_text().strip(' \n\r\t')
    latlong = no_colon[1].get_text().strip(' \n\r\t')
    world_her_loc['latlong'] = latlong
    world_her_loc['regions'] = regions
elif not no_colon:
    # previously this case fell through to the "more than 2" alert below,
    # which misreported what was wrong with the page
    print('Alert: ' + url + ' has no no_colon items!')
else:
    print('Alert: ' + url + ' has more than 2 no_colon items!')

# deal with has_colon items
for item in has_colon:
    # Split once, on the first colon only, so a value that itself contains
    # a colon is kept whole instead of being cut at the second colon.
    label, _, value = item.get_text().partition(':')
    # Date of Inscription
    if 'Inscription' in label:
        inscrip_date = value.strip(' \n\r\t')
        world_her_loc['inscrip_date'] = inscrip_date
    # Criteria: (x)
    elif 'Criteria' in label:
        criteria = value.strip(' \n\r\t')
        world_her_loc['criteria'] = criteria
    # Property : 360,000 ha
    elif 'Property' in label:
        property_val = value.strip(' \n\r\t')
        world_her_loc['property_val'] = property_val
    # Ref: 937
    elif 'Ref' in label:
        ref_val = value.strip(' \n\r\t')
        world_her_loc['ref_val'] = ref_val
    else:
        print('Alert: ' + url + ' has an unknown has_colon item!')

# loop over dict and print each item
for k, v in world_her_loc.items():
    if isinstance(v, list):
        # list values (several countries) are printed one per line
        print(k + ": ")
        for entry in v:
            print(entry)
    else:
        print(k + ": " + v)
For http://whc.unesco.org/en/list/275 romy_scrape_divs.py prints:
Alert: http://whc.unesco.org/en/list/275 has an unknown has_colon item!
country:
Argentina
Brazil
latlong: S28 32 36 W54 15 57
regions: State of Rio Grande do Sul, Brazil; Province of Misiones, Argentina
inscrip_date: 1983
criteria: (iv)
ref_val: 275bis
For http://whc.unesco.org/en/list/1133 romy_scrape_divs.py prints:
Alert: http://whc.unesco.org/en/list/1133 has an unknown has_colon item!
Alert: http://whc.unesco.org/en/list/1133 has an unknown has_colon item!
country:
Albania
Austria
Belgium
Bulgaria
Croatia
Germany
Italy
Romania
Slovakia
Slovenia
Spain
Ukraine
latlong: N49 0 35 E22 20 20
inscrip_date: 2007
criteria: (ix)
property_val: 92,023.24 ha
ref_val: 1133ter
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Prints:
Argentina
S42 30 0 W64 0 0
1999
(x)
360,000 ha
937