Skip to content

Instantly share code, notes, and snippets.

@macloo
Last active April 12, 2019 17:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save macloo/8171487509147e8d2abd8c523250f166 to your computer and use it in GitHub Desktop.
Save macloo/8171487509147e8d2abd8c523250f166 to your computer and use it in GitHub Desktop.
Example of stripping and splitting text
from bs4 import BeautifulSoup
import requests
# Page to scrape: one entry of the UNESCO World Heritage list.
url = 'http://whc.unesco.org/en/list/937'
# Without a timeout, requests waits indefinitely on an unresponsive server.
html = requests.get(url, timeout=30)
# Fail right away on a 4xx/5xx response instead of parsing an error page.
html.raise_for_status()
soup = BeautifulSoup(html.text, 'html.parser')
# <div class="alternate">
box = soup.find('div', attrs={'class': 'alternate'})
divs = box.find_all('div')

# Characters trimmed from both ends of each extracted value.
trim_chars = ' \n\r\t'


def _value_after(div, separator):
    """Split the div's text on *separator* and return the trimmed second piece."""
    return div.get_text().split(separator)[1].strip(trim_chars)


# This version relies on the divs appearing in a fixed order.
country = divs[0].get_text().strip(trim_chars)
latlong = divs[1].get_text().strip(trim_chars)
inscrip_date = _value_after(divs[2], ': ')   # Date of Inscription
criteria = _value_after(divs[3], ': ')       # Criteria: (x)
property_val = _value_after(divs[4], ':')    # Property : 360,000 ha
ref_val = _value_after(divs[5], ':')         # Ref: 937

# Print the six values, one per line.
box_data = [country, latlong, inscrip_date, criteria, property_val, ref_val]
print(*box_data, sep='\n')
# Dictionary that collects every field found for this location.
world_her_loc = {}

# The fields are the divs nested inside <div class="alternate">.
box = soup.find('div', attrs={'class': 'alternate'})
divs = box.find_all('div')

# Sort each div into one of three buckets; each bucket is processed later.
countries = []
no_colon = []
has_colon = []
for div in divs:
    if div.find('img'):
        # any div that contains an img is treated as a country
        bucket = countries
    elif ':' in div.get_text():
        # "label: value" style entry
        bucket = has_colon
    else:
        bucket = no_colon
    bucket.append(div)
# Countries: reduce each country div to its trimmed text.
countries_clean = [c.get_text().strip(' \n\r\t') for c in countries]
# Exactly one country is stored as a plain string; otherwise the list is kept.
country = countries_clean[0] if len(countries_clean) == 1 else countries_clean
world_her_loc['country'] = country
# Divs without a colon: either just the coordinates, or a regions line
# followed by the coordinates.
if not no_colon:
    # An empty bucket used to fall through to the "more than 2" alert below,
    # which misreported what was unusual about the page.
    print('Alert: ' + url + ' has no no_colon items!')
elif len(no_colon) == 1:
    latlong = no_colon[0].get_text().strip(' \n\r\t')
    world_her_loc['latlong'] = latlong
elif len(no_colon) == 2:
    regions = no_colon[0].get_text().strip(' \n\r\t')
    latlong = no_colon[1].get_text().strip(' \n\r\t')
    # latlong is stored before regions so the dictionary keeps its key order
    world_her_loc['latlong'] = latlong
    world_her_loc['regions'] = regions
else:
    print('Alert: ' + url + ' has more than 2 no_colon items!')
# "label: value" divs. Each entry pairs a word expected in the label with the
# dictionary key its value is stored under; the first matching entry wins.
known_labels = (
    ('Inscription', 'inscrip_date'),   # Date of Inscription
    ('Criteria', 'criteria'),          # Criteria: (x)
    ('Property', 'property_val'),      # Property : 360,000 ha
    ('Ref', 'ref_val'),                # Ref: 937
)
for item in has_colon:
    # Split on the first colon only, so a value that itself contains a colon
    # is kept whole instead of being cut off at its next colon.
    label, _, value = item.get_text().partition(':')
    for marker, key in known_labels:
        if marker in label:
            world_her_loc[key] = value.strip(' \n\r\t')
            break
    else:
        # no known label matched this div
        print('Alert: ' + url + ' has an unknown has_colon item!')
# Print every collected field. A list value gets the key on its own line
# followed by one line per element; any other value is printed as "key: value".
for k, v in world_her_loc.items():
    if isinstance(v, list):
        print(k + ": ")
        for item in v:
            print(item)
    else:
        print(k + ": " + v)
@macloo
Copy link
Author

macloo commented Apr 10, 2019

Prints:

Argentina
S42 30 0 W64 0 0
1999
(x)
360,000 ha
937

@macloo
Copy link
Author

macloo commented Apr 12, 2019

For http://whc.unesco.org/en/list/275 romy_scrape_divs.py prints:

Alert: http://whc.unesco.org/en/list/275 has an unknown has_colon item!
country:
Argentina
Brazil
latlong: S28 32 36 W54 15 57
regions: State of Rio Grande do Sul, Brazil; Province of Misiones, Argentina
inscrip_date: 1983
criteria: (iv)
ref_val: 275bis

For http://whc.unesco.org/en/list/1133 romy_scrape_divs.py prints:

Alert: http://whc.unesco.org/en/list/1133 has an unknown has_colon item!
Alert: http://whc.unesco.org/en/list/1133 has an unknown has_colon item!
country:
Albania
Austria
Belgium
Bulgaria
Croatia
Germany
Italy
Romania
Slovakia
Slovenia
Spain
Ukraine
latlong: N49 0 35 E22 20 20
inscrip_date: 2007
criteria: (ix)
property_val: 92,023.24 ha
ref_val: 1133ter

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment