Created
February 22, 2021 11:10
-
-
Save mjam03/414f08ff01d1bad65dc1b1b53adfee12 to your computer and use it in GitHub Desktop.
test_1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hacky workaround: the ONS website returns HTTP 403 to requests that look
# automated, so we spoof a desktop-browser header set. Technique from:
# https://stackoverflow.com/questions/13303449/urllib2-httperror-http-error-403-forbidden
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
# Dataset index page for the ONS weekly provisional deaths figures
# (England & Wales) - ONS_DEATH_STRING comically named
ONS_ROOT = 'https://www.ons.gov.uk'
ONS_DEATH_STRING = '/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales/'
# Request the index page, parse it, and collect every hyperlink target.
# `with` closes the HTTP response once parsed (the original leaked it);
# `find_all` is the modern bs4 name for the legacy `findAll` alias;
# `is not None` replaces the non-idiomatic `!= None`.
req = Request(ONS_ROOT + ONS_DEATH_STRING, headers=hdr)
with urlopen(req) as html_page:
    soup = BeautifulSoup(html_page, "lxml")
links = [link.get('href')
         for link in soup.find_all('a')
         if link.get('href') is not None]
# Filter down to the Excel-workbook download links for each year 2010-2021,
# grouped in ascending year order (a link may repeat if it matches two years,
# matching the original behaviour).
ENG_WALES_XLS = [l
                 for year in range(2010, 2022)
                 for l in links
                 if ONS_DEATH_STRING in l and str(year) in l and '.xls' in l]
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.