Created
February 17, 2022 19:36
-
-
Save philipnye/ab04d5d96fd253d9936a7326d319a4d2 to your computer and use it in GitHub Desktop.
Function to scrape gov.uk article pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def scrape_article_page(govuk_string, article_partial_url):
    """Scrape a gov.uk article page and record the results in module-level lists.

    Parameters
    ----------
    govuk_string : str
        Identifier string carried through unchanged into the output record.
    article_partial_url : str
        Partial URL appended to the module-level ``target_url_stub`` to form
        the full article URL.

    Side effects
    ------------
    On HTTP 200, appends a dict of scraped fields to the module-level
    ``rows_list``; on any other status code, appends a dict containing the
    status code to the module-level ``status_codes_list``. Returns None.
    """
    target_url = target_url_stub + article_partial_url
    # Gov.uk might require headers on the request (unconfirmed)
    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')

        # Grab article type and title from the title section.
        # Initialise to None so the record can still be built when the page
        # lacks one of these elements (the original raised NameError then).
        article_type = None
        title = None
        title_section = soup.find('div', 'gem-c-title')
        if title_section is not None:
            if title_section.span is not None:
                article_type = title_section.span.get_text().strip()
            if title_section.h1 is not None:
                title = title_section.h1.get_text().strip()

        # Grab the tagged organisations/people and the publication date from
        # the metadata section. Defaults cover pages with no metadata section.
        departments = []
        people = []
        published_date = None
        metadata_section = soup.find('div', 'gem-c-metadata')
        if metadata_section is not None:
            # These will be both organisations and people. Class name is
            # needed so that we don't pick up 'last updated' links
            entities = metadata_section.find_all('a', 'govuk-link')
            for entity in entities:
                href = entity.get('href')
                # Grab the gov.uk URL version of the name, rather than the
                # display-text string
                if 'organisations' in href:  # Link is to an organisation page
                    departments.append(
                        href.replace('/government/organisations/', '')
                    )
                else:
                    people.append(href.replace('/government/people/', ''))

            # There isn't a better way of uniquely selecting the published
            # date than going via its sibling 'Published' label
            published_line = metadata_section.find(
                'dt',
                'gem-c-metadata__term',
                string='Published'
            )
            published_date = _format_govuk_date(
                published_line.find_next('dd').get_text().strip()
            )

            # For speeches, replace the published date with the 'Delivered on'
            # date, if it exists (see e.g.
            # https://www.gov.uk/government/speeches/the-importance-of-a-knowledge-rich-curriculum)
            important_section = soup.find('div', 'app-c-important-metadata')
            if important_section is not None:
                # BUG FIX: the original discarded this find() result and
                # re-read the 'Published' line instead, so the delivered-on
                # date was never used and the already-formatted date was
                # re-parsed with '%d %B %Y', raising ValueError.
                delivered_line = important_section.find(
                    'dt',
                    'app-c-important-metadata__term',
                    string='Delivered on: '
                )
                if delivered_line is not None:
                    delivered_date = delivered_line.find_next('dd').get_text()
                    # Remove explanatory notes ('Transcript of the speech,
                    # exactly as it was delivered', etc.). These appear in
                    # brackets after the date, preceded by a space
                    pos = delivered_date.find('(')
                    if pos != -1:  # If bracket is found
                        delivered_date = delivered_date[:pos]
                    published_date = _format_govuk_date(delivered_date.strip())

        # Renamed from 'dict' so the builtin isn't shadowed
        record = {
            'govuk_string': govuk_string,
            'url': target_url,  # Full article URL - stub plus partial URL
            'article_type': article_type,
            'title': title,
            'date': published_date,
            'departments': departments,
            'people': people,
        }
        rows_list.append(record)
    else:
        # Record failed requests along with the status code received
        record = {
            'govuk_string': govuk_string,
            'url': target_url,  # Full article URL - stub plus partial URL
            'status_code': r.status_code,
        }
        status_codes_list.append(record)
    return


def _format_govuk_date(date_string):
    """Convert a gov.uk date like '17 February 2022' to ISO 'YYYY-MM-DD'."""
    return datetime.strftime(
        datetime.strptime(date_string, '%d %B %Y'),
        '%Y-%m-%d'
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment