Created by @philipnye, February 17, 2022 19:36
Function to scrape gov.uk article pages
import requests
from bs4 import BeautifulSoup
from datetime import datetime


# Note: relies on target_url_stub, rows_list and status_codes_list being defined at module level by the calling script # noqa: E501
def scrape_article_page(govuk_string, article_partial_url):
    target_url = target_url_stub + article_partial_url
    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})    # Gov.uk might require headers on the request (unconfirmed) # noqa: E501
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')

        # Defaults, in case any of the elements below are missing from the page
        article_type = None
        title = None
        published_date = None
        departments = []
        people = []

        # Grab article type and title from the title section
        title_section = soup.find('div', 'gem-c-title')
        if title_section is not None:
            if title_section.span is not None:
                article_type = title_section.find('span').get_text().strip()
            if title_section.h1 is not None:
                title = title_section.find('h1').get_text().strip()

        # Grab the tagged organisations and people and the publication date from the metadata section # noqa: E501
        metadata_section = soup.find('div', 'gem-c-metadata')
        if metadata_section is not None:
            entities = metadata_section.find_all('a', 'govuk-link')    # These will be both organisations and people. Class name is needed so that we don't pick up 'last updated' links # noqa: E501
            for entity in entities:
                if 'organisations' in entity.get('href'):    # Link is to an organisation page on gov.uk # noqa: E501
                    departments.append(
                        entity.get('href')    # Grab the gov.uk URL version of the department's name, rather than the text string # noqa: E501
                        .replace('/government/organisations/', '')
                    )
                else:
                    people.append(
                        entity.get('href').replace('/government/people/', '')    # Grab the gov.uk URL version of someone's name, rather than the text string # noqa: E501
                    )

            published_line = metadata_section.find(
                'dt',
                'gem-c-metadata__term',
                string='Published'
            )
            published_date = published_line.find_next('dd').get_text().strip()    # There isn't a better way of uniquely selecting this line, other than going via its sibling # noqa: E501
            published_date = datetime.strftime(    # Put date into standard format # noqa: E501
                datetime.strptime(
                    published_date,
                    '%d %B %Y'
                ),
                '%Y-%m-%d'
            )

        # For speeches, replace the published date with the 'delivered on' date, if it exists (see e.g. https://www.gov.uk/government/speeches/the-importance-of-a-knowledge-rich-curriculum) # noqa: E501
        important_section = soup.find('div', 'app-c-important-metadata')
        if important_section is not None:
            delivered_line = important_section.find(
                'dt',
                'app-c-important-metadata__term',
                string='Delivered on: '
            )
            if delivered_line is not None:
                published_date = (
                    delivered_line
                    .find_next('dd')
                    .get_text()
                    .strip()
                )

                # Remove explanatory notes ('Transcript of the speech, exactly as it was delivered', 'Original script, may differ from delivered version', "Speaker's notes, may differ from delivered version"; possibly others) # noqa: E501
                # These appear in brackets, after the date followed by a space
                pos = published_date.find('(')
                if pos != -1:    # If bracket is found
                    published_date = published_date[:pos - 1]
                published_date = datetime.strftime(    # Put date into standard format # noqa: E501
                    datetime.strptime(
                        published_date,
                        '%d %B %Y'
                    ),
                    '%Y-%m-%d'
                )

        rows_list.append({
            'govuk_string': govuk_string,
            'url': target_url,    # Full article URL - URL stub plus article partial URL # noqa: E501
            'article_type': article_type,
            'title': title,
            'date': published_date,
            'departments': departments,
            'people': people
        })
    else:
        status_codes_list.append({
            'govuk_string': govuk_string,
            'url': target_url,    # Full article URL - URL stub plus article partial URL # noqa: E501
            'status_code': r.status_code
        })
    return
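
For reference, a minimal usage sketch. The module-level names the function relies on (target_url_stub, rows_list, status_codes_list) are not defined in the gist, so the values below are assumptions; the example partial URL is the speech linked in the comments above, and the govuk_string value is a hypothetical identifier.

# Assumed module-level setup (not part of the original gist)
target_url_stub = 'https://www.gov.uk'    # Assumed base URL; partial URLs such as '/government/speeches/...' are appended to it # noqa: E501
rows_list = []            # Collects one dict per successfully scraped article
status_codes_list = []    # Collects one dict per request that didn't return 200

# Hypothetical govuk_string identifier, plus the example speech from the comments above
scrape_article_page(
    'the-importance-of-a-knowledge-rich-curriculum',
    '/government/speeches/the-importance-of-a-knowledge-rich-curriculum'
)
print(rows_list)
print(status_codes_list)

Results are accumulated in the two module-level lists rather than returned, which suits calling the function in a loop over many partial URLs and building a single table of scraped articles afterwards.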