Skip to content

Instantly share code, notes, and snippets.

@philipnye
Created February 17, 2022 19:35
Show Gist options
  • Save philipnye/3409bb0cb1f5e89f61aeeaf37400f630 to your computer and use it in GitHub Desktop.
A Python function that scrapes gov.uk's "news and communications" search results page, following pagination and handing each article off to a per-article scraper.
def scrape_news_and_communications_page(
    govuk_string, page_number, article_date_min, article_date_max
):
    """Scrape one page of gov.uk news-and-communications search results.

    Builds the search URL from module-level URL snippets, fetches it, and
    passes each result's partial URL to ``scrape_article_page``. If a
    pagination nav element is present, recurses onto the next page after a
    delay. Non-200 responses are recorded in the module-level
    ``status_codes_list`` instead of raising.

    Args:
        govuk_string: Person identifier used in the gov.uk query string.
        page_number: 1-based results page to fetch.
        article_date_min: Earliest article date (string, as gov.uk expects).
        article_date_max: Latest article date (string, as gov.uk expects).

    Returns:
        None. Results are emitted via side effects (``scrape_article_page``
        calls and ``status_codes_list`` appends).
    """
    # Assemble the full search URL from the module-level stub and snippets
    target_url = (
        target_url_stub +
        target_url_newscommssnippet +
        target_url_pagesnippet +
        str(page_number) +
        target_url_peoplesnippet +
        govuk_string +
        target_url_startdatesnippet +
        article_date_min +
        target_url_enddatesnippet +
        article_date_max
    )
    # Gov.uk might require headers on the request (unconfirmed)
    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')
        results_list = soup.find('div', 'finder-results')
        # results_list is None for anyone who doesn't have any articles
        if results_list is not None:
            if results_list.ul is not None:
                # NB: looking for direct <li> descendants only
                for result in results_list.ul.find_all('li', recursive=False):
                    article_partial_url = result.find('a').get('href')
                    time.sleep(1)  # be polite: rate-limit article fetches
                    scrape_article_page(govuk_string, article_partial_url)
        # Pagination nav element exists -> assume there is a further page.
        # NOTE(review): presumably the nav is absent on the final page —
        # confirm, otherwise this would loop one page too far.
        if soup.find('nav', 'gem-c-pagination'):
            time.sleep(5)  # longer pause between page fetches
            scrape_news_and_communications_page(
                govuk_string,
                page_number + 1,  # was `page_number += 1`; avoid mutating the parameter
                article_date_min,
                article_date_max
            )
    else:
        # Record the failed request so skipped searches can be reviewed later.
        # (Renamed from `dict`, which shadowed the builtin.)
        status_record = {
            'govuk_string': govuk_string,
            'url': target_url,  # Full URL - URL stub plus partial URL
            'status_code': r.status_code
        }
        status_codes_list.append(status_record)
        return
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment