Created
February 17, 2022 19:35
-
-
Save philipnye/3409bb0cb1f5e89f61aeeaf37400f630 to your computer and use it in GitHub Desktop.
Function to scrape gov.uk's news and communications search page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def scrape_news_and_communications_page(
    govuk_string, page_number, article_date_min, article_date_max
):
    """Scrape gov.uk news-and-communications search results, page by page.

    Starting at ``page_number``, builds the search URL from the module-level
    ``target_url_*`` snippets, fetches it and calls ``scrape_article_page``
    for every result found. While the fetched page contains a pagination
    ``nav`` element, the next page number is fetched as well.

    A response whose status code is not 200 is recorded in the module-level
    ``status_codes_list`` and ends the scrape.

    Args:
        govuk_string: Value inserted into the URL after
            ``target_url_peoplesnippet`` and passed through to
            ``scrape_article_page``. Presumably a gov.uk person
            identifier - confirm against the caller.
        page_number: Number of the first results page to fetch.
        article_date_min: String appended after
            ``target_url_startdatesnippet``.
        article_date_max: String appended after
            ``target_url_enddatesnippet``.

    Returns:
        None. Results are produced through ``scrape_article_page`` and
        ``status_codes_list``.
    """
    # Pages are walked with a loop rather than by self-recursion so that a
    # long result set cannot run into Python's recursion limit.
    while True:
        target_url = ''.join([
            target_url_stub,
            target_url_newscommssnippet,
            target_url_pagesnippet,
            str(page_number),
            target_url_peoplesnippet,
            govuk_string,
            target_url_startdatesnippet,
            article_date_min,
            target_url_enddatesnippet,
            article_date_max,
        ])

        # Gov.uk might require headers on the request (unconfirmed)
        r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})

        if r.status_code != 200:
            status_codes_list.append({
                'govuk_string': govuk_string,
                'url': target_url,  # Full URL that was requested
                'status_code': r.status_code,
            })
            return

        soup = BeautifulSoup(r.content, features='html.parser')
        results_list = soup.find('div', 'finder-results')

        # There is no results list for anyone who doesn't have any articles.
        # NOTE(review): the original indentation was lost; this assumes the
        # pagination check below only ran when a results list was found -
        # confirm against the original file.
        if results_list is None or results_list.ul is None:
            return

        # NB: Looking for direct descendants only
        for result in results_list.ul.find_all('li', recursive=False):
            link = result.find('a')
            article_partial_url = link.get('href') if link is not None else None
            if article_partial_url is None:
                # A list item without a usable link cannot be scraped; skip
                # it instead of failing the whole run with AttributeError.
                continue
            time.sleep(1)  # Pause before each article request
            scrape_article_page(govuk_string, article_partial_url)

        # Stop unless a pagination nav element exists on this page
        if soup.find('nav', 'gem-c-pagination') is None:
            return

        page_number += 1
        time.sleep(5)  # Pause before requesting the next results page
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment