Skip to content

Instantly share code, notes, and snippets.

@philipnye
Created February 17, 2022 19:35
Show Gist options
  • Save philipnye/3409bb0cb1f5e89f61aeeaf37400f630 to your computer and use it in GitHub Desktop.
A Python function that scrapes gov.uk's "news and communications" search results page, following pagination and handing each article off to a per-article scraper.
def scrape_news_and_communications_page(
    govuk_string, page_number, article_date_min, article_date_max
):
    """Scrape one page of gov.uk news-and-communications search results.

    Builds the search URL from module-level URL snippets, fetches it, and
    passes each result's partial URL to ``scrape_article_page``. If a
    pagination nav element is present, recurses onto the next page after a
    delay. Non-200 responses are recorded in the module-level
    ``status_codes_list`` instead of raising.

    Args:
        govuk_string: Person identifier used in the gov.uk query string.
        page_number: 1-based results page to fetch.
        article_date_min: Earliest article date (string, as gov.uk expects).
        article_date_max: Latest article date (string, as gov.uk expects).

    Returns:
        None. Results are emitted via side effects (``scrape_article_page``
        calls and ``status_codes_list`` appends).
    """
    # Assemble the full search URL from the module-level stub and snippets
    target_url = (
        target_url_stub +
        target_url_newscommssnippet +
        target_url_pagesnippet +
        str(page_number) +
        target_url_peoplesnippet +
        govuk_string +
        target_url_startdatesnippet +
        article_date_min +
        target_url_enddatesnippet +
        article_date_max
    )
    # Gov.uk might require headers on the request (unconfirmed)
    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')
        results_list = soup.find('div', 'finder-results')
        # results_list is None for anyone who doesn't have any articles
        if results_list is not None:
            if results_list.ul is not None:
                # NB: looking for direct <li> descendants only
                for result in results_list.ul.find_all('li', recursive=False):
                    article_partial_url = result.find('a').get('href')
                    time.sleep(1)  # be polite: rate-limit article fetches
                    scrape_article_page(govuk_string, article_partial_url)
        # Pagination nav element exists -> assume there is a further page.
        # NOTE(review): presumably the nav is absent on the final page —
        # confirm, otherwise this would loop one page too far.
        if soup.find('nav', 'gem-c-pagination'):
            time.sleep(5)  # longer pause between page fetches
            scrape_news_and_communications_page(
                govuk_string,
                page_number + 1,  # was `page_number += 1`; avoid mutating the parameter
                article_date_min,
                article_date_max
            )
    else:
        # Record the failed request so skipped searches can be reviewed later.
        # (Renamed from `dict`, which shadowed the builtin.)
        status_record = {
            'govuk_string': govuk_string,
            'url': target_url,  # Full URL - URL stub plus partial URL
            'status_code': r.status_code
        }
        status_codes_list.append(status_record)
        return
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment