Created
February 17, 2022 19:36
-
-
Save philipnye/ab04d5d96fd253d9936a7326d319a4d2 to your computer and use it in GitHub Desktop.
Function to scrape gov.uk article pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def scrape_article_page(govuk_string, article_partial_url):
    """Scrape a gov.uk article page and record the results in module-level lists.

    Parameters
    ----------
    govuk_string : str
        Identifier string carried through unchanged into the output record.
    article_partial_url : str
        Partial URL appended to the module-level ``target_url_stub`` to form
        the full article URL.

    Side effects
    ------------
    On HTTP 200, appends a dict of scraped fields to the module-level
    ``rows_list``; on any other status code, appends a dict containing the
    status code to the module-level ``status_codes_list``. Returns None.
    """
    target_url = target_url_stub + article_partial_url
    # Gov.uk might require headers on the request (unconfirmed)
    r = requests.get(target_url, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, features='html.parser')

        # Grab article type and title from the title section.
        # Initialise to None so the record can still be built when the page
        # lacks one of these elements (the original raised NameError then).
        article_type = None
        title = None
        title_section = soup.find('div', 'gem-c-title')
        if title_section is not None:
            if title_section.span is not None:
                article_type = title_section.span.get_text().strip()
            if title_section.h1 is not None:
                title = title_section.h1.get_text().strip()

        # Grab the tagged organisations/people and the publication date from
        # the metadata section. Defaults cover pages with no metadata section.
        departments = []
        people = []
        published_date = None
        metadata_section = soup.find('div', 'gem-c-metadata')
        if metadata_section is not None:
            # These will be both organisations and people. Class name is
            # needed so that we don't pick up 'last updated' links
            entities = metadata_section.find_all('a', 'govuk-link')
            for entity in entities:
                href = entity.get('href')
                # Grab the gov.uk URL version of the name, rather than the
                # display-text string
                if 'organisations' in href:  # Link is to an organisation page
                    departments.append(
                        href.replace('/government/organisations/', '')
                    )
                else:
                    people.append(href.replace('/government/people/', ''))

            # There isn't a better way of uniquely selecting the published
            # date than going via its sibling 'Published' label
            published_line = metadata_section.find(
                'dt',
                'gem-c-metadata__term',
                string='Published'
            )
            published_date = _format_govuk_date(
                published_line.find_next('dd').get_text().strip()
            )

            # For speeches, replace the published date with the 'Delivered on'
            # date, if it exists (see e.g.
            # https://www.gov.uk/government/speeches/the-importance-of-a-knowledge-rich-curriculum)
            important_section = soup.find('div', 'app-c-important-metadata')
            if important_section is not None:
                # BUG FIX: the original discarded this find() result and
                # re-read the 'Published' line instead, so the delivered-on
                # date was never used and the already-formatted date was
                # re-parsed with '%d %B %Y', raising ValueError.
                delivered_line = important_section.find(
                    'dt',
                    'app-c-important-metadata__term',
                    string='Delivered on: '
                )
                if delivered_line is not None:
                    delivered_date = delivered_line.find_next('dd').get_text()
                    # Remove explanatory notes ('Transcript of the speech,
                    # exactly as it was delivered', etc.). These appear in
                    # brackets after the date, preceded by a space
                    pos = delivered_date.find('(')
                    if pos != -1:  # If bracket is found
                        delivered_date = delivered_date[:pos]
                    published_date = _format_govuk_date(delivered_date.strip())

        # Renamed from 'dict' so the builtin isn't shadowed
        record = {
            'govuk_string': govuk_string,
            'url': target_url,  # Full article URL - stub plus partial URL
            'article_type': article_type,
            'title': title,
            'date': published_date,
            'departments': departments,
            'people': people,
        }
        rows_list.append(record)
    else:
        # Record failed requests along with the status code received
        record = {
            'govuk_string': govuk_string,
            'url': target_url,  # Full article URL - stub plus partial URL
            'status_code': r.status_code,
        }
        status_codes_list.append(record)
    return


def _format_govuk_date(date_string):
    """Convert a gov.uk date like '17 February 2022' to ISO 'YYYY-MM-DD'."""
    return datetime.strftime(
        datetime.strptime(date_string, '%d %B %Y'),
        '%Y-%m-%d'
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment