Skip to content

Instantly share code, notes, and snippets.

@mbarton

mbarton/hansard.sql Secret

Created Feb 17, 2021
Embed
What would you like to do?
Athena SQL queries to support the Queen's Consent Story
create external table if not exists default.hansard_scrapes (
`web-scraper-order` string,
`web-scraper-start-url` string,
title string,
`date` string,
`text` string,
`link` string
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 's3://investigations-data-dev/hansard/scrape_results/'
;
create table default.hansard_bills
with (external_location = 's3://investigations-data-dev/hansard/bills') as
(
select
-- TODO MRB: is there a better way than this gnarly chain of regexes?
trim(
regexp_replace(
regexp_replace(
regexp_replace(
regexp_replace(
-- Sometimes they end [HL] or [Lords]
regexp_replace(title, '(.+ Bill) \[.+\]$', '$1'),
'(.+ Bill) Lords$', '$1'),
'(.+)HI$', '$1'),
'(.+)Hl$', '$1'),
'^''', '')
) as title,
"date",
arbitrary(regexp_extract("link", '^\/(.+?)\/', 1)) as chamber,
array_agg(text) as text,
array_agg(distinct url_extract_parameter("web-scraper-start-url", 'searchTerm')) as matching_search_terms,
array_agg('https://hansard.parliament.uk' || link) as links
from
hansard_scrapes
group by
title, "date"
);
create table default.hansard_royal_consent
with (external_location = 's3://investigations-data-dev/hansard/royal_consent') as
(
select
*
from
(
select
title,
"date",
chamber,
if(contains(matching_search_terms, '"Queen''s consent signified"'), 'Queen''s consent signified', '') as direct_queens_consent,
if(contains(matching_search_terms, '"Prince of Wales''s consent signified"'), 'Prince of Wales''s consent signified', '') as direct_prince_of_wales_consent,
if(contains(matching_search_terms, '"Queen''s Consent, on behalf of the Crown, signified"'), 'Queen''s Consent, on behalf of the Crown, signified', '') as on_behalf_of_crown_queens_consent,
if(
contains(matching_search_terms, '"have it in command from"')
and contains(matching_search_terms, '"consented to place"'),
-- MRB: removed as this was too restrictive and missing bills we wanted to include
-- and contains(matching_search_terms, '"affected by the bill"')
'...I have it in Command from Her Majesty to acquaint the House...', '') as indirect_consent,
if(contains(matching_search_terms, '"Prince of Wales''s consent, on behalf of the Duchy of Cornwall, signified"'), 'Prince of Wales''s consent, on behalf of the Duchy of Cornwall, signified', '') as on_behalf_of_duchy_princes_consent,
if(contains(matching_search_terms, '"consent having been signified"'), 'consent having been signified', '') as consent_having_been_signified,
array_join(array_distinct(text), '; ') as text,
-- Google Sheets doesn't let you construct a single cell with multiple links so we hackily expand
-- the array of links into individual columns.
'=HYPERLINK("' || links[1] || '", "Link")' as link_1,
if(element_at(links, 2) is not null, '=HYPERLINK("' || links[2] || '", "Link")', '') as link_2,
if(element_at(links, 3) is not null, '=HYPERLINK("' || links[3] || '", "Link")', '') as link_3,
if(element_at(links, 4) is not null, '=HYPERLINK("' || links[4] || '", "Link")', '') as link_4,
if(element_at(links, 5) is not null, '=HYPERLINK("' || links[5] || '", "Link")', '') as link_5,
if(element_at(links, 6) is not null, '=HYPERLINK("' || links[6] || '", "Link")', '') as link_6
from hansard_bills
)
where
-- It would be more idomatic SQL to filter on NULL here but by using the empty string
-- the result imports nicely into Google Sheets (the cell appearing empty).
direct_queens_consent <> '' or
direct_prince_of_wales_consent <> '' or
on_behalf_of_crown_queens_consent <> '' or
indirect_consent <> '' or
on_behalf_of_duchy_princes_consent <> '' or
consent_having_been_signified <> ''
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment