-
-
Save mbarton/fe1375f58cd43f73866b5670fe89dd27 to your computer and use it in GitHub Desktop.
Athena SQL queries to support the Queen's Consent Story
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create external table if not exists default.hansard_scrapes ( | |
`web-scraper-order` string, | |
`web-scraper-start-url` string, | |
title string, | |
`date` string, | |
`text` string, | |
`link` string | |
) | |
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' | |
LOCATION 's3://investigations-data-dev/hansard/scrape_results/' | |
; | |
create table default.hansard_bills | |
with (external_location = 's3://investigations-data-dev/hansard/bills') as | |
( | |
select | |
-- TODO MRB: is there a better way than this gnarly chain of regexes? | |
trim( | |
regexp_replace( | |
regexp_replace( | |
regexp_replace( | |
regexp_replace( | |
-- Sometimes they end [HL] or [Lords] | |
regexp_replace(title, '(.+ Bill) \[.+\]$', '$1'), | |
'(.+ Bill) Lords$', '$1'), | |
'(.+)HI$', '$1'), | |
'(.+)Hl$', '$1'), | |
'^''', '') | |
) as title, | |
"date", | |
arbitrary(regexp_extract("link", '^\/(.+?)\/', 1)) as chamber, | |
array_agg(text) as text, | |
array_agg(distinct url_extract_parameter("web-scraper-start-url", 'searchTerm')) as matching_search_terms, | |
array_agg('https://hansard.parliament.uk' || link) as links | |
from | |
hansard_scrapes | |
group by | |
title, "date" | |
); | |
create table default.hansard_royal_consent | |
with (external_location = 's3://investigations-data-dev/hansard/royal_consent') as | |
( | |
select | |
* | |
from | |
( | |
select | |
title, | |
"date", | |
chamber, | |
if(contains(matching_search_terms, '"Queen''s consent signified"'), 'Queen''s consent signified', '') as direct_queens_consent, | |
if(contains(matching_search_terms, '"Prince of Wales''s consent signified"'), 'Prince of Wales''s consent signified', '') as direct_prince_of_wales_consent, | |
if(contains(matching_search_terms, '"Queen''s Consent, on behalf of the Crown, signified"'), 'Queen''s Consent, on behalf of the Crown, signified', '') as on_behalf_of_crown_queens_consent, | |
if( | |
contains(matching_search_terms, '"have it in command from"') | |
and contains(matching_search_terms, '"consented to place"'), | |
-- MRB: removed as this was too restrictive and missing bills we wanted to include | |
-- and contains(matching_search_terms, '"affected by the bill"') | |
'...I have it in Command from Her Majesty to acquaint the House...', '') as indirect_consent, | |
if(contains(matching_search_terms, '"Prince of Wales''s consent, on behalf of the Duchy of Cornwall, signified"'), 'Prince of Wales''s consent, on behalf of the Duchy of Cornwall, signified', '') as on_behalf_of_duchy_princes_consent, | |
if(contains(matching_search_terms, '"consent having been signified"'), 'consent having been signified', '') as consent_having_been_signified, | |
array_join(array_distinct(text), '; ') as text, | |
-- Google Sheets doesn't let you construct a single cell with multiple links so we hackily expand | |
-- the array of links into individual columns. | |
'=HYPERLINK("' || links[1] || '", "Link")' as link_1, | |
if(element_at(links, 2) is not null, '=HYPERLINK("' || links[2] || '", "Link")', '') as link_2, | |
if(element_at(links, 3) is not null, '=HYPERLINK("' || links[3] || '", "Link")', '') as link_3, | |
if(element_at(links, 4) is not null, '=HYPERLINK("' || links[4] || '", "Link")', '') as link_4, | |
if(element_at(links, 5) is not null, '=HYPERLINK("' || links[5] || '", "Link")', '') as link_5, | |
if(element_at(links, 6) is not null, '=HYPERLINK("' || links[6] || '", "Link")', '') as link_6 | |
from hansard_bills | |
) | |
where | |
-- It would be more idomatic SQL to filter on NULL here but by using the empty string | |
-- the result imports nicely into Google Sheets (the cell appearing empty). | |
direct_queens_consent <> '' or | |
direct_prince_of_wales_consent <> '' or | |
on_behalf_of_crown_queens_consent <> '' or | |
indirect_consent <> '' or | |
on_behalf_of_duchy_princes_consent <> '' or | |
consent_having_been_signified <> '' | |
); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment