Created
April 28, 2022 13:52
-
-
Save cameronneylon/b51c8fafe6d577a385f737b914c799d4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
-- UNNEST the funder DOIs in the Crossref DOI table | |
CR_FUNDER_DOI_TABLE AS ( | |
SELECT | |
a.doi, | |
crossref.published_year, | |
-- Deal with the Crossref Month 13 issue | |
CASE WHEN crossref.published_month > 12 then 12 ELSE crossref.published_month END as published_month, | |
unpaywall.* EXCEPT(doi), | |
crossref.type, | |
funder.doi as funder_doi | |
FROM `academic-observatory.observatory.doi20220423` as a, | |
UNNEST(affiliations.funders) as funder | |
), | |
-- JOIN the cOAS funders onto the unnested table to select relevant DOIs | |
coalitionS_dois AS ( | |
SELECT | |
a.*, | |
b.* EXCEPT (funder_doi) | |
FROM `utrecht-university.cOAlitionS.fundref_mapping` as a | |
LEFT JOIN CR_FUNDER_DOI_TABLE as b | |
ON UPPER(TRIM(a.funder_doi)) = UPPER(TRIM(b.funder_doi)) | |
WHERE b.published_year > 2015 | |
), | |
-- Process the raw unpaywall data to generate first oa dates | |
-- Where appropriate these should evaluate to null when no oa_date is present | |
unpaywall_processed AS ( | |
SELECT | |
doi, | |
journal_issn_l, | |
CASE | |
WHEN ( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as g | |
WHERE (g.host_type="repository" and g.version IN ('publishedVersion', 'acceptedVersion'))) > 0 | |
THEN true | |
ELSE FALSE | |
END as green_accepted_published, | |
first_oa_location.oa_date as first_oa_date, -- This currently includes preprints | |
( | |
SELECT g.oa_date | |
FROM UNNEST(u.oa_locations) as g | |
WHERE (g.host_type="repository" and g.version IN ('publishedVersion', 'acceptedVersion')) | |
ORDER BY g.oa_date ASC LIMIT 1 | |
) | |
as first_green_oa, | |
( | |
SELECT g.oa_date | |
FROM UNNEST(u.oa_locations) as g | |
WHERE g.host_type="publisher" | |
ORDER BY g.oa_date ASC LIMIT 1 | |
) | |
as first_gold_oa, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="publisher" and l.license="cc-by")) > 0, true, false) as publisher_ccby, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="publisher" and l.license="cc-by-nd")) > 0, true, false) as publisher_ccbynd, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="publisher" and l.license="cc-by-sa")) > 0, true, false) as publisher_ccbysa, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="publisher" and l.license="cc0")) > 0, true, false) as publisher_cc0, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="publisher" and l.license="pd")) > 0, true, false) as publisher_pd, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.license="cc-by")) > 0, true, false) as repository_ccby, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.license="cc-by-nd")) > 0, true, false) as repository_ccbynd, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.license="cc-by-sa")) > 0, true, false) as repository_ccbysa, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.license="cc0")) > 0, true, false) as repository_cc0, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.license="pd")) > 0, true, false) as repository_pd, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="cc-by")) > 0, true, false) as repository_accpub_ccby, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="cc-by-nd")) > 0, true, false) as repository_accpub_ccbynd, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="cc-by-sa")) > 0, true, false) as repository_accpub_ccbysa, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="cc0")) > 0, true, false) as repository_accpub_cc0, | |
IF(( | |
SELECT COUNT(1) | |
FROM UNNEST(u.oa_locations) as l | |
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="pd")) > 0, true, false) as repository_accpub_pd | |
FROM `academic-observatory.our_research.unpaywall` as u | |
), | |
-- Create a DOI table linking the cOAS dois to the unpaywall data and linking to DOAJ APC data | |
-- Generate embargo classes for analysis | |
qdoi_table AS ( | |
SELECT | |
s.*, | |
u.* EXCEPT(doi), | |
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) as embargo_observed, | |
DATE_DIFF(first_green_oa, DATE(published_year, published_month, 1), MONTH) as green_embargo_observed, | |
DATE_DIFF(first_gold_oa, DATE(published_year, published_month, 1), MONTH) as gold_embargo_observed, | |
CASE | |
WHEN u.first_oa_date is null THEN null | |
WHEN | |
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) > 24 THEN "24+" | |
WHEN | |
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) > 12 THEN "12-24" | |
WHEN | |
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) > 6 THEN "6-12" | |
WHEN | |
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) > 0 THEN "0-6" | |
ELSE "0" | |
END | |
as embargo_period_class, | |
CASE | |
WHEN first_green_oa is null THEN null | |
WHEN | |
DATE_DIFF(u.first_green_oa, DATE(published_year, published_month, 1), MONTH) > 24 THEN "24+" | |
WHEN | |
DATE_DIFF(u.first_green_oa, DATE(published_year, published_month, 1), MONTH) > 12 THEN "12-24" | |
WHEN | |
DATE_DIFF(u.first_green_oa, DATE(published_year, published_month, 1), MONTH) > 6 THEN "6-12" | |
WHEN | |
DATE_DIFF(u.first_green_oa, DATE(published_year, published_month, 1), MONTH) > 0 THEN "0-6" | |
ELSE "0" | |
END | |
as green_embargo_period_class, | |
CASE | |
WHEN | |
u.first_gold_oa is null THEN null | |
WHEN | |
DATE_DIFF(u.first_gold_oa, DATE(published_year, published_month, 1), MONTH) > 24 THEN "24+" | |
WHEN | |
DATE_DIFF(u.first_gold_oa, DATE(published_year, published_month, 1), MONTH) > 12 THEN "12-24" | |
WHEN | |
DATE_DIFF(u.first_gold_oa, DATE(published_year, published_month, 1), MONTH) > 6 THEN "6-12" | |
WHEN | |
DATE_DIFF(u.first_gold_oa, DATE(published_year, published_month, 1), MONTH) > 0 THEN "0-6" | |
ELSE "0" | |
END | |
as gold_embargo_period_class, | |
a.apc | |
FROM coalitionS_dois as s | |
LEFT JOIN unpaywall_processed as u on UPPER(TRIM(s.doi)) = UPPER(TRIM(u.doi)) | |
LEFT JOIN `utrecht-university.doaj.apc_issnl_20220427` as a on a.journal_issn_l = u.journal_issn_l | |
) | |
-- Final query counts up outputs and OA status by year and cOAS funder | |
SELECT | |
published_year, | |
acronym, | |
COUNT(DISTINCT doi) as total_outputs, | |
COUNT(DISTINCT IF(is_oa, doi, null)) as count_oa, | |
COUNT(DISTINCT IF(gold, doi, null)) as count_gold, | |
COUNT(DISTINCT IF(gold_just_doaj and (apc=false), doi, null)) as count_diamond, | |
COUNT(DISTINCT IF(gold_just_doaj, doi, null)) as count_doaj, | |
COUNT(DISTINCT IF(hybrid, doi, null)) as count_hybrid, | |
COUNT(DISTINCT IF(green, doi, null)) as count_green, | |
COUNT(DISTINCT IF(green_accepted_published, doi, null)) as count_green_accpub, | |
COUNT(DISTINCT IF(green_only, doi, null)) as count_green_only, | |
COUNT(DISTINCT IF((green_only and green_accepted_published), doi, null)) as count_green_only_accpub, | |
COUNT(DISTINCT IF(green_only_ignoring_bronze, doi, null)) as count_green_only_ignoring_bronze, | |
COUNT(DISTINCT IF((green_only_ignoring_bronze and green_accepted_published), doi, null)) as count_green_only_ignoring_bronze_accpub, | |
COUNT(DISTINCT IF(is_cclicensed, doi, null)) as count_cc_licensed, | |
COUNT(DISTINCT IF(embargo_period_class='24+', doi, null)) as count_embargo24plus, | |
COUNT(DISTINCT IF(embargo_period_class='12-24', doi, null)) as count_embargo13_24, | |
COUNT(DISTINCT IF(embargo_period_class='6-12', doi, null)) as count_embargo7_12, | |
COUNT(DISTINCT IF(embargo_period_class='0-6', doi, null)) as count_embargo0_6, | |
COUNT(DISTINCT IF(embargo_period_class='0', doi, null)) as count_embargo0, | |
COUNT(DISTINCT IF(green_embargo_period_class='24+', doi, null)) as count_green_embargo24plus, | |
COUNT(DISTINCT IF(green_embargo_period_class='12-24', doi, null)) as count_green_embargo13_24, | |
COUNT(DISTINCT IF(green_embargo_period_class='6-12', doi, null)) as count_green_embargo7_12, | |
COUNT(DISTINCT IF(green_embargo_period_class='0-6', doi, null)) as count_green_embargo0_6, | |
COUNT(DISTINCT IF(green_embargo_period_class='0', doi, null)) as count_green_embargo0, | |
COUNT(DISTINCT IF((green_embargo_period_class='24+' and green_only), doi, null)) as count_green_only_embargo24plus, | |
COUNT(DISTINCT IF((green_embargo_period_class='12-24' and green_only), doi, null)) as count_green_only_embargo13_24, | |
COUNT(DISTINCT IF((green_embargo_period_class='6-12' and green_only), doi, null)) as count_green_only_embargo7_12, | |
COUNT(DISTINCT IF((green_embargo_period_class='0-6' and green_only), doi, null)) as count_green_only_embargo0_6, | |
COUNT(DISTINCT IF((green_embargo_period_class='0' and green_only), doi, null)) as count_green_only_embargo0, | |
COUNT(DISTINCT IF((green_embargo_period_class='24+' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo24plus, | |
COUNT(DISTINCT IF((green_embargo_period_class='12-24' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo13_24, | |
COUNT(DISTINCT IF((green_embargo_period_class='6-12' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo7_12, | |
COUNT(DISTINCT IF((green_embargo_period_class='0-6' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo0_6, | |
COUNT(DISTINCT IF((green_embargo_period_class='0' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo0, | |
COUNT(DISTINCT IF(gold_embargo_period_class='24+', doi, null)) as count_gold_embargo24plus, | |
COUNT(DISTINCT IF(gold_embargo_period_class='12-24', doi, null)) as count_gold_embargo13_24, | |
COUNT(DISTINCT IF(gold_embargo_period_class='6-12', doi, null)) as count_gold_embargo7_12, | |
COUNT(DISTINCT IF(gold_embargo_period_class='0-6', doi, null)) as count_gold_embargo0_6, | |
COUNT(DISTINCT IF(gold_embargo_period_class='0', doi, null)) as count_gold_embargo0, | |
COUNT(DISTINCT IF(publisher_ccby, doi, null)) as publisher_ccby, | |
COUNT(DISTINCT IF(publisher_ccbynd, doi, null)) as publisher_ccbynd, | |
COUNT(DISTINCT IF(publisher_ccbysa, doi, null)) as publisher_ccbysa, | |
COUNT(DISTINCT IF(publisher_cc0, doi, null)) as publisher_cc0, | |
COUNT(DISTINCT IF(publisher_pd, doi, null)) as publisher_pd, | |
COUNT(DISTINCT IF(repository_ccby, doi, null)) as repository_ccby, | |
COUNT(DISTINCT IF(repository_ccbynd, doi, null)) as repository_ccbynd, | |
COUNT(DISTINCT IF(repository_ccbysa, doi, null)) as repository_ccbysa, | |
COUNT(DISTINCT IF(repository_cc0, doi, null)) as repository_cc0, | |
COUNT(DISTINCT IF(repository_pd, doi, null)) as repository_pd, | |
COUNT(DISTINCT IF(repository_accpub_ccby, doi, null)) as repository_accpub_ccby, | |
COUNT(DISTINCT IF(repository_accpub_ccbynd, doi, null)) as repository_accpub_ccbynd, | |
COUNT(DISTINCT IF(repository_accpub_ccbysa, doi, null)) as repository_accpub_ccbysa, | |
COUNT(DISTINCT IF(repository_accpub_cc0, doi, null)) as repository_accpub_cc0, | |
COUNT(DISTINCT IF(repository_accpub_pd, doi, null)) as repository_accpub_pd, | |
COUNT(DISTINCT IF((publisher_ccby or repository_ccby), doi, null)) as publisher_or_repository_ccby, | |
COUNT(DISTINCT IF((publisher_ccbynd or repository_ccbynd), doi, null)) as publisher_or_repository_ccbynd, | |
COUNT(DISTINCT IF((publisher_ccbysa or repository_ccbysa), doi, null)) as publisher_or_repository_ccbysa, | |
COUNT(DISTINCT IF((publisher_cc0 or repository_cc0), doi, null)) as publisher_or_repository_cc0, | |
COUNT(DISTINCT IF((publisher_pd or repository_pd), doi, null)) as publisher_or_repository_pd, | |
COUNT(DISTINCT IF((publisher_ccby or repository_accpub_ccby), doi, null)) as publisher_or_repository_accpub_ccby, | |
COUNT(DISTINCT IF((publisher_ccbynd or repository_accpub_ccbynd), doi, null)) as publisher_or_repository_accpub_ccbynd, | |
COUNT(DISTINCT IF((publisher_ccbysa or repository_accpub_ccbysa), doi, null)) as publisher_or_repository_accpub_ccbysa, | |
COUNT(DISTINCT IF((publisher_cc0 or repository_accpub_cc0), doi, null)) as publisher_or_repository_accpub_cc0, | |
COUNT(DISTINCT IF((publisher_pd or repository_accpub_pd), doi, null)) as publisher_or_repository_accpub_pd | |
FROM qdoi_table WHERE type="journal-article" | |
GROUP BY acronym, published_year | |
ORDER BY published_year DESC, acronym ASC | |
---To generate total for cOAS except EC (comment out line 183 and lines 258-260) | |
---FROM qdoi_table WHERE type="journal-article" AND acronym != "EC" | |
---GROUP BY published_year | |
---ORDER BY published_year DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment