Skip to content

Instantly share code, notes, and snippets.

@cameronneylon
Created April 28, 2022 13:52
Show Gist options
  • Save cameronneylon/b51c8fafe6d577a385f737b914c799d4 to your computer and use it in GitHub Desktop.
Save cameronneylon/b51c8fafe6d577a385f737b914c799d4 to your computer and use it in GitHub Desktop.
WITH
-- UNNEST the funder DOIs in the Crossref DOI table
CR_FUNDER_DOI_TABLE AS (
SELECT
a.doi,
crossref.published_year,
-- Deal with the Crossref Month 13 issue
CASE WHEN crossref.published_month > 12 then 12 ELSE crossref.published_month END as published_month,
unpaywall.* EXCEPT(doi),
crossref.type,
funder.doi as funder_doi
FROM `academic-observatory.observatory.doi20220423` as a,
UNNEST(affiliations.funders) as funder
),
-- JOIN the cOAS funders onto the unnested table to select relevant DOIs
coalitionS_dois AS (
SELECT
a.*,
b.* EXCEPT (funder_doi)
FROM `utrecht-university.cOAlitionS.fundref_mapping` as a
LEFT JOIN CR_FUNDER_DOI_TABLE as b
ON UPPER(TRIM(a.funder_doi)) = UPPER(TRIM(b.funder_doi))
WHERE b.published_year > 2015
),
-- Process the raw unpaywall data to generate first oa dates
-- Where appropriate these should evaluate to null when no oa_date is present
unpaywall_processed AS (
SELECT
doi,
journal_issn_l,
CASE
WHEN (
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as g
WHERE (g.host_type="repository" and g.version IN ('publishedVersion', 'acceptedVersion'))) > 0
THEN true
ELSE FALSE
END as green_accepted_published,
first_oa_location.oa_date as first_oa_date, -- This currently includes preprints
(
SELECT g.oa_date
FROM UNNEST(u.oa_locations) as g
WHERE (g.host_type="repository" and g.version IN ('publishedVersion', 'acceptedVersion'))
ORDER BY g.oa_date ASC LIMIT 1
)
as first_green_oa,
(
SELECT g.oa_date
FROM UNNEST(u.oa_locations) as g
WHERE g.host_type="publisher"
ORDER BY g.oa_date ASC LIMIT 1
)
as first_gold_oa,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="publisher" and l.license="cc-by")) > 0, true, false) as publisher_ccby,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="publisher" and l.license="cc-by-nd")) > 0, true, false) as publisher_ccbynd,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="publisher" and l.license="cc-by-sa")) > 0, true, false) as publisher_ccbysa,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="publisher" and l.license="cc0")) > 0, true, false) as publisher_cc0,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="publisher" and l.license="pd")) > 0, true, false) as publisher_pd,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.license="cc-by")) > 0, true, false) as repository_ccby,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.license="cc-by-nd")) > 0, true, false) as repository_ccbynd,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.license="cc-by-sa")) > 0, true, false) as repository_ccbysa,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.license="cc0")) > 0, true, false) as repository_cc0,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.license="pd")) > 0, true, false) as repository_pd,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="cc-by")) > 0, true, false) as repository_accpub_ccby,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="cc-by-nd")) > 0, true, false) as repository_accpub_ccbynd,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="cc-by-sa")) > 0, true, false) as repository_accpub_ccbysa,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="cc0")) > 0, true, false) as repository_accpub_cc0,
IF((
SELECT COUNT(1)
FROM UNNEST(u.oa_locations) as l
WHERE (l.host_type="repository" and l.version IN ('publishedVersion', 'acceptedVersion') and l.license="pd")) > 0, true, false) as repository_accpub_pd
FROM `academic-observatory.our_research.unpaywall` as u
),
-- Create a DOI table linking the cOAS dois to the unpaywall data and linking to DOAJ APC data
-- Generate embargo classes for analysis
qdoi_table AS (
SELECT
s.*,
u.* EXCEPT(doi),
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) as embargo_observed,
DATE_DIFF(first_green_oa, DATE(published_year, published_month, 1), MONTH) as green_embargo_observed,
DATE_DIFF(first_gold_oa, DATE(published_year, published_month, 1), MONTH) as gold_embargo_observed,
CASE
WHEN u.first_oa_date is null THEN null
WHEN
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) > 24 THEN "24+"
WHEN
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) > 12 THEN "12-24"
WHEN
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) > 6 THEN "6-12"
WHEN
DATE_DIFF(u.first_oa_date, DATE(published_year, published_month, 1), MONTH) > 0 THEN "0-6"
ELSE "0"
END
as embargo_period_class,
CASE
WHEN first_green_oa is null THEN null
WHEN
DATE_DIFF(u.first_green_oa, DATE(published_year, published_month, 1), MONTH) > 24 THEN "24+"
WHEN
DATE_DIFF(u.first_green_oa, DATE(published_year, published_month, 1), MONTH) > 12 THEN "12-24"
WHEN
DATE_DIFF(u.first_green_oa, DATE(published_year, published_month, 1), MONTH) > 6 THEN "6-12"
WHEN
DATE_DIFF(u.first_green_oa, DATE(published_year, published_month, 1), MONTH) > 0 THEN "0-6"
ELSE "0"
END
as green_embargo_period_class,
CASE
WHEN
u.first_gold_oa is null THEN null
WHEN
DATE_DIFF(u.first_gold_oa, DATE(published_year, published_month, 1), MONTH) > 24 THEN "24+"
WHEN
DATE_DIFF(u.first_gold_oa, DATE(published_year, published_month, 1), MONTH) > 12 THEN "12-24"
WHEN
DATE_DIFF(u.first_gold_oa, DATE(published_year, published_month, 1), MONTH) > 6 THEN "6-12"
WHEN
DATE_DIFF(u.first_gold_oa, DATE(published_year, published_month, 1), MONTH) > 0 THEN "0-6"
ELSE "0"
END
as gold_embargo_period_class,
a.apc
FROM coalitionS_dois as s
LEFT JOIN unpaywall_processed as u on UPPER(TRIM(s.doi)) = UPPER(TRIM(u.doi))
LEFT JOIN `utrecht-university.doaj.apc_issnl_20220427` as a on a.journal_issn_l = u.journal_issn_l
)
-- Final query counts up outputs and OA status by year and cOAS funder
SELECT
published_year,
acronym,
COUNT(DISTINCT doi) as total_outputs,
COUNT(DISTINCT IF(is_oa, doi, null)) as count_oa,
COUNT(DISTINCT IF(gold, doi, null)) as count_gold,
COUNT(DISTINCT IF(gold_just_doaj and (apc=false), doi, null)) as count_diamond,
COUNT(DISTINCT IF(gold_just_doaj, doi, null)) as count_doaj,
COUNT(DISTINCT IF(hybrid, doi, null)) as count_hybrid,
COUNT(DISTINCT IF(green, doi, null)) as count_green,
COUNT(DISTINCT IF(green_accepted_published, doi, null)) as count_green_accpub,
COUNT(DISTINCT IF(green_only, doi, null)) as count_green_only,
COUNT(DISTINCT IF((green_only and green_accepted_published), doi, null)) as count_green_only_accpub,
COUNT(DISTINCT IF(green_only_ignoring_bronze, doi, null)) as count_green_only_ignoring_bronze,
COUNT(DISTINCT IF((green_only_ignoring_bronze and green_accepted_published), doi, null)) as count_green_only_ignoring_bronze_accpub,
COUNT(DISTINCT IF(is_cclicensed, doi, null)) as count_cc_licensed,
COUNT(DISTINCT IF(embargo_period_class='24+', doi, null)) as count_embargo24plus,
COUNT(DISTINCT IF(embargo_period_class='12-24', doi, null)) as count_embargo13_24,
COUNT(DISTINCT IF(embargo_period_class='6-12', doi, null)) as count_embargo7_12,
COUNT(DISTINCT IF(embargo_period_class='0-6', doi, null)) as count_embargo0_6,
COUNT(DISTINCT IF(embargo_period_class='0', doi, null)) as count_embargo0,
COUNT(DISTINCT IF(green_embargo_period_class='24+', doi, null)) as count_green_embargo24plus,
COUNT(DISTINCT IF(green_embargo_period_class='12-24', doi, null)) as count_green_embargo13_24,
COUNT(DISTINCT IF(green_embargo_period_class='6-12', doi, null)) as count_green_embargo7_12,
COUNT(DISTINCT IF(green_embargo_period_class='0-6', doi, null)) as count_green_embargo0_6,
COUNT(DISTINCT IF(green_embargo_period_class='0', doi, null)) as count_green_embargo0,
COUNT(DISTINCT IF((green_embargo_period_class='24+' and green_only), doi, null)) as count_green_only_embargo24plus,
COUNT(DISTINCT IF((green_embargo_period_class='12-24' and green_only), doi, null)) as count_green_only_embargo13_24,
COUNT(DISTINCT IF((green_embargo_period_class='6-12' and green_only), doi, null)) as count_green_only_embargo7_12,
COUNT(DISTINCT IF((green_embargo_period_class='0-6' and green_only), doi, null)) as count_green_only_embargo0_6,
COUNT(DISTINCT IF((green_embargo_period_class='0' and green_only), doi, null)) as count_green_only_embargo0,
COUNT(DISTINCT IF((green_embargo_period_class='24+' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo24plus,
COUNT(DISTINCT IF((green_embargo_period_class='12-24' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo13_24,
COUNT(DISTINCT IF((green_embargo_period_class='6-12' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo7_12,
COUNT(DISTINCT IF((green_embargo_period_class='0-6' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo0_6,
COUNT(DISTINCT IF((green_embargo_period_class='0' and green_only_ignoring_bronze), doi, null)) as count_green_only_ignoring_bronze_embargo0,
COUNT(DISTINCT IF(gold_embargo_period_class='24+', doi, null)) as count_gold_embargo24plus,
COUNT(DISTINCT IF(gold_embargo_period_class='12-24', doi, null)) as count_gold_embargo13_24,
COUNT(DISTINCT IF(gold_embargo_period_class='6-12', doi, null)) as count_gold_embargo7_12,
COUNT(DISTINCT IF(gold_embargo_period_class='0-6', doi, null)) as count_gold_embargo0_6,
COUNT(DISTINCT IF(gold_embargo_period_class='0', doi, null)) as count_gold_embargo0,
COUNT(DISTINCT IF(publisher_ccby, doi, null)) as publisher_ccby,
COUNT(DISTINCT IF(publisher_ccbynd, doi, null)) as publisher_ccbynd,
COUNT(DISTINCT IF(publisher_ccbysa, doi, null)) as publisher_ccbysa,
COUNT(DISTINCT IF(publisher_cc0, doi, null)) as publisher_cc0,
COUNT(DISTINCT IF(publisher_pd, doi, null)) as publisher_pd,
COUNT(DISTINCT IF(repository_ccby, doi, null)) as repository_ccby,
COUNT(DISTINCT IF(repository_ccbynd, doi, null)) as repository_ccbynd,
COUNT(DISTINCT IF(repository_ccbysa, doi, null)) as repository_ccbysa,
COUNT(DISTINCT IF(repository_cc0, doi, null)) as repository_cc0,
COUNT(DISTINCT IF(repository_pd, doi, null)) as repository_pd,
COUNT(DISTINCT IF(repository_accpub_ccby, doi, null)) as repository_accpub_ccby,
COUNT(DISTINCT IF(repository_accpub_ccbynd, doi, null)) as repository_accpub_ccbynd,
COUNT(DISTINCT IF(repository_accpub_ccbysa, doi, null)) as repository_accpub_ccbysa,
COUNT(DISTINCT IF(repository_accpub_cc0, doi, null)) as repository_accpub_cc0,
COUNT(DISTINCT IF(repository_accpub_pd, doi, null)) as repository_accpub_pd,
COUNT(DISTINCT IF((publisher_ccby or repository_ccby), doi, null)) as publisher_or_repository_ccby,
COUNT(DISTINCT IF((publisher_ccbynd or repository_ccbynd), doi, null)) as publisher_or_repository_ccbynd,
COUNT(DISTINCT IF((publisher_ccbysa or repository_ccbysa), doi, null)) as publisher_or_repository_ccbysa,
COUNT(DISTINCT IF((publisher_cc0 or repository_cc0), doi, null)) as publisher_or_repository_cc0,
COUNT(DISTINCT IF((publisher_pd or repository_pd), doi, null)) as publisher_or_repository_pd,
COUNT(DISTINCT IF((publisher_ccby or repository_accpub_ccby), doi, null)) as publisher_or_repository_accpub_ccby,
COUNT(DISTINCT IF((publisher_ccbynd or repository_accpub_ccbynd), doi, null)) as publisher_or_repository_accpub_ccbynd,
COUNT(DISTINCT IF((publisher_ccbysa or repository_accpub_ccbysa), doi, null)) as publisher_or_repository_accpub_ccbysa,
COUNT(DISTINCT IF((publisher_cc0 or repository_accpub_cc0), doi, null)) as publisher_or_repository_accpub_cc0,
COUNT(DISTINCT IF((publisher_pd or repository_accpub_pd), doi, null)) as publisher_or_repository_accpub_pd
FROM qdoi_table WHERE type="journal-article"
GROUP BY acronym, published_year
ORDER BY published_year DESC, acronym ASC
---To generate total for cOAS except EC (comment out line 183 and lines 258-260)
---FROM qdoi_table WHERE type="journal-article" AND acronym != "EC"
---GROUP BY published_year
---ORDER BY published_year DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment