Skip to content

Instantly share code, notes, and snippets.

@njahn82
Created November 15, 2021 11:18
Show Gist options
  • Save njahn82/12f15f526acc08ef31bc21f631e7867d to your computer and use it in GitHub Desktop.
Save njahn82/12f15f526acc08ef31bc21f631e7867d to your computer and use it in GitHub Desktop.
hoad_dash_beispiel
library(tidyverse)
library(bigrquery)
library(DBI)

Connection

con <- DBI::dbConnect(
  bigrquery::bigquery(),
  project = "hoad-dash"
)

Erster Schritt: Abgleich mit Journalliste, z.B. von Open APC, die bereits über BQ verfügbar sein muss.

CREATE OR REPLACE TABLE `hoad-dash.openapc_cr.cr_raw` AS
SELECT
doi,
issn_l,
EXTRACT (YEAR
         FROM
         issued) AS year,
license
FROM (
  SELECT
  SPLIT(issn, ",") AS issn,
  doi,
  issued,
  license
  FROM
  `subugoe-collaborative.cr_instant.snapshot`
  WHERE
  NOT REGEXP_CONTAINS(title,'^Author Index$|^Back Cover|^Contents$|^Contents:|^Cover Image|^Cover Picture|^Editorial Board|^Front Cover|^Frontispiece|^Inside Back Cover|^Inside Cover|^Inside Front Cover|^Issue Information|^List of contents|^Masthead|^Title page|^Correction$|^Corrections to|^Corrections$|^Withdrawn')
  AND (NOT REGEXP_CONTAINS(page, '^S')
       OR page IS NULL) -- include online only articles, lacking page or issue
  AND (NOT REGEXP_CONTAINS(issue, '^S')
       OR issue IS NULL) ) AS `tbl_cr`,
UNNEST(issn) AS issn
INNER JOIN
`hoad-dash.openapc.oapc_hybrid_journals`
ON
issn = `hoad-dash.openapc.oapc_hybrid_journals`.`issn`

Beispiel-Aggregation

dplyr::tbl(con, "openapc_cr.cr_raw") %>%
  group_by(issn_l, year) %>%
  summarise(n = n_distinct(doi))
## `summarise()` has grouped output by 'issn_l'. You can override using the `.groups` argument.

## # Source:   lazy query [?? x 3]
## # Database: BigQueryConnection
## # Groups:   issn_l
##    issn_l     year     n
##    <chr>     <int> <int>
##  1 1544-6115  2016    36
##  2 2156-8243  2016    26
##  3 0191-2917  2016   976
##  4 1363-9196  2016   104
##  5 0066-4170  2016    26
##  6 1944-8244  2016  4204
##  7 0267-9477  2016   273
##  8 0022-1767  2016  1017
##  9 1355-8382  2016   185
## 10 0021-8561  2016  1205
## # … with more rows

Zweiter Schritt: Datensatz mit Lizenzinformationen

CREATE OR REPLACE TABLE
  `hoad-dash.openapc_cr.cr_cc` AS
SELECT
  RTRIM(REGEXP_EXTRACT(lic.url, r'by.*?/'), "/") AS cc,
  doi,
  year,
  issn_l
FROM
  `hoad-dash.openapc_cr.cr_raw`,
  UNNEST(license) AS lic
WHERE
  lic.content_version = "vor"
  AND lic.delay_in_days = 0
  AND REGEXP_CONTAINS(lic.url, "creativecommons.org")

Beispiel-Aggregation

dplyr::tbl(con, "openapc_cr.cr_cc") %>%
  group_by(cc) %>%
  summarise(n = n_distinct(doi))
## # Source:   lazy query [?? x 2]
## # Database: BigQueryConnection
##    cc            n
##    <chr>     <int>
##  1 by       245777
##  2 by-nc-nd 125256
##  3 by-nc     28358
##  4 by-nc-sa   1778
##  5 <NA>         53
##  6 by-nd        72
##  7 by-nd-nc     91
##  8 by-n          8
##  9 bync-nd       2
## 10 by-sa         1
## # … with more rows
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment