library(tidyverse)
library(bigrquery)
library(DBI)
Connection
con <- DBI::dbConnect(
bigrquery::bigquery(),
project = "hoad-dash"
)
Erster Schritt: Abgleich mit Journalliste, z.B. von Open APC, die bereits über BQ verfügbar sein muss.
CREATE OR REPLACE TABLE `hoad-dash.openapc_cr.cr_raw` AS
SELECT
doi,
issn_l,
EXTRACT (YEAR
FROM
issued) AS year,
license
FROM (
SELECT
SPLIT(issn, ",") AS issn,
doi,
issued,
license
FROM
`subugoe-collaborative.cr_instant.snapshot`
WHERE
NOT REGEXP_CONTAINS(title,'^Author Index$|^Back Cover|^Contents$|^Contents:|^Cover Image|^Cover Picture|^Editorial Board|^Front Cover|^Frontispiece|^Inside Back Cover|^Inside Cover|^Inside Front Cover|^Issue Information|^List of contents|^Masthead|^Title page|^Correction$|^Corrections to|^Corrections$|^Withdrawn')
AND (NOT REGEXP_CONTAINS(page, '^S')
OR page IS NULL) -- include online only articles, lacking page or issue
AND (NOT REGEXP_CONTAINS(issue, '^S')
OR issue IS NULL) ) AS `tbl_cr`,
UNNEST(issn) AS issn
INNER JOIN
`hoad-dash.openapc.oapc_hybrid_journals`
ON
issn = `hoad-dash.openapc.oapc_hybrid_journals`.`issn`
Beispiel-Aggregation
dplyr::tbl(con, "openapc_cr.cr_raw") %>%
group_by(issn_l, year) %>%
summarise(n = n_distinct(doi))
## `summarise()` has grouped output by 'issn_l'. You can override using the `.groups` argument.
## # Source: lazy query [?? x 3]
## # Database: BigQueryConnection
## # Groups: issn_l
## issn_l year n
## <chr> <int> <int>
## 1 1544-6115 2016 36
## 2 2156-8243 2016 26
## 3 0191-2917 2016 976
## 4 1363-9196 2016 104
## 5 0066-4170 2016 26
## 6 1944-8244 2016 4204
## 7 0267-9477 2016 273
## 8 0022-1767 2016 1017
## 9 1355-8382 2016 185
## 10 0021-8561 2016 1205
## # … with more rows
Zweiter Schritt: Datensatz mit Lizenzinformationen
CREATE OR REPLACE TABLE
`hoad-dash.openapc_cr.cr_cc` AS
SELECT
RTRIM(REGEXP_EXTRACT(lic.url, r'by.*?/'), "/") AS cc,
doi,
year,
issn_l
FROM
`hoad-dash.openapc_cr.cr_raw`,
UNNEST(license) AS lic
WHERE
lic.content_version = "vor"
AND lic.delay_in_days = 0
AND REGEXP_CONTAINS(lic.url, "creativecommons.org")
Beispiel-Aggregation
dplyr::tbl(con, "openapc_cr.cr_cc") %>%
group_by(cc) %>%
summarise(n = n_distinct(doi))
## # Source: lazy query [?? x 2]
## # Database: BigQueryConnection
## cc n
## <chr> <int>
## 1 by 245777
## 2 by-nc-nd 125256
## 3 by-nc 28358
## 4 by-nc-sa 1778
## 5 <NA> 53
## 6 by-nd 72
## 7 by-nd-nc 91
## 8 by-n 8
## 9 bync-nd 2
## 10 by-sa 1
## # … with more rows