library(tidyverse)
# Raw subject table: every column is read as character.
subject_mag_raw <- readr::read_csv(
  file = "data/NPL_DOI_FOS.csv",
  col_types = cols(.default = col_character())
)
# Look up the record that represents
# https://academic.microsoft.com/paper/2518129109/ and return the fos_0_name
# values of its rows that have no fos_1 entry (only top level subjects).
subject_mag_raw %>%
  filter(doi == "10.7717/peerj.2369", is.na(fos_1)) %>%
  pull(fos_0_name)
#> [1] "biology" "population"
# Transformed subject table: every column is read as character.
subject_cleaned <- readr::read_csv(
  file = "data/npl_cleaned.csv",
  col_types = cols(.default = col_character())
)
# Same paper (https://academic.microsoft.com/paper/2518129109/), now looked up
# in the transformed table.
filter(subject_cleaned, doi == "10.7717/peerj.2369")
#> # A tibble: 1 x 2
#> doi top_fos
#> <chr> <chr>
#> 1 10.7717/peerj.2369 chemistry
# Is there more than one subject field per paper?
# First the raw table: keep the rows without a fos_1 entry, then retain only
# the DOIs that occur on more than one of those rows. The result stays grouped
# by doi.
subject_mag_raw %>%
  filter(is.na(fos_1)) %>%
  group_by(doi) %>%
  filter(n() >= 2L)
#> # A tibble: 3,468 x 19
#> # Groups: doi [1,563]
#> paperid doi year fos_0 fos_1 fos_1_name fos_2 fos_2_name fos_3 fos_3_name
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 2016611… 10.3… 2009 1971… <NA> <NA> <NA> <NA> <NA> <NA>
#> 2 2593039… 10.4… 2014 1274… <NA> <NA> <NA> <NA> <NA> <NA>
#> 3 2124652… 10.3… 2010 4100… <NA> <NA> <NA> <NA> <NA> <NA>
#> 4 3008890… 10.1… 2008 4100… <NA> <NA> <NA> <NA> <NA> <NA>
#> 5 1563381… 10.1… 2015 2908… <NA> <NA> <NA> <NA> <NA> <NA>
#> 6 2508225… 10.1… 2016 2908… <NA> <NA> <NA> <NA> <NA> <NA>
#> 7 33710999 10.1… 2009 2908… <NA> <NA> <NA> <NA> <NA> <NA>
#> 8 2088222… 10.1… 2015 1273… <NA> <NA> <NA> <NA> <NA> <NA>
#> 9 2102811… 10.1… 2010 8680… <NA> <NA> <NA> <NA> <NA> <NA>
#> 10 2745506… 10.1… 2017 8680… <NA> <NA> <NA> <NA> <NA> <NA>
#> # … with 3,458 more rows, and 9 more variables: fos_4 <chr>, fos_4_name <chr>,
#> # fos_5 <chr>, fos_5_name <chr>, fos_6 <chr>, fos_6_name <chr>, fos_7 <chr>,
#> # fos_7_name <chr>, fos_0_name <chr>
# Next the transformed table: retain only the DOIs that occur on more than one
# row. The result stays grouped by doi.
subject_cleaned %>%
  group_by(doi) %>%
  filter(n() >= 2L)
#> # A tibble: 0 x 2
#> # Groups: doi [0]
#> # … with 2 variables: doi <chr>, top_fos <chr>
# Question: is it valid to take the rows with a missing fos_1 and read the
# subject from fos_0_name? I am unsure whether this approach can be used.
subject_mag_raw %>%
  filter(is.na(fos_1)) %>%
  select(doi, fos_0_name)
#> # A tibble: 46,895 x 2
#> doi fos_0_name
#> <chr> <chr>
#> 1 10.3791/1603 merge
#> 2 10.1038/nature08729 materials science
#> 3 10.1016/j.jnoncrysol.2016.09.031 materials science
#> 4 10.1021/am403243g materials science
#> 5 10.1002/jbm.a.33235 materials science
#> 6 10.1088/1748-6041/8/1/014103 materials science
#> 7 10.1002/adma.201204127 materials science
#> 8 10.2217/nnm.11.19 materials science
#> 9 10.1002/smll.201102156 materials science
#> 10 10.1002/adfm.200700719 materials science
#> # … with 46,885 more rows
# It does not seem so: 10.3791/1603 is labelled "merge" here, whereas
# https://academic.microsoft.com/paper/2016611050/ lists different
# subjects for that paper.
# Created on 2021-07-27 by the reprex package (v2.0.0)
# Standard output and standard error
# -- nothing to show --