njahn82/mag_subject.md

## mag_subject.md

      
    Raw
  

              mag_subject.md
            
          
    library(tidyverse)
# Load the raw MAG subject table; every column is read as character.
subject_mag_raw <- readr::read_csv(
  "data/NPL_DOI_FOS.csv",
  col_types = cols(.default = "c")
)
# Example paper: DOI 10.7717/peerj.2369 is
# https://academic.microsoft.com/paper/2518129109/.
# Keep only its top-level subjects (rows without a fos_1 entry) and
# return their names.
subject_mag_raw %>%
  filter(doi == "10.7717/peerj.2369", is.na(fos_1)) %>%
  pull(fos_0_name)
#> [1] "biology"    "population"
# Load the transformed (cleaned) subject table, again all-character.
subject_cleaned <- readr::read_csv(
  "data/npl_cleaned.csv",
  col_types = cols(.default = "c")
)
# Look up the same example paper
# (https://academic.microsoft.com/paper/2518129109/) in the cleaned table.
filter(subject_cleaned, doi == "10.7717/peerj.2369")
#> # A tibble: 1 x 2
#>   doi                top_fos  
#>   <chr>              <chr>    
#> 1 10.7717/peerj.2369 chemistry
# Can a paper carry more than one subject field?
# Raw table first: among the top-level rows (no fos_1 entry), keep every
# DOI that occurs on at least two rows.
filter(subject_mag_raw, is.na(fos_1)) %>%
  group_by(doi) %>%
  filter(n() >= 2L)
#> # A tibble: 3,468 x 19
#> # Groups:   doi [1,563]
#>    paperid  doi   year  fos_0 fos_1 fos_1_name fos_2 fos_2_name fos_3 fos_3_name
#>    <chr>    <chr> <chr> <chr> <chr> <chr>      <chr> <chr>      <chr> <chr>     
#>  1 2016611… 10.3… 2009  1971… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#>  2 2593039… 10.4… 2014  1274… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#>  3 2124652… 10.3… 2010  4100… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#>  4 3008890… 10.1… 2008  4100… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#>  5 1563381… 10.1… 2015  2908… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#>  6 2508225… 10.1… 2016  2908… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#>  7 33710999 10.1… 2009  2908… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#>  8 2088222… 10.1… 2015  1273… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#>  9 2102811… 10.1… 2010  8680… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#> 10 2745506… 10.1… 2017  8680… <NA>  <NA>       <NA>  <NA>       <NA>  <NA>      
#> # … with 3,458 more rows, and 9 more variables: fos_4 <chr>, fos_4_name <chr>,
#> #   fos_5 <chr>, fos_5_name <chr>, fos_6 <chr>, fos_6_name <chr>, fos_7 <chr>,
#> #   fos_7_name <chr>, fos_0_name <chr>
# Same check on the cleaned table: DOIs that occur on at least two rows.
group_by(subject_cleaned, doi) %>%
  filter(n() >= 2L)
#> # A tibble: 0 x 2
#> # Groups:   doi [0]
#> # … with 2 variables: doi <chr>, top_fos <chr>
# Open question: is it valid to derive the subject per DOI this way,
# i.e. keep the rows without a fos_1 entry and read fos_0_name?
filter(subject_mag_raw, is.na(fos_1)) %>%
  select(doi, fos_0_name)
#> # A tibble: 46,895 x 2
#>    doi                              fos_0_name       
#>    <chr>                            <chr>            
#>  1 10.3791/1603                     merge            
#>  2 10.1038/nature08729              materials science
#>  3 10.1016/j.jnoncrysol.2016.09.031 materials science
#>  4 10.1021/am403243g                materials science
#>  5 10.1002/jbm.a.33235              materials science
#>  6 10.1088/1748-6041/8/1/014103     materials science
#>  7 10.1002/adma.201204127           materials science
#>  8 10.2217/nnm.11.19                materials science
#>  9 10.1002/smll.201102156           materials science
#> 10 10.1002/adfm.200700719           materials science
#> # … with 46,885 more rows
# Apparently not: 10.3791/1603 comes out as "merge", whereas
# https://academic.microsoft.com/paper/2016611050/ lists
# different subjects.
<sup>Created on 2021-07-27 by the [reprex package](https://reprex.tidyverse.org) (v2.0.0)</sup>


Standard output and standard error

-- nothing to show --