## ngram following http://tidytextmining.com/ngrams.html
library(dplyr)
library(tidytext)
library(tidyr)
# Load manually downloaded Web of Science data dump (NDJSON) and keep only
# records that have an abstract (AB field).
tt <- jsonlite::stream_in(file("data/wos_total.json"), verbose = FALSE) %>%
  filter(!is.na(AB))
# One row per publication: abstract text plus its WoS accession number (UT).
# NOTE: data_frame() is deprecated in tibble; tibble() is the replacement
# (re-exported by dplyr, which is already attached above).
text_df <- tibble(abstracts = tt$AB, pubs = tt$UT)
# Tokenize each abstract into bigrams: one row per pair of consecutive words,
# lower-cased, in a column named "bigram".
bi_bigram <- unnest_tokens(text_df, bigram, abstracts, token = "ngrams", n = 2)
# Split each bigram into its two component words so they can be
# screened against the stop-word lexicon individually.
bigrams_separated <- bi_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only bigrams where neither word is a stop word
# (stop_words is the lexicon shipped with tidytext).
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word)
# Count distinct (word1, word2) pairs, most frequent first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
# Print the top of the table (output reproduced in the comments below).
bigram_counts
#> Source: local data frame [111,737 x 3]
#> Groups: word1 [18,712]
#>
#> word1 word2 n
#> <chr> <chr> <int>
#> 1 rights reserved 331
#> 2 heart failure 246
#> 3 pol ii 224
#> 4 magnetic resonance 213
#> 5 white matter 171
#> 6 nervous system 170
#> 7 plasma membrane 169
#> 8 wild type 159
#> 9 optic nerve 156
#> 10 molecular dynamics 142
#> # ... with 111,727 more rows
## Provenance (scraped gist page metadata, kept as comments so this file
## remains a valid R script):
## Gist: njahn82/70538a354ed7dc895483da086cac9f75
## Title: Tidy Text Mining of Web of Science Abstracts
## Last active: April 16, 2017 19:55