## ngram following http://tidytextmining.com/ngrams.html
library(dplyr)
library(tidytext)
library(tidyr)
# Load manually downloaded Web of Science data dump (NDJSON) and keep only
# records that have an abstract (AB field).
tt <- jsonlite::stream_in(file("data/wos_total.json"), verbose = FALSE) %>%
  filter(!is.na(AB))
# One row per publication: abstract text plus its WoS accession number (UT).
# NOTE: data_frame() is deprecated in tibble; tibble() is the replacement
# (re-exported by dplyr, which is already attached above).
text_df <- tibble(abstracts = tt$AB, pubs = tt$UT)
# Tokenize each abstract into bigrams: one row per pair of consecutive words,
# lower-cased, in a column named "bigram".
bi_bigram <- unnest_tokens(text_df, bigram, abstracts, token = "ngrams", n = 2)
# Split each bigram into its two component words so they can be
# screened against the stop-word lexicon individually.
bigrams_separated <- bi_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only bigrams where neither word is a stop word
# (stop_words is the lexicon shipped with tidytext).
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word)
# Count distinct (word1, word2) pairs, most frequent first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
# Print the top of the table (output reproduced in the comments below).
bigram_counts
#> Source: local data frame [111,737 x 3]
#> Groups: word1 [18,712]
#>
#> word1 word2 n
#> <chr> <chr> <int>
#> 1 rights reserved 331
#> 2 heart failure 246
#> 3 pol ii 224
#> 4 magnetic resonance 213
#> 5 white matter 171
#> 6 nervous system 170
#> 7 plasma membrane 169
#> 8 wild type 159
#> 9 optic nerve 156
#> 10 molecular dynamics 142
#> # ... with 111,727 more rows
## Provenance (scraped gist page metadata, kept as comments so this file
## remains a valid R script):
## Gist: njahn82/70538a354ed7dc895483da086cac9f75
## Title: Tidy Text Mining of Web of Science Abstracts
## Last active: April 16, 2017 19:55