## ngram following http://tidytextmining.com/ngrams.html
library(dplyr)
library(tidytext)
library(tidyr)
# Load a manually downloaded Web of Science data dump (newline-delimited
# JSON, one record per publication) and drop records with no abstract.
# AB = abstract text, UT = unique WoS record identifier.
tt <- jsonlite::stream_in(file("data/wos_total.json"), verbose = FALSE) %>%
  filter(!is.na(AB))
# Keep only the two columns the analysis needs.
# NOTE: data_frame() is deprecated in tibble >= 1.1.0 — use tibble()
# (re-exported by dplyr, which is already attached above).
text_df <- tibble(abstracts = tt$AB, pubs = tt$UT)
# Tokenize each abstract into bigrams (all adjacent word pairs).
# BUG FIX: abstracts shorter than two tokens make unnest_tokens() emit NA
# bigrams, and `!NA %in% stop_words$word` evaluates to TRUE, so NA rows
# would silently pass both stop-word filters and pollute the counts —
# drop them explicitly here.
bi_bigram <- text_df %>%
  unnest_tokens(bigram, abstracts, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))
# Split each bigram into its two component words.
bigrams_separated <- bi_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Remove bigrams in which either word is a stop word
# (tidytext::stop_words lexicon).
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# Count the surviving (word1, word2) pairs, most frequent first.
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
bigram_counts
#> Source: local data frame [111,737 x 3]
#> Groups: word1 [18,712]
#>
#> word1 word2 n
#> <chr> <chr> <int>
#> 1 rights reserved 331
#> 2 heart failure 246
#> 3 pol ii 224
#> 4 magnetic resonance 213
#> 5 white matter 171
#> 6 nervous system 170
#> 7 plasma membrane 169
#> 8 wild type 159
#> 9 optic nerve 156
#> 10 molecular dynamics 142
#> # ... with 111,727 more rows
## Tidy Text Mining of Web of Science Abstracts
## (The lines that previously followed here were GitHub gist page chrome
##  — "Save to your computer", "Sign up", etc. — scrape residue that was
##  not part of the script and would break R parsing.)