# Packages ----
library(tidyverse)
library(tidytext)
library(tm)
#> Loading required package: NLP
#>
#> Attaching package: 'NLP'
#> The following object is masked from 'package:ggplot2':
#>
#> annotate
library(rsample)

# Fix the RNG seed so initial_split() draws the same rows each run
set.seed(123)

# Load the USCongress bill descriptions; the text column arrives as a
# factor, so coerce it to character before tokenizing
data(USCongress, package = "RTextTools")
congress <- USCongress %>%
  as_tibble() %>%
  mutate(text = as.character(text))

# Partition the bills 70/30 into training and test sets
congress_split <- initial_split(congress, prop = 0.7)
congress_train <- training(congress_split)
congress_test <- testing(congress_split)
# Structural check: the full corpus and both partitions carry the same
# six columns, with the 4,449 bills split roughly 70/30
congress %>% glimpse()
#> Observations: 4,449
#> Variables: 6
#> $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
#> $ cong <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum <int> 4499, 4500, 4501, 4502, 4503, 4504, 4505, 4506, 4507, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major <int> 18, 18, 18, 18, 5, 21, 15, 18, 18, 18, 18, 16, 18, 12, …
#> $ text <chr> "To suspend temporarily the duty on Fast Magenta 2 Stag…
congress_train %>% glimpse()
#> Observations: 3,115
#> Variables: 6
#> $ ID <int> 1, 6, 7, 8, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22,…
#> $ cong <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum <int> 4499, 4504, 4505, 4506, 4508, 4509, 4510, 4511, 4514, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major <int> 18, 21, 15, 18, 18, 18, 16, 18, 3, 3, 18, 18, 18, 18, 1…
#> $ text <chr> "To suspend temporarily the duty on Fast Magenta 2 Stag…
congress_test %>% glimpse()
#> Observations: 1,334
#> Variables: 6
#> $ ID <int> 2, 3, 4, 5, 9, 14, 15, 25, 26, 27, 34, 35, 39, 40, 45, …
#> $ cong <int> 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, …
#> $ billnum <int> 4500, 4501, 4502, 4503, 4507, 4512, 4513, 4523, 4524, 4…
#> $ h_or_sen <fct> HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR, HR,…
#> $ major <int> 18, 18, 18, 5, 18, 12, 2, 18, 19, 18, 18, 18, 18, 18, 9…
#> $ text <chr> "To suspend temporarily the duty on Fast Black 286 Stag…
# Build a document-term matrix from a data frame of bills.
#
# The same tokenize → clean → count → cast pipeline was previously
# copy-pasted three times; it is factored out here so the three DTMs
# are guaranteed to be built identically.
#
# df: tibble with an `ID` column (document identifier) and a `text`
#     column (bill description).
# Returns: a tm::DocumentTermMatrix with term-frequency (raw count)
#     weighting, one row per ID and one column per stemmed word.
create_dtm <- function(df) {
  df %>%
    # tokenize: one row per (document, word)
    unnest_tokens(output = word, input = text) %>%
    # remove purely numeric tokens
    filter(!str_detect(word, "^[0-9]*$")) %>%
    # remove stop words; explicit `by` silences the
    # "Joining, by = \"word\"" message
    anti_join(stop_words, by = "word") %>%
    # stem so inflected forms of a word share one column
    mutate(word = SnowballC::wordStem(word)) %>%
    # count of each token in each document (tf weighting)
    count(ID, word) %>%
    # cast tidy counts into a sparse document-term matrix
    cast_dtm(document = ID, term = word, value = n)
}

# dtm of the full corpus
congress_dtm <- create_dtm(congress)
# dtm of the training set
congress_train_dtm <- create_dtm(congress_train)
# dtm of the test set
# NOTE(review): this DTM is built from the test documents alone, so its
# vocabulary differs from the training DTM's. Before using it for
# prediction, align its columns to the training dictionary (e.g. subset
# to Terms(congress_train_dtm), filling unseen terms with zeros) —
# otherwise a model fit on congress_train_dtm cannot score it.
congress_test_dtm <- create_dtm(congress_test)
# number of unique words in dictionary
# NOTE(review): the three counts below differ because each DTM was built
# from its own subset of documents, so each has its own vocabulary. The
# test DTM (2,721 terms) does not share the training DTM's dictionary
# (4,183 terms) — a model fit on the training DTM cannot directly score
# the test DTM until the test columns are aligned to the training terms.
ncol(congress_dtm)
#> [1] 4902
ncol(congress_train_dtm)
#> [1] 4183
ncol(congress_test_dtm)
#> [1] 2721
# Created on 2019-03-10 by the reprex package (v0.2.1)
# Session info
# Record the R version, platform, and package versions that produced
# the output above (printed in the comments that follow)
devtools::session_info()
#> ─ Session info ──────────────────────────────────────────────────────────
#> setting value
#> version R version 3.5.2 (2018-12-20)
#> os macOS Mojave 10.14.3
#> system x86_64, darwin15.6.0
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz America/Chicago
#> date 2019-03-10
#>
#> ─ Packages ──────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.0 2017-04-11 [2] CRAN (R 3.5.0)
#> backports 1.1.3 2018-12-14 [2] CRAN (R 3.5.0)
#> broom 0.5.1 2018-12-05 [2] CRAN (R 3.5.0)
#> callr 3.1.1 2018-12-21 [2] CRAN (R 3.5.0)
#> cellranger 1.1.0 2016-07-27 [2] CRAN (R 3.5.0)
#> cli 1.0.1 2018-09-25 [1] CRAN (R 3.5.0)
#> colorspace 1.4-0 2019-01-13 [2] CRAN (R 3.5.2)
#> crayon 1.3.4 2017-09-16 [2] CRAN (R 3.5.0)
#> desc 1.2.0 2018-05-01 [2] CRAN (R 3.5.0)
#> devtools 2.0.1 2018-10-26 [1] CRAN (R 3.5.1)
#> digest 0.6.18 2018-10-10 [1] CRAN (R 3.5.0)
#> dplyr * 0.8.0.1 2019-02-15 [1] CRAN (R 3.5.2)
#> evaluate 0.13 2019-02-12 [2] CRAN (R 3.5.2)
#> fansi 0.4.0 2018-10-05 [2] CRAN (R 3.5.0)
#> forcats * 0.4.0 2019-02-17 [2] CRAN (R 3.5.2)
#> fs 1.2.6 2018-08-23 [1] CRAN (R 3.5.0)
#> generics 0.0.2 2018-11-29 [1] CRAN (R 3.5.0)
#> ggplot2 * 3.1.0 2018-10-25 [1] CRAN (R 3.5.0)
#> glue 1.3.0 2018-07-17 [2] CRAN (R 3.5.0)
#> gtable 0.2.0 2016-02-26 [2] CRAN (R 3.5.0)
#> haven 2.1.0 2019-02-19 [2] CRAN (R 3.5.2)
#> highr 0.7 2018-06-09 [2] CRAN (R 3.5.0)
#> hms 0.4.2 2018-03-10 [2] CRAN (R 3.5.0)
#> htmltools 0.3.6 2017-04-28 [1] CRAN (R 3.5.0)
#> httr 1.4.0 2018-12-11 [2] CRAN (R 3.5.0)
#> janeaustenr 0.1.5 2017-06-10 [2] CRAN (R 3.5.0)
#> jsonlite 1.6 2018-12-07 [2] CRAN (R 3.5.0)
#> knitr 1.21 2018-12-10 [2] CRAN (R 3.5.1)
#> lattice 0.20-38 2018-11-04 [2] CRAN (R 3.5.2)
#> lazyeval 0.2.1 2017-10-29 [2] CRAN (R 3.5.0)
#> lubridate 1.7.4 2018-04-11 [2] CRAN (R 3.5.0)
#> magrittr 1.5 2014-11-22 [2] CRAN (R 3.5.0)
#> Matrix 1.2-15 2018-11-01 [2] CRAN (R 3.5.2)
#> memoise 1.1.0 2017-04-21 [2] CRAN (R 3.5.0)
#> modelr 0.1.4 2019-02-18 [2] CRAN (R 3.5.2)
#> munsell 0.5.0 2018-06-12 [2] CRAN (R 3.5.0)
#> nlme 3.1-137 2018-04-07 [2] CRAN (R 3.5.2)
#> NLP * 0.2-0 2018-10-18 [2] CRAN (R 3.5.0)
#> pillar 1.3.1 2018-12-15 [2] CRAN (R 3.5.0)
#> pkgbuild 1.0.2 2018-10-16 [1] CRAN (R 3.5.0)
#> pkgconfig 2.0.2 2018-08-16 [2] CRAN (R 3.5.1)
#> pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.5.0)
#> plyr 1.8.4 2016-06-08 [2] CRAN (R 3.5.0)
#> prettyunits 1.0.2 2015-07-13 [2] CRAN (R 3.5.0)
#> processx 3.2.1 2018-12-05 [2] CRAN (R 3.5.0)
#> ps 1.3.0 2018-12-21 [2] CRAN (R 3.5.0)
#> purrr * 0.3.0 2019-01-27 [2] CRAN (R 3.5.2)
#> R6 2.4.0 2019-02-14 [1] CRAN (R 3.5.2)
#> Rcpp 1.0.0 2018-11-07 [1] CRAN (R 3.5.0)
#> readr * 1.3.1 2018-12-21 [2] CRAN (R 3.5.0)
#> readxl 1.3.0 2019-02-15 [2] CRAN (R 3.5.2)
#> remotes 2.0.2 2018-10-30 [1] CRAN (R 3.5.0)
#> rlang 0.3.1 2019-01-08 [1] CRAN (R 3.5.2)
#> rmarkdown 1.11 2018-12-08 [2] CRAN (R 3.5.0)
#> rprojroot 1.3-2 2018-01-03 [2] CRAN (R 3.5.0)
#> rsample * 0.0.4 2019-01-07 [1] CRAN (R 3.5.2)
#> rvest 0.3.2 2016-06-17 [2] CRAN (R 3.5.0)
#> scales 1.0.0 2018-08-09 [1] CRAN (R 3.5.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.5.0)
#> slam 0.1-44 2018-12-21 [1] CRAN (R 3.5.0)
#> SnowballC 0.6.0 2019-01-15 [2] CRAN (R 3.5.2)
#> stringi 1.3.1 2019-02-13 [1] CRAN (R 3.5.2)
#> stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.5.2)
#> testthat 2.0.1 2018-10-13 [2] CRAN (R 3.5.0)
#> tibble * 2.0.1 2019-01-12 [2] CRAN (R 3.5.2)
#> tidyr * 0.8.2.9000 2019-02-11 [1] Github (tidyverse/tidyr@0b27690)
#> tidyselect 0.2.5 2018-10-11 [1] CRAN (R 3.5.0)
#> tidytext * 0.2.0 2018-10-17 [1] CRAN (R 3.5.0)
#> tidyverse * 1.2.1 2017-11-14 [2] CRAN (R 3.5.0)
#> tm * 0.7-6 2018-12-21 [2] CRAN (R 3.5.0)
#> tokenizers 0.2.1 2018-03-29 [2] CRAN (R 3.5.0)
#> usethis 1.4.0 2018-08-14 [1] CRAN (R 3.5.0)
#> utf8 1.1.4 2018-05-24 [2] CRAN (R 3.5.0)
#> withr 2.1.2 2018-03-15 [2] CRAN (R 3.5.0)
#> xfun 0.5 2019-02-20 [1] CRAN (R 3.5.2)
#> xml2 1.2.0 2018-01-24 [2] CRAN (R 3.5.0)
#> yaml 2.2.0 2018-07-25 [2] CRAN (R 3.5.0)
#>
#> [1] /Users/soltoffbc/Library/R/3.5/library
#> [2] /Library/Frameworks/R.framework/Versions/3.5/Resources/library