## kbenoit/text_as_data_an_overview.R

## text_as_data_an_overview.R
library(quanteda)
## Package version: 1.4.3
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
##     View

# inflation: keyword-in-context matches for "inflation" in the inaugural
# addresses, with a context window of 20
kwic(data_corpus_inaugural, pattern = phrase("inflation"), window = 20)
##
##  [1981-Reagan, 806]
##  [1985-Reagan, 468]
##  [1985-Reagan, 572]
##
##       born of bigotry or discrimination. Putting America back to work means putting all Americans back to work. Ending
##  Government that properly belonged to States or to local governments or to the people themselves. We allowed taxes and
##                           free to follow their dreams. And we were right to believe that. Tax rates have been reduced,
##
##  | inflation |
##  | inflation |
##  | inflation |
##
##  means freeing all Americans from the terror of runaway living costs. All must share in the productive work of
##  to rob us of our earnings and savings and watched the great industrial machine that had made us the most
##  cut dramatically, and more people are employed than ever before in our history. We are creating a nation
## workflow figure

# data_corpus_sotu is not part of quanteda itself; it has to be loaded from
# quanteda.corpora before it is first used, otherwise dfm() fails with
# "object 'data_corpus_sotu' not found".
# see https://github.com/quanteda/quanteda.corpora
data(data_corpus_sotu, package = "quanteda.corpora")

# document-feature matrix without punctuation or English stopwords, sorted
sotu_dfm <- dfm(data_corpus_sotu, remove_punct = TRUE) %>%
  dfm_remove(stopwords("en")) %>%
  dfm_sort()

# show a small slice: four selected addresses by five selected features
head(sotu_dfm[
  c("Clinton-2000", "Bush-2008", "Obama-2016", "Trump-2019"),
  c("economy", "united", "wall", "crime", "climate")
], nf = 8)


## "kind" for dictionaries

library("spacyr")

# the State of the Union corpus comes from quanteda.corpora
# see https://github.com/quanteda/quanteda.corpora
data(data_corpus_sotu, package = "quanteda.corpora")

# split into sentences, then rebuild a corpus from the "kind" matches only
corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences")
corp_kind <- corpus(
  kwic(corp_sents, "kind", window = 200),
  split_context = FALSE,
  extract_keyword = FALSE
)

# run the spaCy parser over those sentences to tag parts of speech
sp <- spacyr::spacy_parse(texts(corp_kind))
## Found 'spacy_condaenv'. spacyr will use this environment
## successfully initialized (spaCy Version: 2.1.0, language model: en)
## (python options: type = "condaenv", value = "spacy_condaenv")

# turn the parse into quanteda tokens of the form word/POS
toks <- as.tokens(sp, include_pos = "pos")

# count each tagged variant of "kind" and report counts and shares
tstat <- textstat_frequency(dfm(toks, select = "kind/*"))
tstat
##      feature frequency rank docfreq group
## 1  kind/noun       298    1     286   all
## 2   kind/adj        16    2      16   all
## 3   kind/adv         3    3       3   all
## 4 kind/propn         1    4       1   all
sum(tstat$frequency)
## [1] 318
prop.table(tstat$frequency)
## [1] 0.937106918 0.050314465 0.009433962 0.003144654


## illustrate sparsity

# dfm of the inaugural addresses up to 2019: lowercased, no punctuation or
# numbers
inaugdfm <- dfm(
  corpus_subset(data_corpus_inaugural, Year <= 2019),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  tolower = TRUE
)
inaugdfm
## Document-feature matrix of: 58 documents, 9,273 features (91.8% sparse).

# total number of cells in the matrix
prod(dim(inaugdfm))
## [1] 537834

# hapaxes: features whose total count over all documents is exactly one
hapaxes <- featnames(inaugdfm)[colSums(inaugdfm) == 1]
length(hapaxes)
## [1] 3846

# share of all features that are hapaxes
length(hapaxes) / nfeat(inaugdfm)
## [1] 0.4147525

# first 100 hapaxes in sorted order
head(sort(hapaxes), n = 100)
##   [1] "14th"             "18th"             "30th"
##   [4] "3d"               "4th"              "6th"
##   [7] "abate"            "abdicated"        "abeyance"
##  [10] "abhorring"        "abject"           "ably"
##  [13] "abode"            "abodes"           "abolishing"
##  [16] "aborigines"       "abound"           "abounds"
##  [19] "abridging"        "absolutism"       "absorb"
##  [22] "absorbed"         "absorbing"        "absorbs"
##  [25] "abstaining"       "abstract"         "abstractions"
##  [28] "absurd"           "academies"        "accepts"
##  [31] "accident"         "accidental"       "accidents"
##  [34] "accommodation"    "accommodations"   "accompany"
##  [37] "accorded"         "accords"          "accrue"
##  [40] "accrued"          "accruing"         "accumulate"
##  [43] "accumulated"      "accurately"       "accustom"
##  [46] "achieving"        "acknowledgment"   "acquaintance"
##  [49] "acquires"         "acquiring"        "acquit"
##  [52] "acrimony"         "actively"         "activism"
##  [55] "actuate"          "acute"            "adams"
##  [58] "addiction"        "additions"        "addresses"
##  [61] "adduced"          "adhered"          "adheres"
##  [64] "adjective"        "adjunct"          "adjustments"
##  [67] "administrated"    "administration's" "administrators"
##  [70] "admirably"        "admissions"       "admitting"
##  [73] "admonishes"       "admonitions"      "adopting"
##  [76] "adore"            "adoring"          "adorn"
##  [79] "adorns"           "adventurers"      "adventurously"
##  [82] "adverted"         "advisers"         "advisory"
##  [85] "advocates"        "affiliation"      "affirmation"
##  [88] "affirmations"     "afflict"          "affliction"
##  [91] "afghanistan"      "afield"           "afloat"
##  [94] "afresh"           "afte"             "aftermath"
##  [97] "aggravated"       "aggravation"      "aggressive"
## [100] "aggressor"
# look up the context of one of the hapaxes
kwic(data_corpus_inaugural, pattern = "aborigines", window = 20)
##
##  [1873-Grant, 951]
##
##  a specie basis; to the elevation of labor; and, by a humane course, to bring the
##
##  | aborigines |
##
##  of the country under the benign influences of education and civilization. It is either this or war of extermination


## uninteresting ngrams

# tokenize without punctuation, drop English stopwords (pad = TRUE when
# removing them), then form bigrams
toks <- tokens(data_corpus_inaugural, remove_punct = TRUE)
toks <- tokens_remove(toks, stopwords("en"), pad = TRUE)
toks <- tokens_ngrams(toks, n = 2)

# most frequent bigrams
topfeatures(dfm(toks))
##      united_states             let_us    fellow_citizens
##                157                 97                 78
##    american_people federal_government          years_ago
##                 40                 32                 26
##         four_years general_government            upon_us
##                 26                 25                 24
##      every_citizen
##                 18

## tokens to text to matrix

# two short example documents, named t1 and t2
txt <- c(
  t1 = "The Social Democratic Party opposes tax cuts for the wealthy.",
  t2 = "We are opposed to spending another 10 million on social welfare."
)

# default tokenization: punctuation and numbers are kept
tokens(txt)
## tokens from 2 documents.
## t1 :
##  [1] "The"        "Social"     "Democratic" "Party"      "opposes"
##  [6] "tax"        "cuts"       "for"        "the"        "wealthy"
## [11] "."
##
## t2 :
##  [1] "We"       "are"      "opposed"  "to"       "spending" "another"
##  [7] "10"       "million"  "on"       "social"   "welfare"  "."

# without punctuation
tokens(txt, remove_punct = TRUE)
## tokens from 2 documents.
## t1 :
##  [1] "The"        "Social"     "Democratic" "Party"      "opposes"
##  [6] "tax"        "cuts"       "for"        "the"        "wealthy"
##
## t2 :
##  [1] "We"       "are"      "opposed"  "to"       "spending" "another"
##  [7] "10"       "million"  "on"       "social"   "welfare"

# without numbers
tokens(txt, remove_numbers = TRUE)
## tokens from 2 documents.
## t1 :
##  [1] "The"        "Social"     "Democratic" "Party"      "opposes"
##  [6] "tax"        "cuts"       "for"        "the"        "wealthy"
## [11] "."
##
## t2 :
##  [1] "We"       "are"      "opposed"  "to"       "spending" "another"
##  [7] "million"  "on"       "social"   "welfare"  "."

# stemmed
tokens_wordstem(tokens(txt))
## tokens from 2 documents.
## t1 :
##  [1] "The"      "Social"   "Democrat" "Parti"    "oppos"    "tax"
##  [7] "cut"      "for"      "the"      "wealthi"  "."
##
## t2 :
##  [1] "We"      "are"     "oppos"   "to"      "spend"   "anoth"   "10"
##  [8] "million" "on"      "social"  "welfar"  "."

# stemmed, then lowercased
tokens_tolower(tokens_wordstem(tokens(txt)))
## tokens from 2 documents.
## t1 :
##  [1] "the"      "social"   "democrat" "parti"    "oppos"    "tax"
##  [7] "cut"      "for"      "the"      "wealthi"  "."
##
## t2 :
##  [1] "we"      "are"     "oppos"   "to"      "spend"   "anoth"   "10"
##  [8] "million" "on"      "social"  "welfar"  "."

# noun phrases found by spaCy
nounphrase_extract(spacy_parse(txt, nounphrase = TRUE))
##   doc_id sentence_id                  nounphrase
## 1     t1           1 The_Social_Democratic_Party
## 2     t1           1                    tax_cuts
## 3     t2           1                          We
## 4     t2           1              social_welfare

# named entities found by spaCy
entity_extract(spacy_parse(txt, entity = TRUE))
##   doc_id sentence_id                      entity entity_type
## 1     t1           1 The_Social_Democratic_Party         ORG

# noun phrases merged into single tokens, each token tagged as word/POS
as.tokens(
  nounphrase_consolidate(spacy_parse(txt, nounphrase = TRUE)),
  include_pos = "pos"
)
## tokens from 2 documents.
## t1 :
## [1] "The_Social_Democratic_Party/nounphrase"
## [2] "opposes/VERB"
## [3] "tax_cuts/nounphrase"
## [4] "for/ADP"
## [5] "the/DET"
## [6] "wealthy/ADJ"
## [7] "./PUNCT"
##
## t2 :
##  [1] "We/nounphrase"             "are/VERB"
##  [3] "opposed/VERB"              "to/ADP"
##  [5] "spending/VERB"             "another/DET"
##  [7] "10/NUM"                    "million/NUM"
##  [9] "on/ADP"                    "social_welfare/nounphrase"
## [11] "./PUNCT"

## annotating tokens with POS tags

# the same word form receives different tags depending on its use
tokens_select(
  as.tokens(
    spacyr::spacy_parse("My kind of friend is kind of kind."),
    include_pos = "pos"
  ),
  "kind/*"
)
## tokens from 1 document.
## text1 :
## [1] "kind/NOUN" "kind/ADV"  "kind/ADJ"

# "sanctions" appears once as a verb and once as a noun
as.tokens(
  spacyr::spacy_parse("The President sanctions the sanctions against Iran."),
  include_pos = "pos"
)
## tokens from 1 document.
## text1 :
## [1] "The/DET"         "President/PROPN" "sanctions/VERB"  "the/DET"
## [5] "sanctions/NOUN"  "against/ADP"     "Iran/PROPN"      "./PUNCT"

## similarity example

# the first two texts use the same words in a different order
txt <- c(
  "Party X prioritizes economic growth, even at the cost of environmental protection.",
  "Party X prioritizes environmental protection, even at the cost of economic growth.",
  "Party Y embraces protection of citizens through universal health care."
)

# pairwise cosine similarity between the documents of the dfm
textstat_simil(dfm(txt), method = "cosine")
##           text1     text2
## text2 1.0000000
## text3 0.3223292 0.3223292
	library(quanteda)
	## Package version: 1.4.3
	## Parallel computing: 2 of 12 threads used.
	## See https://quanteda.io for tutorials and examples.
	##
	## Attaching package: 'quanteda'
	## The following object is masked from 'package:utils':
	##
	## View

	# inflation: keyword-in-context matches for "inflation" in the inaugural
	# addresses, with a context window of 20
	kwic(data_corpus_inaugural, pattern = phrase("inflation"), window = 20)
	##
	## [1981-Reagan, 806]
	## [1985-Reagan, 468]
	## [1985-Reagan, 572]
	##
	## born of bigotry or discrimination. Putting America back to work means putting all Americans back to work. Ending
	## Government that properly belonged to States or to local governments or to the people themselves. We allowed taxes and
	## free to follow their dreams. And we were right to believe that. Tax rates have been reduced,
	##
	## | inflation |
	## | inflation |
	## | inflation |
	##
	## means freeing all Americans from the terror of runaway living costs. All must share in the productive work of
	## to rob us of our earnings and savings and watched the great industrial machine that had made us the most
	## cut dramatically, and more people are employed than ever before in our history. We are creating a nation
	## workflow figure

	# data_corpus_sotu is not part of quanteda itself; it has to be loaded from
	# quanteda.corpora before it is first used, otherwise dfm() fails with
	# "object 'data_corpus_sotu' not found".
	# see https://github.com/quanteda/quanteda.corpora
	data(data_corpus_sotu, package = "quanteda.corpora")

	# document-feature matrix without punctuation or English stopwords, sorted
	sotu_dfm <- dfm(data_corpus_sotu, remove_punct = TRUE) %>%
	  dfm_remove(stopwords("en")) %>%
	  dfm_sort()

	# show a small slice: four selected addresses by five selected features
	head(sotu_dfm[
	  c("Clinton-2000", "Bush-2008", "Obama-2016", "Trump-2019"),
	  c("economy", "united", "wall", "crime", "climate")
	], nf = 8)


	## "kind" for dictionaries

	library("spacyr")

	# the State of the Union corpus comes from quanteda.corpora
	# see https://github.com/quanteda/quanteda.corpora
	data(data_corpus_sotu, package = "quanteda.corpora")

	# split into sentences, then rebuild a corpus from the "kind" matches only
	corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences")
	corp_kind <- corpus(
	  kwic(corp_sents, "kind", window = 200),
	  split_context = FALSE,
	  extract_keyword = FALSE
	)

	# run the spaCy parser over those sentences to tag parts of speech
	sp <- spacyr::spacy_parse(texts(corp_kind))
	## Found 'spacy_condaenv'. spacyr will use this environment
	## successfully initialized (spaCy Version: 2.1.0, language model: en)
	## (python options: type = "condaenv", value = "spacy_condaenv")

	# turn the parse into quanteda tokens of the form word/POS
	toks <- as.tokens(sp, include_pos = "pos")

	# count each tagged variant of "kind" and report counts and shares
	tstat <- textstat_frequency(dfm(toks, select = "kind/*"))
	tstat
	##      feature frequency rank docfreq group
	## 1  kind/noun       298    1     286   all
	## 2   kind/adj        16    2      16   all
	## 3   kind/adv         3    3       3   all
	## 4 kind/propn         1    4       1   all
	sum(tstat$frequency)
	## [1] 318
	prop.table(tstat$frequency)
	## [1] 0.937106918 0.050314465 0.009433962 0.003144654


	## illustrate sparsity

	# dfm of the inaugural addresses up to 2019: lowercased, no punctuation or
	# numbers
	inaugdfm <- dfm(
	  corpus_subset(data_corpus_inaugural, Year <= 2019),
	  remove_punct = TRUE,
	  remove_numbers = TRUE,
	  tolower = TRUE
	)
	inaugdfm
	## Document-feature matrix of: 58 documents, 9,273 features (91.8% sparse).

	# total number of cells in the matrix
	prod(dim(inaugdfm))
	## [1] 537834

	# hapaxes: features whose total count over all documents is exactly one
	hapaxes <- featnames(inaugdfm)[colSums(inaugdfm) == 1]
	length(hapaxes)
	## [1] 3846

	# share of all features that are hapaxes
	length(hapaxes) / nfeat(inaugdfm)
	## [1] 0.4147525

	# first 100 hapaxes in sorted order
	head(sort(hapaxes), n = 100)
	## [1] "14th" "18th" "30th"
	## [4] "3d" "4th" "6th"
	## [7] "abate" "abdicated" "abeyance"
	## [10] "abhorring" "abject" "ably"
	## [13] "abode" "abodes" "abolishing"
	## [16] "aborigines" "abound" "abounds"
	## [19] "abridging" "absolutism" "absorb"
	## [22] "absorbed" "absorbing" "absorbs"
	## [25] "abstaining" "abstract" "abstractions"
	## [28] "absurd" "academies" "accepts"
	## [31] "accident" "accidental" "accidents"
	## [34] "accommodation" "accommodations" "accompany"
	## [37] "accorded" "accords" "accrue"
	## [40] "accrued" "accruing" "accumulate"
	## [43] "accumulated" "accurately" "accustom"
	## [46] "achieving" "acknowledgment" "acquaintance"
	## [49] "acquires" "acquiring" "acquit"
	## [52] "acrimony" "actively" "activism"
	## [55] "actuate" "acute" "adams"
	## [58] "addiction" "additions" "addresses"
	## [61] "adduced" "adhered" "adheres"
	## [64] "adjective" "adjunct" "adjustments"
	## [67] "administrated" "administration's" "administrators"
	## [70] "admirably" "admissions" "admitting"
	## [73] "admonishes" "admonitions" "adopting"
	## [76] "adore" "adoring" "adorn"
	## [79] "adorns" "adventurers" "adventurously"
	## [82] "adverted" "advisers" "advisory"
	## [85] "advocates" "affiliation" "affirmation"
	## [88] "affirmations" "afflict" "affliction"
	## [91] "afghanistan" "afield" "afloat"
	## [94] "afresh" "afte" "aftermath"
	## [97] "aggravated" "aggravation" "aggressive"
	## [100] "aggressor"
	# look up the context of one of the hapaxes
	kwic(data_corpus_inaugural, pattern = "aborigines", window = 20)
	##
	## [1873-Grant, 951]
	##
	## a specie basis; to the elevation of labor; and, by a humane course, to bring the
	##
	## | aborigines |
	##
	## of the country under the benign influences of education and civilization. It is either this or war of extermination


	## uninteresting ngrams

	# tokenize without punctuation, drop English stopwords (pad = TRUE when
	# removing them), then form bigrams
	toks <- tokens(data_corpus_inaugural, remove_punct = TRUE)
	toks <- tokens_remove(toks, stopwords("en"), pad = TRUE)
	toks <- tokens_ngrams(toks, n = 2)

	# most frequent bigrams
	topfeatures(dfm(toks))
	## united_states let_us fellow_citizens
	## 157 97 78
	## american_people federal_government years_ago
	## 40 32 26
	## four_years general_government upon_us
	## 26 25 24
	## every_citizen
	## 18

	## tokens to text to matrix

	# two short example documents, named t1 and t2
	txt <- c(
	  t1 = "The Social Democratic Party opposes tax cuts for the wealthy.",
	  t2 = "We are opposed to spending another 10 million on social welfare."
	)

	# default tokenization: punctuation and numbers are kept
	tokens(txt)
	## tokens from 2 documents.
	## t1 :
	##  [1] "The"        "Social"     "Democratic" "Party"      "opposes"
	##  [6] "tax"        "cuts"       "for"        "the"        "wealthy"
	## [11] "."
	##
	## t2 :
	##  [1] "We"       "are"      "opposed"  "to"       "spending" "another"
	##  [7] "10"       "million"  "on"       "social"   "welfare"  "."

	# without punctuation
	tokens(txt, remove_punct = TRUE)
	## tokens from 2 documents.
	## t1 :
	##  [1] "The"        "Social"     "Democratic" "Party"      "opposes"
	##  [6] "tax"        "cuts"       "for"        "the"        "wealthy"
	##
	## t2 :
	##  [1] "We"       "are"      "opposed"  "to"       "spending" "another"
	##  [7] "10"       "million"  "on"       "social"   "welfare"

	# without numbers
	tokens(txt, remove_numbers = TRUE)
	## tokens from 2 documents.
	## t1 :
	##  [1] "The"        "Social"     "Democratic" "Party"      "opposes"
	##  [6] "tax"        "cuts"       "for"        "the"        "wealthy"
	## [11] "."
	##
	## t2 :
	##  [1] "We"       "are"      "opposed"  "to"       "spending" "another"
	##  [7] "million"  "on"       "social"   "welfare"  "."

	# stemmed
	tokens_wordstem(tokens(txt))
	## tokens from 2 documents.
	## t1 :
	##  [1] "The"      "Social"   "Democrat" "Parti"    "oppos"    "tax"
	##  [7] "cut"      "for"      "the"      "wealthi"  "."
	##
	## t2 :
	##  [1] "We"      "are"     "oppos"   "to"      "spend"   "anoth"   "10"
	##  [8] "million" "on"      "social"  "welfar"  "."

	# stemmed, then lowercased
	tokens_tolower(tokens_wordstem(tokens(txt)))
	## tokens from 2 documents.
	## t1 :
	##  [1] "the"      "social"   "democrat" "parti"    "oppos"    "tax"
	##  [7] "cut"      "for"      "the"      "wealthi"  "."
	##
	## t2 :
	##  [1] "we"      "are"     "oppos"   "to"      "spend"   "anoth"   "10"
	##  [8] "million" "on"      "social"  "welfar"  "."

	# noun phrases found by spaCy
	nounphrase_extract(spacy_parse(txt, nounphrase = TRUE))
	##   doc_id sentence_id                  nounphrase
	## 1     t1           1 The_Social_Democratic_Party
	## 2     t1           1                    tax_cuts
	## 3     t2           1                          We
	## 4     t2           1              social_welfare

	# named entities found by spaCy
	entity_extract(spacy_parse(txt, entity = TRUE))
	##   doc_id sentence_id                      entity entity_type
	## 1     t1           1 The_Social_Democratic_Party         ORG

	# noun phrases merged into single tokens, each token tagged as word/POS
	as.tokens(
	  nounphrase_consolidate(spacy_parse(txt, nounphrase = TRUE)),
	  include_pos = "pos"
	)
	## tokens from 2 documents.
	## t1 :
	## [1] "The_Social_Democratic_Party/nounphrase"
	## [2] "opposes/VERB"
	## [3] "tax_cuts/nounphrase"
	## [4] "for/ADP"
	## [5] "the/DET"
	## [6] "wealthy/ADJ"
	## [7] "./PUNCT"
	##
	## t2 :
	## [1] "We/nounphrase" "are/VERB"
	## [3] "opposed/VERB" "to/ADP"
	## [5] "spending/VERB" "another/DET"
	## [7] "10/NUM" "million/NUM"
	## [9] "on/ADP" "social_welfare/nounphrase"
	## [11] "./PUNCT"

	## annotating tokens with POS tags

	# the same word form receives different tags depending on its use
	tokens_select(
	  as.tokens(
	    spacyr::spacy_parse("My kind of friend is kind of kind."),
	    include_pos = "pos"
	  ),
	  "kind/*"
	)
	## tokens from 1 document.
	## text1 :
	## [1] "kind/NOUN" "kind/ADV"  "kind/ADJ"

	# "sanctions" appears once as a verb and once as a noun
	as.tokens(
	  spacyr::spacy_parse("The President sanctions the sanctions against Iran."),
	  include_pos = "pos"
	)
	## tokens from 1 document.
	## text1 :
	## [1] "The/DET" "President/PROPN" "sanctions/VERB" "the/DET"
	## [5] "sanctions/NOUN" "against/ADP" "Iran/PROPN" "./PUNCT"

	## similarity example

	# the first two texts use the same words in a different order
	txt <- c(
	  "Party X prioritizes economic growth, even at the cost of environmental protection.",
	  "Party X prioritizes environmental protection, even at the cost of economic growth.",
	  "Party Y embraces protection of citizens through universal health care."
	)

	# pairwise cosine similarity between the documents of the dfm
	textstat_simil(dfm(txt), method = "cosine")
	## text1 text2
	## text2 1.0000000
	## text3 0.3223292 0.3223292