Skip to content

Instantly share code, notes, and snippets.

@kbenoit
Created March 21, 2019 06:45
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kbenoit/4593ca1deeb4d890077f3b12ba468888 to your computer and use it in GitHub Desktop.
Save kbenoit/4593ca1deeb4d890077f3b12ba468888 to your computer and use it in GitHub Desktop.
Analysis from Text as Data: An Overview
library(quanteda)
## Package version: 1.4.3
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# inflation: keyword-in-context listing of every match in the inaugural
# addresses, with a context window of 20
kwic(data_corpus_inaugural, pattern = phrase("inflation"), window = 20)
##
## [1981-Reagan, 806]
## [1985-Reagan, 468]
## [1985-Reagan, 572]
##
## born of bigotry or discrimination. Putting America back to work means putting all Americans back to work. Ending
## Government that properly belonged to States or to local governments or to the people themselves. We allowed taxes and
## free to follow their dreams. And we were right to believe that. Tax rates have been reduced,
##
## | inflation |
## | inflation |
## | inflation |
##
## means freeing all Americans from the terror of runaway living costs. All must share in the productive work of
## to rob us of our earnings and savings and watched the great industrial machine that had made us the most
## cut dramatically, and more people are employed than ever before in our history. We are creating a nation
## workflow figure
# data_corpus_sotu is not part of quanteda itself: it comes from the
# quanteda.corpora package (https://github.com/quanteda/quanteda.corpora) and
# has to be loaded before its first use, otherwise dfm() stops with
# "object 'data_corpus_sotu' not found".
data(data_corpus_sotu, package = "quanteda.corpora")
# build the document-feature matrix without punctuation, drop English
# stopwords, then sort it
sotu_dfm <- dfm(data_corpus_sotu, remove_punct = TRUE) %>%
  dfm_remove(stopwords("en")) %>%
  dfm_sort()
# print a small slice: four selected addresses by five selected features
head(sotu_dfm[
  c("Clinton-2000", "Bush-2008", "Obama-2016", "Trump-2019"),
  c("economy", "united", "wall", "crime", "climate")
], nf = 8)
## "kind" for dictionaries
library(spacyr)
# the State of the Union corpus is provided by quanteda.corpora,
# see https://github.com/quanteda/quanteda.corpora
data("data_corpus_sotu", package = "quanteda.corpora")
# reshape the corpus to sentences, then turn the kwic() matches for "kind"
# (window of 200) back into a corpus
corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences")
corp_kind <- corpus(
  kwic(corp_sents, pattern = "kind", window = 200),
  split_context = FALSE,
  extract_keyword = FALSE
)
# parse the texts with spaCy to obtain part-of-speech tags
sp <- spacyr::spacy_parse(texts(corp_kind))
## Found 'spacy_condaenv'. spacyr will use this environment
## successfully initialized (spaCy Version: 2.1.0, language model: en)
## (python options: type = "condaenv", value = "spacy_condaenv")
# turn the spaCy parse into quanteda tokens that carry their POS tag
toks <- as.tokens(sp, include_pos = "pos")
# keep only the "kind/<tag>" features and tabulate their frequencies
tstat <- textstat_frequency(dfm(toks, select = "kind/*"))
tstat
## feature frequency rank docfreq group
## 1 kind/noun 298 1 286 all
## 2 kind/adj 16 2 16 all
## 3 kind/adv 3 3 3 all
## 4 kind/propn 1 4 1 all
# total count over all tagged variants of "kind"
sum(tstat[["frequency"]])
## [1] 318
# each variant's share of that total
prop.table(tstat[["frequency"]])
## [1] 0.937106918 0.050314465 0.009433962 0.003144654
## illustrate sparsity
# document-feature matrix of the inaugural addresses up to 2019, lowercased
# and without punctuation or numbers
inaugdfm <- dfm(
  corpus_subset(data_corpus_inaugural, Year <= 2019),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  tolower = TRUE
)
inaugdfm
## Document-feature matrix of: 58 documents, 9,273 features (91.8% sparse).
# number of cells in the matrix (documents x features)
prod(dim(inaugdfm))
## [1] 537834
# hapaxes: features whose count summed over all documents is exactly one
hapaxes <- featnames(inaugdfm)[colSums(inaugdfm) == 1]
length(hapaxes)
## [1] 3846
# proportion of all features that are hapaxes
length(hapaxes) / nfeat(inaugdfm)
## [1] 0.4147525
# the first 100 hapaxes in sorted order
head(sort(hapaxes), n = 100)
## [1] "14th" "18th" "30th"
## [4] "3d" "4th" "6th"
## [7] "abate" "abdicated" "abeyance"
## [10] "abhorring" "abject" "ably"
## [13] "abode" "abodes" "abolishing"
## [16] "aborigines" "abound" "abounds"
## [19] "abridging" "absolutism" "absorb"
## [22] "absorbed" "absorbing" "absorbs"
## [25] "abstaining" "abstract" "abstractions"
## [28] "absurd" "academies" "accepts"
## [31] "accident" "accidental" "accidents"
## [34] "accommodation" "accommodations" "accompany"
## [37] "accorded" "accords" "accrue"
## [40] "accrued" "accruing" "accumulate"
## [43] "accumulated" "accurately" "accustom"
## [46] "achieving" "acknowledgment" "acquaintance"
## [49] "acquires" "acquiring" "acquit"
## [52] "acrimony" "actively" "activism"
## [55] "actuate" "acute" "adams"
## [58] "addiction" "additions" "addresses"
## [61] "adduced" "adhered" "adheres"
## [64] "adjective" "adjunct" "adjustments"
## [67] "administrated" "administration's" "administrators"
## [70] "admirably" "admissions" "admitting"
## [73] "admonishes" "admonitions" "adopting"
## [76] "adore" "adoring" "adorn"
## [79] "adorns" "adventurers" "adventurously"
## [82] "adverted" "advisers" "advisory"
## [85] "advocates" "affiliation" "affirmation"
## [88] "affirmations" "afflict" "affliction"
## [91] "afghanistan" "afield" "afloat"
## [94] "afresh" "afte" "aftermath"
## [97] "aggravated" "aggravation" "aggressive"
## [100] "aggressor"
# look up one of the hapaxes in context, with a window of 20
kwic(data_corpus_inaugural, pattern = "aborigines", window = 20)
##
## [1873-Grant, 951]
##
## a specie basis; to the elevation of labor; and, by a humane course, to bring the
##
## | aborigines |
##
## of the country under the benign influences of education and civilization. It is either this or war of extermination
## uninteresting ngrams
# tokenize without punctuation, remove English stopwords (with padding),
# then form bigrams
toks <- tokens_ngrams(
  tokens_remove(
    tokens(data_corpus_inaugural, remove_punct = TRUE),
    stopwords("en"),
    pad = TRUE
  ),
  n = 2
)
# most frequent bigrams
topfeatures(dfm(toks))
## united_states let_us fellow_citizens
## 157 97 78
## american_people federal_government years_ago
## 40 32 26
## four_years general_government upon_us
## 26 25 24
## every_citizen
## 18
## tokens to text to matrix
# two example documents, named t1 and t2
txt <- c(
  t1 = "The Social Democratic Party opposes tax cuts for the wealthy.",
  t2 = "We are opposed to spending another 10 million on social welfare."
)
# tokenize with the default settings
tokens(x = txt)
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democratic" "Party" "opposes"
## [6] "tax" "cuts" "for" "the" "wealthy"
## [11] "."
##
## t2 :
## [1] "We" "are" "opposed" "to" "spending" "another"
## [7] "10" "million" "on" "social" "welfare" "."
# same tokenization, with punctuation removed
tokens(x = txt, remove_punct = TRUE)
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democratic" "Party" "opposes"
## [6] "tax" "cuts" "for" "the" "wealthy"
##
## t2 :
## [1] "We" "are" "opposed" "to" "spending" "another"
## [7] "10" "million" "on" "social" "welfare"
# same tokenization, with numbers removed
tokens(x = txt, remove_numbers = TRUE)
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democratic" "Party" "opposes"
## [6] "tax" "cuts" "for" "the" "wealthy"
## [11] "."
##
## t2 :
## [1] "We" "are" "opposed" "to" "spending" "another"
## [7] "million" "on" "social" "welfare" "."
# tokenize, then stem each token
tokens_wordstem(tokens(txt))
## tokens from 2 documents.
## t1 :
## [1] "The" "Social" "Democrat" "Parti" "oppos" "tax"
## [7] "cut" "for" "the" "wealthi" "."
##
## t2 :
## [1] "We" "are" "oppos" "to" "spend" "anoth" "10"
## [8] "million" "on" "social" "welfar" "."
# tokenize, stem, and finally lowercase the stemmed tokens
tokens_tolower(tokens_wordstem(tokens(txt)))
## tokens from 2 documents.
## t1 :
## [1] "the" "social" "democrat" "parti" "oppos" "tax"
## [7] "cut" "for" "the" "wealthi" "."
##
## t2 :
## [1] "we" "are" "oppos" "to" "spend" "anoth" "10"
## [8] "million" "on" "social" "welfar" "."
# parse with noun-phrase detection switched on and list the noun phrases
nounphrase_extract(spacy_parse(txt, nounphrase = TRUE))
## doc_id sentence_id nounphrase
## 1 t1 1 The_Social_Democratic_Party
## 2 t1 1 tax_cuts
## 3 t2 1 We
## 4 t2 1 social_welfare
# parse with entity recognition switched on and list the entities found
entity_extract(spacy_parse(txt, entity = TRUE))
## doc_id sentence_id entity entity_type
## 1 t1 1 The_Social_Democratic_Party ORG
# consolidate the detected noun phrases in the parse, then convert the result
# to quanteda tokens with the tag appended
as.tokens(
  nounphrase_consolidate(spacy_parse(txt, nounphrase = TRUE)),
  include_pos = "pos"
)
## tokens from 2 documents.
## t1 :
## [1] "The_Social_Democratic_Party/nounphrase"
## [2] "opposes/VERB"
## [3] "tax_cuts/nounphrase"
## [4] "for/ADP"
## [5] "the/DET"
## [6] "wealthy/ADJ"
## [7] "./PUNCT"
##
## t2 :
## [1] "We/nounphrase" "are/VERB"
## [3] "opposed/VERB" "to/ADP"
## [5] "spending/VERB" "another/DET"
## [7] "10/NUM" "million/NUM"
## [9] "on/ADP" "social_welfare/nounphrase"
## [11] "./PUNCT"
## annotating tokens with POS tags
# tag a sentence that uses "kind" three times, then keep only the
# "kind/<tag>" tokens
tokens_select(
  as.tokens(
    spacyr::spacy_parse("My kind of friend is kind of kind."),
    include_pos = "pos"
  ),
  "kind/*"
)
## tokens from 1 document.
## text1 :
## [1] "kind/NOUN" "kind/ADV" "kind/ADJ"
# tag a sentence in which "sanctions" appears twice
as.tokens(
  spacyr::spacy_parse("The President sanctions the sanctions against Iran."),
  include_pos = "pos"
)
## tokens from 1 document.
## text1 :
## [1] "The/DET" "President/PROPN" "sanctions/VERB" "the/DET"
## [5] "sanctions/NOUN" "against/ADP" "Iran/PROPN" "./PUNCT"
## similarity example
# three short statements; the first two contain the same words in a
# different order
txt <- c(
  "Party X prioritizes economic growth, even at the cost of environmental protection.",
  "Party X prioritizes environmental protection, even at the cost of economic growth.",
  "Party Y embraces protection of citizens through universal health care."
)
# cosine similarity between the documents of the resulting dfm
textstat_simil(dfm(txt), method = "cosine")
## text1 text2
## text2 1.0000000
## text3 0.3223292 0.3223292
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment