# kbenoit/compare_kind.R

## compare_kind.R
library("quanteda")
## Package version: 1.4.2
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
##     View
library("spacyr")

# Lines starting with "##" are console output recorded when the script ran.

# Load data_corpus_sotu from the quanteda.corpora package
# (https://github.com/quanteda/quanteda.corpora).
data(data_corpus_sotu, package = "quanteda.corpora")

# Reshape the corpus to one document per sentence, then build a new corpus
# from the keyword-in-context matches for "kind". The context is left joined
# to the keyword (split_context = FALSE, extract_keyword = FALSE);
# window = 200 is presumably wide enough to keep each sentence whole.
# NOTE(review): the recorded output below shows frequency 302 but docfreq 290
# for kind/noun, so some documents hold several matches. A sentence with
# several matches presumably enters corp_kind once per match -- confirm that
# the counts are not inflated by this.
corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences")
corp_kind <- corpus(
  kwic(corp_sents, "kind", window = 200),
  split_context = FALSE,
  extract_keyword = FALSE
)

# Run the spaCy parser over the raw texts of the "kind" corpus.
sp <- spacyr::spacy_parse(texts(corp_kind))
## Found 'spacy_condaenv'. spacyr will use this environment
## successfully initialized (spaCy Version: 2.0.18, language model: en)
## (python options: type = "condaenv", value = "spacy_condaenv")

# Turn the parse back into quanteda tokens that carry their POS tag
# (features then look like "kind/noun" in the output below).
toks <- as.tokens(sp, include_pos = "pos")

# Keep only the "kind/<tag>" features and tabulate their frequencies.
tstat <- textstat_frequency(dfm(toks, select = "kind/*"))
tstat
##     feature frequency rank docfreq group
## 1 kind/noun       302    1     290   all
## 2  kind/adj        13    2      13   all
## 3  kind/adv         3    3       3   all

# Total number of tagged occurrences of "kind".
sum(tstat$frequency)
## [1] 318

# Share of each POS variant among all occurrences.
prop.table(tstat$frequency)
## [1] 0.949685535 0.040880503 0.009433962
	library("quanteda")
	## Package version: 1.4.2
	## Parallel computing: 2 of 12 threads used.
	## See https://quanteda.io for tutorials and examples.
	##
	## Attaching package: 'quanteda'
	## The following object is masked from 'package:utils':
	##
	##     View
	library("spacyr")

	# NOTE(review): this tab-indented section repeats the analysis that
	# precedes it in the file; it looks like a paste artifact -- confirm
	# whether it is needed. Lines starting with "##" are recorded output.

	# Load data_corpus_sotu from the quanteda.corpora package
	# (https://github.com/quanteda/quanteda.corpora).
	data(data_corpus_sotu, package = "quanteda.corpora")

	# Reshape to one document per sentence, then build a corpus from the
	# keyword-in-context matches for "kind", leaving the context joined to
	# the keyword. window = 200 is presumably wide enough to keep each
	# sentence whole.
	# NOTE(review): frequency 302 vs docfreq 290 for kind/noun below means
	# some documents hold several matches; a sentence with several matches
	# presumably enters corp_kind once per match -- confirm the counts.
	corp_sents <- corpus_reshape(data_corpus_sotu, to = "sentences")
	corp_kind <- corpus(
		kwic(corp_sents, "kind", window = 200),
		split_context = FALSE,
		extract_keyword = FALSE
	)

	# Run the spaCy parser over the raw texts of the "kind" corpus.
	sp <- spacyr::spacy_parse(texts(corp_kind))
	## Found 'spacy_condaenv'. spacyr will use this environment
	## successfully initialized (spaCy Version: 2.0.18, language model: en)
	## (python options: type = "condaenv", value = "spacy_condaenv")

	# Turn the parse back into quanteda tokens that carry their POS tag.
	toks <- as.tokens(sp, include_pos = "pos")

	# Keep only the "kind/<tag>" features and tabulate their frequencies.
	tstat <- textstat_frequency(dfm(toks, select = "kind/*"))
	tstat
	##     feature frequency rank docfreq group
	## 1 kind/noun       302    1     290   all
	## 2  kind/adj        13    2      13   all
	## 3  kind/adv         3    3       3   all

	# Total number of tagged occurrences of "kind".
	sum(tstat$frequency)
	## [1] 318

	# Share of each POS variant among all occurrences.
	prop.table(tstat$frequency)
	## [1] 0.949685535 0.040880503 0.009433962