# vanatteveldt/quanteda_session.r

## quanteda_session.r
# warning : messy :)

library(quanteda)
# Toy example: build a document-feature matrix from two short sentences.
text <- c("I like cats", "I'm not sure my cats like me")
dfm(text)

# NOTE(review): `t` is never assigned in this script; presumably it was an
# object in the interactive workspace -- confirm before sourcing.
dfm(t$name)
t

# NOTE(review): `d` is read here before the script assigns it; presumably a
# data frame with a `text` column loaded earlier in the session -- confirm.
dfm(d$text)

# Turn `d` into a corpus, then rebind `d` to the dfm built from that corpus.
sotu_corpus <- corpus(d)
d <- dfm(sotu_corpus)
d

head(docvars(d))
?dfm

# Inspect the first ten rows and the first ten columns of the dfm.
d[seq_len(10), seq_len(10)]

# Alternative single-string inputs; each assignment is overwritten by the
# next one, so only the English vector is used by the calls below.
text <- "彭丽媛会见古巴国务委员会主席夫人"
text <- "麻生太郎財務相は１０日の衆院財務金融委員会で、議員らから財務相への予算関連"
text <- c("I like cats", "I'm not sure, people like my cat  :-(")

# Stemmed dfm of the toy sentences; kept in `cd` for later use.
cd <- dfm(text, stem = TRUE)
# NOTE(review): `%>%` is not base R; assumes an attached package provides it.
dfm(text) %>% textplot_wordcloud(min_count = 1)

# Tokenize explicitly, then build the dfm from the tokens object.
tokenized <- tokens(text)
class(tokenized)
dfm(tokenized)

# dfm of a tweet with the Twitter, punctuation and URL options set explicitly.
tweet <- "Hey I like #hashtags, @you not? http://twtiter.com/bla"
dfm(tweet, remove_twitter = FALSE, remove_punct = TRUE, remove_url = TRUE)

# Was `tokenize(text)`; tokens() is the tokenizer used elsewhere in this
# script. NOTE(review): confirm tokenize() is absent from the installed
# quanteda version.
tokens(text)

# Dutch example. The text is defined first: the original session called
# dfm_wordstem() on `text_nl` before `text_nl` existed.
text_nl <- c("Ik hou van katten", "Mijn kat houdt niet van mij")
quanteda::dfm_wordstem(dfm(text_nl), "dutch")

# Earlier attempt at Dutch stemming directly through dfm(), now run on the
# Dutch text rather than the English `text`.
# NOTE(review): `language` may not be a dfm() argument -- check ?dfm.
dfm(text_nl, stem = TRUE, language = "nl")
?dfm

# English stopword list extended with one custom term.
mystop <- c(stopwords("english"), "citizen")
stopwords("german")

# Same lookups through the stopwords package, using the "stopwords-iso" source.
stopwords::stopwords(source = "stopwords-iso")

stopwords::stopwords(language = "de", source = "stopwords-iso")
?stopwords::stopwords

# Rebuild `d` from the corpus with punctuation removal, stemming and English
# stopword removal switched on.
d <- dfm(
  sotu_corpus,
  remove_punct = TRUE,
  stem = TRUE,
  remove = stopwords("english")
)


# Drop features matching the two wildcard patterns from the toy dfm `cd`.
dfm_remove(cd, c("cat*", "i*"))

# Trim by minimum term frequency, then by minimum document frequency.
dfm_trim(d, min_termfreq = 5)
dfm_trim(d, min_docfreq = 5)

# Document-frequency bounds passed together with docfreq_type = "prop".
dfm_trim(d, min_docfreq = 0.01, max_docfreq = 0.5, docfreq_type = "prop")

stopwords::stopwords_getsources()

# spacyr


# Word clouds and a frequency table for the full dfm `d`.
## top 50 (most frequent) words
textplot_wordcloud(d, max_words = 50)
## change colors
textplot_wordcloud(d, max_words = 50, color = c("blue", "red"))
## view the frequencies
textstat_frequency(d, n = 10)

# Logical mask selecting documents whose President docvar is Barack Obama.
is_obama <- docvars(d)$President == "Barack Obama"
obama_dtm <- d[is_obama, ]
textplot_wordcloud(obama_dtm, max_words = 25)

# Same selection via dfm_subset().
obama_dtm <- dfm_subset(d, is_obama)

# Per-president document counts, then masks for the two presidents compared.
table(docvars(d)$President)
is_bush <- docvars(d)$President == "George W. Bush"
is_obama <- docvars(d)$President == "Barack Obama"

dtm_subset <- dfm_subset(d, is_obama | is_bush)

# Keyness of the Obama documents (second argument) within the two-president
# subset.
ts <- textstat_keyness(
  dtm_subset,
  docvars(dtm_subset)$President == "Barack Obama"
)
## view first 20 results
head(ts, 20)

# First rows after sorting ascending by chi2.
# NOTE(review): arrange() is not defined in this script; assumes dplyr is
# attached in the session.
head(arrange(ts, chi2))

# Dictionary of five topics; entries ending in `*` are wildcard patterns.
dict <- dictionary(list(
  terrorism = "terror*",
  religion = c("relig*", "christ*", "musli*", "isla*"),
  economy = c("econom*", "tax*", "job*"),
  military = c("army", "navy", "military", "airforce", "soldier"),
  freedom = c("freedom", "liberty")
))
?dictionary

# Apply the dictionary to the dfm. The call was misspelled `dfm_lookupi`,
# which is not a quanteda function; the lookup function is dfm_lookup().
dict_dtm <- dfm_lookup(d, dict, exclusive = TRUE)
dict_dtm[1:10, ]

# NOTE(review): `%>%`, as_tibble(), inner_join(), group_by(), summarize(),
# gather(), spread() and arrange() are not loaded by this script; assumes
# dplyr/tidyr (or tidyverse) are attached in the session.
df <- convert(dict_dtm, to = "data.frame") %>% as_tibble()
df

# Document variables as a tibble, with the row names stored in `document`.
metadata <- docvars(d) %>% as_tibble(rownames = "document")
metadata

# Total terrorism count per president.
inner_join(df, metadata) %>%
  group_by(President) %>%
  summarize(terrorism = sum(terrorism))

# All dictionary counts per president: one row per president, one column per
# dictionary key.
inner_join(df, metadata) %>%
  gather(terrorism:freedom, key = "query", value = "n") %>%
  group_by(President, query) %>%
  summarize(n = sum(n)) %>%
  spread(query, n)

# Per-president correlation between terrorism and religion counts, sorted
# from highest to lowest.
inner_join(df, metadata) %>%
  group_by(President) %>%
  summarize(cor = cor(terrorism, religion)) %>%
  arrange(-cor)

# Keyword-in-context view of the "musli*" pattern in the corpus.
kwic(sotu_corpus, "musli*") %>% head()
	# warning : messy :)

	library(quanteda)
	# Toy example: build a document-feature matrix from two short sentences.
	text <- c("I like cats", "I'm not sure my cats like me")
	dfm(text)

	# NOTE(review): `t` is never assigned in this script; presumably it was an
	# object in the interactive workspace -- confirm before sourcing.
	dfm(t$name)
	t

	# NOTE(review): `d` is read here before the script assigns it; presumably a
	# data frame with a `text` column loaded earlier in the session -- confirm.
	dfm(d$text)

	# Turn `d` into a corpus, then rebind `d` to the dfm built from that corpus.
	sotu_corpus <- corpus(d)
	d <- dfm(sotu_corpus)
	d

	head(docvars(d))
	?dfm

	# Inspect the first ten rows and the first ten columns of the dfm.
	d[seq_len(10), seq_len(10)]

	# Alternative single-string inputs; each assignment is overwritten by the
	# next one, so only the English vector is used by the calls below.
	text <- "彭丽媛会见古巴国务委员会主席夫人"
	text <- "麻生太郎財務相は１０日の衆院財務金融委員会で、議員らから財務相への予算関連"
	text <- c("I like cats", "I'm not sure, people like my cat :-(")

	# Stemmed dfm of the toy sentences; kept in `cd` for later use.
	cd <- dfm(text, stem = TRUE)
	# NOTE(review): `%>%` is not base R; assumes an attached package provides it.
	dfm(text) %>% textplot_wordcloud(min_count = 1)

	# Tokenize explicitly, then build the dfm from the tokens object.
	tokenized <- tokens(text)
	class(tokenized)
	dfm(tokenized)

	# dfm of a tweet with the Twitter, punctuation and URL options set explicitly.
	tweet <- "Hey I like #hashtags, @you not? http://twtiter.com/bla"
	dfm(tweet, remove_twitter = FALSE, remove_punct = TRUE, remove_url = TRUE)

	# Was `tokenize(text)`; tokens() is the tokenizer used elsewhere in this
	# script. NOTE(review): confirm tokenize() is absent from the installed
	# quanteda version.
	tokens(text)

	# Dutch example. The text is defined first: the original session called
	# dfm_wordstem() on `text_nl` before `text_nl` existed.
	text_nl <- c("Ik hou van katten", "Mijn kat houdt niet van mij")
	quanteda::dfm_wordstem(dfm(text_nl), "dutch")

	# Earlier attempt at Dutch stemming directly through dfm(), now run on the
	# Dutch text rather than the English `text`.
	# NOTE(review): `language` may not be a dfm() argument -- check ?dfm.
	dfm(text_nl, stem = TRUE, language = "nl")
	?dfm

	# English stopword list extended with one custom term.
	mystop <- c(stopwords("english"), "citizen")
	stopwords("german")

	# Same lookups through the stopwords package, using the "stopwords-iso" source.
	stopwords::stopwords(source = "stopwords-iso")

	stopwords::stopwords(language = "de", source = "stopwords-iso")
	?stopwords::stopwords

	# Rebuild `d` from the corpus with punctuation removal, stemming and English
	# stopword removal switched on.
	d <- dfm(
	  sotu_corpus,
	  remove_punct = TRUE,
	  stem = TRUE,
	  remove = stopwords("english")
	)


	# Drop features matching the two wildcard patterns from the toy dfm `cd`.
	# The trailing `*` wildcards are restored here; they appear to have been
	# stripped (markdown emphasis), which left the literal terms "cat" and "i".
	dfm_remove(cd, c("cat*", "i*"))

	# Trim by minimum term frequency, then by minimum document frequency.
	dfm_trim(d, min_termfreq = 5)
	dfm_trim(d, min_docfreq = 5)

	# Document-frequency bounds passed together with docfreq_type = "prop".
	dfm_trim(d, min_docfreq = 0.01, max_docfreq = 0.5, docfreq_type = "prop")

	stopwords::stopwords_getsources()

	# spacyr


	# Word clouds and a frequency table for the full dfm `d`.
	textplot_wordcloud(d, max_words = 50) ## top 50 (most frequent) words
	textplot_wordcloud(d, max_words = 50, color = c("blue", "red")) ## change colors
	textstat_frequency(d, n = 10) ## view the frequencies

	# Logical mask selecting documents whose President docvar is Barack Obama.
	is_obama <- docvars(d)$President == "Barack Obama"
	obama_dtm <- d[is_obama, ]
	textplot_wordcloud(obama_dtm, max_words = 25)

	# Same selection via dfm_subset().
	obama_dtm <- dfm_subset(d, is_obama)

	# Per-president document counts, then masks for the two presidents compared.
	table(docvars(d)$President)
	is_bush <- docvars(d)$President == "George W. Bush"
	is_obama <- docvars(d)$President == "Barack Obama"

	# Element-wise OR of the two masks. The operator was written as `\|`
	# (an escaped pipe), which is a parse error in R.
	dtm_subset <- dfm_subset(d, is_obama | is_bush)

	# Keyness of the Obama documents (second argument) within the two-president
	# subset.
	ts <- textstat_keyness(dtm_subset, docvars(dtm_subset)$President == "Barack Obama")
	head(ts, 20) ## view first 20 results

	# First rows after sorting ascending by chi2.
	# NOTE(review): `%>%` and arrange() are not loaded by this script; assumes
	# dplyr is attached in the session.
	ts %>% arrange(chi2) %>% head()

	# Dictionary of five topics; entries ending in `*` are wildcard patterns.
	# The `*` suffixes on the religion and economy stems are restored: they
	# appear to have been stripped (markdown emphasis), leaving bare stems such
	# as "relig" that no longer act as wildcards.
	dict <- dictionary(list(
	  terrorism = "terror*",
	  religion = c("relig*", "christ*", "musli*", "isla*"),
	  economy = c("econom*", "tax*", "job*"),
	  military = c("army", "navy", "military", "airforce", "soldier"),
	  freedom = c("freedom", "liberty")
	))
	?dictionary

	# Apply the dictionary to the dfm. The call was misspelled `dfm_lookupi`,
	# which is not a quanteda function; the lookup function is dfm_lookup().
	dict_dtm <- dfm_lookup(d, dict, exclusive = TRUE)
	dict_dtm[1:10, ]

	# NOTE(review): `%>%`, as_tibble(), inner_join(), group_by(), summarize(),
	# gather(), spread() and arrange() are not loaded by this script; assumes
	# dplyr/tidyr (or tidyverse) are attached in the session.
	df <- convert(dict_dtm, to = "data.frame") %>% as_tibble()
	df

	# Document variables as a tibble, with the row names stored in `document`.
	metadata <- docvars(d) %>% as_tibble(rownames = "document")
	metadata

	# Total terrorism count per president.
	inner_join(df, metadata) %>%
	  group_by(President) %>%
	  summarize(terrorism = sum(terrorism))

	# All dictionary counts per president: one row per president, one column
	# per dictionary key.
	inner_join(df, metadata) %>%
	  gather(terrorism:freedom, key = "query", value = "n") %>%
	  group_by(President, query) %>%
	  summarize(n = sum(n)) %>%
	  spread(query, n)

	# Per-president correlation between terrorism and religion counts, sorted
	# from highest to lowest.
	inner_join(df, metadata) %>%
	  group_by(President) %>%
	  summarize(cor = cor(terrorism, religion)) %>%
	  arrange(-cor)

	# Keyword-in-context view of the "musli*" pattern in the corpus.
	kwic(sotu_corpus, "musli*") %>% head()