Skip to content

Instantly share code, notes, and snippets.

@vanatteveldt
Created April 10, 2019 10:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vanatteveldt/4eeb80b59e2aeb084a91acbaad75dc00 to your computer and use it in GitHub Desktop.
Save vanatteveldt/4eeb80b59e2aeb084a91acbaad75dc00 to your computer and use it in GitHub Desktop.
# warning : messy :)
library(quanteda)
# Build a document-feature matrix (dfm) straight from a character vector.
text <- c("I like cats", "I'm not sure my cats like me")
dfm(text)

# NOTE(review): neither `t` nor `d` is created in this file before these
# lines; presumably both are data frames left over from the interactive
# session (with `name` / `text` columns) -- confirm before running.
dfm(t$name)
t
dfm(d$text)

# Wrap `d` in a corpus, then rebind `d` to the dfm built from that corpus.
sotu_corpus <- corpus(d)
d <- dfm(sotu_corpus)
d

# Document-level variables of the dfm, help page, and a 10 x 10 corner.
head(docvars(d))
?dfm
d[seq_len(10), seq_len(10)]
# Example texts; each assignment overwrites the previous one, so only the
# last (English) vector is seen by the calls below.
text <- "彭丽媛会见古巴国务委员会主席夫人"
text <- "麻生太郎財務相は10日の衆院財務金融委員会で、議員らから財務相への予算関連"
text <- c("I like cats", "I'm not sure, people like my cat :-(")

# Stemmed dfm; stored in `cd` because dfm_remove() uses it later on.
cd <- dfm(text, stem = TRUE)
dfm(text) %>% textplot_wordcloud(min_count = 1)

# Two-step route: tokenize explicitly, then build the dfm from the tokens.
tokenized <- tokens(text)
class(tokenized)
dfm(tokenized)

# Tweet example: drop punctuation and URLs, with remove_twitter switched off.
tweet <- "Hey I like #hashtags, @you not? http://twtiter.com/bla"
dfm(tweet, remove_twitter = FALSE, remove_punct = TRUE, remove_url = TRUE)

# NOTE(review): tokens() is used above for the same step; tokenize() may not
# be available in the installed quanteda version -- confirm.
tokenize(text)

# Dutch example. `text_nl` has to exist before it is stemmed, so the
# definition now precedes its first use.
text_nl <- c("Ik hou van katten", "Mijn kat houdt niet van mij")
quanteda::dfm_wordstem(dfm(text_nl), "dutch")

# NOTE(review): this call stems the English `text` while passing
# language = "nl"; possibly `text_nl` was intended -- confirm.
dfm(text, stem = TRUE, language = "nl")
?dfm
# Custom stopword list: the English list plus "citizen".
# NOTE(review): `mystop` is not passed to any call in this file; the dfm()
# call below uses stopwords("english") directly -- confirm which is intended.
mystop <- c(stopwords("english"), "citizen")

# Inspect stopword lists from the stopwords package.
stopwords("german")
stopwords::stopwords(source = "stopwords-iso")
stopwords::stopwords(language = "de", source = "stopwords-iso")
?stopwords::stopwords

# Rebuild `d` from the corpus with punctuation removed, stemming enabled,
# and English stopwords removed.
d <- dfm(
  sotu_corpus,
  remove_punct = TRUE,
  stem = TRUE,
  remove = stopwords("english")
)

# Drop features matching the given patterns from the small example dfm.
dfm_remove(cd, c("cat*", "i*"))

# Trim by term frequency, by document count, and by document proportion.
# Results are only displayed, not assigned, so `d` itself is unchanged.
dfm_trim(d, min_termfreq = 5)
dfm_trim(d, min_docfreq = 5)
dfm_trim(d, min_docfreq = .01, max_docfreq = .5, docfreq_type = "prop")

stopwords::stopwords_getsources()
# spacyr

# Word clouds and a frequency table for the full dfm.
textplot_wordcloud(d, max_words = 50) # top 50 (most frequent) words
textplot_wordcloud(d, max_words = 50, color = c("blue", "red")) # change colors
textstat_frequency(d, n = 10) # view the frequencies

# Subset the dfm with a logical vector over documents: first by row
# indexing, then via dfm_subset(); the second assignment replaces the first.
is_obama <- docvars(d)$President == "Barack Obama"
obama_dtm <- d[is_obama, ]
textplot_wordcloud(obama_dtm, max_words = 25)
obama_dtm <- dfm_subset(d, is_obama)

# Document counts per value of the President docvar.
table(docvars(d)$President)

# Keep only the documents of the two presidents, then compute keyness with
# the Obama documents as the target group.
is_bush <- docvars(d)$President == "George W. Bush"
is_obama <- docvars(d)$President == "Barack Obama"
dtm_subset <- dfm_subset(d, is_obama | is_bush)
ts <- textstat_keyness(
  dtm_subset,
  docvars(dtm_subset)$President == "Barack Obama"
)
head(ts, 20) # view first 20 results

# NOTE(review): arrange() is not provided by any package loaded in this
# file; presumably dplyr is attached in the session -- confirm.
ts %>%
  arrange(chi2) %>%
  head()
# Dictionary mapping five topic labels to glob patterns / literal words.
dict <- dictionary(list(
  terrorism = "terror*",
  religion = c("relig*", "christ*", "musli*", "isla*"),
  economy = c("econom*", "tax*", "job*"),
  military = c("army", "navy", "military", "airforce", "soldier"),
  freedom = c("freedom", "liberty")
))
?dictionary

# Apply the dictionary to the dfm. The original call was spelled
# `dfm_lookupi`, which is not a quanteda function; `dfm_lookup` is the
# lookup function that takes a dfm, a dictionary and `exclusive`.
dict_dtm <- dfm_lookup(d, dict, exclusive = TRUE)
dict_dtm[1:10, ]

# NOTE(review): as_tibble(), inner_join(), group_by(), summarize(),
# arrange(), gather() and spread() are not provided by any package loaded
# in this file; presumably dplyr/tidyr are attached in the session -- confirm.
df <- convert(dict_dtm, to = "data.frame") %>% as_tibble
df

# Document variables as a tibble, with the row names in a `document` column.
metadata <- docvars(d) %>% as_tibble(rownames = "document")
metadata

# Sum of the terrorism column per President. The join has no explicit `by`,
# so it matches on whatever column names `df` and `metadata` share.
inner_join(df, metadata) %>%
  group_by(President) %>%
  summarize(terrorism = sum(terrorism))

# Same aggregation for every dictionary column at once: reshape to long
# form, sum per President and query, then spread back to one column each.
inner_join(df, metadata) %>%
  gather(terrorism:freedom, key = "query", value = "n") %>%
  group_by(President, query) %>%
  summarize(n = sum(n)) %>%
  spread(query, n)

# Correlation between the terrorism and religion columns within each
# President, sorted from highest to lowest.
inner_join(df, metadata) %>%
  group_by(President) %>%
  summarize(cor = cor(terrorism, religion)) %>%
  arrange(-cor)

# Keyword-in-context lines for the "musli*" pattern in the corpus.
kwic(sotu_corpus, "musli*") %>% head
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment