# Warning: messy scratch script :)
# Interactive exploration of quanteda: document-feature matrices (DFMs),
# corpora and document variables.
library(quanteda)

# Two toy documents turned into a DFM.
text <- c("I like cats", "I'm not sure my cats like me")
dfm(text)

# NOTE(review): `t` is never assigned in this script, so it resolves to
# base::t() and `t$name` fails unless the session defines `t` beforehand.
# Confirm which object was meant.
dfm(t$name)
t

# NOTE(review): `d` is not created in this script; presumably a data frame
# with a `text` column that was loaded earlier in the session. Confirm.
dfm(d$text)
sotu_corpus <- corpus(d)

# From here on `d` holds the DFM built from the corpus (it replaces the
# input object of the same name).
d <- dfm(sotu_corpus)
d
head(docvars(d))
?dfm
d[1:10, 1:10]
# Tokenising and stemming experiments.

# Non-English sample sentences (Chinese, then Japanese). Each assignment
# overwrites the previous one, so only the English pair below is seen by
# the calls that follow.
text <- "彭丽媛会见古巴国务委员会主席夫人"
text <- "麻生太郎財務相は10日の衆院財務金融委員会で、議員らから財務相への予算関連"
text <- c("I like cats", "I'm not sure, people like my cat :-(")

# Stemmed DFM of the toy documents; `cd` is reused further down the script.
cd <- dfm(text, stem = TRUE)
textplot_wordcloud(dfm(text), min_count = 1)

# Explicit two-step route: tokenise first, then build the DFM.
tokenized <- tokens(text)
class(tokenized)
dfm(tokenized)

# Tweet example: drop punctuation and URLs, keep the Twitter characters.
tweet <- "Hey I like #hashtags, @you not? http://twtiter.com/bla"
dfm(tweet, remove_twitter = FALSE, remove_punct = TRUE, remove_url = TRUE)

# tokens() is the tokeniser already used above; the original call here was
# tokenize().
# NOTE(review): confirm tokenize() is unavailable in the installed quanteda.
tokens(text)

# Dutch example. The texts must exist before they are stemmed.
text_nl <- c("Ik hou van katten", "Mijn kat houdt niet van mij")
quanteda::dfm_wordstem(dfm(text_nl), "dutch")

# NOTE(review): this call stems the English `text`, not `text_nl`, and it is
# unclear whether dfm() honours `language` here. Compare with the
# dfm_wordstem() call above and confirm the intent.
dfm(text, stem = TRUE, language = "nl")
?dfm
# Stop-word lists and feature trimming.

# Sources offered by the `stopwords` package.
stopwords::stopwords_getsources()

# English stop words extended with one custom term.
# NOTE(review): `mystop` is not used by the dfm() call below, which passes
# stopwords("english") directly. Use `remove = mystop` if "citizen" should
# be dropped as well.
mystop <- c(stopwords("english"), "citizen")
stopwords("german")
stopwords::stopwords(source = "stopwords-iso")
stopwords::stopwords(language = "de", source = "stopwords-iso")
?stopwords::stopwords

# Rebuild the corpus DFM without punctuation and English stop words, with
# stemming switched on.
d <- dfm(
  sotu_corpus,
  remove_punct = TRUE,
  stem = TRUE,
  remove = stopwords("english")
)

# Remove features matching the patterns from the toy DFM `cd`.
dfm_remove(cd, c("cat*", "i*"))

# Trimming variants: by total term frequency, by document count, and by
# proportion of documents.
dfm_trim(d, min_termfreq = 5)
dfm_trim(d, min_docfreq = 5)
dfm_trim(d, min_docfreq = .01, max_docfreq = .5, docfreq_type = "prop")
# spacyr (note to self; nothing in this script calls spacyr)

# Word clouds and frequency table for the whole corpus DFM.
textplot_wordcloud(d, max_words = 50) ## top 50 (most frequent) words
textplot_wordcloud(d, max_words = 50, color = c("blue", "red")) ## colors
textstat_frequency(d, n = 10) ## view the frequencies

# Logical masks over the documents, based on the President docvar.
is_obama <- docvars(d)$President == "Barack Obama"
is_bush <- docvars(d)$President == "George W. Bush"
table(docvars(d)$President)

# Obama-only DFM, first by row indexing and then again with dfm_subset();
# both are meant to select the same documents.
obama_dtm <- d[is_obama, ]
textplot_wordcloud(obama_dtm, max_words = 25)
obama_dtm <- dfm_subset(d, is_obama)

# Keyness within the Obama + Bush documents. The logical vector marks the
# Obama documents as the target group.
dtm_subset <- dfm_subset(d, is_obama | is_bush)
ts <- textstat_keyness(
  dtm_subset,
  docvars(dtm_subset)$President == "Barack Obama"
)
head(ts, 20) ## view first 20 results

# Ascending sort: lowest chi2 scores first.
# NOTE(review): arrange() comes from dplyr, which this script never
# attaches. Make sure it is loaded.
head(arrange(ts, chi2))
# Dictionary analysis: count dictionary hits per document and summarise
# them per president.
# NOTE(review): `%>%`, as_tibble(), inner_join(), group_by(), summarize(),
# gather(), spread() and arrange() are not provided by anything this script
# attaches. Presumably tidyverse is loaded in the session; confirm.

# Glob-style patterns grouped under five dictionary keys.
# NOTE(review): if `d` is the stemmed DFM built earlier in the script, exact
# entries such as "military" or "liberty" may not match stemmed features.
# Verify the hit counts.
dict <- dictionary(list(
  terrorism = "terror*",
  religion = c("relig*", "christ*", "musli*", "isla*"),
  economy = c("econom*", "tax*", "job*"),
  military = c("army", "navy", "military", "airforce", "soldier"),
  freedom = c("freedom", "liberty")
))
?dictionary

# Apply the dictionary. exclusive = TRUE keeps only the dictionary keys as
# features. (The original called `dfm_lookupi`, which is a typo.)
dict_dtm <- dfm_lookup(d, dict, exclusive = TRUE)
dict_dtm[1:10, ]

# Dictionary counts and document variables as tibbles.
df <- convert(dict_dtm, to = "data.frame") %>% as_tibble()
df
metadata <- docvars(d) %>% as_tibble(rownames = "document")
metadata

# Join once and reuse; the join relies on the columns the two tables share.
dict_hits <- inner_join(df, metadata)

# Total terrorism hits per president.
dict_hits %>%
  group_by(President) %>%
  summarize(terrorism = sum(terrorism))

# All dictionary keys per president, one column per key.
dict_hits %>%
  gather(terrorism:freedom, key = "query", value = "n") %>%
  group_by(President, query) %>%
  summarize(n = sum(n)) %>%
  spread(query, n)

# Per-president correlation between terrorism and religion hits, highest
# first.
dict_hits %>%
  group_by(President) %>%
  summarize(cor = cor(terrorism, religion)) %>%
  arrange(-cor)

# Keyword-in-context view for the "musli*" pattern (first rows only).
head(kwic(sotu_corpus, "musli*"))
# End of script. (The scraped GitHub page footer that followed is not R code.)