Created
April 10, 2019 10:35
-
-
Save vanatteveldt/4eeb80b59e2aeb084a91acbaad75dc00 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# warning : messy :)
# Scratch session exploring quanteda document-feature matrices (DFMs).
# NOTE(review): the pipe (%>%) and the dplyr/tidyr verbs used later in this
# file are never attached here -- presumably the tidyverse was loaded in the
# interactive session; confirm before running this as a script.
library(quanteda)

# Two toy documents turned into a DFM.
text <- c("I like cats", "I'm not sure my cats like me")
dfm(text)

# NOTE(review): `t` is never assigned in this file, so it resolves to
# base::t() and `t$name` fails; presumably a data frame from the original
# session -- confirm or drop these two lines.
dfm(t$name)
t

# NOTE(review): `d` is not created anywhere in this file before this point;
# presumably a data frame of speeches with a `text` column that was loaded
# interactively -- confirm.
dfm(d$text)
sotu_corpus <- corpus(d)

# From here on `d` holds the DFM of the corpus, no longer the input data.
d <- dfm(sotu_corpus)
d
head(docvars(d))
?dfm
d[1:10, 1:10]
# Non-English sample sentences; each assignment overwrites the previous
# value of `text`, so only the last one below is used by the calls that follow.
text <- "彭丽媛会见古巴国务委员会主席夫人"
text <- "麻生太郎財務相は10日の衆院財務金融委員会で、議員らから財務相への予算関連"
text <- c("I like cats", "I'm not sure, people like my cat :-(")

# Stemmed DFM of the toy documents; `cd` is reused later in the file.
cd <- dfm(text, stem = TRUE)
dfm(text) %>% textplot_wordcloud(min_count = 1)

# Tokenise explicitly first, then build the DFM from the tokens object.
tokenized <- tokens(text)
class(tokenized)
dfm(tokenized)

# Tweet-like text: drop punctuation and URLs but keep the Twitter symbols.
tweet <- "Hey I like #hashtags, @you not? http://twtiter.com/bla"
dfm(tweet, remove_twitter = FALSE, remove_punct = TRUE, remove_url = TRUE)

# NOTE(review): tokenize() does not look like a current quanteda export;
# tokens() (used above) is presumably what was meant -- confirm or drop.
tokenize(text)

# Dutch example. `text_nl` must be assigned before it is used (the original
# order called dfm(text_nl) one line before creating it).
text_nl <- c("Ik hou van katten", "Mijn kat houdt niet van mij")
quanteda::dfm_wordstem(dfm(text_nl), "dutch")

# NOTE(review): this call is given the English `text`, not `text_nl`;
# possibly `text_nl` was intended -- confirm.
dfm(text, stem = TRUE, language = "nl")
?dfm
# Stop word lists: the built-in English list extended with one custom word.
# NOTE(review): `mystop` is not referenced again in this file; the dfm() call
# below uses stopwords("english") directly -- confirm which was intended.
mystop <- c(stopwords("english"), "citizen")
stopwords("german")
stopwords::stopwords(source = "stopwords-iso")
stopwords::stopwords(language = "de", source = "stopwords-iso")
?stopwords::stopwords

# Rebuild `d` from the corpus with punctuation removed, English stop words
# dropped and stemming switched on.
d <- dfm(
  sotu_corpus,
  remove_punct = TRUE,
  stem = TRUE,
  remove = stopwords("english")
)

# Drop features by pattern from the toy DFM.
dfm_remove(cd, c("cat*", "i*"))

# Trim by absolute term frequency, by absolute document frequency, and by
# document frequency expressed as a proportion of documents.
dfm_trim(d, min_termfreq = 5)
dfm_trim(d, min_docfreq = 5)
dfm_trim(d, min_docfreq = .01, max_docfreq = .5, docfreq_type = "prop")
stopwords::stopwords_getsources()
# spacyr
# Word clouds and frequency table for the whole DFM.
textplot_wordcloud(d, max_words = 50) ## top 50 (most frequent) words
textplot_wordcloud(d, max_words = 50, color = c("blue", "red")) ## change colors
textstat_frequency(d, n = 10) ## view the frequencies

# Select the documents whose `President` docvar is Barack Obama, first by
# row-indexing the DFM and then, equivalently, with dfm_subset().
is_obama <- docvars(d)$President == "Barack Obama"
obama_dtm <- d[is_obama, ]
textplot_wordcloud(obama_dtm, max_words = 25)
obama_dtm <- dfm_subset(d, is_obama)

# Compare Obama against George W. Bush: keep only their documents and compute
# keyness with the Obama documents as the target group.
table(docvars(d)$President)
is_bush <- docvars(d)$President == "George W. Bush"
dtm_subset <- dfm_subset(d, is_obama | is_bush)
ts <- textstat_keyness(
  dtm_subset,
  docvars(dtm_subset)$President == "Barack Obama"
)
head(ts, 20) ## view first 20 results

# Sort ascending by chi2 to see the other end of the keyness ranking.
ts %>%
  arrange(chi2) %>%
  head()
# Topic dictionary: each key maps to one or more glob patterns / literal terms.
dict <- dictionary(list(
  terrorism = "terror*",
  religion = c("relig*", "christ*", "musli*", "isla*"),
  economy = c("econom*", "tax*", "job*"),
  military = c("army", "navy", "military", "airforce", "soldier"),
  freedom = c("freedom", "liberty")
))
?dictionary

# Collapse the DFM onto the dictionary keys (the original called the
# non-existent `dfm_lookupi`; `dfm_lookup` is the quanteda function).
# NOTE(review): `d` was last built with stem = TRUE earlier in this file, so
# literal entries such as "military" or "liberty" may not match the stemmed
# features -- confirm, or look up against an unstemmed DFM.
dict_dtm <- dfm_lookup(d, dict, exclusive = TRUE)
dict_dtm[1:10, ]

# Dictionary counts and document metadata as tibbles.
df <- convert(dict_dtm, to = "data.frame") %>% as_tibble()
df
metadata <- docvars(d) %>% as_tibble(rownames = "document")
metadata

# Join counts to metadata once (on whatever columns the two tables share) and
# reuse the result for the three summaries below.
dict_meta <- inner_join(df, metadata)

# Total terrorism hits per president.
dict_meta %>%
  group_by(President) %>%
  summarize(terrorism = sum(terrorism))

# Hits per president for every dictionary key, one column per key.
dict_meta %>%
  gather(terrorism:freedom, key = "query", value = "n") %>%
  group_by(President, query) %>%
  summarize(n = sum(n)) %>%
  spread(query, n)

# Per-president correlation between terrorism and religion counts, highest
# correlation first.
dict_meta %>%
  group_by(President) %>%
  summarize(cor = cor(terrorism, religion)) %>%
  arrange(desc(cor))

# Keyword-in-context view for one of the religion patterns.
kwic(sotu_corpus, "musli*") %>% head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment