Skip to content

Instantly share code, notes, and snippets.

Created November 17, 2016 12:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/8dbfd29ea865a283a695fb551d565359 to your computer and use it in GitHub Desktop.
Save anonymous/8dbfd29ea865a283a695fb551d565359 to your computer and use it in GitHub Desktop.
library(quanteda)
# Three toy documents used by the basic quanteda examples below.
texts <- c("This is a test", "They tested a test", "I found a test!")

# Document-feature matrix with default settings, then a stemmed variant
# that ignores English stopwords.
dfm(texts)
dfm(texts, stem = TRUE, ignoredFeatures = stopwords("english"))

# Detect collocations in the texts and print them.
coll <- collocations(texts)
coll

# Turn the collocations with G2 > 10 into single tokens, build a stemmed
# dfm from the result, and print it trimmed with minDoc = 2.
# Note: the object `dfm` has the same name as the function dfm(); calls to
# dfm(...) keep working because R skips non-function objects when it looks
# up a function name.
texts2 <- phrasetotoken(texts, subset(coll, G2 > 10))
dfm <- dfm(texts2, stem = TRUE)
trim(dfm, minDoc = 2)
# Scratch value: a list holding a number and a string. It is not referenced
# by any of the statements that follow in this script.
x <- list(1, "word")

# Manual preprocessing pipeline on the phrase-joined texts:
# tokenize without punctuation, lowercase, then stem with the English stemmer.
tokens <- tokenize(texts2, removePunct = TRUE)
tokens <- toLower(tokens)
tokens <- wordstem(tokens, "english")

# Build the dfm from the prepared tokens, remove English stopwords, and
# trim with minCount = 2.
dfm <- dfm(tokens)
# The original call ended in a stray comma, which passed an empty extra
# positional argument; it is dropped here.
dfm <- selectFeatures(dfm, stopwords("english"), "remove")
dfm <- trim(dfm, minCount = 2)
dfm
# Print two quanteda objects accessed through `:::`, which also reaches
# objects that the package does not export.
quanteda:::as.DocumentTermMatrix.dfm
quanteda:::dfm.character

# Dutch example: stemmed dfm that ignores Dutch stopwords.
# Note: this assignment replaces the English `texts` vector.
texts <- c("De kippen eten", "De kip heeft gegeten")
dfm(
  texts,
  language = "dutch",
  stem = TRUE,
  ignoredFeatures = stopwords("dutch")
)
###### AmCAT #####
# NOTE(review): the amcat.* functions are neither defined nor loaded in this
# script; presumably they come from the amcatr package -- confirm that it is
# attached before running this section.
conn <- amcat.connect("https://amcat.nl")
meta <- amcat.articles(conn, project = 1235, articleset = 32114,
                       dateparts = TRUE)

# Named search queries; the names are passed on as the result labels.
dict <- list(
  trump = '"donald trump" OR "donald j trump" OR "the donald"',
  clinton = '"hillary clinton" OR "hillary rodham" OR hrc'
)
a <- amcat.aggregate(conn, sets = 32139, queries = dict,
                     labels = names(dict), axis1 = "week")
h <- amcat.hits(conn, sets = 32139, queries = dict, labels = names(dict))

# Install reshape2 only when it is missing instead of reinstalling it on
# every run of the script.
if (!requireNamespace("reshape2", quietly = TRUE)) {
  install.packages("reshape2")
}

# Wide table: one row per id, one column per query, summed counts,
# 0 where a combination has no rows.
h3 <- reshape2::dcast(h, id ~ query, value.var = "count",
                      fun.aggregate = sum, fill = 0)

# NOTE(review): `meta` was fetched from articleset 32114 while `h` comes from
# set 32139; merge() keeps only rows matching on the shared columns -- verify
# that these sets are meant to be combined.
h2 <- merge(h, meta)

articles <- amcat.articles(
  conn, project = 1235, articleset = 32142, dateparts = TRUE,
  columns = c("date", "medium", "headline", "text")
)
# Build a corpus from the article set and a stemmed dfm that ignores
# "mr", "said" and the default stopword list.
# Note: the variable `c` shares its name with base c(); calls to c(...)
# still resolve to the function because R skips non-function objects when
# looking up a function name.
c <- quanteda.corpus(conn, project = 1235, articleset = 32142,
                     dateparts = TRUE)
dfm <- dfm(c, stem = TRUE, ignoredFeatures = c("mr", "said", stopwords()))
head(docvars(c))

# Define the issue dictionary and apply it BEFORE the statements that use
# `issues` and `dfm.issues`; the original order used both before they
# were created.
issues <- list(
  economy = c("econ*", "inflation"),
  immigration = c("immigr*", "mexican*")
)
dfm.issues <- applyDictionary(dfm, issues)
head(dfm.issues)

# Combine the document variables with the per-issue counts.
d2 <- cbind(docvars(c), as.matrix(dfm.issues))
View(d2)

# For every week, count the rows (documents) with at least one hit per issue.
# Presumably the `week` column is produced by dateparts = TRUE -- confirm.
a <- aggregate(d2[names(issues)], d2["week"], function(x) sum(x > 0))

# Map each line to its series name and set the colours explicitly.
# Writing color = "green" inside aes() only creates a legend label called
# "green"; it does not make the line green.
# NOTE(review): ggplot2 is not loaded in this script; it must already be
# attached in the session.
ggplot(a, aes(x = week)) +
  geom_line(aes(y = economy, color = "economy")) +
  geom_line(aes(y = immigration, color = "immigration")) +
  scale_color_manual(values = c(economy = "green", immigration = "red"))
# Download the tokens file and load it.
# The original script downloaded both the full and the sample file to the
# same destination, so the full file was always overwritten by the sample.
# Only the sample download is kept active; to work with the full data,
# swap which of the two lines is commented out.
# mode = "wb" forces a binary transfer, which .rds files need on Windows.
# download.file("http://i.amcat.nl/tokens_full.rds", "tokens.rds", mode = "wb")
download.file("http://i.amcat.nl/tokens_sample.rds", "tokens.rds", mode = "wb")
tokens <- readRDS("tokens.rds")
# R is case-sensitive: the function is head(), not Head().
head(tokens)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment