# /session1.log

## session1.log
library(quanteda)
# Three toy documents used for the quanteda walkthrough below.
texts <- c("This is a test", "They tested a test", "I found a test!")

# Document-feature matrix with default settings (printed, not stored).
dfm(texts)

# Same call with stemming switched on and English stopwords passed as
# features to ignore.
dfm(texts, stem = TRUE, ignoredFeatures = stopwords("english"))

# Compute collocations for the toy documents and print them.
coll <- collocations(texts)
coll

# Feed only the collocations with G2 > 10 to phrasetotoken(), then build a
# stemmed dfm from the rewritten texts.
texts2 <- phrasetotoken(texts, subset(coll, G2 > 10))
# NOTE: the variable `dfm` shadows the function name; calls such as dfm(...)
# still work because R skips non-function bindings when resolving a call.
dfm <- dfm(texts2, stem = TRUE)
# Result is only printed; `dfm` itself is left untrimmed here.
trim(dfm, minDoc = 2)
x <- list(1, "word")

# The same kind of preprocessing, written out as explicit steps.
tokens <- tokenize(texts2, removePunct = TRUE)
tokens <- toLower(tokens)
tokens <- wordstem(tokens, "english")
dfm <- dfm(tokens)
# The original call ended in a stray empty argument ("remove", ); dropped.
dfm <- selectFeatures(dfm, stopwords("english"), "remove")
dfm <- trim(dfm, minCount = 2)
dfm

# Evaluating a function object at top level prints its source; `:::` reaches
# into the package namespace for objects that are not exported.
quanteda:::as.DocumentTermMatrix.dfm

quanteda:::dfm.character

# Dutch example: same dfm() call as for English, but with the Dutch language
# setting and the Dutch stopword list.
texts <- c("De kippen eten", "De kip heeft gegeten")
dfm(
  texts,
  language = "dutch",
  stem = TRUE,
  ignoredFeatures = stopwords("dutch")
)


###### AmCAT #####

# NOTE(review): the amcat.* functions are never attached in this log;
# presumably the AmCAT client package was loaded earlier in the session.
conn <- amcat.connect("https://amcat.nl")

# Article metadata for one article set (dateparts requested).
meta <- amcat.articles(
  conn,
  project = 1235,
  articleset = 32114,
  dateparts = TRUE
)

# One search query per candidate; the list names double as query labels.
dict <- list(
  trump = '"donald trump" OR "donald j trump" OR "the donald"',
  clinton = '"hillary clinton" OR "hillary rodham" OR hrc'
)

# Aggregated results per query with "week" as the first axis.
a <- amcat.aggregate(
  conn,
  sets = 32139,
  queries = dict,
  labels = names(dict),
  axis1 = "week"
)

# Hits per query for the same set.
h <- amcat.hits(conn, sets = 32139, queries = dict, labels = names(dict))

# Install reshape2 only when it is missing, instead of reinstalling on every
# run of the script.
if (!requireNamespace("reshape2", quietly = TRUE)) {
  install.packages("reshape2")
}

# Wide table: one row per id, one column per query, summed counts, 0 where a
# query has no row for that id.
h3 <- reshape2::dcast(
  h,
  id ~ query,
  value.var = "count",
  fun.aggregate = sum,
  fill = 0
)
# merge() joins on the columns the two data frames have in common.
h2 <- merge(h, meta)


# Articles of set 32142 with the listed columns.
articles <- amcat.articles(
  conn,
  project = 1235,
  articleset = 32142,
  dateparts = TRUE,
  columns = c("date", "medium", "headline", "text")
)

# NOTE: `c` and `dfm` shadow the functions of the same name. The later calls
# c(...) and dfm(...) still work because R skips non-function bindings when
# resolving a call.
c <- quanteda.corpus(conn, project = 1235, articleset = 32142, dateparts = TRUE)
dfm <- dfm(c, stem = TRUE, ignoredFeatures = c("mr", "said", stopwords()))

head(docvars(c))

# The issue dictionary and its dfm must exist before they are used below; in
# the original log these three lines came after the plot, so running the
# script top to bottom failed on the undefined `dfm.issues` / `issues`.
issues <- list(
  economy = c("econ*", "inflation"),
  immigration = c("immigr*", "mexican*")
)
dfm.issues <- applyDictionary(dfm, issues)
head(dfm.issues)

# Document variables side by side with the per-issue counts.
d2 <- cbind(docvars(c), as.matrix(dfm.issues))
View(d2)

# Per week: number of documents with at least one match for each issue.
a <- aggregate(d2[names(issues)], d2["week"], function(x) sum(x > 0))

# A constant inside aes() is treated as data, so color = "green" only created
# a legend entry called "green" drawn in a default palette colour. Map each
# line to its issue name and assign the intended colours explicitly.
# NOTE(review): ggplot2 is never attached in this log; presumably it was
# loaded earlier in the session.
ggplot(a, aes(x = week)) +
  geom_line(aes(y = economy, color = "economy")) +
  geom_line(aes(y = immigration, color = "immigration")) +
  scale_color_manual(values = c(economy = "green", immigration = "red"))


# Both files used to be written to "tokens.rds", so the full set was
# overwritten by the sample straight after downloading. The full set now gets
# its own file; "tokens.rds" still ends up holding the sample, as before.
# mode = "wb" keeps the binary .rds payload intact on every platform.
download.file(
  "http://i.amcat.nl/tokens_full.rds",
  "tokens_full.rds",
  mode = "wb"
)
download.file(
  "http://i.amcat.nl/tokens_sample.rds",
  "tokens.rds",
  mode = "wb"
)
tokens <- readRDS("tokens.rds")
# R is case-sensitive: Head() does not exist, head() does.
head(tokens)
	library(quanteda)
	# Three toy documents used for the quanteda walkthrough below.
	texts <- c("This is a test", "They tested a test", "I found a test!")

	# Document-feature matrix with default settings (printed, not stored).
	dfm(texts)

	# Same call with stemming switched on and English stopwords passed as
	# features to ignore.
	dfm(texts, stem = TRUE, ignoredFeatures = stopwords("english"))

	# Compute collocations for the toy documents and print them.
	coll <- collocations(texts)
	coll

	# Feed only the collocations with G2 > 10 to phrasetotoken(), then build a
	# stemmed dfm from the rewritten texts.
	texts2 <- phrasetotoken(texts, subset(coll, G2 > 10))
	# NOTE: the variable `dfm` shadows the function name; calls such as dfm(...)
	# still work because R skips non-function bindings when resolving a call.
	dfm <- dfm(texts2, stem = TRUE)
	# Result is only printed; `dfm` itself is left untrimmed here.
	trim(dfm, minDoc = 2)
	x <- list(1, "word")

	# The same kind of preprocessing, written out as explicit steps.
	tokens <- tokenize(texts2, removePunct = TRUE)
	tokens <- toLower(tokens)
	tokens <- wordstem(tokens, "english")
	dfm <- dfm(tokens)
	# The original call ended in a stray empty argument ("remove", ); dropped.
	dfm <- selectFeatures(dfm, stopwords("english"), "remove")
	dfm <- trim(dfm, minCount = 2)
	dfm

	# Evaluating a function object at top level prints its source; `:::` reaches
	# into the package namespace for objects that are not exported.
	quanteda:::as.DocumentTermMatrix.dfm

	quanteda:::dfm.character

	# Dutch example: same dfm() call as for English, but with the Dutch language
	# setting and the Dutch stopword list.
	texts <- c("De kippen eten", "De kip heeft gegeten")
	dfm(
	  texts,
	  language = "dutch",
	  stem = TRUE,
	  ignoredFeatures = stopwords("dutch")
	)



	###### AmCAT #####

	# NOTE(review): the amcat.* functions are never attached in this log;
	# presumably the AmCAT client package was loaded earlier in the session.
	conn <- amcat.connect("https://amcat.nl")

	# Article metadata for one article set (dateparts requested).
	meta <- amcat.articles(
	  conn,
	  project = 1235,
	  articleset = 32114,
	  dateparts = TRUE
	)

	# One search query per candidate; the list names double as query labels.
	dict <- list(
	  trump = '"donald trump" OR "donald j trump" OR "the donald"',
	  clinton = '"hillary clinton" OR "hillary rodham" OR hrc'
	)

	# Aggregated results per query with "week" as the first axis.
	a <- amcat.aggregate(
	  conn,
	  sets = 32139,
	  queries = dict,
	  labels = names(dict),
	  axis1 = "week"
	)

	# Hits per query for the same set.
	h <- amcat.hits(conn, sets = 32139, queries = dict, labels = names(dict))

	# Install reshape2 only when it is missing, instead of reinstalling on every
	# run of the script.
	if (!requireNamespace("reshape2", quietly = TRUE)) {
	  install.packages("reshape2")
	}

	# Wide table: one row per id, one column per query, summed counts, 0 where a
	# query has no row for that id.
	h3 <- reshape2::dcast(
	  h,
	  id ~ query,
	  value.var = "count",
	  fun.aggregate = sum,
	  fill = 0
	)
	# merge() joins on the columns the two data frames have in common.
	h2 <- merge(h, meta)



	# Articles of set 32142 with the listed columns.
	articles <- amcat.articles(
	  conn,
	  project = 1235,
	  articleset = 32142,
	  dateparts = TRUE,
	  columns = c("date", "medium", "headline", "text")
	)

	# NOTE: `c` and `dfm` shadow the functions of the same name. The later calls
	# c(...) and dfm(...) still work because R skips non-function bindings when
	# resolving a call.
	c <- quanteda.corpus(conn, project = 1235, articleset = 32142, dateparts = TRUE)
	dfm <- dfm(c, stem = TRUE, ignoredFeatures = c("mr", "said", stopwords()))

	head(docvars(c))

	# The issue dictionary and its dfm must exist before they are used below; in
	# the original log these three lines came after the plot, so running the
	# script top to bottom failed on the undefined `dfm.issues` / `issues`.
	# The "econ*" and "immigr*" patterns had lost their trailing asterisks in
	# this tab-indented copy (the untabbed copy earlier in the file has them);
	# restored here.
	issues <- list(
	  economy = c("econ*", "inflation"),
	  immigration = c("immigr*", "mexican*")
	)
	dfm.issues <- applyDictionary(dfm, issues)
	head(dfm.issues)

	# Document variables side by side with the per-issue counts.
	d2 <- cbind(docvars(c), as.matrix(dfm.issues))
	View(d2)

	# Per week: number of documents with at least one match for each issue.
	a <- aggregate(d2[names(issues)], d2["week"], function(x) sum(x > 0))

	# A constant inside aes() is treated as data, so color = "green" only created
	# a legend entry called "green" drawn in a default palette colour. Map each
	# line to its issue name and assign the intended colours explicitly.
	# NOTE(review): ggplot2 is never attached in this log; presumably it was
	# loaded earlier in the session.
	ggplot(a, aes(x = week)) +
	  geom_line(aes(y = economy, color = "economy")) +
	  geom_line(aes(y = immigration, color = "immigration")) +
	  scale_color_manual(values = c(economy = "green", immigration = "red"))





	# Both files used to be written to "tokens.rds", so the full set was
	# overwritten by the sample straight after downloading. The full set now gets
	# its own file; "tokens.rds" still ends up holding the sample, as before.
	# mode = "wb" keeps the binary .rds payload intact on every platform.
	download.file(
	  "http://i.amcat.nl/tokens_full.rds",
	  "tokens_full.rds",
	  mode = "wb"
	)
	download.file(
	  "http://i.amcat.nl/tokens_sample.rds",
	  "tokens.rds",
	  mode = "wb"
	)
	tokens <- readRDS("tokens.rds")
	# R is case-sensitive: Head() does not exist, head() does.
	head(tokens)