Skip to content

Instantly share code, notes, and snippets.

@vanatteveldt
Created June 3, 2016 03:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vanatteveldt/b3481996bf65008ddacca58542327131 to your computer and use it in GitHub Desktop.
Save vanatteveldt/b3481996bf65008ddacca58542327131 to your computer and use it in GitHub Desktop.
# Run once: install the AmCAT R client and store the API credentials.
# devtools is only installed when it is missing, so re-running this section
# does not trigger a needless reinstall.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
devtools::install_github("amcat/amcat-r")
# amcatr is not attached until the "run every time" section, so the function
# has to be namespace-qualified here; unqualified it fails with
# "could not find function" in a fresh session.
# Replace the ".." placeholders with your own credentials
# (presumably username, then password -- confirm against the amcatr docs).
amcatr::amcat.save.password("https://amcat.nl", "..", "..")
# Run every time: query AmCAT for keyword hits and plot them per year.
library(amcatr)
library(reshape2)
library(ggplot2)

conn <- amcat.connect("https://amcat.nl")

# One row per (article, query) hit; `labels` names the two queries.
h <- amcat.hits(
  conn,
  c("mortgage*", "greek* OR greece*"),
  labels = c("mortgage", "greece"),
  sets = 29454
)
head(h)
table(h$query)

# Wide format: one row per article id, one column per query label.
# Articles without a hit for a query come out as NA, so zero-fill them.
hw <- dcast(h, id ~ query, value.var = "count")
hw[is.na(hw)] <- 0
head(hw)

# Article metadata; TRUE is spelled out because `T` is an ordinary,
# reassignable variable.
meta <- amcat.getarticlemeta(
  conn,
  project = 41,
  29454,
  columns = c("date", "medium"),
  dateparts = TRUE
)
head(meta)

# merge() joins on the columns the two data frames have in common.
h <- merge(meta, h)
head(h)

# FUN = length counts rows, so the resulting `count` column is the number of
# hit rows per year and query, not the sum of the per-article hit counts.
a <- aggregate(h["count"], h[c("year", "query")], length)
head(a)

ggplot(a, aes(x = year, y = count, color = query)) + geom_line()
# Fetch recent tweets and push them to AmCAT as a new article set.
# NOTE(review): searchTwitteR presumably needs twitteR OAuth to be configured
# beforehand -- confirm; no setup call is visible in this script.
library(twitteR)

tweets <- searchTwitteR("#bigdata -rt", resultType = "recent", n = 100)
# Flatten the list of status objects into a single data frame.
tweets <- plyr::ldply(tweets, as.data.frame)

# The tweet text doubles as the headline.
set <- amcat.upload.articles(
  conn,
  project = 1,
  articleset = "twitter test",
  medium = "twitter",
  text = tweets$text,
  headline = tweets$text,
  date = tweets$created,
  author = tweets$screenName
)
head(amcat.getarticlemeta(conn, 1, set, columns = c("date", "headline")))

# Store the ids of the articles that had query hits as a separate set.
amcat.add.articles.to.set(
  conn,
  1,
  articleset.name = "subset from r",
  articles = h$id
)
# Build small document-term matrices (DTMs) and inspect them.
library(RTextTools)

# Stemming on, stopword removal off. TRUE/FALSE are spelled out because
# `T`/`F` are ordinary variables that can be reassigned.
dtm <- create_matrix(
  c("He is lying that he lies in bed, all lies!"),
  removeStopwords = FALSE,
  stemWords = TRUE
)
dtm

# Two-document example with no minimum document frequency or word length.
dtm <- create_matrix(
  c("I am Spartacus", "We are Spartacus"),
  removeStopwords = FALSE,
  stemWords = TRUE,
  minDocFreq = 0,
  minWordLength = 0
)
as.matrix(dtm)

# slam supplies column sums that work directly on the DTM object.
library(slam)
col_sums(dtm)

m <- as.matrix(dtm)
rownames(m)
m[, 1]

# Namespace-qualified so this line does not depend on devtools having been
# attached earlier in the session.
devtools::install_github("kasperwelbers/corpus-tools")
library(corpustools)
# NOTE(review): `t` shares its name with base::t(); function calls still
# resolve to base::t(), but a more descriptive name would be clearer.
t <- term.statistics(dtm)
t
colnames(m)

# Row/column summaries on a plain dense matrix (5 x 2 of random normals).
m <- matrix(rnorm(10), ncol = 2)
m
rowSums(m)
colSums(m)
rowMeans(as.matrix(dtm))
# The original called col_sums() with no argument, which raises an error;
# the DTM is the intended input.
col_sums(dtm)
colnames(dtm)
rownames(dtm)

# Drop the columns whose term is in the custom stopword list.
stopwords <- c("i", "am", "are")
m <- dtm[, !(colnames(dtm) %in% stopwords)]
as.matrix(m)
# Request the first page (one article per page) of tokens from the
# "corenlp_lemmatize" module for article set 29523 in project 1.
tokens <- amcat.gettokens(
  conn,
  1,
  29523,
  page_size = 1,
  module = "corenlp_lemmatize",
  max_page = 1
)
head(tokens)

# Upload a single hand-written test article.
set <- amcat.upload.articles(
  conn,
  project = 1,
  articleset = "r test",
  medium = "twitter",
  text = "John went to three bars in Amsterda",
  headline = "-",
  date = "2001-01-01T00:00"
)
# Load the bundled `sotu` data set and look at its token table.
data(sotu)
head(sotu.tokens)
sotu.tokens[sotu.tokens$lemma == "unfinished", ]

# Keep only tokens with pos1 == "M"
# (presumably proper names, given the variable name -- confirm).
names <- sotu.tokens[sotu.tokens$pos1 == "M", ]
head(names)

# Two equivalent ways of building the document-term matrix:
# from the intermediate data frame ...
dtm <- dtm.create(names$aid, names$lemma)
# ... or in one step, evaluating the column names inside the filtered frame.
dtm <- with(
  sotu.tokens[sotu.tokens$pos1 == "M", ],
  dtm.create(aid, lemma)
)

# sqrt is passed as the frequency transform for the word cloud.
dtm.wordcloud(dtm, freq.fun = sqrt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment