# run once
install.packages("devtools")
library(devtools)
devtools::install_github("amcat/amcat-r")
amcat.save.password("https://amcat.nl", "..", "..")
# run every time
library(amcatr)
conn = amcat.connect("https://amcat.nl")
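# count the number of hits per article for two queries in article set 29454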
h = amcat.hits(conn, c("mortgage*", "greek* OR greece*"), labels = c("mortgage", "greece"), sets = 29454)
head(h)
table(h$query)
head(h)
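# reshape the hits from long to wide format: one column per query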
library(reshape2)
hw = dcast(h, id ~ query, value.var = "count")
hw[is.na(hw)] = 0
head(hw)
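# retrieve article metadata (date, medium) and merge it with the hits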
meta = amcat.getarticlemeta(conn, project=41, 29454, columns=c("date", "medium"), dateparts = T)
head(meta)
h = merge(meta, h)
head(h)
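# aggregate: number of matching articles per year and query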
a = aggregate(h["count"], h[c("year", "query")], length)
head(a)
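# plot articles per year for each query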
library(ggplot2)
ggplot(a, aes(x=year, y=count, color=query)) + geom_line()
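# collect recent tweets with the twitteR package and convert them to a data frame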
library(twitteR)
tweets = searchTwitteR("#bigdata -rt", resultType="recent", n = 100)
tweets = plyr::ldply(tweets, as.data.frame)
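# upload the tweets to AMCAT as a new article set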
set = amcat.upload.articles(conn, project=1,
                            articleset="twitter test", medium="twitter",
                            text=tweets$text, headline=tweets$text,
                            date=tweets$created, author=tweets$screenName)
head(amcat.getarticlemeta(conn, 1, set, columns=c('date', 'headline')))
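# store the queried articles as a new article set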
amcat.add.articles.to.set(conn, 1, articleset.name="subset from r", articles=h$id)
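# create a document-term matrix with RTextTools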
library(RTextTools)
dtm = create_matrix(c("He is lying that he lies in bed, all lies!"), removeStopwords = F, stemWords = T)
dtm
dtm = create_matrix(c("I am Spartacus", "We are Spartacus"), removeStopwords = F, stemWords = T, minDocFreq = 0, minWordLength = 0)
as.matrix(dtm)
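# inspect the (sparse) dtm with the slam package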
library(slam)
col_sums(dtm)
m = as.matrix(dtm)
rownames(m)
m[,1]
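# compute term statistics with corpustools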
install_github("kasperwelbers/corpus-tools") | |
library(corpustools) | |
t= term.statistics(dtm) | |
t | |
colnames(m) | |
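# row and column sums/means on an ordinary matrix and on the dtm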
m = matrix(rnorm(10), ncol = 2)
m
rowSums(m)
colSums(m)
rowMeans(as.matrix(dtm))
col_sums(dtm)
colnames(dtm)
rownames(dtm)
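# remove stopword columns from the dtm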
stopwords = c("i", "am", "are")
m = dtm[, !(colnames(dtm) %in% stopwords)]
as.matrix(m)
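# retrieve lemmatized tokens from AMCAT (corenlp_lemmatize module)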
tokens = amcat.gettokens(conn, 1, 29523, page_size=1, module="corenlp_lemmatize", max_page=1)
head(tokens)
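# upload a single test article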
set = amcat.upload.articles(conn, project=1,
                            articleset="r test", medium="twitter",
                            text="John went to three bars in Amsterdam", headline="-",
                            date="2001-01-01T00:00")
data(sotu)
head(sotu.tokens)
sotu.tokens[sotu.tokens$lemma == "unfinished",]
names = sotu.tokens[sotu.tokens$pos1 == "M", ]
head(names)
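# build a dtm from the name tokens (pos1 == "M") and plot a word cloud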
dtm = dtm.create(names$aid, names$lemma)
dtm = with(sotu.tokens[sotu.tokens$pos1 == "M", ], dtm.create(aid, lemma))
dtm.wordcloud(dtm, freq.fun = sqrt)