# Gist by @vanatteveldt, created April 21, 2016
# You might need these packages:
#install.packages("devtools")
#devtools::install_github("amcat/amcat-r")
#devtools::install_github("kasperwelbers/corpus-tools")
#devtools::install_github("vanatteveldt/rsyntax")
library(corpustools)
library(amcatr)
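# On a fresh machine you may first need to store credentials (an assumption
# about your setup; amcat.save.password is the amcatr helper for this, the
# argument values shown are illustrative):
# amcat.save.password("https://amcat.nl", "username", "password")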
conn = amcat.connect("https://amcat.nl")
# retrieve hits and article metadata
h = amcat.hits(conn, "referend*", sets=25173)
head(h)
meta = amcat.getarticlemeta(conn, 1006, 25173, dateparts = T)
head(meta)
save(meta, h, file="metah.rda")
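# On later runs the API calls above can be skipped by reloading the cache:
# load("metah.rda")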
# select, merge, aggregate, and plot
h = merge(meta, h)
sort(table(h$medium))
kranten = c("De Volkskrant", "Trouw", "NRC Handelsblad")
h2 = h[h$medium %in% kranten, ]
library(reshape2) # needed for dcast
d = dcast(h2, week ~ medium, value.var="count", fun.aggregate = sum)
head(d)
plot(x=d$week, y=d$`De Volkskrant`, type="l")
lines(x=d$week, y=d$`Trouw`, col="red")
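# A small addition (a sketch, base graphics): label the two lines; De
# Volkskrant uses the default black, Trouw is red as set above.
legend("topleft", legend=c("De Volkskrant", "Trouw"), col=c("black", "red"), lty=1)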
# Good: the AmCAT interface, but *more* complicated. What more could I want? :-)
# Corpus analysis: the bag of words aka document-term matrix
library(RTextTools)
input = data.frame(text=c("Chickens are birds", "The bird eats"))
m = create_matrix(input$text, removeStopwords=F)
as.matrix(m)
m = create_matrix(input$text, removeStopwords=T, stemWords=T, language='english')
dim(m)
as.matrix(m)
colSums(as.matrix(m))
text = c("De kip eet", "De kippen hebben gegeten")
m = create_matrix(text, removeStopwords=T, stemWords=T, language="dutch")
colSums(as.matrix(m))
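# With Dutch stopword removal and stemming, singular "kip" and plural
# "kippen" should now collapse into a single column (assuming the Snowball
# stemmer handles this inflection).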
# Enter AmCAT - getting 'tokens'
tokens = amcat.gettokens(conn, 1006, 25173, page_size=100, max_page=2)
saveRDS(tokens, "tokens_elastic.rds")
head(tokens)
library(corpustools)
dtm = dtm.create(tokens$aid, tokens$term)
dtm.wordcloud(dtm)
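# Sanity check (a sketch): the raw term frequencies show the same problem
# as the wordcloud, with stopwords and punctuation dominating.
head(sort(colSums(as.matrix(dtm)), decreasing=TRUE), 10)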
# Well, that didn't help, did it? We need preprocessing!
tokens = amcat.gettokens(conn, 1006, 25173, module="morphosyntactic", page_size=100, max_page=1, only_cached=T)
# saveRDS(tokens, "~/oekref16/data/tokens.rds")
tokens = readRDS("~/oekref16/data/tokens.rds")
head(tokens)
# create dtm from (subset of) tokens
library(corpustools)
subset = tokens[tokens$pos == "name", ]
dtm = dtm.create(subset$aid, subset$lemma)
as.matrix(dtm)[1:10, 1:10]
dtm.wordcloud(dtm, nterms = 200)
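# The same recipe works for other word classes, e.g. verbs (a sketch; the
# exact pos label "verb" is an assumption about this tagger's tagset):
# verbs = tokens[tokens$pos == "verb", ]
# dtm.verbs = dtm.create(verbs$aid, verbs$lemma)
# dtm.wordcloud(dtm.verbs, nterms = 100)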
# Case 1: State-of-the-Union
data(sotu)
head(sotu.tokens)
aggregate(cbind(Freq=sotu.meta$id), list(Speaker=sotu.meta$headline), length)
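# Equivalently, a plain frequency table of speeches per speaker:
table(sotu.meta$headline)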
# Contrast: Bush vs Obama
dtm = with(sotu.tokens[sotu.tokens$pos1 %in% c("N", "A", "M"), ],
           dtm.create(aid, lemma))
obama = sotu.meta$id[sotu.meta$headline == "Barack Obama"]
cmp = corpora.compare(dtm, select.rows = obama)
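# Map the comparison statistics to colors: hue follows the log over-
# representation ratio, saturation and value follow the chi-square, so the
# most distinctive terms get the most saturated colors.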
library(scales) # for rescale (an assumption: rescale is not attached above)
h = rescale(log(cmp$over), c(1, .6666))
s = rescale(sqrt(cmp$chi), c(.25,1))
cmp$col = hsv(h, s, .33 + .67*s)
head(cmp)
with(head(cmp, 130), plotWords(x=log(over), words=term, wordfreq=termfreq, random.y = T, col=col, scale=2))
wordfreqs = dtm.to.df(dtm)
wordfreqs = merge(sotu.meta, wordfreqs, by.x="id", by.y="doc")
mmode <- function(v) {uniqv <- unique(v); uniqv[which.max(tabulate(match(v, uniqv)))]} # statistical mode
dates = aggregate(wordfreqs["date"], by=wordfreqs["term"], FUN=mmode)
library(plyr) # for arrange
cmp = arrange(merge(cmp, dates), -termfreq)
with(head(cmp, 100), plotWords(x=date, words=term, wordfreq=termfreq, random.y = T, col=col, scale=2))
# Okay, that was fun
# Now let's have a look at topic modeling
# Visualizing word clusters
set.seed(123)
m = lda.fit(dtm, K=10, alpha=.1)
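# Topic labels below were assigned by hand after inspecting terms(m, 10).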
topics = c("War", "People", "Energy", "Education", "Tax", "Reform", "Freedom", "Terrorism", "Jobs", "Health")
x = terms(m, 10)
colnames(x) = topics
head(x)
# Merge topic and word information
w = m@wordassignments
colnames(w) = m@terms
mostfrequent = function(x) {t = names(sort(table(x[x!=0]), decreasing = T)[1]); if(is.null(t)) 0 else t} # modal (non-zero) topic assignment per word
word.topics = apply(w, MARGIN = 2, FUN = mostfrequent)
cmp$topic = as.numeric(word.topics[as.character(cmp$term)])
cmp$topic.name = factor(cmp$topic, labels = topics)
cols = sample(substr(rainbow(length(topics), s=0.6, alpha=0.5), 1, 7)) # strip alpha channel from hex codes and shuffle the palette
cmp$topic.col = cols[cmp$topic]
head(cmp)
# Topics over time
with(head(cmp, 100), plotWords(date, topic, words=as.character(term), wordfreq = termfreq, col=topic.col, scale=2))
# Topic overlap
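# m@beta holds the (logged) per-topic word distributions; correlating its
# rows gives a topic-by-topic similarity matrix.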
cm = cor(t(m@beta))
colnames(cm) = rownames(cm) = topics
diag(cm) = 0
heatmap(cm, symm = T)
# View topic overlap
compare.topics <- function(m, cmp_topics) {
  assignments = dtm.to.df(m@wordassignments, term_labels=m@terms, doc_labels=m@documents)
  terms = dcast(assignments, term ~ freq, value.var = "doc", fun.aggregate = length)
  terms = terms[, c(1, cmp_topics+1)] # column 1 is 'term'
  terms$freq = rowSums(terms[-1])
  terms = terms[terms$freq > 0, ]
  terms$prop = terms[[2]] / terms$freq
  terms$col = hsv(rescale(terms$prop, c(1, .6666)), .5, .5)
  terms$nn = rowSums(terms[2:3] > 0)
  terms[order(-terms$freq), ]
}
# Topic overlap: War and Peace
set.seed(123)
terms = compare.topics(m, match(c("Freedom", "War"), topics))
terms = compare.topics(m, match(c("Reform", "Tax"), topics))
with(head(terms, 100), plotWords(x=prop, wordfreq = freq, words = term, col=col, xaxt="none", random.y = T, scale=2))
# Topic overlap: Who is speaking?
terms$col.speaker = cmp$col[match(terms$term, cmp$term)]
with(head(terms, 100), plotWords(x=prop, wordfreq = freq, words = term, col=col.speaker, xaxt="none", random.y = T, scale=2))
# Topic overlap: three topics
compare.topics3 <- function(m, cmp_topics) {
  assignments = dtm.to.df(m@wordassignments, term_labels=m@terms, doc_labels=m@documents)
  terms = dcast(assignments, term ~ freq, value.var = "doc", fun.aggregate = length)
  terms = terms[, c(1, cmp_topics+1)] # column 1 is 'term'
  colnames(terms)[-1] = c("x", "y", "z")
  terms$freq = rowSums(terms[-1])
  terms = terms[terms$freq > 0, ]
  terms$prop.x = terms$x / terms$freq
  terms$prop.y = terms$y / terms$freq
  terms$prop.z = terms$z / terms$freq
  # project the three proportions onto 2d ternary-plot coordinates
  tern = ggtern::tlr2xy(terms[c("x", "y", "z")], ggtern::coord_tern())
  terms$tx = tern$x
  terms$ty = tern$y
  terms$col = with(terms, rgb(x/freq, y/freq, z/freq))
  terms$nn = rowSums(terms[c("x","y","z")] > 0)
  terms[order(-terms$freq), ]
}
# Topic overlap: Health, Reform, and Tax
terms = compare.topics3(m, match(c("Tax", "Reform", "Health"), topics))
with(head(terms, 75), plotWords(tx, ty, term, freq, col=col, xaxt="none", scale=2))
# Who wants reform?
terms$col.speaker = cmp$col[match(terms$term, cmp$term)]
with(head(terms, 75), plotWords(tx, ty, term, freq, col=col.speaker, xaxt="none", scale=2))
# Educating the people about terrorism?
terms = compare.topics3(m, match(c("Terrorism", "Education", "People"), topics))
with(head(terms, 75), plotWords(tx, ty, term, freq, col=col, xaxt="none", scale=2))
terms$col.speaker = cmp$col[match(terms$term, cmp$term)]
with(head(terms, 75), plotWords(tx, ty, term, freq, col=col.speaker, xaxt="none", scale=2))
# Semantic Network Analysis
library(semnet)
g = with(sotu.tokens[sotu.tokens$pos1 %in% c("N", "M", "A"), ],
         windowedCoOccurenceNetwork(location = id, term = lemma, context = aid, window.size = 20))
head(get.data.frame(g, "edges"))
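# Reduce the dense co-occurrence graph to its statistically significant
# "backbone" edges, then keep only the largest connected component.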
gb = getBackboneNetwork(g, alpha = 1e-03, max.vertices = 75)
gb = decompose.graph(gb, min.vertices = 5, max.comps = 1)[[1]]
V(gb)$cluster = edge.betweenness.community(gb)$membership
gb = setNetworkAttributes(gb, size_attribute = V(gb)$freq, cluster_attribute = V(gb)$cluster)
plot(gb)
# Semantic Networks: color by speaker
V(gb)$frame.color = V(gb)$color = cmp$col[match(V(gb)$name, cmp$term)]
plot(gb)
# Semantic Networks: color by LDA topic
V(gb)$frame.color = V(gb)$color = cmp$topic.col[match(V(gb)$name, cmp$term)]
plot(gb)
# Semantic Networks: contrasts
g.bush = with(sotu.tokens[sotu.tokens$pos1 %in% c("N", "M", "A") & sotu.tokens$aid %in% sotu.meta$id[sotu.meta$headline == "George W. Bush"], ],
              windowedCoOccurenceNetwork(location = id, term = lemma, context = aid, window.size = 20))
g.obama = with(sotu.tokens[sotu.tokens$pos1 %in% c("N", "M", "A") & sotu.tokens$aid %in% sotu.meta$id[sotu.meta$headline == "Barack Obama"], ],
               windowedCoOccurenceNetwork(location = id, term = lemma, context = aid, window.size = 20))
d.obama = get.data.frame(g.obama, what=c("edges"))
d.bush = get.data.frame(g.bush, what=c("edges"))
d = merge(d.obama, d.bush, by=c("from", "to"), all=T)
d[is.na(d)] = 0
do = d[d$weight.x > d$weight.y, ]
colnames(do)[3] = "weight"
g.obama = graph.data.frame(do, vertices = get.data.frame(g.obama, "vertices"))
db = d[d$weight.x < d$weight.y, ]
colnames(db)[4] = "weight"
g.bush = graph.data.frame(db, vertices = get.data.frame(g.bush, "vertices"))
# Semantic Networks: contrasts
for (g in list(g.obama, g.bush)) {
  g = getBackboneNetwork(g, alpha = 1e-03, max.vertices = 75)
  g = decompose.graph(g, min.vertices = 5, max.comps = 1)[[1]]
  V(g)$cluster = edge.betweenness.community(g)$membership
  g = setNetworkAttributes(g, size_attribute = V(g)$freq, cluster_attribute = V(g)$cluster)
  plot(g)
}
## Can we go one step further? --> Syntactic analysis for clause analysis
## Sentence -> (source, subject, predicate) triples
library(rsyntax)
data(example_tokens_dutchquotes)
tokens = tokens[tokens$sentence == 4,]
get_text(tokens)
plot(graph_from_sentence(tokens))
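# Extract quote (source -> quote) and clause (subject -> predicate)
# structures from the dependency parse: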
q = get_quotes_nl(tokens)
cl = get_clauses_nl(tokens)
plot(graph_from_sentence(tokens, quotes = q, clauses = cl))
# Case 2: Gaza war
rm(list = ls())
setwd("~/Dropbox/papers/2015_clauses/clauses_analyses")
load("quotes_clauses.rda")
load("tokens.rda")
head(tokens)
head(clauses)
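# Plan: keep quotes with exactly one named source actor, attach that source
# to the clauses inside the quote, then add lemma/POS info per token.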
src_actor = unique(quotes[quotes$actor != "" & quotes$quote_role == "source", c("quote_id", "actor")])
nn = table(src_actor$quote_id)
src_actor = subset(src_actor, !(quote_id %in% names(nn[nn>1])))
quote_src = quotes[quotes$quote_role == "quote", c('quote_id', 'id')]
quote_src = merge(quote_src, src_actor)
quote_src = data.frame(id=quote_src$id, source=quote_src$actor)
clauses = merge(clauses, quote_src, all.x=T)
clauses = merge(clauses, tokens[c("id", "lemma", "pos1")])
agcl = unique(clauses$clause_id[clauses$attack])
# (1) Who is quoted?
x = unique(clauses[ c("clause_id", "country", "source")])
x = unique(clauses[!is.na(clauses$source), c("clause_id", "country", "source")])
nn = table(x$country, x$source)
nn
nn[,2] / nn[,1]
chisq.test(nn)
x = unique(clauses[!is.na(clauses$source) & (clauses$clause_id %in% agcl), c("clause_id", "country", "source")])
nn = table(x$country, x$source)
nn
nn[,2] / nn[,1]
chisq.test(nn)
# (2) Who is aggressor / victim
acl = unique(clauses[clauses$actor != "" & clauses$clause_id %in% agcl, c("clause_id", "clause_role", "country", "actor")])
head(acl)
nn.us = with(subset(acl, country=="us"), table(actor, clause_role))
nn.us / rowSums(nn.us)
nn.cn = with(subset(acl, country=="cn"), table(actor, clause_role))
nn.cn / rowSums(nn.cn)
with(subset(acl, actor=="Israel"), chisq.test(country, clause_role))
with(subset(acl, actor=="Hamas"), chisq.test(country, clause_role))
chisq.test(nn)
# (3) What do they do?
stopwords = read.csv("stop-word-list.csv", header =F, stringsAsFactors=F)[[1]]
# dtm of predicates
`%notin%` = function(x, table) !x %in% table # helper: negated %in% (not in base R)
p = subset(clauses, clause_role == "predicate" & pos1 %in% c('V','N','M','A') & lemma %notin% stopwords)
library(corpustools)
library(semnet)
predicates = dtm.create(p$clause_id, p$lemma, filter.chars = F, minlength = 2)
#verbs = with(tokens[which(tokens$clause_role == "predicate"),], dtm.create(clause_id, lemma, filter = pos1 == 'V' & lemma %notin% stopwords))
# what do hamas/israel do according to china / israel
cusa = unique(clauses$clause_id[clauses$country == "us"])
cchina = unique(clauses$clause_id[clauses$country == "cn"])
sh = unique(clauses$clause_id[clauses$clause_role=="subject" & clauses$actor == "Hamas"])
si = unique(clauses$clause_id[clauses$clause_role=="subject" & clauses$actor == "Israel"])
c(length(intersect(cusa, si)), length(intersect(cchina, si)), length(intersect(cusa, sh)), length(intersect(cchina, sh)))
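# The same four counts, labeled for readability (a sketch):
counts = matrix(c(length(intersect(cusa, si)), length(intersect(cusa, sh)),
                  length(intersect(cchina, si)), length(intersect(cchina, sh))),
                nrow=2, dimnames=list(subject=c("Israel", "Hamas"), country=c("us", "cn")))
counts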
source("functions.r")
plot.cooc.from.dtm(predicates, intersect(cusa, si), intersect(cchina, si), main="Country:US, Subject:Israel")
plot.cooc.from.dtm(predicates, intersect(cchina, si), intersect(cusa, si), main="Country:China, Subject:Israel")
plot.cooc.from.dtm(predicates, intersect(cusa, sh), intersect(cchina, sh), main="Country:US, Subject:Hamas")
plot.cooc.from.dtm(predicates, intersect(cchina, sh), intersect(cusa, sh), main="Country:China, Subject:Hamas")
# (6) Who talks about whom?
head(quotes)
entities = get.entities(tokens)
qnames = na.omit(merge(quotes, subset(entities, name == "Israel" | type %in% c("ORGANIZATION", "PERSON")))[c("country", "quote_id", "quote_role", "name")])
srcs = qnames[qnames$quote_role == "source", c("country", "quote_id", "name")]
qts = qnames[qnames$quote_role == "quote", c("country", "quote_id", "name")]
###
head(srcs)
cmp = dcast(srcs, name ~ country, fun.aggregate = length, value.var="quote_id")
smooth = .000001
cmp$relcn = cmp$cn / sum(cmp$cn)
cmp$relus = cmp$us / sum(cmp$us)
head(cmp)
cmp$overus = (cmp$relus + smooth) / (cmp$relcn + smooth)
cmp$chi = with(cmp, corpustools:::chi2(us, cn, sum(us) - us, sum(cn) - cn))
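# overus > 1 means a source is relatively more prominent in US coverage than
# in Chinese coverage; chi expresses the strength of that difference.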
subset(cmp, name == "Hamas")
###
x = merge(srcs, qts, by=c("quote_id", "country"))
x = aggregate(list(n=x$quote_id), by=x[c("name.x", "name.y", "country")], FUN=function(x) length(unique(x)))
table(x$country)
#g.cn = graph.data.frame(subset(x, n>=1 & country == "cn"), directed = T)
#V(g.cn)$weight = table(qnames$name[qnames$country == "cn"])[get.vertex.attribute(g.cn, "name")]
g.cn = graph.data.frame(subset(x, n>=1 & country == "cn" & (name.x %in% cmp$name[cmp$overus < 1] |name.y %in% cmp$name[cmp$overus < 1])), directed = T)
V(g.cn)$weight = cmp$chi[match(get.vertex.attribute(g.cn, "name"), cmp$name)]
g.cn = graph.settings(g.cn, backbone=T)
V(g.cn)$shape = "none"
plot(g.cn, main="Quote network China")
#g.us = graph.data.frame(subset(x, n>=1 & country == "us"), directed = T)
#V(g.us)$weight = table(qnames$name[qnames$country == "us"])[get.vertex.attribute(g.us, "name")]
g.us = graph.data.frame(subset(x, country == "us" & (name.x %in% cmp$name[cmp$overus > 1] | name.y %in% cmp$name[cmp$overus > 1])), directed = T)
V(g.us)$weight = cmp$chi[match(get.vertex.attribute(g.us, "name"), cmp$name)]
V(g.us)$weight[is.na(V(g.us)$weight)] = 0.01
g.us = graph.settings(g.us, backbone=T)
V(g.us)$shape = "none"
plot(g.us, main="Quote network US")