# You might need these packages:
#install.packages("devtools")
#devtools::install_github("amcat/amcat-r")
#devtools::install_github("kasperwelbers/corpus-tools")
#devtools::install_github("vanatteveldt/rsyntax")
library(corpustools)
library(amcatr)
conn = amcat.connect("https://amcat.nl")
# retrieve hits and article metadata
h = amcat.hits(conn, "referend*", sets=25173)
head(h)
meta = amcat.getarticlemeta(conn, 1006, 25173, dateparts = T)
head(meta)
save(meta, h, file="metah.rda")
# select, merge, aggregate, and plot
h = merge(meta, h)
sort(table(h$medium))
kranten = c("De Volkskrant", "Trouw", "NRC Handelsblad")
h2 = h[h$medium %in% kranten, ]
d = dcast(h2, week ~ medium, value.var="count", fun.aggregate = sum)
head(d)
plot(x=d$week, y=d$`De Volkskrant`, type="l")
lines(x=d$week, y=d$`Trouw`, col="red")
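# optional extra (not in the original): also draw the third paper and add a
# legend so the three lines are identifiable
lines(x=d$week, y=d$`NRC Handelsblad`, col="blue")
legend("topright", legend=kranten, col=c("black", "red", "blue"), lty=1)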
# Nice: that's just the AmCAT web interface, but *more* complicated. What more could we want? :-)
# Corpus analysis: the bag of words, aka the document-term matrix
library(RTextTools)
input = data.frame(text=c("Chickens are birds", "The bird eats"))
m = create_matrix(input$text, removeStopwords=F)
as.matrix(m)
# the same texts, now with stopword removal and stemming
m = create_matrix(input$text, removeStopwords=T, stemWords=T, language='english')
dim(m)
as.matrix(m)
colSums(as.matrix(m))
# this also works for Dutch
text = c("De kip eet", "De kippen hebben gegeten")
m = create_matrix(text, removeStopwords=T, stemWords=T, language="dutch")
colSums(as.matrix(m))
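# sanity check (added): with Dutch stemming, 'kip' and 'kippen' should be
# collapsed into a single stem column
as.matrix(m)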
# Enter AmCAT - getting 'tokens'
tokens = amcat.gettokens(conn, 1006, 25173, page_size=100, max_page=2)
saveRDS(tokens, "tokens_elastic.rds")
head(tokens)
library(corpustools)
dtm = dtm.create(tokens$aid, tokens$term)
dtm.wordcloud(dtm)
# Well, that didn't help, did it? We need preprocessing!
tokens = amcat.gettokens(conn, 1006, 25173, module="morphosyntactic", page_size=100, max_page=1, only_cached=T)
# saveRDS(tokens, "~/oekref16/data/tokens.rds")
tokens = readRDS("~/oekref16/data/tokens.rds")
head(tokens)
# create a dtm from a (subset of the) tokens: keep only proper names
library(corpustools)
subset = tokens[tokens$pos == "name", ]
dtm = dtm.create(subset$aid, subset$lemma)
as.matrix(dtm)[1:10, 1:10]
dtm.wordcloud(dtm, nterms = 200)
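# Added sketch: the same recipe works for other parts of speech. The tag value
# "noun" is an assumption about the tag set of the 'morphosyntactic' module;
# check the actual values with table(tokens$pos) first.
nouns = tokens[tokens$pos == "noun", ]
dtm.wordcloud(dtm.create(nouns$aid, nouns$lemma), nterms = 100)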
# Case 1: State-of-the-Union
data(sotu)
head(sotu.tokens)
# number of documents per speaker
aggregate(cbind(Freq=sotu.meta$id), list(Speaker=sotu.meta$headline), length)
# Contrast: Bush vs Obama
dtm = with(sotu.tokens[sotu.tokens$pos1 %in% c("N", "A", "M"), ],
           dtm.create(aid, lemma))
obama = sotu.meta$id[sotu.meta$headline == "Barack Obama"]
cmp = corpora.compare(dtm, select.rows = obama)
# map over-representation to hue (towards blue for Obama-typical, red for Bush-typical)
h = rescale(log(cmp$over), c(1, .6666))
# map distinctiveness (chi2) to saturation and brightness
s = rescale(sqrt(cmp$chi), c(.25, 1))
cmp$col = hsv(h, s, .33 + .67*s)
head(cmp)
with(head(cmp, 130), plotWords(x=log(over), words=term, wordfreq=termfreq, random.y = T, col=col, scale=2))
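# quick numeric check (added): the most speaker-typical terms without plotting
head(cmp[order(-cmp$over), c("term", "over", "chi")])  # over-represented for Obama
head(cmp[order(cmp$over), c("term", "over", "chi")])   # over-represented for Bush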
wordfreqs = dtm.to.df(dtm)
wordfreqs = merge(sotu.meta, wordfreqs, by.x="id", by.y="doc")
# mmode: the statistical mode, i.e. the most frequent value in a vector
mmode <- function(v) {uniqv <- unique(v); uniqv[which.max(tabulate(match(v, uniqv)))]}
# the modal (most typical) date at which each term is used
dates = aggregate(wordfreqs["date"], by=wordfreqs["term"], FUN=mmode)
cmp = arrange(merge(cmp, dates), -termfreq)
with(head(cmp, 100), plotWords(x=date, words=term, wordfreq=termfreq, random.y = T, col=col, scale=2))
# Okay, that was fun
# Now let's have a look at topic modeling
# Visualizing word clusters
set.seed(123)
m = lda.fit(dtm, K=10, alpha=.1)
# manual topic labels, chosen by inspecting the top terms per topic below
topics = c("War", "People", "Energy", "Education", "Tax", "Reform", "Freedom", "Terrorism", "Jobs", "Health")
x = terms(m, 10)
colnames(x) = topics
head(x)
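# Added sketch: per-document topic proportions. This assumes lda.fit returns a
# topicmodels LDA object (its slots are used below), so posterior() applies.
td = topicmodels::posterior(m)$topics
colnames(td) = topics
round(head(td), 2)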
# Merge topic and word information
w = m@wordassignments
colnames(w) = m@terms
# mostfrequent: the topic to which a word is most often assigned (0 if never assigned)
mostfrequent = function(x) {t = names(sort(table(x[x!=0]), decreasing = T)[1]); if(is.null(t)) 0 else t}
word.topics = apply(w, MARGIN = 2, FUN = mostfrequent)
cmp$topic = as.numeric(word.topics[as.character(cmp$term)])
cmp$topic.name = factor(cmp$topic, labels = topics)
cols = sample(substr(rainbow(length(topics), s=0.6, alpha=0.5), 1, 7))
cmp$topic.col = cols[cmp$topic]
head(cmp)
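# added: how many of the compared words end up in each topic?
table(cmp$topic.name)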
# Topics over time
with(head(cmp, 100), plotWords(date, topic, words=as.character(term), wordfreq = termfreq, col=topic.col, scale=2))
# Topic overlap
# m@beta holds the (logged) per-topic word distributions; correlating its rows compares topics
cm = cor(t(m@beta))
colnames(cm) = rownames(cm) = topics
diag(cm) = 0
heatmap(cm, symm = T)
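# added: the single most similar topic pair, numerically
# (cm is symmetric, so both orderings of the pair are reported)
which(cm == max(cm), arr.ind = TRUE)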
# View topic overlap
compare.topics <- function(m, cmp_topics) {
  # long format: one row per (document, term, assigned topic)
  assignments = dtm.to.df(m@wordassignments, term_labels=m@terms, doc_labels=m@documents)
  # wide format: per term, in how many documents it is assigned to each topic
  terms = dcast(assignments, term ~ freq, value.var = "doc", fun.aggregate = length)
  terms = terms[, c(1, cmp_topics+1)]  # column 1 is 'term'; keep the two requested topics
  terms$freq = rowSums(terms[-1])
  terms = terms[terms$freq > 0, ]
  # proportion of assignments going to the first topic, mapped to a hue
  terms$prop = terms[[2]] / terms$freq
  terms$col = hsv(rescale(terms$prop, c(1, .6666)), .5, .5)
  terms$nn = rowSums(terms[2:3] > 0)
  terms[order(-terms$freq), ]
}
# Topic overlap: War and Peace
set.seed(123)
terms = compare.topics(m, match(c("Freedom", "War"), topics))
with(head(terms, 100), plotWords(x=prop, wordfreq = freq, words = term, col=col, xaxt="none", random.y = T, scale=2))
# ... or Reform vs Tax (this overwrites the selection above):
terms = compare.topics(m, match(c("Reform", "Tax"), topics))
with(head(terms, 100), plotWords(x=prop, wordfreq = freq, words = term, col=col, xaxt="none", random.y = T, scale=2))
# Topic overlap: Who is speaking?
terms$col.speaker = cmp$col[match(terms$term, cmp$term)]
with(head(terms, 100), plotWords(x=prop, wordfreq = freq, words = term, col=col.speaker, xaxt="none", random.y = T, scale=2))
# Topic overlap: three topics
compare.topics3 <- function(m, cmp_topics) {
  assignments = dtm.to.df(m@wordassignments, term_labels=m@terms, doc_labels=m@documents)
  terms = dcast(assignments, term ~ freq, value.var = "doc", fun.aggregate = length)
  terms = terms[, c(1, cmp_topics+1)]  # column 1 is 'term'
  colnames(terms)[-1] = c("x", "y", "z")
  terms$freq = rowSums(terms[-1])
  terms = terms[terms$freq > 0, ]
  terms$prop.x = terms$x / terms$freq
  terms$prop.y = terms$y / terms$freq
  terms$prop.z = terms$z / terms$freq
  # project the three proportions onto 2D ternary-plot coordinates
  tern = ggtern::tlr2xy(terms[c("x", "y", "z")], ggtern::coord_tern())
  terms$tx = tern$x
  terms$ty = tern$y
  # mix the three proportions as RGB: each corner of the triangle gets its own color
  terms$col = with(terms, rgb(x/freq, y/freq, z/freq))
  terms$nn = rowSums(terms[c("x","y","z")] > 0)
  terms[order(-terms$freq), ]
}
# Topic overlap: Health, Reform, and Tax
terms = compare.topics3(m, match(c("Tax", "Reform", "Health"), topics))
with(head(terms, 75), plotWords(tx, ty, term, freq, col=col, xaxt="none", scale=2))
# Who wants reform?
terms$col.speaker = cmp$col[match(terms$term, cmp$term)]
with(head(terms, 75), plotWords(tx, ty, term, freq, col=col.speaker, xaxt="none", scale=2))
# Educating the people about terrorism?
terms = compare.topics3(m, match(c("Terrorism", "Education", "People"), topics))
with(head(terms, 75), plotWords(tx, ty, term, freq, col=col, xaxt="none", scale=2))
terms$col.speaker = cmp$col[match(terms$term, cmp$term)]
with(head(terms, 75), plotWords(tx, ty, term, freq, col=col.speaker, xaxt="none", scale=2))
# Semantic Network Analysis
library(semnet)
# count co-occurrences of lemmata within a 20-token window per article
g = with(sotu.tokens[sotu.tokens$pos1 %in% c("N", "M", "A"), ],
         windowedCoOccurenceNetwork(location = id, term = lemma, context = aid, window.size = 20))
head(get.data.frame(g, "edges"))
# keep only the statistically strongest edges ('backbone' extraction)
gb = getBackboneNetwork(g, alpha = 1e-03, max.vertices = 75)
gb = decompose.graph(gb, min.vertices = 5, max.comps = 1)[[1]]
V(gb)$cluster = edge.betweenness.community(gb)$membership
gb = setNetworkAttributes(gb, size_attribute = V(gb)$freq, cluster_attribute = V(gb)$cluster)
plot(gb)
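# added: list the word clusters found by edge betweenness
split(V(gb)$name, V(gb)$cluster)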
# Semantic Networks: color by speaker
V(gb)$frame.color = V(gb)$color = cmp$col[match(V(gb)$name, cmp$term)]
plot(gb)
# Semantic Networks: color by LDA topic
V(gb)$frame.color = V(gb)$color = cmp$topic.col[match(V(gb)$name, cmp$term)]
plot(gb)
# Semantic Networks: contrasts
g.bush = with(sotu.tokens[sotu.tokens$pos1 %in% c("N", "M", "A") & sotu.tokens$aid %in% sotu.meta$id[sotu.meta$headline == "George W. Bush"], ],
              windowedCoOccurenceNetwork(location = id, term = lemma, context = aid, window.size = 20))
g.obama = with(sotu.tokens[sotu.tokens$pos1 %in% c("N", "M", "A") & sotu.tokens$aid %in% sotu.meta$id[sotu.meta$headline == "Barack Obama"], ],
               windowedCoOccurenceNetwork(location = id, term = lemma, context = aid, window.size = 20))
d.obama = get.data.frame(g.obama, what=c("edges"))
d.bush = get.data.frame(g.bush, what=c("edges"))
d = merge(d.obama, d.bush, by=c("from", "to"), all=T)
d[is.na(d)] = 0
# keep the edges that are stronger for Obama (weight.x) resp. Bush (weight.y)
do = d[d$weight.x > d$weight.y, ]
colnames(do)[3] = "weight"
g.obama = graph.data.frame(do, vertices = get.data.frame(g.obama, "vertices"))
db = d[d$weight.x < d$weight.y, ]
colnames(db)[4] = "weight"
g.bush = graph.data.frame(db, vertices = get.data.frame(g.bush, "vertices"))
# Semantic Networks: contrasts (plot the backbone of each contrast network)
for (g in list(g.obama, g.bush)) {
  g = getBackboneNetwork(g, alpha = 1e-03, max.vertices = 75)
  g = decompose.graph(g, min.vertices = 5, max.comps = 1)[[1]]
  V(g)$cluster = edge.betweenness.community(g)$membership
  g = setNetworkAttributes(g, size_attribute = V(g)$freq, cluster_attribute = V(g)$cluster)
  plot(g)
}
## Can we go one step further? --> Syntactic analysis for clause analysis
## Sentence -> (source, subject, predicate) triples
library(rsyntax)
data(example_tokens_dutchquotes)  # provides the example 'tokens' data.frame
tokens = tokens[tokens$sentence == 4, ]  # keep a single example sentence
get_text(tokens)
plot(graph_from_sentence(tokens))
q = get_quotes_nl(tokens)
cl = get_clauses_nl(tokens)
plot(graph_from_sentence(tokens, quotes = q, clauses = cl))
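# added: inspect the extracted quotes and clauses as tables as well
# (assuming get_quotes_nl / get_clauses_nl return data frames)
head(q)
head(cl)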
# Case 2: Gaza war
rm(list = ls())  # start with a clean workspace
setwd("~/Dropbox/papers/2015_clauses/clauses_analyses")
load("quotes_clauses.rda")
load("tokens.rda")
head(tokens)
head(clauses)
# keep quotes that have exactly one known source actor
src_actor = unique(quotes[quotes$actor != "" & quotes$quote_role == "source", c("quote_id", "actor")])
nn = table(src_actor$quote_id)
src_actor = subset(src_actor, !(quote_id %in% names(nn[nn>1])))
# attach the source actor to the tokens that make up the quote itself
quote_src = quotes[quotes$quote_role == "quote", c('quote_id', 'id')]
quote_src = merge(quote_src, src_actor)
quote_src = data.frame(id=quote_src$id, source=quote_src$actor)
clauses = merge(clauses, quote_src, all.x=T)
clauses = merge(clauses, tokens[c("id", "lemma", "pos1")])
# ids of clauses coded as attacks
agcl = unique(clauses$clause_id[clauses$attack])
# (1) Who is quoted?
x = unique(clauses[!is.na(clauses$source), c("clause_id", "country", "source")])
nn = table(x$country, x$source)
nn
nn[,2] / nn[,1]  # ratio of the second to the first source per country
chisq.test(nn)
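# added: row proportions make the per-country source distribution easier to read
prop.table(nn, margin = 1)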
# the same, restricted to attack clauses
x = unique(clauses[!is.na(clauses$source) & (clauses$clause_id %in% agcl), c("clause_id", "country", "source")])
nn = table(x$country, x$source)
nn
nn[,2] / nn[,1]
chisq.test(nn)
# (2) Who is aggressor / victim?
acl = unique(clauses[clauses$actor != "" & clauses$clause_id %in% agcl, c("clause_id", "clause_role", "country", "actor")])
head(acl)
# per country: the share of each clause role per actor
nn.us = with(subset(acl, country=="us"), table(actor, clause_role))
nn.us / rowSums(nn.us)
nn.cn = with(subset(acl, country=="cn"), table(actor, clause_role))
nn.cn / rowSums(nn.cn)
# do the countries differ in the roles they assign to each actor?
with(subset(acl, actor=="Israel"), chisq.test(country, clause_role))
with(subset(acl, actor=="Hamas"), chisq.test(country, clause_role))
# (3) What do they do?
stopwords = read.csv("stop-word-list.csv", header=F, stringsAsFactors=F)[[1]]
# dtm of predicates
# (%notin% is assumed to be defined elsewhere, e.g. in functions.r; it is the negation of %in%)
p = subset(clauses, clause_role == "predicate" & pos1 %in% c('V','N','M','A') & lemma %notin% stopwords)
library(corpustools)
library(semnet)
predicates = dtm.create(p$clause_id, p$lemma, filter.chars = F, minlength = 2)
#verbs = with(tokens[which(tokens$clause_role == "predicate"),], dtm.create(clause_id, lemma, filter = pos1 == 'V' & lemma %notin% stopwords))
# what do Hamas/Israel do according to the US / China?
cusa = unique(clauses$clause_id[clauses$country == "us"])
cchina = unique(clauses$clause_id[clauses$country == "cn"])
sh = unique(clauses$clause_id[clauses$clause_role=="subject" & clauses$actor == "Hamas"])
si = unique(clauses$clause_id[clauses$clause_role=="subject" & clauses$actor == "Israel"])
c(length(intersect(cusa, si)), length(intersect(cchina, si)), length(intersect(cusa, sh)), length(intersect(cchina, sh)))
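# added: the same four cell counts, labelled for readability
sapply(list(us_israel=intersect(cusa, si), cn_israel=intersect(cchina, si),
            us_hamas=intersect(cusa, sh), cn_hamas=intersect(cchina, sh)), length)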
source("functions.r") | |
plot.cooc.from.dtm(predicates, intersect(cusa, si), intersect(cchina, si), main="Country:US, Subject:Israel") | |
plot.cooc.from.dtm(predicates, intersect(cchina, si), intersect(cusa, si), main="Country:China, Subject:Israel") | |
plot.cooc.from.dtm(predicates, intersect(cusa, sh), intersect(cchina, sh), main="Country:US, Subject:Hamas") | |
plot.cooc.from.dtm(predicates, intersect(cchina, sh), intersect(cusa, sh), main="Country:China, Subject:Hamas") | |
# (6) Who talks about whom?
head(quotes)
entities = get.entities(tokens)
# keep named entities (plus Israel) that occur in quotes, as source or as quoted content
qnames = na.omit(merge(quotes, subset(entities, name == "Israel" | type %in% c("ORGANIZATION", "PERSON")))[c("country", "quote_id", "quote_role", "name")])
srcs = qnames[qnames$quote_role == "source", c("country", "quote_id", "name")]
qts = qnames[qnames$quote_role == "quote", c("country", "quote_id", "name")]
###
head(srcs)
# how often is each name used as a source per country?
cmp = dcast(srcs, name ~ country, fun.aggregate = length, value.var="quote_id")
smooth = .000001  # smoothing constant to avoid division by zero
cmp$relcn = cmp$cn / sum(cmp$cn)
cmp$relus = cmp$us / sum(cmp$us)
head(cmp)
cmp$overus = (cmp$relus + smooth) / (cmp$relcn + smooth)
cmp$chi = with(cmp, corpustools:::chi2(us, cn, sum(us) - us, sum(cn) - cn))
subset(cmp, name == "Hamas")
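# added: the most country-typical sources overall
head(cmp[order(-cmp$overus), c("name", "us", "cn", "overus")])  # typically-US sources
head(cmp[order(cmp$overus), c("name", "us", "cn", "overus")])   # typically-Chinese sources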
###
# pair each source with the names mentioned inside its quotes, per country
x = merge(srcs, qts, by=c("quote_id", "country"))
x = aggregate(list(n=x$quote_id), by=x[c("name.x", "name.y", "country")], FUN=function(x) length(unique(x)))
table(x$country)
#g.cn = graph.data.frame(subset(x, n>=1 & country == "cn"), directed = T)
#V(g.cn)$weight = table(qnames$name[qnames$country == "cn"])[get.vertex.attribute(g.cn, "name")]
g.cn = graph.data.frame(subset(x, n>=1 & country == "cn" & (name.x %in% cmp$name[cmp$overus < 1] | name.y %in% cmp$name[cmp$overus < 1])), directed = T)
V(g.cn)$weight = cmp$chi[match(get.vertex.attribute(g.cn, "name"), cmp$name)]
g.cn = graph.settings(g.cn, backbone=T)
V(g.cn)$shape = "none"
plot(g.cn, main="Quote network China")
#g.us = graph.data.frame(subset(x, n>=1 & country == "us"), directed = T)
#V(g.us)$weight = table(qnames$name[qnames$country == "us"])[get.vertex.attribute(g.us, "name")]
g.us = graph.data.frame(subset(x, country == "us" & (name.x %in% cmp$name[cmp$overus > 1] | name.y %in% cmp$name[cmp$overus > 1])), directed = T)
V(g.us)$weight = cmp$chi[match(get.vertex.attribute(g.us, "name"), cmp$name)]
V(g.us)$weight[is.na(V(g.us)$weight)] = 0.01
g.us = graph.settings(g.us, backbone=T)
V(g.us)$shape = "none"
plot(g.us, main="Quote network US")