Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
// PhantomJS scraper: opens a derstandard.at article, jumps the forum
// pager to page `n`, and appends the posting-list HTML to contents.html.
//
// Usage: phantomjs parse-postinglists.js <url> <page-number>
//
// Fixes over the previous revision: missing semicolon after `outfile`,
// removal of the dead `loadInProgress` flag (written but never read),
// argument validation (missing args used to become the string
// "undefined"), and a nonzero exit code on network failure.
'use strict';

var system = require('system');
var webPage = require('webpage');
var fs = require('fs');

// Validate CLI arguments before doing any network work.
if (system.args.length < 3) {
    console.log('Usage: phantomjs parse-postinglists.js <url> <page>');
    phantom.exit(1);
}
var url = system.args[1];
var n = system.args[2] + ''; // coerce to string for the form field

var outfile = 'contents.html'; // appended to across successive runs

var page = webPage.create();
// Present a desktop browser UA so the site serves the full forum UI.
page.settings.userAgent =
    'Mozilla/5.0 (X11; Linux i686; rv:48.0) Gecko/20100101 Firefox/48.0';

page.onLoadStarted = function () {
    console.log('Loading...');
};

page.open(url, function (status) { // opens an async process
    if (status !== 'success') {
        console.log('Unable to access network');
        phantom.exit(1);
        return;
    }
    console.log('Status: ' + status);
    // console.log('Content Length: ' + page.content.length);

    // Step 1 (750 ms, giving the forum JS time to initialize): enter the
    // requested page number and click the jump button inside the page.
    setTimeout(function () {
        page.evaluate(function (n) {
            document.getElementById('CurrentPage').value = n;
            document.querySelector('.forum-tb-btnjump').click();
        }, n);
    }, 750);

    // Step 2 (1000 ms, i.e. 250 ms after the jump): report which page the
    // pager believes it is on, then append the posting list to outfile.
    setTimeout(function () {
        var cp = page.evaluate(function () {
            return document.querySelector(
                '.forum-tb-btnjump[data-currentpage]').getAttribute(
                'data-currentpage');
        });
        console.log('Current Page: ' + cp);
        var plist = page.evaluate(function () {
            return document.querySelector('#postinglist').innerHTML;
        });
        fs.write(outfile, plist, 'a');
        phantom.exit();
    }, 1000);
});
## -*- eval: (orgstruct-mode 1); orgstruct-heading-prefix-regexp: "##+ "; -*-
## Scrape and analyse the comment forum of a derstandard.at article:
## posting frequency, vote totals, top posters, and a tf-idf keyword cloud.
library(dplyr)      # data wrangling verbs (select/mutate/arrange/...)
library(rvest)      # HTML scraping (read_html, html_nodes, html_attr)
library(ggplot2)    # plotting
library(tm)         # text-mining corpus / term-document matrices
library(SnowballC)  # stemmer backing tm's stemDocument
library(wordcloud)  # keyword-cloud rendering
library(lubridate)  # date helpers
library(scales)     # date_breaks/date_format for ggplot axes
library(svglite)    # SVG graphics device
options("scipen" = 100, "digits" = 4)  # suppress scientific notation, short prints
## C time locale so strptime's %B below parses English month names.
## NOTE(review): confirm the scraped data-date strings actually carry
## English month names; German names would fail to parse under LC_TIME=C.
Sys.setlocale("LC_TIME", "C")
### * Retrieve Data
## Pull all 43 forum pages of the article through the PhantomJS helper;
## each invocation appends its posting-list markup to contents.html.
article_url <-
  "http://derstandard.at/2000043417670/Das-Vokabular-der-Asylkritiker"
for (pagenum in seq_len(43)) {
  system(sprintf("phantomjs parse-postinglists.js %s %d",
                 article_url, pagenum))
}
## Parse the accumulated HTML dump once scraping is complete.
rawsource <- read_html("contents.html", encoding = "utf8")
## various checks
## rawsource %>% html_node("#CurrentPage") %>% html_attr("value") %>% as.numeric()
## rawsource %>% html_nodes(".posting-content") %>% html_children() %>% html_text()
### * Extract Metadata
## Posting text bodies, whitespace-trimmed.
contents <- html_text(html_nodes(rawsource, ".posting-content"), trim = TRUE)
## Author display names live in the title attribute of the user menu.
authors <- html_attr(html_nodes(rawsource, ".user-menu"), "title")
## Community badge counts attached to each author entry.
badges <- as.numeric(
  html_text(
    html_nodes(
      html_nodes(rawsource, "li .unfollowCommunityId"),
      ".user-badge")))
## Per-posting vote widgets; the .n counter holds downvotes, .p upvotes.
votebox <- html_nodes(rawsource, ".bgvote")
nrating <- as.numeric(html_text(html_nodes(votebox, ".l.counter.n")))
prating <- as.numeric(html_text(html_nodes(votebox, ".l.counter.p")))
## Posting timestamps: read the data-date attribute, parse the
## "day. Month year, H:M:S" form, re-format as "yy-mm-dd HH:MM".
postingdate <-
  format(
    strptime(
      html_attr(
        html_nodes(html_nodes(rawsource, ".date"), ".timestamp"),
        "data-date"),
      format = "%e. %B %Y, %H:%M:%S"),
    "%y-%m-%d %H:%M")
## Numeric posting ids and parent ids (parent is NA for top-level posts).
postnodes <- html_nodes(rawsource, ".posting")
pid <- as.numeric(html_attr(postnodes, "data-postingid"))
parentpid <- as.numeric(html_attr(postnodes, "data-parentpostingid"))
## One row per posting, sorted by upvote count, most popular first.
res <- dplyr::arrange(
  data.frame(content = contents,
             author = authors,
             pos = prating,
             neg = nrating,
             badges,
             date = postingdate,
             pid,
             parentpid),
  desc(pos))
### * Posting Count Over Time
## Bin postings into 30-minute windows; per bin, count postings (N) and
## total the up-/downvotes.  (Two-digit "%y" dates parse consistently
## throughout the script, so the bins line up with the plot annotations.)
pfreq <- select(res, date, pos, neg)
pfreq <- mutate(pfreq,
                date = as.POSIXct(cut(as.POSIXct(date), breaks = "30 min")))
pfreq <- summarise(group_by(pfreq, date),
                   N = n(), pos = sum(pos), neg = sum(neg))
## binning datetime
## cut(pfreq$date, '15 mins')
## Bar chart of posting counts per 30-minute bin, minimal grey theme.
ggplot(pfreq, aes(x = date, y = N)) +
  geom_bar(alpha = 0.9, stat = "identity") +
  labs(y = "Number of Postings") +
  theme_classic() +
  scale_y_continuous(expand = c(0, 0.5)) +
  scale_x_datetime(labels = date_format("%a %H:%M"),
                   breaks = date_breaks("6 hours"),
                   expand = c(0.01, 0)) +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_line(size = 0.5, color = "#858585"),
        axis.line.y = element_line(size = 0.2, color = "#858585"),
        axis.line.x = element_line(size = 0.2, color = "#858585"),
        axis.title.y = element_text(size = 10, color = "#858585"),
        axis.title.x = element_blank(),
        axis.text = element_text(size = 8, color = "#858585"),
        legend.title = element_blank(),
        legend.key = element_blank())
### * Late to the Party
## The ratio of votes to postings declines rapidly over the thread's life.
mutate(pfreq, Ratio = (pos + neg) / N)
### * Upvotes/Downvotes Over Time
## Voting activity per 30-minute bin: green area = upvotes, red area =
## downvotes.  Literal colour names are mapped as fills and relabelled
## via scale_fill_identity so they double as legend keys.
ggplot(pfreq, aes(date, pos)) +
## thin outline tracing the upvote counts
geom_line(stat="identity", size=0.15) +
geom_area(aes(fill = "forestgreen"), stat="identity", alpha=0.35) +
## downvotes drawn as an independent second area; inherit.aes=FALSE so it
## does not pick up the y = pos mapping from the top-level aes()
geom_area(aes(date, neg, fill = "red"), stat="identity", alpha=0.35,
inherit.aes=FALSE) +
ylab("Number of Votes") +
theme_classic() +
## one x tick every 6 hours, labelled like "Tue 20:05"
scale_x_datetime(breaks=date_breaks("6 hours"),
labels=date_format("%a %k:%M"),
expand=c(0.01,0)) +
scale_y_continuous(expand=c(0,10)) +
## take fill values literally, relabel them for the legend
scale_fill_identity(guide = "legend",
labels = c("Upvote", "Downvote")) +
## scale_alpha_identity(guide = "legend", labels = c("Upvote", "Downvote")) +
theme(axis.text = element_text(size=8, color="#858585", family="serif"),
axis.title.x = element_blank(),
axis.title.y = element_text(size=10, color="#858585", family="serif"),
axis.line.x = element_blank(),
axis.line.y = element_blank(),
## axis.line.x = element_line(size=0.2, color="#858585"),
## axis.line.y = element_line(size=0.2, color="#858585"),
## axis.ticks.x = element_line(size = 0.3, color="#858585"),
axis.ticks.y = element_line(size = 0.3, color="#858585"),
axis.ticks.x = element_blank(),
legend.position = c(0.85, 0.3),
legend.title = element_blank(),
legend.text = element_text(size=8, color="#858585", family="serif"),
legend.key.width = unit(0.6, "cm"),
legend.key.size = unit(0.4, "cm")) +
## lighten the legend swatches to match the area alpha
guides(fill = guide_legend(override.aes = list(alpha=0.3))) +
## NOTE(review): "16-09-06" uses the same two-digit "%y" year format that
## postingdate was written in, so the annotation x-position matches the
## binned axis -- confirm as.POSIXct parses both identically on this system.
ggplot2::annotate("text", x = as.POSIXct("16-09-06 20:05:00"), y = 475,
adj=0, family="serif", color="#858585", size=3,
label = paste0("Number of upvotes and downvotes ",
"(binned to 30min time windows)\n",
"for comments in ",
"'Das Vokabular der Asylkritiker'\n",
"(in derstandard.at, 2016-09-06)"))
## ggsave with no plot argument saves the last plot displayed
ggsave(filename="./plots/tl-ratings.svg", width=32, height=20, units="cm",
scale=0.7)
### * Top Poster
## Five highest-upvoted postings (author with vote counts).
head(arrange(res[, c("author", "pos", "neg")], desc(pos)), 5)
## Five most active authors by posting count.
top_n(dplyr::count(res, author, sort = TRUE), 5)
### * Create Corpus
## tm specific
## getSources()
## getReaders()
## apply the right meta data to corpus from data.frame object
## Map res' columns onto tm's document content and metadata slots.
dfreader <-
list(content = "content",
author = "author",
datetimestamp = "date",
pos = "pos",
neg = "neg",
badges = "badges",
pid = "pid",
parentpid = "parentpid") %>%
readTabular()
## Build the corpus and clean it.  The tm_map order is deliberate:
## lowercasing happens before stopword removal (stopword lists are
## lowercase) and stemming runs last, on the already-cleaned tokens.
corp <- VCorpus(DataframeSource(res),
readerControl = list(reader = dfreader,
language = "de")) %>%
## transformations
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(content_transformer(tolower)) %>%
## depending on which weighting function we use, tfidf is sensible
## to stopword removal:
## http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
tm_map(removeWords, stopwords("german")) %>%
## extra function words that survive the stock German stopword list
tm_map(removeWords, c("vgl", "bzw", "dass", "daher", "eben",
"erst", "eher", "dies", "etc", "schon",
"wer", "war", "gibt")) %>%
tm_map(stemDocument) %>%
tm_map(stripWhitespace)
### * TDM and Weighting
### (1 + log(f_{t,d})) log(1 + (N / n_t))
## Custom tf-idf weighting for tm matrices implementing the formula above:
## sublinear (log-scaled) term frequency times smoothed inverse document
## frequency.  Accepts a TermDocumentMatrix or a DocumentTermMatrix (the
## latter is transposed on entry and back on return).
weightTfIdf.adj <-
function(m) {
isDTM <- inherits(m, "DocumentTermMatrix")
if (isDTM)
m <- t(m)
### tf-idf is a two-fold - in this case log - normalization
## sublinear tf: 1 + log(f) for non-zero counts, zero otherwise
m$v <- ifelse(m$v > 0, 1 + log(m$v), 0)
### idf
## document frequency
## NOTE(review): rs sums the log-scaled tf values per term, not the count
## of documents containing the term (which would be row_sums(m > 0)) --
## confirm this deviation from a strict n_t is intended.
rs <- slam::row_sums(m)
## log(1 + N / n_t) -- inverse document frequency smooth
logrs <- log(1 + (nDocs(m) / rs))
## ntf * idf
## scales term row i by logrs[i] via slam's elementwise Ops recycling
m <- m * logrs
if (isDTM)
t(m)
else m
}
## Build the custom-weighted term-document matrix and average each term's
## tf-idf across documents.
tdm.adj <-
  TermDocumentMatrix(
    corp, control = list(weighting = weightTfIdf.adj))
tfidf.adj <-
  data.frame(term = tdm.adj$dimnames$Terms, tfidf = slam::row_means(tdm.adj))
svg("./plots/keywords.svg", width = 7.5, height = 6.5)
set.seed(123)  # reproducible word placement in the cloud
## terms with no finite weight in any document yield NA means; weight them 0
tfidf.adj$tfidf[is.na(tfidf.adj[, 2])] <- 0
## FIX: `pal2` was referenced below but never defined anywhere in this
## script, so the wordcloud() call errored.  Define the conventional
## Dark2 palette (RColorBrewer is a hard dependency of wordcloud).
pal2 <- RColorBrewer::brewer.pal(8, "Dark2")
tfidf.adj %>%
  ## ignore URLs
  filter(!grepl("http.*", term)) %>%
  with(wordcloud(words = term,
                 freq = tfidf,
                 colors = pal2,
                 random.order = TRUE,
                 scale = c(2.5, .25),
                 max.words = 150))
## caption in the top-left corner of the device
text(x = -0.1, y = 0.95, paste0("Keywords used in comments on\n",
                                "'Das Vokabular der Asylkritiker'\n",
                                "(in derstandard.at, 2016-09-06)"),
     col = "gray53", family = "serif", adj = c(0, NA), cex = 0.75)
dev.off()
## just out of curiosity, inspect the "bugelmess" posting
match_doc <-
  tm_filter(corp, function(doc) any(grepl("bugelmess", content(doc))))[[1]]
bugelmess <- meta(match_doc, "pid")
res[res$pid == bugelmess, ]$content
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment