@kkobayashi
Created December 1, 2012 14:09
Usage

The steps to reproduce my analysis are simple.

  1. Download Wikipedia page-view (PV) data

You can download daily Wikipedia page-view data from the following site:

Wikipedia article traffic statistics (stats.grok.se)

JSON data is also available via a link on each article's traffic-statistics page. For example:

$ wget -O sakura.ayane.201211.json http://stats.grok.se/json/ja/201211/%E4%BD%90%E5%80%89%E7%B6%BE%E9%9F%B3
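
If you need several months at once, a small loop saves some typing. This is only a sketch in R using the same stats.grok.se URL pattern as the wget example above; the month range and output file names are my own examples, and the percent-encoded string is the title 佐倉綾音 taken from that example.

# sketch: download one article's monthly JSON files in a loop
title.enc <- "%E4%BD%90%E5%80%89%E7%B6%BE%E9%9F%B3"   # 佐倉綾音, percent-encoded
months    <- format(seq(as.Date("2009-12-01"), as.Date("2012-11-01"), by="month"), "%Y%m")
for(m in months){
  url <- sprintf("http://stats.grok.se/json/ja/%s/%s", m, title.enc)
  download.file(url, sprintf("sakura.ayane.%s.json", m))
}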
  2. Merge and convert the JSON files into one table

$ ls *.json | perl wpv_j2t.pl > wikipedia_pageview.table
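
Before moving on, a quick sanity check of the merged table can be useful. A minimal sketch, assuming the table was written as above; read.delim() turns the date headers into names like X2009.12.01, which is the format break_wikipv.R expects.

df <- read.delim("wikipedia_pageview.table")
dim(df)                                      # articles x days
head(rownames(df))                           # article titles
range(as.Date(colnames(df), "X%Y.%m.%d"))    # date range covered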
  3. Play with the data

Now we are ready to play with the data. I hope break_wikipv.R, shown below, helps with your analysis; a short example of ranking the results follows the script.

# break_wikipv.R
library(ggplot2)
library(plyr)   # ddply() below comes from plyr
# generate breakpoints
gen.breakpoints <- function(df, nbreaks=20){
  l <- sort(unlist(df))
  bp <- NULL
  n <- nbreaks
  while(length(bp) < nbreaks){
    bp <- unique(l[floor(seq(1/n, 1, 1/n) * length(l))])
    n <- n+1
  }
  c(-1, bp)
}
# plot transit frequency
plot.transit <- function(df, bp, title=""){
  # create transit data
  trans.global <- ddply(data.frame(1:nrow(df), df), 1, function(x){
    v <- cut(unlist(x)[-1], bp)
    levels(v) <- bp[-1]
    data.frame(from=v[-length(v)], to=v[-1])
  })
  # plot
  d <- ggplot(trans.global, aes(x=from, y=to))
  d + geom_bin2d() + scale_fill_gradient(low="grey80", high="black") +
    theme_bw() +
    opts(axis.text.x=theme_text(angle=-90, hjust=0)) +
    opts(title=title) +
    geom_abline(colour="black") +
    geom_abline(colour="black", intercept=6) +
    geom_abline(slope=-1, intercept=nlevels(trans.global$from)*0.75, colour="black")
}
# quick check: plot the daily page views of a specific article
# Example : pp("佐倉綾音", df)
pp <- function(n, df){
  x <- unlist(df[n,])
  r <- as.Date(names(x), "X%Y.%m.%d")
  from <- r[1]
  to <- r[length(r)]
  pv.m <- tapply(x, cut(r, "month"), median)
  ylim <- max(pv.m)*2
  # ylim <- max(x)
  plot(seq(from, to, by="days"), x, xlim=c(from, to), ylim=c(0, ylim), xlab="date", ylab="pv", main=n, xaxt="n", col="gray")
  par(new=T)
  plot(as.Date(names(pv.m)), pv.m, xlim=c(from, to), ylim=c(0, ylim), xlab="", ylab="", main="", xaxt="n", type="l", lwd=2, col="red")
  # axis.Date(1, at=seq(from, to, "3 months"), format="%Y/%m")
  axis.Date(1, at=seq(seq(from, by="month", len=2)[2], to, "3 months"), format="%Y/%m")
}
# main
# -----------------------------------------------------------------------
# read table data
f <- "wikipedia_pageview.table"
df <- read.delim(f)
# add filters if needed
df <- df[,regexpr("X2012.12", colnames(df))!=1]
# df <- df[sample(nrow(df), 30), c(1:5,10,20,30:32)]
# df <- df[,(regexpr("X2010", colnames(df))==1 | regexpr("X2009.12", colnames(df))==1)]
# df <- df[,(regexpr("X2011", colnames(df))==1 | regexpr("X2010.12", colnames(df))==1)]
# df <- df[,(regexpr("X2012", colnames(df))==1 | regexpr("X2011.12", colnames(df))==1)]
# convert to per-month data
df.m <- apply(df, 1, function(x){
  tapply(x, cut(as.Date(names(x), "X%Y.%m.%d"), "month"), median)
})
df.m <- as.data.frame(t(floor(df.m)))
# generate breakpoint
bp <- gen.breakpoints(df, 20)
# calculate score
# transit probability matrix
m <- matrix(0, length(bp)-1, length(bp)-1)
invisible(apply(df.m, 1, function(x){
  vl <- cut(unlist(x), bp)
  for(i in 1:(length(vl)-1)){
    m[vl[i], vl[i+1]] <<- m[vl[i], vl[i+1]] + 1
  }
}))
m2 <- t(apply(m, 1, function(x) x/sum(x)))
score <- ddply(data.frame(1:nrow(df.m), df.m), 1, function(x){
  v <- as.numeric(cut(unlist(x[-1]), bp))
  # score1. distance from x-y=0 : (y-x)/sqrt(2)
  d0 <- (v[-1] - v[-length(v)])/sqrt(2)
  s1 <- sum(d0)
  s11 <- sum(d0 * v[-1])
  # score2. transit probability
  s2 <- 0
  for(i in 1:(length(v)-1)){
    s2 <- s2 + log(m2[v[i], v[i+1]])
  }
  # score3. variance
  s3 <- var(sqrt(v[-length(v)]^2 + v[-1]^2))
  s31 <- var(sqrt(v[-length(v)]^2 + v[-1]^2) * v[-1])
  data.frame(s1, s11, s2, s3, s31)
})
rownames(score) <- rownames(df.m)
score <- score[,-1]
write.table(score, "score.txt", quote=F, sep="\t")
# head(score[sort.list(score$s1, dec=T),], 40)
# head(score[sort.list(score$s2),], 40)
# head(score[sort.list(score$s3, dec=T),], 40)
# plot global transit frequency
plt.g <- plot.transit(df.m, bp)
ggsave("global_transit.png", plt.g)
# plot individual transit frequency
invisible(sapply(rownames(df.m), function(n){
  plt <- plot.transit(df.m[n,], bp, title=n)
  ggsave(sprintf("%s.png", n), plt)
}))
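
Once break_wikipv.R has run, the score data frame (also written to score.txt) has one row per article, and the commented head()/sort.list() lines near the end of the script rank the articles whose page views changed most sharply. A small sketch along those lines; the pp() call assumes the article 佐倉綾音 is actually present in your table.

head(score[sort.list(score$s1, decreasing=TRUE),], 40)   # largest upward drift (score 1)
head(score[sort.list(score$s2),], 40)                    # least probable transition sequences (score 2)
head(score[sort.list(score$s3, decreasing=TRUE),], 40)   # highest variance (score 3)
pp("佐倉綾音", df)                                        # daily page views of one article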
# wpv_j2t.pl : merge per-month JSON files into one tab-separated table
use strict;
use warnings;
use Path::Class qw(file);
use JSON::XS;
use Encode;
use Date::Simple qw(date);
use utf8;

my $sdate = date("2009-12-01");
my $edate = date("2012-12-31");
my %table;
while(my $file = <>){
    $file =~ tr/\x0A\x0D//d;
    ### $file
    my $data = decode_json file($file)->slurp;
    ### $data
    foreach(keys %{$data->{daily_views}}){
        $table{$data->{title}}->{$_} = $data->{daily_views}->{$_};
    }
}
### %table
my @keys;
for(my $d=$sdate; $d<=$edate; $d++){
    push(@keys, $d);
}
print join("\t", @keys) . "\n";
foreach my $name (keys %table){
    print encode('utf8', $name);
    for(my $d=$sdate; $d<=$edate; $d++){
        print "\t" . (exists $table{$name}->{$d} ? $table{$name}->{$d} : 0);
    }
    print "\n";
}