|
library(ggplot2) |
|
|
|
# generate breakpoint |
|
gen.breakpoints <- function(df, nbreaks=20){ |
|
l <- sort(unlist(df)) |
|
bp <- NULL |
|
n <- nbreaks |
|
while(length(bp) < nbreaks){ |
|
bp <- unique(l[floor(seq(1/n, 1, 1/n) * length(l))]) |
|
n <- n+1 |
|
} |
|
c(-1, bp) |
|
} |
|
|
|
# plot transit frequency |
|
plot.transit <- function(df, bp, title=""){ |
|
# create transit data |
|
trans.global <- ddply(data.frame(1:nrow(df), df), 1, function(x){ |
|
v <- cut(unlist(x)[-1], bp) |
|
levels(v) <- bp[-1] |
|
data.frame(from=v[-length(v)], to=v[-1]) |
|
}) |
|
|
|
# plot |
|
d <- ggplot(trans.global, aes(x=from, y=to)) |
|
d + geom_bin2d() + scale_fill_gradient(low="grey80", high="black") + |
|
theme_bw() + |
|
opts(axis.text.x=theme_text(angle=-90, hjust=0)) + |
|
opts(title=title) + |
|
geom_abline(colour="black") + |
|
geom_abline(colour="black", intercept=6) + |
|
geom_abline(slope=-1, intercept=nlevels(trans.global$from)*0.75, colour="black") |
|
} |
|
|
|
# for a quick check to plot the data of specific name |
|
# Example : pp("²“¡‘”ü", df) |
|
pp <- function(n, df){ |
|
x <- unlist(df[n,]) |
|
r <- as.Date(names(x), "X%Y.%m.%d") |
|
from <- r[1] |
|
to <- r[length(r)] |
|
pv.m <- tapply(x, cut(r, "month"), median) |
|
|
|
ylim <- max(pv.m)*2 |
|
# ylim <- max(x) |
|
|
|
plot(seq(from, to, by="days"), x, xlim=c(from, to), ylim=c(0, ylim), xlab="date", ylab="pv", main=n, xaxt="n", col="gray") |
|
par(new=T) |
|
plot(as.Date(names(pv.m)), pv.m, xlim=c(from, to), ylim=c(0, ylim), xlab="", ylab="", main="", xaxt="n", type="l", lwd=2, col="red") |
|
# axis.Date(1, at=seq(from, to, "3 months"), format="%Y/%m") |
|
axis.Date(1, at=seq(seq(from, by="month", len=2)[2], to, "3 months"), format="%Y/%m") |
|
} |
|
|
|
|
|
# main |
|
# ----------------------------------------------------------------------- |
|
|
|
# read table data |
|
f <- "wikipedia_pageview.table" |
|
df <- read.delim(f) |
|
|
|
# add filters if needed |
|
df <- df[,regexpr("X2012.12", colnames(df))!=1] |
|
|
|
# df <- df[sample(nrow(df), 30), c(1:5,10,20,30:32)] |
|
# df <- df[,(regexpr("X2010", colnames(df))==1 | regexpr("X2009.12", colnames(df))==1)] |
|
# df <- df[,(regexpr("X2011", colnames(df))==1 | regexpr("X2010.12", colnames(df))==1)] |
|
# df <- df[,(regexpr("X2012", colnames(df))==1 | regexpr("X2011.12", colnames(df))==1)] |
|
|
|
# convert to per-month data |
|
df.m <- apply(df, 1, function(x){ |
|
tapply(x, cut(as.Date(names(x), "X%Y.%m.%d"), "month"), median) |
|
}) |
|
df.m <- as.data.frame(t(floor(df.m))) |
|
|
|
# generate breakpoint |
|
bp <- gen.breakpoints(df, 20) |
|
|
|
# calculate score |
|
|
|
# transit probability matrix |
|
m <- matrix(0, length(bp)-1, length(bp)-1) |
|
invisible(apply(df.m, 1, function(x){ |
|
vl <- cut(unlist(x), bp) |
|
for(i in 1:(length(vl)-1)){ |
|
m[vl[i], vl[i+1]] <<- m[vl[i], vl[i+1]] + 1 |
|
} |
|
})) |
|
m2 <- t(apply(m, 1, function(x) x/sum(x))) |
|
|
|
score <- ddply(data.frame(1:nrow(df.m), df.m), 1, function(x){ |
|
v <- as.numeric(cut(unlist(x[-1]), bp)) |
|
|
|
# score1. distance from x-y=0 : (y-x)/sqrt(2) |
|
d0 <- (v[-1] - v[-length(v)])/sqrt(2) |
|
s1 <- sum(d0) |
|
s11 <- sum(d0 * v[-1]) |
|
|
|
# score2. transit probability |
|
s2 <- 0 |
|
for(i in 1:(length(v)-1)){ |
|
s2 <- s2 + log(m2[v[i], v[i+1]]) |
|
} |
|
|
|
# score3. variance |
|
s3 <- var(sqrt(v[-length(v)]^2 + v[-1]^2)) |
|
s31 <- var(sqrt(v[-length(v)]^2 + v[-1]^2) * v[-1]) |
|
|
|
data.frame(s1, s11, s2, s3, s31) |
|
}) |
|
rownames(score) <- rownames(df.m) |
|
score <- score[,-1] |
|
write.table(score, "score.txt", quote=F, sep="\t") |
|
|
|
# head(score[sort.list(score$s1, dec=T),], 40) |
|
# head(score[sort.list(score$s2),], 40) |
|
# head(score[sort.list(score$s3, dec=T),], 40) |
|
|
|
# plot global transit frequency |
|
plt.g <- plot.transit(df.m, bp) |
|
ggsave("global_transit.png", plt.g) |
|
|
|
# plot indivisual transit frequency |
|
invisible(sapply(rownames(df.m), function(n){ |
|
plt <- plot.transit(df.m[n,], bp, title=n) |
|
ggsave(sprintf("%s.png", n), plt) |
|
})) |