Skip to content

Instantly share code, notes, and snippets.

@hnagata
Created July 31, 2015 08:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hnagata/d2c607d6b4eba4f17304 to your computer and use it in GitHub Desktop.
Save hnagata/d2c607d6b4eba4f17304 to your computer and use it in GitHub Desktop.
## grid / ggplot ----
library(ggplot2)
library(grid)
make.grid <- function(row, col) {
grid.newpage()
l <- grid.layout(row, col)
v <- viewport(layout=l)
pushViewport(v)
}
print.at <- function(o, i, j) {
print(o, vp=viewport(layout.pos.row=i, layout.pos.col=j))
}
end.grid <- function() {
popViewport()
}
plot.to.file <- FALSE
## データの読み込み ----
dat <- read.csv("user.csv", fileEncoding="utf-8")
dat$date <- as.POSIXct(dat$date, tz="JST")
dat <- dat[order(dat$user, dat$date), ]
enam <- c("chartInterval", "chartStability", "chartExpressiveness", "chartRhythm", "chartVibratoLongtone")
## とりあえずlm ----
lm.std <- lm(totalPoint ~ chartInterval + chartStability + chartExpressiveness + chartVibratoLongtone + chartRhythm, data=dat)
summary(lm.std)
## ノンパラlm ----
mask.nonp <- rep(TRUE, nrow(dat))
# 総合点100点を除外 (打ち切り対策)
mask.nonp <- mask.nonp & dat$totalPoint < 100
## 各項目30以上だけ使う (singular 対策)
#mask.nonp <- mask.nonp & apply(dat[, enam], 1, function(row) all(row[enam] >= 30))
# Singular 対策
mask.nonp <- mask.nonp & dat$chartRhythm != 0
mask.nonp <- mask.nonp & dat$chartRhythm != 2
mask.nonp <- mask.nonp & dat$chartExpressiveness != 13
# 推定
do.lm <- function(mask.nonp, use.user=FALSE) {
enam <- if (use.user) c(enam, "user") else enam
dat.lm <- dat[mask.nonp, c("totalPoint", enam)]
for (en in enam) dat.lm[[en]] <- factor(dat.lm[[en]])
design.lm <- data.frame(model.matrix(~ . - 1, dat.lm))
if ("chartInterval0" %in% colnames(design.lm)) {
design.lm <- design.lm[, -which(colnames(design.lm) == "chartInterval0")]
}
if (use.user) {
ucol <- paste0("user", levels(dat$user)[-1])
design.lm[, ucol] <- design.lm[, ucol] - as.numeric(dat$user[mask.nonp] == levels(dat$user)[1])
}
lm(totalPoint ~ . - 1, design.lm)
}
lm.nonp <- do.lm(mask.nonp)
summary(lm.nonp)
# 係数のプロット
coef.nonp <- coef(lm.nonp)
print.coef <- function(coef.nonp, lower=40) {
value <- c(-1, lower : 100)
df <- data.frame(
elements = factor(rep(enam, each=length(value)), levels=enam),
value = rep(value, 5),
beta = as.vector(sapply(enam, function(en) {
coef.nonp[paste0(en, value)]
}))
)
f <- function(en) {
ggplot(df[df$elements == en | df$value == -1, ], aes(x=value, y=beta)) +
geom_point(aes(color=elements)) +
scale_x_continuous(en, limits=c(lower, 100), breaks=seq(lower, 100, 10)) +
theme(legend.position="none")
}
make.grid(3, 2)
print.at(f("chartInterval"), 1, 1)
print.at(f("chartStability"), 1, 2)
print.at(f("chartExpressiveness"), 2, 1)
print.at(f("chartRhythm"), 2, 2)
print.at(f("chartVibratoLongtone"), 3, 1)
end.grid()
}
#print.coef(coef.nonp)
# 残差の分布を見る
resid.nonp <- resid(lm.nonp)
print.resid.point <- function(resid.nonp, mask.nonp) {
df <- data.frame(
index = 1 : length(resid.nonp),
resid = resid.nonp,
user = factor(as.numeric(dat$user[mask.nonp]) %% 3, levels=0:2)
)
make.grid(1, 1)
print.at(
ggplot(df, aes(x=index, y=resid)) +
geom_point(aes(color=user), size=1) +
theme(legend.position="none"),
1, 1)
end.grid()
}
print.resid.hist <- function(resid.nonp) {
df <- data.frame(resid = resid.nonp)
make.grid(1, 1)
print.at(
ggplot(df, aes(x=resid)) +
geom_histogram(aes(fill=0)) +
theme(legend.position="none"),
1, 1)
end.grid()
}
if (plot.to.file) svg("total-resid-index.svg", width=6, height=4)
print.resid.point(resid.nonp, mask.nonp)
if (plot.to.file) dev.off()
if (plot.to.file) svg("total-hist-resid.svg", width=6, height=4)
print.resid.hist(resid.nonp)
if (plot.to.file) dev.off()
## 外れ値を除去 ----
# ユーザーごとで 99.99% 点より外側のサンプルを除外
quant.resid <- tapply(resid.nonp, dat$user[mask.nonp], function(r) quantile(r, probs=c(0.4, 0.5, 0.6)))
mask.ol <- mapply(
function(resid, user) {
quant <- quant.resid[[user]]
resid >= quant["50%"] - (quant["60%"] - quant["40%"]) * (qnorm(0.9999) / (2*qnorm(0.6)))
},
resid.nonp, dat$user[mask.nonp]
)
mask.nonp2 <- mask.nonp
mask.nonp2[mask.nonp] <- mask.ol
# 判定された外れ値を見る
olresid.df <- data.frame(
index = 1 : length(resid.nonp),
resid = resid.nonp,
col = ifelse(mask.nonp2[mask.nonp], "darkgray", "red")
)
if (plot.to.file) svg("total-resid-index-ol.svg", width=6, height=4)
make.grid(1, 1)
print.at(
ggplot(olresid.df, aes(x=index, y=resid)) +
geom_point(color=olresid.df$col, size=1) +
theme(legend.position="none"),
1, 1)
end.grid()
if (plot.to.file) dev.off()
# 推定
lm.nonp2 <- do.lm(mask.nonp2, use.user=TRUE)
summary(lm.nonp2)
# 係数のプロット
coef.nonp2 <- coef(lm.nonp2)
if (plot.to.file) svg("total-coef-elem.svg", width=8, height=12)
print.coef(coef.nonp2, lower=40)
if (plot.to.file) dev.off()
# 各項目 80 以上での拡大図
df <- data.frame(
elements = factor(rep(enam, each=21), levels=enam),
value = rep(80 : 100, 5),
beta = as.vector(sapply(enam, function(en) {
ecoef <- coef.nonp2[paste0(en, 80 : 100)]
ecoef - max(ecoef, na.rm=TRUE)
}))
)
border <- 100 - sum(coef.nonp2[paste0(enam, 100)[-1]]) - coef.nonp2["chartInterval95"]
border
if (plot.to.file) svg("total-coef-elem-o80.svg", width=6, height=4)
make.grid(1, 1)
print.at(
ggplot(df, aes(x=value, y=beta, group=elements)) +
geom_hline(yintercept=border, color=1) +
geom_point(aes(color=elements)) +
geom_line(aes(color=elements)) +
xlim(c(80, 100)),
1, 1)
end.grid()
if (plot.to.file) dev.off()
# 残差の分布を見る
print.resid.hist(resid(lm.nonp2))
# 裏加点の box-plot
ura.per.user <- coef.nonp2[paste0("user", levels(dat$user)[-1])]
ura.per.user <- c(-sum(ura.per.user), ura.per.user)
df <- data.frame(
ura = resid(lm.nonp2) + ura.per.user[as.numeric(dat$user[mask.nonp2])],
user = dat$user[mask.nonp2]
)
df <- df[order(ura.per.user[as.numeric(dat$user[mask.nonp2])], decreasing=TRUE), ]
df$user <- factor(as.numeric(df$user), levels=unique(as.numeric(df$user)))
df$ucol <- factor(as.numeric(df$user) %% 3, levels=0:2)
if (plot.to.file) svg("total-resid-user.svg", width=6, height=4)
make.grid(1, 1)
print.at(
ggplot(df, aes(user, ura)) +
geom_boxplot(aes(color=ucol), outlier.size=1) +
theme(legend.position="none", axis.ticks=element_blank(), axis.text.x=element_blank()),
1, 1)
end.grid()
if (plot.to.file) dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment