Skip to content

Instantly share code, notes, and snippets.

@peristrophe
Last active November 28, 2017 23:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peristrophe/9e1f944f36c8f8638d715cf0a018e3c2 to your computer and use it in GitHub Desktop.
Save peristrophe/9e1f944f36c8f8638d715cf0a018e3c2 to your computer and use it in GitHub Desktop.
R scripts for machine-learning case studies.
# Case study: logistic regression / lasso on the contest001 data set.
require(glmnet)
source("D:/tmp/r/lib/my_functions.r")
master <- read.csv("D:/tmp/r/contest001/train.csv", header=T, row.names = 1, stringsAsFactors=TRUE)
tester <- read.csv("D:/tmp/r/contest001/test.csv", header=T, row.names = 1, stringsAsFactors=TRUE)
require(MASS)
# NOTE(review): `dat` is not defined at this point in the script -- this
# line (and the splits below) look pasted from the lasso script further
# down, where `dat` is read from lasso.tsv; confirm which data set the
# attribute filter (columns 36:71 all zero) was meant to apply to.
dat <- subset(dat, rowSums(dat[,36:71]) > 0)
#master <- without_empty_im(master)
#tester <- without_empty_im(tester)
# Fix the RNG seed so the train/test split below is reproducible.
set.seed(1000)
##### Data split (training data / test data)
#pn <- nrow(subset(dat, use == 1))
#nn <- nrow(subset(dat, use == 0))
#tester <- rbind(subset(dat, use == 1)[1:as.integer(pn * 0.2),], subset(dat, use == 0)[1:as.integer(nn * 0.2),])
#master <- rbind(subset(dat, use == 1)[(as.integer(pn * 0.2) + 1):pn,], subset(dat, use == 0)[(as.integer(nn * 0.2) + 1):nn,])
# Hold out a random 20% of each class (use == 1 / use == 0) as the test set.
pos <- sample(nrow(subset(dat, use == 1)), nrow(subset(dat, use == 1)) * 0.2)
neg <- sample(nrow(subset(dat, use == 0)), nrow(subset(dat, use == 0)) * 0.2)
tester <- rbind(subset(dat, use == 1)[pos,], subset(dat, use == 0)[neg,])
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,])
# Undersampling
# Shrink the training negatives down to the training-positive count.
# NOTE: this overwrites the full `master` built on the line just above.
#master <- rbind(subset(dat, use == 1)[(as.integer(pn * 0.2) + 1):pn,], subset(dat, use == 0)[(as.integer(nn * 0.2) + 1):(as.integer(nn * 0.2) + 1 + pn),])
down <- sample(nrow(subset(dat, use == 0)[-neg,]), nrow(subset(dat, use == 1)) - length(pos))
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,][down,])
# Undersampling: full data
#down <- sample(nrow(subset(dat, use == 0)), nrow(subset(dat, use == 1)))
#master <- master[-down,]
##### x: explanatory variables / y: target variable
#mx <- as.matrix(master[,2:131][,-47:-94])
mx <- as.matrix(master[,1:16])
my <- master$y
tx <- as.matrix(tester)
#ty <- tester$use
##### Logistic regression (stepwise variable selection)
# NOTE(review): this glm uses columns 2:72 of master while mx above uses
# 1:16 -- the two model sections appear pasted from different scripts;
# verify the column ranges against the actual contest001 layout.
m1 <- glm(use~., data = master[,2:72], family=binomial(link="logit"))
m2 <- step(m1)
prd <- predict(m2, newdata = as.data.frame(tx), type = "response")
# Use the median predicted probability as the classification cutoff.
border = median(prd)
# NOTE(review): `ty` is commented out above and never assigned in this
# script, so this table() call fails as written.
table(ty, floor(prd + (1.0 - border)))
##### Lasso regression (L1 path over the whole lambda sequence)
# FIX: family = "binomial" added so this path fit matches the binomial
# cv.glmnet call below (and the equivalent lasso script later in this
# file); the original call silently defaulted to a gaussian fit.
lasso.net <- glmnet(mx, my, family = "binomial", alpha = 1)
plot(lasso.net)
plot(lasso.net, xvar = "lambda")
##### Cross-validation to select lambda
lasso.cv <- cv.glmnet(mx, my, family = "binomial", alpha = 1)
plot(lasso.cv)
#split.screen(c(2,1))
#screen(1); plot(lasso.net)
#screen(2); plot(lasso.cv)
#close.screen(all=T)
##### Confusion matrix on the test data
# NOTE(review): `ty` is never assigned in this script (it is commented
# out above) -- the table() calls below will fail as written.
# type = class
prd <- predict(lasso.cv, newx = tx, type = "class", s="lambda.min")
table(ty, prd)
# type = response
prd <- predict(lasso.cv, newx = tx, type = "response", s="lambda.min")
border = median(prd)
table(ty, floor(prd + (1.0 - border)))
##### Partial regression coefficients at lambda.min
coef(lasso.cv, s="lambda.min")
barplot(drop(coef(lasso.cv, s="lambda.min")))
# Decision-tree analysis on the contest001 data set.
library(mvpart)
#library(rpart.plot)
#library(partykit)
#library(rattle)

# Load and shape the data.
#master <- read.delim("D:/tmp/r/car241/prm.tsv", sep="\t", header=T, stringsAsFactors=FALSE)
train_df <- read.csv("D:/tmp/r/contest001/train.csv", header = TRUE, stringsAsFactors = TRUE)
score_df <- read.csv("D:/tmp/r/contest001/test.csv", header = TRUE, stringsAsFactors = TRUE)
# Column 18 holds a categorical code; make it a factor for the tree.
train_df[, 18] <- as.factor(train_df[, 18])

# Fit the CART model (gini split criterion).
fit_tree <- rpart(
  y ~ .,
  data = train_df,
  control = rpart.control(minsplit = 5, cp = 0.037),
  parms = list(split = "gini")
)
fit_tree
printcp(fit_tree)
plotcp(fit_tree)

# Render the tree via partykit.
library(partykit)
plot(as.party(fit_tree))

# Score the test set and save the class probabilities.
tree_probs <- predict(fit_tree, score_df, type = "prob")
write.csv(tree_probs, "D:/tmp/r/contest001/predict.csv")
# Random forest classification on the contest001 data set.
require(rpart)
require(randomForest)
require(RColorBrewer)
master <- read.csv("D:/tmp/r/contest001/master.csv", header=T, row.names=1, stringsAsFactors=TRUE)
tester <- read.csv("D:/tmp/r/contest001/tester.csv", header=T, row.names=1, stringsAsFactors=TRUE)
# Make the target a factor so randomForest does classification, not regression.
master[, 'y'] <- as.factor(master[, 'y'])
#tester[, 'y'] <- as.factor(tester[, 'y'])
# Feature engineering: copy `month` into `mon`, log-transform `campaign`.
# NOTE(review): `mon` is added to master only, but the formula y~. includes
# it -- predict() on tester (which has no `mon`) is likely to fail; verify.
master <- transform(master, mon=master$month)
#master <- transform(master, balanceLog=log(master$balance))
#master <- transform(master, durationLog=log(master$duration))
master <- transform(master, campaignLog=log(master$campaign))
#master <- transform(master, pdaysLog=log(master$pdays))
#master <- transform(master, previousLog=log(master$previous))
#tester <- transform(tester, balanceLog=log(tester$balance))
#tester <- transform(tester, durationLog=log(tester$duration))
tester <- transform(tester, campaignLog=log(tester$campaign))
#tester <- transform(tester, pdaysLog=log(tester$pdays))
#tester <- transform(tester, previousLog=log(tester$previous))
# Fixed seed for reproducible tree growth.
set.seed(1500)
model <- randomForest(y~., data=master, ntree=500, proximity=FALSE)
# Class probabilities for the test set.
pred.model <- predict(model, newdata=tester, type='prob')
pred.model
# NOTE(review): pred.model is a two-column probability matrix here, and no
# column 73 is created for tester in this script -- this table() call looks
# stale (copied from another script); verify before relying on it.
table(tester[,73], round(pred.model))
# Save the positive-class probability column.
write.csv(pred.model[,2], "D:/tmp/r/contest001/predict_rf.csv")
# Decision-tree analysis
# https://www.macromill.com/service/data_analysis/d007.html
# Chi-squared statistic: method = CHAID
# Gini coefficient: method = CART
# Average information (entropy): method = C5.0
library(mvpart)
library(rpart.plot)
library(partykit)
library(rattle)
# Load / shape the data
# NOTE(review): the second read.delim immediately overwrites the first --
# presumably only one data set was meant to be active at a time.
master <- read.delim("D:/tmp/r/car296/car296dtree.tsv", sep="\t", header=T, stringsAsFactors=FALSE)
master <- read.delim("D:/tmp/r/car284/0418/logit_rakuraku.tsv", sep="\t", header=T, stringsAsFactors=FALSE)
users <- subset(master, use == 1)
# NOTE(review): `usecv` is never used below; dead assignment?
usecv <- subset(users, cv == 1)
# Decision-tree analysis
# NOTE(review): dct is reassigned four times; only the last fit
# (cv ~ age + sex + child + marridge + income) is printed and plotted.
# "marridge" appears to be the column's actual spelling in the data --
# do not "fix" it in the formulas without checking the file header.
dct <- rpart(use ~ age + sex + child, data = master, control=rpart.control(minsplit=5, cp=0.002), parms=list(split="gini"))
dct <- rpart(use ~ age + sex + child, data = master, method = "class", cp=0.001)
dct <- rpart(cv ~ age + sex + child + marridge + income +
job_public + job_president +
job_salaryman_1 + job_salaryman_2 + job_salaryman_3 +
job_independent + job_freelancer + job_houseman +
job_parttimer + job_student + job_neet + job_other,
data = users, control=rpart.control(minsplit=5, cp=0.02), parms=list(split="gini"))
dct <- rpart(cv ~ age + sex + child + marridge + income, data = users, control=rpart.control(minsplit=5, cp=0.013), parms=list(split="gini"))
dct
printcp(dct)
plotcp(dct)
# Plot 1: base rpart plot + labels
plot(dct, uniform = T, margin = 0.05)
text(dct, uniform = T, use.n = T, all = T, cex = 1.0, pretty = 0)
# Plot 2: rpart.plot
library(rpart.plot)
rpart.plot(dct, type = 1, uniform = TRUE, extra = 1, under = 1, faclen = 0)
# Plot 3: partykit
library(partykit)
# NOTE(review): partykit plotting normally takes as.party(dct); passing the
# rpart object directly may ignore gp/tp_args/ip_args -- verify.
plot(dct, gp = gpar(fontfamily = "Osaka"), tp_args = list(id=F), ip_args = list(id=F))
# http://d.hatena.ne.jp/isseing333/20110825/1314253002
# https://www.rdocumentation.org/packages/partykit/versions/1.1-1/topics/party-plot
# Plot 4: rattle
library(rattle)
fancyRpartPlot(dct)
# Plot 5: prp with detailed display options
library(rpart.plot)
prp(dct, type=2, extra=101, nn=T, fallen.leaves=T, faclen=0, varlen=0, shadow.col="white", branch.lty=2, cex = 0.8, split.cex=1.0, under.cex = 1.0)
# Lasso / logistic case study: load the data and build a train/test split.
require(glmnet)

dat <- read.delim("D:/tmp/r/lasso.tsv", sep = "\t", header = TRUE,
                  row.names = 1, stringsAsFactors = FALSE)
# Keep only rows with at least one non-zero attribute in columns 36:71.
dat <- subset(dat, rowSums(dat[, 36:71]) > 0)

# Reproducible split: 20% of each class held out for testing.
set.seed(1000)
positives <- subset(dat, use == 1)
negatives <- subset(dat, use == 0)
pos <- sample(nrow(positives), nrow(positives) * 0.2)
neg <- sample(nrow(negatives), nrow(negatives) * 0.2)
tester <- rbind(positives[pos, ], negatives[neg, ])
master <- rbind(positives[-pos, ], negatives[-neg, ])
# Downsampling: trim the remaining negatives to the remaining-positive count.
down <- sample(nrow(negatives[-neg, ]), nrow(positives) - length(pos))
master <- rbind(positives[-pos, ], negatives[-neg, ][down, ])

##### x: explanatory variables / y: target variable
mx <- as.matrix(master[, 1:71])
my <- master$use
tx <- as.matrix(tester[, 1:71])
ty <- tester$use
##### Logistic regression (stepwise variable selection)
logit_full <- glm(use ~ ., data = master[, 2:72],
                  family = binomial(link = "logit"))
logit_step <- step(logit_full)
pred_prob <- predict(logit_step, newdata = as.data.frame(tx), type = "response")
# Classify by shifting so the median predicted probability is the cut point.
cutoff <- median(pred_prob)
table(ty, floor(pred_prob + (1.0 - cutoff)))

##### Lasso regression (binomial L1 path)
lasso.net <- glmnet(mx, my, family = "binomial", alpha = 1)
plot(lasso.net)
plot(lasso.net, xvar = "lambda")

##### Cross-validation to choose lambda
lasso.cv <- cv.glmnet(mx, my, family = "binomial", alpha = 1)
plot(lasso.cv)

##### Confusion matrices on the test data
# Hard class labels at lambda.min.
pred_cls <- predict(lasso.cv, newx = tx, type = "class", s = "lambda.min")
table(ty, pred_cls)
# Probabilities at lambda.min, thresholded at their median.
pred_prob <- predict(lasso.cv, newx = tx, type = "response", s = "lambda.min")
cutoff <- median(pred_prob)
table(ty, floor(pred_prob + (1.0 - cutoff)))

##### Partial regression coefficients
coef(lasso.cv, s = "lambda.min")
#barplot(drop(coef(lasso.cv, s = "lambda.min")))
# Naive Bayes case study: build stratified train/test sets per answer class.
require(e1071)
require(klaR)
dat <- read.delim("D:/tmp/r/naivebayes_train.tsv", sep="\t", header=T, row.names = 1, stringsAsFactors=TRUE)
##### Remove rows without attributes
#dat <- subset(dat, rowSums(dat[,36:71]) > 0)
##### Data sampling (fixed seed for reproducibility)
set.seed(1000)
#ha <- sample(nrow(subset(dat, answer == 'A')), nrow(subset(dat, answer == 'A')) * 0.5)
#ga <- subset(dat, answer == 'A')[-ha,]
# Per-class subsets.  FIX: the 'C' subset was previously bound to `gc`,
# which masks base::gc(); renamed grp_a/grp_b/grp_c to avoid the collision.
grp_a <- subset(dat, answer == 'A')
grp_b <- subset(dat, answer == 'B')
grp_c <- subset(dat, answer == 'C')
# Hold out 20% of each class as the test set.
an <- sample(nrow(grp_a), nrow(grp_a) * 0.2)
bn <- sample(nrow(grp_b), nrow(grp_b) * 0.2)
cn <- sample(nrow(grp_c), nrow(grp_c) * 0.2)
tester <- rbind(grp_a[an,], grp_b[bn,], grp_c[cn,])
master <- rbind(grp_a[setdiff(1:nrow(grp_a), an),], grp_b[setdiff(1:nrow(grp_b), bn),], grp_c[setdiff(1:nrow(grp_c), cn),])
# Undersampling: trim each training class to the size of the smallest one.
coms <- min(nrow(subset(master, answer == "A")),
nrow(subset(master, answer == "B")),
nrow(subset(master, answer == "C")))
# NOTE: an/bn/cn are deliberately reused here for the undersampling draw.
an <- sample(nrow(subset(master, answer == "A")), coms)
bn <- sample(nrow(subset(master, answer == "B")), coms)
cn <- sample(nrow(subset(master, answer == "C")), coms)
master <- rbind(subset(master, answer == "A")[an,],
subset(master, answer == "B")[bn,],
subset(master, answer == "C")[cn,])
# Drop the first column (first-time flag) from both sets.
master <- master[,-1]
tester <- tester[,-1]
###### Train the naive Bayes classifier (Laplace smoothing = 1)
model.nv <- naiveBayes(answer~., master, laplace = 1)
#model.nv <- NaiveBayes(answer~., master, fL = 1)
###### Predict the test data with the trained classifier
pred.nv <- predict(model.nv, tester)
###### Confusion matrix
# NOTE(review): only one of the next two lines can work per model type --
# the $class form belongs to the commented-out klaR::NaiveBayes variant;
# with e1071::naiveBayes the plain pred.nv is the label vector. Verify
# which model was active when this was last run.
table(tester$answer, pred.nv)
table(tester$answer, pred.nv$class)
###### Long format: export the per-feature posterior probability tables
require(reshape2)
# Seed ptab with the first feature's melted table, then append the rest.
ptab <- melt(model.nv$tables[1])
colnames(ptab)[2] <- 'SEQ'
# rbind-in-a-loop grows the frame quadratically; acceptable at this scale.
for(i in 2:length(model.nv$tables)) {
t <- melt(model.nv$tables[i])
colnames(t)[2] <- colnames(ptab)[2]
ptab <- rbind(ptab, t)
}
# Order the features by their column order in the test data.
ptab$L1 <- factor(ptab$L1, levels = colnames(tester)[1:71])
s <- order(ptab$L1, ptab$Y, ptab$SEQ)
ptab[s,]
###### Wide format: one column per feature
require(MASS)
require(reshape2)
# Drop the value column (4th) of the first melted table to get the keys.
ptab <- melt(model.nv$tables[1])[,-4]
# NOTE(review): colnames(ptab)[2] on the right-hand side is evaluated
# before the assignment, so the third column keeps the OLD second name --
# confirm this header layout is intentional.
colnames(ptab) <- c('KBN', 'SEQ', colnames(ptab)[2])
# Bind each remaining feature's value column alongside.
for(i in 2:length(model.nv$tables)) {
t <- melt(model.nv$tables[i])
ptab <- cbind(ptab, t[,3])
}
# 71 feature columns land in positions 3:73.
colnames(ptab)[3:73] <- colnames(tester)[1:71]
s <- order(ptab$KBN, ptab$SEQ)
write.matrix(ptab[s,], "D:/tmp/r/car284/0308_0403/ptables.csv", sep = ',')
###### Save the train/test data used for this run
require(MASS)
write.matrix(tester, "D:/tmp/r/car284/0308_0403/testers_nv.tsv", sep = "\t" )
write.matrix(master, "D:/tmp/r/car284/0308_0403/masters_nv.tsv", sep = "\t" )
# Random forest on the same stratified train/test split.
require(randomForest)
model.rf <- randomForest(answer~., data=master, ntree=500)
pred.rf <- predict(model.rf, newdata=tester, type='class')
table(tester$answer, pred.rf)
# Low-memory tuning (for 32-bit Windows)
# http://www.slideshare.net/fqz7c3/32bit-windowsrandomforest-39335198
model.rf <- randomForest(answer~., data=master, ntree=100)
plot(model.rf) # if the OOB error has not flattened, more trees should help
str(model.rf$forest)
model.rf$forest$nodestatus
# Count terminal nodes per tree (nodestatus == -1 marks a terminal node).
# FIX: was sum(x[x==-1])*-1, i.e. summing the -1 entries and negating;
# counting the matches directly is equivalent and clearer.
tnodescount <- apply(model.rf$forest$nodestatus, 2, function(x) sum(x == -1))
tnodescount
hist(tnodescount) # histogram to inspect the distribution
summary(tnodescount) # look at the maximum
# Cap node count slightly above the observed maximum to bound memory use.
model.rf <- randomForest(answer~., data=master, ntree=1000, maxnodes=4600)
# Kernel SVM (RBF kernel) with 10-fold cross-validation.
require(kernlab)
svm_fit <- ksvm(
  answer ~ .,
  data = master,
  kernel = "rbfdot",
  kpar = list(sigma = 0.01),
  C = 5,
  cross = 10
)
# Score the held-out test set and show the confusion matrix.
svm_pred <- predict(svm_fit, newdata = tester, type = 'response')
svm_pred
table(tester$answer, svm_pred)
# Network graph: load vertex sizes, edges, and labels, then draw interactively.
require(igraph)
vertex_sizes <- read.delim("D:/tmp/r/network_graph_vertex.tsv", sep = "\t", header = FALSE, row.names = 1)
edge_list <- read.delim("D:/tmp/r/network_graph_edges.tsv", sep = "\t", header = FALSE)
# Divisors that scale raw counts down to drawable vertex/edge sizes.
v_scale <- 10000
e_scale <- 10000
# Node labels, keyed by vertex name.
node_labels <- read.delim("D:/tmp/r/network_graph_labels.tsv", sep = "\t", header = FALSE, row.names = 1)
# Build the undirected graph and render it in the Tk plot window.
net <- graph.data.frame(edge_list, directed = FALSE)
tkplot(
  net,
  edge.width = E(net)$V3 / e_scale,
  vertex.size = vertex_sizes[V(net)$name, ] / v_scale,
  vertex.label = node_labels[V(net)$name, ],
  vertex.color = "#AACCFF88",
  layout = layout.fruchterman.reingold
)
# Random forest on the logit_rakuraku data with down-sampled negatives.
require(rpart)
require(randomForest)
require(RColorBrewer)
dat <- read.delim("D:/tmp/r/car284/0308_0403/logit_rakuraku.tsv", sep="\t", header=T, row.names=1, stringsAsFactors=FALSE)
# Fixed seed for a reproducible split and forest.
set.seed(1500)
# Hold out 20% of each class (use == 1 / use == 0) as the test set.
pos <- sample(nrow(subset(dat, use == 1)), nrow(subset(dat, use == 1)) * 0.2)
neg <- sample(nrow(subset(dat, use == 0)), nrow(subset(dat, use == 0)) * 0.2)
tester <- rbind(subset(dat, use == 1)[pos,], subset(dat, use == 0)[neg,])
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,])
# Down-sample the remaining negatives to the remaining-positive count
# (overwrites the full `master` built just above).
down <- sample(nrow(subset(dat, use == 0)[-neg,]), nrow(subset(dat, use == 1)) - length(pos))
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,][down,])
# NOTE(review): `use` is never converted to a factor here (and the read
# uses stringsAsFactors=FALSE), so if it is numeric randomForest fits a
# REGRESSION forest; type='class' then yields numeric predictions, hence
# the round() below. Convert `use` to a factor if classification was the
# intent -- verify against the data file.
model <- randomForest(
use ~ .,
data=master[,2:73],
ntree=500,
proximity=TRUE
)
pred.model <- predict(
model,
newdata=tester[,2:73],
type='class'
)
# Confusion matrix: column 73 is presumably `use` in this layout -- confirm.
table(tester[,73], round(pred.model))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment