Last active
November 28, 2017 23:27
-
-
Save peristrophe/9e1f944f36c8f8638d715cf0a018e3c2 to your computer and use it in GitHub Desktop.
R script casestudies.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(glmnet) | |
source("D:/tmp/r/lib/my_functions.r") | |
master <- read.csv("D:/tmp/r/contest001/train.csv", header=T, row.names = 1, stringsAsFactors=TRUE) | |
tester <- read.csv("D:/tmp/r/contest001/test.csv", header=T, row.names = 1, stringsAsFactors=TRUE) | |
require(MASS) | |
dat <- subset(dat, rowSums(dat[,36:71]) > 0) | |
#master <- without_empty_im(master) | |
#tester <- without_empty_im(tester) | |
set.seed(1000) | |
##### データ分割(学習データ/テストデータ) | |
#pn <- nrow(subset(dat, use == 1)) | |
#nn <- nrow(subset(dat, use == 0)) | |
#tester <- rbind(subset(dat, use == 1)[1:as.integer(pn * 0.2),], subset(dat, use == 0)[1:as.integer(nn * 0.2),]) | |
#master <- rbind(subset(dat, use == 1)[(as.integer(pn * 0.2) + 1):pn,], subset(dat, use == 0)[(as.integer(nn * 0.2) + 1):nn,]) | |
pos <- sample(nrow(subset(dat, use == 1)), nrow(subset(dat, use == 1)) * 0.2) | |
neg <- sample(nrow(subset(dat, use == 0)), nrow(subset(dat, use == 0)) * 0.2) | |
tester <- rbind(subset(dat, use == 1)[pos,], subset(dat, use == 0)[neg,]) | |
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,]) | |
# アンダーサンプリング | |
#master <- rbind(subset(dat, use == 1)[(as.integer(pn * 0.2) + 1):pn,], subset(dat, use == 0)[(as.integer(nn * 0.2) + 1):(as.integer(nn * 0.2) + 1 + pn),]) | |
down <- sample(nrow(subset(dat, use == 0)[-neg,]), nrow(subset(dat, use == 1)) - length(pos)) | |
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,][down,]) | |
# アンダーサンプリング:全量 | |
#down <- sample(nrow(subset(dat, use == 0)), nrow(subset(dat, use == 1))) | |
#master <- master[-down,] | |
##### x: 説明変数群/y: 目的変数 | |
#mx <- as.matrix(master[,2:131][,-47:-94]) | |
mx <- as.matrix(master[,1:16]) | |
my <- master$y | |
tx <- as.matrix(tester) | |
#ty <- tester$use | |
##### ロジスティック回帰(ステップワイズ) | |
m1 <- glm(use~., data = master[,2:72], family=binomial(link="logit")) | |
m2 <- step(m1) | |
prd <- predict(m2, newdata = as.data.frame(tx), type = "response") | |
border = median(prd) | |
table(ty, floor(prd + (1.0 - border))) | |
##### ラッソ回帰 | |
lasso.net <- glmnet(mx, my, alpha = 1) | |
plot(lasso.net) | |
plot(lasso.net, xvar = "lambda") | |
##### クロス・バリデーション | |
lasso.cv <- cv.glmnet(mx, my, family = "binomial", alpha = 1) | |
plot(lasso.cv) | |
#split.screen(c(2,1)) | |
#screen(1); plot(lasso.net) | |
#screen(2); plot(lasso.cv) | |
#close.screen(all=T) | |
##### テストデータによる混合行列 | |
# type = class | |
prd <- predict(lasso.cv, newx = tx, type = "class", s="lambda.min") | |
table(ty, prd) | |
# type = response | |
prd <- predict(lasso.cv, newx = tx, type = "response", s="lambda.min") | |
border = median(prd) | |
table(ty, floor(prd + (1.0 - border))) | |
##### 偏回帰係数 | |
coef(lasso.cv, s="lambda.min") | |
barplot(drop(coef(lasso.cv, s="lambda.min"))) | |
# 決定木分析 | |
library(mvpart) | |
#library(rpart.plot) | |
#library(partykit) | |
#library(rattle) | |
# データ読み込み/整形 | |
#master <- read.delim("D:/tmp/r/car241/prm.tsv", sep="\t", header=T, stringsAsFactors=FALSE) | |
master <- read.csv("D:/tmp/r/contest001/train.csv", header=T, stringsAsFactors=TRUE) | |
tester <- read.csv("D:/tmp/r/contest001/test.csv", header=T, stringsAsFactors=TRUE) | |
master[,18] <- as.factor(master[,18]) | |
# 決定木分析 | |
dct <- rpart(y~., data = master, control=rpart.control(minsplit=5, cp=0.037), parms=list(split="gini")) | |
dct | |
printcp(dct) | |
plotcp(dct) | |
# プロット3 | |
library(partykit) | |
plot(as.party(dct)) | |
prd <- predict(dct, tester, type="prob") | |
write.csv(prd, "D:/tmp/r/contest001/predict.csv") | |
require(rpart) | |
require(randomForest) | |
require(RColorBrewer) | |
master <- read.csv("D:/tmp/r/contest001/master.csv", header=T, row.names=1, stringsAsFactors=TRUE) | |
tester <- read.csv("D:/tmp/r/contest001/tester.csv", header=T, row.names=1, stringsAsFactors=TRUE) | |
master[, 'y'] <- as.factor(master[, 'y']) | |
#tester[, 'y'] <- as.factor(tester[, 'y']) | |
master <- transform(master, mon=master$month) | |
#master <- transform(master, balanceLog=log(master$balance)) | |
#master <- transform(master, durationLog=log(master$duration)) | |
master <- transform(master, campaignLog=log(master$campaign)) | |
#master <- transform(master, pdaysLog=log(master$pdays)) | |
#master <- transform(master, previousLog=log(master$previous)) | |
#tester <- transform(tester, balanceLog=log(tester$balance)) | |
#tester <- transform(tester, durationLog=log(tester$duration)) | |
tester <- transform(tester, campaignLog=log(tester$campaign)) | |
#tester <- transform(tester, pdaysLog=log(tester$pdays)) | |
#tester <- transform(tester, previousLog=log(tester$previous)) | |
set.seed(1500) | |
model <- randomForest(y~., data=master, ntree=500, proximity=FALSE) | |
pred.model <- predict(model, newdata=tester, type='prob') | |
pred.model | |
table(tester[,73], round(pred.model)) | |
write.csv(pred.model[,2], "D:/tmp/r/contest001/predict_rf.csv") | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 決定木分析 | |
# https://www.macromill.com/service/data_analysis/d007.html | |
# カイ二乗統計量: 手法=CHAID | |
# ジニ係数: 手法=CART | |
# 平均情報量(エントロピー): 手法=C5.0 | |
library(mvpart) | |
library(rpart.plot) | |
library(partykit) | |
library(rattle) | |
# データ読み込み/整形 | |
master <- read.delim("D:/tmp/r/car296/car296dtree.tsv", sep="\t", header=T, stringsAsFactors=FALSE) | |
master <- read.delim("D:/tmp/r/car284/0418/logit_rakuraku.tsv", sep="\t", header=T, stringsAsFactors=FALSE) | |
users <- subset(master, use == 1) | |
usecv <- subset(users, cv == 1) | |
# 決定木分析 | |
dct <- rpart(use ~ age + sex + child, data = master, control=rpart.control(minsplit=5, cp=0.002), parms=list(split="gini")) | |
dct <- rpart(use ~ age + sex + child, data = master, method = "class", cp=0.001) | |
dct <- rpart(cv ~ age + sex + child + marridge + income + | |
job_public + job_president + | |
job_salaryman_1 + job_salaryman_2 + job_salaryman_3 + | |
job_independent + job_freelancer + job_houseman + | |
job_parttimer + job_student + job_neet + job_other, | |
data = users, control=rpart.control(minsplit=5, cp=0.02), parms=list(split="gini")) | |
dct <- rpart(cv ~ age + sex + child + marridge + income, data = users, control=rpart.control(minsplit=5, cp=0.013), parms=list(split="gini")) | |
dct | |
printcp(dct) | |
plotcp(dct) | |
# プロット1 | |
plot(dct, uniform = T, margin = 0.05) | |
text(dct, uniform = T, use.n = T, all = T, cex = 1.0, pretty = 0) | |
# プロット2 | |
library(rpart.plot) | |
rpart.plot(dct, type = 1, uniform = TRUE, extra = 1, under = 1, faclen = 0) | |
# プロット3 | |
library(partykit) | |
plot(dct, gp = gpar(fontfamily = "Osaka"), tp_args = list(id=F), ip_args = list(id=F)) | |
# http://d.hatena.ne.jp/isseing333/20110825/1314253002 | |
# https://www.rdocumentation.org/packages/partykit/versions/1.1-1/topics/party-plot | |
# プロット4 | |
library(rattle) | |
fancyRpartPlot(dct) | |
# プロット5 | |
library(rpart.plot) | |
prp(dct, type=2, extra=101, nn=T, fallen.leaves=T, faclen=0, varlen=0, shadow.col="white", branch.lty=2, cex = 0.8, split.cex=1.0, under.cex = 1.0) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(glmnet) | |
dat <- read.delim("D:/tmp/r/lasso.tsv", sep="\t", header=T, row.names = 1, stringsAsFactors=FALSE) | |
dat <- subset(dat, rowSums(dat[,36:71]) > 0) | |
set.seed(1000) | |
pos <- sample(nrow(subset(dat, use == 1)), nrow(subset(dat, use == 1)) * 0.2) | |
neg <- sample(nrow(subset(dat, use == 0)), nrow(subset(dat, use == 0)) * 0.2) | |
tester <- rbind(subset(dat, use == 1)[pos,], subset(dat, use == 0)[neg,]) | |
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,]) | |
# ダウンサンプリング | |
down <- sample(nrow(subset(dat, use == 0)[-neg,]), nrow(subset(dat, use == 1)) - length(pos)) | |
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,][down,]) | |
##### x: 説明変数群/y: 目的変数 | |
mx <- as.matrix(master[,1:71]) | |
my <- master$use | |
tx <- as.matrix(tester[,1:71]) | |
ty <- tester$use | |
##### ロジスティック回帰(ステップワイズ) | |
m1 <- glm(use~., data = master[,2:72], family=binomial(link="logit")) | |
m2 <- step(m1) | |
prd <- predict(m2, newdata = as.data.frame(tx), type = "response") | |
border = median(prd) | |
table(ty, floor(prd + (1.0 - border))) | |
##### ラッソ回帰 | |
lasso.net <- glmnet(mx, my, family = "binomial", alpha = 1) | |
plot(lasso.net) | |
plot(lasso.net, xvar = "lambda") | |
##### クロス・バリデーション | |
lasso.cv <- cv.glmnet(mx, my, family = "binomial", alpha = 1) | |
plot(lasso.cv) | |
##### テストデータによる混合行列 | |
# type = class | |
prd <- predict(lasso.cv, newx = tx, type = "class", s="lambda.min") | |
table(ty, prd) | |
# type = response | |
prd <- predict(lasso.cv, newx = tx, type = "response", s="lambda.min") | |
border = median(prd) | |
table(ty, floor(prd + (1.0 - border))) | |
##### 偏回帰係数 | |
coef(lasso.cv, s="lambda.min") | |
#barplot(drop(coef(lasso.cv, s=lambda.min))) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(e1071) | |
require(klaR) | |
dat <- read.delim("D:/tmp/r/naivebayes_train.tsv", sep="\t", header=T, row.names = 1, stringsAsFactors=TRUE) | |
##### 属性なしを除去 | |
#dat <- subset(dat, rowSums(dat[,36:71]) > 0) | |
##### データサンプリング | |
set.seed(1000) | |
#ha <- sample(nrow(subset(dat, answer == 'A')), nrow(subset(dat, answer == 'A')) * 0.5) | |
#ga <- subset(dat, answer == 'A')[-ha,] | |
ga <- subset(dat, answer == 'A') | |
gb <- subset(dat, answer == 'B') | |
gc <- subset(dat, answer == 'C') | |
an <- sample(nrow(ga), nrow(ga) * 0.2) | |
bn <- sample(nrow(gb), nrow(gb) * 0.2) | |
cn <- sample(nrow(gc), nrow(gc) * 0.2) | |
tester <- rbind(ga[an,], gb[bn,], gc[cn,]) | |
master <- rbind(ga[setdiff(1:nrow(ga), an),], gb[setdiff(1:nrow(gb), bn),], gc[setdiff(1:nrow(gc), cn),]) | |
# アンダーサンプリング | |
coms <- min(nrow(subset(master, answer == "A")), | |
nrow(subset(master, answer == "B")), | |
nrow(subset(master, answer == "C"))) | |
an <- sample(nrow(subset(master, answer == "A")), coms) | |
bn <- sample(nrow(subset(master, answer == "B")), coms) | |
cn <- sample(nrow(subset(master, answer == "C")), coms) | |
master <- rbind(subset(master, answer == "A")[an,], | |
subset(master, answer == "B")[bn,], | |
subset(master, answer == "C")[cn,]) | |
# 初回フラグ除外 | |
master <- master[,-1] | |
tester <- tester[,-1] | |
###### 学習 | |
model.nv <- naiveBayes(answer~., master, laplace = 1) | |
#model.nv <- NaiveBayes(answer~., master, fL = 1) | |
###### 学習した分類器でテストデータを予測 | |
pred.nv <- predict(model.nv, tester) | |
###### 混同行列 (Confusion Matrix) | |
table(tester$answer, pred.nv) | |
table(tester$answer, pred.nv$class) | |
###### 縦展開:事後確率表出力 | |
require(reshape2) | |
ptab <- melt(model.nv$tables[1]) | |
colnames(ptab)[2] <- 'SEQ' | |
for(i in 2:length(model.nv$tables)) { | |
t <- melt(model.nv$tables[i]) | |
colnames(t)[2] <- colnames(ptab)[2] | |
ptab <- rbind(ptab, t) | |
} | |
ptab$L1 <- factor(ptab$L1, levels = colnames(tester)[1:71]) | |
s <- order(ptab$L1, ptab$Y, ptab$SEQ) | |
ptab[s,] | |
###### 横展開 | |
require(MASS) | |
require(reshape2) | |
ptab <- melt(model.nv$tables[1])[,-4] | |
colnames(ptab) <- c('KBN', 'SEQ', colnames(ptab)[2]) | |
for(i in 2:length(model.nv$tables)) { | |
t <- melt(model.nv$tables[i]) | |
ptab <- cbind(ptab, t[,3]) | |
} | |
colnames(ptab)[3:73] <- colnames(tester)[1:71] | |
s <- order(ptab$KBN, ptab$SEQ) | |
write.matrix(ptab[s,], "D:/tmp/r/car284/0308_0403/ptables.csv", sep = ',') | |
###### データ保存 | |
require(MASS) | |
write.matrix(tester, "D:/tmp/r/car284/0308_0403/testers_nv.tsv", sep = "\t" ) | |
write.matrix(master, "D:/tmp/r/car284/0308_0403/masters_nv.tsv", sep = "\t" ) | |
require(randomForest) | |
model.rf <- randomForest(answer~., data=master, ntree=500) | |
pred.rf <- predict(model.rf, newdata=tester, type='class') | |
table(tester$answer, pred.rf) | |
# 省メモリチューニング | |
# http://www.slideshare.net/fqz7c3/32bit-windowsrandomforest-39335198 | |
model.rf <- randomForest(answer~., data=master, ntree=100) | |
plot(model.rf) # OOBエラーが下がりきっていなければntreeを増やすことによる改善が期待できる | |
str(model.rf$forest) | |
model.rf$forest$nodestatus | |
# 終端ノード数を数える(nodestatusの-1の数を数える) | |
tnodescount <- apply(model.rf$forest$nodestatus, 2, function(x){sum(x[x==-1])*-1}) | |
tnodescount | |
hist(tnodescount) # 分布を見たいときにヒストグラム | |
summary(tnodescount) # 最大値(Max)を見る | |
# 最大より少し大きな値をmaxnodesに指定 | |
model.rf <- randomForest(answer~., data=master, ntree=1000, maxnodes=4600) | |
require(kernlab) | |
model.svm <- ksvm(answer~., data = master, kernel = "rbfdot", kpar = list(sigma = 0.01), C = 5, cross = 10) | |
pred.svm <- predict(model.svm, newdata = tester, type = 'response') | |
pred.svm | |
table(tester$answer, pred.svm) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(igraph) | |
vs <- read.delim("D:/tmp/r/network_graph_vertex.tsv", sep="\t", header=F, row.names = 1) | |
es <- read.delim("D:/tmp/r/network_graph_edges.tsv", sep="\t", header=F) | |
vsize <- 10000 | |
esize <- 10000 | |
# ノードラベル | |
vn <- read.delim("D:/tmp/r/network_graph_labels.tsv", sep="\t", header=F, row.names = 1) | |
# 描画 | |
ng <- graph.data.frame(es, directed = F) | |
tkplot(ng, | |
edge.width=E(ng)$V3/esize, | |
vertex.size=vs[V(ng)$name,]/vsize, | |
vertex.label=vn[V(ng)$name,], | |
vertex.color = "#AACCFF88", | |
layout=layout.fruchterman.reingold | |
) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(rpart) | |
require(randomForest) | |
require(RColorBrewer) | |
dat <- read.delim("D:/tmp/r/car284/0308_0403/logit_rakuraku.tsv", sep="\t", header=T, row.names=1, stringsAsFactors=FALSE) | |
set.seed(1500) | |
pos <- sample(nrow(subset(dat, use == 1)), nrow(subset(dat, use == 1)) * 0.2) | |
neg <- sample(nrow(subset(dat, use == 0)), nrow(subset(dat, use == 0)) * 0.2) | |
tester <- rbind(subset(dat, use == 1)[pos,], subset(dat, use == 0)[neg,]) | |
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,]) | |
down <- sample(nrow(subset(dat, use == 0)[-neg,]), nrow(subset(dat, use == 1)) - length(pos)) | |
master <- rbind(subset(dat, use == 1)[-pos,], subset(dat, use == 0)[-neg,][down,]) | |
model <- randomForest( | |
use ~ ., | |
data=master[,2:73], | |
ntree=500, | |
proximity=TRUE | |
) | |
pred.model <- predict( | |
model, | |
newdata=tester[,2:73], | |
type='class' | |
) | |
table(tester[,73], round(pred.model)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment