Skip to content

Instantly share code, notes, and snippets.

@puyokw
Last active August 29, 2015 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save puyokw/04b8ab53bc842f96a18c to your computer and use it in GitHub Desktop.
Save puyokw/04b8ab53bc842f96a18c to your computer and use it in GitHub Desktop.
データサイエンス・カップ 2015 春
# Missing-value imputation
# Purpose of this script: fit a temperature model on the training table, use it to
# fill in the temperature column of the test table, then fit an attendance model
# and write the attendance predictions to test.csv.
# Load the training data (reads output.csv)
# NOTE(review): the C# converter in this gist writes output.txt / answer.txt;
# presumably those files were renamed to .csv before running this script - confirm.
data1<-read.csv("output.csv")
# Drop ID, humidity (湿度), attendance (観客数) and round (節)
# ID is dropped because the IDs of the rows to be predicted are far away from the training IDs
# Humidity and attendance are missing
# Round carries almost the same information as month
temp<-data1[,c(-1,-2,-5,-16)]
# Same selection without the temperature column (15): predictors only
temp.x<-data1[,c(-1,-2,-5,-15,-16)]
# Temperature (column 15) is the response
temp.y<-data1[,15]
# predict with gbm
library(gbm)
# Variable importance for predicting temperature (気温)
# Stochastic gradient boosting
set.seed(136)
gbm.fit<- gbm(気温~.,data=temp,distribution="gaussian",
var.monotone=c(0,0,0,0,0,0,0,0,0,0,0), n.trees=50000, interaction.depth=3,
cv.folds=10,shrinkage=0.005, bag.fraction=0.5)
# Find the optimal number of trees
best.iter<-gbm.perf(gbm.fit, method="cv")
PRE<-predict(gbm.fit,data1,best.iter)
MSR<-sum((temp$気温-PRE)^2/nrow(temp))
# The RMSE is printed for reference, but this fit was run to obtain the variable importance.
print(sqrt(MSR)) # 1.60792
print(best.iter) # 45045
summary(gbm.fit) # variable importance for the temperature prediction
# Use the variables ranked 1st to 7th by importance,
# except stadium capacity (収容人数), which was dropped as redundant with the home team (ホーム)
# SVM predicts temperature better
library(e1071)
# epsilon and gamma were tuned to make the temperature RMSE as small as possible
# while keeping cost small
model<-svm(temp.x[,c(1,4,5,6,7,8)],temp.y,epsilon=0.065,cost=2,gamma=1.8)
PRE<-predict(model,temp.x[,c(1,4,5,6,7,8)])
MSR<-sum((temp$気温-PRE)^2/nrow(temp))
print(sqrt(MSR)) # 0.9978517
# Load the test data (reads answer.csv)
data2<-read.csv("answer.csv")
temp.pre<-predict(model,data2[,c(2,6,7,8,9,10)])
# The test data has no temperature column, so append the predicted one
data2$気温<-temp.pre
# Build the attendance (観客数) prediction model
# Stochastic gradient boosting
set.seed(136)
gbm.fit<- gbm(観客数~.,data=data1[,c(-1,-16)],distribution="gaussian",
var.monotone=c(0,0,0,0,0,0,0,0,0,0,0,0,0), n.trees=25000, interaction.depth=3,
cv.folds=10,shrinkage=0.005, bag.fraction=0.5)
# Find the optimal number of trees
best.iter<-gbm.perf(gbm.fit, method="cv")
PRE<-predict(gbm.fit,data1,best.iter)
MSR<-sum((data1$観客数-PRE)^2/nrow(data1))
print(sqrt(MSR)) # 2238.88
print(best.iter) # 13655
summary(gbm.fit) # variable importance for the attendance prediction
# Predict with the fitted model
PRE<-predict(gbm.fit,data2,best.iter)
ans<-data2
ans$観客数<-round(PRE)
# Keep only the first column and the rounded attendance prediction (column 15)
ans<-ans[,c(1,15)]
# Write the result to test.csv
write.table(ans,"test.csv",sep=",",col.names=FALSE,row.names=FALSE)
using System;
using System.IO;
using System.Collections.Generic;
/// <summary>
/// Small parsing helpers used to turn raw match-schedule fields into numeric values.
/// </summary>
class ConvertInput
{
    /// <summary>
    /// Maps a one-character Japanese day-of-week abbreviation to a number
    /// (Monday = 1 ... Sunday = 7).
    /// </summary>
    /// <param name="str">Day-of-week abbreviation such as "月" or "土".</param>
    /// <returns>1-7 for a recognised day, 0 for anything else.</returns>
    public int convertWeek(String str)
    {
        switch (str)
        {
            case "月": return 1;
            case "火": return 2;
            case "水": return 3;
            case "木": return 4;
            case "金": return 5;
            case "土": return 6;
            case "日": return 7;
            default: return 0;
        }
    }

    /// <summary>
    /// Extracts the digits of <paramref name="str"/> as ASCII digits and drops every
    /// other character.
    /// </summary>
    /// <remarks>
    /// Both ASCII digits ('0'-'9') and full-width digits (U+FF10-U+FF19) are accepted;
    /// full-width digits are normalised to their ASCII form. Previously only ASCII
    /// digits were kept, so a label written with full-width digits produced an empty
    /// string.
    /// </remarks>
    /// <param name="str">Text that may contain digits mixed with other characters.</param>
    /// <returns>The digits of <paramref name="str"/>, in order, as ASCII characters.</returns>
    public String convertStrangeNum(String str)
    {
        char[] digits = new char[str.Length];
        int count = 0;
        foreach (char c in str)
        {
            if (c >= '0' && c <= '9')
            {
                digits[count++] = c;
            }
            else if (c >= '\uFF10' && c <= '\uFF19')
            {
                // Full-width digit: shift it down to the matching ASCII digit.
                digits[count++] = (char)('0' + (c - '\uFF10'));
            }
        }
        return new string(digits, 0, count);
    }

    /// <summary>
    /// Converts a clock time laid out as two hour characters, one separator character
    /// and two minute characters (e.g. "19:30") into fractional hours.
    /// </summary>
    /// <param name="str">Time text; characters 0-1 are the hour, 3-4 the minutes.</param>
    /// <returns>Hour plus minutes / 60.</returns>
    public double convertTime(String str)
    {
        double hour = double.Parse(str.Substring(0, 2));
        double min = double.Parse(str.Substring(3, 2));
        return hour + min / 60;
    }

    /// <summary>
    /// Splits <paramref name="str"/> on '/' characters, keeping empty entries.
    /// </summary>
    /// <param name="str">Slash-separated list.</param>
    /// <returns>One element per segment; a string without '/' yields a single element.</returns>
    public String[] csvSep(string str)
    {
        return str.Split('/');
    }
};
/// <summary>
/// Converts the raw match CSV files into numeric feature tables.
/// Reads stadium.csv, condition.csv, train.csv and test.csv from the working directory
/// and writes output.txt (training features), answer.txt (test features) and
/// preWeather.txt (weather values looked up for the test rows).
/// </summary>
/// <remarks>
/// Every line of every input file is parsed as data, so the inputs must not contain a
/// header row. Team, stadium and TV-station numbers are assigned in first-seen order,
/// training rows first and test rows second.
/// </remarks>
class optDataLunch
{
    // First-seen positions of the TV station names that are ignored when deriving the
    // NHK / local-broadcast flags. Per the original author's note these positions are:
    // SKY PerfecTV (0), SKY PerfecTV Hikari (2,4,6,9,13), SKY PerfecTV! (34,35,81),
    // e2 (1,38,40). They depend on the order in which station names appear in the
    // training data.
    private static readonly HashSet<int> IgnoredTvIndices = new HashSet<int>
    {
        0,
        2, 4, 6, 9, 13,
        34, 35, 81,
        1, 38, 40
    };

    // Numbers are machine-readable CSV data, so they are parsed and formatted
    // independently of the current culture.
    private static readonly System.Globalization.CultureInfo Invariant =
        System.Globalization.CultureInfo.InvariantCulture;

    static void Main()
    {
        List<string> stadiums = new List<string>();    // stadium names
        List<int> capacities = new List<int>();        // stadium capacities, parallel to stadiums
        List<string> teams = new List<string>();       // team names in first-seen order
        List<string> tvStations = new List<string>();  // TV station names in first-seen order
        ConvertInput converter = new ConvertInput();

        // stadium.csv: column 0 = name, column 2 = capacity.
        using (StreamReader reader = new StreamReader("stadium.csv", System.Text.Encoding.GetEncoding("shift_jis")))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string[] cols = line.Split(',');
                stadiums.Add(cols[0]);
                capacities.Add(int.Parse(cols[2], Invariant));
            }
        }

        // condition.csv: column 0 = match ID, column 4 = temperature,
        // column 5 = humidity followed by one trailing character that is stripped.
        // The value stored per ID is the ready-to-write "temperature,humidity" text.
        Dictionary<string, string> weatherById = new Dictionary<string, string>();
        using (StreamReader reader = new StreamReader("condition.csv"))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string[] cols = line.Split(',');
                double temperature = double.Parse(cols[4], Invariant);
                double humidity = double.Parse(cols[5].Substring(0, cols[5].Length - 1), Invariant);
                // Keep the first record for an ID, as a front-to-back linear search would.
                if (!weatherById.ContainsKey(cols[0]))
                {
                    weatherById.Add(cols[0], temperature.ToString(Invariant) + "," + humidity.ToString(Invariant));
                }
            }
        }

        // Training rows: the attendance column sits at index 1, shifting every other field by one.
        using (StreamReader reader = new StreamReader("train.csv"))
        using (StreamWriter writer = new StreamWriter("output.txt"))
        {
            writer.WriteLine("ID,観客数,年度,リーグ,節,第何日,月,日,時間,ホーム,アウェイ,収容人数,NHK,地方放送,気温,湿度");
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string[] cols = line.Split(',');
                string row = ConvertRow(cols, 1, converter, stadiums, capacities, teams, tvStations);
                // A row without a weather record gets two empty columns instead of crashing.
                writer.WriteLine(row + "," + WeatherColumns(weatherById, cols[0]));
            }
        }

        for (int i = 0; i < stadiums.Count; ++i)
        {
            Console.WriteLine(stadiums[i] + " " + capacities[i]);
        }

        // Test rows: same layout as the training rows without the attendance column.
        using (StreamReader reader = new StreamReader("test.csv"))
        using (StreamWriter writer = new StreamWriter("answer.txt"))
        using (StreamWriter weatherWriter = new StreamWriter("preWeather.txt"))
        {
            writer.WriteLine("ID,年度,リーグ,節,第何日,月,日,時間,ホーム,アウェイ,収容人数,NHK,地方放送");
            weatherWriter.WriteLine("ID,天気,湿度");
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string[] cols = line.Split(',');
                writer.WriteLine(ConvertRow(cols, 0, converter, stadiums, capacities, teams, tvStations));
                weatherWriter.WriteLine(cols[0] + "," + WeatherColumns(weatherById, cols[0]));
            }
        }
    }

    // Returns the "temperature,humidity" text for a match ID, or "," (two empty columns) when unknown.
    private static string WeatherColumns(Dictionary<string, string> weatherById, string id)
    {
        string columns;
        return weatherById.TryGetValue(id, out columns) ? columns : ",";
    }

    // Returns the position of value in items, appending it first when it is not present yet.
    private static int IndexOrAdd(List<string> items, string value)
    {
        int index = items.IndexOf(value);
        if (index < 0)
        {
            index = items.Count;
            items.Add(value);
        }
        return index;
    }

    // Converts one raw schedule row into its comma-separated numeric feature columns.
    // offset is the number of extra leading columns after the ID (1 for training rows, 0 for test rows).
    private static string ConvertRow(string[] fields, int offset, ConvertInput converter,
        List<string> stadiums, List<int> capacities, List<string> teams, List<string> tvStations)
    {
        // League flag: "0" when the second character of the league label is "1", otherwise "1".
        string league = fields[offset + 2].Substring(1, 1) == "1" ? "0" : "1";

        // Match date: characters 0-1 are the month, characters 3-4 the day.
        string date = fields[offset + 4];
        int month = int.Parse(date.Substring(0, 2), Invariant);
        int day = int.Parse(date.Substring(3, 2), Invariant);
        double time = converter.convertTime(fields[offset + 5]);

        // A stadium that is not listed in stadium.csv is registered with capacity 0.
        string stadiumName = fields[offset + 8];
        int stadiumNo = stadiums.IndexOf(stadiumName);
        if (stadiumNo < 0)
        {
            stadiumNo = stadiums.Count;
            stadiums.Add(stadiumName);
            capacities.Add(0);
        }

        // The home team is registered before the away team so numbering stays stable.
        int homeTeamNo = IndexOrAdd(teams, fields[offset + 6]);
        int awayTeamNo = IndexOrAdd(teams, fields[offset + 7]);

        // Round label: the text between the first character and '節' is the round,
        // the text starting two characters after '節' is the match day within the round.
        // Without '節' the round stays empty and the whole label is used for the match day.
        string round = fields[offset + 3];
        string term = "";
        string gameNo = round;
        int marker = round.IndexOf('節');
        if (marker >= 0)
        {
            term = marker > 0 ? round.Substring(1, marker - 1) : "";
            int rest = marker + 2;
            gameNo = rest <= round.Length ? round.Substring(rest) : "";
        }
        term = converter.convertStrangeNum(term);
        gameNo = converter.convertStrangeNum(gameNo);

        // Broadcast flags: ignored stations set nothing, names starting with "NHK" set
        // the NHK flag, every other station counts as a local broadcast.
        bool nhk = false;
        bool local = false;
        foreach (string station in converter.csvSep(fields[offset + 9]))
        {
            int tvNo = IndexOrAdd(tvStations, station);
            if (IgnoredTvIndices.Contains(tvNo))
            {
                continue;
            }
            if (station.StartsWith("NHK", StringComparison.Ordinal))
            {
                nhk = true;
            }
            else
            {
                local = true;
            }
        }

        string[] converted =
        {
            // ID (plus attendance for training rows) and year are copied through unchanged.
            string.Join(",", fields, 0, offset + 2),
            league,
            term,
            gameNo,
            month.ToString(Invariant),
            day.ToString(Invariant),
            time.ToString(Invariant),
            homeTeamNo.ToString(Invariant),
            awayTeamNo.ToString(Invariant),
            capacities[stadiumNo].ToString(Invariant),
            nhk ? "1" : "0",
            local ? "1" : "0"
        };
        return string.Join(",", converted);
    }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment