Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#This project was done as a team with Tarun DeviReddy.
#Referred to Yibo script (https://www.kaggle.com/yibochen/talkingdata-mobile-user-demographics/xgboost-in-r-2-27217)
#Using Extreme Gradient Boosting. Here our aim is have device_id and the corresponding feature_name and rbind all such dataframes and convert it into a matrix to feed to the xgb function.
#Grouping location: Replacing the (0,0) with means corresponding to a device id and making factors from continuous latitude and longitudes. This part can be optimized and made shorter.
events = fread("~/Downloads/TalkingData/events.csv") %>% as.data.frame()
events_1 = subset(events, (events$longitude>1 | events$longitude< -1) | (events$latitude>1 | events$latitude< -1) )
splitmean_log <- function(df) {
s <- split( df, df$device_id)
sapply( s, function(x) mean(x$longitude) )
}
splitmean_lat <- function(df) {
s <- split( df, df$device_id)
sapply( s, function(x) mean(x$latitude) )
}
mean_long = as.data.frame(splitmean_log(events_1))
mean_long$device_id = rownames(mean_long)
rownames(mean_long) <- NULL
colnames(mean_long) = c("longitude", "device_id")
mean_lat = as.data.frame(splitmean_lat(events_1))
mean_lat$device_id = rownames(mean_lat)
rownames(mean_lat)<-NULL
colnames(mean_lat) = c("latitude","device_id")
location = data.frame(device_id=mean_lat$device_id, longitude = mean_long$longitude,
latitude = mean_lat$latitude)
rm(events,events_1, mean_lat, mean_long)
location$longitude_n = ifelse((location$longitude>75 & location$longitude<135 & location$latitude>15 & location$latitude<55), location$longitude,"Outside_China")
location$latitude_n = ifelse((location$longitude>75 & location$longitude<135 & location$latitude>15 & location$latitude<55), location$latitude,"Outside_China")
location_new = subset(location, location$longitude_n!="Outside_China")
location_new$longgrp <- cut(location_new$longitude,
breaks = c(-Inf, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, Inf),
labels = c("long1", "long2", "long3", "long4", "long5","long6", "long7", "long8", "long9", "long10", "long11", "long12"),
right = FALSE)
location_new$latgrp <- cut(location_new$latitude,
breaks = c(-Inf, 15, 17.5, 20, 22.5, 25, 27.5, 30, 32.5, 35, 37.5, 40, 42.5, 45, 47.5, 50, 52.5, Inf),
labels = c("lat1", "lat2", "lat3", "lat4", "lat5","lat6", "lat7", "lat8", "lat9", "lat10", "lat11", "lat12", "lat13", "lat14", "lat15","lat16","lat17"),
right = FALSE)
location_new$longitude<- location_new$latitude <- location_new$longitude_n <- location_new$latitude_n <- NULL
colnames(location_new) = c("device_id", "longitude", "latitude")
location_outside = subset(location, location$longitude_n=="Outside_China")
location_outside$longitude <- location_outside$latitude <- NULL
colnames(location_outside) = c("device_id", "longitude", "latitude")
location = rbind(location_outside, location_new)
rownames(location) <- NULL
location$device_id = as.character(location$device_id)
device_loc_long = data.frame(device_id = location$device_id, feature_name = location$longitude)
device_loc_lat = data.frame(device_id = location$device_id, feature_name = location$latitude)
rm(location_outside, location_new, location)
device_loc_lat$device_id = as.character(device_loc_lat$device_id)
device_loc_long$device_id = as.character(device_loc_long$device_id)
device_loc_lat$feature_name = as.character(device_loc_lat$feature_name)
device_loc_long$feature_name = as.character(device_loc_long$feature_name)
#make time_avg, time_mdn: Creating the average number of events per day and the most and least used duration in a day
events = fread("~/Downloads/TalkingData/events.csv") %>% as.data.frame()
events$date = as.Date(sapply(strsplit(events$timestamp, split = " "), head, 1))
events$time = as.numeric(substr(events$timestamp, 12,13))
events$Morning <- 0
events$Day <- 0
events$Night <- 0
events$Morning[events$time>4 & events$time<10] = 1
events$Day[events$time>=10 & events$time<21] = 1
events$Night = as.numeric(!(events$Morning | events$Day))
events$device_id = as.character(events$device_id)
tmp1 = as.data.frame(sapply(split(events$Morning,events$device_id), sum))
tmp1$device_id = as.character(rownames(tmp1))
rownames(tmp1) <- NULL
colnames(tmp1) = c("Morning", "device_id")
tmp2 = as.data.frame(sapply(split(events$Day,events$device_id), sum))
tmp2$device_id = as.character(rownames(tmp2))
rownames(tmp2) <- NULL
colnames(tmp2) = c("Day", "device_id")
tmp3 = as.data.frame(sapply(split(events$Night,events$device_id), sum))
tmp3$device_id = as.character(rownames(tmp3))
rownames(tmp3) <- NULL
colnames(tmp3) = c("Night", "device_id")
events$Morning<-events$Day<-events$Night<-NULL
time = merge(events, tmp1, by = "device_id", all.x=T)
time1 = merge(time, tmp2, by = "device_id", all.x=T)
time2 = merge(time1, tmp3, by = "device_id", all.x=T)
time = time2
rm(time1, time2)
rm(tmp1, tmp2, tmp3)
time$total = time$Morning+time$Day+time$Night
time$Morning = time$Morning/time$total
time$Day = time$Day/time$total
time$Night = time$Night/time$total
tmp = as.data.frame(sapply(split(events$date,events$device_id), function(x) length(unique(x))))
tmp$device_id = as.character(rownames(tmp))
tmp$noday = tmp$`sapply(split(events$date, events$device_id), function(x) length(unique(x)))`
tmp$`sapply(split(events$date, events$device_id), function(x) length(unique(x)))` <-NULL
rownames(tmp) <- NULL
time[,2:7]=NULL
time_final = unique(time)
time_final = merge(time_final, tmp, by = "device_id", all.x = T)
time_final$avgevt = (time_final$total / time_final$noday)
time_final$MDN = paste(colnames(time_final[,2:4])[apply(time_final[,2:4],1,which.max)],
colnames(time_final[,2:4])[apply(time_final[,2:4],1,which.min)],sep="-")
time_final$avg = ifelse(time_final$avgevt<2, "Low",
ifelse(time_final$avgevt<10, "Med", "High") )
time_mdn = time_final[,c(1,8)]
colnames(time_mdn) = c("device_id", "feature_name")
time_avg = time_final[,c(1,9)]
colnames(time_avg) = c("device_id", "feature_name")
rm(events, time, time_final, tmp)
#Creating the data for model and brand
label_train <- fread("~/Downloads/TalkingData/gender_age_train.csv",
colClasses=c("character","character",
"integer","character"))
label_test <- fread("~/Downloads/TalkingData/gender_age_test.csv",
colClasses=c("character"))
label_test$gender <- label_test$age <- label_test$group <- NA
label <- rbind(label_train,label_test)
setkey(label,device_id)
rm(label_test,label_train);gc()
brand = fread("~/Downloads/TalkingData/phone_brand_device_model.csv",
colClasses=c("character","character","character"))
setkey(brand,device_id)
brand2 = unique(brand)
label1 <- merge(label,brand2,by="device_id",all.x=T)
rm(brand, brand2);gc()
events = fread("~/Downloads/TalkingData/events.csv", colClasses=c("character","character","character",
"numeric","numeric"))
setkeyv(events,c("device_id","event_id"))
event_app <- fread("~/Downloads/TalkingData/app_events.csv",
colClasses=rep("character",4))
setkey(event_app,event_id)
#events <- unique(events[,list(device_id,event_id)],by=NULL)
#list of apps corresponding to each event
event_apps <- event_app[,list(apps=paste(unique(app_id),collapse=",")),by="event_id"]
device_event_apps <- merge(events,event_apps,by="event_id")
rm(events,event_app,event_apps);gc()
f_split_paste <- function(z){paste(unique(unlist(strsplit(z,","))),collapse=",")}
device_apps <- device_event_apps[,list(apps=f_split_paste(apps)),by="device_id"]
rm(device_event_apps,f_split_paste);gc()
tmp <- strsplit(device_apps$apps,",")
device_apps <- data.table(device_id=rep(device_apps$device_id,
times=sapply(tmp,length)),
app_id=unlist(tmp))
rm(tmp)
#Introducing all the app categories, no groupings done
#make device_cat
app_labels = fread("~/Downloads/TalkingData/app_labels.csv") %>% as.data.frame()
label_categories = fread("~/Downloads/TalkingData/label_categories.csv") %>% as.data.frame()
app_cat = merge(app_labels, label_categories, by="label_id", all.x= T)
uni_appcat = unique(app_cat[,c(1,3)])
uni_appcat$category = as.character(uni_appcat$category)
uni_appcat$category = tolower(uni_appcat$category)
#uni_appcat$cat = ifelse(grepl("gam|war|rac|mmo|dota|play|ball|chess|fight|tennis|billard|puzz|poker|sport|shoot|rpg", uni_appcat$category), "game",uni_appcat$category)
app = merge(app_labels,uni_appcat,by="label_id",all.x = T)
app = na.omit(app)
app = app[,c(2,3)]
colnames(app) = c("app_id", "feature_name")
app$app_id = as.character(app$app_id)
app = unique(app)
rm(uni_appcat,app_labels,label_categories, app_cat)
device_cat = merge(device_apps, app, by = "app_id", all.x = T, allow.cartesian = T)
device_cat$app_id<-NULL
device_cat = unique(device_cat)
device_cat = na.omit(device_cat)
rownames(device_cat)<-NULL
rm(app)
#xgb: merging all the data tables together and making the matrix in xgb format
d1 <- label1[,list(device_id,phone_brand)]
label1$phone_brand <- NULL
d2 <- label1[,list(device_id,device_model)]
label1$device_model <- NULL
d3 <- device_apps
rm(device_apps)
d1[,phone_brand:=paste0("phone_brand:",phone_brand)]
d2[,device_model:=paste0("device_model:",device_model)]
d3[,app_id:=paste0("app_id:",app_id)]
names(d1) <- names(d2) <- names(d3) <- c("device_id","feature_name")
dd <- rbind(d1,d2,d3)
dd = rbind(dd, time_mdn, device_cat, device_loc_lat, device_loc_long)
row.names(dd) <- NULL
rm(d1,d2,d3);gc()
require(Matrix)
ii <- unique(dd$device_id)
jj <- unique(dd$feature_name)
id_i <- match(dd$device_id,ii)
id_j <- match(dd$feature_name,jj)
id_ij <- cbind(id_i,id_j)
M <- Matrix(0,nrow=length(ii),ncol=length(jj),
dimnames=list(ii,jj),sparse=T)
M[id_ij] <- 1
rm(ii,jj,id_i,id_j,id_ij,dd);gc()
x <- M[rownames(M) %in% label1$device_id,]
id <- label1$device_id[match(rownames(x),label1$device_id)]
y <- label1$group[match(rownames(x),label1$device_id)]
rm(M,label1)
# level reduction
x_train <- x[!is.na(y),]
tmp_cnt_train <- colSums(x_train)
x <- x[,tmp_cnt_train>0 & tmp_cnt_train
require(xgboost)
(group_name <- na.omit(unique(y)))
idx_train <- which(!is.na(y))
idx_test <- which(is.na(y))
train_data <- x[idx_train,]
test_data <- x[idx_test,]
train_label <- match(y[idx_train],group_name)-1
test_label <- match(y[idx_test],group_name)-1
dtrain <- xgb.DMatrix(train_data,label=train_label,missing=NA)
dtest <- xgb.DMatrix(test_data,label=test_label,missing=NA)
param <- list(booster="gblinear",
num_class=length(group_name),
objective="multi:softprob",
eval_metric="mlogloss",
eta=0.01,
lambda=5,
lambda_bias=0,
alpha=2)
watchlist <- list(train=dtrain)
set.seed(114)
fit_cv <- xgb.cv(params=param,
data=dtrain,
nrounds=100000,
watchlist=watchlist,
nfold=5,
early.stop.round=3,
verbose=1)
ntree <- 250 # the value obtained from CV
set.seed(114)
fit_xgb <- xgb.train(params=param,
data=dtrain,
nrounds=ntree,
watchlist=watchlist,
verbose=1)
pred <- predict(fit_xgb,dtest)
pred_detail <- t(matrix(pred,nrow=length(group_name)))
res_submit <- cbind(id=id[idx_test],as.data.frame(pred_detail))
colnames(res_submit) <- c("device_id",group_name)
write.csv(res_submit,file="submit_ultimate1.csv",row.names=F,quote=F)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment