Skip to content

Instantly share code, notes, and snippets.

@shuozhang1985
Created August 7, 2016 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shuozhang1985/75221c804288f570d316b7726f8aba93 to your computer and use it in GitHub Desktop.
Save shuozhang1985/75221c804288f570d316b7726f8aba93 to your computer and use it in GitHub Desktop.
library(plotly)
library(dplyr)
library(ggmap)
library(map)
library(googleVis)
library(ggplot2)
library(leaflet)
library(lattice)
library(plyr)
library(Rmisc)
# Explore the data ----
# First dataset: start with the raw events file.
setwd("~/Desktop/shiny project/app")
events <- read.csv(
  file = "events.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(events)
# Time range seen when inspecting the file:
# head(events)  # start time 2016-05-01 00:55:25
# tail(events)  # end time:  2016-05-07 23:20:08
# Brand / model table; its Chinese brand labels are translated to English
# further down.
phone_brand <- read.csv(
  file = "phone_brand_device_model.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(phone_brand)
# Lookup table with two columns: `phone_brand` (the brand label as written in
# the raw data) and `phone_brand_English` (its English/latin spelling).
# The table is parsed by read.csv() from the inline string passed as `text`,
# so every line up to the closing quote is data, not code.
# NOTE(review): the row "SUGAR,SUGAR" is listed twice, and the Tianhong row
# has a space after the comma.
english_brand = read.csv(text='phone_brand,phone_brand_English
三星,samsung
天语,Ktouch
海信,hisense
联想,lenovo
欧比,obi
爱派尔,ipair
努比亚,nubia
优米,youmi
朵唯,dowe
黑米,heymi
锤子,hammer
酷比魔方,koobee
美图,meitu
尼比鲁,nibilu
一加,oneplus
优购,yougo
诺基亚,nokia
糖葫芦,candy
中国移动,ccmc
语信,yuxin
基伍,kiwu
青橙,greeno
华硕,asus
夏新,panosonic
维图,weitu
艾优尼,aiyouni
摩托罗拉,moto
乡米,xiangmi
米奇,micky
大可乐,bigcola
沃普丰,wpf
神舟,hasse
摩乐,mole
飞秒,fs
米歌,mige
富可视,fks
德赛,desci
梦米,mengmi
乐视,lshi
小杨树,smallt
纽曼,newman
邦华,banghua
E派,epai
易派,epai
普耐尔,pner
欧新,ouxin
西米,ximi
海尔,haier
波导,bodao
糯米,nuomi
唯米,weimi
酷珀,kupo
谷歌,google
昂达,ada
聆韵,lingyun
小米,Xiaomi
华为,Huawei
魅族,Meizu
中兴,ZTE
酷派,Coolpad
金立,Gionee
SUGAR,SUGAR
OPPO,OPPO
vivo,vivo
HTC,HTC
LG,LG
ZUK,ZUK
TCL,TCL
LOGO,LOGO
SUGAR,SUGAR
Lovme,Lovme
PPTV,PPTV
ZOYE,ZOYE
MIL,MIL
索尼,Sony
欧博信,Opssom
奇酷,Qiku
酷比,CUBE
康佳,Konka
亿通,Yitong
金星数码,JXD
至尊宝,Monkey King
百立丰,Hundred Li Feng
贝尔丰,Bifer
百加,Bacardi
诺亚信,Noain
广信,Kingsun
世纪天元,Ctyon
青葱,Cong
果米,Taobao
斐讯,Phicomm
长虹,Changhong
欧奇,Oukimobile
先锋,XFPLAY
台电,Teclast
大Q,Daq
蓝魔,Ramos
奥克斯,AUX
世纪星,Centurystar
丰米,Fengmi
亚马逊,Amazon
优语,Youyu
凯利通,Kailitong
原点,Origin
唯比,Weibi
嘉源,Jiayuan
大显,Daxian
天宏时代, Tianhong
宏碁,Hongji
宝捷讯,Baoxunjie
帷幄,Weiwo
德卡诺,Dekanuo
恒宇丰,Hengyufeng
惠普,HP
戴尔,Dell
智镁,Zhimei
本为,Benwei
极米,Jimi
欧乐迪,Ouyuedi
欧乐酷,Ouyueku
欧沃,Ouao
瑞米,Ruimi
瑞高,Ruigao
白米,Baimi
碟米,Diemi
虾米,Xiami
西门子,Simon
赛博宇华,Saibo
飞利浦,Philips
首云,Shouyun
鲜米,Xianmi
E人E本,ErenEben')
# Normalise both lookup columns to plain character vectors with every space
# removed. gsub() strips *all* spaces, not only leading/trailing ones, so
# multi-word names such as "Monkey King" end up as "MonkeyKing".
english_brand$phone_brand <- gsub(
  " ", "", as.vector(english_brand$phone_brand),
  fixed = TRUE
)
english_brand$phone_brand_English <- gsub(
  " ", "", as.vector(english_brand$phone_brand_English),
  fixed = TRUE
)
# Keep a single row per brand key. The lookup lists "SUGAR" twice, and a
# repeated key would make every device of that brand appear twice after a
# merge on `phone_brand`.
english_brand <- english_brand[!duplicated(english_brand$phone_brand), ]
# Left-join the English brand name onto every row of the brand/model table;
# rows whose brand is not in the lookup are kept (all.x = TRUE).
phone_brand1 <- merge(
  phone_brand, english_brand,
  by = "phone_brand",
  all.x = TRUE
)
# str(phone_brand1)
# Keep only the first row encountered for each device_id.
phone_brand_device <- phone_brand1[!duplicated(phone_brand1[["device_id"]]), ]
write.csv(phone_brand_device, file = "phone_brand_device.csv")
# str(phone_brand_device)
# Combine the individual data sets for the analysis ----
# Step 1: left-join the contents of gender_age_train.csv onto the
# de-duplicated device table, matching on device_id.
gender_age <- read.csv(
  file = "gender_age_train.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(gender_age)
phone_brand_age_gender <- merge(
  phone_brand_device, gender_age,
  by = "device_id",
  all.x = TRUE
)
# str(phone_brand_age_gender)
# Step 2: attach event information to each device.
# Restrict the events to the devices present in phone_brand_age_gender, then
# keep a single event row per device (the first one found in the file).
events_sample <- dplyr::filter(
  events,
  device_id %in% phone_brand_age_gender$device_id
)
# str(events_sample)
events_sample1 <- events_sample[!duplicated(events_sample$device_id), ]
# str(events_sample1)
# Join the reduced one-row-per-device table instead of the full `events`
# table. Joining every event and then discarding all but one row per device
# only inflates the intermediate result; `events_sample1` was prepared for
# exactly this join but was never used.
device_data <- merge(
  x = phone_brand_age_gender, y = events_sample1,
  by = "device_id",
  all.x = TRUE
)
# str(device_data)
# Keep one row per (non-missing) device_id, then restrict the data to the ten
# most frequent English brand names.
device_data <- device_data %>%
  dplyr::filter(!is.na(device_id), !duplicated(device_id))
# head() rather than `[1:10]`: indexing past the end of the vector pads it
# with NA when fewer than ten brands exist, and `%in%` would then also keep
# the rows whose brand is NA.
top10 <- head(
  names(sort(table(device_data$phone_brand_English), decreasing = TRUE)),
  10
)
phonedata <- dplyr::filter(device_data, phone_brand_English %in% top10)
write.csv(x = phonedata, file = "phonedata.csv") # first data set in the shiny
# str(phonedata)
# Second dataset: app events.
setwd("~/Desktop/shiny project/app")
app_events <- read.csv(
  "app_events.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(app_events)
# The full table is too big to merge, so continue with a random sample of at
# most 500000 rows. Capping the size at nrow() keeps sample() from stopping
# with "cannot take a sample larger than the population" on a smaller file.
# NOTE(review): no set.seed() is called, so the sampled rows differ between
# runs.
app_events_1 <- app_events[
  sample(nrow(app_events), min(500000, nrow(app_events))),
]
# str(app_events_1)
# Load the two label tables used to describe each app.
app_labels <- read.csv(
  file = "app_labels.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(app_labels)  # 459943
label_categories <- read.csv(
  file = "label_categories.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(label_categories)
# Assemble the app table through a chain of left joins ----
# Join 1: add the label_categories columns to app_labels, matched on label_id.
app_labels_categories <- merge(
  app_labels, label_categories,
  by = "label_id",
  all.x = TRUE
)
# str(app_labels_categories)
# Join 2: add the labelled apps to the sampled app events, matched on app_id.
app_data <- merge(
  app_events_1, app_labels_categories,
  by = "app_id",
  all.x = TRUE
)
# str(app_data)
# Keep only the rows whose is_installed flag equals 1.
app_data <- app_data %>%
  dplyr::filter(is_installed == 1)
# str(app_data)
# Join 3: add the matching row of events.csv to each app event (by event_id).
events <- read.csv(
  file = "events.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Only the events referenced by app_data are needed for the join.
events_sample <- events %>%
  dplyr::filter(event_id %in% app_data$event_id)
app_rawdata <- merge(
  app_data, events_sample,
  by = "event_id",
  all.x = TRUE
)
app_rawdata <- app_rawdata %>%
  dplyr::filter(is_installed == 1)
# str(app_rawdata)
# Join 4: add the columns of gender_age_train.csv, matched on device_id.
gender_age <- read.csv(
  file = "gender_age_train.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(gender_age)
app_age_gender <- merge(
  app_rawdata, gender_age,
  by = "device_id",
  all.x = TRUE
)
# Keep only the rows whose is_installed flag equals 1.
app_age_gender <- app_age_gender %>%
  dplyr::filter(is_installed == 1)
# str(app_age_gender)
# Join 5: add the brand information saved earlier in phone_brand_device.csv,
# matched on device_id.
phone_brand_device <- read.csv(
  file = "phone_brand_device.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
app_final <- merge(
  app_age_gender, phone_brand_device,
  by = "device_id",
  all.x = TRUE
)
app_final <- app_final %>%
  dplyr::filter(is_installed == 1)
# str(app_final)
# Remove rows that repeat the same (device_id, app_id) pair.
# paste0() has no `sep` argument: the former `sep = ":"` was treated as one
# more string to append, so the two ids were glued together with nothing in
# between (ids 1 / 23 and 12 / 3 both gave the key "123:"). paste() with
# sep = ":" places the separator between the ids, making the key unambiguous.
app_final$dup <- paste(app_final$device_id, app_final$app_id, sep = ":")
appdata <- app_final[!duplicated(app_final$dup), ]
str(appdata)
# Ten most frequent categories. head() rather than `[1:10]`, which would pad
# the vector with NA when fewer than ten categories exist and make `%in%`
# keep the rows whose category is NA.
top10app <- head(
  names(sort(table(appdata$category), decreasing = TRUE)),
  10
)
app_data <- appdata %>%
  dplyr::filter(category %in% top10app)
# str(app_data)
write.csv(x = app_data, file = "appmap.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment