# shuozhang1985/analysis code for shiny

## analysis code for shiny
library(plotly)
library(dplyr)
library(ggmap)
library(map)
library(googleVis)
library(ggplot2)
library(leaflet)
library(lattice)
library(plyr)
library(Rmisc)

# Explore the data ----
# First dataset: load the event log and inspect the time range it covers.
setwd("~/Desktop/shiny project/app")
events <- read.csv(
  file = "events.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(events)
# Time range, as noted from head()/tail() of `events`:
# head(events)  # start time 2016-05-01 00:55:25
# tail(events)  # end time:2016-05-07 23:20:08

# Device/brand table; its Chinese brand names are translated to English next.
phone_brand <- read.csv(
  file = "phone_brand_device_model.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(phone_brand)
# Lookup table mapping each raw `phone_brand` value (mostly Chinese) to an
# English label. The table is embedded as CSV text whose data lines are
# indented, so the parsed fields carry leading whitespace that must be removed
# before the table can be used as a join key.
english_brand <- read.csv(text='phone_brand,phone_brand_English
  三星,samsung
  天语,Ktouch
  海信,hisense
  联想,lenovo
  欧比,obi
  爱派尔,ipair
  努比亚,nubia
  优米,youmi
  朵唯,dowe
  黑米,heymi
  锤子,hammer
  酷比魔方,koobee
  美图,meitu
  尼比鲁,nibilu
  一加,oneplus
  优购,yougo
  诺基亚,nokia
  糖葫芦,candy
  中国移动,ccmc
  语信,yuxin
  基伍,kiwu
  青橙,greeno
  华硕,asus
  夏新,panosonic
  维图,weitu
  艾优尼,aiyouni
  摩托罗拉,moto
  乡米,xiangmi
  米奇,micky
  大可乐,bigcola
  沃普丰,wpf
  神舟,hasse
  摩乐,mole
  飞秒,fs
  米歌,mige
  富可视,fks
  德赛,desci
  梦米,mengmi
  乐视,lshi
  小杨树,smallt
  纽曼,newman
  邦华,banghua
  E派,epai
  易派,epai
  普耐尔,pner
  欧新,ouxin
  西米,ximi
  海尔,haier
  波导,bodao
  糯米,nuomi
  唯米,weimi
  酷珀,kupo
  谷歌,google
  昂达,ada
  聆韵,lingyun
  小米,Xiaomi
  华为,Huawei
  魅族,Meizu
  中兴,ZTE
  酷派,Coolpad
  金立,Gionee
  SUGAR,SUGAR
  OPPO,OPPO
  vivo,vivo
  HTC,HTC
  LG,LG
  ZUK,ZUK
  TCL,TCL
  LOGO,LOGO
  SUGAR,SUGAR
  Lovme,Lovme
  PPTV,PPTV
  ZOYE,ZOYE
  MIL,MIL
  索尼,Sony
  欧博信,Opssom
  奇酷,Qiku
  酷比,CUBE
  康佳,Konka
  亿通,Yitong
  金星数码,JXD
  至尊宝,Monkey King
  百立丰,Hundred Li Feng
  贝尔丰,Bifer
  百加,Bacardi
  诺亚信,Noain
  广信,Kingsun
  世纪天元,Ctyon
  青葱,Cong
  果米,Taobao
  斐讯,Phicomm
  长虹,Changhong
  欧奇,Oukimobile
  先锋,XFPLAY
  台电,Teclast
  大Q,Daq
  蓝魔,Ramos
  奥克斯,AUX
  世纪星,Centurystar
  丰米,Fengmi
  亚马逊,Amazon
  优语,Youyu
  凯利通,Kailitong
  原点,Origin
  唯比,Weibi
  嘉源,Jiayuan
  大显,Daxian
  天宏时代, Tianhong
  宏碁,Hongji
  宝捷讯,Baoxunjie
  帷幄,Weiwo
  德卡诺,Dekanuo
  恒宇丰,Hengyufeng
  惠普,HP
  戴尔,Dell
  智镁,Zhimei
  本为,Benwei
  极米,Jimi
  欧乐迪,Ouyuedi
  欧乐酷,Ouyueku
  欧沃,Ouao
  瑞米,Ruimi
  瑞高,Ruigao
  白米,Baimi
  碟米,Diemi
  虾米,Xiami
  西门子,Simon
  赛博宇华,Saibo
  飞利浦,Philips
  首云,Shouyun
  鲜米,Xianmi
  E人E本,ErenEben', header = TRUE, stringsAsFactors = FALSE)
# Strip every whitespace character from both columns. Matching [[:space:]]
# rather than a literal ' ' also removes tab indentation, so the keys match the
# raw brand names however the embedded table is indented. As before, spaces
# inside multi-word English labels are removed too
# (e.g. "Monkey King" becomes "MonkeyKing").
english_brand[] <- lapply(
  english_brand,
  function(column) gsub("[[:space:]]+", "", column)
)
# Keep a single row per brand key: a repeated key (SUGAR is listed twice)
# would otherwise duplicate every matching device row in a later merge.
english_brand <- english_brand[!duplicated(english_brand$phone_brand), ]

# Attach the English brand label to every device record. This is a left join:
# devices whose brand has no entry in `english_brand` are kept.
phone_brand1 <- merge(phone_brand, english_brand,
                      by = "phone_brand", all.x = TRUE)
# str(phone_brand1)
# Keep only the first record encountered for each device_id.
phone_brand_device <- phone_brand1[!duplicated(phone_brand1[["device_id"]]), ]
write.csv(phone_brand_device, file = "phone_brand_device.csv")
# str(phone_brand_device)

# Merge the individual data sets together for the analysis.

# First merge: add gender and age to each device (left join on device_id).
gender_age <- read.csv("gender_age_train.csv",
                       header = TRUE, stringsAsFactors = FALSE)
# str(gender_age)

phone_brand_age_gender <- merge(phone_brand_device, gender_age,
                                by = "device_id", all.x = TRUE)
# str(phone_brand_age_gender)

# Second merge: add the event records.
# Events belonging to devices present in the brand/age/gender table.
events_sample <- events %>%
  filter(device_id %in% phone_brand_age_gender$device_id)
# str(events_sample)
# First event row for each device_id.
events_sample1 <- events_sample[!duplicated(events_sample[["device_id"]]), ]
# str(events_sample1)

# NOTE(review): this join uses the full `events` table; `events_sample1` is
# not referenced again in the visible code -- confirm which one was intended.
device_data <- merge(phone_brand_age_gender, events,
                     by = "device_id", all.x = TRUE)

# str(device_data)

# Pick the top 10 phone brands.
# Reduce to one row per device, dropping rows without a device_id.
device_data <- device_data %>%
  dplyr::filter(!is.na(device_id), !duplicated(device_id))
# Brand names ordered by number of devices, most frequent first. head() returns
# fewer than 10 names when fewer brands exist; indexing with [1:10] would pad
# the vector with NA, and the %in% filter below would then also keep rows whose
# English brand is NA.
top10 <- head(
  names(sort(table(device_data$phone_brand_English), decreasing = TRUE)),
  10
)
phonedata <- dplyr::filter(device_data, phone_brand_English %in% top10)
write.csv(x = phonedata, file = "phonedata.csv")  # first data set in the shiny
# str(phonedata)

# Second dataset ----
setwd("~/Desktop/shiny project/app")
app_events <- read.csv("app_events.csv", header = TRUE,
                       stringsAsFactors = FALSE)
# str(app_events)
# The full table is too big to merge, so continue with a random sample of at
# most 500000 rows. Capping the size at nrow() avoids the error raised when
# asking for more rows than the table holds. No seed is set in the visible
# code, so each run draws a different sample.
app_events_1 <- app_events[
  sample.int(nrow(app_events), min(nrow(app_events), 500000)),
]
# str(app_events_1)
app_labels <- read.csv("app_labels.csv", header = TRUE,
                       stringsAsFactors = FALSE)
# str(app_labels) #459943

label_categories <- read.csv("label_categories.csv", header = TRUE,
                             stringsAsFactors = FALSE)
# str(label_categories)

# Merge the app data sets.

# First merge: attach the `label_categories` columns to every app/label row
# (left join on label_id).
app_labels_categories <- merge(app_labels, label_categories,
                               by = "label_id", all.x = TRUE)
# str(app_labels_categories)

# Second merge: attach labels to the sampled app events (left join on app_id),
# then keep only the rows with is_installed equal to 1.
app_data <- merge(app_events_1, app_labels_categories,
                  by = "app_id", all.x = TRUE)
# str(app_data)
app_data <- app_data %>%
  filter(is_installed == 1)
# str(app_data)


# Third merge: add the event record for each sampled app event.
events <- read.csv("events.csv", header = TRUE, stringsAsFactors = FALSE)
events_sample <- events %>%
  filter(event_id %in% app_data$event_id)

app_rawdata <- merge(app_data, events_sample,
                     by = "event_id", all.x = TRUE) %>%
  filter(is_installed == 1)
# str(app_rawdata)

# Fourth merge: add gender and age by device_id.
gender_age <- read.csv("gender_age_train.csv",
                       header = TRUE, stringsAsFactors = FALSE)
# str(gender_age)

app_age_gender <- merge(app_rawdata, gender_age,
                        by = "device_id", all.x = TRUE) %>%
  filter(is_installed == 1)  # keep installed apps only
# str(app_age_gender)

# Fifth merge: add the phone brand table written earlier to
# phone_brand_device.csv, joined by device_id.
phone_brand_device <- read.csv("phone_brand_device.csv",
                               header = TRUE, stringsAsFactors = FALSE)
app_final <- merge(app_age_gender, phone_brand_device,
                   by = "device_id", all.x = TRUE) %>%
  filter(is_installed == 1)
# str(app_final)

# Remove repeated (device_id, app_id) pairs.
# paste0() has no `sep` argument (a `sep = ':'` passed to it is treated as one
# more string and appended to every key), which would fuse the two ids with
# nothing between them. paste() places the ':' between the ids so that
# different pairs cannot produce the same key.
app_final$dup <- paste(app_final$device_id, app_final$app_id, sep = ":")
appdata <- app_final[!duplicated(app_final$dup), ]
str(appdata)

# Ten most frequent app categories. head() avoids the NA padding that [1:10]
# produces when fewer than ten categories exist; an NA in the list would make
# the %in% filter below keep rows whose category is NA.
top10app <- head(
  names(sort(table(appdata$category), decreasing = TRUE)),
  10
)
app_data <- appdata %>%
  dplyr::filter(category %in% top10app)
# str(app_data)
write.csv(x = app_data, file = "appmap.csv")
	library(plotly)
	library(dplyr)
	library(ggmap)
	library(map)
	library(googleVis)
	library(ggplot2)
	library(leaflet)
	library(lattice)
	library(plyr)
	library(Rmisc)

	# Explore the data ----
	# First dataset: load the event log and inspect the time range it covers.
	setwd("~/Desktop/shiny project/app")
	events <- read.csv(
	  file = "events.csv",
	  header = TRUE,
	  stringsAsFactors = FALSE
	)
	# str(events)
	# Time range, as noted from head()/tail() of `events`:
	# head(events)  # start time 2016-05-01 00:55:25
	# tail(events)  # end time:2016-05-07 23:20:08

	# Device/brand table; its Chinese brand names are translated to English next.
	phone_brand <- read.csv(
	  file = "phone_brand_device_model.csv",
	  header = TRUE,
	  stringsAsFactors = FALSE
	)
	# str(phone_brand)
	# Lookup table mapping each raw `phone_brand` value (mostly Chinese) to an
	# English label. The table is embedded as CSV text whose data lines are
	# tab-indented, so the parsed fields carry leading whitespace that must be
	# removed before the table can be used as a join key.
	english_brand <- read.csv(text='phone_brand,phone_brand_English
	三星,samsung
	天语,Ktouch
	海信,hisense
	联想,lenovo
	欧比,obi
	爱派尔,ipair
	努比亚,nubia
	优米,youmi
	朵唯,dowe
	黑米,heymi
	锤子,hammer
	酷比魔方,koobee
	美图,meitu
	尼比鲁,nibilu
	一加,oneplus
	优购,yougo
	诺基亚,nokia
	糖葫芦,candy
	中国移动,ccmc
	语信,yuxin
	基伍,kiwu
	青橙,greeno
	华硕,asus
	夏新,panosonic
	维图,weitu
	艾优尼,aiyouni
	摩托罗拉,moto
	乡米,xiangmi
	米奇,micky
	大可乐,bigcola
	沃普丰,wpf
	神舟,hasse
	摩乐,mole
	飞秒,fs
	米歌,mige
	富可视,fks
	德赛,desci
	梦米,mengmi
	乐视,lshi
	小杨树,smallt
	纽曼,newman
	邦华,banghua
	E派,epai
	易派,epai
	普耐尔,pner
	欧新,ouxin
	西米,ximi
	海尔,haier
	波导,bodao
	糯米,nuomi
	唯米,weimi
	酷珀,kupo
	谷歌,google
	昂达,ada
	聆韵,lingyun
	小米,Xiaomi
	华为,Huawei
	魅族,Meizu
	中兴,ZTE
	酷派,Coolpad
	金立,Gionee
	SUGAR,SUGAR
	OPPO,OPPO
	vivo,vivo
	HTC,HTC
	LG,LG
	ZUK,ZUK
	TCL,TCL
	LOGO,LOGO
	SUGAR,SUGAR
	Lovme,Lovme
	PPTV,PPTV
	ZOYE,ZOYE
	MIL,MIL
	索尼,Sony
	欧博信,Opssom
	奇酷,Qiku
	酷比,CUBE
	康佳,Konka
	亿通,Yitong
	金星数码,JXD
	至尊宝,Monkey King
	百立丰,Hundred Li Feng
	贝尔丰,Bifer
	百加,Bacardi
	诺亚信,Noain
	广信,Kingsun
	世纪天元,Ctyon
	青葱,Cong
	果米,Taobao
	斐讯,Phicomm
	长虹,Changhong
	欧奇,Oukimobile
	先锋,XFPLAY
	台电,Teclast
	大Q,Daq
	蓝魔,Ramos
	奥克斯,AUX
	世纪星,Centurystar
	丰米,Fengmi
	亚马逊,Amazon
	优语,Youyu
	凯利通,Kailitong
	原点,Origin
	唯比,Weibi
	嘉源,Jiayuan
	大显,Daxian
	天宏时代, Tianhong
	宏碁,Hongji
	宝捷讯,Baoxunjie
	帷幄,Weiwo
	德卡诺,Dekanuo
	恒宇丰,Hengyufeng
	惠普,HP
	戴尔,Dell
	智镁,Zhimei
	本为,Benwei
	极米,Jimi
	欧乐迪,Ouyuedi
	欧乐酷,Ouyueku
	欧沃,Ouao
	瑞米,Ruimi
	瑞高,Ruigao
	白米,Baimi
	碟米,Diemi
	虾米,Xiami
	西门子,Simon
	赛博宇华,Saibo
	飞利浦,Philips
	首云,Shouyun
	鲜米,Xianmi
	E人E本,ErenEben', header = TRUE, stringsAsFactors = FALSE)
	# Strip every whitespace character from both columns. Replacing only a
	# literal ' ' would leave the leading tab on each brand key, so no key would
	# match the raw brand names in a later merge; [[:space:]] covers tabs and
	# spaces alike. Spaces inside multi-word English labels are removed too
	# (e.g. "Monkey King" becomes "MonkeyKing").
	english_brand[] <- lapply(
	  english_brand,
	  function(column) gsub("[[:space:]]+", "", column)
	)
	# Keep a single row per brand key: a repeated key (SUGAR is listed twice)
	# would otherwise duplicate every matching device row in a later merge.
	english_brand <- english_brand[!duplicated(english_brand$phone_brand), ]

	# Attach the English brand label to every device record. This is a left join:
	# devices whose brand has no entry in `english_brand` are kept.
	phone_brand1 <- merge(phone_brand, english_brand,
	                      by = "phone_brand", all.x = TRUE)
	# str(phone_brand1)
	# Keep only the first record encountered for each device_id.
	phone_brand_device <- phone_brand1[!duplicated(phone_brand1[["device_id"]]), ]
	write.csv(phone_brand_device, file = "phone_brand_device.csv")
	# str(phone_brand_device)

	# Merge the individual data sets together for the analysis.

	# First merge: add gender and age to each device (left join on device_id).
	gender_age <- read.csv("gender_age_train.csv",
	                       header = TRUE, stringsAsFactors = FALSE)
	# str(gender_age)

	phone_brand_age_gender <- merge(phone_brand_device, gender_age,
	                                by = "device_id", all.x = TRUE)
	# str(phone_brand_age_gender)

	# Second merge: add the event records.
	# Events belonging to devices present in the brand/age/gender table.
	events_sample <- events %>%
	  filter(device_id %in% phone_brand_age_gender$device_id)
	# str(events_sample)
	# First event row for each device_id.
	events_sample1 <- events_sample[!duplicated(events_sample[["device_id"]]), ]
	# str(events_sample1)

	# NOTE(review): this join uses the full `events` table; `events_sample1` is
	# not referenced again in the visible code -- confirm which one was intended.
	device_data <- merge(phone_brand_age_gender, events,
	                     by = "device_id", all.x = TRUE)

	# str(device_data)

	# Pick the top 10 phone brands.
	# Reduce to one row per device, dropping rows without a device_id.
	device_data <- device_data %>%
	  dplyr::filter(!is.na(device_id), !duplicated(device_id))
	# Brand names ordered by number of devices, most frequent first. head()
	# returns fewer than 10 names when fewer brands exist; indexing with [1:10]
	# would pad the vector with NA, and the %in% filter below would then also
	# keep rows whose English brand is NA.
	top10 <- head(
	  names(sort(table(device_data$phone_brand_English), decreasing = TRUE)),
	  10
	)
	phonedata <- dplyr::filter(device_data, phone_brand_English %in% top10)
	write.csv(x = phonedata, file = "phonedata.csv")  # first data set in the shiny
	# str(phonedata)

	# Second dataset ----
	setwd("~/Desktop/shiny project/app")
	app_events <- read.csv("app_events.csv", header = TRUE,
	                       stringsAsFactors = FALSE)
	# str(app_events)
	# The full table is too big to merge, so continue with a random sample of at
	# most 500000 rows. Capping the size at nrow() avoids the error raised when
	# asking for more rows than the table holds. No seed is set in the visible
	# code, so each run draws a different sample.
	app_events_1 <- app_events[
	  sample.int(nrow(app_events), min(nrow(app_events), 500000)),
	]
	# str(app_events_1)
	app_labels <- read.csv("app_labels.csv", header = TRUE,
	                       stringsAsFactors = FALSE)
	# str(app_labels) #459943

	label_categories <- read.csv("label_categories.csv", header = TRUE,
	                             stringsAsFactors = FALSE)
	# str(label_categories)

	# Merge the app data sets.

	# First merge: attach the `label_categories` columns to every app/label row
	# (left join on label_id).
	app_labels_categories <- merge(app_labels, label_categories,
	                               by = "label_id", all.x = TRUE)
	# str(app_labels_categories)

	# Second merge: attach labels to the sampled app events (left join on
	# app_id), then keep only the rows with is_installed equal to 1.
	app_data <- merge(app_events_1, app_labels_categories,
	                  by = "app_id", all.x = TRUE)
	# str(app_data)
	app_data <- app_data %>%
	  filter(is_installed == 1)
	# str(app_data)


	# Third merge: add the event record for each sampled app event.
	events <- read.csv("events.csv", header = TRUE, stringsAsFactors = FALSE)
	events_sample <- events %>%
	  filter(event_id %in% app_data$event_id)

	app_rawdata <- merge(app_data, events_sample,
	                     by = "event_id", all.x = TRUE) %>%
	  filter(is_installed == 1)
	# str(app_rawdata)

	# Fourth merge: add gender and age by device_id.
	gender_age <- read.csv("gender_age_train.csv",
	                       header = TRUE, stringsAsFactors = FALSE)
	# str(gender_age)

	app_age_gender <- merge(app_rawdata, gender_age,
	                        by = "device_id", all.x = TRUE) %>%
	  filter(is_installed == 1)  # keep installed apps only
	# str(app_age_gender)

	# Fifth merge: add the phone brand table written earlier to
	# phone_brand_device.csv, joined by device_id.
	phone_brand_device <- read.csv("phone_brand_device.csv",
	                               header = TRUE, stringsAsFactors = FALSE)
	app_final <- merge(app_age_gender, phone_brand_device,
	                   by = "device_id", all.x = TRUE) %>%
	  filter(is_installed == 1)
	# str(app_final)

	# Remove repeated (device_id, app_id) pairs.
	# paste0() has no `sep` argument (a `sep = ':'` passed to it is treated as
	# one more string and appended to every key), which would fuse the two ids
	# with nothing between them. paste() places the ':' between the ids so that
	# different pairs cannot produce the same key.
	app_final$dup <- paste(app_final$device_id, app_final$app_id, sep = ":")
	appdata <- app_final[!duplicated(app_final$dup), ]
	str(appdata)

	# Ten most frequent app categories. head() avoids the NA padding that [1:10]
	# produces when fewer than ten categories exist; an NA in the list would make
	# the %in% filter below keep rows whose category is NA.
	top10app <- head(
	  names(sort(table(appdata$category), decreasing = TRUE)),
	  10
	)
	app_data <- appdata %>%
	  dplyr::filter(category %in% top10app)
	# str(app_data)
	write.csv(x = app_data, file = "appmap.csv")