Created
August 7, 2016 14:20
-
-
Save shuozhang1985/75221c804288f570d316b7726f8aba93 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(plotly) | |
library(dplyr) | |
library(ggmap) | |
library(map) | |
library(googleVis) | |
library(ggplot2) | |
library(leaflet) | |
library(lattice) | |
library(plyr) | |
library(Rmisc) | |
# ---- First dataset: explore the raw inputs ----
# Every relative path below is resolved against the Shiny app directory.
setwd("~/Desktop/shiny project/app")

# Event log. Author's notes on the time range (from head()/tail()):
#   start 2016-05-01 00:55:25, end 2016-05-07 23:20:08.
# NOTE(review): read.csv() parses numeric-looking id columns as doubles; if
# the ids exceed 2^53 they will lose precision -- confirm, and consider
# passing colClasses if so.
events <- read.csv(
  file = "events.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(events)
# head(events)
# tail(events)

# Device -> phone brand / model table. The brand labels here are the ones
# translated from Chinese to English further down the script.
phone_brand <- read.csv(
  file = "phone_brand_device_model.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(phone_brand)
# Lookup table translating the brand labels found in
# phone_brand_device_model.csv (column `phone_brand`) into English names
# (column `phone_brand_English`).
#
# The table is embedded as CSV text, one `label,english` pair per line.
# Fixes relative to the previous version:
#   * the embedded text is clean CSV (no trailing table residue), so the
#     header and every English name parse as intended;
#   * fields are trimmed with trimws() instead of gsub(' ', ''), so the stray
#     leading space in " Tianhong" is removed while interior spaces
#     ("Monkey King", "Hundred Li Feng") are preserved;
#   * duplicate keys (SUGAR was listed twice) are dropped, so a left join on
#     `phone_brand` can no longer multiply rows.
english_brand <- read.csv(
  text = 'phone_brand,phone_brand_English
三星,samsung
天语,Ktouch
海信,hisense
联想,lenovo
欧比,obi
爱派尔,ipair
努比亚,nubia
优米,youmi
朵唯,dowe
黑米,heymi
锤子,hammer
酷比魔方,koobee
美图,meitu
尼比鲁,nibilu
一加,oneplus
优购,yougo
诺基亚,nokia
糖葫芦,candy
中国移动,ccmc
语信,yuxin
基伍,kiwu
青橙,greeno
华硕,asus
夏新,panosonic
维图,weitu
艾优尼,aiyouni
摩托罗拉,moto
乡米,xiangmi
米奇,micky
大可乐,bigcola
沃普丰,wpf
神舟,hasse
摩乐,mole
飞秒,fs
米歌,mige
富可视,fks
德赛,desci
梦米,mengmi
乐视,lshi
小杨树,smallt
纽曼,newman
邦华,banghua
E派,epai
易派,epai
普耐尔,pner
欧新,ouxin
西米,ximi
海尔,haier
波导,bodao
糯米,nuomi
唯米,weimi
酷珀,kupo
谷歌,google
昂达,ada
聆韵,lingyun
小米,Xiaomi
华为,Huawei
魅族,Meizu
中兴,ZTE
酷派,Coolpad
金立,Gionee
SUGAR,SUGAR
OPPO,OPPO
vivo,vivo
HTC,HTC
LG,LG
ZUK,ZUK
TCL,TCL
LOGO,LOGO
SUGAR,SUGAR
Lovme,Lovme
PPTV,PPTV
ZOYE,ZOYE
MIL,MIL
索尼,Sony
欧博信,Opssom
奇酷,Qiku
酷比,CUBE
康佳,Konka
亿通,Yitong
金星数码,JXD
至尊宝,Monkey King
百立丰,Hundred Li Feng
贝尔丰,Bifer
百加,Bacardi
诺亚信,Noain
广信,Kingsun
世纪天元,Ctyon
青葱,Cong
果米,Taobao
斐讯,Phicomm
长虹,Changhong
欧奇,Oukimobile
先锋,XFPLAY
台电,Teclast
大Q,Daq
蓝魔,Ramos
奥克斯,AUX
世纪星,Centurystar
丰米,Fengmi
亚马逊,Amazon
优语,Youyu
凯利通,Kailitong
原点,Origin
唯比,Weibi
嘉源,Jiayuan
大显,Daxian
天宏时代, Tianhong
宏碁,Hongji
宝捷讯,Baoxunjie
帷幄,Weiwo
德卡诺,Dekanuo
恒宇丰,Hengyufeng
惠普,HP
戴尔,Dell
智镁,Zhimei
本为,Benwei
极米,Jimi
欧乐迪,Ouyuedi
欧乐酷,Ouyueku
欧沃,Ouao
瑞米,Ruimi
瑞高,Ruigao
白米,Baimi
碟米,Diemi
虾米,Xiami
西门子,Simon
赛博宇华,Saibo
飞利浦,Philips
首云,Shouyun
鲜米,Xianmi
E人E本,ErenEben',
  header = TRUE,
  stringsAsFactors = FALSE
)

# Strip only leading/trailing whitespace from both columns.
english_brand$phone_brand <- trimws(english_brand$phone_brand)
english_brand$phone_brand_English <- trimws(english_brand$phone_brand_English)

# One row per brand label: keep the first occurrence of each key.
english_brand <- english_brand[!duplicated(english_brand$phone_brand), ]
# Attach the English brand name to each device record. This is a left join:
# every row of phone_brand is kept, and brands absent from the lookup get NA.
phone_brand1 <- merge(
  phone_brand,
  english_brand,
  by = "phone_brand",
  all.x = TRUE
)
# str(phone_brand1)

# Keep only the first record encountered for each device_id.
phone_brand_device <- subset(phone_brand1, !duplicated(device_id))

# Persist the result; the second-dataset section reads this file back in.
write.csv(phone_brand_device, file = "phone_brand_device.csv")
# str(phone_brand_device)
# ---- Merge the data sets together for analysis ----

# First merge: add gender and age to every device (left join on device_id).
gender_age <- read.csv(
  "gender_age_train.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(gender_age)
phone_brand_age_gender <- merge(
  x = phone_brand_device,
  y = gender_age,
  by = "device_id",
  all.x = TRUE
)
# str(phone_brand_age_gender)

# Second merge: attach one event to every device.
# Restrict the event log to known devices and keep the first event listed in
# events.csv for each device *before* joining. Previously these two helper
# tables were built but never used: the join ran against the full event log,
# producing one row per (device, event) pair that was immediately collapsed
# back to one row per device.
events_sample <- dplyr::filter(
  events,
  device_id %in% phone_brand_age_gender$device_id
)
# str(events_sample)
events_sample1 <- events_sample[!duplicated(events_sample$device_id), ]
# str(events_sample1)
device_data <- merge(
  x = phone_brand_age_gender,
  y = events_sample1,
  by = "device_id",
  all.x = TRUE
)
# str(device_data)

# Defensive clean-up: drop missing ids and any remaining duplicate devices.
device_data <- device_data %>%
  dplyr::filter(!is.na(device_id), !duplicated(device_id))

# Pick the top 10 phone brands by device count. head() returns fewer than 10
# names when fewer brands exist, instead of padding with NA (an NA entry
# would make %in% keep rows whose brand is NA).
top10 <- head(
  names(sort(table(device_data$phone_brand_English), decreasing = TRUE)),
  10
)
phonedata <- dplyr::filter(device_data, phone_brand_English %in% top10)

# First data set used by the Shiny app.
write.csv(x = phonedata, file = "phonedata.csv")
# str(phonedata)
# ---- Second dataset: app usage ----
setwd("~/Desktop/shiny project/app")

app_events <- read.csv(
  "app_events.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(app_events)

# The full table is too big to merge, so work on a random sample of up to
# 500000 rows. The size is capped at nrow() so sample() cannot fail on a
# smaller input file.
# NOTE(review): no seed is set, so every run draws a different sample; add
# set.seed() before this line if reproducible output is required.
app_events_1 <- app_events[
  sample(nrow(app_events), min(500000, nrow(app_events))),
]
# str(app_events_1)

app_labels <- read.csv(
  "app_labels.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(app_labels) # 459943
label_categories <- read.csv(
  "label_categories.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(label_categories)

# First merge: attach the category to every (app, label) pair.
app_labels_categories <- merge(
  x = app_labels,
  y = label_categories,
  by = "label_id",
  all.x = TRUE
)
# str(app_labels_categories)

# Second merge: attach label/category rows to the sampled app events.
# An app with several labels yields one row per label.
app_data <- merge(
  x = app_events_1,
  y = app_labels_categories,
  by = "app_id",
  all.x = TRUE
)
# str(app_data)

# Keep only rows flagged is_installed == 1.
app_data <- filter(app_data, is_installed == 1)
# str(app_data)
# Third merge: attach the event record (matched on event_id) to each app row.
# Only events referenced by app_data are kept before joining.
events <- read.csv("events.csv", header = TRUE, stringsAsFactors = FALSE)
events_sample <- events %>%
  filter(event_id %in% app_data$event_id)
app_rawdata <- merge(
  x = app_data,
  y = events_sample,
  by = "event_id",
  all.x = TRUE
) %>%
  filter(is_installed == 1)
# str(app_rawdata)

# Fourth merge: attach gender and age via device_id.
gender_age <- read.csv(
  "gender_age_train.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# str(gender_age)
app_age_gender <- merge(
  x = app_rawdata,
  y = gender_age,
  by = "device_id",
  all.x = TRUE
) %>%
  filter(is_installed == 1) # keep installed apps only
# str(app_age_gender)

# Fifth merge: attach phone brand information, read back from the file
# written by the first-dataset section.
phone_brand_device <- read.csv(
  "phone_brand_device.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
app_final <- merge(
  x = app_age_gender,
  y = phone_brand_device,
  by = "device_id",
  all.x = TRUE
) %>%
  filter(is_installed == 1)
# str(app_final)
# Build a composite "device_id:app_id" key and keep the first row per key.
# paste0() has no `sep` argument -- the old call pasted ":" on as a third
# string, so the two ids were glued together with no delimiter and distinct
# (device, app) pairs could collide (e.g. 12 + 3 vs 1 + 23). paste() with
# sep = ":" produces an unambiguous key.
app_final$dup <- paste(app_final$device_id, app_final$app_id, sep = ":")
appdata <- app_final[!duplicated(app_final$dup), ]
str(appdata)

# Top 10 app categories by row count. head() avoids NA padding when fewer
# than 10 categories are present (an NA entry would make %in% keep rows with
# a missing category).
top10app <- head(
  names(sort(table(appdata$category), decreasing = TRUE)),
  10
)
app_data <- appdata %>%
  filter(category %in% top10app)
# str(app_data)

# Second data set used by the Shiny app.
write.csv(x = app_data, file = "appmap.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment