## JoeUnsung/EDA

## EDA


## Working directory
getwd()
# Switch to whichever known data directory exists on this machine. When both
# exist the last one wins, which matches two back-to-back setwd() calls, but a
# missing directory no longer aborts the script.
data_dirs <- c(
  "C:/Users/Administrator/Documents/GitHub/LPOINT/data",
  "D:/data"
)
available_dirs <- data_dirs[dir.exists(data_dirs)]
if (length(available_dirs) > 0) {
  setwd(available_dirs[length(available_dirs)])
} else {
  warning("No known data directory found; staying in ", getwd(), call. = FALSE)
}


# install.packages() expects ONE character vector of package names. Passing
# the names as separate positional arguments would bind them to `lib`,
# `repos` and `contriburl` instead. Only packages that are missing get
# installed, so re-running the script does not reinstall everything.
required_pkgs <- c("ggplot2", "dplyr", "tidyr", "plotly", "sqldf")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(ggplot2)
library(dplyr)
library(tidyr)
library(plotly)
library(sqldf)

## Work around read errors for files containing Korean text
library(devtools)
install_github("plgrmr/readAny", force = TRUE)
library(readAny)
catego <- read.any("04_category.txt", sep = ",", header = TRUE, fill = TRUE)
# NOTE(review): "ko_KR.UTF-8" is a POSIX-style locale name while the data
# directories above are Windows drive paths -- confirm this call succeeds.
Sys.setlocale("LC_ALL", "ko_KR.UTF-8")


## Load the raw files
# The three numbered text tables are comma-separated with a header row.
custo <- read.table(file = "01_customer.txt", header = TRUE, sep = ",")
custo %>% head()

buying_shop <- read.table(file = "02_buying_shopping.txt", header = TRUE, sep = ",")
buying_shop %>% head()

buying_no_shop <- read.table(file = "03_buying_noshopping.txt", header = TRUE, sep = ",")
buying_no_shop %>% head()

# The category file goes through readAny::read.any() (see the Korean-text
# workaround earlier in the script).
catego <- read.any("04_category.txt", fill = TRUE, header = TRUE, sep = ",")
catego %>% head()


## Pre-join the tables (nothing is joined here yet)

custo %>% head(7)
custo %>% glimpse()


############### EDA ######################
############# 1. Custo ##################


## NA checker
# Prints "<column>  :  <pct> %" for every column of custo that contains at
# least one NA. Counting is vectorised with colSums(), so a data frame with
# zero columns prints nothing instead of failing on a 1:0 index.
na_counts <- colSums(is.na(custo))
na_counts <- na_counts[na_counts != 0]
for (col_name in names(na_counts)) {
  na_share <- round(na_counts[[col_name]] / nrow(custo), 3) * 100
  print(paste(col_name, " : ", na_share, "%"))
}
## Example output recorded by the author:
## [1] "HOM_PST_NO  :  6.8 %"

## custo - check that ID contains no duplicates
# length() counts every element. A comparison-based count such as
# sum(temp == temp) turns into NA as soon as one ID is missing.
temp <- custo$ID
length(temp) ## number of rows
temp2 <- unique(temp)
length(temp2) ## number of distinct IDs
anyDuplicated(temp) ## 0 means no ID occurs twice

## Share of each GENDER value
prop.table(table(custo[["GENDER"]]))

ggplot(data = custo, mapping = aes(x = GENDER)) +
  geom_bar(width = 0.5) +
  ggtitle("LPOINT 회원 성별 분포")

## Distribution of AGE_PRD (a quantitative variable, so a histogram would be
## the usual choice). The values are binned almost like a nominal variable,
## so a bar plot is used instead.
## Author's note: Lotte's main customers are in their 30s to 50s.
prop.table(table(custo[["AGE_PRD"]]))

ggplot(data = custo, mapping = aes(x = AGE_PRD)) +
  geom_bar(width = 0.5) +
  ggtitle("LPOINT 회원 연령 분포")

## HOM_PST_NO
prop.table(table(custo[["HOM_PST_NO"]]))
ggplot(data = custo, mapping = aes(x = HOM_PST_NO)) +
  geom_bar()


# sqldf is already installed and attached at the top of the script. Install
# it only when it is genuinely missing instead of reinstalling on every run.
if (!requireNamespace("sqldf", quietly = TRUE)) {
  install.packages("sqldf")
}
library(sqldf)

# Number of member IDs per HOM_PST_NO value, keeping only values with at
# least 500 members, largest first.
# na.omit() drops every row that has an NA in any column.
temp <- na.omit(custo)
custo_idCount_byHOM_PST_NO <- sqldf(
  'select "HOM_PST_NO", count("ID")
  from temp
  group by HOM_PST_NO
  having count(ID) >= 500
  order by count(ID) desc')
custo_idCount_byHOM_PST_NO

# Members whose HOM_PST_NO is one of the five hard-coded values below, then
# one age bar chart per value.
top5_regions <- c(100, 55, 160, 470, 130)
top5_custo <- subset(custo, HOM_PST_NO %in% top5_regions)

ggplot(data = top5_custo, mapping = aes(x = AGE_PRD)) +
  geom_bar(width = 0.5) +
  ggtitle("LPOINT 회원 지역별 연령 분포") +
  facet_wrap(~ HOM_PST_NO)


## Region 100 (author's note: similar to the overall average)
custo_HOM_PST_NO_100 <- filter(custo, HOM_PST_NO == 100)
prop.table(table(custo_HOM_PST_NO_100[["GENDER"]]))
prop.table(table(custo_HOM_PST_NO_100[["AGE_PRD"]]))

## Region 55 (author's note: an unusual region -- share of women +10%,
## high share of people in their 60s, low share of people in their 30s)
custo_HOM_PST_NO_55 <- filter(custo, HOM_PST_NO == 55)
prop.table(table(custo_HOM_PST_NO_55[["GENDER"]]))
prop.table(table(custo_HOM_PST_NO_55[["AGE_PRD"]]))

# NOTE(review): this facets ALL members by every HOM_PST_NO value, not only
# the regions profiled here -- confirm that is intended.
ggplot(data = custo, mapping = aes(x = AGE_PRD)) +
  geom_bar(width = 0.5) +
  ggtitle("LPOINT 회원 연령 분포") +
  facet_wrap(~ HOM_PST_NO)

## Region 160
custo_HOM_PST_NO_160 <- filter(custo, HOM_PST_NO == 160)
prop.table(table(custo_HOM_PST_NO_160[["GENDER"]]))
prop.table(table(custo_HOM_PST_NO_160[["AGE_PRD"]]))

## Region 470
custo_HOM_PST_NO_470 <- filter(custo, HOM_PST_NO == 470)
prop.table(table(custo_HOM_PST_NO_470[["GENDER"]]))
prop.table(table(custo_HOM_PST_NO_470[["AGE_PRD"]]))

## Region 130
custo_HOM_PST_NO_130 <- filter(custo, HOM_PST_NO == 130)
prop.table(table(custo_HOM_PST_NO_130[["GENDER"]]))
prop.table(table(custo_HOM_PST_NO_130[["AGE_PRD"]]))


#################### EDA ######################
############# 2. buying_shop ##################


## NA checker
# Prints "<column>  :  <pct> %" for every column of buying_shop that contains
# at least one NA. Counting is vectorised with colSums(), so a data frame
# with zero columns prints nothing instead of failing on a 1:0 index.
na_counts <- colSums(is.na(buying_shop))
na_counts <- na_counts[na_counts != 0]
for (col_name in names(na_counts)) {
  na_share <- round(na_counts[[col_name]] / nrow(buying_shop), 3) * 100
  print(paste(col_name, " : ", na_share, "%"))
}

head(buying_shop)

## custo - ID
## Share of members with at least one purchase
# length() counts every element; it does not turn into NA when an ID is
# missing the way sum(x == x) does.
temp <- unique(custo$ID)
temp2 <- unique(buying_shop$ID)
ratio_buying_customer <- length(temp2) / length(temp)

# ratio_buying_customer is a fraction, so scale it by 100 to match the "%"
# label in the message.
paste("전체 회원중 1건 이상의 구매 이력이 있는 회원은", round(ratio_buying_customer * 100, 2), "% 입니다.")


## Average number of purchase rows per buying member
# length() replaces sum(x == x), which would become NA if any ID were missing.
temp <- buying_shop$ID
cnt_customer_buying <- length(temp) ## total number of purchase rows
temp2 <- unique(temp)
unique_customer_buying <- length(temp2) ## number of distinct buying IDs
avg_buying_count <- cnt_customer_buying / unique_customer_buying

paste("회원 한명당 평균 구매 건수는", round(avg_buying_count, 1), "건 입니다.")


## RCT_NO

## Author's note: top member by number of payments is ID 16742,
## with 5469 payments in 2015.
# Number of non-NULL RCT_NO rows per member ID, largest first.
count_customer_shopping <-
  sqldf('select "ID", count("RCT_NO")
         from buying_shop
         group by ID
         order by count(RCT_NO) desc'
  )


# ggplot() with only a data argument draws an empty canvas, so map the
# per-member count to x and add a histogram layer. The count column has no SQL
# alias, so a renamed copy is used for plotting and count_customer_shopping
# itself stays untouched.
rct_counts_for_plot <- setNames(count_customer_shopping, c("ID", "n_rows"))
ggplot(rct_counts_for_plot, aes(x = n_rows)) +
  geom_histogram(bins = 50)


# Total BUY_AM of the member with the most payments (ID 16742, hard-coded).
sqldf('select "ID", sum("BUY_AM")
  from buying_shop
  where ID = 16742')


# Total BUY_AM per member ID, largest first.
id_BUYAM_rank <- sqldf('select "ID", sum("BUY_AM")
  from buying_shop
  group by ID
  order by sum(BUY_AM) desc')
## Author's note: a high purchase count did not translate into a high rank
## by purchase amount.


# IDs of the top 100 / top 500 spenders. head() returns fewer IDs when the
# ranking is shorter than requested, where a fixed 0:100 row index would pad
# the result with NA (and only works because R silently drops index 0).
id_vip100 <- head(id_BUYAM_rank[[1]], 100)
id_vip500 <- head(id_BUYAM_rank[[1]], 500)

# Customer records of the top-100 spenders, with their gender and age shares.
x <- subset(custo, ID %in% id_vip100)

# NOTE(review): x2 is computed here but not used by the tables below.
x2 <- filter(x, HOM_PST_NO == 55)
prop.table(table(x[["GENDER"]]))
prop.table(table(x[["AGE_PRD"]]))

# NOTE(review): this plot uses all members (custo), not the VIP subset x --
# confirm that is intended.
ggplot(data = custo, mapping = aes(x = AGE_PRD)) +
  geom_bar(width = 0.5) +
  ggtitle("LPOINT 회원 연령 분포") +
  facet_wrap(~ HOM_PST_NO)


## 2-3 BIZ UNIT

## Share of purchase rows per BIZ_UNIT
prop.table(table(buying_shop[["BIZ_UNIT"]]))

ggplot(data = buying_shop, mapping = aes(x = BIZ_UNIT)) +
  geom_bar(width = 0.5) +
  ggtitle("LPOINT 회원 채널별 이용 건수 비율")

# Author's finding, kept as a printed message.
paste("대형마트(47.4%)에서 주로 포인트 적립을 한다는 것을 알 수 있음")

## BIZ_UNIT usage of VIP members

# Purchase rows of the top-100 spenders.
temp <- subset(buying_shop, ID %in% id_vip100)

ggplot(data = temp, mapping = aes(x = BIZ_UNIT)) +
  geom_bar(width = 0.5) +
  ggtitle("LPOINT VIP 100 회원 채널별 이용 건수 비율")

# Purchase rows of the top-500 spenders.
temp2 <- subset(buying_shop, ID %in% id_vip500)

ggplot(data = temp2, mapping = aes(x = BIZ_UNIT)) +
  geom_bar(width = 0.5) +
  ggtitle("LPOINT VIP 500 회원 채널별 이용 건수 비율")

# Side-by-side shares: all members, top 100, top 500.
prop.table(table(buying_shop[["BIZ_UNIT"]]))
prop.table(table(temp[["BIZ_UNIT"]]))
prop.table(table(temp2[["BIZ_UNIT"]]))

## Author's note: the higher the VIP tier, the more purchases concentrate in
## department stores; overall they concentrate in hypermarkets and
## supermarkets.


## PD_S_C product sub-category code (section left unfinished by the author)

head(buying_shop)


## Most popular department-store branches
# Purchase rows per BR_C within BIZ_UNIT 'A01', largest first.
# NOTE(review): 'A01' is presumably the department-store channel code -- confirm.
# The literal is single-quoted because SQLite resolves a double-quoted token
# as a column name first and only falls back to a string when no such column
# exists.
temp <- sqldf("select BIZ_UNIT, BR_C, count(*) as cnt
  from buying_shop
  where BIZ_UNIT = 'A01'
  group by BIZ_UNIT, BR_C
  order by cnt desc")


## VIP들에게 핫한 백화점


#################### EDA ######################
############# 3. no buying_shop ##################

## NA checker
# Prints "<column>  :  <pct> %" for every column of buying_no_shop that
# contains at least one NA. Counting is vectorised with colSums(), so a data
# frame with zero columns prints nothing instead of failing on a 1:0 index.
na_counts <- colSums(is.na(buying_no_shop))
na_counts <- na_counts[na_counts != 0]
for (col_name in names(na_counts)) {
  na_share <- round(na_counts[[col_name]] / nrow(buying_no_shop), 3) * 100
  print(paste(col_name, " : ", na_share, "%"))
}


#################### EDA ######################
############# 4. catego  ######################

## NA checker
# Prints "<column>  :  <pct> %" for every column of catego that contains at
# least one NA. Counting is vectorised with colSums(), so a data frame with
# zero columns prints nothing instead of failing on a 1:0 index.
na_counts <- colSums(is.na(catego))
na_counts <- na_counts[na_counts != 0]
for (col_name in names(na_counts)) {
  na_share <- round(na_counts[[col_name]] / nrow(catego), 3) * 100
  print(paste(col_name, " : ", na_share, "%"))
}


## Average number of purchase rows per member
# nrow() yields plain numbers; dplyr::count() on a whole data frame returns a
# one-row data frame, so dividing two of them gives a data frame, not a number.
nrow(buying_shop) / nrow(custo)
## Author's note: LPOINT members made 182 purchases on average in 2015,
## i.e. roughly one LPOINT use every two days.

max(buying_shop$BUY_AM) ## can a single 160-million-won record really exist??
# Show the ten largest amounts. dplyr::order_by() is a helper for window
# functions and requires a second `call` argument, so it cannot be used to
# sort a bare vector.
head(sort(buying_shop$BUY_AM, decreasing = TRUE), 10)

## Average amount per purchase row (author's note: 24818 won)
mean(buying_shop$BUY_AM)

## Monthly trend of purchase amount (only the join is prepared here)

# Attach the customer attributes to every purchase row via the shared ID key.
x <- merge(x = custo, y = buying_shop, by = "ID")

group_by(x, ID)

arrange(x, BUY_AM)
## Open question from the author: what are the rows with a purchase amount of 0?

# Small toy example of a full join on the shared column x.
# tibble() replaces the deprecated data_frame() constructor.
df1 <- tibble(x = c(1, 2), y = 2:1)
df1

df2 <- tibble(x = c(1, 3), a = 10, b = "a")
df2

# State the key explicitly instead of relying on the implicit natural join.
df1 %>% full_join(df2, by = "x")


	# NOTE(review): from this point the file repeats the script above
	# statement for statement -- confirm whether this second copy is still needed.
	## Working directory
	getwd()
	# Switch to whichever known data directory exists on this machine. When both
	# exist the last one wins, which matches two back-to-back setwd() calls, but a
	# missing directory no longer aborts the script.
	data_dirs <- c(
	  "C:/Users/Administrator/Documents/GitHub/LPOINT/data",
	  "D:/data"
	)
	available_dirs <- data_dirs[dir.exists(data_dirs)]
	if (length(available_dirs) > 0) {
	  setwd(available_dirs[length(available_dirs)])
	} else {
	  warning("No known data directory found; staying in ", getwd(), call. = FALSE)
	}


	# install.packages() expects ONE character vector of package names. Passing
	# the names as separate positional arguments would bind them to `lib`,
	# `repos` and `contriburl` instead. Only packages that are missing get
	# installed, so re-running the script does not reinstall everything.
	required_pkgs <- c("ggplot2", "dplyr", "tidyr", "plotly", "sqldf")
	missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
	if (length(missing_pkgs) > 0) {
	  install.packages(missing_pkgs)
	}
	library(ggplot2)
	library(dplyr)
	library(tidyr)
	library(plotly)
	library(sqldf)

	## Work around read errors for files containing Korean text
	library(devtools)
	install_github("plgrmr/readAny", force = TRUE)
	library(readAny)
	catego <- read.any("04_category.txt", sep = ",", header = TRUE, fill = TRUE)
	# NOTE(review): "ko_KR.UTF-8" is a POSIX-style locale name while the data
	# directories above are Windows drive paths -- confirm this call succeeds.
	Sys.setlocale("LC_ALL", "ko_KR.UTF-8")


	## Load the raw files
	# The three numbered text tables are comma-separated with a header row.
	custo <- read.table(file = "01_customer.txt", header = TRUE, sep = ",")
	custo %>% head()

	buying_shop <- read.table(file = "02_buying_shopping.txt", header = TRUE, sep = ",")
	buying_shop %>% head()

	buying_no_shop <- read.table(file = "03_buying_noshopping.txt", header = TRUE, sep = ",")
	buying_no_shop %>% head()

	# The category file goes through readAny::read.any() (see the Korean-text
	# workaround earlier in the script).
	catego <- read.any("04_category.txt", fill = TRUE, header = TRUE, sep = ",")
	catego %>% head()


	## Pre-join the tables (nothing is joined here yet)

	custo %>% head(7)
	custo %>% glimpse()


	############### EDA ######################
	############# 1. Custo ##################


	## NA checker
	# Prints "<column>  :  <pct> %" for every column of custo that contains at
	# least one NA. Counting is vectorised with colSums(), so a data frame with
	# zero columns prints nothing instead of failing on a 1:0 index.
	na_counts <- colSums(is.na(custo))
	na_counts <- na_counts[na_counts != 0]
	for (col_name in names(na_counts)) {
	  na_share <- round(na_counts[[col_name]] / nrow(custo), 3) * 100
	  print(paste(col_name, " : ", na_share, "%"))
	}
	## Example output recorded by the author:
	## [1] "HOM_PST_NO  :  6.8 %"

	## custo - check that ID contains no duplicates
	# length() counts every element. A comparison-based count such as
	# sum(temp == temp) turns into NA as soon as one ID is missing.
	temp <- custo$ID
	length(temp) ## number of rows
	temp2 <- unique(temp)
	length(temp2) ## number of distinct IDs
	anyDuplicated(temp) ## 0 means no ID occurs twice

	## Share of each GENDER value
	prop.table(table(custo[["GENDER"]]))

	ggplot(data = custo, mapping = aes(x = GENDER)) +
	  geom_bar(width = 0.5) +
	  ggtitle("LPOINT 회원 성별 분포")

	## Distribution of AGE_PRD (a quantitative variable, so a histogram would be
	## the usual choice). The values are binned almost like a nominal variable,
	## so a bar plot is used instead.
	## Author's note: Lotte's main customers are in their 30s to 50s.
	prop.table(table(custo[["AGE_PRD"]]))

	ggplot(data = custo, mapping = aes(x = AGE_PRD)) +
	  geom_bar(width = 0.5) +
	  ggtitle("LPOINT 회원 연령 분포")

	## HOM_PST_NO
	prop.table(table(custo[["HOM_PST_NO"]]))
	ggplot(data = custo, mapping = aes(x = HOM_PST_NO)) +
	  geom_bar()



	# sqldf is already installed and attached earlier in the script. Install
	# it only when it is genuinely missing instead of reinstalling on every run.
	if (!requireNamespace("sqldf", quietly = TRUE)) {
	  install.packages("sqldf")
	}
	library(sqldf)

	# Number of member IDs per HOM_PST_NO value, keeping only values with at
	# least 500 members, largest first.
	# na.omit() drops every row that has an NA in any column.
	temp <- na.omit(custo)
	custo_idCount_byHOM_PST_NO <- sqldf(
	  'select "HOM_PST_NO", count("ID")
	  from temp
	  group by HOM_PST_NO
	  having count(ID) >= 500
	  order by count(ID) desc')
	custo_idCount_byHOM_PST_NO

	# Members whose HOM_PST_NO is one of the five hard-coded values below, then
	# one age bar chart per value.
	top5_regions <- c(100, 55, 160, 470, 130)
	top5_custo <- subset(custo, HOM_PST_NO %in% top5_regions)

	ggplot(data = top5_custo, mapping = aes(x = AGE_PRD)) +
	  geom_bar(width = 0.5) +
	  ggtitle("LPOINT 회원 지역별 연령 분포") +
	  facet_wrap(~ HOM_PST_NO)


	## Region 100 (author's note: similar to the overall average)
	custo_HOM_PST_NO_100 <- filter(custo, HOM_PST_NO == 100)
	prop.table(table(custo_HOM_PST_NO_100[["GENDER"]]))
	prop.table(table(custo_HOM_PST_NO_100[["AGE_PRD"]]))

	## Region 55 (author's note: an unusual region -- share of women +10%,
	## high share of people in their 60s, low share of people in their 30s)
	custo_HOM_PST_NO_55 <- filter(custo, HOM_PST_NO == 55)
	prop.table(table(custo_HOM_PST_NO_55[["GENDER"]]))
	prop.table(table(custo_HOM_PST_NO_55[["AGE_PRD"]]))

	# NOTE(review): this facets ALL members by every HOM_PST_NO value, not only
	# the regions profiled here -- confirm that is intended.
	ggplot(data = custo, mapping = aes(x = AGE_PRD)) +
	  geom_bar(width = 0.5) +
	  ggtitle("LPOINT 회원 연령 분포") +
	  facet_wrap(~ HOM_PST_NO)

	## Region 160
	custo_HOM_PST_NO_160 <- filter(custo, HOM_PST_NO == 160)
	prop.table(table(custo_HOM_PST_NO_160[["GENDER"]]))
	prop.table(table(custo_HOM_PST_NO_160[["AGE_PRD"]]))

	## Region 470
	custo_HOM_PST_NO_470 <- filter(custo, HOM_PST_NO == 470)
	prop.table(table(custo_HOM_PST_NO_470[["GENDER"]]))
	prop.table(table(custo_HOM_PST_NO_470[["AGE_PRD"]]))

	## Region 130
	custo_HOM_PST_NO_130 <- filter(custo, HOM_PST_NO == 130)
	prop.table(table(custo_HOM_PST_NO_130[["GENDER"]]))
	prop.table(table(custo_HOM_PST_NO_130[["AGE_PRD"]]))





	#################### EDA ######################
	############# 2. buying_shop ##################


	## NA checker
	# Prints "<column>  :  <pct> %" for every column of buying_shop that contains
	# at least one NA. Counting is vectorised with colSums(), so a data frame
	# with zero columns prints nothing instead of failing on a 1:0 index.
	na_counts <- colSums(is.na(buying_shop))
	na_counts <- na_counts[na_counts != 0]
	for (col_name in names(na_counts)) {
	  na_share <- round(na_counts[[col_name]] / nrow(buying_shop), 3) * 100
	  print(paste(col_name, " : ", na_share, "%"))
	}

	head(buying_shop)

	## custo - ID
	## Share of members with at least one purchase
	# length() counts every element; it does not turn into NA when an ID is
	# missing the way sum(x == x) does.
	temp <- unique(custo$ID)
	temp2 <- unique(buying_shop$ID)
	ratio_buying_customer <- length(temp2) / length(temp)

	# ratio_buying_customer is a fraction, so scale it by 100 to match the "%"
	# label in the message.
	paste("전체 회원중 1건 이상의 구매 이력이 있는 회원은", round(ratio_buying_customer * 100, 2), "% 입니다.")


	## Average number of purchase rows per buying member
	# length() replaces sum(x == x), which would become NA if any ID were missing.
	temp <- buying_shop$ID
	cnt_customer_buying <- length(temp) ## total number of purchase rows
	temp2 <- unique(temp)
	unique_customer_buying <- length(temp2) ## number of distinct buying IDs
	avg_buying_count <- cnt_customer_buying / unique_customer_buying

	paste("회원 한명당 평균 구매 건수는", round(avg_buying_count, 1), "건 입니다.")


	## RCT_NO

	## Author's note: top member by number of payments is ID 16742,
	## with 5469 payments in 2015.
	# Number of non-NULL RCT_NO rows per member ID, largest first.
	count_customer_shopping <-
	  sqldf('select "ID", count("RCT_NO")
	         from buying_shop
	         group by ID
	         order by count(RCT_NO) desc'
	  )


	# ggplot() with only a data argument draws an empty canvas, so map the
	# per-member count to x and add a histogram layer. The count column has no SQL
	# alias, so a renamed copy is used for plotting and count_customer_shopping
	# itself stays untouched.
	rct_counts_for_plot <- setNames(count_customer_shopping, c("ID", "n_rows"))
	ggplot(rct_counts_for_plot, aes(x = n_rows)) +
	  geom_histogram(bins = 50)


	# Total BUY_AM of the member with the most payments (ID 16742, hard-coded).
	sqldf('select "ID", sum("BUY_AM")
	  from buying_shop
	  where ID = 16742')


	# Total BUY_AM per member ID, largest first.
	id_BUYAM_rank <- sqldf('select "ID", sum("BUY_AM")
	  from buying_shop
	  group by ID
	  order by sum(BUY_AM) desc')
	## Author's note: a high purchase count did not translate into a high rank
	## by purchase amount.


	# IDs of the top 100 / top 500 spenders. head() returns fewer IDs when the
	# ranking is shorter than requested, where a fixed 0:100 row index would pad
	# the result with NA (and only works because R silently drops index 0).
	id_vip100 <- head(id_BUYAM_rank[[1]], 100)
	id_vip500 <- head(id_BUYAM_rank[[1]], 500)

	# Customer records of the top-100 spenders, with their gender and age shares.
	x <- subset(custo, ID %in% id_vip100)

	# NOTE(review): x2 is computed here but not used by the tables below.
	x2 <- filter(x, HOM_PST_NO == 55)
	prop.table(table(x[["GENDER"]]))
	prop.table(table(x[["AGE_PRD"]]))

	# NOTE(review): this plot uses all members (custo), not the VIP subset x --
	# confirm that is intended.
	ggplot(data = custo, mapping = aes(x = AGE_PRD)) +
	  geom_bar(width = 0.5) +
	  ggtitle("LPOINT 회원 연령 분포") +
	  facet_wrap(~ HOM_PST_NO)


	## 2-3 BIZ UNIT

	## Share of purchase rows per BIZ_UNIT
	prop.table(table(buying_shop[["BIZ_UNIT"]]))

	ggplot(data = buying_shop, mapping = aes(x = BIZ_UNIT)) +
	  geom_bar(width = 0.5) +
	  ggtitle("LPOINT 회원 채널별 이용 건수 비율")

	# Author's finding, kept as a printed message.
	paste("대형마트(47.4%)에서 주로 포인트 적립을 한다는 것을 알 수 있음")

	## BIZ_UNIT usage of VIP members

	# Purchase rows of the top-100 spenders.
	temp <- subset(buying_shop, ID %in% id_vip100)

	ggplot(data = temp, mapping = aes(x = BIZ_UNIT)) +
	  geom_bar(width = 0.5) +
	  ggtitle("LPOINT VIP 100 회원 채널별 이용 건수 비율")

	# Purchase rows of the top-500 spenders.
	temp2 <- subset(buying_shop, ID %in% id_vip500)

	ggplot(data = temp2, mapping = aes(x = BIZ_UNIT)) +
	  geom_bar(width = 0.5) +
	  ggtitle("LPOINT VIP 500 회원 채널별 이용 건수 비율")

	# Side-by-side shares: all members, top 100, top 500.
	prop.table(table(buying_shop[["BIZ_UNIT"]]))
	prop.table(table(temp[["BIZ_UNIT"]]))
	prop.table(table(temp2[["BIZ_UNIT"]]))

	## Author's note: the higher the VIP tier, the more purchases concentrate in
	## department stores; overall they concentrate in hypermarkets and
	## supermarkets.


	## PD_S_C product sub-category code (section left unfinished by the author)

	head(buying_shop)


	## Most popular department-store branches
	# Purchase rows per BR_C within BIZ_UNIT 'A01', largest first.
	# NOTE(review): 'A01' is presumably the department-store channel code -- confirm.
	# The literal is single-quoted because SQLite resolves a double-quoted token
	# as a column name first and only falls back to a string when no such column
	# exists.
	temp <- sqldf("select BIZ_UNIT, BR_C, count(*) as cnt
	  from buying_shop
	  where BIZ_UNIT = 'A01'
	  group by BIZ_UNIT, BR_C
	  order by cnt desc")


	## VIP들에게 핫한 백화점



	#################### EDA ######################
	############# 3. no buying_shop ##################

	## NA checker
	# Prints "<column>  :  <pct> %" for every column of buying_no_shop that
	# contains at least one NA. Counting is vectorised with colSums(), so a data
	# frame with zero columns prints nothing instead of failing on a 1:0 index.
	na_counts <- colSums(is.na(buying_no_shop))
	na_counts <- na_counts[na_counts != 0]
	for (col_name in names(na_counts)) {
	  na_share <- round(na_counts[[col_name]] / nrow(buying_no_shop), 3) * 100
	  print(paste(col_name, " : ", na_share, "%"))
	}






	#################### EDA ######################
	############# 4. catego ######################

	## NA checker
	# Prints "<column>  :  <pct> %" for every column of catego that contains at
	# least one NA. Counting is vectorised with colSums(), so a data frame with
	# zero columns prints nothing instead of failing on a 1:0 index.
	na_counts <- colSums(is.na(catego))
	na_counts <- na_counts[na_counts != 0]
	for (col_name in names(na_counts)) {
	  na_share <- round(na_counts[[col_name]] / nrow(catego), 3) * 100
	  print(paste(col_name, " : ", na_share, "%"))
	}








	## Average number of purchase rows per member
	# nrow() yields plain numbers; dplyr::count() on a whole data frame returns a
	# one-row data frame, so dividing two of them gives a data frame, not a number.
	nrow(buying_shop) / nrow(custo)
	## Author's note: LPOINT members made 182 purchases on average in 2015,
	## i.e. roughly one LPOINT use every two days.

	max(buying_shop$BUY_AM) ## can a single 160-million-won record really exist??
	# Show the ten largest amounts. dplyr::order_by() is a helper for window
	# functions and requires a second `call` argument, so it cannot be used to
	# sort a bare vector.
	head(sort(buying_shop$BUY_AM, decreasing = TRUE), 10)

	## Average amount per purchase row (author's note: 24818 won)
	mean(buying_shop$BUY_AM)

	## Monthly trend of purchase amount (only the join is prepared here)

	# Attach the customer attributes to every purchase row via the shared ID key.
	x <- merge(x = custo, y = buying_shop, by = "ID")

	group_by(x, ID)

	arrange(x, BUY_AM)
	## Open question from the author: what are the rows with a purchase amount of 0?

	# Small toy example of a full join on the shared column x.
	# tibble() replaces the deprecated data_frame() constructor.
	df1 <- tibble(x = c(1, 2), y = 2:1)
	df1

	df2 <- tibble(x = c(1, 3), a = 10, b = "a")
	df2

	# State the key explicitly instead of relying on the implicit natural join.
	df1 %>% full_join(df2, by = "x")