Last active
November 22, 2019 23:24
-
-
Save toyeiei/9976a5955fb7346dbe25aed0c7ffa078 to your computer and use it in GitHub Desktop.
TECHJAM 2019 data preparation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ------------------------------------- | |
## Import Libraries -------------------- | |
## ------------------------------------- | |
library(dplyr) | |
library(readr) | |
library(lubridate) | |
library(tidyr) | |
library(DescTools) | |
## load dataset | |
cc <- read_csv("cc.csv") | |
demo <- read_csv("demographics.csv") | |
kplus <- read_csv("kplus.csv") | |
train <- read_csv("train.csv") | |
test <- read_csv("test.csv") | |
## missing values checking | |
mean(complete.cases(cc)) | |
mean(complete.cases(demo)) ## .9997 | |
mean(complete.cases(kplus)) | |
## there are 26 missing values in ocp_cd | |
demo %>% | |
filter(!complete.cases(.)) %>% | |
as.data.frame() | |
## clean missing values in demo | |
demo$ocp_cd[is.na(demo$ocp_cd)] <- 14 | |
## ------------------------------------- | |
## Data Exploration -------------------- | |
## ------------------------------------- | |
train %>% | |
left_join(demo) %>% | |
left_join(cc) | |
## min 10K median 30K, mean 47951 max 500K | |
summary(train$income) | |
## explore income by occupation | |
(d1 <- train %>% | |
left_join(demo, by="id") %>% | |
select(-cc_no) %>% | |
distinct_all() %>% | |
group_by(ocp_cd) %>% | |
summarise( | |
##avg_income = mean(income), | |
avg_incomeTrim = mean(income, trim=.05), | |
##med_income_ocp = median(income), | |
##min_income = min(income), | |
##max_income = max(income), | |
##n = n() | |
) %>% | |
ungroup() ) | |
## explore income by age | |
(d2 <- train %>% | |
left_join(demo, by="id") %>% | |
select(-cc_no) %>% | |
distinct_all() %>% | |
group_by(age) %>% | |
summarise( | |
##avg_income = mean(income), | |
avg_incomeTrim = mean(income, trim=.05), | |
##med_income_age = median(income), | |
##min_income = min(income), | |
##max_income = max(income), | |
##n = n() | |
) %>% | |
ungroup() ) | |
## explore income by gender | |
(d3 <- train %>% | |
left_join(demo, by="id") %>% | |
select(-cc_no) %>% | |
distinct_all() %>% | |
group_by(gender) %>% | |
summarise( | |
avg_income = mean(income), | |
avg_incomeTrim = mean(income, trim=.05), | |
med_income = median(income), | |
min_income = min(income), | |
max_income = max(income), | |
n = n()) %>% | |
ungroup() ) | |
## explore income by age x occupation | |
(d4 <- train %>% | |
left_join(demo, by="id") %>% | |
select(-cc_no) %>% | |
distinct_all() %>% | |
group_by(age, ocp_cd) %>% | |
summarise( | |
##avg_income = mean(income), | |
avg_incomeTrim = mean(income, trim=.05), | |
##med_income = median(income), | |
##min_income = min(income), | |
max_income = max(income), | |
n = n()) %>% | |
ungroup() ) | |
## number of credit cards for each customer | |
card <- demo %>% | |
group_by(id) %>% | |
summarise(noCreditCard = n_distinct(cc_no)) %>% | |
filter(id %in% 1:50000) | |
## number of active credit cards | |
activeCard <- demo %>% | |
left_join(cc, by="cc_no") %>% | |
filter(!is.na(pos_dt)) %>% | |
group_by(id) %>% | |
summarise(noActiveCC = n_distinct(cc_no)) %>% | |
filter(id %in% 1:50000) | |
## add more features to cc | |
(cc <- cc %>% | |
mutate(month = month(pos_dt), | |
day = day(pos_dt))) | |
## summary cc amount and txn by id | |
(cardSummary <- train %>% | |
left_join(demo, "id") %>% | |
left_join(cc, "cc_no") %>% | |
filter(!is.na(pos_dt)) %>% | |
rename(ccAmt = cc_txn_amt) %>% | |
group_by(id) %>% | |
summarise(total_ccAmt = sum(ccAmt, na.rm=T), | |
total_ccTxn = n(), | |
cc_active_mnth = n_distinct(month)) %>% | |
ungroup() %>% | |
mutate(avg_cc_mnth = total_ccAmt/cc_active_mnth, | |
avg_cc_txn = total_ccAmt/total_ccTxn) %>% | |
select(-total_ccAmt, -total_ccTxn) | |
) | |
## kplus summary | |
(kappSummary2 <- kplus %>% | |
mutate(month = month(sunday, label=T)) %>% | |
## mutate(avg_txn_amt = kp_txn_amt/ kp_txn_count) %>% | |
group_by(id) %>% | |
summarise(total_txn = sum(kp_txn_count), | |
total_amt = sum(kp_txn_amt), | |
n_mnth_kplus = n_distinct(month)) %>% | |
ungroup() %>% | |
mutate(avg_mnt_kplus = total_amt/n_mnth_kplus, | |
avg_txn_kplus = total_txn/n_mnth_kplus)) | |
## ------------------------------------- | |
## Create Helper Function -------------- | |
log_tran <- function(x){ | |
log2(x+1) | |
} | |
## ------------------------------------- | |
## Create Full Dataset ----------------- | |
## ------------------------------------- | |
## combine train dataset | |
(mean.INC <- mean(Trim(train$income, .05))) | |
(sd.INC <- sd(Trim(train$income, .05))) | |
(mean.MAXINC <- mean(d4$max_income)) | |
(sd.MAXINC <- sd(d4$max_income)) | |
(t1 <- train %>% | |
left_join(demo, by="id") %>% | |
select(-cc_no) %>% | |
distinct_all() %>% | |
left_join(card, by="id") %>% | |
left_join(activeCard, by="id") %>% | |
left_join(kappSummary2, by="id") %>% | |
left_join(cardSummary, by="id") %>% | |
replace(is.na(.), 0) %>% | |
## male=1, female=0 | |
mutate(gender = ifelse(gender==2,0,1)) %>% | |
## get average income age | occupation | |
left_join(d1, by = "ocp_cd") %>% | |
left_join(d2, by = "age") %>% | |
rename(avgIncomeOcp = avg_incomeTrim.x, | |
avgIncomeAge = avg_incomeTrim.y) %>% | |
mutate(flagCC = noActiveCC > 0, | |
avgIncomeOcp = (avgIncomeOcp-mean.INC)/sd.INC, | |
avgIncomeAge = (avgIncomeAge-mean.INC)/sd.INC) %>% | |
select(-noActiveCC) %>% | |
## get average income age x ocp | |
left_join(d4, by = c("age", "ocp_cd")) %>% | |
select(-age, -ocp_cd, -n) %>% | |
mutate(logIncome = log2(income)) %>% | |
rename(avgIncomeAgeOcp = avg_incomeTrim, | |
maxIncomeAgeOcp = max_income) %>% | |
mutate(avgIncomeAgeOcp = (avgIncomeAgeOcp-mean.INC)/sd.INC, | |
maxIncomeAgeOcp = (maxIncomeAgeOcp-mean.MAXINC)/sd.MAXINC) %>% | |
mutate_at(.vars = c("avg_cc_mnth", | |
"avg_cc_txn", | |
"avg_mnt_kplus", | |
"avg_txn_kplus"), | |
.funs = log_tran) %>% | |
select(-total_txn, -total_amt) | |
); mean(complete.cases(t1)) == 1 | |
(t2 <- t1 %>% | |
select(logIncome, | |
everything(), | |
-id, | |
-income) | |
); mean(complete.cases(t2)) == 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment