Skip to content

Instantly share code, notes, and snippets.

@toyeiei
Last active November 22, 2019 23:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save toyeiei/9976a5955fb7346dbe25aed0c7ffa078 to your computer and use it in GitHub Desktop.
Save toyeiei/9976a5955fb7346dbe25aed0c7ffa078 to your computer and use it in GitHub Desktop.
TECHJAM 2019 data preparation
## -------------------------------------
## Import Libraries --------------------
## -------------------------------------
library(dplyr)
library(readr)
library(lubridate)
library(tidyr)
library(DescTools)
## load dataset
cc <- read_csv("cc.csv")
demo <- read_csv("demographics.csv")
kplus <- read_csv("kplus.csv")
train <- read_csv("train.csv")
test <- read_csv("test.csv")
## missing values checking
mean(complete.cases(cc))
mean(complete.cases(demo)) ## .9997
mean(complete.cases(kplus))
## there are 26 missing values in ocp_cd
demo %>%
filter(!complete.cases(.)) %>%
as.data.frame()
## clean missing values in demo
demo$ocp_cd[is.na(demo$ocp_cd)] <- 14
## -------------------------------------
## Data Exploration --------------------
## -------------------------------------
train %>%
left_join(demo) %>%
left_join(cc)
## min 10K median 30K, mean 47951 max 500K
summary(train$income)
## explore income by occupation
(d1 <- train %>%
left_join(demo, by="id") %>%
select(-cc_no) %>%
distinct_all() %>%
group_by(ocp_cd) %>%
summarise(
##avg_income = mean(income),
avg_incomeTrim = mean(income, trim=.05),
##med_income_ocp = median(income),
##min_income = min(income),
##max_income = max(income),
##n = n()
) %>%
ungroup() )
## explore income by age
(d2 <- train %>%
left_join(demo, by="id") %>%
select(-cc_no) %>%
distinct_all() %>%
group_by(age) %>%
summarise(
##avg_income = mean(income),
avg_incomeTrim = mean(income, trim=.05),
##med_income_age = median(income),
##min_income = min(income),
##max_income = max(income),
##n = n()
) %>%
ungroup() )
## explore income by gender
(d3 <- train %>%
left_join(demo, by="id") %>%
select(-cc_no) %>%
distinct_all() %>%
group_by(gender) %>%
summarise(
avg_income = mean(income),
avg_incomeTrim = mean(income, trim=.05),
med_income = median(income),
min_income = min(income),
max_income = max(income),
n = n()) %>%
ungroup() )
## explore income by age x occupation
(d4 <- train %>%
left_join(demo, by="id") %>%
select(-cc_no) %>%
distinct_all() %>%
group_by(age, ocp_cd) %>%
summarise(
##avg_income = mean(income),
avg_incomeTrim = mean(income, trim=.05),
##med_income = median(income),
##min_income = min(income),
max_income = max(income),
n = n()) %>%
ungroup() )
## number of credit cards for each customer
card <- demo %>%
group_by(id) %>%
summarise(noCreditCard = n_distinct(cc_no)) %>%
filter(id %in% 1:50000)
## number of active credit cards
activeCard <- demo %>%
left_join(cc, by="cc_no") %>%
filter(!is.na(pos_dt)) %>%
group_by(id) %>%
summarise(noActiveCC = n_distinct(cc_no)) %>%
filter(id %in% 1:50000)
## add more features to cc
(cc <- cc %>%
mutate(month = month(pos_dt),
day = day(pos_dt)))
## summary cc amount and txn by id
(cardSummary <- train %>%
left_join(demo, "id") %>%
left_join(cc, "cc_no") %>%
filter(!is.na(pos_dt)) %>%
rename(ccAmt = cc_txn_amt) %>%
group_by(id) %>%
summarise(total_ccAmt = sum(ccAmt, na.rm=T),
total_ccTxn = n(),
cc_active_mnth = n_distinct(month)) %>%
ungroup() %>%
mutate(avg_cc_mnth = total_ccAmt/cc_active_mnth,
avg_cc_txn = total_ccAmt/total_ccTxn) %>%
select(-total_ccAmt, -total_ccTxn)
)
## kplus summary
(kappSummary2 <- kplus %>%
mutate(month = month(sunday, label=T)) %>%
## mutate(avg_txn_amt = kp_txn_amt/ kp_txn_count) %>%
group_by(id) %>%
summarise(total_txn = sum(kp_txn_count),
total_amt = sum(kp_txn_amt),
n_mnth_kplus = n_distinct(month)) %>%
ungroup() %>%
mutate(avg_mnt_kplus = total_amt/n_mnth_kplus,
avg_txn_kplus = total_txn/n_mnth_kplus))
## -------------------------------------
## Create Helper Function --------------
log_tran <- function(x){
log2(x+1)
}
## -------------------------------------
## Create Full Dataset -----------------
## -------------------------------------
## combine train dataset
(mean.INC <- mean(Trim(train$income, .05)))
(sd.INC <- sd(Trim(train$income, .05)))
(mean.MAXINC <- mean(d4$max_income))
(sd.MAXINC <- sd(d4$max_income))
(t1 <- train %>%
left_join(demo, by="id") %>%
select(-cc_no) %>%
distinct_all() %>%
left_join(card, by="id") %>%
left_join(activeCard, by="id") %>%
left_join(kappSummary2, by="id") %>%
left_join(cardSummary, by="id") %>%
replace(is.na(.), 0) %>%
## male=1, female=0
mutate(gender = ifelse(gender==2,0,1)) %>%
## get average income age | occupation
left_join(d1, by = "ocp_cd") %>%
left_join(d2, by = "age") %>%
rename(avgIncomeOcp = avg_incomeTrim.x,
avgIncomeAge = avg_incomeTrim.y) %>%
mutate(flagCC = noActiveCC > 0,
avgIncomeOcp = (avgIncomeOcp-mean.INC)/sd.INC,
avgIncomeAge = (avgIncomeAge-mean.INC)/sd.INC) %>%
select(-noActiveCC) %>%
## get average income age x ocp
left_join(d4, by = c("age", "ocp_cd")) %>%
select(-age, -ocp_cd, -n) %>%
mutate(logIncome = log2(income)) %>%
rename(avgIncomeAgeOcp = avg_incomeTrim,
maxIncomeAgeOcp = max_income) %>%
mutate(avgIncomeAgeOcp = (avgIncomeAgeOcp-mean.INC)/sd.INC,
maxIncomeAgeOcp = (maxIncomeAgeOcp-mean.MAXINC)/sd.MAXINC) %>%
mutate_at(.vars = c("avg_cc_mnth",
"avg_cc_txn",
"avg_mnt_kplus",
"avg_txn_kplus"),
.funs = log_tran) %>%
select(-total_txn, -total_amt)
); mean(complete.cases(t1)) == 1
(t2 <- t1 %>%
select(logIncome,
everything(),
-id,
-income)
); mean(complete.cases(t2)) == 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment