# cseidman/plotInitialData.R

## plotInitialData.R
# Exploratory plots: one faceted bar chart per customer feature, split by
# IsCardUser. `ds` must already exist in the calling environment; this script
# only adds derived columns to it (AgeRanges, IncomeCategories, Count).

# Ages are more relevant when we bucket them.
# The last bucket is labelled "71+", i.e. open-ended, so its upper break is
# Inf: with a finite cap of 90 every older customer was silently turned
# into NA.
ds$AgeRanges <- cut(
  ds$Age,
  breaks = c(0, 30, 50, 70, Inf),
  labels = c("11-30", "31-50", "51-70", "71+")
)

# Income ranges are also more interesting when we bin them.
# cut() builds right-closed intervals by default, so without
# include.lowest = TRUE an income of exactly 0 falls outside (0, 20000] and
# becomes NA. The top break is Inf so that incomes above 250000 are still
# classified as "Wealthy" instead of NA.
ds$IncomeCategories <- cut(
  x = ds$YearlyIncome,
  breaks = c(0, 20000, 50000, 70000, 100000, Inf),
  labels = c("Low", "Lower", "Middle", "Upper", "Wealthy"),
  include.lowest = TRUE
)

# Helps with building the plots: every row contributes 1 to the y aesthetic,
# so the stacked geom_col() bars end up as long as the number of rows in
# each category.
ds$Count <- 1

# Columns of `ds` to chart, one plot per entry.
features <- c(
  "AgeRanges", "MaritalStatus", "Gender",
  "IncomeCategories", "TotalChildren", "NumberChildrenAtHome",
  "Education", "Occupation", "IsHomeOwner",
  "NumberCarsOwned", "YearsSinceFirstPurchase", "CommuteDistance",
  "CountryRegionCode"
)

# Build the list of ggplot objects; `f` is the name of the column to chart.
plts <- lapply(features, FUN = function(f) {
  # Take only the columns used for the plot
  plot_data <- ds[, c(f, "Count", "IsCardUser")]

  # Standardize the name of the feature column so the aesthetic mapping
  # below can be written once for every feature
  colnames(plot_data)[1] <- "Feature"

  ggplot(plot_data) +
    aes(x = Feature, y = Count, fill = Feature) +
    geom_col() +
    xlab(f) +                          # label the axis with the real column name
    coord_flip() +                     # horizontal bars
    facet_wrap(~IsCardUser, ncol = 2) + # one panel per IsCardUser value
    theme(legend.position = "none")    # fill colours duplicate the axis labels
})

# NOTE(review): `multiplot` is defined outside this file. If it is the common
# multiplot(..., plotlist = NULL, cols = 1) helper, a list of plots should be
# passed as `plotlist = plts` -- confirm against its definition.
multiplot(plts)