Skip to content

Instantly share code, notes, and snippets.

@theotheo
Last active August 29, 2015 14:08
Show Gist options
  • Save theotheo/ddfe64ad66e74ad7f01b to your computer and use it in GitHub Desktop.
Save theotheo/ddfe64ad66e74ad7f01b to your computer and use it in GitHub Desktop.
DM Track. Lesson 2.
# > install.packages("gridExtra")
# library() stops right here when a package is missing, whereas require()
# only warns and lets the script fail later with a less obvious error.
library(gridExtra)
library(ggplot2)
# read kaggle data
# Store metadata gets its own name so it is not clobbered by the per-store
# list built from the training data below.
storesMeta <- read.csv("raw/stores.csv", header = TRUE)
features <- read.csv("raw/features.csv", header = TRUE)
train <- read.csv("raw/train.csv", header = TRUE)
# create list of stores by splitting train data by store number;
# element i holds every training row whose Store value falls in the i-th group
stores <- split(train, train$Store)
# > str(stores)
# List of 45
# $ 1 :'data.frame': 10244 obs. of 5 variables:
# ..$ Store : int [1:10244] 1 1 1 1 1 1 1 1 1 1 ...
# ..$ Dept : int [1:10244] 1 1 1 1 1 1 1 1 1 1 ...
# ..$ Date : Factor w/ 143 levels "2010-02-05","2010-02-12",..: 1 2 3 4 5 6 7 8 9 10 ...
# ..$ Weekly_Sales: num [1:10244] 24925 46039 41596 19404 21828 ...
# ..$ IsHoliday : logi [1:10244] FALSE TRUE FALSE FALSE FALSE FALSE ...
# $ 2 :'data.frame': 10238 obs. of 5 variables:
# ..$ Store : int [1:10238] 2 2 2 2 2 2 2 2 2 2 ...
# ..$ Dept : int [1:10238] 1 1 1 1 1 1 1 1 1 1 ...
# ..$ Date : Factor w/ 143 levels "2010-02-05","2010-02-12",..: 1 2 3 4 5 6 7 8 9 10 ...
# ..$ Weekly_Sales: num [1:10238] 35034 60484 58222 25962 27372 ...
# ..$ IsHoliday : logi [1:10238] FALSE TRUE FALSE FALSE FALSE FALSE ...
# ...
# Build a scatter plot of one store's total weekly sales by week of the year.
#
# Weekly sales are summed over all departments for each date. Weeks that are
# not flagged as holidays are drawn as default (black) points; holiday weeks
# are drawn on top of them in red.
#
# Args:
#   storeNum:    index into the global `stores` list (training data split by
#                store number); also shown in the plot title.
#   salesLimits: numeric vector of length 2 with the lower and upper y-axis
#                limits. The defaults keep all store panels on the same scale.
#
# Returns:
#   A list of length one containing the ggplot object, so that results of
#   several calls can be concatenated with append() / c().
plotSalesOfStoreByNumWeek <- function(storeNum,
                                      salesLimits = c(250000, 3500000)) {
  store <- stores[[storeNum]]
  store$Date <- as.Date(store$Date)

  # Sum the sales of all departments for each date and tag every date with
  # its week-of-year number ("%U": weeks start on Sunday, "00".."53").
  # Base aggregate() is used so the function needs no package beyond ggplot2.
  sumSalesByDate <- function(rows) {
    if (nrow(rows) == 0) {
      # aggregate() refuses to work on zero rows; return an empty frame with
      # the same columns instead
      return(data.frame(
        Date = as.Date(character(0)),
        Weekly_Sales = numeric(0),
        WeekNum = factor(character(0))
      ))
    }
    byDate <- aggregate(
      Weekly_Sales ~ Date,
      data = rows,
      FUN = sum,
      na.action = na.pass
    )
    byDate$WeekNum <- as.factor(format(byDate$Date, "%U"))
    byDate
  }

  # only holidays (which() silently drops rows whose IsHoliday is NA)
  holidaySales <- sumSalesByDate(store[which(store$IsHoliday), ])
  # the same for weeks except holiday weeks
  regularSales <- sumSalesByDate(store[which(!store$IsHoliday), ])

  salesPlot <- ggplot(regularSales, aes(WeekNum, Weekly_Sales)) +
    ylim(salesLimits[1], salesLimits[2]) +
    geom_point() +
    geom_point(data = holidaySales, color = "red") +
    ggtitle(paste("Store", storeNum))

  list(salesPlot)
}
# draw: one plot per store, arranged on a roughly square grid in a single PNG
# Iterate over the stores actually present instead of a hard-coded count, and
# build the list in one pass rather than growing it inside a loop.
# plotSalesOfStoreByNumWeek() returns a one-element list, hence the [[1]].
plots <- lapply(seq_along(stores), function(i) {
  plotSalesOfStoreByNumWeek(i)[[1]]
})
n <- length(plots)
if (n == 0) {
  stop("no stores to plot", call. = FALSE)
}
nCol <- floor(sqrt(n))
png("stores_weekly_sales_by_week_number.png", 2000, 1000)
# Close the graphics device even when arranging the plots fails, so a failed
# run does not leave the PNG device open.
invisible(tryCatch(
  do.call("grid.arrange", c(plots, list(ncol = nCol))),
  finally = dev.off()
))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment