Gedevan-Aleksizde/KFAS_crash.R

## KFAS_crash.R
require(KFAS) # 1.2.9
require(dplyr)
require(tidyr)
require(ggplot2)
require(data.table)
require(zoo)

##### read dataset #####
# https://catalog.data.gov/dataset/allegheny-county-crash-data
# data description
df.crash <- read.csv(file="data/crashdatadictionary.csv",
                     stringsAsFactors = F, fileEncoding = "utf-8")

df <- list()
for (i in 2004:2014 - 2003){
  df[[i]] <- read.csv(file=paste0("data/", i + 2003, "alcocrash.csv"),
                      stringsAsFactors = F, fileEncoding = "utf-8")
}
df[[length(df) + 1]] <- read.csv(file="data/d90eb4fd-1234-4f3b-ba3d-422769cd3761.csv",
                             stringsAsFactors = F, fileEncoding = "utf-8")
df[[length(df) + 1]] <- read.csv(file="data/reordered2016crashes.csv", stringsAsFactors = F, fileEncoding = "utf-8")
df <- rbindlist(df, fill = T)

df.mon <- filter(df, COLLISION_TYPE != 0, BICYCLE == 0) %>% group_by(CRASH_YEAR, CRASH_MONTH) %>%
  summarise(CRASH=n(),
            AGGRESSIVE=sum(AGGRESSIVE_DRIVING),
            ALCOHOL=sum(ALCOHOL_RELATED),
            PHONE=sum(CELL_PHONE),
            CROSS_MEDIAN=sum(CROSS_MEDIAN),
            DEER=sum(DEER_RELATED),
            DISTRACTED=sum(DISTRACTED),
            DRIVER_U_17YR=sum(DRIVER_16YR + DRIVER_17YR >= 1),
            DRIVER_75PLUS=sum(DRIVER_75PLUS),
            OVERTURNED=sum(OVERTURNED),
            DIRT=sum(ROAD_CONDITION==2),
            WET=sum(ROAD_CONDITION %in% c(1, 4, 7)),
            SNOW_ICE=sum(ROAD_CONDITION %in% c(3, 5, 6)),
            WEATHER_BAD=sum(WEATHER %in% 2:8)
            ) %>% ungroup() %>%
  mutate(ym=as.yearmon(paste(CRASH_YEAR, CRASH_MONTH, sep="-")))

##### plot #####
ggplot(gather(df.mon, key = series, value=value, -ym, -CRASH_YEAR, -CRASH_MONTH) %>% filter(series != "CRASH")) +
  geom_line(aes(x=ym, y=value, group=series), color="grey") + labs(x="year-month", "count of car crash") +
  facet_wrap(~series, scales = "free") + theme_classic()
ggplot(df.mon) + geom_line(aes(x=ym, y=CRASH), color="grey", lwd=1) + labs(x="year-month", y="number of crashes") +
  theme_classic()
# monthly
temp <- group_by(df, CRASH_MONTH) %>% summarise(y=n()) %>% ungroup
temp$CRASH_MONTH <- as.integer(temp$CRASH_MONTH)
ggplot(temp) +  geom_line(aes(x=CRASH_MONTH, y=y)) + scale_x_continuous(breaks= temp$CRASH_MONTH)


#####  modeling #####

# Gaussian case
model1 <- SSModel(formula = CRASH ~ SSMtrend(1, Q=NA) +
                    SSMseasonal(period = 12, Q=NA),
                  data=df.mon, H = NA)
model2 <- SSModel(formula = CRASH ~ SSMtrend(1, Q=NA) +
                    SSMseasonal(period = 12, Q=NA) +
                    DRIVER_U_17YR + DRIVER_75PLUS + DISTRACTED + PHONE + SNOW_ICE + WEATHER_BAD + WET,
                  data=df.mon, H = NA)

# Poisson case
model3 <- SSModel(formula = CRASH ~ SSMtrend(1, Q=NA) +
                    SSMseasonal(period = 12, Q=NA),
                  distribution = "poisson",  data=df.mon)

model4 <- SSModel(formula = CRASH ~ SSMtrend(1, Q=NA) +
                    SSMseasonal(period = 12, Q=NA) +
                    DRIVER_U_17YR + DRIVER_75PLUS + DISTRACTED + PHONE + SNOW_ICE + WEATHER_BAD + WET,
                  distribution = "poisson",  data=df.mon)

fit <- function(model, inits, method="BFGS"){
  f <- fitSSM(model, inits=inits, method=method)
  print(paste("Converged: ", f$optim.out$convergence == 0))
  return(f)
}

fit.model1 <- fit(model1, inits=c(1, 1, 1))
fit.model2 <- fit(model2, inits=c(1, 1, 1))
fit.model3 <- fit(model3, inits=c(1, 1, 1))
fit.model4 <- fit(model4, inits=c(.1, .1), "SANN")


# filtering
filter.model1 <- KFS(fit.model1$model, smoothing = c("state", "mean"))
filter.model2 <- KFS(fit.model2$model, smoothing = c("state", "mean"))
filter.model3 <- KFS(fit.model3$model, smoothing = c("state", "mean"))
filter.model4 <- KFS(fit.model4$model, smoothing = c("state", "mean"))

plot.error <- function(list.filter.model, t=NULL){
df <- data.frame(y=list.filter.model[[1]]$model$y,
                   lapply(list.filter.model, FUN = function(x){return(
                     if(x$model$distribution == "gaussian"){
                       cumsum(abs(x$v))
                       }
                     else{
                       cumsum(abs(x$muhat-x$model$y))
                     }
                       )}
                     ))
  if(!is.null(t)) df$t <- t
  else df$t <- 1:nrow(df)
  df <- gather(df, key = series, value=value, -t)
  df$err.or.y <- ifelse(df$series == "y", "raw data", "cumulative abs. erros")
  print(ggplot(df) +
          geom_line(aes(x=t, y=value, group=series, color=series)) +
          facet_wrap(~err.or.y, scales = "free_y", ncol=1) +
          labs(x="", y="") +
          theme_classic(legend.position="top"))
}

plot.error(list("model 1"=filter.model1, "model 2"=filter.model2,
                "model 3"=filter.model3, "model 4"=filter.model4), t = df.mon$ym)
	require(KFAS) # 1.2.9
	require(dplyr)
	require(tidyr)
	require(ggplot2)
	require(data.table)
	require(zoo)

	##### read dataset #####
	# https://catalog.data.gov/dataset/allegheny-county-crash-data
	# data description
	df.crash <- read.csv(file="data/crashdatadictionary.csv",
	stringsAsFactors = F, fileEncoding = "utf-8")

	df <- list()
	for (i in 2004:2014 - 2003){
	df[[i]] <- read.csv(file=paste0("data/", i + 2003, "alcocrash.csv"),
	stringsAsFactors = F, fileEncoding = "utf-8")
	}
	df[[length(df) + 1]] <- read.csv(file="data/d90eb4fd-1234-4f3b-ba3d-422769cd3761.csv",
	stringsAsFactors = F, fileEncoding = "utf-8")
	df[[length(df) + 1]] <- read.csv(file="data/reordered2016crashes.csv", stringsAsFactors = F, fileEncoding = "utf-8")
	df <- rbindlist(df, fill = T)

	df.mon <- filter(df, COLLISION_TYPE != 0, BICYCLE == 0) %>% group_by(CRASH_YEAR, CRASH_MONTH) %>%
	summarise(CRASH=n(),
	AGGRESSIVE=sum(AGGRESSIVE_DRIVING),
	ALCOHOL=sum(ALCOHOL_RELATED),
	PHONE=sum(CELL_PHONE),
	CROSS_MEDIAN=sum(CROSS_MEDIAN),
	DEER=sum(DEER_RELATED),
	DISTRACTED=sum(DISTRACTED),
	DRIVER_U_17YR=sum(DRIVER_16YR + DRIVER_17YR >= 1),
	DRIVER_75PLUS=sum(DRIVER_75PLUS),
	OVERTURNED=sum(OVERTURNED),
	DIRT=sum(ROAD_CONDITION==2),
	WET=sum(ROAD_CONDITION %in% c(1, 4, 7)),
	SNOW_ICE=sum(ROAD_CONDITION %in% c(3, 5, 6)),
	WEATHER_BAD=sum(WEATHER %in% 2:8)
	) %>% ungroup() %>%
	mutate(ym=as.yearmon(paste(CRASH_YEAR, CRASH_MONTH, sep="-")))

	##### plot #####
	ggplot(gather(df.mon, key = series, value=value, -ym, -CRASH_YEAR, -CRASH_MONTH) %>% filter(series != "CRASH")) +
	geom_line(aes(x=ym, y=value, group=series), color="grey") + labs(x="year-month", "count of car crash") +
	facet_wrap(~series, scales = "free") + theme_classic()
	ggplot(df.mon) + geom_line(aes(x=ym, y=CRASH), color="grey", lwd=1) + labs(x="year-month", y="number of crashes") +
	theme_classic()
	# monthly
	temp <- group_by(df, CRASH_MONTH) %>% summarise(y=n()) %>% ungroup
	temp$CRASH_MONTH <- as.integer(temp$CRASH_MONTH)
	ggplot(temp) + geom_line(aes(x=CRASH_MONTH, y=y)) + scale_x_continuous(breaks= temp$CRASH_MONTH)


	##### modeling #####

	# Gaussian case
	model1 <- SSModel(formula = CRASH ~ SSMtrend(1, Q=NA) +
	SSMseasonal(period = 12, Q=NA),
	data=df.mon, H = NA)
	model2 <- SSModel(formula = CRASH ~ SSMtrend(1, Q=NA) +
	SSMseasonal(period = 12, Q=NA) +
	DRIVER_U_17YR + DRIVER_75PLUS + DISTRACTED + PHONE + SNOW_ICE + WEATHER_BAD + WET,
	data=df.mon, H = NA)

	# Poisson case
	model3 <- SSModel(formula = CRASH ~ SSMtrend(1, Q=NA) +
	SSMseasonal(period = 12, Q=NA),
	distribution = "poisson", data=df.mon)

	model4 <- SSModel(formula = CRASH ~ SSMtrend(1, Q=NA) +
	SSMseasonal(period = 12, Q=NA) +
	DRIVER_U_17YR + DRIVER_75PLUS + DISTRACTED + PHONE + SNOW_ICE + WEATHER_BAD + WET,
	distribution = "poisson", data=df.mon)

	fit <- function(model, inits, method="BFGS"){
	f <- fitSSM(model, inits=inits, method=method)
	print(paste("Converged: ", f$optim.out$convergence == 0))
	return(f)
	}

	fit.model1 <- fit(model1, inits=c(1, 1, 1))
	fit.model2 <- fit(model2, inits=c(1, 1, 1))
	fit.model3 <- fit(model3, inits=c(1, 1, 1))
	fit.model4 <- fit(model4, inits=c(.1, .1), "SANN")


	# filtering
	filter.model1 <- KFS(fit.model1$model, smoothing = c("state", "mean"))
	filter.model2 <- KFS(fit.model2$model, smoothing = c("state", "mean"))
	filter.model3 <- KFS(fit.model3$model, smoothing = c("state", "mean"))
	filter.model4 <- KFS(fit.model4$model, smoothing = c("state", "mean"))

	plot.error <- function(list.filter.model, t=NULL){
	df <- data.frame(y=list.filter.model[[1]]$model$y,
	lapply(list.filter.model, FUN = function(x){return(
	if(x$model$distribution == "gaussian"){
	cumsum(abs(x$v))
	}
	else{
	cumsum(abs(x$muhat-x$model$y))
	}
	)}
	))
	if(!is.null(t)) df$t <- t
	else df$t <- 1:nrow(df)
	df <- gather(df, key = series, value=value, -t)
	df$err.or.y <- ifelse(df$series == "y", "raw data", "cumulative abs. erros")
	print(ggplot(df) +
	geom_line(aes(x=t, y=value, group=series, color=series)) +
	facet_wrap(~err.or.y, scales = "free_y", ncol=1) +
	labs(x="", y="") +
	theme_classic(legend.position="top"))
	}

	plot.error(list("model 1"=filter.model1, "model 2"=filter.model2,
	"model 3"=filter.model3, "model 4"=filter.model4), t = df.mon$ym)