# mrecos/parking_violation.R

## parking_violation.R
# I started to play with the data and used this to get it a bit cleaned up and prepared for analysis
# I am sure my code is not the most efficient or prettiest, but if there are errors, please let me know
# Perhaps you will find this useful for saving a few minutes getting the data ready for the fun part: analysis!
# DATA : https://www.opendataphilly.org/dataset/parking-violation

library(data.table)
library(lubridate)
# Load the raw CSV and give the obviously categorical / numeric columns proper
# types.
# setwd("<<YOUR DIRECTORY>>") # Optional: uncomment and point at the CSV folder
data <- fread("Parking_Violations.csv")
# Convert the data.table to a plain data.frame.
# NOTE(review): data.frame() uses check.names = TRUE by default, so
# non-syntactic column names are rewritten (e.g. blanks become dots); the
# dotted names used below presumably rely on that -- confirm against the CSV.
dat <- data.frame(data)
str(dat) # take a look at the structure of every column
summary(dat) # many character fields; the categorical ones are converted below
dat$State <- as.factor(dat$State)
dat$Division <- as.factor(dat$Division)
dat$Violation.Description <- as.factor(dat$Violation.Description)
dat$Issuing.Agency <- as.factor(dat$Issuing.Agency)
# Strip currency formatting ('$', thousands separators, stray blanks) before
# converting to numeric. Removing characters by kind rather than by position
# keeps values intact when the '$' is absent and avoids NA for amounts such as
# "$1,000.00".
dat$Fine <- as.numeric(gsub("[$,[:space:]]", "", dat$Fine))

# Coordinates arrive as one text field per row holding a bracketed
# "(first,second)" pair; missing values are zero-length strings.
# Each row is parsed on its own with a regular expression, so an empty, NA or
# malformed entry yields NA for that row only. Splitting every row on "," into
# one long vector and folding it back into a two-column matrix would silently
# shift all later rows whenever a single entry does not contain exactly one
# comma, and a numeric placeholder for "missing" would have to be compared
# with `==` on doubles and could collide with a real value.
coords <- local({
  coord_text <- as.character(dat$Coordinates)
  # "(" <no comma/bracket> "," <no comma/bracket> ")", optional outer blanks
  pair_pattern <- "^[[:space:]]*\\(([^,()]+),([^,()]+)\\)[[:space:]]*$"
  is_pair <- !is.na(coord_text) & grepl(pair_pattern, coord_text)
  parsed <- matrix(NA_character_, nrow = length(coord_text), ncol = 2)
  parsed[is_pair, 1] <- sub(pair_pattern, "\\1", coord_text[is_pair])
  parsed[is_pair, 2] <- sub(pair_pattern, "\\2", coord_text[is_pair])
  parsed
})
# join new coords back to table: the first element of each pair is stored as
# Y_coord, the second as X_coord; rows without a usable pair stay NA
dat$Y_coord <- as.double(coords[, 1])
dat$X_coord <- as.double(coords[, 2])
# rm(coords) # recommended to save space, but up to you

# The date and time field is a character string that is expected to hold three
# blank-separated tokens: date, clock time, and AM/PM.
# NOTE(review): assumes the first column of `dat` is that field -- confirm.
# Every row is matched individually, so an entry that is NA or does not have
# exactly three tokens becomes NA for that row instead of shifting all later
# rows (which unlisting the split pieces into a three-column matrix would do).
date_time <- local({
  stamp_text <- as.character(dat[, 1])
  token <- "([^[:space:]]+)"
  gap <- "[[:space:]]+"
  stamp_pattern <- paste0(
    "^[[:space:]]*", token, gap, token, gap, token, "[[:space:]]*$"
  )
  is_stamp <- !is.na(stamp_text) & grepl(stamp_pattern, stamp_text)
  fields <- matrix(NA_character_, nrow = length(stamp_text), ncol = 3)
  for (k in seq_len(3L)) {
    # paste0("\\", k) builds the back-reference "\1", "\2", "\3"
    fields[is_stamp, k] <- sub(
      stamp_pattern, paste0("\\", k), stamp_text[is_stamp]
    )
  }
  data.frame(fields, stringsAsFactors = FALSE)
})
date_time[, 1] <- mdy(date_time[, 1]) # lubridate: month/day/year text -> Date
date_time[, 2] <- hms(date_time[, 2]) # lubridate: "H:M:S" text -> Period
dat$Date <- date_time[, 1] # join back to main data.frame
# Time holds only the clock reading: hms() never sees the AM/PM token, so
# combine Time with AMPM before treating it as a time of day.
dat$Time <- date_time[, 2]
dat$AMPM <- date_time[, 3]
# Day of the week as a number (with lubridate's default week start,
# Sunday = 1, Monday = 2, etc.)
dat$Day <- wday(dat$Date)
# rm(date_time) # recommended to save space, but up to you

# at this point, there are still address fields that are text, but I prefer to use the coordinates at this point
# also, there are likely issues with typos and mis-coding in the original data, but you will find those through analysis
# have fuN!!!

# Quick visual check: number of records per calendar month and per week
hist(dat$Date, breaks = "months")
hist(dat$Date, breaks = "weeks")
	# I started to play with the data and used this to get it a bit cleaned up and prepared for analysis
	# I am sure my code is not the most efficient or prettiest, but if there are errors, please let me know
	# Perhaps you will find this useful for saving a few minutes getting the data ready for the fun part: analysis!
	# DATA : https://www.opendataphilly.org/dataset/parking-violation

	library(data.table)
	library(lubridate)
	# Load the raw CSV and give the obviously categorical / numeric columns
	# proper types.
	# setwd("<<YOUR DIRECTORY>>") # Optional: uncomment and point at the CSV folder
	data <- fread("Parking_Violations.csv")
	# Convert the data.table to a plain data.frame.
	# NOTE(review): data.frame() uses check.names = TRUE by default, so
	# non-syntactic column names are rewritten (e.g. blanks become dots); the
	# dotted names used below presumably rely on that -- confirm against the CSV.
	dat <- data.frame(data)
	str(dat) # take a look at the structure of every column
	summary(dat) # many character fields; the categorical ones are converted below
	dat$State <- as.factor(dat$State)
	dat$Division <- as.factor(dat$Division)
	dat$Violation.Description <- as.factor(dat$Violation.Description)
	dat$Issuing.Agency <- as.factor(dat$Issuing.Agency)
	# Strip currency formatting ('$', thousands separators, stray blanks) before
	# converting to numeric. Removing characters by kind rather than by position
	# keeps values intact when the '$' is absent and avoids NA for amounts such
	# as "$1,000.00".
	dat$Fine <- as.numeric(gsub("[$,[:space:]]", "", dat$Fine))

	# Coordinates arrive as one text field per row holding a bracketed
	# "(first,second)" pair; missing values are zero-length strings.
	# Each row is parsed on its own with a regular expression, so an empty, NA
	# or malformed entry yields NA for that row only. Splitting every row on ","
	# into one long vector and folding it back into a two-column matrix would
	# silently shift all later rows whenever a single entry does not contain
	# exactly one comma, and a numeric placeholder for "missing" would have to
	# be compared with `==` on doubles and could collide with a real value.
	coords <- local({
	  coord_text <- as.character(dat$Coordinates)
	  # "(" <no comma/bracket> "," <no comma/bracket> ")", optional outer blanks
	  pair_pattern <- "^[[:space:]]*\\(([^,()]+),([^,()]+)\\)[[:space:]]*$"
	  is_pair <- !is.na(coord_text) & grepl(pair_pattern, coord_text)
	  parsed <- matrix(NA_character_, nrow = length(coord_text), ncol = 2)
	  parsed[is_pair, 1] <- sub(pair_pattern, "\\1", coord_text[is_pair])
	  parsed[is_pair, 2] <- sub(pair_pattern, "\\2", coord_text[is_pair])
	  parsed
	})
	# join new coords back to table: the first element of each pair is stored
	# as Y_coord, the second as X_coord; rows without a usable pair stay NA
	dat$Y_coord <- as.double(coords[, 1])
	dat$X_coord <- as.double(coords[, 2])
	# rm(coords) # recommended to save space, but up to you

	# The date and time field is a character string that is expected to hold
	# three blank-separated tokens: date, clock time, and AM/PM.
	# NOTE(review): assumes the first column of `dat` is that field -- confirm.
	# Every row is matched individually, so an entry that is NA or does not have
	# exactly three tokens becomes NA for that row instead of shifting all later
	# rows (which unlisting the split pieces into a three-column matrix would do).
	date_time <- local({
	  stamp_text <- as.character(dat[, 1])
	  token <- "([^[:space:]]+)"
	  gap <- "[[:space:]]+"
	  stamp_pattern <- paste0(
	    "^[[:space:]]*", token, gap, token, gap, token, "[[:space:]]*$"
	  )
	  is_stamp <- !is.na(stamp_text) & grepl(stamp_pattern, stamp_text)
	  fields <- matrix(NA_character_, nrow = length(stamp_text), ncol = 3)
	  for (k in seq_len(3L)) {
	    # paste0("\\", k) builds the back-reference "\1", "\2", "\3"
	    fields[is_stamp, k] <- sub(
	      stamp_pattern, paste0("\\", k), stamp_text[is_stamp]
	    )
	  }
	  data.frame(fields, stringsAsFactors = FALSE)
	})
	date_time[, 1] <- mdy(date_time[, 1]) # lubridate: month/day/year text -> Date
	date_time[, 2] <- hms(date_time[, 2]) # lubridate: "H:M:S" text -> Period
	dat$Date <- date_time[, 1] # join back to main data.frame
	# Time holds only the clock reading: hms() never sees the AM/PM token, so
	# combine Time with AMPM before treating it as a time of day.
	dat$Time <- date_time[, 2]
	dat$AMPM <- date_time[, 3]
	# Day of the week as a number (with lubridate's default week start,
	# Sunday = 1, Monday = 2, etc.)
	dat$Day <- wday(dat$Date)
	# rm(date_time) # recommended to save space, but up to you

	# at this point, there are still address fields that are text, but I prefer to use the coordinates at this point
	# also, there are likely issues with typos and mis-coding in the original data, but you will find those through analysis
	# have fuN!!!

	# Quick visual check: number of records per calendar month and per week
	hist(dat$Date, breaks = "months")
	hist(dat$Date, breaks = "weeks")