Last active
August 29, 2015 14:22
-
-
Save mrecos/b30ae419e6adc0022733 to your computer and use it in GitHub Desktop.
Data preparation for Philadelphia Parking Violations data. Part of Philly Transportation Hackathon http://phillyinnovates.com/2015/06/04/planes-trains-and-civic-hacking/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I started to play with the data and used this to get it a bit cleaned up and prepared for analysis.
# I am sure my code is not the most efficient or prettiest, but if there are errors, please let me know.
# Perhaps you will find this useful in saving a few minutes getting the data ready for the fun part: analysis!
# DATA: https://www.opendataphilly.org/dataset/parking-violation
library(data.table)
library(lubridate)

# Directory that holds the downloaded CSV. The default "." reads from the
# current working directory, so the script does not need to call setwd();
# edit this one value if the file lives somewhere else.
data_dir <- "."

# Read the raw export with data.table::fread().
data <- fread(file.path(data_dir, "Parking_Violations.csv"))

# Convert the data.table to a plain data.frame. data.frame() also rewrites
# any non-syntactic column names into valid R names; later steps refer to
# columns such as `dat$Violation.Description` and `dat$Issuing.Agency`.
dat <- data.frame(data)

str(dat)      # structure of every column
summary(dat)  # many fields arrive as character and need converting
# Categorical text columns are converted to factors in one pass.
factor_cols <- c("State", "Division", "Violation.Description", "Issuing.Agency")
dat[factor_cols] <- lapply(dat[factor_cols], as.factor)

# Fine arrives as currency text. Strip the "$" sign and any "," thousands
# separators explicitly instead of blindly dropping the first character, so a
# value without a leading "$" keeps its first digit and a value with a
# separator does not turn into NA.
dat$Fine <- as.numeric(gsub("[$,]", "", dat$Fine))
# Coordinates come as a text pair "(first,second)"; some rows hold a
# zero-length string instead. Each row is matched on its own against an
# anchored pattern, so an empty, NA or malformed value yields NA for that row
# only and can never shift the values of the rows after it. No numeric
# sentinel is needed to mark missing data.
raw_coords <- trimws(as.character(dat$Coordinates))
coord_pattern <- "^\\(([^,]+),([^,]+)\\)$"
has_pair <- grepl(coord_pattern, raw_coords)  # FALSE for "" and for NA

# Two-column character matrix: column 1 = first value, column 2 = second value.
coords <- matrix(NA_character_, nrow = length(raw_coords), ncol = 2)
coords[has_pair, 1] <- sub(coord_pattern, "\\1", raw_coords[has_pair])
coords[has_pair, 2] <- sub(coord_pattern, "\\2", raw_coords[has_pair])

# Join the new coords back to the table as numeric columns. The first value of
# the pair goes to Y_coord and the second to X_coord.
dat$Y_coord <- as.double(coords[, 1])
dat$X_coord <- as.double(coords[, 2])
# rm(raw_coords, coord_pattern, has_pair, coords)  # recommended to save space, but up to you
# The first column is a character timestamp of the form "<date> <time> <AM/PM>".
# Each row is matched on its own, so a missing or malformed timestamp becomes
# NA for that row instead of misaligning every row after it.
stamp <- trimws(as.character(dat[, 1]))
stamp_pattern <- "^([^ ]+) +([^ ]+) +([^ ]+)$"
stamp[!grepl(stamp_pattern, stamp)] <- NA_character_

# Split into date, time, and AM/PM (columns X1, X2, X3).
date_time <- data.frame(
  X1 = sub(stamp_pattern, "\\1", stamp),
  X2 = sub(stamp_pattern, "\\2", stamp),
  X3 = sub(stamp_pattern, "\\3", stamp),
  stringsAsFactors = FALSE
)
date_time[, 1] <- mdy(date_time[, 1])  # month/day/year text -> Date
date_time[, 2] <- hms(date_time[, 2])  # clock text -> lubridate Period

# Join back to the main data.frame.
dat$Date <- date_time[, 1]
dat$Time <- date_time[, 2]  # 12-hour clock reading; needs AMPM to be unambiguous
dat$AMPM <- date_time[, 3]
dat$Day <- wday(dat$Date)   # day of the week, Sunday = 1, Monday = 2, etc.

# Time on its own ignores the AM/PM marker, so also provide a full timestamp
# parsed with mdy_hms(), which reads the marker, and the 24-hour hour of day.
# mdy_hms() labels the result as UTC unless `tz` is given.
# NOTE(review): the source presumably records local time; pass `tz =` here if
# time-zone-aware arithmetic is needed.
dat$DateTime <- mdy_hms(stamp)
dat$Hour <- hour(dat$DateTime)  # 0-23
# rm(stamp, stamp_pattern, date_time)  # recommended to save space, but up to you
# At this point there are still address fields stored as text, but I prefer to use the coordinates instead.
# There are also likely typos and mis-codings in the original data, but you will find those through analysis.
# Have fun!!!
# Quick visual check: histogram of the violation dates, binned first by
# calendar month and then by week.
for (bin_width in c("months", "weeks")) {
  hist(dat$Date, breaks = bin_width)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment