# ronrest/crime_data_reformatter.r

## crime_data_reformatter.r
#===============================================================================
#                                                                       TOY DATA
#===============================================================================
# Build a one-row toy data frame. Each column name is a crime category
# followed by a year range; the single row, labelled "count", holds the values.
df <- data.frame(
    total_crime_2015_2016 = 674,
    total_crime_2014_2015 = 323,
    total_crime_2013_2014 = 212,
    car_theft_2015_2016 = 34,
    car_theft_2014_2015 = 45,
    car_theft_2013_2014 = 74
)
rownames(df) <- "count"


#===============================================================================
#                                                                       GET_YEAR
#===============================================================================
#' get_year
#'
#' Extracts the trailing year range from a label. Given a string such as:
#'
#'      "total_crime_2013_2014"
#'
#' the result is only the final "YYYY_YYYY" portion, eg:
#'
#'      "2013_2014"
#'
#' A string that does not end in a year range is returned unchanged.
#'
#' @param s (string) the string to process
#' @return (string) the year range found at the end of `s`
get_year <- function(s){
    # The pattern, piece by piece:
    #  .*                   = greedily consumes everything that precedes the
    #                         year range
    #  ()                   = capture group 1; whatever it matches can be
    #                         referred to in the replacement
    #  [0-9]{4}_[0-9]{4}    = exactly 4 digits, an underscore, then exactly
    #                         4 more digits
    #  $                    = the year range must sit at the very end of the
    #                         string
    year_range_at_end <- ".*([0-9]{4}_[0-9]{4})$"

    # "\\1" is a back-reference to capture group 1, so the whole matched text
    # is replaced by just the year range. Because of the `$` anchor there can
    # be at most one match per string, so a single substitution is enough.
    sub(year_range_at_end, "\\1", s)
}


#===============================================================================
#                                                    Extract Columns of Interest
#===============================================================================
# Extract the columns whose names satisfy some regular expression pattern.
# eg:
#    ^total_crime_[0-9]{4} tells us that we are :
#      - looking for names that START with "total_crime_"
#      - which are then followed by 4 digits (the first year of the range)
# This should be enough to uniquely pick out the relevant fields without overlap
#
# drop = FALSE keeps the selection a data frame even when only one column
# matches. Without it a single match collapses to a bare vector and the
# row.names assignment that follows fails.

total_crime <- df[, grep("^total_crime_[0-9]{4}", names(df),
                         ignore.case = FALSE), drop = FALSE]
row.names(total_crime) <- "total_crime"

car_theft <- df[, grep("^car_theft_[0-9]{4}", names(df),
                       ignore.case = FALSE), drop = FALSE]
row.names(car_theft) <- "car_theft"

#===============================================================================
#                                                        Rename the Column Names
#===============================================================================
# Rename the columns so that they only contain the year range.
# get_year substitutes over a whole character vector at once, so it is applied
# directly to the names instead of element by element through sapply (whose
# return type changes with its input).
names(total_crime) <- get_year(names(total_crime))
names(car_theft) <- get_year(names(car_theft))

#===============================================================================
#                                                       Create the new DataFrame
#===============================================================================
# Stack the two single-row data frames into one table: one row per crime
# category, one column per year range.
new_df <- rbind(total_crime, car_theft)


# The resulting dataframe looks like this
#             2015_2016 2014_2015 2013_2014
# total_crime       674       323       212
# car_theft          34        45        74
	#===============================================================================
	# TOY DATA
	#===============================================================================
	# Build a one-row toy data frame. Each column name is a crime category
	# followed by a year range; the single row, labelled "count", holds the values.
	df <- data.frame(
		total_crime_2015_2016 = 674,
		total_crime_2014_2015 = 323,
		total_crime_2013_2014 = 212,
		car_theft_2015_2016 = 34,
		car_theft_2014_2015 = 45,
		car_theft_2013_2014 = 74
	)
	rownames(df) <- "count"


	#===============================================================================
	# GET_YEAR
	#===============================================================================
	#' get_year
	#'
	#' Extracts the trailing year range from a label. Given a string such as:
	#'
	#'      "total_crime_2013_2014"
	#'
	#' the result is only the final "YYYY_YYYY" portion, eg:
	#'
	#'      "2013_2014"
	#'
	#' A string that does not end in a year range is returned unchanged.
	#'
	#' @param s (string) the string to process
	#' @return (string) the year range found at the end of `s`
	get_year <- function(s){
		# The pattern, piece by piece:
		#  .*                   = greedily consumes everything that precedes the
		#                         year range
		#  ()                   = capture group 1; whatever it matches can be
		#                         referred to in the replacement
		#  [0-9]{4}_[0-9]{4}    = exactly 4 digits, an underscore, then exactly
		#                         4 more digits
		#  $                    = the year range must sit at the very end of the
		#                         string
		year_range_at_end <- ".*([0-9]{4}_[0-9]{4})$"

		# "\\1" is a back-reference to capture group 1, so the whole matched
		# text is replaced by just the year range. Because of the `$` anchor
		# there can be at most one match per string, so a single substitution
		# is enough.
		sub(year_range_at_end, "\\1", s)
	}




	#===============================================================================
	# Extract Columns of Interest
	#===============================================================================
	# Extract the columns whose names satisfy some regular expression pattern.
	# eg:
	#    ^total_crime_[0-9]{4} tells us that we are :
	#      - looking for names that START with "total_crime_"
	#      - which are then followed by 4 digits (the first year of the range)
	# This should be enough to uniquely pick out the relevant fields without overlap
	#
	# drop = FALSE keeps the selection a data frame even when only one column
	# matches. Without it a single match collapses to a bare vector and the
	# row.names assignment that follows fails.

	total_crime <- df[, grep("^total_crime_[0-9]{4}", names(df),
	                         ignore.case = FALSE), drop = FALSE]
	row.names(total_crime) <- "total_crime"

	car_theft <- df[, grep("^car_theft_[0-9]{4}", names(df),
	                       ignore.case = FALSE), drop = FALSE]
	row.names(car_theft) <- "car_theft"

	#===============================================================================
	# Rename the Column Names
	#===============================================================================
	# Rename the columns so that they only contain the year range.
	# get_year substitutes over a whole character vector at once, so it is applied
	# directly to the names instead of element by element through sapply (whose
	# return type changes with its input).
	names(total_crime) <- get_year(names(total_crime))
	names(car_theft) <- get_year(names(car_theft))

	#===============================================================================
	# Create the new DataFrame
	#===============================================================================
	# Stack the two single-row data frames into one table: one row per crime
	# category, one column per year range.
	new_df <- rbind(total_crime, car_theft)


	# The resulting dataframe looks like this
	#             2015_2016 2014_2015 2013_2014
	# total_crime       674       323       212
	# car_theft          34        45        74