# lmcstro/location_analysis_s1.r

## location_analysis_s1.r
# Section 1. Load the data from journey file

# this section takes in the dataframes (reads them in first from CSV files)

# List the journey summary files in the working directory.
# `pattern` is a regular expression, not a shell glob: the dot is escaped and
# the end is anchored so only names ending in "_journey.csv" are selected.
l.journeys <- list.files(pattern = "_journey\\.csv$")

# select a particular car and day to examine
x.journey.number <- 4

# Fail early with a clear message when the requested file does not exist;
# indexing past the end of l.journeys would hand NA to read.csv().
if (x.journey.number > length(l.journeys)) {
  stop("journey file ", x.journey.number, " requested but only ",
       length(l.journeys), " '*_journey.csv' file(s) found in ", getwd(),
       call. = FALSE)
}

# Read the selected file, keeping text columns (e.g. day) as character.
df.x <- read.csv(l.journeys[x.journey.number],
                 stringsAsFactors = FALSE)
#
# > str(df.x)
#'data.frame':	700 obs. of  6 variables:
#  $ from            : int  1 1 1 1 1 1 1 1 1 1 ...
#$ to              : int  1 1 1 1 1 1 1 2 2 2 ...
#$ day             : chr  "Monday" "Tuesday" "Wednesday" "Thursday" ...
#$ journeys        : int  0 0 0 0 0 0 0 0 1 1 ...
#$ journey.days    : int  0 0 0 0 0 0 0 0 1 1 ...
#$ non.journey.days: int  6 6 6 6 6 6 6 6 5 5 ...
#> head(df.x)
#from to       day journeys journey.days non.journey.days
#1    1  1    Monday        0            0                6
#2    1  1   Tuesday        0            0                6
#3    1  1 Wednesday        0            0                6
#4    1  1  Thursday        0            0                6
#5    1  1    Friday        0            0                6
#6    1  1  Saturday        0            0                6
#

# Largest cluster number appearing as either a journey origin or destination.
x.max.cluster <- max(df.x$from, df.x$to)


# Weekday labels in calendar order (position 1 = Monday, position 7 = Sunday).
v.days <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)

# Section 2. Journey Count

# Build a 3-d array of journey counts used to calculate frequency and
# probability: dimension 1 is the "from" place, dimension 2 the "to" place
# and dimension 3 the day of week (in the order of v.days).
journey.count.x <- local({
  # Every cell starts at 0, so a from/to/day combination that has no row in
  # df.x simply stays at 0 instead of aborting the script.
  counts <- array(0, dim = c(x.max.cluster, x.max.cluster, length(v.days)))

  # Position of each row's day label within v.days (NA if not a known day).
  day.index <- match(df.x$day, v.days)

  # Rows that can be placed in the array: known day and place numbers >= 1.
  # which() also discards rows where the test itself is NA.
  usable <- which(!is.na(day.index) & df.x$from >= 1 & df.x$to >= 1)

  # One vectorised assignment replaces the triple loop: each row of the index
  # matrix is a (from, to, day) coordinate paired with that row's count.
  counts[cbind(df.x$from[usable], df.x$to[usable], day.index[usable])] <-
    df.x$journeys[usable]

  counts
})

# Label all three dimensions of the array in a single assignment:
#   dimension 1 (rows)    = "from" place for a journey
#   dimension 2 (columns) = "to" place for a journey
#   dimension 3 (planes)  = day of week the journey occurred
dimnames(journey.count.x) <- list(
  paste0("From.", seq_len(nrow(journey.count.x))),
  paste0("To.", seq_len(ncol(journey.count.x))),
  v.days
)


# Example query: find which from/to/day combinations have more than 5
# journeys. With arr.ind = TRUE each result row is one matching cell, giving
# its index along dim 1 (row, "from"), dim 2 (column, "to") and dim 3 (day).
# The argument is spelled out in full rather than relying on partial matching.
which(journey.count.x > 5, arr.ind = TRUE)


# Section 3. Days of travel count
#
# Build a 3-d array to examine predictability of travel, using the count of
# days where at least 1 journey occurred between 2 locations.
# Dimension 1 is the "from" place, dimension 2 the "to" place and
# dimension 3 the day of week (in the order of v.days).
pred.journey.x <- local({
  # Every cell starts at 0, so a from/to/day combination that has no row in
  # df.x simply stays at 0 instead of aborting the script.
  day.counts <- array(0,
                      dim = c(x.max.cluster, x.max.cluster, length(v.days)))

  # Position of each row's day label within v.days (NA if not a known day).
  day.index <- match(df.x$day, v.days)

  # Rows that can be placed in the array: known day and place numbers >= 1.
  # which() also discards rows where the test itself is NA.
  usable <- which(!is.na(day.index) & df.x$from >= 1 & df.x$to >= 1)

  # One vectorised assignment replaces the triple loop: each row of the index
  # matrix is a (from, to, day) coordinate paired with that row's day count.
  day.counts[cbind(df.x$from[usable], df.x$to[usable], day.index[usable])] <-
    df.x$journey.days[usable]

  day.counts
})

# Label all three dimensions of the array in a single assignment:
#   dimension 1 (rows)    = "from" place for a journey
#   dimension 2 (columns) = "to" place for a journey
#   dimension 3 (planes)  = day of week the journey occurred
dimnames(pred.journey.x) <- list(
  paste0("From.", seq_len(nrow(pred.journey.x))),
  paste0("To.", seq_len(ncol(pred.journey.x))),
  v.days
)

# Example queries on the array to get insights on journeys.
# With arr.ind = TRUE each result row is one matching cell:
# dim 1 is row ("from"), dim 2 is column ("to"), dim 3 is day.
# The argument is spelled out in full rather than relying on partial matching.
which(pred.journey.x > 5, arr.ind = TRUE) # journeys that occurred on more than 5 days (6 out of 6)
which(pred.journey.x > 4, arr.ind = TRUE) # journeys that occurred on more than 4 days (at least 5 out of 6)
which(pred.journey.x > 3, arr.ind = TRUE) # journeys that occurred on more than 3 days (at least 4 out of 6)
	# Section 1. Load the data from journey file

	# this section takes in the dataframes (reads them in first from CSV files)

	# List the journey summary files in the working directory.
	# `pattern` is a regular expression, not a shell glob: the dot is escaped and
	# the end is anchored so only names ending in "_journey.csv" are selected.
	l.journeys <- list.files(pattern = "_journey\\.csv$")

	# select a particular car and day to examine
	x.journey.number <- 4

	# Fail early with a clear message when the requested file does not exist;
	# indexing past the end of l.journeys would hand NA to read.csv().
	if (x.journey.number > length(l.journeys)) {
	  stop("journey file ", x.journey.number, " requested but only ",
	       length(l.journeys), " '*_journey.csv' file(s) found in ", getwd(),
	       call. = FALSE)
	}

	# Read the selected file, keeping text columns (e.g. day) as character.
	df.x <- read.csv(l.journeys[x.journey.number],
	                 stringsAsFactors = FALSE)
	#
	# > str(df.x)
	#'data.frame': 700 obs. of 6 variables:
	# $ from : int 1 1 1 1 1 1 1 1 1 1 ...
	#$ to : int 1 1 1 1 1 1 1 2 2 2 ...
	#$ day : chr "Monday" "Tuesday" "Wednesday" "Thursday" ...
	#$ journeys : int 0 0 0 0 0 0 0 0 1 1 ...
	#$ journey.days : int 0 0 0 0 0 0 0 0 1 1 ...
	#$ non.journey.days: int 6 6 6 6 6 6 6 6 5 5 ...
	#> head(df.x)
	#from to day journeys journey.days non.journey.days
	#1 1 1 Monday 0 0 6
	#2 1 1 Tuesday 0 0 6
	#3 1 1 Wednesday 0 0 6
	#4 1 1 Thursday 0 0 6
	#5 1 1 Friday 0 0 6
	#6 1 1 Saturday 0 0 6
	#

	# Largest cluster number appearing as either a journey origin or destination.
	x.max.cluster <- max(df.x$from, df.x$to)


	# Weekday labels in calendar order (position 1 = Monday, position 7 = Sunday).
	v.days <- c(
	  "Monday", "Tuesday", "Wednesday", "Thursday",
	  "Friday", "Saturday", "Sunday"
	)

	# Section 2. Journey Count

	# Build a 3-d array of journey counts used to calculate frequency and
	# probability: dimension 1 is the "from" place, dimension 2 the "to" place
	# and dimension 3 the day of week (in the order of v.days).
	# (The previous size expression had lost its `*` operators and did not parse.)
	journey.count.x <- local({
	  # Every cell starts at 0, so a from/to/day combination that has no row in
	  # df.x simply stays at 0 instead of aborting the script.
	  counts <- array(0, dim = c(x.max.cluster, x.max.cluster, length(v.days)))

	  # Position of each row's day label within v.days (NA if not a known day).
	  day.index <- match(df.x$day, v.days)

	  # Rows that can be placed in the array: known day and place numbers >= 1.
	  # which() also discards rows where the test itself is NA.
	  usable <- which(!is.na(day.index) & df.x$from >= 1 & df.x$to >= 1)

	  # One vectorised assignment replaces the triple loop: each row of the index
	  # matrix is a (from, to, day) coordinate paired with that row's count.
	  counts[cbind(df.x$from[usable], df.x$to[usable], day.index[usable])] <-
	    df.x$journeys[usable]

	  counts
	})

	# Label all three dimensions of the array in a single assignment:
	#   dimension 1 (rows)    = "from" place for a journey
	#   dimension 2 (columns) = "to" place for a journey
	#   dimension 3 (planes)  = day of week the journey occurred
	dimnames(journey.count.x) <- list(
	  paste0("From.", seq_len(nrow(journey.count.x))),
	  paste0("To.", seq_len(ncol(journey.count.x))),
	  v.days
	)


	# Example query: find which from/to/day combinations have more than 5
	# journeys. With arr.ind = TRUE each result row is one matching cell, giving
	# its index along dim 1 (row, "from"), dim 2 (column, "to") and dim 3 (day).
	# The argument is spelled out in full rather than relying on partial matching.
	which(journey.count.x > 5, arr.ind = TRUE)


	# Section 3. Days of travel count
	#
	# Build a 3-d array to examine predictability of travel, using the count of
	# days where at least 1 journey occurred between 2 locations.
	# Dimension 1 is the "from" place, dimension 2 the "to" place and
	# dimension 3 the day of week (in the order of v.days).
	# (The previous size expression had lost its `*` operators and did not parse.)
	pred.journey.x <- local({
	  # Every cell starts at 0, so a from/to/day combination that has no row in
	  # df.x simply stays at 0 instead of aborting the script.
	  day.counts <- array(0,
	                      dim = c(x.max.cluster, x.max.cluster, length(v.days)))

	  # Position of each row's day label within v.days (NA if not a known day).
	  day.index <- match(df.x$day, v.days)

	  # Rows that can be placed in the array: known day and place numbers >= 1.
	  # which() also discards rows where the test itself is NA.
	  usable <- which(!is.na(day.index) & df.x$from >= 1 & df.x$to >= 1)

	  # One vectorised assignment replaces the triple loop: each row of the index
	  # matrix is a (from, to, day) coordinate paired with that row's day count.
	  day.counts[cbind(df.x$from[usable], df.x$to[usable], day.index[usable])] <-
	    df.x$journey.days[usable]

	  day.counts
	})

	# Label all three dimensions of the array in a single assignment:
	#   dimension 1 (rows)    = "from" place for a journey
	#   dimension 2 (columns) = "to" place for a journey
	#   dimension 3 (planes)  = day of week the journey occurred
	dimnames(pred.journey.x) <- list(
	  paste0("From.", seq_len(nrow(pred.journey.x))),
	  paste0("To.", seq_len(ncol(pred.journey.x))),
	  v.days
	)

	# Example queries on the array to get insights on journeys.
	# With arr.ind = TRUE each result row is one matching cell:
	# dim 1 is row ("from"), dim 2 is column ("to"), dim 3 is day.
	# The argument is spelled out in full rather than relying on partial matching.
	which(pred.journey.x > 5, arr.ind = TRUE) # journeys that occurred on more than 5 days (6 out of 6)
	which(pred.journey.x > 4, arr.ind = TRUE) # journeys that occurred on more than 4 days (at least 5 out of 6)
	which(pred.journey.x > 3, arr.ind = TRUE) # journeys that occurred on more than 3 days (at least 4 out of 6)