Tom Hopper tomhopper

## align_common_baseline.R
# Response to a post at Storytelling with Data:
# \url{http://www.storytellingwithdata.com/blog/2015/07/align-against-common-baseline}
# Demonstrates
#  * Cleveland-style dot plots (improvement over pie and bar charts)
#  * Sorting categorical data by a numerical variable with more than one grouping variable
#  * Highlighting differences between groups graphically

library(ggplot2)
library(scales)

## sort_factors.R
#' @title Sorting data frame factor levels for ggplot2
#' @description Sorting a factor variable by a numeric variable.
#'    In one case, each factor level is matched to one numeric value.
#'    In the other case, each factor level is repeated across a second
#'    grouping factor variable, and we want to sort only the
library(dplyr)
library(tidyr)
library(ggplot2)

# Sort a factor variable by a numeric variable

## .Rprofile
## For original file showing use of .env to add functions invisibly, see
## \link{http://gettinggeneticsdone.blogspot.com/2013/06/customize-rprofile.html}

## Load packages
#library(BiocInstaller)

## Don't show those silly significance stars
#options(show.signif.stars=FALSE)

## Do you want to automatically convert strings to factor variables in a data.frame?

## random_word_vector.R
# Create 2 replicates of 5 "words" generated from random characters.
# Each "word" is 5 - 15 characters long: its length is 5 plus a single
# Poisson (lambda = 3) draw, capped at 15 so that the documented range
# always holds and sample() is never asked for more distinct letters than
# exist. One length is drawn per word; there is no need to generate
# thousands of candidate lengths when only one can be used as `size`.
rep(
  replicate(5, {
    word_length <- min(rpois(1, lambda = 3) + 5, 15)
    paste(sample(letters, size = word_length, replace = FALSE), collapse = "")
  }),
  2
)

# Sample output:
# [1] "rfexnwyjst" "vwtadhjnly" "ztfgvldo"   "tmerol"     "mcqhosap"   "rfexnwyjst" "vwtadhjnly" "ztfgvldo"   "tmerol"
#[10] "mcqhosap"

## ggplot2_xkcd_Humor_Sans.R
# The xkcd font used by the package xkcd (which provides a theme for ggplot2)
# is missing many characters and some characters don't seem to display correctly.
# An alternate xkcd-style font is Humor Sans, available free from
# \url{http://antiyawn.com/uploads/humorsans.html}
# The code below forces the use of Humor Sans instead of xkcd.
# The xkcd and ggplot2 packages are available from CRAN.

library(ggplot2)
library(xkcd)

## find_and_delete.sh
find . -name '.filename' -print -exec rm -r {} \;

# . = in current directory
# -name = file name to find
# -print = print the result's full file name to standard output
# -exec = execute the following command
# {} = placeholder that find replaces with the path of each matching file
# \; = semicolon to terminate the -exec command, preceded by the escape
#      character so that the shell doesn't treat the semicolon as a
#      command separator (used for stringing together multiple commands).

## rnorm.r
#' @title Returns a normally distributed vector within the 99.7% tolerance interval defined by minimum and maximum
#' @param n (required) The number of random numbers to generate
#' @param minimum (optional) The lower 99.7% tolerance limit
#' @param maximum (optional) The upper 99.7% tolerance limit
#' @return numeric vector with n elements randomly distributed so that approximately 99.7% of values will fall between minimum and maximum
#' @examples
#'  rnorm.within(10)
#'  rnorm.within(10, 10, 20)
#'  summary(rnorm.within(10000, 10, 20))
rnorm.within <- function(n, minimum=0, maximum=1)

## facet_labelling.R
#' Data frame column names are rarely human-readable, concise and clear, but are usually meaningful. Rather
#' than trying to modify the data, we can provide custom labels for facets.
library(data.table)
library(lubridate)
library(reshape2)
library(ggplot2)

#' Download raw data from "Weather Data" at \link{http://datamonitoring.marec.gvsu.edu/DataDownload.aspx},
#' rename the file to "Marec_weather.csv" and save it to /data/ in the current working directory.

## plot_aligned_series.R
#' When plotting multiple data series that share a common x axis but different y axes,
#' we can just plot each graph separately. This suffers from the drawback that the shared axis will typically
#' not align across graphs due to different plot margins.
#' One easy solution is to reshape2::melt() the data and use ggplot2's facet_grid() mapping. However, there is
#' no way to label individual y axes.
#' facet_grid() and facet_wrap() were designed to plot small multiples, where both x- and y-axis ranges are
#' shared across all plots in the faceting. While the facet_ calls allow us to use different scales with
#' the \code{scales = "free"} argument, they should not be used this way.
#' A more robust approach is to use grid.draw() from the grid package together with rbind() and ggplotGrob() to create a grid of
#' individual plots where the plot axes are properly aligned within the grid.

## dt_merge_nodups.R
library(data.table)

# See \link{http://stackoverflow.com/questions/11792527/filtering-out-duplicated-non-unique-rows-in-data-table}
# for a discussion of how to eliminate duplicate rows.
# The problem is that the \code{unique()} function will use a key, if it exists. We need to
# eliminate the key.

# Build two single-column tables, each holding 15 letters drawn at random
# without replacement.
temp1 <- data.table(letters[sample.int(length(letters), size = 15)])
temp2 <- data.table(letters[sample.int(length(letters), size = 15)])
	# Response to a post at Storytelling with Data:
	# \url{http://www.storytellingwithdata.com/blog/2015/07/align-against-common-baseline}
	# Demonstrates
	# * Cleveland-style dot plots (improvement over pie and bar charts)
	# * Sorting categorical data by a numerical variable with more than one grouping variable
	# * Highlighting differences between groups graphically

	library(ggplot2)
	library(scales)
	#' @title Sorting data frame factor levels for ggplot2
	#' @description Sorting a factor variable by a numeric variable.
	#' In one case, each factor level is matched to one numeric value.
	#' In the other case, each factor level is repeated across a second
	#' grouping factor variable, and we want to sort only the
	library(dplyr)
	library(tidyr)
	library(ggplot2)

	# Sort a factor variable by a numeric variable
	## For original file showing use of .env to add functions invisibly, see
	## \link{http://gettinggeneticsdone.blogspot.com/2013/06/customize-rprofile.html}

	## Load packages
	#library(BiocInstaller)

	## Don't show those silly significance stars
	#options(show.signif.stars=FALSE)

	## Do you want to automatically convert strings to factor variables in a data.frame?
	# Create 2 replicates of 5 "words" generated from random characters.
	# Each "word" is 5 - 15 characters long: its length is 5 plus a single
	# Poisson (lambda = 3) draw, capped at 15 so that the documented range
	# always holds and sample() is never asked for more distinct letters than
	# exist. One length is drawn per word; there is no need to generate
	# thousands of candidate lengths when only one can be used as `size`.
	rep(
	  replicate(5, {
	    word_length <- min(rpois(1, lambda = 3) + 5, 15)
	    paste(sample(letters, size = word_length, replace = FALSE), collapse = "")
	  }),
	  2
	)

	# Sample output:
	# [1] "rfexnwyjst" "vwtadhjnly" "ztfgvldo" "tmerol" "mcqhosap" "rfexnwyjst" "vwtadhjnly" "ztfgvldo" "tmerol"
	#[10] "mcqhosap"
	# The xkcd font used by the package xkcd (which provides a theme for ggplot2)
	# is missing many characters and some characters don't seem to display correctly.
	# An alternate xkcd-style font is Humor Sans, available free from
	# \url{http://antiyawn.com/uploads/humorsans.html}
	# The code below forces the use of Humor Sans instead of xkcd.
	# The xkcd and ggplot2 packages are available from CRAN.

	library(ggplot2)
	library(xkcd)
	find . -name '.filename' -print -exec rm -r {} \;

	# . = in current directory
	# -name = file name to find
	# -print = print the result's full file name to standard output
	# -exec = execute the following command
	# {} = placeholder that find replaces with the path of each matching file
	# \; = semicolon to terminate the -exec command, preceded by the escape
	# character so that the shell doesn't treat the semicolon as a
	# command separator (used for stringing together multiple commands).
	#' @title Returns a normally distributed vector within the 99.7% tolerance interval defined by minimum and maximum
	#' @param n (required) The number of random numbers to generate
	#' @param minimum (optional) The lower 99.7% tolerance limit
	#' @param maximum (optional) The upper 99.7% tolerance limit
	#' @return numeric vector with n elements randomly distributed so that approximately 99.7% of values will fall between minimum and maximum
	#' @examples
	#' rnorm.within(10)
	#' rnorm.within(10, 10, 20)
	#' summary(rnorm.within(10000, 10, 20))
	rnorm.within <- function(n, minimum=0, maximum=1)
	#' Data frame column names are rarely human-readable, concise and clear, but are usually meaningful. Rather
	#' than trying to modify the data, we can provide custom labels for facets.
	library(data.table)
	library(lubridate)
	library(reshape2)
	library(ggplot2)

	#' Download raw data from "Weather Data" at \link{http://datamonitoring.marec.gvsu.edu/DataDownload.aspx},
	#' rename the file to "Marec_weather.csv" and save it to /data/ in the current working directory.
	#' When plotting multiple data series that share a common x axis but different y axes,
	#' we can just plot each graph separately. This suffers from the drawback that the shared axis will typically
	#' not align across graphs due to different plot margins.
	#' One easy solution is to reshape2::melt() the data and use ggplot2's facet_grid() mapping. However, there is
	#' no way to label individual y axes.
	#' facet_grid() and facet_wrap() were designed to plot small multiples, where both x- and y-axis ranges are
	#' shared across all plots in the faceting. While the facet_ calls allow us to use different scales with
	#' the \code{scales = "free"} argument, they should not be used this way.
	#' A more robust approach is to use grid.draw() from the grid package together with rbind() and ggplotGrob() to create a grid of
	#' individual plots where the plot axes are properly aligned within the grid.
	library(data.table)

	# See \link{http://stackoverflow.com/questions/11792527/filtering-out-duplicated-non-unique-rows-in-data-table}
	# for a discussion of how to eliminate duplicate rows.
	# The problem is that the \code{unique()} function will use a key, if it exists. We need to
	# eliminate the key.

	# Build two single-column tables, each holding 15 letters drawn at random
	# without replacement.
	temp1 <- data.table(letters[sample.int(length(letters), size = 15)])
	temp2 <- data.table(letters[sample.int(length(letters), size = 15)])