Nimster/gist:2838511

## gistfile1.r
### Most of the summary is taken from the awesome R twotorials at http://www.twotorials.com/ by Anthony Damico
### Some of it is my own additions from experience. It is intended so you can Ctrl+F and find what you want using
### common names of functions and concepts from other languages or statistics.

### Troubleshooting: Search http://tolstoy.newcastle.edu.au/R/ , http://www.r-bloggers.com/, http://www.rseek.org/

### Basics
traceback() # Get the call stack after an error, for debugging
32 %% 2 # == 0 mod operator
5 %/% 3 # == 1 integer division
options(digits = 22) # Print up to 22 significant digits (the maximum R accepts)
pi # 3.14...
Inf # Infinity
factorial(4) # 4! == 24
z <- sqrt(16:20) # square root, variable assignment, ranges (sequences) - 16:20 becomes 16,17,18,19,20.
4:1 # == c(4,3,2,1).
z <- c(1, 2) + 5 # == c(6, 7): create a vector, add a number to both elements of the vector
z <- c(1, 2, 3, 4) + c(1, 2) # == c(2,4,4,6) : Automatically repeat (recycle) the shorter vector
df <- data.frame(col1 = c(1, 2, 3), col2 = 3:5) # Create a dataframe
nrow(df); ncol(df) # size of the data frame (number of rows and columns). dim(df) returns both at once
nchar("a string") # == 8, length of string (number of characters)
length(1:8) # == 8, length of a vector (number of elements)
rownames(df); colnames(df) # Also assignable, one name per row: rownames(df) <- c("row1", "row2", "row3")
df[df$col1 == 2, 1:2] # Index the rows where col1 == 2 (here the second row), and the first two columns.
df[, -4] # Returns df without its 4th column; assign the result (df <- df[, -4]) to actually drop it
df[, "unwanted"] <- NULL # Remove the column named "unwanted" in place
z <- list(1:3, c("a", "b")) # Create a list with two objects, c(1,2,3) and c("a","b")
z[[2]] # Access the second member of the list z
4 %in% c(3, 4, 5) # TRUE
ls() # See all of the defined variables in the environment. You can also specify the environment
rm(y) # Delete the 'y' variable from the environment. Run gc() to garbage collect and free the memory
assign("something", val) # equivalent to something <- val. A kind of reflection
get("something") # retrieving the 'something' variable.
source("script.R") # Execute the script.R file
matrix(NA, nrow = 4, ncol = 4) # Create a 4x4 matrix. Matrix multiplication is %*%. Put the data where NA is
ts(x, start = 1960, frequency = 12) # Create an equispaced time-series vector from x. See as.ts as well
ISOdatetime(1970, 1, 1, 0, 0, 0, tz = "UTC") + 1241204120 # Convert seconds since the epoch to time. The epoch is
                                                          # midnight UTC, so anchor it in UTC (not a local zone)

### Control structures and basic language stuff
if (!(TRUE && (FALSE || TRUE))) {
  # Not reached: the condition evaluates to FALSE.
} else {
  # We will get here. && and || are the scalar, short-circuiting operators meant for
  # if/while conditions; the single & and | work elementwise on whole vectors.
  # T and F are shorthands for TRUE and FALSE, but they are ordinary variables that
  # can be reassigned, so spell the constants out.
}

for (i in 1:3) {
  # Happens 3 times
  next # Like continue, starts the next iteration
}

while (i < 18) {
  break # While loops. Break out of a loop. repeat { } is an infinite loop
}

# "exception" handling (try and catch) and ignoring errors:
# Note: 12 / 0 is Inf in R, not an error, so stop() is used here to raise a real error.
result <- try({ stop("something went wrong") }, silent = TRUE) # silent = TRUE: the error message is not printed
inherits(result, "try-error") # TRUE: there was an exception. In general class(x) returns the type of an object
# To react to specific conditions use tryCatch(expr, error = function(e) ...).

# Define a function. Parameters may carry default values (c defaults to 15).
# The value of the last evaluated expression is what the function returns,
# so this one always returns 14.
myfunc <- function(a, b, c = 15) {
  14
}

### Basic Functions
seq(from = 0, to = 3, by = 0.5) # gives 0,0.5,1,1.5,2,2.5,3. Also length.out = 7 instead of by
rep(1:3, 2) # == c(1,2,3,1,2,3) repeat the vector from the beginning N times, here N=2.
as.numeric(x); as.logical(x); as.character(x) # Convert (cast) types. as.logical(0) is FALSE.
is.na(c(1, NA, 3)) # == c(FALSE, TRUE, FALSE) whether a value is missing or not
ifelse(c(TRUE, FALSE), "true_case", "false_case") # == c("true_case", "false_case") (do an if-else on each member)
outer(1:2, 3:4, FUN = "*") # returns a matrix of applying FUN to each of the outer (cartesian) product elements.
                           # provide additional parameters to FUN after that parameter.
round(x, digits = 2) # also floor(), ceiling()
Sys.time() # Current time

### String functions
gsub("regex", "gerex", y) # Replace every match of a regular expression. sub() replaces only the first match
grep("el", c("hello", "elbow", "world")) # == c(1,2): indices of the elements that contain the pattern
grepl("el", c("hello", "elbow", "world")) # == c(TRUE,TRUE,FALSE): the same search as a logical vector
paste("a", c("b", "c"), sep = "") # == c("ab", "ac") (concatenate strings). Add collapse="X" to join it all into one string
install.packages("stringr")
library(stringr) # And then you get access to:
str_trim("  as  ", side = "left") # == "as  ": remove leading/trailing whitespace.
strsplit(c("a-b", "c-d"), split = "-") # Split each string; returns a list with the pieces. split= accepts regexes
strptime(string, "%d-%m-%Y") # Parse a string with the given format into a date-time object you can add seconds to.

### Input / Output
inData <- read.csv(
  "inputData.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = c("", "999", "—-", "MISS")
) # The na.strings values are read as NA. Also quote = "" to disable quote handling
setwd("path/to/dir") # cd to another directory
# Read/Write Microsoft Excel spreadsheet files:
install.packages("gdata")
library(gdata)
read.xls("file.xlsx", sheet = 3) # Also sheet = "named sheet". Works for XLS or XLSX
# Alternatively
install.packages("xlsx")
library(xlsx)
write.xlsx(df, "file.xlsx", sheetName = "mysheet") # Keep writing to other sheets with append = TRUE
read.xlsx("file.xlsx", sheetIndex = 1) # or select the sheet by its name (see ?read.xlsx)
write.csv(df, "filename.csv") # Write a data frame as CSV
save(df1, df2, file = "filename.RData") # Save several objects. Prefer saveRDS(df, "name") for a single item
load("filename.RData") # And load them back

### Statistics and Probability
mean(x); median(x); range(x); sd(x); var(x) # average, median and c(min, max) (also use max(), min()),
                                            # standard deviation and variance. All accept na.rm = TRUE
summary(x) # Min, max, mean, median and 25,75 quantiles
cor(x, y, method = "pearson") # Pearson product correlation coefficient between x and y. Or just do cor(df) for a whole dataframe
quantile(x, probs = seq(0, 1, by = 0.05)) # all percentiles 0%, 5%, 10%, ..., up to 100%
ftable(df[, c("col1", "col2", "col3")]) # crosstabs (number of appearances) of values of col1, col2, col3
                                        # table also works, and you can useNA="always" to show NA values
unique(c(1, 2, 1, 2, 3, 4)) # == c(1,2,3,4) : every distinct value once, duplicates dropped. For the values that
                            # appear only once (here 3 and 4): v[!(duplicated(v) | duplicated(v, fromLast = TRUE))]
tapply(3:6, c(1, 2, 1, 2), mean) # returns a table where 1 => 4, 2 => 5. In general,
                                 # tapply(data, grouped_by, FUN, extra_params...) to stratify FUN on the data over the groups
aggregate(col1 ~ col2, df, quantile, probs = 0.9) # find the 90th percentile of col1 stratified (grouped) by col2 in df
sample(vec, 5, prob = c(0.1, 0.3, 0.1, 0.5), replace = TRUE) # Choose 5 items from the vector at random according to the given
                                                             # probabilities (one weight per element of vec), with replacement.
                                                             # Without prob, sampling is uniform
set.seed(15) # set the seed for generating random numbers
runif(3, min = -1, max = 3) # 3 Uniformly distributed numbers on [-1, 3]. Also see rnorm(), rbinom(), etc.
l <- lm(x ~ y + z) # Create a linear regression model. Pass data = df to take the columns from a dataframe
residuals(l) # The residuals of the linear model
summary(l) # a summary of the model's goodness of fit. Also plot(l) to see a bunch of plots about the model
table(table(x)) # Frequency histogram: How many elements are unique? How many elements appear twice? etc.

### Playing with data
sort(x) # Returns the sorted values
order(x) # Returns the permutation of indices that sorts x, so x[order(x)] is x in sorted order
df[order(df$column3), ] # Sorts the data frame df by its "column3" column
merge(df1, df2, by = "col1") # (inner) join df1 and df2 on column col1 - no NAs are generated. See also
                             # by.x, by.y, all.x=TRUE, all.y=TRUE (or all=TRUE) to control what gets joined and added
cbind(df1, df2) # If df1 and df2 have the same *number* of rows, paste them left to right
rbind(df1, df2) # If df1 and df2 have the same columns, return df1 followed by df2
install.packages("reshape"); library(reshape)
rbind.fill(df1, df2) # Return df1 followed by (on top of) df2, and fill NAs for missing columns
                     # NOTE(review): rbind.fill is also provided by plyr (plyr::rbind.fill) - confirm which
                     # package exports it in your installed versions
cut(1:6, c(1, 3, 5)) # == c(NA,(1,3],(1,3],(3,5],(3,5],NA). Specify a number instead of the breaks to cut to that
                     # many breaks. Specify labels=c("lab1","lab2",...) to name the breaks instead of the default.
rle(c("a", "a", "b", "a", "a")) # == c(2,1,2), c("a","b","a") : The run-length encoding or consecutive chunks (batches)
                                # of groups of values from the input vector

### Case study: Comparing two ranked data frames
# Rows of t1 rearranged to follow the order of column X1 in t2
t1.matched <- t1[match(t2$X1, t1$X1), ]
# Row-by-row difference between the two value columns
diffs <- t1.matched$value - t2$value
# Rows of t2 ordered from the largest absolute difference to the smallest:
t2[order(abs(diffs), decreasing = TRUE), ]
# Same ordering, with the matched t1 values and the diffs shown side by side:
cbind(t2, t1.matched$value, data.frame(diffs))[order(abs(diffs), decreasing = TRUE), ]
# Position in t2 of each item of t1 (where the top items of t1 went in the re-ordering):
match(t1$X1, t2$X1)

### Packages and Documentation
install.packages("packagename") # Download and install a package
library(ggplot2) # Attach an installed package. Unlike require('ggplot2'), which returns FALSE
                 # when the package is missing, library() stops with an error
help.search("functionName") # look up a function by searching the documentation
help("seq") # help page for the seq built-in (same as ?seq). ??seq searches the help instead

### Nifty stuff
install.packages("sqldf")
library(sqldf)
sqldf("select count(*), category from df where column1 > 12 group by category") # Run this SQL against the data frame df
history(max.show = 40) # Show the last 40 lines of the readline history. Also savehistory("filename")
options() # List all options. options(something = val) changes one, e.g. the prompt or display limits
	### Most of the summary is taken from the awesome R twotorials at http://www.twotorials.com/ by Anthony Damico
	### Some of it is my own additions from experience. It is intended so you can Ctrl+F and find what you want using
	### common names of functions and concepts from other languages or statistics.

	### Troubleshooting: Search http://tolstoy.newcastle.edu.au/R/ , http://www.r-bloggers.com/, http://www.rseek.org/

	### Basics
	traceback() # Get the call stack after an error, for debugging
	32 %% 2 # == 0 mod operator
	5 %/% 3 # == 1 integer division
	options(digits = 22) # Print up to 22 significant digits (the maximum R accepts)
	pi # 3.14...
	Inf # Infinity
	factorial(4) # 4! == 24
	z <- sqrt(16:20) # square root, variable assignment, ranges (sequences) - 16:20 becomes 16,17,18,19,20.
	4:1 # == c(4,3,2,1).
	z <- c(1, 2) + 5 # == c(6, 7): create a vector, add a number to both elements of the vector
	z <- c(1, 2, 3, 4) + c(1, 2) # == c(2,4,4,6) : Automatically repeat (recycle) the shorter vector
	df <- data.frame(col1 = c(1, 2, 3), col2 = 3:5) # Create a dataframe
	nrow(df); ncol(df) # size of the data frame (number of rows and columns). dim(df) returns both at once
	nchar("a string") # == 8, length of string (number of characters)
	length(1:8) # == 8, length of a vector (number of elements)
	rownames(df); colnames(df) # Also assignable, one name per row: rownames(df) <- c("row1", "row2", "row3")
	df[df$col1 == 2, 1:2] # Index the rows where col1 == 2 (here the second row), and the first two columns.
	df[, -4] # Returns df without its 4th column; assign the result (df <- df[, -4]) to actually drop it
	df[, "unwanted"] <- NULL # Remove the column named "unwanted" in place
	z <- list(1:3, c("a", "b")) # Create a list with two objects, c(1,2,3) and c("a","b")
	z[[2]] # Access the second member of the list z
	4 %in% c(3, 4, 5) # TRUE
	ls() # See all of the defined variables in the environment. You can also specify the environment
	rm(y) # Delete the 'y' variable from the environment. Run gc() to garbage collect and free the memory
	assign("something", val) # equivalent to something <- val. A kind of reflection
	get("something") # retrieving the 'something' variable.
	source("script.R") # Execute the script.R file
	matrix(NA, nrow = 4, ncol = 4) # Create a 4x4 matrix. Matrix multiplication is %*%. Put the data where NA is
	ts(x, start = 1960, frequency = 12) # Create an equispaced time-series vector from x. See as.ts as well
	ISOdatetime(1970, 1, 1, 0, 0, 0, tz = "UTC") + 1241204120 # Convert seconds since the epoch to time. The epoch is
	# midnight UTC, so anchor it in UTC (not a local zone)

	### Control structures and basic language stuff
	if (!(TRUE && (FALSE || TRUE))) {
	  # Not reached: the condition evaluates to FALSE.
	} else {
	  # We will get here. && and || are the scalar, short-circuiting operators meant for
	  # if/while conditions; the single & and | work elementwise on whole vectors.
	  # T and F are shorthands for TRUE and FALSE, but they are ordinary variables that
	  # can be reassigned, so spell the constants out.
	}

	for (i in 1:3) {
	  # Happens 3 times
	  next # Like continue, starts the next iteration
	}

	while (i < 18) {
	  break # While loops. Break out of a loop. repeat { } is an infinite loop
	}

	# "exception" handling (try and catch) and ignoring errors:
	# Note: 12 / 0 is Inf in R, not an error, so stop() is used here to raise a real error.
	result <- try({ stop("something went wrong") }, silent = TRUE) # silent = TRUE: the error message is not printed
	inherits(result, "try-error") # TRUE: there was an exception. In general class(x) returns the type of an object
	# To react to specific conditions use tryCatch(expr, error = function(e) ...).

	# Define a function. Parameters may carry default values (c defaults to 15).
	# The value of the last evaluated expression is what the function returns,
	# so this one always returns 14.
	myfunc <- function(a, b, c = 15) {
	  14
	}

	### Basic Functions
	seq(from = 0, to = 3, by = 0.5) # gives 0,0.5,1,1.5,2,2.5,3. Also length.out = 7 instead of by
	rep(1:3, 2) # == c(1,2,3,1,2,3) repeat the vector from the beginning N times, here N=2.
	as.numeric(x); as.logical(x); as.character(x) # Convert (cast) types. as.logical(0) is FALSE.
	is.na(c(1, NA, 3)) # == c(FALSE, TRUE, FALSE) whether a value is missing or not
	ifelse(c(TRUE, FALSE), "true_case", "false_case") # == c("true_case", "false_case") (do an if-else on each member)
	outer(1:2, 3:4, FUN = "*") # returns a matrix of applying FUN to each of the outer (cartesian) product elements.
	# provide additional parameters to FUN after that parameter.
	round(x, digits = 2) # also floor(), ceiling()
	Sys.time() # Current time

	### String functions
	gsub("regex", "gerex", y) # Replace every match of a regular expression. sub() replaces only the first match
	grep("el", c("hello", "elbow", "world")) # == c(1,2): indices of the elements that contain the pattern
	grepl("el", c("hello", "elbow", "world")) # == c(TRUE,TRUE,FALSE): the same search as a logical vector
	paste("a", c("b", "c"), sep = "") # == c("ab", "ac") (concatenate strings). Add collapse="X" to join it all into one string
	install.packages("stringr")
	library(stringr) # And then you get access to:
	str_trim(" as ", side = "left") # == "as ": remove leading/trailing whitespace.
	strsplit(c("a-b", "c-d"), split = "-") # Split each string; returns a list with the pieces. split= accepts regexes
	strptime(string, "%d-%m-%Y") # Parse a string with the given format into a date-time object you can add seconds to.

	### Input / Output
	inData <- read.csv(
	  "inputData.csv",
	  header = TRUE,
	  stringsAsFactors = FALSE,
	  na.strings = c("", "999", "—-", "MISS")
	) # The na.strings values are read as NA. Also quote = "" to disable quote handling
	setwd("path/to/dir") # cd to another directory
	# Read/Write Microsoft Excel spreadsheet files:
	install.packages("gdata")
	library(gdata)
	read.xls("file.xlsx", sheet = 3) # Also sheet = "named sheet". Works for XLS or XLSX
	# Alternatively
	install.packages("xlsx")
	library(xlsx)
	write.xlsx(df, "file.xlsx", sheetName = "mysheet") # Keep writing to other sheets with append = TRUE
	read.xlsx("file.xlsx", sheetIndex = 1) # or select the sheet by its name (see ?read.xlsx)
	write.csv(df, "filename.csv") # Write a data frame as CSV
	save(df1, df2, file = "filename.RData") # Save several objects. Prefer saveRDS(df, "name") for a single item
	load("filename.RData") # And load them back

	### Statistics and Probability
	mean(x); median(x); range(x); sd(x); var(x) # average, median and c(min, max) (also use max(), min()),
	# standard deviation and variance. All accept na.rm = TRUE
	summary(x) # Min, max, mean, median and 25,75 quantiles
	cor(x, y, method = "pearson") # Pearson product correlation coefficient between x and y. Or just do cor(df) for a whole dataframe
	quantile(x, probs = seq(0, 1, by = 0.05)) # all percentiles 0%, 5%, 10%, ..., up to 100%
	ftable(df[, c("col1", "col2", "col3")]) # crosstabs (number of appearances) of values of col1, col2, col3
	# table also works, and you can useNA="always" to show NA values
	unique(c(1, 2, 1, 2, 3, 4)) # == c(1,2,3,4) : every distinct value once, duplicates dropped. For the values that
	# appear only once (here 3 and 4): v[!(duplicated(v) | duplicated(v, fromLast = TRUE))]
	tapply(3:6, c(1, 2, 1, 2), mean) # returns a table where 1 => 4, 2 => 5. In general,
	# tapply(data, grouped_by, FUN, extra_params...) to stratify FUN on the data over the groups
	aggregate(col1 ~ col2, df, quantile, probs = 0.9) # find the 90th percentile of col1 stratified (grouped) by col2 in df
	sample(vec, 5, prob = c(0.1, 0.3, 0.1, 0.5), replace = TRUE) # Choose 5 items from the vector at random according to the given
	# probabilities (one weight per element of vec), with replacement. Without prob, sampling is uniform
	set.seed(15) # set the seed for generating random numbers
	runif(3, min = -1, max = 3) # 3 Uniformly distributed numbers on [-1, 3]. Also see rnorm(), rbinom(), etc.
	l <- lm(x ~ y + z) # Create a linear regression model. Pass data = df to take the columns from a dataframe
	residuals(l) # The residuals of the linear model
	summary(l) # a summary of the model's goodness of fit. Also plot(l) to see a bunch of plots about the model
	table(table(x)) # Frequency histogram: How many elements are unique? How many elements appear twice? etc.

	### Playing with data
	sort(x) # Returns the sorted values
	order(x) # Returns the permutation of indices that sorts x, so x[order(x)] is x in sorted order
	df[order(df$column3), ] # Sorts the data frame df by its "column3" column
	merge(df1, df2, by = "col1") # (inner) join df1 and df2 on column col1 - no NAs are generated. See also
	# by.x, by.y, all.x=TRUE, all.y=TRUE (or all=TRUE) to control what gets joined and added
	cbind(df1, df2) # If df1 and df2 have the same number of rows, paste them left to right
	rbind(df1, df2) # If df1 and df2 have the same columns, return df1 followed by df2
	install.packages("reshape"); library(reshape)
	rbind.fill(df1, df2) # Return df1 followed by (on top of) df2, and fill NAs for missing columns
	# NOTE(review): rbind.fill is also provided by plyr (plyr::rbind.fill) - confirm which
	# package exports it in your installed versions
	cut(1:6, c(1, 3, 5)) # == c(NA,(1,3],(1,3],(3,5],(3,5],NA). Specify a number instead of the breaks to cut to that
	# many breaks. Specify labels=c("lab1","lab2",...) to name the breaks instead of the default.
	rle(c("a", "a", "b", "a", "a")) # == c(2,1,2), c("a","b","a") : The run-length encoding or consecutive chunks (batches)
	# of groups of values from the input vector

	### Case study: Comparing two ranked data frames
	# Rows of t1 rearranged to follow the order of column X1 in t2
	t1.matched <- t1[match(t2$X1, t1$X1), ]
	# Row-by-row difference between the two value columns
	diffs <- t1.matched$value - t2$value
	# Rows of t2 ordered from the largest absolute difference to the smallest:
	t2[order(abs(diffs), decreasing = TRUE), ]
	# Same ordering, with the matched t1 values and the diffs shown side by side:
	cbind(t2, t1.matched$value, data.frame(diffs))[order(abs(diffs), decreasing = TRUE), ]
	# Position in t2 of each item of t1 (where the top items of t1 went in the re-ordering):
	match(t1$X1, t2$X1)

	### Packages and Documentation
	install.packages("packagename") # Download and install a package
	library(ggplot2) # Attach an installed package. Unlike require('ggplot2'), which returns FALSE
	# when the package is missing, library() stops with an error
	help.search("functionName") # look up a function by searching the documentation
	help("seq") # help page for the seq built-in (same as ?seq). ??seq searches the help instead

	### Nifty stuff
	install.packages("sqldf")
	library(sqldf)
	sqldf("select count(*), category from df where column1 > 12 group by category") # Run this SQL against the data frame df
	history(max.show = 40) # Show the last 40 lines of the readline history. Also savehistory("filename")
	options() # List all options. options(something = val) changes one, e.g. the prompt or display limits