# geotheory/R-crash-course-script.R

## R-crash-course-script.R
## A CRASH COURSE IN [R] PROGRAMMING
## Robin Edwards (geotheory.co.uk), March 2018
## In RStudio run through line-by-line using Ctrl + Enter

# BASIC R ENVIRONMENT FUNCTIONS - creating, listing and removing objects, and
# checking/changing the working directory
x=3.14159; y='hello world'; z=TRUE # create three objects (';' separates statements on one line). In RStudio they'll appear in 'Workspace'
ls() # list the objects in the Workspace
print(y) # print information to R 'Console'
rm(y) # remove a single object
rm(list=ls()) # remove all objects (this wipes everything created so far in the session)
getwd() # find current working directory
setwd("/Users/robinedwards/Documents") # set working directory as preferred. NOTE: this path is machine-specific - substitute a directory that exists on your system, otherwise setwd() raises an error
print ( "R ignores the 'white-space' in command syntax" )

# use '?' for help on any R function (if its library is loaded in the session)
?max # open the help page for max()
??csv # search for a text string in R documentation library
library(help=utils) # get help on a particular package (list its functions)

# 'str' is a powerful tool for investigating the underlying structure of any R object
str(iris) # a data.frame: shows each column with its type and first values
str(max)  # a function: shows its arguments

# CREATING AND MANIPULATING R OBJECTS

# assigning values to variables - the three lines below all store 5 in 'n'
n = 5   # is possible but
n <- 5  # is officially the way
5 -> n  # also works (right-pointing arrow assigns to the name on the right)
rm(n)

# R objects can be of various data types, but probably most common are 'numeric' and 'character'
num <- 15
char <- 'any text string'

# create a VECTOR (array) using the 'c()' concatenate function
vec <- c(1,3,5,7,9)

# a vector series ('from:to' counts in steps of 1; here downwards from 20 to 10, replacing the previous 'vec')
vec <- 20:10

# R vectors can be accessed in various ways using [ ] brackets (indexing starts at 1)
vec[3]          # a single position
vec[3:6]        # a range of positions
vec[ c(1,3,8) ] # an arbitrary set of positions
vec[vec > 15]   # a logical condition: keeps the elements for which it is TRUE

# use %in% to check if a vector contains value(s); returns one TRUE/FALSE per value tested
c(5,12) %in% vec

# finding first index position of a matching value/string (NA where there is no match)
x = c('one', 'five', 'two', 3, 'two') # mixing types in c() coerces everything to character, so 3 becomes '3'
match(c('two','five','ten'), x)

# a MATRIX is a 2D vector (essentially a vector of vectors) of matching data type
matrx = matrix(1:15, 3, 5) # 3 rows x 5 columns, filled column by column
print(matrx) # note: print isn't really necessary as R automatically prints objects if you call their names in console
dim(matrx)   # matrix dimensions (rows, columns)
t(matrx)     # transpose

# a DATA.FRAME is like a matrix, but accommodates fields (columns) with different data types
df = data.frame(x = 1:26, y = LETTERS, z = rnorm(26)) # 26 records: an integer sequence, the capital letters and random normal values

# They can be viewed easily
View(df) # opens the data in a spreadsheet-style viewer

# examine their internal structure
str(df)

# data interrogation with square brackets: [rows, columns]
df[1,]   # first record (all fields)
df[2:3,] # records 2 to 3
df[,1]   # first field (all records), returned as a vector
df[2,1]  # a single value: record 2, field 1

# data.frame and matrix objects can have field (column) and record (row) names
dimnames(df) # both sets of names, as a list (row names first)
colnames(df)
names(df)    # for a data.frame this gives the column names too
row.names(df) # rownames are considered passé by Tidyverse users. best practice is to include all data in actual fields

# interrogate data.frames by field name using the '$' operator. the result is a simple vector
df$y[2:5]

# colnames and rownames can be reassigned
names(df) <- c('id','letter','val') # the fields are referred to by these new names from here on
row.names(df) <- letters

# check dimensions of vector/matrix/array/data.frame objects
length(vec) # number of elements in a vector
dim(df)     # rows and columns
nrow(df)
ncol(df)

# R has various inbuilt data.frame datasets used to illustrate how functions operate e.g.
data() # lists the datasets available in the loaded packages

# examine contents
head(InsectSprays) # list the top records of a vector / matrix / d.f.
tail(InsectSprays, n=3) # the bottom 3
summary(InsectSprays) # summarise columns of a data.frame (very useful)

# aggregate() is a powerful function for summarising categorical data. As with a number of R functions
# you can use it either with explicit arguments or by specifying a formula (the more elegant approach)
aggregate(InsectSprays$count, by = list(spray = InsectSprays$spray), FUN=mean)  # explicit method
aggregate(count ~ spray, data = InsectSprays, FUN=mean)   # formula method: 'count' summarised by 'spray'

# subset/apply filter to a data.frame (a logical condition in the row position keeps the matching records)
warpbreaks[warpbreaks$wool == 'A',]   # by 1 condition
warpbreaks[warpbreaks$tension %in% c('L','M') & warpbreaks$wool == 'A',]  # multiple conditions, combined with '&'

# adding entries is possible (if a bit tricky)
newrow = data.frame(breaks = 99, wool = 'Z', tension = 'X') # a one-record data.frame with the same field names as warpbreaks
rbind(warpbreaks, newrow) # returns a new data.frame with the record appended; warpbreaks itself is not modified

# but LISTS are better at this
lst = list() # start with an empty list

# ways to assign/add items
lst[1] = "one"
lst[[2]] <- "two"
lst[length(lst)+1] <- "three"      # slightly clunky general append method
lst[['https://url.123']] <- 'four' # assigning index name is preferred approach if you are handling unique records and want to overwrite any previous entry to avoid duplication
print(lst)

# data retrieval
lst[[1]] # double brackets means the object returned is of the data class of the list item
lst[2:3] # selecting more than 1 list item is possible with single brackets..
lst[c(1,3)] # but the object returned (from single bracket interrogation) is a list

# delete list items
lst[[3]] <- NULL # assigning NULL removes the item ("three"); later items move up one position
lst

# entries can be any object type (like python), including other lists. Some datasets -
# especially when parsed from JSON - arrive with recursive list structures like this
lst[[1]] <- list(x = 'one', y = head(LETTERS)) # replace item 1 with a nested list
lst
lst[[1]][[1]] # first item of the nested list ('one')
lst[['https://url.123']] # items can also be called by id name like a Python/Javascript dictionary

# reorder a vector with 'sort' (ascending by default; returns a sorted copy)
sort(vec)

# or a dataframe with 'order' (order() gives the row positions that sort 'val'; use them as the row index)
df[order(df$val),]

# LOGICAL objects (booleans) are binary true/false objects that facilitate conditional data processing
bool = c(F, T, FALSE, TRUE) # 'F' and 'T' are shorthand for FALSE and TRUE

# query an object's data/structure type with 'class()'
class(bool)
class(num) # numeric is the default data type for number objects
class(as.integer(num)) # integer class exists but is not default
class(char) # character class
class('237') # numbers aren't always numeric type
as.numeric('237') # but can be converted
as.character(237) # and vice versa

# Child-objects are often of different class to parents
class(df)     # the data.frame itself
class(df[,2]) # its second field ('letter')
class(df[,1]) # its first field ('id')

# FACTOR objects are vectors of items that have been categorised by unique values
factr = factor(c('one','two','three','two'))
str(factr)
levels(factr) # the distinct categories (sorted alphabetically by default)
table(factr) # table is a handy tool for quickly counting unique values in a vector

# you may encounter problems converting a factor of numeric data to numeric type
factr = factor(c(200,200,300,100))
as.numeric(factr) # returns the internal level codes (2 2 3 1), not the original values

# instead do this
as.numeric(as.character(factr))

# before R 4.0.0 many functions, e.g. data.frame() and read.csv(), defaulted character fields to
# factor class; from R 4.0.0 onwards they stay character unless "stringsAsFactors = TRUE" is given.
# Convert explicitly here so that the example below behaves the same on every R version
df$letter = factor(df$letter)
class(df$letter)
# (On older R versions I find it generally good practice to override the default by specifying "stringsAsFactors = FALSE"..)

# editing factors can be tricky
df$letter[1] <- 'A1' # generates an NA value (with a warning) because the assigned value is not a valid 'level' of the factor

# instead convert to character or numeric etc
df$letter = as.character(df$letter)
df$letter[1] <- 'A1'
head(df)

# LOGICAL OPERATIONS
2 + 2 == 4       # '==' denotes value equality
3 <= 2           # less than or equal to
3 >= 2           # greater than or equal to
'string' == "string"  # single and double quotes create identical strings
'b' >= 'a'       # strings can be ranked
3 != 3           # NOT-equal operator
c(4,2,6) == c(4,2,8) # vector comparisons return logical vectors
TRUE == T        # 'T' and 'F' default as boolean shortcuts (until overwritten)
TRUE & TRUE      # AND operator
TRUE | FALSE     # OR operator
F | F
c(T,F) & c(T,F)  # vectorised (element by element)
TRUE && FALSE    # double && and || work on single conditions only (use them inside 'if' and 'while'). see "?base::Logic"
TRUE || FALSE    # they also 'short-circuit': the right-hand side is only evaluated when it is needed
# giving && or || a vector longer than 1 is an error since R 4.3.0 (older versions silently used only
# the first elements), so the calls are wrapped in try() to let the script carry on either way
try(c(T,F) && c(T,F))
try(c(T,F) || c(T,F))

# IF/ELSE statement (used in most logical procedures)
# the condition in the parentheses must be a single TRUE/FALSE value
x <- 10
if(x < 5){
  print('x is less than 5')
} else{ # 'else' has to follow the closing bracket on the same line
  print('x is not less than 5')
}

if(T | F) print('single liners can dispense with curly brackets')
if(T & F) print("") else print("but then 'else ..' only works on the same line")

# LOOPING FUNCTIONS – very useful for handling repetitive operations

# 'FOR' loop: the body runs once for each value of 'i' in the sequence 1 to 10
for(i in 1:10){
  print(paste('number', i))  # 'paste' merges strings by separator (space by default). try with 'paste0' instead
}

# WHILE loop (be careful to include safeguards to prevent infinite loops)
i = 30
while(i > 0){ # repeats for as long as the condition holds
  print(paste('number', i))
  i = i - 3   # 'i' shrinks on every pass, so the condition eventually becomes FALSE and the loop ends
}

# defining a function: arguments are listed in the parentheses, the body sits in curly brackets
multiply <- function(x, y) {
  # 'tot' only exists inside the function while it is running
  tot <- x * y
  tot  # the value of the last expression is what the function returns
}

multiply(3, 5)
# notice that 'tot' was not kept once the call finished – a function works in its own environment
# '<<-' can be used for global assignment if really needed, but BEWARE: many people regard it as BAD PROGRAMMING
# so take care not to overwrite R's internal objects
# to hold on to the output, assign the result of the call instead:
tot <- multiply(3, 5)

# handling 'NA' values - generally they arise where data is missing, or where original values were not
# coercible to the field's current data type, or where functions have returned them for whatever reason. see '?NA'
(x = 1:5)     # wrapping an assignment in parentheses also prints the result
x[8] = 8      # assigning beyond the current length fills the gap (positions 6 and 7) with NA
x[3] = NA     # NA can also be assigned directly
print(x)      # sometimes functions will fail because of NA values
na.omit(x)    # returns the vector with the NA entries dropped
is.na(x)      # logical detection
x[!is.na(x)]  # keep only the entries that are not NA

# useful basic math functions
seq(-2, 2, by=.2)                   # sequence of equal difference
seq(from=-5, by=.2, length.out=10)  # with range defined by vector length (argument name spelled out in full)
rnorm(20, mean = 0, sd = 1)     # random normal distribution
runif(20, min=0, max=100)       # vector of random numbers
sample(0:100, 20, replace=TRUE) # vector of random integers
min(vec)
max(vec)
max(x)    # these functions return NA when they encounter NA values unless..
max(x, na.rm=TRUE)  # ..told to remove them first
range(vec)    # smallest and largest value
mean(vec)
median(vec)
# weirdly there is no statistical 'mode' function in R, but you can use the one here:
# https://gist.github.com/geotheory/e996d7af35843dee41f6bf32f6b7070b
sum(vec)
prod(vec)     # product of all elements
abs(-5)       # magnitude of values
sd(rnorm(10)) # standard deviation
4^2           # square
sqrt(16)      # square root
5 %% 3        # modulo (remainder after division)
for(i in 1:100) if(i %% 20 == 0) print(i) # modulo is useful for running an operation every n'th iteration

# Importing and exporting data using comma-separated file
# (the file is created in the current working directory - see getwd())
write.csv(df, 'example.csv') # save to csv file. note the row names are written too, as an unnamed first column
rm(df) # remove the object to show that it really is restored from the file
df = read.csv('example.csv', stringsAsFactors = FALSE) # read it back; the row-name column returns as a field called 'X'

# SOME PLOTTING EXAMPLES

plot(90:100, pch=16, cex=2)       # plot just 1 variable, specifying point symbol (pch) and size (cex)
plot(sort(rnorm(100)), type='l')  # line plot
plot(x=1:25, y=25:1, pch=1:25)    # x & y inputs, and showing the available point symbols
plot(Sepal.Length ~ Petal.Length, col = Species, pch=16, data = iris)  # formula method
plot(sin, -pi, 2*pi)              # it supports functions. This example is equivalent to:
x <- seq(-pi, 2*pi, length.out = 101); plot(x, sin(x), type='l')
hist(rnorm(1000), breaks=50)      # histogram

# total insect count per spray type, shown as a bar chart and as a pie chart
sumInsects = aggregate(count ~ spray, FUN = sum, data = InsectSprays)
barplot(sumInsects$count, names.arg = sumInsects$spray)
pie(sumInsects$count, labels = sumInsects$spray)

# plots with more visual components can be built up incrementally
x = sample(1:10)                  # the numbers 1 to 10 in random order
plot(x, pch=17)                   # plot() starts a new chart..
lines(x, col='#00FF00')           # ..then lines(), points() and text() draw on top of it
points(x+1, pch=16, col='red')
text(x-1, labels = LETTERS[1:10]) # argument name spelled out in full rather than relying on partial matching

# But for much more powerful and elegant data visualisation use ggplot2
# Next step: learn Tidyverse, esp. packages ggplot2, stringr, dplyr, tidyr, purrr

# END OF SCRIPT
	## A CRASH COURSE IN [R] PROGRAMMING
	## Robin Edwards (geotheory.co.uk), March 2018
	## In RStudio run through line-by-line using Ctrl + Enter

	# BASIC R ENVIRONMENT FUNCTIONS - creating, listing and removing objects, and
	# checking/changing the working directory
	x=3.14159; y='hello world'; z=TRUE # create three objects (';' separates statements on one line). In RStudio they'll appear in 'Workspace'
	ls() # list the objects in the Workspace
	print(y) # print information to R 'Console'
	rm(y) # remove a single object
	rm(list=ls()) # remove all objects (this wipes everything created so far in the session)
	getwd() # find current working directory
	setwd("/Users/robinedwards/Documents") # set working directory as preferred. NOTE: this path is machine-specific - substitute a directory that exists on your system, otherwise setwd() raises an error
	print ( "R ignores the 'white-space' in command syntax" )

	# use '?' for help on any R function (if its library is loaded in the session)
	?max # open the help page for max()
	??csv # search for a text string in R documentation library
	library(help=utils) # get help on a particular package (list its functions)

	# 'str' is a powerful tool for investigating the underlying structure of any R object
	str(iris) # a data.frame: shows each column with its type and first values
	str(max) # a function: shows its arguments

	# CREATING AND MANIPULATING R OBJECTS

	# assigning values to variables - the three lines below all store 5 in 'n'
	n = 5 # is possible but
	n <- 5 # is officially the way
	5 -> n # also works (right-pointing arrow assigns to the name on the right)
	rm(n)

	# R objects can be of various data types, but probably most common are 'numeric' and 'character'
	num <- 15
	char <- 'any text string'

	# create a VECTOR (array) using the 'c()' concatenate function
	vec <- c(1,3,5,7,9)

	# a vector series ('from:to' counts in steps of 1; here downwards from 20 to 10, replacing the previous 'vec')
	vec <- 20:10

	# R vectors can be accessed in various ways using [ ] brackets (indexing starts at 1)
	vec[3] # a single position
	vec[3:6] # a range of positions
	vec[ c(1,3,8) ] # an arbitrary set of positions
	vec[vec > 15] # a logical condition: keeps the elements for which it is TRUE

	# use %in% to check if a vector contains value(s); returns one TRUE/FALSE per value tested
	c(5,12) %in% vec

	# finding first index position of a matching value/string (NA where there is no match)
	x = c('one', 'five', 'two', 3, 'two') # mixing types in c() coerces everything to character, so 3 becomes '3'
	match(c('two','five','ten'), x)

	# a MATRIX is a 2D vector (essentially a vector of vectors) of matching data type
	matrx = matrix(1:15, 3, 5) # 3 rows x 5 columns, filled column by column
	print(matrx) # note: print isn't really necessary as R automatically prints objects if you call their names in console
	dim(matrx) # matrix dimensions (rows, columns)
	t(matrx) # transpose

	# a DATA.FRAME is like a matrix, but accommodates fields (columns) with different data types
	df = data.frame(x = 1:26, y = LETTERS, z = rnorm(26)) # 26 records: an integer sequence, the capital letters and random normal values

	# They can be viewed easily
	View(df) # opens the data in a spreadsheet-style viewer

	# examine their internal structure
	str(df)

	# data interrogation with square brackets: [rows, columns]
	df[1,] # first record (all fields)
	df[2:3,] # records 2 to 3
	df[,1] # first field (all records), returned as a vector
	df[2,1] # a single value: record 2, field 1

	# data.frame and matrix objects can have field (column) and record (row) names
	dimnames(df) # both sets of names, as a list (row names first)
	colnames(df)
	names(df) # for a data.frame this gives the column names too
	row.names(df) # rownames are considered passé by Tidyverse users. best practice is to include all data in actual fields

	# interrogate data.frames by field name using the '$' operator. the result is a simple vector
	df$y[2:5]

	# colnames and rownames can be reassigned
	names(df) <- c('id','letter','val') # the fields are referred to by these new names from here on
	row.names(df) <- letters

	# check dimensions of vector/matrix/array/data.frame objects
	length(vec) # number of elements in a vector
	dim(df) # rows and columns
	nrow(df)
	ncol(df)

	# R has various inbuilt data.frame datasets used to illustrate how functions operate e.g.
	data() # lists the datasets available in the loaded packages

	# examine contents
	head(InsectSprays) # list the top records of a vector / matrix / d.f.
	tail(InsectSprays, n=3) # the bottom 3
	summary(InsectSprays) # summarise columns of a data.frame (very useful)

	# aggregate() is a powerful function for summarising categorical data. As with a number of R functions
	# you can use it either with explicit arguments or by specifying a formula (the more elegant approach)
	aggregate(InsectSprays$count, by = list(spray = InsectSprays$spray), FUN=mean) # explicit method
	aggregate(count ~ spray, data = InsectSprays, FUN=mean) # formula method: 'count' summarised by 'spray'

	# subset/apply filter to a data.frame (a logical condition in the row position keeps the matching records)
	warpbreaks[warpbreaks$wool == 'A',] # by 1 condition
	warpbreaks[warpbreaks$tension %in% c('L','M') & warpbreaks$wool == 'A',] # multiple conditions, combined with '&'

	# adding entries is possible (if a bit tricky)
	newrow = data.frame(breaks = 99, wool = 'Z', tension = 'X') # a one-record data.frame with the same field names as warpbreaks
	rbind(warpbreaks, newrow) # returns a new data.frame with the record appended; warpbreaks itself is not modified

	# but LISTS are better at this
	lst = list() # start with an empty list

	# ways to assign/add items
	lst[1] = "one"
	lst[[2]] <- "two"
	lst[length(lst)+1] <- "three" # slightly clunky general append method
	lst[['https://url.123']] <- 'four' # assigning index name is preferred approach if you are handling unique records and want to overwrite any previous entry to avoid duplication
	print(lst)

	# data retrieval
	lst[[1]] # double brackets means the object returned is of the data class of the list item
	lst[2:3] # selecting more than 1 list item is possible with single brackets..
	lst[c(1,3)] # but the object returned (from single bracket interrogation) is a list

	# delete list items
	lst[[3]] <- NULL # assigning NULL removes the item ("three"); later items move up one position
	lst

	# entries can be any object type (like python), including other lists. Some datasets -
	# especially when parsed from JSON - arrive with recursive list structures like this
	lst[[1]] <- list(x = 'one', y = head(LETTERS)) # replace item 1 with a nested list
	lst
	lst[[1]][[1]] # first item of the nested list ('one')
	lst[['https://url.123']] # items can also be called by id name like a Python/Javascript dictionary

	# reorder a vector with 'sort' (ascending by default; returns a sorted copy)
	sort(vec)

	# or a dataframe with 'order' (order() gives the row positions that sort 'val'; use them as the row index)
	df[order(df$val),]

	# LOGICAL objects (booleans) are binary true/false objects that facilitate conditional data processing
	bool = c(F, T, FALSE, TRUE) # 'F' and 'T' are shorthand for FALSE and TRUE

	# query an object's data/structure type with 'class()'
	class(bool)
	class(num) # numeric is the default data type for number objects
	class(as.integer(num)) # integer class exists but is not default
	class(char) # character class
	class('237') # numbers aren't always numeric type
	as.numeric('237') # but can be converted
	as.character(237) # and vice versa

	# Child-objects are often of different class to parents
	class(df) # the data.frame itself
	class(df[,2]) # its second field ('letter')
	class(df[,1]) # its first field ('id')

	# FACTOR objects are vectors of items that have been categorised by unique values
	factr = factor(c('one','two','three','two'))
	str(factr)
	levels(factr) # the distinct categories (sorted alphabetically by default)
	table(factr) # table is a handy tool for quickly counting unique values in a vector

	# you may encounter problems converting a factor of numeric data to numeric type
	factr = factor(c(200,200,300,100))
	as.numeric(factr) # returns the internal level codes (2 2 3 1), not the original values

	# instead do this
	as.numeric(as.character(factr))

	# before R 4.0.0 many functions, e.g. data.frame() and read.csv(), defaulted character fields to
	# factor class; from R 4.0.0 onwards they stay character unless "stringsAsFactors = TRUE" is given.
	# Convert explicitly here so that the example below behaves the same on every R version
	df$letter = factor(df$letter)
	class(df$letter)
	# (On older R versions I find it generally good practice to override the default by specifying "stringsAsFactors = FALSE"..)

	# editing factors can be tricky
	df$letter[1] <- 'A1' # generates an NA value (with a warning) because the assigned value is not a valid 'level' of the factor

	# instead convert to character or numeric etc
	df$letter = as.character(df$letter)
	df$letter[1] <- 'A1'
	head(df)

	# LOGICAL OPERATIONS
	2 + 2 == 4 # '==' denotes value equality
	3 <= 2 # less than or equal to
	3 >= 2 # greater than or equal to
	'string' == "string" # single and double quotes create identical strings
	'b' >= 'a' # strings can be ranked
	3 != 3 # NOT-equal operator
	c(4,2,6) == c(4,2,8) # vector comparisons return logical vectors
	TRUE == T # 'T' and 'F' default as boolean shortcuts (until overwritten)
	TRUE & TRUE # AND operator
	TRUE | FALSE # OR operator (written as a plain '|'; a backslash before it is a syntax error in R)
	F | F
	c(T,F) & c(T,F) # vectorised (element by element)
	TRUE && FALSE # double && and || work on single conditions only (use them inside 'if' and 'while'). see "?base::Logic"
	TRUE || FALSE # they also 'short-circuit': the right-hand side is only evaluated when it is needed
	# giving && or || a vector longer than 1 is an error since R 4.3.0 (older versions silently used only
	# the first elements), so the calls are wrapped in try() to let the script carry on either way
	try(c(T,F) && c(T,F))
	try(c(T,F) || c(T,F))

	# IF/ELSE statement (used in most logical procedures)
	# the condition in the parentheses must be a single TRUE/FALSE value
	x <- 10
	if(x < 5){
	  print('x is less than 5')
	} else{ # 'else' has to follow the closing bracket on the same line
	  print('x is not less than 5')
	}

	# the OR operator is a plain '|' (a backslash before it is a syntax error in R)
	if(T | F) print('single liners can dispense with curly brackets')
	if(T & F) print("") else print("but then 'else ..' only works on the same line")

	# LOOPING FUNCTIONS – very useful for handling repetitive operations

	# 'FOR' loop: the body runs once for each value of 'i' in the sequence 1 to 10
	for(i in 1:10){
	print(paste('number', i)) # 'paste' merges strings by separator (space by default). try with 'paste0' instead
	}

	# WHILE loop (be careful to include safeguards to prevent infinite loops)
	i = 30
	while(i > 0){ # repeats for as long as the condition holds
	print(paste('number', i))
	i = i - 3 # 'i' shrinks on every pass, so the condition eventually becomes FALSE and the loop ends
	}

	# defining a function: arguments are listed in the parentheses, the body sits in curly brackets
	multiply <- function(x, y) {
	  # 'tot' only exists inside the function while it is running
	  tot <- x * y
	  tot  # the value of the last expression is what the function returns
	}

	multiply(3, 5)
	# notice that 'tot' was not kept once the call finished – a function works in its own environment
	# '<<-' can be used for global assignment if really needed, but BEWARE: many people regard it as BAD PROGRAMMING
	# so take care not to overwrite R's internal objects
	# to hold on to the output, assign the result of the call instead:
	tot <- multiply(3, 5)

	# handling 'NA' values - generally they arise where data is missing, or where original values were not
	# coercible to the field's current data type, or where functions have returned them for whatever reason. see '?NA'
	(x = 1:5) # wrapping an assignment in parentheses also prints the result
	x[8] = 8 # assigning beyond the current length fills the gap (positions 6 and 7) with NA
	x[3] = NA # NA can also be assigned directly
	print(x) # sometimes functions will fail because of NA values
	na.omit(x) # returns the vector with the NA entries dropped
	is.na(x) # logical detection
	x[!is.na(x)] # keep only the entries that are not NA

	# useful basic math functions
	seq(-2, 2, by=.2) # sequence of equal difference
	seq(from=-5, by=.2, length.out=10) # with range defined by vector length (argument name spelled out in full)
	rnorm(20, mean = 0, sd = 1) # random normal distribution
	runif(20, min=0, max=100) # vector of random numbers
	sample(0:100, 20, replace=TRUE) # vector of random integers
	min(vec)
	max(vec)
	max(x) # these functions return NA when they encounter NA values unless..
	max(x, na.rm=TRUE) # ..told to remove them first
	range(vec) # smallest and largest value
	mean(vec)
	median(vec)
	# weirdly there is no statistical 'mode' function in R, but you can use the one here:
	# https://gist.github.com/geotheory/e996d7af35843dee41f6bf32f6b7070b
	sum(vec)
	prod(vec) # product of all elements
	abs(-5) # magnitude of values
	sd(rnorm(10)) # standard deviation
	4^2 # square
	sqrt(16) # square root
	5 %% 3 # modulo (remainder after division)
	for(i in 1:100) if(i %% 20 == 0) print(i) # modulo is useful for running an operation every n'th iteration

	# Importing and exporting data using comma-separated file
	# (the file is created in the current working directory - see getwd())
	write.csv(df, 'example.csv') # save to csv file. note the row names are written too, as an unnamed first column
	rm(df) # remove the object to show that it really is restored from the file
	df = read.csv('example.csv', stringsAsFactors = FALSE) # read it back; the row-name column returns as a field called 'X'

	# SOME PLOTTING EXAMPLES

	plot(90:100, pch=16, cex=2) # plot just 1 variable, specifying point symbol (pch) and size (cex)
	plot(sort(rnorm(100)), type='l') # line plot
	plot(x=1:25, y=25:1, pch=1:25) # x & y inputs, and showing the available point symbols
	plot(Sepal.Length ~ Petal.Length, col = Species, pch=16, data = iris) # formula method
	plot(sin, -pi, 2*pi) # it supports functions. This example is equivalent to:
	x <- seq(-pi, 2*pi, length.out = 101); plot(x, sin(x), type='l')
	hist(rnorm(1000), breaks=50) # histogram

	# total insect count per spray type, shown as a bar chart and as a pie chart
	sumInsects = aggregate(count ~ spray, FUN = sum, data = InsectSprays)
	barplot(sumInsects$count, names.arg = sumInsects$spray)
	pie(sumInsects$count, labels = sumInsects$spray)

	# plots with more visual components can be built up incrementally
	x = sample(1:10) # the numbers 1 to 10 in random order
	plot(x, pch=17) # plot() starts a new chart..
	lines(x, col='#00FF00') # ..then lines(), points() and text() draw on top of it
	points(x+1, pch=16, col='red')
	text(x-1, labels = LETTERS[1:10]) # argument name spelled out in full rather than relying on partial matching

	# But for much more powerful and elegant data visualisation use ggplot2
	# Next step: learn Tidyverse, esp. packages ggplot2, stringr, dplyr, tidyr, purrr

	# END OF SCRIPT