Skip to content

Instantly share code, notes, and snippets.

@geotheory
Last active March 29, 2019 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save geotheory/ca050d70eab7977cc52d0676a564f2ba to your computer and use it in GitHub Desktop.
Save geotheory/ca050d70eab7977cc52d0676a564f2ba to your computer and use it in GitHub Desktop.
## A CRASH COURSE IN [R] PROGRAMMING
## Robin Edwards (geotheory.co.uk), March 2018
## In RStudio run through line-by-line using Ctrl + Enter
# basic R environment functions
x <- 3.14159; y <- 'hello world'; z <- TRUE # create some objects. In RStudio they'll appear in 'Workspace'
ls() # list the objects in the Workspace
print(y) # print information to R 'Console'
rm(y) # remove an object
rm(list=ls()) # remove all objects (fine when exploring interactively, but avoid it in scripts you share)
getwd() # find current working directory
setwd("~") # set working directory - '~' is your home folder; substitute the path of any existing folder
print ( "R ignores the 'white-space' in command syntax" )
# use '?' for help on any R function (if its library is loaded in the session)
?max
??csv # search for a text string in R documentation library
library(help=utils) # get help on a particular package (list its functions)
# 'str' is a powerful tool for investigating the underlying structure of any R object
str(iris) # a data.frame: lists each column with its type and first values
str(max) # a function: shows its arguments
# CREATING AND MANIPULATING R OBJECTS
# there are three ways to assign a value to a variable
n = 5 # '=' is possible, but
n <- 5 # '<-' is officially the way
5 -> n # the right-pointing arrow also works
rm(n)
# R objects come in various data types - the ones you will meet most are 'numeric' and 'character'
num <- 15
char <- 'any text string'
# build a VECTOR (array) by combining values with the 'c()' concatenate function
vec <- c(1, 3, 5, 7, 9)
# the ':' operator generates a vector series (here counting down from 20 to 10)
vec <- 20:10
# square brackets [ ] give access to the elements of a vector in several ways
vec[3] # one element, by position
vec[3:6] # a range of positions
vec[c(1, 3, 8)] # an arbitrary set of positions
vec[vec > 15] # every element that satisfies a condition
# %in% checks whether value(s) are present in a vector
c(5, 12) %in% vec
# match() finds the first index position of each matching value/string (NA where there is no match)
x <- c('one', 'five', 'two', 3, 'two')
match(c('two', 'five', 'ten'), x)
# a MATRIX is a 2D vector (essentially a vector of vectors) of matching data type
matrx <- matrix(1:15, nrow = 3, ncol = 5)
print(matrx) # note: print isn't really necessary as R automatically prints objects if you call their names in console
dim(matrx) # matrix dimensions
t(matrx) # transpose
# a DATA.FRAME is like a matrix, but accommodates fields (columns) with different data types
df <- data.frame(x = 1:26, y = LETTERS, z = rnorm(26))
# They can be viewed easily in a spreadsheet-style viewer. View() needs an interactive session
# (e.g. RStudio), so it is skipped when the script is run non-interactively
if (interactive()) View(df)
# examine their internal structure
str(df)
# data interrogation with square brackets: [row, column]
df[1,] # first row
df[2:3,] # rows 2 to 3
df[,1] # first column (returned as a vector)
df[2,1] # a single cell
# data.frame and matrix objects can have field (column) and record (row) names
dimnames(df)
colnames(df)
names(df)
row.names(df) # rownames are considered passé by Tidyverse users. best practice is to include all data in actual fields
# interrogate data.frames by field name using the '$' operator. the result is a simple vector
df$y[2:5]
# colnames and rownames can be reassigned
names(df) <- c('id','letter','val')
row.names(df) <- letters
# query the size of vector/matrix/array/data.frame objects
length(vec)
dim(df)
nrow(df)
ncol(df)
# R comes with various inbuilt data.frame datasets that are used to illustrate how functions operate. list them with:
data()
# take a look at the contents of one of them
head(InsectSprays) # the top records of a vector / matrix / d.f.
tail(InsectSprays, n = 3) # the bottom 3 records
summary(InsectSprays) # summarise the columns of a data.frame (very useful)
# aggregate() is a powerful function for summarising categorical data. Like a number of R functions
# it accepts either explicit arguments or a formula (the more elegant approach)
aggregate(InsectSprays$count, by = list(spray = InsectSprays$spray), FUN = mean) # explicit method
aggregate(count ~ spray, data = InsectSprays, FUN = mean) # formula method
# subset a data.frame by putting a logical filter in the row position
warpbreaks[warpbreaks$wool == 'A', ] # 1 condition
warpbreaks[warpbreaks$wool == 'A' & warpbreaks$tension %in% c('L', 'M'), ] # multiple conditions joined with '&'
# adding entries is possible (if a bit tricky): build a matching one-row data.frame and bind it on.
# note the result is only printed here - assign it to keep it
newrow <- data.frame(breaks = 99, wool = 'Z', tension = 'X')
rbind(warpbreaks, newrow)
# LISTS cope with incremental additions more gracefully than data.frames do
lst <- list()
# several ways to assign/add items
lst[1] = "one"
lst[[2]] <- "two"
lst[length(lst) + 1] <- "three" # general append method (slightly clunky)
# assigning by index name is the preferred approach when you are handling unique records
# and want any previous entry to be overwritten rather than duplicated
lst[['https://url.123']] <- 'four'
print(lst)
# data retrieval
lst[[1]] # double brackets return the item itself, in the data class of that item
lst[2:3] # single brackets can select more than 1 list item..
lst[c(1, 3)] # but what single brackets return is always a list
# delete a list item by assigning NULL to it
lst[[3]] <- NULL
lst
# entries can be any object type (like python), including other lists. Some datasets -
# especially when parsed from JSON - arrive with recursive list structures like this
lst[[1]] <- list(x = 'one', y = head(LETTERS))
lst
lst[[1]][[1]] # drill down: first item of the first item
lst[['https://url.123']] # items can also be called by id name like a Python/Javascript dictionary
# 'sort' returns a reordered copy of a vector
sort(vec)
# for a dataframe use 'order', which gives the row positions in sorted order
df[order(df$val), ]
# LOGICAL objects (booleans) are binary true/false objects that facilitate conditional data processing
bool <- c(F, T, FALSE, TRUE)
# 'class()' reports an object's data/structure type
class(bool)
class(num) # number objects are 'numeric' by default
class(as.integer(num)) # an integer class exists but has to be asked for
class(char) # character class
class('237') # a number in quotes is character, not numeric
as.numeric('237') # but it can be converted
as.character(237) # and vice versa
# Child-objects are often of a different class to their parents
class(df) # the data.frame itself
class(df[, 2]) # one of its columns
class(df[, 1]) # another column, holding a different data type
# FACTOR objects are vectors whose items have been categorised by their unique values (the 'levels')
factr <- factor(c('one', 'two', 'three', 'two'))
str(factr)
levels(factr)
table(factr) # table is a handy tool for quickly counting unique values in a vector
# beware when converting a factor of numeric data to numeric type:
factr <- factor(c(200, 200, 300, 100))
as.numeric(factr) # this returns the internal level codes, not the original numbers
# instead go via character
as.numeric(as.character(factr))
# before R 4.0.0 many functions, e.g. data.frame() and read.csv(), converted character fields to factor
# class by default. since R 4.0.0 the default is stringsAsFactors = FALSE, so they stay character
class(df$letter)
# (If your code may run on older R versions it is good practice to specify "stringsAsFactors = FALSE"..)
# editing factors can be tricky. convert the field to a factor to see the problem
# (this conversion changes nothing if the field already is a factor)
df$letter <- factor(df$letter)
df$letter[1] <- 'A1' # generates an NA value (and a warning) because the assigned value is not a valid 'level' of the factor
# instead convert to character or numeric etc
df$letter <- as.character(df$letter)
df$letter[1] <- 'A1'
head(df)
# LOGICAL OPERATIONS
2 + 2 == 4 # '==' denotes value equality
3 <= 2 # less than or equal to
3 >= 2 # greater than or equal to
'string' == "string" # single and double quotes are interchangeable
'b' >= 'a' # strings can be ranked
3 != 3 # NOT-equal operator
c(4,2,6) == c(4,2,8) # vector comparisons return logical vectors
TRUE == T # 'T' and 'F' default as boolean shortcuts (until overwritten)
TRUE & TRUE # AND operator
TRUE | FALSE # OR operator
F | F
c(T,F) & c(T,F) # vectorised: compares element by element
# double '&&' and '||' behave differently: they take single (length-1) conditions and return a single
# TRUE/FALSE, which makes them the right choice inside if() and while(). Since R 4.3.0 passing longer
# vectors, e.g. c(T,F) && c(T,F), is an error - collapse vectors first with all() / any(). see "?base::Logic"
TRUE && FALSE
TRUE || FALSE
all(c(T,F) & c(T,F)) # are all the element-wise results TRUE?
any(c(T,F) | c(T,F)) # is at least one of them TRUE?
# IF/ELSE statement (the building block of most logical procedures)
x <- 10
if (x >= 5) {
  print('x is not less than 5')
} else {
  print('x is less than 5')
}
# a one-line 'if' needs no curly brackets
if (TRUE || FALSE) print('single liners can dispense with curly brackets')
# without brackets the 'else' has to sit on the same line as the 'if'
if (TRUE && FALSE) print("") else print("but then 'else ..' only works on the same line")
# LOOPING FUNCTIONS – very useful for handling repetitive operations
# 'FOR' loop: runs the body once for each value in the sequence
for (i in seq_len(10)) {
  # 'paste' merges strings with a separator (space by default). try with 'paste0' instead
  print(paste('number', i))
}
# WHILE loop: repeats for as long as the condition holds
# (be careful to include safeguards to prevent infinite loops)
i <- 30
while (i > 0) {
  print(paste('number', i))
  i <- i - 3 # step towards the exit condition
}
# creating a function: 'multiply' takes two arguments and returns their product
multiply <- function(x, y) {
  tot <- x * y
  tot # the value of the last expression is what the function returns
}
multiply(3, 5)
# note 'tot' wasn't remembered outside the function – functions are contained environments.
# if required, '<<-' performs global assignment, but BEWARE lots of people say this is BAD PROGRAMMING
# so be careful not to overwrite R's internal objects.
# to capture the output, assign the result of the call instead:
tot <- multiply(3, 5)
# handling 'NA' values - generally they arise where data is missing, where original values could not
# be coerced to the field's current data type, or where a function has returned them. see '?NA'
(x <- 1:5) # wrapping an assignment in brackets also prints the result
x[8] <- 8 # assigning beyond the end of a vector pads the gap with NAs
x[3] <- NA
print(x) # sometimes functions will fail because of NA values
na.omit(x) # returns the vector with the NAs dropped
is.na(x) # logical detection
x[!is.na(x)] # keep only the non-missing values
# useful basic math functions
seq(-2, 2, by = 0.2) # sequence of equal difference
seq(from = -5, by = 0.2, length.out = 10) # with the range defined by the vector length
rnorm(20, mean = 0, sd = 1) # random draws from a normal distribution
runif(20, min = 0, max = 100) # vector of random numbers
sample(0:100, 20, replace = TRUE) # vector of random integers
min(vec)
max(vec)
max(x) # these functions return NA when they encounter NA values unless..
max(x, na.rm = TRUE) # ..you ask for the NAs to be removed first
range(vec)
mean(vec)
median(vec)
# weirdly R has no function for the statistical mode, but you can use the one here:
# https://gist.github.com/geotheory/e996d7af35843dee41f6bf32f6b7070b
sum(vec)
prod(vec)
abs(-5) # magnitude of values
sd(rnorm(10)) # standard deviation
4^2 # square
sqrt(16) # square root
5 %% 3 # modulo (the remainder after division)
# modulo is useful for running an operation every n'th iteration
for (i in 1:100) {
  if (i %% 20 == 0) print(i)
}
# Importing and exporting data using a comma-separated (csv) file
# row.names = FALSE stops the row names being written as an extra unnamed column,
# which would otherwise come back as a spurious 'X' field when the file is read in again
write.csv(df, 'example.csv', row.names = FALSE) # save to csv file in the working directory
rm(df)
df <- read.csv('example.csv', stringsAsFactors = FALSE) # read it back in, keeping text fields as character
# SOME PLOTTING EXAMPLES
plot(90:100, pch = 16, cex = 2) # plot just 1 variable, specifying point symbol and size
plot(sort(rnorm(100)), type = 'l') # line plot
plot(x = 1:25, y = 25:1, pch = 1:25) # x & y inputs, and showing the available point symbols
plot(Sepal.Length ~ Petal.Length, data = iris, col = Species, pch = 16) # formula method
plot(sin, -pi, 2*pi) # it supports functions. This example is equivalent to:
x <- seq(-pi, 2*pi, length.out = 101)
plot(x, sin(x), type = 'l')
hist(rnorm(1000), breaks = 50) # histogram
# total insect count per spray, shown as a bar chart and as a pie chart
sumInsects <- aggregate(count ~ spray, data = InsectSprays, FUN = sum)
barplot(sumInsects$count, names.arg = sumInsects$spray)
pie(sumInsects$count, labels = sumInsects$spray)
# plots with more visual components can be built up incrementally:
# plot() starts a new chart, then lines(), points() and text() each add a layer to it
x <- sample(1:10)
plot(x, pch = 17)
lines(x, col = '#00FF00')
points(x + 1, pch = 16, col = 'red')
text(x - 1, labels = LETTERS[1:10])
# But for much more powerful and elegant data visualisation use ggplot2
# Next step: learn Tidyverse, esp. packages ggplot2, stringr, dplyr, tidyr, purrr
# END OF SCRIPT
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment