Last active
March 29, 2019 16:06
-
-
Save geotheory/ca050d70eab7977cc52d0676a564f2ba to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## A CRASH COURSE IN [R] PROGRAMMING
## Robin Edwards (geotheory.co.uk), March 2018
## In RStudio run through line-by-line using Ctrl + Enter
# basic R environment functions
x <- 3.14159; y <- 'hello world'; z <- TRUE  # create some objects. In RStudio they'll appear in 'Workspace'
ls()             # list the objects in the Workspace
print(y)         # print information to R 'Console'
rm(y)            # remove an object
rm(list = ls())  # remove all (handy when exploring interactively; avoid it in scripts you share)
getwd()          # find current working directory
# set working directory as preferred: pass the path of any folder that exists on your machine.
# setwd() invisibly returns the previous directory, so it is easy to switch back afterwards.
old_wd <- setwd(tempdir())
setwd(old_wd)
print ( "R ignores the 'white-space' in command syntax" )

# use '?' for help on any R function (if its library is loaded in the session)
?max
??csv                    # search for a text string in R documentation library
library(help = utils)    # get help on a particular package (list its functions)

# 'str' is a powerful tool for investigating the underlying structure of any R object
str(iris)
str(max)
# CREATING AND MANIPULATING R OBJECTS

# assigning values to variables
n = 5   # is possible but
n <- 5  # is officially the way
5 -> n  # also works
rm(n)

# R objects can be of various data types, but probably most common are 'numeric' and 'character'
num <- 15
char <- 'any text string'

# create a VECTOR (array) using the 'c()' concatenate function
vec <- c(1, 3, 5, 7, 9)

# a vector series
vec <- 20:10

# R vectors can be accessed in various ways using [ ] brackets
vec[3]
vec[3:6]
vec[c(1, 3, 8)]
vec[vec > 15]

# use %in% to check if a vector contains value(s)
c(5, 12) %in% vec

# finding the first index position of a matching value/string
# (the bare 3 below is coerced to the string '3', because a vector holds a single data type)
x <- c('one', 'five', 'two', 3, 'two')
match(c('two', 'five', 'ten'), x)
# a MATRIX is a 2D vector (essentially a vector of vectors) of matching data type
matrx <- matrix(1:15, nrow = 3, ncol = 5)
print(matrx)  # print isn't really necessary as R automatically prints objects if you call their names in console
dim(matrx)    # matrix dimensions
t(matrx)      # transpose

# a DATA.FRAME is like a matrix, but accommodates fields (columns) with different data types
df <- data.frame(x = 1:26, y = LETTERS, z = rnorm(26))

# They can be viewed easily (View() opens a data viewer, so only call it in an interactive session)
if (interactive()) View(df)

# examine their internal structure
str(df)

# data interrogation with square brackets: [row(s), column(s)]
df[1, ]
df[2:3, ]
df[, 1]
df[2, 1]

# data.frame and matrix objects can have field (column) and record (row) names
dimnames(df)
colnames(df)
names(df)
row.names(df)  # rownames are considered passé by Tidyverse users. best practice is to include all data in actual fields

# interrogate data.frames by field name using the '$' operator. the result is a simple vector
df$y[2:5]

# colnames and rownames can be reassigned
names(df) <- c('id', 'letter', 'val')
row.names(df) <- letters

# check dimensions of vector/matrix/array/data.frame objects
length(vec)
dim(df)
nrow(df)
ncol(df)
# R has various inbuilt data.frame datasets used to illustrate how functions operate e.g.
data()

# examine contents
head(InsectSprays)         # list the top records of a vector / matrix / d.f.
tail(InsectSprays, n = 3)  # the bottom 3
summary(InsectSprays)      # summarise columns of a data.frame (very useful)

# aggregate() is a powerful function for summarising categorical data. As with a number of R functions
# you can use it either with explicit arguments or by specifying a formula (the more elegant approach)
aggregate(InsectSprays$count, by = list(spray = InsectSprays$spray), FUN = mean)  # explicit method
aggregate(count ~ spray, data = InsectSprays, FUN = mean)                         # formula method

# subset/apply filter to a data.frame
warpbreaks[warpbreaks$wool == 'A', ]                                         # by 1 condition
warpbreaks[warpbreaks$tension %in% c('L', 'M') & warpbreaks$wool == 'A', ]  # multiple conditions

# adding entries is possible (if a bit tricky): the new row must be a data.frame with matching column names
newrow <- data.frame(breaks = 99, wool = 'Z', tension = 'X')
rbind(warpbreaks, newrow)
# but LISTS are better at this
lst <- list()

# ways to assign/add items
lst[1] <- "one"
lst[[2]] <- "two"
lst[length(lst) + 1] <- "three"     # slightly clunky general append method
lst[['https://url.123']] <- 'four'  # assigning index name is preferred approach if you are handling unique records and want to overwrite any previous entry to avoid duplication
print(lst)

# data retrieval
lst[[1]]      # double brackets means the object returned is of the data class of the list item
lst[2:3]      # selecting more than 1 list item is possible with single brackets..
lst[c(1, 3)]  # but the object returned (from single bracket interrogation) is a list

# delete list items (assigning NULL removes the item, so later items shift up one position)
lst[[3]] <- NULL
lst

# entries can be any object type (like python), including other lists. Some datasets -
# especially when parsed from JSON - arrive with recursive list structures like this
lst[[1]] <- list(x = 'one', y = head(LETTERS))
lst
lst[[1]][[1]]
lst[['https://url.123']]  # items can also be called by id name like a Python/Javascript dictionary
# reorder a vector with 'sort'
sort(vec)
# or a dataframe with 'order' (order() returns the row positions that put the column in ascending order)
df[order(df$val), ]

# LOGICAL objects (booleans) are binary true/false objects that facilitate conditional data processing
bool <- c(F, T, FALSE, TRUE)

# query an object's data/structure type with 'class()'
class(bool)
class(num)              # numeric is the default data type for number objects
class(as.integer(num))  # integer class exists but is not default
class(char)             # character class
class('237')            # numbers aren't always numeric type
as.numeric('237')       # but can be converted
as.character(237)       # and vice versa

# Child-objects are often of different class to parents
class(df)
class(df[, 2])
class(df[, 1])
# FACTOR objects are vectors of items that have been categorised by unique values
factr <- factor(c('one', 'two', 'three', 'two'))
str(factr)
levels(factr)
table(factr)  # table is a handy tool for quickly counting unique values in a vector

# you may encounter problems converting a factor of numeric data to numeric type
factr <- factor(c(200, 200, 300, 100))
as.numeric(factr)  # returns the internal level codes (2 2 3 1), not the original numbers
# instead do this
as.numeric(as.character(factr))

# before R 4.0.0 many functions, e.g. data.frame() and read.csv(), defaulted character fields to factor
# class; from 4.0.0 onwards they stay character unless you specify "stringsAsFactors = TRUE"
class(df$letter)
# (On older R versions I find it generally good practice to override by specifying "stringsAsFactors = FALSE"..)

# editing factors can be tricky
df$letter <- factor(df$letter)  # convert explicitly so this example behaves the same on any R version
df$letter[1] <- 'A1'  # generates an NA value (and a warning) because the assigned value is not a valid 'level' of the factor
# instead convert to character or numeric etc
df$letter <- as.character(df$letter)
df$letter[1] <- 'A1'
head(df)
# LOGICAL OPERATIONS
2 + 2 == 4              # '==' denotes value equality
3 <= 2                  # less than or equal to
3 >= 2                  # greater than or equal to
'string' == "string"
'b' >= 'a'              # strings can be ranked
3 != 3                  # not-equal operator
c(4, 2, 6) == c(4, 2, 8)  # vector comparisons return logical vectors
TRUE == T               # 'T' and 'F' default as boolean shortcuts (until overwritten)
TRUE & TRUE             # AND operator
TRUE | FALSE            # OR operator
F | F
c(T, F) & c(T, F)       # vectorised: compares element by element
# double && or || behave differently: they take single (length-1) conditions and return a single
# TRUE/FALSE, which is what if() and while() need. see "?base::Logic"
TRUE && FALSE
TRUE || FALSE
# giving them longer vectors is an error since R 4.3.0 (older versions used only the first elements),
# so the call is wrapped in try() to let the script carry on
try(c(T, F) && c(T, F))
# IF/ELSE statement (used in most logical procedures)
x <- 10
if (x < 5) {
  print('x is less than 5')
} else {
  print('x is not less than 5')
}

# an if() condition must be a single TRUE/FALSE, so the scalar operators || and && are the ones to use here
if (TRUE || FALSE) print('single liners can dispense with curly brackets')
if (TRUE && FALSE) print("") else print("but then 'else ..' only works on the same line")
# LOOPING FUNCTIONS – very useful for handling repetitive operations

# 'FOR' loop
for (i in 1:10) {
  print(paste('number', i))  # 'paste' merges strings by separator (space by default). try with 'paste0' instead
}

# WHILE loop (be careful to include safeguards to prevent infinite loops)
i <- 30
while (i > 0) {
  print(paste('number', i))
  i <- i - 3  # the counter must move towards the stopping condition, otherwise the loop never ends
}
# creating a function
# multiply() takes two arguments, x and y, and returns their product. '*' works element by
# element, so x and y can be single numbers or numeric vectors.
multiply <- function(x, y) {
  tot <- x * y   # 'tot' only exists inside the function while it runs
  return(tot)    # hand the result back to the caller
}
multiply(3, 5)

# note 'tot' wasn't remembered outside the function – functions are contained environments
# if required use '<<-' for global assignment but BEWARE lots of people say this is BAD PROGRAMMING
# so be careful not to overwrite R's internal objects
# if you do want to capture output do like this:
tot <- multiply(3, 5)
# handling 'NA' values - generally they arise where data is missing, or where original values were not
# coercible to the field's current data type, or where functions have returned for whatever reason. see '?NA'
(x <- 1:5)    # wrapping an assignment in brackets also prints the result
x[8] <- 8     # assigning beyond the end of a vector pads the gap (positions 6 and 7) with NA
x[3] <- NA
print(x)      # sometimes functions will fail because of NA values
na.omit(x)    # returns the vector with the NAs dropped
is.na(x)      # logical detection
x[!is.na(x)]  # keep only the non-missing values
# useful basic math functions
seq(-2, 2, by = 0.2)                      # sequence of equal difference
seq(from = -5, by = 0.2, length.out = 10)  # with range defined by vector length
rnorm(20, mean = 0, sd = 1)               # random draws from a normal distribution
runif(20, min = 0, max = 100)             # vector of random numbers (uniform between min and max)
sample(0:100, 20, replace = TRUE)         # vector of random integers
min(vec)
max(vec)
max(x)                # these functions return NA when they encounter NA values unless..
max(x, na.rm = TRUE)  # ..told to remove them first
range(vec)
mean(vec)
median(vec)
# weirdly there is no statistical 'mode' function in R, but you can use the one here:
# https://gist.github.com/geotheory/e996d7af35843dee41f6bf32f6b7070b
sum(vec)
prod(vec)
abs(-5)        # magnitude of values
sd(rnorm(10))  # standard deviation
4^2            # square
sqrt(16)       # square root
5 %% 3         # modulo (the remainder after division)
for (i in 1:100) if (i %% 20 == 0) print(i)  # modulo is useful for running an operation every n'th iteration
# Importing and exporting data using comma-separated file
# row.names = FALSE stops the row names being written as an extra unnamed column, which
# read.csv() would otherwise bring back as a spurious field called 'X'
write.csv(df, 'example.csv', row.names = FALSE)  # save to csv file (in the working directory)
rm(df)
df <- read.csv('example.csv', stringsAsFactors = FALSE)
# SOME PLOTTING EXAMPLES
plot(90:100, pch = 16, cex = 2)        # plot just 1 variable, specifying point and size
plot(sort(rnorm(100)), type = 'l')     # line plot
plot(x = 1:25, y = 25:1, pch = 1:25)   # x & y inputs, and showing the available point symbols
plot(Sepal.Length ~ Petal.Length, col = Species, pch = 16, data = iris)  # formula method
plot(sin, -pi, 2*pi)                   # it supports functions. This example is equivalent to:
x <- seq(-pi, 2*pi, length.out = 101); plot(x, sin(x), type = 'l')
hist(rnorm(1000), breaks = 50)         # histogram

# total insect count per spray, then shown as a bar chart and a pie chart
sumInsects <- aggregate(count ~ spray, FUN = sum, data = InsectSprays)
barplot(sumInsects$count, names.arg = sumInsects$spray)
pie(sumInsects$count, labels = sumInsects$spray)

# plots with more visual components can be built up incrementally
x <- sample(1:10)
plot(x, pch = 17)
lines(x, col = '#00FF00')
points(x + 1, pch = 16, col = 'red')
text(x - 1, labels = LETTERS[1:10])

# But for much more powerful and elegant data visualisation use ggplot2
# Next step: learn Tidyverse, esp. packages ggplot2, stringr, dplyr, tidyr, purrr
# END OF SCRIPT
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment