zeffii/r_cheatsheet.md

## r_cheatsheet.md

      
    Raw
  

              r_cheatsheet.md
            
          
    cheat-sheet info

The CSS / styling of this document is currently the GitHub default. Additional resources are listed at the end.
working directory

Note: indexing in R starts at 1, not 0, so watch out for off-by-one errors! :)
# show the current working directory
getwd()
# change it; the path is written with forward slashes
setwd("C:/blaaa/just/use/fw_slashes")

assignments

# three ways to assign the value 4 to x
4 -> x
x <- 4
x = 4

lists and vectors

Lists can contain mixed data types. Vectors always contain a single type of data: if you pass
mixed data types, they are coerced to one common type.
# c() combines values (or whole vectors) into one vector
x <- c(1,2,3,4)
y <- c(6,7,8)
z <- c(x,y)         # [1] 1 2 3 4 6 7 8

x <- c(TRUE, FALSE)
x <- c(T, F)        # T / F are shorthand, but they are ordinary variables that can be reassigned
x <- c(TRUE, 7)     # works but maybe not as you expect: TRUE is coerced, giving c(1, 7)
x <- list(TRUE, 7)  # you probably want this: a list keeps the logical and the number as they are

# pre-populate: a vector holding 20 NA values
v <- rep(NA, 20)

import data

# read a csv file into a data frame
csvdata_f <- read.csv("filename.csv")
csvdata_f <- read.csv("some_directory/filename.csv")

csvdata_f <- read.csv(filename, 
    colClasses = "character",           # read every column as character
    na.strings = c('Not Available'))    # replaces all occurrences of items in c with NA

# character vector, one element per line of the file
text <- readLines(filename)

stats

# summary() needs an object to summarise; calling it with no argument is an error
summary(csvdata_f)

casting

# convert to numeric / to a data frame
as.numeric(variable)
as.data.frame(some_matrix)

# convert a whole data frame to a matrix, or only selected columns
# (pick the columns by index or by column-name string)
DM <- as.matrix(DF)

# convert and subset in one step
DM <- as.matrix(DF[c(12, 14, 25)])
DM <- as.matrix(DF[c("colname", "other_colname")])

joining / concatenating strings

# collapse= joins the elements of ONE vector into a single string
paste(some_list, collapse = "")
paste(some_list, collapse = "/")
# sep= is the separator placed between SEPARATE arguments
paste(str1, str2, sep = "/")

> fpath = sprintf("%s/%s%s", "some_path", "some_file", ".csv")
> fpath
[1] "some_path/some_file.csv"

# id may be given as 2, "02" or "002"; each of them produces the same string
fpath <- sprintf("%s/%03d%s", "some_path", as.numeric(id), ".csv")

NA

# test whether a value is NA
is.na(some_value)

# number of NA entries in a column
sum(is.na(csvdata_f["col_label"]))
# column mean, ignoring NA
colMeans(csvdata["col_label"], na.rm = TRUE)
# na.rm = TRUE drops NA from the calculation; it works on most things
ma <- median(some_data_frame[, colnum], na.rm = TRUE)

sifting

# keep the rows where both conditions hold (&) ...
nd <- subset(csvdata, col_label > some_value & col_label > some_value)
# ... or where at least one of them holds (|)
nd <- subset(csvdata, col_label > some_value | col_label > some_value)
# number of rows that contain no NA at all
quantity <- sum(complete.cases(df))

maxing

max(nd["col_label"])                # fine when the column has no NA
max(nd["col_label"], na.rm = TRUE)  # otherwise skip the NA values

Range (over several columns)

# DF is some data.frame: take the range of each column, then of all three
av <- range(DF[, 12], na.rm = TRUE)
bv <- range(DF[, 14], na.rm = TRUE)
cv <- range(DF[, 25], na.rm = TRUE)
dv <- range(c(av, bv, cv))

# tidier: put the columns into one matrix and take its range
DM <- as.matrix(DF[c(12, 14, 25)])
dv <- range(DM, na.rm = TRUE)

creating sequences

# several ways to write a short run of consecutive numbers
1:4
4:1
c(1, 2, 3, 4)
seq(1, 4)
seq(from = 1, to = 4)

# fixed step size
seq(from = 1, to = 14, by = 2)
# spell out length.out; `length=` only works through partial argument matching
seq(from = -20, by = 2, length.out = 20)
rep(c(3, 2, 1), each = 2)    # [1] 3 3 2 2 1 1
rep(c(3, 2, 1), times = 2)   # [1] 3 2 1 3 2 1

Matrices

# rename a single column
colnames(dataframe)[column_number] <- "new_name"

# show the first / last n rows of a data frame or matrix
head(dataframe, n = 20)
tail(dataframe, n = 20)

# select rows
br[1:3, ]  # rows 1 to 3 of br
br[3, ]    # only row 3 of br

# select rows, then one column
br[1:3, ]$column_name  # that column's values for those rows

dataframe

# build a data frame from four column vectors of equal length
name <- c("aa", "bc", "ac", "cd", "ge", "hr", "de", "ed", "ae")
state <- c("tx", "tx", "tx", "tx", "ab", "ab", "md", "md", "md")
rank <- c(1, 2, 3, 4, 1, 2, 1, 2, 3)
maxrank <- c(4, 4, 4, 4, 2, 2, 3, 3, 3)
df <- data.frame(name, state, rank, maxrank)

# to add rows to a data frame, bind the rows of a second data frame (df2) onto it
df <- rbind(df, df2)

Loops

# iterate over the values of a vector
for (i in c(4, 5, 6)) {
  print(i)
}

# iterate over the positions of a vector (1, 2, ..., length)
for (idx in seq_along(some_vector)) {
  print(idx)
}

Ordering

# sort rows by column 1, breaking ties with column 2
# matrix or data frame: pick the columns with [, j]
m[order(m[, 1], m[, 2]), ]
# data frame only: [[j]] extracts the column as a plain vector, whereas
# m[j] would hand order() a one-column data frame instead
m[order(m[[1]], m[[2]]), ]

Regex

Use regular expressions only if you have to:
# locate the first match of pat in each element of text, ignoring case
r <- regexpr(pat, text, ignore.case=TRUE)
# extract the matched substrings from text using the match data in r
m <- regmatches(text, r)

Plotting

This might develop into a separate document, but it is included here for now. Taken from https://class.coursera.org/compdata-003/lecture/28
# xyplot() and the panel.* helpers come from the lattice package
library(lattice)

# 100 random points with y built from x plus noise, and a two-level grouping factor
x <- rnorm(100)
y <- x + rnorm(100, sd = 0.5)
f <- gl(2, 50, labels = c("Group 1", "Group 2"))

# basic plot, conditioned on f
xyplot(y ~ x | f)

# show regression line
xyplot(y ~ x | f,
  panel = function(x, y, ...) {
    panel.xyplot(x, y, ...)      # draw the points
    panel.lmline(x, y, col = 2)  # add the regression line
  })

# show median of y
xyplot(y ~ x | f,
  panel = function(x, y, ...) {
    panel.xyplot(x, y, ...)               # draw the points
    panel.abline(h = median(y), lty = 2)  # horizontal line at the median of y
  })

taken from https://class.coursera.org/compdata-003/lecture/31
> library(lattice)
> package ? lattice
> library(help = lattice)
> data(environmental)
> ?environmental
> head(environmental)
  ozone radiation temperature wind
1    41       190          67  7.4
2    36       118          72  8.0
3    12       149          74 12.6
4    18       313          62 11.5
5    23       299          65  8.6
6    19        99          59 13.8

> xyplot(ozone ~ radiation, data=environmental)
> xyplot(ozone ~ radiation, data=environmental, main = "Ozone vs. Radiation")
> xyplot(ozone ~ temperature, data=environmental)

> summary(environmental$temperature)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  57.00   71.00   79.00   77.79   84.50   97.00 
> temp.cut <- equal.count(environmental$temperature, 4)
> temp.cut

Data:
  [1] 67 72 74 62 65 59 61 69 66 68 58 64 66 57 68 62 59 73 61 61 67 81 79 76 82
 [26] 90 87 82 77 72 65 73 76 84 85 81 83 83 88 92 92 89 73 81 80 81 82 84 87 85
 [51] 74 86 85 82 86 88 86 83 81 81 81 82 89 90 90 86 82 80 77 79 76 78 78 77 72
 [76] 79 81 86 97 94 96 94 91 92 93 93 87 84 80 78 75 73 81 76 77 71 71 78 67 76
[101] 68 82 64 71 81 69 63 70 75 76 68

Intervals:
   min  max count
1 56.5 76.5    46
2 67.5 81.5    51
3 75.5 86.5    51
4 80.5 97.5    51

Overlap between adjacent intervals:
[1] 27 30 31
> xyplot(ozone ~ radiation | temp.cut, data = environmental)
> xyplot(ozone ~ radiation | temp.cut, data = environmental, layout = c(1,4))
> xyplot(ozone ~ radiation | temp.cut, data = environmental, layout = c(1,4), pch = 20)
> xyplot(ozone ~ radiation | temp.cut, data = environmental, as.table = TRUE)

> # write your own panel function
> xyplot(ozone ~ radiation | temp.cut, data = environmental, as.table = TRUE,
+   panel = function(x, y, ...) {
+     panel.xyplot(x, y, ...)
+     fit <- lm(y ~ x)
+     panel.abline(fit)
+   })

more plotting, using a smoothed estimator
# ozone against radiation, conditioned on temp.cut (the equal.count() result
# created in the session above); each panel gets the points plus a loess curve
xyplot(ozone ~ radiation | temp.cut, data = environmental, 
    as.table = TRUE,
    panel = function(x, y, ...) {
        panel.xyplot(x, y, ...)
        panel.loess(x, y)
    })

adding labels and main title
# same plot, with axis labels (xlab / ylab) and a main title (main)
xyplot(ozone ~ radiation | temp.cut, data = environmental, 
    as.table = TRUE,
    panel = function(x, y, ...) {
        panel.xyplot(x, y, ...)
        panel.loess(x, y)
    },
    xlab = "Solar Radiation",
    ylab = "Ozone(ppb)",
    main = "Ozone vs. Solar Radiation")

take wind into account
# cut wind with equal.count() the same way temperature was cut,
# then condition on both: temp.cut * wind.cut
wind.cut <- equal.count(environmental$wind, 4)
xyplot(ozone ~ radiation | temp.cut * wind.cut, data = environmental, 
    as.table = TRUE,
    panel = function(x, y, ...) {
        panel.xyplot(x, y, ...)
        panel.loess(x, y)
    },
    xlab = "Solar Radiation",
    ylab = "Ozone(ppb)",
    main = "Ozone vs. Solar Radiation")

dynamic substitution of labels

from https://class.coursera.org/compdata-003/lecture/29  (week 3)
# dynamic substitution: substitute() replaces k with the computed mean
x <- rnorm(100)
y <- x + rnorm(100, sd = 0.5)
plot(
  x, y,
  xlab = substitute(bar(x) == k, list(k = mean(x))),
  ylab = substitute(bar(y) == k, list(k = mean(y)))
)

# the same idea in a loop of plots: a 2 x 2 grid of histograms,
# each titled with the loop index substituted for num
par(mfrow = c(2, 2))
for (i in 1:4) {
  x <- rnorm(100)
  hist(x, main = substitute(theta == num, list(num = i)))
}

Additional resources

http://students.washington.edu/mclarkso/
http://faculty.washington.edu/kenrice/sisg/SISG-08-05.pdf
http://www2.warwick.ac.uk/fac/sci/statistics/staff/academic-research/reed/rexercises.pdf

plot titles ( concatenate-strings-and-expressions-in-a-plots-title )

http://stackoverflow.com/questions/4302367/
apply functions:

http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/