Skip to content

Instantly share code, notes, and snippets.

@crazyhottommy
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save crazyhottommy/611018b3241a9d58d434 to your computer and use it in GitHub Desktop.
Save crazyhottommy/611018b3241a9d58d434 to your computer and use it in GitHub Desktop.
#2014 MSU NGS R basics tutorial
#http://angus.readthedocs.org/en/2014/R_Introductory_tutorial_2014.html
#https://github.com/jrherr/quick_basic_R_tutorial/blob/master/R_tutorial.md
#pick one language, and learn it well!
#pick up a dataset, play with it!
#object-oriented programming
#functional programming
#deal with big data in R: (R holds all the data in memory)
#http://theodi.org/blog/fig-data-11-tips-how-handle-big-data-r-and-1-bad-pun
#http://r-pbd.org/
#packages: plyr, dplyr, ggplot2, reshape2, data.table (fread function)
# commands start here!!
# Session management commands.
# q() and setwd() are shown commented out so the script can be run from top to
# bottom: q() would end the R session right here, and setwd() called without a
# directory stops with an error.
# q()                       # quit R
getwd()                     # get the current working directory
# setwd("path/to/project")  # set the working directory (a path is required)
# Assign values with `<-`, then use R like a calculator.
y <- 2
x <- 3
x + y   # addition
x * y   # multiplication
x / y   # division
x %% y  # modulo: remainder of x divided by y
# `==` tests for equality and returns a logical value: TRUE or FALSE.
# In arithmetic, TRUE counts as 1 and FALSE counts as 0.
x == 3
# Exponentiation can be written as ** or ^.
2**2  # returns 4
2^2   # returns 4
log(2.7)          # natural logarithm, returns 0.99325
log(4, base = 2)  # logarithm of 4 in base 2, returns 2
# c() ("concatenate") combines values into a vector.
a <- c(2, 3, 6, 8)
b <- c(3, 5, 6, 7)
a + b  # element-wise sum
a * b  # element-wise product
# When a and b differ in length, R recycles the shorter one and warns if the
# longer length is not a multiple of the shorter length.
length(a)  # number of elements in a, returns 4
new_varaible <- c(a, b)  # join two vectors end to end
crap <- seq_len(100)  # the integers 1 to 100; R indexing starts at 1
rm(crap)  # remove this variable
?lm  # open the help page for the linear regression model function
# Basic summary functions on a numeric vector
a <- c(2, 3, 6, 8)
mean(a)
sum(a)
var(a)
b <- c(3, 5, 6, 7)
cor(a, b)  # Pearson correlation between two vectors
m <- cbind(a, b)  # bind the vectors as columns of a matrix
m
# Printed result:
#      a b
# [1,] 2 3
# [2,] 3 5
# [3,] 6 6
# [4,] 8 7
cor(m)  # Pearson correlation between the columns of a matrix
cor(m, method = "spearman")  # Spearman correlation between the columns
?cor
# Inspecting objects
mode(a)    # "numeric"
class(a)   # "numeric"
class(m)   # "matrix" (R >= 4.0 reports "matrix" "array")
typeof(m)  # "double"
str(m)     # structure of m; try it out in your R console
dim(m)     # dimensions of m: 4 2
nrow(m)    # number of rows: 4
ncol(m)    # number of columns: 2
length(m)  # total number of elements: 8
is.matrix(m)  # TRUE
# create a matrix from scratch
#> m1<- matrix(1:12,3,4)
#> m1
# [,1] [,2] [,3] [,4]
#[1,] 1 4 7 10
#[2,] 2 5 8 11
#[3,] 3 6 9 12
# Character vectors (strings)
cities <- c("E.lansing", "Gainesville", "Shanghai", "Yichun")
class(cities)
#[1] "character"
length(cities) # 4 cities in the vector
nchar(cities) # number of characters in each city name
#[1] 9 11 8 6
sum(nchar(cities))
#[1] 34
rivers <- c("Red Cedar", "swamp", "Huang Pu", "Long He")
# Column-binding two character vectors gives a character matrix.
cities_rivers <- cbind(cities, rivers)
cities_rivers
# cities rivers
#[1,] "E.lansing" "Red Cedar"
#[2,] "Gainesville" "swamp"
#[3,] "Shanghai" "Huang Pu"
#[4,] "Yichun" "Long He"
class(cities_rivers) # matrix
# Fixed: the object name was misspelled ("citeis_rivers"), which stopped the
# script with an "object not found" error.
mode(cities_rivers) # character
# A formula object: y modelled by x1, x2 and their interaction x1:x2.
# The formula is stored unevaluated, so x1 and x2 do not need to exist yet.
model_1 <- y ~ x1 + x2 + x1:x2
model_1
# class(model_1) returns:
#[1] "formula"
# Example data: counts for one transcript in 8 samples.
counts_transcript_a <- c(250, 157, 155, 300, 125, 100, 153, 175)
# gl() generates a factor with n levels, each repeated k times in a row.
genotype <- gl(n = 2, k = 4, labels = c("wild_type", "mutant"))
#> genotype
#[1] wild_type wild_type wild_type wild_type mutant
#[6] mutant mutant mutant
#Levels: wild_type mutant
# As an alternative to gl(), combine factor() with rep():
genotype1 <- factor(rep(c("wild_type", "mutant"), each = 4))
#> genotype1
#[1] wild_type wild_type wild_type wild_type mutant
#[6] mutant mutant mutant
#Levels: mutant wild_type
# Notice the "each" argument; without it rep() alternates the two values:
genotype1 <- factor(rep(c("wild_type", "mutant"), 4))
#> genotype1
#[1] wild_type mutant wild_type mutant wild_type
#[6] mutant wild_type mutant
#Levels: mutant wild_type
# Also notice that the levels differ from those generated by gl(): factor()
# sorts them alphabetically. We want wild_type to be the base level, so set
# the levels explicitly:
genotype1 <- factor(
  rep(c("wild_type", "mutant"), each = 4),
  levels = c("wild_type", "mutant")
)
#> genotype1
#[1] wild_type wild_type wild_type wild_type mutant mutant mutant mutant
#Levels: wild_type mutant
?relevel # try it also
# Combine the counts and the genotype factor into a data frame.
expression_data <- data.frame(counts_transcript_a, genotype)
#> expression_data
# counts_transcript_a genotype
#1 250 wild_type
#2 157 wild_type
#3 155 wild_type
#4 300 wild_type
#5 125 mutant
#6 100 mutant
#7 153 mutant
#8 175 mutant
expression_data$counts_transcript_a # access a column of the data frame
ls() # objects in the environment
# rm(list = ls()) removes ALL objects in the environment. It is left commented
# out because expression_data is used again further down in this script.
# rm(list = ls())
### write functions
# Standard error of the mean: sd(vector) / sqrt(number of observations).
#
# Args:
#   vector: numeric vector of observations.
#   na.rm:  if TRUE, missing values are dropped before the calculation, so
#           they count neither towards sd() nor towards the sample size.
#           With the default FALSE, any NA makes the result NA.
# Returns:
#   A single numeric value; NA when fewer than two observations remain,
#   because sd() is NA in that case.
StdErr <- function(vector, na.rm = FALSE) {
  if (na.rm) {
    vector <- vector[!is.na(vector)]
  }
  sd(vector) / sqrt(length(vector))
}
# Coefficient of variation: sd(vector) / mean(vector).
#
# Args:
#   vector: numeric vector of observations.
#   na.rm:  if TRUE, missing values are ignored by both sd() and mean().
#           With the default FALSE, any NA makes the result NA.
# Returns:
#   A single numeric value (standard deviation relative to the mean).
CoefVar <- function(vector, na.rm = FALSE) {
  sd(vector, na.rm = na.rm) / mean(vector, na.rm = na.rm)
}
# apply families http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/
# with() evaluates an expression using the columns of a data frame as
# variables; tapply() applies FUN to X within each level of INDEX.
# The example data frame is rebuilt here so this section runs on its own.
# (The pasted console prompt ">" and raw output lines were removed: they are
# not valid R code.)
expression_data <- data.frame(
  counts_transcript_a = c(250, 157, 155, 300, 125, 100, 153, 175),
  genotype = gl(n = 2, k = 4, labels = c("wild_type", "mutant"))
)
with(
  expression_data,
  tapply(X = counts_transcript_a, INDEX = genotype, FUN = mean)
)
# wild_type mutant
# 215.50 138.25
# Some commonly used functions; use ?name (e.g. ?head) to read about each one.
# Called with no arguments these functions stop with an error, so each one is
# shown here on the built-in mtcars data frame.
head(mtcars) # print the first 6 rows (Linux `head` prints 10 lines by default)
table(mtcars$cyl) # count how often each value occurs
rownames(mtcars)
colnames(mtcars)
nrow(mtcars)
ncol(mtcars)
by(mtcars[, c("mpg", "hp")], mtcars$cyl, colMeans) # apply a function per group
with(mtcars, mean(mpg)) # evaluate an expression using the data frame's columns
rowSums(mtcars)
rowMeans(mtcars)
summary(mtcars)
# construct sequences
one_to_20 <- 1:20
twenty_to_1 <- 20:1
# The step size is passed as the named argument `by`. The original line read
# "by 0.5" without "=", which is a syntax error.
seq1 <- seq(from = 1, to = 20, by = 0.5)
# or positionally: seq1 <- seq(1, 20, 0.5)
# Repeating values with rep(); the second positional argument is `times`.
many_2 <- rep(2, 20)      # the number 2, twenty times
many_a <- rep("a", 20)    # the string "a", twenty times
seq_rep <- rep(1:10, 2)   # the sequence 1..10, twice in a row
rep_3_times <- rep(c(1, 2, 3), 3)  # whole vector repeated: 1 2 3 1 2 3 1 2 3
# `each` is different: every element is repeated before moving on to the
# next one, giving 1 1 1 2 2 2 3 3 3.
rep_each_3_times <- rep(c(1, 2, 3), each = 3)
# to do: subsetting for vectors and matrix
# R mark-down
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment