# crazyhottommy/ngs2014_R_basics.r

## ngs2014_R_basics.r
#2014 MSU NGS R basics tutorial
#http://angus.readthedocs.org/en/2014/R_Introductory_tutorial_2014.html
#https://github.com/jrherr/quick_basic_R_tutorial/blob/master/R_tutorial.md

#pick one language, and learn it well!

#pick up a dataset, play with it!

#object-oriented programming
#functional programming

#deal with big data in R: (R holds all the data in memory)
#http://theodi.org/blog/fig-data-11-tips-how-handle-big-data-r-and-1-bad-pun
#http://r-pbd.org/

#packages: plyr, dplyr, ggplot2, reshape2, data.table (fread function)

# commands start here!!

# Session housekeeping commands.
# q() ends the R session, so it is shown commented out: left active at the top
# of the script it would stop R before any of the following lines run.
# q()  # quit R

getwd()  # get the current working directory
# setwd() needs the target directory as its argument; calling it with no
# argument is an error, so an example call is shown instead:
# setwd("path/to/your/project")  # set working directory

y <- 2  # bind the value 2 to the name y

x <- 3

x + y  # R works as a calculator: 5

x * y  # 6

x / y  # 1.5

x %% y  # modulo (remainder of x divided by y): 1

x == 3  # `==` tests equality and yields a logical value, here TRUE

# logical values coerce to numbers in arithmetic: TRUE counts as 1, FALSE as 0

# exponentiation can be written ** or ^
2**2  # 4
2^2  # 4

log(2.7)  # natural logarithm, about 0.99325
log(4, 2)  # logarithm of 4 in base 2, i.e. 2

a <- c(2, 3, 6, 8)  # c() combines its arguments into a vector
b <- c(3, 5, 6, 7)

a + b  # element-wise sum: 5 8 12 15
a * b  # element-wise product: 6 15 36 56

# if a and b differ in length, R recycles the shorter one; it warns only when
# the longer length is not a multiple of the shorter length

length(a)  # number of elements in a: 4

new_varaible <- c(a, b)  # c() also joins two vectors end to end

crap <- rep(1:100)  # the integers 1 to 100 (indexing in R starts at 1)

rm(crap)  # delete this object from the workspace


?lm  # open the help page for lm(), the linear model fitting function

# a few summary functions
a <- c(2, 3, 6, 8)
mean(a)  # 4.75
sum(a)  # 19
var(a)  # sample variance

b <- c(3, 5, 6, 7)
cor(a, b)  # Pearson correlation between the two vectors

m <- cbind(a, b)  # bind the vectors side by side as the columns of a matrix

m

# printed result:
#      a b
# [1,] 2 3
# [2,] 3 5
# [3,] 6 6
# [4,] 8 7

cor(m)  # Pearson correlation matrix of the columns of m
cor(m, method = "spearman")  # same, using Spearman rank correlation
?cor

mode(a)  # "numeric"

class(a)  # "numeric"

class(m)  # "matrix" (R >= 4.0 prints "matrix" "array")

typeof(m)  # "double"

str(m)  # compact display of the structure of m; try it in your R console
dim(m)  # dimensions of m: 4 2

nrow(m)  # number of rows: 4
ncol(m)  # number of columns: 2
length(m)  # total number of elements: 8

is.matrix(m)  # TRUE

# building a matrix from scratch (values fill column by column)
#> m1 <- matrix(1:12, 3, 4)
#> m1
#     [,1] [,2] [,3] [,4]
#[1,]    1    4    7   10
#[2,]    2    5    8   11
#[3,]    3    6    9   12


# strings: character vectors and character matrices
cities <- c("E.lansing", "Gainesville", "Shanghai", "Yichun")
class(cities)
#[1] "character"

length(cities)  # 4 cities in the vector

nchar(cities)  # number of characters in each city name
#[1]  9 11  8  6

sum(nchar(cities))
#[1] 34

rivers <- c("Red Cedar", "swamp", "Huang Pu", "Long He")

# column-binding two character vectors gives a character matrix
cities_rivers <- cbind(cities, rivers)

cities_rivers
#     cities        rivers
#[1,] "E.lansing"   "Red Cedar"
#[2,] "Gainesville" "swamp"
#[3,] "Shanghai"    "Huang Pu"
#[4,] "Yichun"      "Long He"

class(cities_rivers)  # "matrix" (R >= 4.0 prints "matrix" "array")
mode(cities_rivers)  # "character"


# a formula object: y modelled by x1, x2 and their interaction x1:x2
model_1 <- y ~ x1 + x2 + x1:x2
model_1

#> class(model_1)
#[1] "formula"

counts_transcript_a <- c(250, 157, 155, 300, 125, 100, 153, 175)
# gl() generates a factor: n = 2 levels, each repeated k = 4 times in a row
genotype <- gl(n = 2, k = 4, labels = c("wild_type", "mutant"))
#> genotype
#[1] wild_type wild_type wild_type wild_type mutant
#[6] mutant    mutant    mutant
#Levels: wild_type mutant

# the same grouping can be built with factor() and rep() instead of gl()

genotype1 <- factor(rep(c("wild_type", "mutant"), each = 4))
#> genotype1
#[1] wild_type wild_type wild_type wild_type mutant
#[6] mutant    mutant    mutant
#Levels: mutant wild_type

# "each" repeats every element in turn; without it the whole vector is repeated

genotype1 <- factor(rep(c("wild_type", "mutant"), 4))
#> genotype1
#[1] wild_type mutant    wild_type mutant    wild_type
#[6] mutant    wild_type mutant
#Levels: mutant wild_type


# note that factor() sorts the levels alphabetically, unlike the gl() result

# to make wild_type the base (first) level, give the levels explicitly:

genotype1 <- factor(
    rep(c("wild_type", "mutant"), each = 4),
    levels = c("wild_type", "mutant")
)
#> genotype1
#[1] wild_type wild_type wild_type wild_type mutant    mutant    mutant    mutant
#Levels: wild_type mutant

?relevel  # relevel() is another way to change the base level


# combine the counts and the genotype factor into a data frame (one row per sample)
expression_data <- data.frame(counts_transcript_a, genotype)
#> expression_data
#  counts_transcript_a  genotype
#1                 250 wild_type
#2                 157 wild_type
#3                 155 wild_type
#4                 300 wild_type
#5                 125    mutant
#6                 100    mutant
#7                 153    mutant
#8                 175    mutant

expression_data$counts_transcript_a  # extract one column of the data frame

ls()  # names of the objects in the current environment

# rm(list = ls()) deletes every object in the environment, including
# expression_data, which the tapply() example further down still needs.
# It is left commented out; run it by hand only when you want a clean workspace.
# rm(list = ls())


### write functions

# Standard error of the mean: sd(vector) / sqrt(number of values).
#   vector: numeric vector of observations
#   na.rm:  if TRUE, missing values are dropped before both the standard
#           deviation and the count are computed; the default FALSE keeps the
#           previous behaviour (any NA in the input gives NA)
# Returns a single numeric value.
StdErr <- function(vector, na.rm = FALSE) {
    if (na.rm) {
        # drop NAs up front so length() counts only the observed values
        vector <- vector[!is.na(vector)]
    }
    sd(vector) / sqrt(length(vector))
}


# Coefficient of variation: sd(vector) / mean(vector).
#   vector: numeric vector of observations
#   na.rm:  if TRUE, missing values are dropped before the standard deviation
#           and the mean are computed; the default FALSE keeps the previous
#           behaviour (any NA in the input gives NA)
# Returns a single numeric value.
CoefVar <- function(vector, na.rm = FALSE) {
    if (na.rm) {
        vector <- vector[!is.na(vector)]
    }
    sd(vector) / mean(vector)
}


# apply families http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/

# with() evaluates an expression using the columns of a data frame as
# variables; here tapply() computes the mean count within each genotype

with(
    expression_data,
    tapply(X = counts_transcript_a, INDEX = genotype, FUN = mean)
)
# wild_type    mutant
#    215.50    138.25

# some commonly used functions; each line opens the help page for one of them
# (calling them with no arguments, e.g. head(), is an error)

?head  # shows the first 6 rows/elements by default (Unix head shows 10 lines)
?table
?rownames
?colnames
?nrow
?ncol
?by
?with
?rowSums
?rowMeans
?summary


# construct sequences
one_to_20 <- 1:20

twenty_to_1 <- 20:1

# the step must be passed as by = 0.5; this gives 1.0, 1.5, ..., 20.0 (39 values)
seq1 <- seq(from = 1, to = 20, by = 0.5)

# or seq1 <- seq(1, 20, 0.5)

# repeat values

many_2 <- rep(2, times = 20)

many_a <- rep("a", times = 20)

seq_rep <- rep(1:10, times = 2)

rep_3_times <- rep(c(1, 2, 3), times = 3)  # 1 2 3 1 2 3 1 2 3

# "each" repeats every element in turn instead of the whole vector

rep_each_3_times <- rep(c(1, 2, 3), each = 3)  # 1 1 1 2 2 2 3 3 3


# to do:  subsetting for vectors and matrix
# R mark-down
	#2014 MSU NGS R basics tutorial
	#http://angus.readthedocs.org/en/2014/R_Introductory_tutorial_2014.html
	#https://github.com/jrherr/quick_basic_R_tutorial/blob/master/R_tutorial.md

	#pick one language, and learn it well!

	#pick up a dataset, play with it!

	#object-oriented programming
	#functional programming

	#deal with big data in R: (R holds all the data in memory)
	#http://theodi.org/blog/fig-data-11-tips-how-handle-big-data-r-and-1-bad-pun
	#http://r-pbd.org/

	#packages: plyr, dplyr, ggplot2, reshape2, data.table (fread function)

	# commands start here!!

	# Session housekeeping commands.
	# q() ends the R session, so it is shown commented out: left active in the
	# script it would stop R before any of the following lines run.
	# q() # quit R

	getwd() # get the current working directory
	# setwd() needs the target directory as its argument; calling it with no
	# argument is an error, so an example call is shown instead:
	# setwd("path/to/your/project") # set working directory

	y <- 2 # bind the value 2 to the name y

	x <- 3

	x + y # R works as a calculator: 5

	x * y # 6

	x / y # 1.5

	x %% y # modulo (remainder of x divided by y): 1

	x == 3 # `==` tests equality and yields a logical value, here TRUE

	# logical values coerce to numbers in arithmetic: TRUE counts as 1, FALSE as 0

	# exponentiation can be written ** or ^
	2**2 # 4
	2^2 # 4

	log(2.7) # natural logarithm, about 0.99325
	log(4, 2) # logarithm of 4 in base 2, i.e. 2

	a <- c(2, 3, 6, 8) # c() combines its arguments into a vector
	b <- c(3, 5, 6, 7)

	a + b # element-wise sum: 5 8 12 15
	a * b # element-wise product: 6 15 36 56

	# if a and b differ in length, R recycles the shorter one; it warns only when
	# the longer length is not a multiple of the shorter length

	length(a) # number of elements in a: 4

	new_varaible <- c(a, b) # c() also joins two vectors end to end

	crap <- rep(1:100) # the integers 1 to 100 (indexing in R starts at 1)

	rm(crap) # delete this object from the workspace


	?lm # open the help page for lm(), the linear model fitting function

	# a few summary functions
	a <- c(2, 3, 6, 8)
	mean(a) # 4.75
	sum(a) # 19
	var(a) # sample variance

	b <- c(3, 5, 6, 7)
	cor(a, b) # Pearson correlation between the two vectors

	m <- cbind(a, b) # bind the vectors side by side as the columns of a matrix

	m

	# printed result:
	#      a b
	# [1,] 2 3
	# [2,] 3 5
	# [3,] 6 6
	# [4,] 8 7

	cor(m) # Pearson correlation matrix of the columns of m
	cor(m, method = "spearman") # same, using Spearman rank correlation
	?cor

	mode(a) # "numeric"

	class(a) # "numeric"

	class(m) # "matrix" (R >= 4.0 prints "matrix" "array")

	typeof(m) # "double"

	str(m) # compact display of the structure of m; try it in your R console
	dim(m) # dimensions of m: 4 2

	nrow(m) # number of rows: 4
	ncol(m) # number of columns: 2
	length(m) # total number of elements: 8

	is.matrix(m) # TRUE

	# building a matrix from scratch (values fill column by column)
	#> m1 <- matrix(1:12, 3, 4)
	#> m1
	#     [,1] [,2] [,3] [,4]
	#[1,]    1    4    7   10
	#[2,]    2    5    8   11
	#[3,]    3    6    9   12


	# strings: character vectors and character matrices
	cities <- c("E.lansing", "Gainesville", "Shanghai", "Yichun")
	class(cities)
	#[1] "character"

	length(cities) # 4 cities in the vector

	nchar(cities) # number of characters in each city name
	#[1]  9 11  8  6

	sum(nchar(cities))
	#[1] 34

	rivers <- c("Red Cedar", "swamp", "Huang Pu", "Long He")

	# column-binding two character vectors gives a character matrix
	cities_rivers <- cbind(cities, rivers)

	cities_rivers
	#     cities        rivers
	#[1,] "E.lansing"   "Red Cedar"
	#[2,] "Gainesville" "swamp"
	#[3,] "Shanghai"    "Huang Pu"
	#[4,] "Yichun"      "Long He"

	class(cities_rivers) # "matrix" (R >= 4.0 prints "matrix" "array")
	mode(cities_rivers) # "character"


	# a formula object: y modelled by x1, x2 and their interaction x1:x2
	model_1 <- y ~ x1 + x2 + x1:x2
	model_1

	#> class(model_1)
	#[1] "formula"

	counts_transcript_a <- c(250, 157, 155, 300, 125, 100, 153, 175)
	# gl() generates a factor: n = 2 levels, each repeated k = 4 times in a row
	genotype <- gl(n = 2, k = 4, labels = c("wild_type", "mutant"))
	#> genotype
	#[1] wild_type wild_type wild_type wild_type mutant
	#[6] mutant    mutant    mutant
	#Levels: wild_type mutant

	# the same grouping can be built with factor() and rep() instead of gl()

	genotype1 <- factor(rep(c("wild_type", "mutant"), each = 4))
	#> genotype1
	#[1] wild_type wild_type wild_type wild_type mutant
	#[6] mutant    mutant    mutant
	#Levels: mutant wild_type

	# "each" repeats every element in turn; without it the whole vector is repeated

	genotype1 <- factor(rep(c("wild_type", "mutant"), 4))
	#> genotype1
	#[1] wild_type mutant    wild_type mutant    wild_type
	#[6] mutant    wild_type mutant
	#Levels: mutant wild_type


	# note that factor() sorts the levels alphabetically, unlike the gl() result

	# to make wild_type the base (first) level, give the levels explicitly:

	genotype1 <- factor(
		rep(c("wild_type", "mutant"), each = 4),
		levels = c("wild_type", "mutant")
	)
	#> genotype1
	#[1] wild_type wild_type wild_type wild_type mutant    mutant    mutant    mutant
	#Levels: wild_type mutant

	?relevel # relevel() is another way to change the base level


	# combine the counts and the genotype factor into a data frame (one row per sample)
	expression_data <- data.frame(counts_transcript_a, genotype)
	#> expression_data
	#  counts_transcript_a  genotype
	#1                 250 wild_type
	#2                 157 wild_type
	#3                 155 wild_type
	#4                 300 wild_type
	#5                 125    mutant
	#6                 100    mutant
	#7                 153    mutant
	#8                 175    mutant

	expression_data$counts_transcript_a # extract one column of the data frame

	ls() # names of the objects in the current environment

	# rm(list = ls()) deletes every object in the environment, including
	# expression_data, which the tapply() example further down still needs.
	# It is left commented out; run it by hand only when you want a clean workspace.
	# rm(list = ls())


	### write functions

	# Standard error of the mean: sd(vector) / sqrt(number of values).
	#   vector: numeric vector of observations
	#   na.rm:  if TRUE, missing values are dropped before both the standard
	#           deviation and the count are computed; the default FALSE keeps the
	#           previous behaviour (any NA in the input gives NA)
	# Returns a single numeric value.
	StdErr <- function(vector, na.rm = FALSE) {
		if (na.rm) {
			# drop NAs up front so length() counts only the observed values
			vector <- vector[!is.na(vector)]
		}
		sd(vector) / sqrt(length(vector))
	}


	# Coefficient of variation: sd(vector) / mean(vector).
	#   vector: numeric vector of observations
	#   na.rm:  if TRUE, missing values are dropped before the standard deviation
	#           and the mean are computed; the default FALSE keeps the previous
	#           behaviour (any NA in the input gives NA)
	# Returns a single numeric value.
	CoefVar <- function(vector, na.rm = FALSE) {
		if (na.rm) {
			vector <- vector[!is.na(vector)]
		}
		sd(vector) / mean(vector)
	}


	# apply families http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/

	# with() evaluates an expression using the columns of a data frame as
	# variables; here tapply() computes the mean count within each genotype

	with(
		expression_data,
		tapply(X = counts_transcript_a, INDEX = genotype, FUN = mean)
	)
	# wild_type    mutant
	#    215.50    138.25

	# some commonly used functions; each line opens the help page for one of them
	# (calling them with no arguments, e.g. head(), is an error)

	?head # shows the first 6 rows/elements by default (Unix head shows 10 lines)
	?table
	?rownames
	?colnames
	?nrow
	?ncol
	?by
	?with
	?rowSums
	?rowMeans
	?summary


	# construct sequences
	one_to_20 <- 1:20

	twenty_to_1 <- 20:1

	# the step must be passed as by = 0.5; this gives 1.0, 1.5, ..., 20.0 (39 values)
	seq1 <- seq(from = 1, to = 20, by = 0.5)

	# or seq1 <- seq(1, 20, 0.5)

	# repeat values

	many_2 <- rep(2, times = 20)

	many_a <- rep("a", times = 20)

	seq_rep <- rep(1:10, times = 2)

	rep_3_times <- rep(c(1, 2, 3), times = 3) # 1 2 3 1 2 3 1 2 3

	# "each" repeats every element in turn instead of the whole vector

	rep_each_3_times <- rep(c(1, 2, 3), each = 3) # 1 1 1 2 2 2 3 3 3



	# to do: subsetting for vectors and matrix
	# R mark-down