yulijia/CB2BioComp_notebook.md

## CB2BioComp_notebook.md

      
    Raw
  

              CB2BioComp_notebook.md
            
          
    revisited

R

google n-gram
donot do
python
GATK java
recommand book An Introduction to Statistical Learning with Applications in R
$\latex$
CTAN
CPAN
CRAN
Biocondutor
yum install -y epel-release
yum install R
download https://download1.rstudio.org/rstudio-1.0.44-x86_64.rpm


inHybrid
Hybrid
compiled


Python
Java
C


R
Perl
C++


Bash


su -c "yum install rstudio-1.0.44-x86_64.rpm"
#!/usr/bin/Rscript
cat("hello world\n")


export PATH=$PATH=~/ourbin

getwd()
setwd("/home/cb2user/Download")
a<-2
a + 2
b <- 3
myfunction1<-function(x){
  cat(x)
}

x<-"Hello World\n"
myfunction1(x)

x=100
myfunction1<-function(x){
  v=rnorm(x)
  mean(v)
}
myfunction1(x)

x=100

myfunction=function(x){
  v<<-rnorm(x)
  mean(v)
}
myfunction(x)
v

http://stackoverflow.com/questions/10904124/global-and-local-variables-in-r
myfunction2<-function(y){
  cat(y)
}

rseek.org

killall rstudio

install.packages("ggplot2")

vector()
list()
x=1:20
x=5:50
class(x)
x[1]

y=c("a","b","c")
class(y)
p=c("1","2","3")
class(p)
z=c(1,2,3)
class(z)

y[1]

list()
l=c(1,"a",2,"b")
c() # Combine Values into a Vector or List
l=list(1,"a",2,"b")
y[3]="d"
l[[3]]=function(){cat("hello")}
l[[3]]()

l=list()

for(i in 1:10){
  cat(i,"\n")
  l[[i]]=i
}

for(i in 1:length(l)){
  cat(l[[i]],"\n")
}

words=c("hello","world")
for(i in words){
  cat(i," ")
}

words=c("Hello","and hi","World")
l=list()
for(i in 1:length(words)){
  if(i==3 && (words[i]=="World" || words[i]=="world")){
    l[[i]]="America!"
  }else{
    l[[i]]=words[i]
  }
}
for(i in l){
  cat(i," ")
}


day2

8bit, 16bit, linked list
class(v)

v=list("a"="A","b"="B","c"=c(1,2,3))
typeof(v)

t=TRUE
f=FALSE
n=c(1:10)
n[n>5|n<2]
n=1:20
n[n>=10& n<=15]
n[n!=5 & n!=6]
n[n!=c(5,6)]
n[!(n%in%c(5,6))]
f=factor(c("yes","yes","no"))
table(f)
as.character(n)
m=matrix(n,nrow=2)
matrix(1:20,byrow = T,nrow=2)
n=1:10
dim(n)=c(2,5)
m[1,]
m[,2]
m[,-4]

http://www.programiz.com/r-programming/precedence-associativity
https://stat.ethz.ch/R-manual/R-devel/library/base/html/Logic.html
https://stat.ethz.ch/R-manual/R-devel/library/base/html/Syntax.html
data.frame
data.table
dplyr
d=data.frame("X"=c(1,3,3),Y=c(5,6,7))
d[,1]
d$X

d=data.frame("X"=c(1,3,3),Y=c(5,6,7),"Z"=c(T,T,F))

data(iris)
View(iris)
dim(iris)
write.table(iris,"/home/cb2user/iris.txt",quote=F,row.names=F)

cat iris.txt|column -t

myiris=read.table("iris.txt",header=T)

gzip iris.txt
zmyiris=read.table("iris.txt.gz",header=T,na.strings="-")

test=c(1,2,NA,3)
sum(test)
is.na(test)
test=test[!is.na(test)]
sum(test)

NA, NaN
regular expression

\n
\d
\s
\S
\d+ plus means one or more
(.*)
^>gi|(\d+)|ref|(\S+)| $
fh=file("/media/sf_CB2/dm/dm.faa",open="r")
while (length (line=readLines(fh,warn=F,n=1))>0){
  pattern="^\\>gi\\|(\\d+)\\|ref\\|(\\S+)\\|"
  m=regexec(pattern,line,perl=T)
  v=regmatches(line,m)
  vec=v[[1]]
  cat(vec[2],vec[3],sep="\t")
}
close(fh)

str(v)

fh=file("/media/sf_CB2/dm/dm.faa",open="r")
while (length (line=readLines(fh,warn=F,n=1))>0){
  pattern="^\\>gi\\|(\\d+)\\|ref\\|(\\S+)\\|"
  #m=regexec(pattern,line,perl=T)
  #m=mymatch(pattern,line)
  #if(m[[1]][1]==-1){
  #  next
  #}
  #if(!(ifmatched(m))){
  #  next
  #}
  m=mymatch(commapattern,line)
  if(!ifmatched(m)){
    my=mymatch(pattern,line)
  }
  if(!(ifmatched(m))){
    next
  }
  cat(getstring(line,m,2))
  cat("\n")
  #v=regmatches(line,m)
  #vec=v[[1]]
  #cat(vec[2],vec[3],sep="\t")
  #cat("\n")
}
close(fh)

pattern="\\|\\s+(.\*)\\s+\\["

commapattern="\\|\\s+(.\*)\\,\\s+\\["


getstring=function(text,match,index){
  v=regmatches(text,match)
  vec=v[[1]]
  return(vex[[index]])
}

mymatch=function(pattern,text){
  m=regexec(patter,text,perl=T)
  return(m)
}

ifmatched=function(m){
  if(m[[1]][1]==-1){
    return(FALSE)
  }else{
    return(TRUE)
  }
}

v=rnorm(100)
median(v)
mean(v)
hist(v)

normal distribution, p-value  
CVS SVN GIT
su -c "yum install git"
git clone https://github.com/cb2edu/CB2-101-BioComp.git
mkdir my_git_project
cd my_git_project
git init
echo "hello" > hello.txt
git add hello.txt
git status
git config --global user.email "email@gmail.com"
git config --global user.name "username"
git commit -m "Added new file Hello.txt"
git log
sha

md5sum hello.txt > md5sum.txt
git add hello.txt
git add md5sum.txt
git status
git log -a
git checkout 2c2f1
n=1:10
for(i in n){
  cat(i)
  cat("\n")
}

if (!length(line)>0){
  next
}


data("airquality")
mean(airquality$Ozone)
mean(airquality$Ozone[is.na(airquality$Ozone)])

d=airquality$Ozone
index=is.na(airquality$Ozone)
clean_idx=!index
clean_value=d[clean_index]
mean(clean_value)

mean(airquality$Ozone,na.rm=T)

summary(airquality)

normal distribution  always mean=median

plot(airquality)

plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone")

plot(airquality$Temp,airquality$Ozone,type="n",xlab="",ylab="",axes=F)
points(airquality$Temp,airquality$Ozone,pch=16)
axis(1)
axis(2)
box()
title(main="Temp vs Ozone",xlab="Temp",ylab="Ozone")
legend("topleft",legend="Some points",pch=16)

par(mfrow=c(1,2))
plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone")
plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone")

#plot(airquality$Temp,airquality$Ozone,type="n",xlab="",ylab="",axes=F)
#plot(airquality$Temp,airquality$Ozone,type="o",xlab="",ylab="",axes=F)
#plot(airquality$Temp,airquality$Ozone,type="l",xlab="",ylab="",axes=F)
#plot(airquality$Temp,airquality$Ozone,type="p",xlab="",ylab="",axes=F)
#plot(airquality$Temp,airquality$Ozone,type="p",xlab="",ylab="",axes=T)

difference between vector and bitmap
eps
ps
pdf
svg
pdf("test.pdf",paper="letter")
plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone")
dev.off()

png()
svg()


su -c "yum install evince"
hist(airquality$Ozone)
hist(airquality$Ozone, breaks=50)
boxplot(airquality)

MAD
Outliers: http://www.stat.wmich.edu/s160/book/node8.html
data("mtcars")
View(mtcars)

counts=table(mtcars$gear)
barplot(counts,xlab="Gears",ylab="Freq",main="Grears",horiz=T)

counts=table(mtcars$cyl,mtcars$gear)

barplot(counts,beside=T,legend=rownames(counts))

rownames(counts)

rnorm(10)

rnorm(10,sd=1,mean=5)

data=list(a=rnorm(10,sd=1,mean=5),b=rnorm(10,sd=1,mean=5),c=rnorm(10,sd=1,mean=5))

data=list(rnorm(10,sd=1,mean=5),rnorm(10,sd=1,mean=5),rnorm(10,sd=1,mean=5))
stds=c()
means=c()

sd(data[[1]])

for(i in 1:3){
  data[[i]]=rnorm(10,mean=5,sd=1)
  stds[i]=sd(data[[i]])
  means[i]=mean(data[[i]])
}

bp=barplot(means,names=c(1:3),ylim=c(0,ceiling(max(means+stds))),col=nice) # median points
arrows(x0=bp,y0=means-stds,x1=bp,y1=means+stds, code=3,angle=90)

install.packages("RColorBrewer")
library("RColorBrewer")
display.brewer.all()

nice=brewer.pal(3,"Pastel2")


Reproducible_research

https://en.wikipedia.org/wiki/Johannes_Gutenberg
https://en.wikipedia.org/wiki/Hermann_Zapf
https://en.wikipedia.org/wiki/Donald_Knuth
$$\TeX$$
https://en.wikipedia.org/wiki/Leslie_Lamport
pdfTeX
https://en.wikipedia.org/wiki/PdfTeX
XeTeX
https://tug.org/interviews/kew.html
pandoc
markdown
\documentclass{article}
\title{My first document in \LaTeX}
\author{Malay}
\begin{document}
\maketitle
\section{First section}
\subsection{A subsection}
blablabla
\section{Second section}
Second document
Math
$$\frac{1}{2}$$
\end{document}
su -c "yum install pandoc"

title: My first document in Markdown
author: Malay

First section

this is first section.
Second section

This is second section
This is a subsection

blablablablablabla
pandoc --number-sections -o hello_markdown.pdf hello.md
pandoc -o hello_markdown.docx hello.md
Zotero
https://guides.github.com/features/mastering-markdown/
su -c "yum install texlive-framed"
su -c "yum install texlive-titling"
regression gradient descent
data(women)
head(women)
dim(women)

plot(women)
m=lm(women$weight~women$height)
#regression(prediction，dependence of x y) and correlation(翻转xy不变) difference
s=summary(m)
abline(v=65,col="Blue")
abline(h=140,col="Green")
abline(m,col="red")
s$adj.r.squared
s$coefficients
s$coefficients[2,1]
s$coefficients[1,1]

d=data.frame()

data("airquality")
ncol=dim(airquality)[2]
names=colnames(airquality)

for(i in 1:ncol){
  for(j in 1:ncol){
    if(i==j){
      next
    }
    x=airquality[,i]
    y=airquality[,j]

    m=lm(y~x)
    s=summary(m)
    r.sq=s$adj.r.squared
    d=rbind(d,data.frame(names[i],names[j],r.sq))
    cat(names[i],names[j],r.sq,sep="\t")
    cat("\n")
  }
}

o=order(d$r.sq,decreasing=T)
d=d[o,]

lm(airquality$Ozone~airquality$Temp)
# http://stackoverflow.com/questions/10300769/how-to-load-packages-in-r-automatically

library(pheatmap)
index=complete.cases(airquality)
c=airquality[index,]
C=cor(c)
pheatmap(C)

ct=cor.text(airquality$Ozone,airquality$Temp)

SEquence search blast

wget ftp://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/uniprot_sprot.fasta.gz
homoplasy
homology
entroy KL divergence Bayes chain
Markov
HMM
hmmer
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/legacy/2.2.26/blast-2.2.26-x64-linux.tar.gz
tar and zip
tar cvzf cvJf cvjf
c for creation x for unzip
tar zxvf blast-2.2.26*
export PATH=$PATH:pwd
http://www.uniprot.org/uniprot/P04637.fasta
wget -O p53.fas http://www.uniprot.org/uniprot/P04637.fasta
formatdb -i uniprot_sprot.fasta -n uniprot
blastall -pblastp -i p53.fas -d uniprot -o p53.blastp.out -m 9
Hmmer

e-value is based on the database size.
wget ftp://ftp.ncbi.nlm.nih.gov/pub/HomoloGene/current/homologene.data
d=read.table("/media/sf_CB2/hmm/homologene.data",sep="\t")
cluster=d[d$V1==460,]
write.table(cluster,"p53_homologene.txt",sep="\t",quote=F,row.names=F)
\n break
REST
API
#!/bin/bash
file=$1
for i in cat $file | cut -f 6
do
wget -q -O - "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=$i&rettype=fasta&retmode=text"
done
./download_Seq.sh p53_homologene.txt > p53.fa
evince & # program running in the background
fg bring the software to the frontground
seq alignment

muscle
export PATH=$PATH:pwd
muscle -in p53.fa -out p53.aln
pushd
popd
hmmbuikld --informat afa p53.hmm p53.aln
ln -s /media/sf_CB2/uniprot/uniprot_sprot.fasta .
hmmsearch -o hits.txt /media/sf_CB2/hmm/p53.hmm uniprot_sprot.fasta
su -c "yum install libxml2 libxml2-devel libcurl libcurl-devel"
source("https://bioconductor.org/biocLite.R")

biocLite("Homo.sapiens")


library("Homo.sapiens")

columns(Homo.sapiens)
keytypes(Homo.sapiens)
genename=keys(Homo.sapiens,keytype="SYMBOL")
gene.list=select(Homo.sapiens,keys=genenames,keytype="SYMBOL",columns=c("TXCHROM"))

gene.freq=table(gene.df$TXCHROM)
barplot(gene.freq,las=2)

library(GenomicFeatures)
chr.info=getChromInFromUCSC("hg19")
gene.freq=data.frame(gene.freq)
names(gene.freq)=c("chr","freq")
merged.data=merge(gene.freq,chr.info,by.x="chr",by.y="chrom")


plot(merged.data$length,merged.data$freq,main="Lenth vs gene count",xlab="Chromosome length",ylab="Gene count")

text(merged.data$length,merged.data$freq,cex=0.6,pos=4,col="red")

m=lm(merged.data$freq~merged.data$length)

summary(m)
abline(m)

cor.test(merged.data$length,merged.data$freq)

#!/usr/bin/Rscript


mymatch <- function (pattern, text) {
  m <- regexec( pattern, text, perl = TRUE)
  return(m)
}

ifmatched <- function (m) {
  if (m[[1]][1] == -1) {
    return(FALSE)
  }else {
    return(TRUE)
  }
}

getstring <- function (text, match, index) {
  v <- regmatches(text, match)
  vec <- v[[1]]
  return(vec[index])
}

###### MAIN ENTRY #############

cmd=commandArgs(trailingOnly=TRUE)
id=cmd[1]
filename=cmd[2]

fh <- file(filename,open="r")

inside=FALSE

while (length(line <- readLines(fh,warn=FALSE, n=1))>0){

    idpattern="\\>(\\S+)\\s+"

    #commapattern <- "\\|\\s+(.*)\\,\\s+"
    #pattern <- "\\|\\s+(.*)\\s+\\["

    m <- mymatch (idpattern, line)

    if (!(ifmatched(m))) {
      #next
      if(inside){
        seq=paste(seq,line)
      }else{
        next
      }
    }else{
      #cat(getstring(line, m, 2))
      fileis=getstring(line,m,2)
      if(fileid==id){
        seq=paste(seq,line)
        inside=TRUE
      }else{
        if(inside){
          cat(seq)
          break
        }
      }
    }
    #cat("\n")
}
close(fh)

cat(seq)
#################

#!/usr/bin/Rscript

mymatch <- function (pattern, text) {
  m <- regexec( pattern, text, perl = TRUE)
  return(m)
}

ifmatched <- function (m) {
  if (m[[1]][1] == -1) {
    return(FALSE)
  }else {
    return(TRUE)
  }
}

getstring <- function (text, match, index) {
  v <- regmatches(text, match)
  vec <- v[[1]]
  return(vec[index])
}

###### MAIN ENTRY #########

cmd <- commandArgs(trailingOnly = TRUE)
id <- cmd[1]
filename <- cmd[2]


fh <- file(filename,open="r")

inside <- FALSE
seq <- c("")

while ( length(line <- readLines(fh,warn=FALSE, n=1))> 0 ){

  idpattern <- "\\>(\\S+)\\s+"

  #commapattern <- "\\|\\s+(.*)\\,\\s+"
  #pattern <- "\\|\\s+(.*)\\s+\\["

  m <- mymatch (idpattern, line)
  if (!(ifmatched(m))) {
    if (inside) {
      seq <- paste(seq, line)
    }else {
      next
    }
  }else {
    # cat (getstring(line, m, 2))
    fileid <- etstring(line, m, 2)
    if (fileid == id) {
      seq <- paste(seq, line)
      inside <- TRUE
    }else {
      if (inside) {
        cat (seq)
        break
      }
    }
  }

}
close(fh)

cat (seq)


sequence

e=seq(0,60,10)
a=1-10^(-e/10)
plot(e,a,xlab="Phred score",ylab="Accuracy")
lines(e,a)
abline(v=20,col="red")

su -c "yum install java-1.8.0-openjdk"
download fastqc_v0.11.5.zip and install it
download trimmomatic and install it
jave -jar trimmomatic-0.36.jar PE adrenal_R1.fq adrenal_R2.fq adrenal_trimmed_R1.fq adrenal_R1_u.fq adrenal_trimmed_R2.fq adrenal_R2_u.fq ILLUMINACLIP:Trimmomatic-0.36/adapters/TruSeq3-PE.fa:2:30:10 LEADING:2 TRAILING:2 SLIDINGWINDOW:4:15 MINLEN:30
sequenceing

Sailfish: Rapid Alignment free Quantification of Isoform Abundance
mutation of DNA

transition transversion
heraclitean fire chargaff
pyrimidines,purines
assembly is different from alignment
GATK and Freebayes
https://udall-lab.byu.edu/Research/Software/BamBam
file 文件名 #查看文件类型
yum provides aclocal
autogen.sh=>configure=>makefile=>make
perl script vcf-annotate (in vcftools)
RNA seq

RPKM 在样本之间不能比较
RSEM=> TPM value normalized the RPKM, but also cannot compared between multiple samples
样本间
edgeR TMM normalizion
DeSeq normalizion
Up Quantile normalizaion 表达量数据是非线性的，所以用quantile的好，median是统计线性的数据
If there are only 2 different conditions among the samples, four statistics (columns) will be reported for each gene/transcript. They are "PPEE", "PPDE", "PostFC" and "RealFC". "PPEE" is the posterior probability (estimated by EBSeq) that a gene/transcript is equally expressed. "PPDE" is the posterior probability that a gene/transcript is differentially expressed. "PostFC" is the posterior fold change (condition 1 over condition2) for a gene/transcript. It is defined as the ratio between posterior mean expression estimates of the gene/transcript for each condition. "RealFC" is the real fold change (condition 1 over condition2) for a gene/transcript. It is the ratio of the normalized within condition 1 mean count over normalized within condition 2 mean count for the gene/transcript. Fold changes are calculated using EBSeq's 'PostFC' function. The genes/transcripts are reported in descending order of their "PPDE" values.
4 control 3 treatment，之间的fold change， 统计的是拟合的两条线之间的fold change