{{ message }}

Instantly share code, notes, and snippets.

# yulijia/CB2BioComp_notebook.md

Last active Feb 23, 2017
CB2 BioComp note

# R

donot do

python

GATK java

$\latex$

CTAN CPAN CRAN Biocondutor

yum install -y epel-release

yum install R

inHybrid Hybrid compiled
Python Java C
R Perl C++
Bash

su -c "yum install rstudio-1.0.44-x86_64.rpm"

#!/usr/bin/Rscript
cat("hello world\n")

export PATH=$PATH=~/ourbin getwd() setwd("/home/cb2user/Download") a<-2 a + 2 b <- 3 myfunction1<-function(x){ cat(x) } x<-"Hello World\n" myfunction1(x) x=100 myfunction1<-function(x){ v=rnorm(x) mean(v) } myfunction1(x) x=100 myfunction=function(x){ v<<-rnorm(x) mean(v) } myfunction(x) v  http://stackoverflow.com/questions/10904124/global-and-local-variables-in-r myfunction2<-function(y){ cat(y) } rseek.org killall rstudio install.packages("ggplot2") vector() list() x=1:20 x=5:50 class(x) x[1] y=c("a","b","c") class(y) p=c("1","2","3") class(p) z=c(1,2,3) class(z) y[1] list() l=c(1,"a",2,"b") c() # Combine Values into a Vector or List l=list(1,"a",2,"b") y[3]="d" l[[3]]=function(){cat("hello")} l[[3]]() l=list() for(i in 1:10){ cat(i,"\n") l[[i]]=i } for(i in 1:length(l)){ cat(l[[i]],"\n") } words=c("hello","world") for(i in words){ cat(i," ") } words=c("Hello","and hi","World") l=list() for(i in 1:length(words)){ if(i==3 && (words[i]=="World" || words[i]=="world")){ l[[i]]="America!" }else{ l[[i]]=words[i] } } for(i in l){ cat(i," ") }  ## day2 8bit, 16bit, linked list class(v) v=list("a"="A","b"="B","c"=c(1,2,3)) typeof(v) t=TRUE f=FALSE n=c(1:10) n[n>5|n<2] n=1:20 n[n>=10& n<=15] n[n!=5 & n!=6] n[n!=c(5,6)] n[!(n%in%c(5,6))] f=factor(c("yes","yes","no")) table(f) as.character(n) m=matrix(n,nrow=2) matrix(1:20,byrow = T,nrow=2) n=1:10 dim(n)=c(2,5) m[1,] m[,2] m[,-4]  http://www.programiz.com/r-programming/precedence-associativity https://stat.ethz.ch/R-manual/R-devel/library/base/html/Logic.html https://stat.ethz.ch/R-manual/R-devel/library/base/html/Syntax.html data.frame data.table dplyr d=data.frame("X"=c(1,3,3),Y=c(5,6,7)) d[,1] d$X

d=data.frame("X"=c(1,3,3),Y=c(5,6,7),"Z"=c(T,T,F))

data(iris)
View(iris)
dim(iris)
write.table(iris,"/home/cb2user/iris.txt",quote=F,row.names=F)

cat iris.txt|column -t



gzip iris.txt

zmyiris=read.table("iris.txt.gz",header=T,na.strings="-")

test=c(1,2,NA,3)
sum(test)
is.na(test)
test=test[!is.na(test)]
sum(test)


NA, NaN

## regular expression

\n \d \s \S \d+ plus means one or more (.*)

^>gi|(\d+)|ref|(\S+)| $fh=file("/media/sf_CB2/dm/dm.faa",open="r") while (length (line=readLines(fh,warn=F,n=1))>0){ pattern="^\\>gi\\|(\\d+)\\|ref\\|(\\S+)\\|" m=regexec(pattern,line,perl=T) v=regmatches(line,m) vec=v[[1]] cat(vec[2],vec[3],sep="\t") } close(fh) str(v) fh=file("/media/sf_CB2/dm/dm.faa",open="r") while (length (line=readLines(fh,warn=F,n=1))>0){ pattern="^\\>gi\\|(\\d+)\\|ref\\|(\\S+)\\|" #m=regexec(pattern,line,perl=T) #m=mymatch(pattern,line) #if(m[[1]][1]==-1){ # next #} #if(!(ifmatched(m))){ # next #} m=mymatch(commapattern,line) if(!ifmatched(m)){ my=mymatch(pattern,line) } if(!(ifmatched(m))){ next } cat(getstring(line,m,2)) cat("\n") #v=regmatches(line,m) #vec=v[[1]] #cat(vec[2],vec[3],sep="\t") #cat("\n") } close(fh) pattern="\\|\\s+(.\*)\\s+\\[" commapattern="\\|\\s+(.\*)\\,\\s+\\[" getstring=function(text,match,index){ v=regmatches(text,match) vec=v[[1]] return(vex[[index]]) } mymatch=function(pattern,text){ m=regexec(patter,text,perl=T) return(m) } ifmatched=function(m){ if(m[[1]][1]==-1){ return(FALSE) }else{ return(TRUE) } } v=rnorm(100) median(v) mean(v) hist(v)  normal distribution, p-value CVS SVN GIT su -c "yum install git" git clone https://github.com/cb2edu/CB2-101-BioComp.git mkdir my_git_project cd my_git_project git init echo "hello" > hello.txt git add hello.txt git status git config --global user.email "email@gmail.com" git config --global user.name "username" git commit -m "Added new file Hello.txt" git log #### sha md5sum hello.txt > md5sum.txt git add hello.txt git add md5sum.txt git status git log -a git checkout 2c2f1 n=1:10 for(i in n){ cat(i) cat("\n") } if (!length(line)>0){ next } data("airquality") mean(airquality$Ozone)
mean(airquality$Ozone[is.na(airquality$Ozone)])

d=airquality$Ozone index=is.na(airquality$Ozone)
clean_idx=!index
clean_value=d[clean_index]
mean(clean_value)

mean(airquality$Ozone,na.rm=T) summary(airquality) normal distribution always mean=median plot(airquality) plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone") plot(airquality$Temp,airquality$Ozone,type="n",xlab="",ylab="",axes=F) points(airquality$Temp,airquality$Ozone,pch=16) axis(1) axis(2) box() title(main="Temp vs Ozone",xlab="Temp",ylab="Ozone") legend("topleft",legend="Some points",pch=16) par(mfrow=c(1,2)) plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone") plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone") #plot(airquality$Temp,airquality$Ozone,type="n",xlab="",ylab="",axes=F) #plot(airquality$Temp,airquality$Ozone,type="o",xlab="",ylab="",axes=F) #plot(airquality$Temp,airquality$Ozone,type="l",xlab="",ylab="",axes=F) #plot(airquality$Temp,airquality$Ozone,type="p",xlab="",ylab="",axes=F) #plot(airquality$Temp,airquality$Ozone,type="p",xlab="",ylab="",axes=T)  difference between vector and bitmap eps ps pdf svg pdf("test.pdf",paper="letter") plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone") dev.off() png() svg()  su -c "yum install evince" hist(airquality$Ozone)
hist(airquality$Ozone, breaks=50) boxplot(airquality)  MAD Outliers: http://www.stat.wmich.edu/s160/book/node8.html data("mtcars") View(mtcars) counts=table(mtcars$gear)
barplot(counts,xlab="Gears",ylab="Freq",main="Grears",horiz=T)

counts=table(mtcars$cyl,mtcars$gear)

barplot(counts,beside=T,legend=rownames(counts))

rownames(counts)

rnorm(10)

rnorm(10,sd=1,mean=5)

data=list(a=rnorm(10,sd=1,mean=5),b=rnorm(10,sd=1,mean=5),c=rnorm(10,sd=1,mean=5))

data=list(rnorm(10,sd=1,mean=5),rnorm(10,sd=1,mean=5),rnorm(10,sd=1,mean=5))
stds=c()
means=c()

sd(data[[1]])

for(i in 1:3){
data[[i]]=rnorm(10,mean=5,sd=1)
stds[i]=sd(data[[i]])
means[i]=mean(data[[i]])
}

bp=barplot(means,names=c(1:3),ylim=c(0,ceiling(max(means+stds))),col=nice) # median points
arrows(x0=bp,y0=means-stds,x1=bp,y1=means+stds, code=3,angle=90)

install.packages("RColorBrewer")
library("RColorBrewer")
display.brewer.all()

nice=brewer.pal(3,"Pastel2")



# Reproducible_research

https://en.wikipedia.org/wiki/Johannes_Gutenberg

https://en.wikipedia.org/wiki/Hermann_Zapf

https://en.wikipedia.org/wiki/Donald_Knuth

$$\TeX$$

https://en.wikipedia.org/wiki/Leslie_Lamport

pandoc markdown

\documentclass{article} \title{My first document in \LaTeX} \author{Malay}

\begin{document} \maketitle \section{First section}

\subsection{A subsection} blablabla

\section{Second section} Second document

Math $$\frac{1}{2}$$

\end{document}

su -c "yum install pandoc"

# First section

this is first section.

# Second section

This is second section

## This is a subsection

blablablablablabla

pandoc --number-sections -o hello_markdown.pdf hello.md

pandoc -o hello_markdown.docx hello.md

Zotero

https://guides.github.com/features/mastering-markdown/

su -c "yum install texlive-framed" su -c "yum install texlive-titling"

data(women)
dim(women)

plot(women)
m=lm(women$weight~women$height)
#regression(prediction，dependence of x y) and correlation(翻转xy不变) difference
s=summary(m)
abline(v=65,col="Blue")
abline(h=140,col="Green")
abline(m,col="red")
s$adj.r.squared s$coefficients
s$coefficients[2,1] s$coefficients[1,1]

d=data.frame()

data("airquality")
ncol=dim(airquality)[2]
names=colnames(airquality)

for(i in 1:ncol){
for(j in 1:ncol){
if(i==j){
next
}
x=airquality[,i]
y=airquality[,j]

m=lm(y~x)
s=summary(m)
r.sq=s$adj.r.squared d=rbind(d,data.frame(names[i],names[j],r.sq)) cat(names[i],names[j],r.sq,sep="\t") cat("\n") } } o=order(d$r.sq,decreasing=T)
d=d[o,]

lm(airquality$Ozone~airquality$Temp)

library(pheatmap)
index=complete.cases(airquality)
c=airquality[index,]
C=cor(c)
pheatmap(C)

ct=cor.text(airquality$Ozone,airquality$Temp)


## SEquence search blast

wget ftp://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/uniprot_sprot.fasta.gz

homoplasy homology

entroy KL divergence Bayes chain Markov HMM

hmmer

wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/legacy/2.2.26/blast-2.2.26-x64-linux.tar.gz

tar and zip

tar cvzf cvJf cvjf

c for creation x for unzip

tar zxvf blast-2.2.26*

export PATH=$PATH:pwd http://www.uniprot.org/uniprot/P04637.fasta wget -O p53.fas http://www.uniprot.org/uniprot/P04637.fasta formatdb -i uniprot_sprot.fasta -n uniprot blastall -pblastp -i p53.fas -d uniprot -o p53.blastp.out -m 9 ## Hmmer e-value is based on the database size. wget ftp://ftp.ncbi.nlm.nih.gov/pub/HomoloGene/current/homologene.data d=read.table("/media/sf_CB2/hmm/homologene.data",sep="\t") cluster=d[d$V1==460,] write.table(cluster,"p53_homologene.txt",sep="\t",quote=F,row.names=F)

\n break

REST API

#!/bin/bash

file=$1 for i in cat$file | cut -f 6 do wget -q -O - "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=i&rettype=fasta&retmode=text" done ./download_Seq.sh p53_homologene.txt > p53.fa evince & # program running in the background fg bring the software to the frontground ### seq alignment muscle export PATH=PATH:pwd

muscle -in p53.fa -out p53.aln

pushd popd

hmmbuikld --informat afa p53.hmm p53.aln

ln -s /media/sf_CB2/uniprot/uniprot_sprot.fasta .

hmmsearch -o hits.txt /media/sf_CB2/hmm/p53.hmm uniprot_sprot.fasta

su -c "yum install libxml2 libxml2-devel libcurl libcurl-devel"

source("https://bioconductor.org/biocLite.R")

biocLite("Homo.sapiens")

library("Homo.sapiens")

columns(Homo.sapiens)
keytypes(Homo.sapiens)
genename=keys(Homo.sapiens,keytype="SYMBOL")
gene.list=select(Homo.sapiens,keys=genenames,keytype="SYMBOL",columns=c("TXCHROM"))

gene.freq=table(gene.df$TXCHROM) barplot(gene.freq,las=2) library(GenomicFeatures) chr.info=getChromInFromUCSC("hg19") gene.freq=data.frame(gene.freq) names(gene.freq)=c("chr","freq") merged.data=merge(gene.freq,chr.info,by.x="chr",by.y="chrom") plot(merged.data$length,merged.data$freq,main="Lenth vs gene count",xlab="Chromosome length",ylab="Gene count") text(merged.data$length,merged.data$freq,cex=0.6,pos=4,col="red") m=lm(merged.data$freq~merged.data$length) summary(m) abline(m) cor.test(merged.data$length,merged.data\$freq)

#!/usr/bin/Rscript

mymatch <- function (pattern, text) {
m <- regexec( pattern, text, perl = TRUE)
return(m)
}

ifmatched <- function (m) {
if (m[[1]][1] == -1) {
return(FALSE)
}else {
return(TRUE)
}
}

getstring <- function (text, match, index) {
v <- regmatches(text, match)
vec <- v[[1]]
return(vec[index])
}

###### MAIN ENTRY #############

cmd=commandArgs(trailingOnly=TRUE)
id=cmd[1]
filename=cmd[2]

fh <- file(filename,open="r")

inside=FALSE

idpattern="\\>(\\S+)\\s+"

#commapattern <- "\\|\\s+(.*)\\,\\s+"
#pattern <- "\\|\\s+(.*)\\s+\\["

m <- mymatch (idpattern, line)

if (!(ifmatched(m))) {
#next
if(inside){
seq=paste(seq,line)
}else{
next
}
}else{
#cat(getstring(line, m, 2))
fileis=getstring(line,m,2)
if(fileid==id){
seq=paste(seq,line)
inside=TRUE
}else{
if(inside){
cat(seq)
break
}
}
}
#cat("\n")
}
close(fh)

cat(seq)
#################

#!/usr/bin/Rscript

mymatch <- function (pattern, text) {
m <- regexec( pattern, text, perl = TRUE)
return(m)
}

ifmatched <- function (m) {
if (m[[1]][1] == -1) {
return(FALSE)
}else {
return(TRUE)
}
}

getstring <- function (text, match, index) {
v <- regmatches(text, match)
vec <- v[[1]]
return(vec[index])
}

###### MAIN ENTRY #########

cmd <- commandArgs(trailingOnly = TRUE)
id <- cmd[1]
filename <- cmd[2]

fh <- file(filename,open="r")

inside <- FALSE
seq <- c("")

while ( length(line <- readLines(fh,warn=FALSE, n=1))> 0 ){

idpattern <- "\\>(\\S+)\\s+"

#commapattern <- "\\|\\s+(.*)\\,\\s+"
#pattern <- "\\|\\s+(.*)\\s+\\["

m <- mymatch (idpattern, line)
if (!(ifmatched(m))) {
if (inside) {
seq <- paste(seq, line)
}else {
next
}
}else {
# cat (getstring(line, m, 2))
fileid <- etstring(line, m, 2)
if (fileid == id) {
seq <- paste(seq, line)
inside <- TRUE
}else {
if (inside) {
cat (seq)
break
}
}
}

}
close(fh)

cat (seq)



## sequence

e=seq(0,60,10)
a=1-10^(-e/10)
plot(e,a,xlab="Phred score",ylab="Accuracy")
lines(e,a)
abline(v=20,col="red")


su -c "yum install java-1.8.0-openjdk"

## sequenceing

Sailfish: Rapid Alignment free Quantification of Isoform Abundance

### mutation of DNA

transition transversion heraclitean fire chargaff

pyrimidines,purines

assembly is different from alignment

GATK and Freebayes

https://udall-lab.byu.edu/Research/Software/BamBam

file 文件名 #查看文件类型

yum provides aclocal

autogen.sh=>configure=>makefile=>make

perl script vcf-annotate (in vcftools)

## RNA seq

RPKM 在样本之间不能比较

RSEM=> TPM value normalized the RPKM, but also cannot compared between multiple samples

DeSeq normalizion Up Quantile normalizaion 表达量数据是非线性的，所以用quantile的好，median是统计线性的数据

If there are only 2 different conditions among the samples, four statistics (columns) will be reported for each gene/transcript. They are "PPEE", "PPDE", "PostFC" and "RealFC". "PPEE" is the posterior probability (estimated by EBSeq) that a gene/transcript is equally expressed. "PPDE" is the posterior probability that a gene/transcript is differentially expressed. "PostFC" is the posterior fold change (condition 1 over condition2) for a gene/transcript. It is defined as the ratio between posterior mean expression estimates of the gene/transcript for each condition. "RealFC" is the real fold change (condition 1 over condition2) for a gene/transcript. It is the ratio of the normalized within condition 1 mean count over normalized within condition 2 mean count for the gene/transcript. Fold changes are calculated using EBSeq's 'PostFC' function. The genes/transcripts are reported in descending order of their "PPDE" values.

4 control 3 treatment，之间的fold change， 统计的是拟合的两条线之间的fold change