Created
November 29, 2012 01:10
-
-
Save radaniba/4166043 to your computer and use it in GitHub Desktop.
Analyze human Genome in R. Read more here http://www.biocodershub.net/community/analyzing-human-genome-r-scripts/ and here http://userwww.service.emory.edu/~hwu30/teaching/bioc/bios560R.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###### 1. explore sequence composition of human genome | |
library(BSgenome) | |
available.genomes() | |
library(BSgenome.Hsapiens.UCSC.hg18) | |
### get sequence for chromosome 1 | |
Seq=Hsapiens[["chr1"]] | |
Seq # shows some summaries | |
Seq=unmasked(Seq) ## remove the mask | |
## get GC content | |
bases=alphabetFrequency(Seq,baseOnly=TRUE) | |
bases[1:4] | |
ntotBases=sum(bases[1:4]) | |
baseFreq=bases[1:4]/ntotBases | |
GCcontent=baseFreq["C"]+baseFreq["G"] | |
## look at CG dinucleotide content | |
cg=matchPattern("CG", Seq) | |
ncg=length(cg) | |
## compute the observed to expected ratio | |
ncg/(baseFreq["C"]*baseFreq["G"]*ntotBases) ## this shows CG rarely stay together. | |
## compare to TG | |
tg=matchPattern("TG", Seq) | |
ntg=length(tg) | |
ntg/(baseFreq["T"]*baseFreq["G"]*ntotBases) ## this shows TG presented more than expected. | |
######### look at GC content and CG dinucleotide distribution in 1000 bp windows in whole genome. | |
## Note I'm only doing it in chr1 now. | |
ss=seq(1, length(Seq), by=1000) | |
ss=ss[-length(ss)] ## remove the last one | |
Seq.set=DNAStringSet(Seq, start=ss, end=ss+999) | |
ff=alphabetFrequency(Seq.set, baseOnly=TRUE) | |
pCG=(ff[,"C"]+ff[,"G"])/rowSums(ff) | |
hist(pCG[pCG>0],100) | |
## CG occurance | |
Seq.set=DNAStringSet(Seq, start=ss, end=ss+999) | |
nCG=vcountPattern("CG", Seq.set) | |
obsExp=nCG*1000/(ff[,"C"]*ff[,"G"]) | |
mean(obsExp,na.rm=TRUE) | |
hist(obsExp,100) | |
###### 2. Explore GC content at gene TSS (transcriptional starting site), | |
## and their overlaps with CpG island. | |
## first obtain genes from UCSC using GenomicFeatures functions | |
library(GenomicFeatures) | |
txdb=makeTranscriptDbFromUCSC(genom="hg18",tablename="refGene") ## this is slow, take a few minutes. | |
genes=as.data.frame(transcripts(txdb)) | |
colnames(genes)[1:3]=c("chrom","txStart","txEnd") | |
## Alternatively, you can read in genes from the file downloaded at last lab | |
genes=read.table("hg18genes.txt", comment="", header=TRUE, stringsAsFactors=FALSE) | |
## get transcriptional start site. | |
## Note that txEnd is the start site for genes on - strand | |
tss=genes$txStart | |
idx=genes$strand=="-" | |
tss[idx]=genes$txEnd[idx] | |
## create GRanges object for the TSS, remove random and hap chromosomes | |
idx=c(grep("random", genes$chrom),grep("hap", genes$chrom)) | |
## create ranges with TSS +/- 500 bp | |
TSS=GRanges(seqnames=Rle(genes$chrom[-idx]), ranges=IRanges(tss[-idx]-500,tss[-idx]+500)) | |
## get GC contents of TSS for genes on chromosome 1 | |
idx.chr1=seqnames(TSS)=="chr1" | |
Seq.set=DNAStringSet(Seq, start=start(TSS[idx.chr1]), end=end(TSS[idx.chr1])) | |
ff=alphabetFrequency(Seq.set, baseOnly=TRUE) | |
pCG.TSS=(ff[,"C"]+ff[,"G"])/rowSums(ff) | |
hist(pCG.TSS,100) | |
## compare with genome wide distribution of GC content | |
d1=density(pCG[pCG>0]) | |
d2=density(pCG.TSS) | |
plot(d1, lwd=2) | |
lines(d2, col="red",lwd=2) | |
legend("topright", legend=c("Genome", "TSS"), lwd=2, col=c("black", "red")) | |
## CG dinucleotide around TSS, compute obsExp | |
nCG=vcountPattern("CG", Seq.set) | |
obsExp.TSS=nCG*1000/(ff[,"C"]*ff[,"G"]) | |
hist(obsExp.TSS,100) | |
## compare with the genome wide one: | |
d1=density(obsExp[!is.na(obsExp)]) | |
d2=density(obsExp.TSS) | |
plot(d1, lwd=2, xlim=c(0,2), main="Observed to expected CG ratio") | |
lines(d2, col="red",lwd=2) | |
legend("topright", legend=c("Genome", "TSS"), lwd=2, col=c("black", "red")) | |
########## | |
### now read in CpG island file and compare it to TSS | |
cgi=read.table("hg18CGI.txt", comment="", header=TRUE, sep="\t", stringsAsFactors=FALSE) | |
colnames(cgi)[1]="chrom" ## fix column name | |
idx=c(grep("random", cgi$chrom),grep("hap", cgi$chrom)) | |
CGI=GRanges(seqnames=Rle(cgi$chrom[-idx]), ranges=IRanges(cgi$chromStart[-idx],cgi$chromEnd[-idx])) | |
## look at % of TSS overlapping CGI | |
overlaps=countOverlaps(TSS, CGI) | |
mean(overlaps>0) | |
## or do | |
mean(TSS %in% CGI) | |
##################################################################### | |
## Time allows, perform following to explore worm gene annotations. | |
## otherwise students can keep and study the scripts. | |
##################################################################### | |
library(GenomicFeatures) | |
txdb=makeTranscriptDbFromUCSC(genom="ce2",tablename="refGene") | |
genes=transcriptsBy(txdb) | |
length(genes) | |
## number of +/- genes | |
strands=as.character(unlist(strand(genes))) | |
table(strands) | |
## gene lengths | |
geneLength=unlist(end(genes))-unlist(start(genes)) | |
## distribution of gene lengths versus strand | |
boxplot(log(geneLength)~strands) | |
t.test(log(geneLength)~strands) | |
## number of genes on each chromosome | |
chrs=as.character(seqnames(genes)) | |
ngenes.chr=table(chrs) | |
ngenes.chr | |
## number of genes on each chromosome versus chromosome lengths | |
## pay attention to the way to plot | |
chrlen=seqlengths(genes) | |
## there's no genes on chrM, remove it in plotting | |
chrlen2=chrlen[names(ngenes.chr)]/1e6 | |
plot(chrlen2, as.numeric(ngenes.chr), type="n", xlab="Chromosome length (Mbp)", ylab="Number of genes", main="C. elegans") | |
text(chrlen2, ngenes.chr, names(ngenes.chr)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment