Skip to content

Instantly share code, notes, and snippets.

@sirusb
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sirusb/a8e9d5c3f3d28556747d to your computer and use it in GitHub Desktop.
Save sirusb/a8e9d5c3f3d28556747d to your computer and use it in GitHub Desktop.
Example counting GC content
# نقوم بتحميل مواقع جزر السي بي جي لكامل الجينوم
dataURL <-"http://bios221.stanford.edu/data/model-based-cpg-islands-hg19.txt"
cpglocs=read.table(dataURL ,header=T)
# نختار فقظ الكروموزم رقم 8
cpglocs8=cpglocs[which(cpglocs[,1]=="chr8"),2:3]
# الجدول يحتوي على أماكن بداية ونهاية كل جزيرة
head(cpglocs8)
# start end
#56961 32437 33708
#56962 40354 40656
#56963 44536 46203
#56964 99457 100721
#56965 155954 156761
#56966 179033 179756
# نقوم باخذ بعض المناطق التي لا تنتمي إلى الجزر
nonilocs8=matrix(0,ncol=2,nrow=2856)
nonilocs8[1,1]=10000
nonilocs8[1,2]=cpglocs8[1,1]-1
nonilocs8[2:2856,1]=cpglocs8[1:2855,2]+1
nonilocs8[2:2855,2]=cpglocs8[2:2855,1]-1
nonilocs8[2856,2]=146304021
head(nonilocs8)
# [,1] [,2]
#[1,] 10000 32436
#[2,] 33709 40353
#[3,] 40657 44535
#[4,] 46204 99456
#[5,] 100722 155953
#[6,] 156762 179032
# نقوم يتحميل الجينوم البشري
library(BSgenome.Hsapiens.UCSC.hg19)
# نحولها إلى سلاسل حمض نووي
seqChr8Islands = DNAStringSet(Hsapiens$chr8, start=cpglocs8[,1], end=cpglocs8[,2])
seqChr8NonIslands = DNAStringSet(Hsapiens$chr8, start=nonilocs8[,1], end=nonilocs8[,2])
seqChr8Islands
# A DNAStringSet instance of length 2855
# width seq
# [1] 1272 CGGGTGCCATCTCAGCAGCTCACGGTGTGGAA...TTTGCGGAAGAACACGGCGGGGGGGGCGGCGC
# [2] 303 CGATTAGCGGGAGCGCAAAGCGAAGGGGCGGC...GCGCACGCGCGGTCGGCTCGGCGAGGAGCCGG
# [3] 1668 CGGCGCTGCAGGAGAGGAGATGCCCAGGCCAG...CTCGGTGTGAGTTCATGGGTGTGACGGGGCGT
# [4] 1265 CGGCTGGCCGGGCAGGGGGGCTGACCCCCCCC...TGAACTCCATCCTCCCGGCGGTCGGGCGGCGG
# [5] 808 CGGGAAAGATTTTATTCACCGTCGATGCGGCC...CTCTTGCTCGCAGTATACTGGCGGCACGCCGC
# ... ... ...
#[2851] 2077 CGGCAAGTAGCACACTCGACCAACTGCTGAAA...ATTATTTCAAGGTCGACGGCCAAGGAGACCGG
#[2852] 472 CGACAGGCGGGAACGCCACCAGCCTGTGGGCG...TAGCCTCTCTAGATGGCGGGAGGCCGGGGCGA
#[2853] 902 CGCCACCCTCGCCCTGCCCGCGCCTCGGCCAA...GCGACCCGGGCAGGTGAGGAGCACGGGCCCGG
#[2854] 125 CGCCTCCAGCTGCGCAAGGGCAGGTGCGGGCA...GAGGATGCCTCAGTGCGTCCGGACACCACCGA
#[2855] 1631 CGTCTCAGGAAACACCGCTTTCCACTCCTGTG...ATCCACGGGCACGTGTGATCTGGGGTCCGCGG
# نقوم بحساب نسبة السلسلة سي جي في كل منطقة
freqIslands = vcountPattern("CG", seqChr8Islands) / width(seqChr8Islands)
freqNonIslands = vcountPattern("CG", seqChr8NonIslands) / width(seqChr8NonIslands)
# نضع هذه النتائج مع بعض في جدول
freqCombined = rbind(data.frame(freq = freqIslands, type = "islands"),
data.frame(freq = freqNonIslands, type = "non-islands"))
# السطور الأولى من الجدول
head(freqCombined)
# freq type
#1 0.09119497 islands
#2 0.09570957 islands
#3 0.05755396 islands
#4 0.08537549 islands
#5 0.10519802 islands
#6 0.06906077 islands
# ثم نقوم برسم الجدول التكراري لتوزع هذه النسب
library(ggplot2)
ggplot(freqCombined) +
geom_histogram(aes(fill = type, x = freq), alpha = .7, position = "identity", binwidth = .005) +
labs(title = "Frequency of CG digrams") + theme_bw()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment