Skip to content

Instantly share code, notes, and snippets.

@davetang
Last active March 23, 2017 15:23
Show Gist options
  • Save davetang/6562474 to your computer and use it in GitHub Desktop.
Save davetang/6562474 to your computer and use it in GitHub Desktop.
From a data frame with chromosomal coordinates, obtain the sequence, and calculate the dinucleotide frequencies
#I want to fetch sequences from
#my_random_loci and my_refseq_tss
head(my_random_loci,2)
chr start end strand
1 chr18 59415403 59415407 +
2 chr22 8535632 8535636 -
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("BSgenome.Hsapiens.UCSC.hg19")
#load if necessary
library(BSgenome.Hsapiens.UCSC.hg19)
#obtain the sequences
my_random_loci_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19,
names=my_random_loci$chr,
start=my_random_loci$start,
end=my_random_loci$end,
strand=my_random_loci$strand)
head(my_random_loci_seq)
# A DNAStringSet instance of length 6
# width seq
#[1] 5 CTCAG
#[2] 5 NNNNN
#[3] 5 CCTCA
#[4] 5 AGAAA
#[5] 5 TAATT
#[6] 5 CCCTC
#I don't want entries with N's
#how many are there?
length(grep(pattern="N",x=my_random_loci_seq))
#[1] 5150
#remove them
my_random_loci_seq <- my_random_loci_seq[-grep(pattern="N",x=my_random_loci_seq)]
#do the same for my_refseq_tss
my_refseq_tss_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19,
names=my_refseq_tss$chromosome_name,
start=my_refseq_tss$transcript_start,
end=my_refseq_tss$transcript_end,
strand=my_refseq_tss$strand)
head(my_refseq_tss_seq)
# A DNAStringSet instance of length 6
# width seq
#[1] 5 TTGTT
#[2] 5 GGGAT
#[3] 5 GGCAC
#[4] 5 CGCTG
#[5] 5 CAGCA
#[6] 5 GTGGC
#calculate the dinucleotide frequency
my_random_loci_seq_di <- dinucleotideFrequency(my_random_loci_seq)
colSums(my_random_loci_seq_di)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC
#14142 7386 9968 11071 10667 7813 1401 10098 8640 6150 7471 7147 9097 8622
# TG TT
#10424 13611
my_refseq_tss_seq_di <- dinucleotideFrequency(my_refseq_tss_seq)
colSums(my_refseq_tss_seq_di)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC
# 5868 5941 10382 5275 10694 11895 9787 8117 8649 13169 13423 6469 3493 7238
# TG TT
# 8390 5546
#how about the number of combinations?
my_random_loci_seq_di_comb <- table(apply(my_random_loci_seq_di, 1, function(x) paste(x, collapse='')))
head(my_random_loci_seq_di_comb)
#
#0000000000000004 0000000000000013 0000000000000103 0000000000001003 0000000000010003
# 234 74 81 110 70
#0000000000010012
# 231
head(my_random_loci_seq_di_comb[order(my_random_loci_seq_di_comb, decreasing=T)])
#
#2001000000001000 0001000000001002 2010000010000000 0000000100000102 2100100000000000
# 395 393 371 341 287
#1001000000001001
# 275
length(unique(my_random_loci_seq_di_comb))
#[1] 158
#combinations for RefSeq
my_refseq_tss_seq_di_comb <- table(apply(my_refseq_tss_seq_di, 1, function(x) paste(x, collapse='')))
head(my_refseq_tss_seq_di_comb[order(my_refseq_tss_seq_di_comb, decreasing=T)])
#
#0000001001200000 0000021001000000 0010000010200000 0000011001100000 0010110001000000
# 627 400 397 363 338
#0001110000000010
# 334
length(unique(my_refseq_tss_seq_di_comb))
[1] 156
#not run
#par(mfrow=c(2,1))
#plot(density(my_refseq_tss_seq_di_comb))
#plot(density(my_random_loci_seq_di_comb))
#par(mfrow=c(1,1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment