junction_annotation.py from RSeQC 4.0.0 installed via pip http://rseqc.sourceforge.net/
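For reference, a typical invocation of this script, following the options defined in main() below (the BAM, BED, and prefix names here are placeholders):
junction_annotation.py -i sample.bam -r hg38_RefSeq.bed -o sample_output -m 50 -q 30
Given sample_output.junction.xls produced by SAM.ParseBAM.annotate_junction, the script then derives sample_output.junction.bed and sample_output.junction.Interact.bed and runs Rscript on sample_output.junction_plot.r to render the PDF.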
#!/Users/ozakiharuka/miniforge3/bin/python3.9 | |
''' | |
Annotate splicing reads against the gene model at two levels: read level and junction level.
Note: | |
1) A read, especially a long read, can be spliced two or more times
2) Multiple splicing reads spanning the same intron can be consolidated into one splicing junction. | |
''' | |
#import built-in modules | |
import os,sys | |
if sys.version_info[0] != 3: | |
print("\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " This verion of RSeQC needs python3!\n", file=sys.stderr) | |
sys.exit() | |
import re | |
import string | |
from optparse import OptionParser | |
import warnings | |
import collections | |
import math | |
from time import strftime | |
import subprocess | |
#import my own modules | |
from qcmodule import SAM | |
#changes to the paths | |
#changing history to this module | |
__author__ = "Liguo Wang" | |
__copyright__ = "Copyleft" | |
__credits__ = [] | |
__license__ = "GPL" | |
__version__="4.0.0" | |
__maintainer__ = "Liguo Wang" | |
__email__ = "wang.liguo@mayo.edu" | |
__status__ = "Production" | |
def printlog (mesg): | |
'''print progress into stderr and log file''' | |
mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg | |
LOG=open('class.log','a') | |
print(mesg, file=sys.stderr) | |
print(mesg, file=LOG) | |
def generate_bed12(infile,size=1): | |
''' | |
infile: input file. eg: chrX 66766604 66788677 348 partial_novel | |
size: the block size representing exons | |
''' | |
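# Illustrative sketch (not part of the original source): with the default size=1, the example input
# line above ('chrX 66766604 66788677 348 partial_novel') becomes a two-block BED12 record:
# chrX  66766603  66788678  partial_novel  348  .  66766603  66788678  0,205,0  2  1,1  0,22074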
outfile = infile.replace('.xls','.bed') | |
OUT = open(outfile,'w') | |
for line in open(infile,'r'): | |
if line.startswith('chrom'):continue | |
line = line.strip() | |
f = line.split() | |
if len(f) != 5:continue | |
chrom = f[0] | |
start = int(f[1]) - size | |
end = int(f[2]) + size | |
score = int(f[3]) | |
strand = '.' | |
name = f[4] | |
thick_st = start | |
thick_end = end | |
if name == 'annotated': | |
color = '205,0,0' | |
elif name == 'partial_novel': | |
color = '0,205,0' | |
elif name == 'complete_novel': | |
color = '0,0,205' | |
else: | |
color = '0,0,0' | |
blockCount = 2 | |
blockSizes = ','.join((str(size),str(size))) | |
blockStarts = '0,' + str(end-size-start) | |
print('\t'.join( [str(i) for i in [chrom, start, end, name, score, strand, thick_st, thick_end,color, blockCount, blockSizes,blockStarts]]), file=OUT) | |
OUT.close() | |
def generate_interact(infile, bam_file, size=1): | |
''' | |
infile: input file. eg: chrX 66766604 66788677 348 partial_novel | |
size: the block size representing exons | |
''' | |
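# Sketch of the resulting track (not part of the original source): each junction from the input file
# becomes one row of a UCSC 'interact' track, with the donor block (chromStart..chromStart+size) as
# the source region, the acceptor block (chromEnd-size..chromEnd) as the target region, and '.' for
# both strands.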
outfile = infile.replace('.xls','.Interact.bed') | |
OUT = open(outfile,'w') | |
print ('track type=interact name="Splice junctions" description="Splice junctions detected from %s" maxHeightPixels=200:200:50 visibility=full' % bam_file, file=OUT) | |
for line in open(infile,'r'): | |
if line.startswith('chrom'):continue | |
line = line.strip() | |
f = line.split() | |
if len(f) != 5:continue | |
chrom = f[0] | |
chromStart = int(f[1]) - size | |
chromEnd = int(f[2]) + size | |
name1 = f[4] | |
if name1 == 'annotated': | |
color = '205,0,0' | |
elif name1 == 'partial_novel': | |
color = '0,205,0' | |
elif name1 == 'complete_novel': | |
color = '0,0,205' | |
else: | |
color = '0,0,0' | |
name = chrom + ":" + str(chromStart) + '-' + str(chromEnd) + "_" + name1 | |
score = int(f[3]) | |
value = float(score) | |
exp = 'RNAseq_junction' | |
sourceChrom = chrom | |
sourceStart = chromStart | |
sourceEnd = chromStart + size | |
sourceName = sourceChrom + ":" + str(sourceStart) + '-' + str(sourceEnd) | |
sourceStrand = '.' | |
targetChrom = chrom | |
targetStart = chromEnd - size | |
targetEnd = chromEnd | |
targetName = targetChrom + ":" + str(targetStart) + '-' + str(targetEnd) | |
targetStrand = '.' | |
print ('\t'.join([str(i) for i in [chrom, chromStart, chromEnd, name, score, value, exp, color, sourceChrom, sourceStart, sourceEnd, sourceName, sourceStrand, targetChrom, targetStart, targetEnd, targetName, targetStrand ]]), file=OUT) | |
OUT.close() | |
def main(): | |
usage="%prog [options]" + '\n' + __doc__ + "\n" | |
parser = OptionParser(usage,version="%prog " + __version__) | |
parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.") | |
parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format. This file is better to be a pooled gene model as it will be used to annotate splicing junctions [required]") | |
parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") | |
parser.add_option("-m","--min-intron",action="store",type="int",dest="min_intron",default=50, help="Minimum intron length (bp). default=%default [optional]") | |
parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default") | |
(options,args)=parser.parse_args() | |
if not (options.output_prefix and options.input_file and options.ref_gene_model): | |
parser.print_help() | |
sys.exit(0) | |
if not os.path.exists(options.ref_gene_model): | |
print('\n\n' + options.ref_gene_model + " does NOT exist" + '\n', file=sys.stderr)
sys.exit(0) | |
if os.path.exists(options.input_file): | |
obj = SAM.ParseBAM(options.input_file) | |
obj.annotate_junction(outfile=options.output_prefix,refgene=options.ref_gene_model,min_intron=options.min_intron, q_cut = options.map_qual) | |
try: | |
subprocess.call("Rscript " + options.output_prefix + '.junction_plot.r', shell=True) | |
except: | |
print("Cannot generate pdf file from " + '.junction_plot.r', file=sys.stderr) | |
pass | |
else: | |
print('\n\n' + options.input_file + " does NOT exist" + '\n', file=sys.stderr)
sys.exit(0) | |
try: | |
print ('Create BED file ...', file=sys.stderr) | |
generate_bed12(options.output_prefix + '.junction.xls') | |
except: | |
pass | |
try: | |
print ('Create Interact file ...', file=sys.stderr) | |
generate_interact(options.output_prefix + '.junction.xls', options.input_file) | |
except: | |
pass | |
if __name__ == '__main__': | |
main() |
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
'''manipulate BAM/SAM file.''' | |
#import built-in modules | |
import os,sys | |
import re | |
import string | |
from optparse import OptionParser | |
import warnings | |
import collections | |
import math | |
import random | |
#import third-party modules | |
from bx.bitset import * | |
from bx.bitset_builders import * | |
from bx.intervals import * | |
from bx.binned_array import BinnedArray | |
from bx_extras.fpconst import isNaN | |
from bx.bitset_utils import * | |
import pysam | |
from qcmodule import mystat | |
from qcmodule import fasta | |
from qcmodule import bam_cigar | |
from qcmodule import BED | |
#changes to the paths | |
#changing history to this module | |
#05/26/2011: support multiple spliced mapped reads
#10/13/2011: saturation test for RNA-seq data
__author__ = "Liguo Wang" | |
__copyright__ = "Copyleft" | |
__credits__ = [] | |
__license__ = "GPL" | |
__version__="3.0.0" | |
__maintainer__ = "Liguo Wang" | |
__email__ = "wang.liguo@mayo.edu" | |
__status__ = "Production" | |
class ParseSAM(object): | |
'''This class provides functions for parsing/processing/transforming SAM format files.
Format of SAM file see: http://samtools.sourceforge.net/SAM1.pdf''' | |
_reExpr1=re.compile(r'\s+') #>=1 spaces | |
_reExpr2=re.compile(r'^\s*$') #blank line | |
_splicedHit_pat = re.compile(r'(\d+)[M|N]',re.IGNORECASE) #regular expression for spliced mapped reads | |
_monoHit_pat = re.compile(r'^(\d+)M$',re.IGNORECASE) #regular expression for non-spliced mapped reads
_insertionHit_pat =re.compile(r'\d+I',re.IGNORECASE) | |
_deletionHit_pat =re.compile(r'\d+D',re.IGNORECASE) | |
_softClipHi_pat =re.compile(r'\d+S',re.IGNORECASE) | |
_hardClipHi_pat =re.compile(r'\d+H',re.IGNORECASE) | |
_padHit_pat =re.compile(r'\d+P',re.IGNORECASE) | |
_uniqueHit_pat = re.compile(r'[NI]H:i:1\b') | |
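# Example (echoing the CIGAR comments later in this file, shown here for orientation only):
# ParseSAM._splicedHit_pat.findall('9M4721N63M3157N8M') returns ['9', '4721', '63', '3157', '8'],
# i.e. alternating matched-block (M) and intron (N) lengths, which downstream code sums to derive
# block starts and sizes.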
def __init__(self,samFile): | |
'''constructor''' | |
if samFile == '-': | |
self.fileName = "STDIN" | |
self.f = sys.stdin | |
else: | |
self.fileName = os.path.basename(samFile) | |
self.f=open(samFile,'r') | |
def stat (self): | |
'''Calculate mapping statistics''' | |
total_read=0 | |
pcr_duplicate =0 | |
low_qual =0 | |
secondary_hit =0 | |
unmapped_read1=0 | |
mapped_read1=0 | |
reverse_read1=0 | |
forward_read1=0 | |
unmapped_read2=0 | |
mapped_read2=0 | |
reverse_read2=0 | |
forward_read2=0 | |
_numSplitHit =0 | |
_numMonoHit =0 | |
_numInsertion =0 | |
_numDeletion =0 | |
minus_minus=0 | |
minus_plus =0 | |
plus_minus=0 | |
plus_plus=0 | |
paired=True | |
unmap_SE=0 | |
map_SE=0 | |
reverse_SE=0 | |
forward_SE=0 | |
for line in self.f: | |
line=line.rstrip() | |
if line.startswith('@'):continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
total_read +=1 | |
field = line.split() | |
flagCode=string.atoi(field[1]) | |
if (flagCode & 0x0400 !=0): #PCR or optical duplicate | |
pcr_duplicate +=1 | |
continue | |
if (flagCode & 0x0200 !=0): #Low quality | |
low_qual +=1 | |
continue | |
if (flagCode & 0x0200 !=0): #Not primary alignment | |
secondary_hit +=1 | |
continue | |
if (len(ParseSAM._splicedHit_pat.findall(field[5]))>1):_numSplitHit +=1 #Splicing mapped reads | |
if (len(ParseSAM._splicedHit_pat.findall(field[5]))==1):_numMonoHit +=1 #mono mapped reads | |
if (ParseSAM._insertionHit_pat.search(field[5])):_numInsertion +=1 #insertion in reads | |
if (ParseSAM._deletionHit_pat.search(field[5])):_numDeletion +=1 #deletion in reads | |
if (flagCode & 0x0001 !=0): #This is paired end sequencing | |
if (flagCode & 0x0040 != 0): #1st read | |
if (flagCode & 0x0004 != 0):unmapped_read1 +=1 | |
if (flagCode & 0x0004 == 0):mapped_read1 +=1 | |
if (flagCode & 0x0010 != 0):reverse_read1 +=1 | |
if (flagCode & 0x0010 == 0):forward_read1 +=1 | |
if (flagCode & 0x0080 != 0): #2nd read | |
if (flagCode & 0x0004 != 0):unmapped_read2 +=1 | |
if (flagCode & 0x0004 == 0):mapped_read2 +=1 | |
if (flagCode & 0x0010 != 0):reverse_read2 +=1 | |
if (flagCode & 0x0010 == 0):forward_read2 +=1 | |
if (flagCode & 0x0010 != 0 and flagCode & 0x0020 != 0): | |
minus_minus +=1 | |
if (flagCode & 0x0010 != 0 and flagCode & 0x0020 == 0): | |
minus_plus +=1 | |
if (flagCode & 0x0010 == 0 and flagCode & 0x0020 != 0): | |
plus_minus +=1 | |
if (flagCode & 0x0010 == 0 and flagCode & 0x0020 == 0): | |
plus_plus +=1 | |
if (flagCode & 0x0001 ==0): #This is single end sequencing | |
paired=False | |
if (flagCode & 0x0004 != 0): | |
unmap_SE +=1 | |
if (flagCode & 0x0004 == 0): | |
map_SE +=1 | |
if (flagCode & 0x0010 != 0): | |
reverse_SE +=1 | |
if (flagCode & 0x0010 == 0): | |
forward_SE +=1 | |
if paired: | |
print("\n#==================================================", file=sys.stderr) | |
print("#================Report (pair-end)=================", file=sys.stderr) | |
print("%-25s%d" % ("Total Reads:",total_read), file=sys.stderr) | |
print("%-25s%d" % ("Total Mapped Reads:", (mapped_read1 + mapped_read2)), file=sys.stderr) | |
print("%-25s%d" % ("Total Unmapped Reads:",(unmapped_read1 + unmapped_read2)), file=sys.stderr) | |
print("%-25s%d" % ("PCR duplicate:",pcr_duplicate), file=sys.stderr) | |
print("%-25s%d" % ("QC-failed:",low_qual), file=sys.stderr) | |
print("%-25s%d" % ("Not primary mapping:",secondary_hit), file=sys.stderr) | |
print("\n", end=' ', file=sys.stderr) | |
print("%-25s%d" % ("Unmapped Read-1:",unmapped_read1), file=sys.stderr) | |
print("%-25s%d" % ("Mapped Read-1:",mapped_read1), file=sys.stderr) | |
print("%-25s%d" % (" Forward (+):",forward_read1), file=sys.stderr) | |
print("%-25s%d" % (" Reverse (-):",reverse_read1), file=sys.stderr) | |
print("\n", end=' ', file=sys.stderr) | |
print("%-25s%d" % ("Unmapped Read-2:",unmapped_read2), file=sys.stderr) | |
print("%-25s%d" % ("Mapped Read-2:",mapped_read2), file=sys.stderr) | |
print("%-25s%d" % (" Forward (+):",forward_read2), file=sys.stderr) | |
print("%-25s%d" % (" Reverse (-):",reverse_read2), file=sys.stderr) | |
print("\n", end=' ', file=sys.stderr) | |
print("%-25s%d" % ("Mapped to (+/-):",plus_minus), file=sys.stderr) | |
print("%-25s%d" % ("Mapped to (-/+):",minus_plus), file=sys.stderr) | |
print("%-25s%d" % ("Mapped to (+/+):",plus_plus), file=sys.stderr) | |
print("%-25s%d" % ("Mapped to (-/-):",minus_minus), file=sys.stderr) | |
print("\n", end=' ', file=sys.stderr) | |
print("%-25s%d" % ("Spliced Hits:",_numSplitHit), file=sys.stderr) | |
print("%-25s%d" % ("Non-spliced Hits:",_numMonoHit), file=sys.stderr) | |
print("%-25s%d" % ("Reads have insertion:",_numInsertion), file=sys.stderr) | |
print("%-25s%d" % ("Reads have deletion:",_numDeletion), file=sys.stderr) | |
else: | |
print("\n#====================================================", file=sys.stderr) | |
print("#================Report (single-end)=================", file=sys.stderr) | |
print("%-25s%d" % ("Total Reads:",total_read), file=sys.stderr) | |
print("%-25s%d" % ("Total Mapped Reads:", map_SE), file=sys.stderr) | |
print("%-25s%d" % ("Total Unmapped Reads:",unmap_SE), file=sys.stderr) | |
print("%-25s%d" % ("PCR duplicate:",pcr_duplicate), file=sys.stderr) | |
print("%-25s%d" % ("QC-failed:",low_qual), file=sys.stderr) | |
print("%-25s%d" % ("Not primary mapping:",secondary_hit), file=sys.stderr) | |
print("%-25s%d" % ("froward (+):",forward_SE), file=sys.stderr) | |
print("%-25s%d" % ("reverse (-):",reverse_SE), file=sys.stderr) | |
print("\n", end=' ', file=sys.stderr) | |
print("%-25s%d" % ("Spliced Hits:",_numSplitHit), file=sys.stderr) | |
print("%-25s%d" % ("Non-spliced Hits:",_numMonoHit), file=sys.stderr) | |
print("%-25s%d" % ("Reads have insertion:",_numInsertion), file=sys.stderr) | |
print("%-25s%d" % ("Reads have deletion:",_numDeletion), file=sys.stderr) | |
def samTobed(self,outfile=None,mergePE=False): | |
"""Convert SAM file to BED file. BED file will be saved as xxx.sam.bed unless otherwise specified. | |
If mergePE=False, each read will be one bed entry. If mergePE=True, pair-end reads (when both ends
map to the same chromosome) will be merged into a single bed entry."""
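# Conversion sketch (illustrative, not part of the original source): for a CIGAR of '9M4721N63M',
# comb = [9, 4721, 63]; the read becomes a 2-block BED entry whose blockSizes are '9,63' and whose
# blockStarts are '0,4730' relative to the 0-based chromStart, with chromEnd = chromStart + 9+4721+63.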
if outfile is None: | |
outfile=self.fileName + ".bed" | |
print("\tWriting bed entries to\"",outfile,"\"...", end=' ', file=sys.stderr) | |
FO=open(outfile,'w') | |
for line in self.f: | |
if line.startswith(('@','track')):continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank line | |
field=line.rstrip().split() | |
if (string.atoi(field[1]) & 0x0004)!=0: continue #skip unmapped line | |
if (string.atoi(field[1]) & 0x0040)!=0: | |
mate="/1" | |
else: | |
mate="/2" | |
flag = string.atoi(field[1]) | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(field[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
chrom = field[2] | |
chromStart = string.atoi(field[3])-1 | |
chromEnd=chromStart | |
for i in comb: | |
chromEnd += i | |
name = field[0] + mate | |
score = field[4] | |
if(flag & 0x0010)==0: | |
strand = '+' | |
else: | |
strand = '-' | |
thickStart = chromStart | |
thickEnd = chromEnd | |
itemRgb = "0,255,0" | |
blockCount = (len(comb) +1) /2 | |
blockSize = [] | |
for i in range(0,len(comb),2): | |
blockSize.append(str(comb[i])) | |
blockSizes = ','.join(blockSize) | |
blockStart=[] | |
for i in range(0,len(comb),2): | |
blockStart.append(str(sum(comb[:i]))) | |
blockStarts = ','.join(blockStart) | |
print(string.join((str(i) for i in [chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts]),sep="\t"), file=FO) | |
print("Done", file=sys.stderr) | |
FO.close() | |
self.f.seek(0) | |
if mergePE: | |
#create another bed file; pair-end reads will be merged into a single bed entry
print("Writing consolidated bed file ...", end=' ', file=sys.stderr)
bedfile = open(outfile,'r') | |
outfile_2 = outfile + ".consolidate.bed" | |
outfile_3 = outfile + '.filter' | |
FO = open(outfile_2,'w') | |
FOF = open(outfile_3,'w') | |
count={} | |
chr=collections.defaultdict(set) | |
txSt={} | |
txEnd={} | |
strand=collections.defaultdict(list) | |
blocks={} | |
sizes=collections.defaultdict(list) | |
starts=collections.defaultdict(list) | |
for line in bedfile: | |
field=line.strip().split() | |
field[3]=field[3].replace('/1','') | |
field[3]=field[3].replace('/2','') | |
if field[3] not in count: | |
count[field[3]] = 1 | |
chr[field[3]].add(field[0]) | |
txSt[field[3]] = string.atoi(field[1]) | |
txEnd[field[3]] = string.atoi(field[2]) | |
strand[field[3]].append(field[5]) | |
blocks[field[3]] = string.atoi(field[9]) | |
sizes[field[3]].extend( field[10].split(',') ) | |
starts[field[3]].extend([string.atoi(i) + string.atoi(field[1]) for i in field[11].split(',') ]) | |
else: | |
count[field[3]] += 1 | |
chr[field[3]].add(field[0]) | |
if string.atoi(field[1]) < txSt[field[3]]: | |
txSt[field[3]] = string.atoi(field[1]) | |
if string.atoi(field[2]) > txEnd[field[3]]: | |
txEnd[field[3]] = string.atoi(field[2]) | |
blocks[field[3]] += string.atoi(field[9]) | |
strand[field[3]].append(field[5]) | |
sizes[field[3]].extend( field[10].split(',') ) | |
starts[field[3]].extend([string.atoi(i) + string.atoi(field[1]) for i in field[11].split(',') ]) | |
#befile.seek(0) | |
for key in count: | |
st=[] #real sorted starts | |
sz=[] #real sorted sizes | |
if(count[key] ==1): #single-end read | |
if(blocks[key] ==1): #single end, single hit | |
st = [i - txSt[key] for i in starts[key]] | |
st = string.join([str(i) for i in st],',') | |
print(chr[key].pop(),"\t",txSt[key],"\t",txEnd[key],"\t",key,"\t","11\t",strand[key][0],"\t",txSt[key],"\t",txEnd[key],"\t","0,255,0\t",blocks[key],"\t",string.join(sizes[key],','),"\t",st, file=FO) | |
else: | |
st = [i - txSt[key] for i in starts[key]] #single end, spliced hit | |
st = string.join([str(i) for i in st],',') | |
print(chr[key].pop(),"\t",txSt[key],"\t",txEnd[key],"\t",key,"\t","12\t",strand[key][0],"\t",txSt[key],"\t",txEnd[key],"\t","0,255,0\t",blocks[key],"\t",string.join(sizes[key],','),"\t",st, file=FO) | |
elif(count[key]==2): #pair-end read | |
direction = string.join(strand[key],'/') | |
for i,j in sorted (zip(starts[key],sizes[key])): | |
st.append(i-txSt[key]) | |
sz.append(j) | |
#st=[string.atoi(i) for i in st] | |
if(len(chr[key])==1): #pair-end reads mapped to same chromosome | |
if blocks[key] ==2: #pair end, single hits | |
print(chr[key].pop(),"\t",txSt[key],"\t",txEnd[key],"\t",key + "|strand=" + direction + "|chrom=same","\t","21\t",'.',"\t",txSt[key],"\t",txEnd[key],"\t","0,255,0\t",blocks[key],"\t",string.join(sz,','),"\t",string.join([str(i) for i in st],','), file=FO) | |
elif blocks[key] >2: # | |
print(chr[key].pop(),"\t",txSt[key],"\t",txEnd[key],"\t",key + "|strand=" + direction + "|chrom=same","\t","22\t",'.',"\t",txSt[key],"\t",txEnd[key],"\t","0,255,0\t",blocks[key],"\t",string.join(sz,','),"\t",string.join([str(i) for i in st],','), file=FO) | |
else: | |
print(key,"\t","pair-end mapped, but two ends mapped to different chromosome", file=FOF) | |
elif(count[key] >2): #reads occur more than 2 times | |
print(key,"\t","occurs more than 2 times in sam file", file=FOF) | |
continue | |
FO.close() | |
FOF.close() | |
print("Done", file=sys.stderr) | |
def getUnmap(self, outfile=None,fastq=True): | |
'''Extract unmapped reads from SAM file and write to fastq [when fastq=True] or fasta [when fastq=False] file''' | |
if outfile is None: | |
if fastq: outfile = self.fileName + ".unmap.fq" | |
else: outfile = self.fileName + ".unmap.fa" | |
FO=open(outfile,'w') | |
unmapCount=0 | |
print("Writing unmapped reads to\"",outfile,"\"... ", end=' ', file=sys.stderr) | |
for line in self.f: | |
hits=[] | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip().split() | |
flagCode=string.atoi(field[1]) | |
seq=field[9] | |
qual=field[10] | |
if (flagCode & 0x0004) != 0: #read unmap | |
unmapCount +=1 | |
if (flagCode & 0x0001) != 0: #paried in sequencing | |
if (flagCode & 0x0040)!=0:seqID=field[0] + '/1' #first read | |
if (flagCode & 0x0080)!=0:seqID=field[0] + '/2' #second read | |
else: seqID=field[0] | |
if fastq: FO.write('@' + seqID + '\n' + seq +'\n' + '+' +'\n' + qual+'\n') | |
else: FO.write('>' + seqID + '\n' + seq +'\n') | |
print(str(unmapCount) + " reads saved!\n", file=sys.stderr) | |
FO.close() | |
self.f.seek(0) | |
def getProperPair(self,outfile=None): | |
'''Extract properly paired mapped reads.'''
if outfile is None: | |
outfile = self.fileName + ".PP.sam" | |
FO=open(outfile,'w') | |
PPcount=0 | |
print("Writing proper paired reads to\"",outfile,"\"... ", end=' ', file=sys.stderr) | |
for line in self.f: | |
hits=[] | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
if ((flagCode & 0x0001) != 0) and ((flagCode & 0x0002)!=0): | |
PPcount +=1 | |
FO.write(line) | |
FO.close() | |
print(str(PPcount) + " reads were saved!\n", end=' ', file=sys.stderr) | |
self.f.seek(0) | |
def samNVC(self,outfile=None): | |
'''for each read, calculate nucleotide frequency vs position''' | |
if outfile is None: | |
outfile1 = self.fileName + ".NVC.xls" | |
outfile2 = self.fileName +".NVC_plot.r" | |
else: | |
outfile1 = outfile + ".NVC.xls" | |
outfile2 = outfile +".NVC_plot.r" | |
FO=open(outfile1,'w') | |
RS=open(outfile2,'w') | |
PPcount=0 | |
transtab = string.maketrans("ACGTNX","TGCANX") | |
base_freq=collections.defaultdict(int) | |
a_count=[] | |
c_count=[] | |
g_count=[] | |
t_count=[] | |
print("reading sam file ... ", file=sys.stderr) | |
for line in self.f: | |
if line.startswith('@'):continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
if flagCode & 0x0010 ==0: #plus strand | |
RNA_read = field[9].upper() | |
else: | |
RNA_read = field[9].upper().translate(transtab)[::-1] | |
for i in range(len(RNA_read)): | |
key = str(i) + RNA_read[i] | |
base_freq[key] += 1 | |
print("generating data matrix ...", file=sys.stderr) | |
print("Position\tA\tC\tG\tT\tN\tX", file=FO) | |
for i in range(len(RNA_read)): | |
print(str(i) + '\t', end=' ', file=FO) | |
print(str(base_freq[str(i) + "A"]) + '\t', end=' ', file=FO) | |
a_count.append(str(base_freq[str(i) + "A"])) | |
print(str(base_freq[str(i) + "C"]) + '\t', end=' ', file=FO) | |
c_count.append(str(base_freq[str(i) + "C"])) | |
print(str(base_freq[str(i) + "G"]) + '\t', end=' ', file=FO) | |
g_count.append(str(base_freq[str(i) + "G"])) | |
print(str(base_freq[str(i) + "T"]) + '\t', end=' ', file=FO) | |
t_count.append(str(base_freq[str(i) + "T"])) | |
print(str(base_freq[str(i) + "N"]) + '\t', end=' ', file=FO) | |
print(str(base_freq[str(i) + "X"]) + '\t', file=FO) | |
FO.close() | |
#generating R scripts | |
print("generating R script ...", file=sys.stderr) | |
print("position=c(" + ','.join([str(i) for i in range(len(RNA_read))]) + ')', file=RS) | |
print("A_count=c(" + ','.join(a_count) + ')', file=RS) | |
print("C_count=c(" + ','.join(c_count) + ')', file=RS) | |
print("G_count=c(" + ','.join(g_count) + ')', file=RS) | |
print("T_count=c(" + ','.join(t_count) + ')', file=RS) | |
print("total= A_count + C_count + G_count + T_count", file=RS) | |
print("ym=max(A_count/total,C_count/total,G_count/total,T_count/total) + 0.05", file=RS) | |
print("yn=min(A_count/total,C_count/total,G_count/total,T_count/total)", file=RS) | |
print('pdf("NVC_plot.pdf")', file=RS) | |
print('plot(position,A_count/total,type="o",pch=20,ylim=c(yn,ym),col="dark green",xlab="Position of Read",ylab="Nucleotide Frequency")', file=RS) | |
print('lines(position,T_count/total,type="o",pch=20,col="red")', file=RS) | |
print('lines(position,G_count/total,type="o",pch=20,col="blue")', file=RS) | |
print('lines(position,C_count/total,type="o",pch=20,col="cyan")', file=RS) | |
print('legend('+ str(len(RNA_read)-10) + ',ym,legend=c("A","T","G","C"),col=c("dark green","red","blue","cyan"),lwd=2,pch=20,text.col=c("dark green","red","blue","cyan"))', file=RS) | |
print("dev.off()", file=RS) | |
RS.close() | |
#self.f.seek(0) | |
def samGC(self,outfile=None): | |
'''GC content distribution of reads''' | |
if outfile is None: | |
outfile1 = self.fileName + ".GC.xls" | |
outfile2 = self.fileName +".GC_plot.r" | |
else: | |
outfile1 = outfile + ".GC.xls" | |
outfile2 = outfile +".GC_plot.r" | |
FO=open(outfile1,'w') | |
RS=open(outfile2,'w') | |
gc_hist=collections.defaultdict(int) #key is GC percent, value is count of reads | |
print("reading sam file ... ", file=sys.stderr) | |
for line in self.f: | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
gc_percent = "%4.2f" % ((field[9].upper().count('C') + field[9].upper().count('G'))/(len(field[9])+0.0)*100) | |
#print gc_percent | |
gc_hist[gc_percent] += 1 | |
print("writing GC content ...", file=sys.stderr) | |
print("GC%\tread_count", file=FO) | |
for i in list(gc_hist.keys()): | |
print(i + '\t' + str(gc_hist[i]), file=FO) | |
print("writing R script ...", file=sys.stderr) | |
print("pdf('GC_content.pdf')", file=RS) | |
print('gc=rep(c(' + ','.join([i for i in list(gc_hist.keys())]) + '),' + 'times=c(' + ','.join([str(i) for i in list(gc_hist.values())]) + '))', file=RS) | |
print('hist(gc,probability=T,breaks=%d,xlab="GC content (%%)",ylab="Density of Reads",border="blue",main="")' % 100, file=RS) | |
#print >>RS, "lines(density(gc),col='red')" | |
print("dev.off()", file=RS) | |
#self.f.seek(0) | |
def samDupRate(self,outfile=None,up_bound=500): | |
'''Calculate read duplication rates'''
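# Outline (derived from the code below, for orientation only): duplicates are tallied two ways --
# seqDup keys on the read sequence itself, while posDup keys on the mapping signature
# 'chrom:start-end:blockSizes:blockStarts'; both are then summarized as occurrence -> unique-read counts.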
if outfile is None: | |
outfile1 = self.fileName + ".seq.DupRate.xls" | |
outfile2 = self.fileName + ".pos.DupRate.xls" | |
outfile3 = self.fileName +".DupRate_plot.r" | |
else: | |
outfile1 = outfile + ".seq.DupRate.xls" | |
outfile2 = outfile + ".pos.DupRate.xls" | |
outfile3 = outfile +".DupRate_plot.r" | |
SEQ=open(outfile1,'w') | |
POS=open(outfile2,'w') | |
RS=open(outfile3,'w') | |
seqDup=collections.defaultdict(int) | |
posDup=collections.defaultdict(int) | |
seqDup_count=collections.defaultdict(int) | |
posDup_count=collections.defaultdict(int) | |
print("reading sam file ... ", file=sys.stderr) | |
for line in self.f: | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
if (flagCode & 0x0004) == 1: | |
continue #skip unmapped reads | |
seqDup[field[9]] +=1 #key is read sequence | |
#calculate duplicate reads based on coordinates
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(field[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
chrom = field[2] | |
chromStart = string.atoi(field[3])-1 | |
chromEnd=chromStart + sum(map(int,comb)) | |
blockSize = [] | |
for i in range(0,len(comb),2): | |
blockSize.append(str(comb[i])) | |
blockSizes = ','.join(blockSize) | |
blockStart=[] | |
for i in range(0,len(comb),2): | |
blockStart.append(str(sum(comb[:i]))) | |
blockStarts = ','.join(blockStart) | |
coord = chrom + ":" + str(chromStart) + "-" + str(chromEnd) + ":" + blockSizes + ":" + blockStarts | |
posDup[coord] +=1 | |
print("report duplicte rate based on sequence ...", file=sys.stderr) | |
print("Occurrence\tUniqReadNumber", file=SEQ) | |
for i in list(seqDup.values()): #key is occurrence, value is unique read count (based on seq)
seqDup_count[i] +=1 | |
for k in sorted(seqDup_count.keys()): | |
print(str(k) +'\t'+ str(seqDup_count[k]), file=SEQ) | |
SEQ.close() | |
print("report duplicte rate based on mapping ...", file=sys.stderr) | |
print("Occurrence\tUniqReadNumber", file=POS) | |
for i in list(posDup.values()): #key is occurrence, value is unique read count (based on coord)
posDup_count[i] +=1 | |
for k in sorted(posDup_count.keys()): | |
print(str(k) +'\t'+ str(posDup_count[k]), file=POS) | |
POS.close() | |
print("generate R script ...", file=sys.stderr) | |
print("pdf('duplicateRead.pdf')", file=RS) | |
print("par(mar=c(5,4,4,5),las=0)", file=RS) | |
print("seq_occ=c(" + ','.join([str(i) for i in sorted(seqDup_count.keys()) ]) + ')', file=RS) | |
print("seq_uniqRead=c(" + ','.join([str(seqDup_count[i]) for i in sorted(seqDup_count.keys()) ]) + ')', file=RS) | |
print("pos_occ=c(" + ','.join([str(i) for i in sorted(posDup_count.keys()) ]) + ')', file=RS) | |
print("pos_uniqRead=c(" + ','.join([str(posDup_count[i]) for i in sorted(posDup_count.keys()) ]) + ')', file=RS) | |
print("plot(pos_occ,log10(pos_uniqRead),ylab='Number of Reads (log10)',xlab='Frequency',pch=4,cex=0.8,col='blue',xlim=c(1,%d),yaxt='n')" % up_bound, file=RS) | |
print("points(seq_occ,log10(seq_uniqRead),pch=20,cex=0.8,col='red')", file=RS) | |
print('ym=floor(max(log10(pos_uniqRead)))', file=RS) | |
print("legend(%d,ym,legend=c('Sequence-base','Mapping-base'),col=c('red','blue'),pch=c(4,20))" % max(up_bound-200,1), file=RS) | |
print('axis(side=2,at=0:ym,labels=0:ym)', file=RS) | |
print('axis(side=4,at=c(log10(pos_uniqRead[1]),log10(pos_uniqRead[2]),log10(pos_uniqRead[3]),log10(pos_uniqRead[4])), labels=c(round(pos_uniqRead[1]*100/sum(pos_uniqRead)),round(pos_uniqRead[2]*100/sum(pos_uniqRead)),round(pos_uniqRead[3]*100/sum(pos_uniqRead)),round(pos_uniqRead[4]*100/sum(pos_uniqRead))))', file=RS) | |
print('mtext(4, text = "Reads %", line = 2)', file=RS) | |
#self.f.seek(0) | |
def getUniqMapRead(self,outfile=None): | |
'''Extract uniquely mapped reads.''' | |
if outfile is None: | |
outfile = self.fileName + ".uniq.sam" | |
FO=open(outfile,'w') | |
Uniqcount=0 | |
print("Writing uniquely mapped reads to\"",outfile,"\"... ", end=' ', file=sys.stderr) | |
for line in self.f: | |
hits=[] | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
if (flagCode & 0x0004) == 1: | |
continue #skip unmapped reads | |
#else: | |
#print >>sys.stderr,line, | |
if (ParseSAM._uniqueHit_pat.search(line)): | |
print(line, end=' ', file=sys.stderr) | |
Uniqcount +=1 | |
FO.write(line) | |
FO.close() | |
print(str(Uniqcount) + " reads were saved!\n", end=' ', file=sys.stderr) | |
self.f.seek(0) | |
def getWrongStrand(self,outfile=None): | |
'''Extract pair-end reads mapped to the wrong strand combination, such as +/+ or -/-'''
if outfile is None: | |
outfile = self.fileName + ".wrongStrand.sam" | |
FO=open(outfile,'w') | |
wrongStrand=0 | |
print("Writing incorrectly stranded reads to\"",outfile,"\"... ", end=' ', file=sys.stderr) | |
for line in self.f: | |
hits=[] | |
if line.startswith('@'):continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
if (flagCode & 0x0004) != 0: | |
continue #skipped if read itself is unmapped | |
if (flagCode & 0x0008) !=0: | |
continue #skipped if mate is unmapped | |
if (flagCode & 0x0001) == 0: | |
continue #skip single-end sequencing
if ((flagCode & 0x0010) ==0) and ((flagCode & 0x0020)==0 ): | |
FO.write(line) | |
wrongStrand+=1 | |
if ((flagCode & 0x0010) !=0) and ((flagCode & 0x0020)!=0 ): | |
FO.write(line) | |
wrongStrand+=1 | |
FO.close() | |
print(str(wrongStrand) + " reads were saved!\n", end=' ', file=sys.stderr) | |
self.f.seek(0) | |
def filterSpliceRead(self,outfile=None,min_overhang=8,min_gap=50,max_gap=1000000): | |
'''Filter spliced mapped reads from a SAM file. min_overhang is used to determine the reliability of splice sites.
Spliced reads with overhang size < min_overhang will also be reported if the same splice sites have been supported by
at least one read with overhang size >= min_overhang (default 8). Reads spliced multiple times (belonging to the same splice junction)
will always be reported. min_overhang, min_gap and max_gap apply only to one-time spliced reads.'''
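# Two-pass outline (derived from the code below): the first pass over self.f records, per chromosome,
# every splice site whose adjacent matched block is at least min_overhang bases long and whose intron
# length lies within [min_gap, max_gap]; the second pass then writes multi-spliced reads
# unconditionally, and one-time spliced reads only when both of their splice sites were recorded.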
if outfile is None: | |
outfile = self.fileName + ".SR.sam" | |
#outfile2 = self.fileName + ".SR.filter.sam" | |
splice_sites=collections.defaultdict(set) | |
print("\tDetermine splice sites with proper overhang, intron size ... ", end=' ', file=sys.stderr) | |
for line in self.f: | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
if not (ParseSAM._uniqueHit_pat.search(line)): #skip non unique mapped read | |
continue | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
map_st = int(field[3]) | |
chrom = field[2] | |
if (flagCode & 0x0004) == 1:continue #skip unmapped reads | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(field[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
if (len(comb)==1): #skip non-spliced | |
#print line, | |
continue | |
if (len(comb)>3): #skip multiple spliced | |
continue | |
else: #one-time spliced | |
if (comb[1] < min_gap or comb[1] > max_gap): | |
continue | |
else: | |
if (comb[0] >= min_overhang): | |
splice_sites[chrom].add(map_st + comb[0]) | |
if (comb[2] >= min_overhang): | |
splice_sites[chrom].add(map_st + comb[0] + comb[1]) | |
self.f.seek(0) | |
print("Done", file=sys.stderr) | |
FO=open(outfile,'w') | |
#FO2=open(outfile2,'w') | |
print("\tExtracting splicing reads ... ", end=' ', file=sys.stderr) | |
total_SR =0 | |
extract_SR =0 | |
total_read =0 | |
for line in self.f: | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
map_st = int(field[3]) | |
chrom = field[2] | |
if (flagCode & 0x0004) == 1:continue #skip unmapped reads | |
total_read +=1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(field[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
if (len(comb)==1): #skip non-spliced | |
continue | |
total_SR +=1 | |
if (len(comb)>3): #multipel splice read. report directly | |
FO.write(line) | |
extract_SR +=1 | |
else: #one-time splice read | |
if (comb[1] < min_gap or comb[1] > max_gap): | |
continue | |
if (chrom in splice_sites) and ((map_st + comb[0]) in splice_sites[chrom]) and ((map_st + comb[0] + comb[1]) in splice_sites[chrom]): | |
FO.write(line) | |
#print line | |
extract_SR +=1 | |
else: | |
#FO2.write(line) | |
continue | |
print("Done", file=sys.stderr) | |
print("\tTotal mapped Read: " + str(total_read), file=sys.stderr) | |
print("\tTotal Splicing Read: " + str(total_SR), file=sys.stderr) | |
print("\\Usable Splicing Read: " + str(extract_SR), file=sys.stderr) | |
FO.close() | |
#FO2.close() | |
self.f.seek(0) | |
def getSpliceRead(self,outfile=None): | |
'''Extract spliced mapped reads from a SAM file'''
if outfile is None: | |
outfile = self.fileName + ".SR.sam" | |
FO=open(outfile,'w') | |
print("\tExtract splicing reads without any filter ...", end=' ', file=sys.stderr) | |
for line in self.f: | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
flagCode=string.atoi(field[1]) | |
if (flagCode & 0x0004) == 1:continue #skip unmapped reads | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(field[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
if (len(comb)>=3): | |
FO.write(line) | |
print("Done", file=sys.stderr) | |
self.f.seek(0) | |
FO.close() | |
def collapseSAM(self, outfile=None,collevel=10): | |
'''At most collevel [default=10] identical reads will be retained in the output SAM file.
The original SAM file must be sorted beforehand; if not, use a command such as "sort -k3,3 -k4,4n myfile.sam >myfile.sorted.sam".'''
if outfile is None: | |
outfile = self.fileName + ".collapsed.sam" | |
print("Writing collapsed SAM file to\"",outfile,"\"... ", file=sys.stderr) | |
FO=open(outfile,'w') | |
flag="" | |
for line in self.f: | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
if (string.atoi(field[1]) & 0x0004)!=0: continue #skip unmapped line | |
id=field[2] + field[3] + field[5] | |
if (id != flag): | |
FO.write(line) | |
flag=id | |
skipTrigger=0 | |
else: | |
skipTrigger+=1 | |
if skipTrigger < collevel: | |
FO.write(line) | |
else:continue | |
FO.close() | |
self.f.seek(0) | |
def qualSAM(self,read_len,outfile=None): | |
'''calculate phred quality score for each base in read (5->3)''' | |
if outfile is None: | |
outfile = self.fileName + ".qual.plot.r" | |
else: | |
outfile = outfile + ".qual.plot.r" | |
FO=open(outfile,'w') | |
print("\tcalculating quality score ... ", file=sys.stderr) | |
qual_min={} | |
qual_max={} | |
qual_sum={} | |
total_read=0 | |
for i in range(0,read_len): | |
qual_min.setdefault(i,1000) | |
qual_max.setdefault(i,-1) | |
qual_sum.setdefault(i,0.0) | |
for line in self.f: | |
if line[0] == '@':continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip('\n').split() | |
#if (string.atoi(field[1]) & 0x0004)!=0: continue #skip unmapped line | |
if (len(field[10]) != read_len): | |
continue | |
if (string.atoi(field[1]) & 0x0010)==0: #query map to + | |
qual_str=field[10] | |
else: | |
qual_str=field[10][::-1] | |
total_read +=1 | |
for i in range(0,read_len): | |
#print ord(qual_str[i])-33, | |
qual_sum[i] += ord(qual_str[i])-33 | |
if(qual_min[i] > (ord(qual_str[i])-33)): | |
qual_min[i] = ord(qual_str[i])-33 | |
if(qual_max[i] < (ord(qual_str[i])-33)): | |
qual_max[i] = ord(qual_str[i])-33 | |
#print '\n', | |
min_qualities = [str(qual_min[i]) for i in range(0,read_len)] | |
max_qualities =[str(qual_max[i]) for i in range(0,read_len)] | |
avg_qualities = [str(qual_sum[i]/total_read) for i in range(0,read_len)] | |
nt_pos = [str(i) for i in range(0,read_len)] | |
print("nt_pos=c(" + ','.join(nt_pos) + ')', file=FO) | |
print("max_qual=c(" + ','.join(max_qualities) + ')', file=FO) | |
print("min_qual=c(" + ','.join(min_qualities) + ')', file=FO) | |
print("avg_qual=c(" + ','.join(avg_qualities) + ')', file=FO) | |
print("pdf('phred_qual.pdf')", file=FO) | |
print("plot(nt_pos,avg_qual, xlab=\"Nucleotide Position (5'->3')\", ylab='Phred Quality',ylim=c(0,97),lwd=2,type='s')", file=FO) | |
print('lines(nt_pos,max_qual,type="s",lwd=2,col="red")', file=FO) | |
print('lines(nt_pos,min_qual,type="s",lwd=2,col="blue")', file=FO) | |
print('legend(0,100,legend=c("Max","Average","Min"),col=c("red","black","blue"),lwd=2)', file=FO) | |
print('dev.off()', file=FO) | |
#for i in range(0,read_len): | |
# print >>sys.stderr, str(i) + '\t' + str(qual_max[i]) + '\t' + str(qual_min[i]) + '\t' + str(qual_sum[i]/total_read) | |
#self.f.seek(0) | |
def samToBinnedArray(self): | |
"""Convert SAM file to BinnedArray.""" | |
lines=0 | |
for line in self.f: | |
if line.startswith('@'):continue #skip head lines | |
if ParseSAM._reExpr2.match(line):continue #skip blank lines | |
field=line.rstrip().split() | |
if (string.atoi(field[1]) & 0x0004)!=0: continue #skip unmapped line | |
txStart=string.atoi(field[3]) | |
#if (string.atoi(field[1]) & 0x0010 != 0): | |
# strand='-' | |
#else: | |
# strand='+' | |
lines +=1 | |
scores={} | |
chrom = field[2] | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(field[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
if not chrom in scores:scores[chrom] = BinnedArray() | |
for i in range(0,len(comb),2): | |
for pos in range(txStart + sum(comb[:i]), txStart + sum(comb[:i]) + comb[i]): | |
tmp = scores[chrom][pos] | |
if isNaN(tmp): | |
scores[chrom][pos] =1 | |
else: | |
scores[chrom][pos] +=1 | |
if lines % 10000 == 0: print("%i lines loaded \r" % lines, file=sys.stderr) | |
return scores | |
self.f.seek(0) | |
class QCSAM(object): | |
'''Perform basic quality control. Useful for RNA-seq experiments'''
def __init__(self,samFile): | |
'''constructor''' | |
if samFile == '-': | |
self.fileName = "STDIN" | |
self.f = sys.stdin | |
else: | |
self.fileName = os.path.basename(samFile) | |
self.f=open(samFile,'r') | |
def distribSAM(self,refbed,outfile=None): | |
'''Calculate the distribution of reads over genome features (exon reads, intron reads, intergenic
reads, spliced reads). A bed file representing the gene model (e.g. RefSeq) must be provided.
Two bed format files will be generated: outfile_exon.count.bed and outfile_intron.count.bed.
The 5th column is the number of reads falling into the region defined by the first 3 columns'''
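# Assignment outline (derived from the code below): each mapped, non-spliced read is reduced to its
# midpoint and stored in a per-chromosome bx-python Intersecter; spliced reads are counted separately
# as junction reads. Exon and intron intervals from the 12-column gene model are then queried against
# those midpoints, and whatever remains is reported as intergenic.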
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
if outfile is None: | |
exon_count = self.fileName + "_exon.count.bed" | |
intron_count = self.fileName + "_intron.count.bed" | |
rscript=self.fileName + ".piechart.r" | |
rpdf=self.fileName + ".piechart.pdf" | |
else: | |
exon_count = outfile + "_exon.count.bed" | |
intron_count = outfile + "_intron.count.bed" | |
rscript=outfile + ".piechart.r" | |
rpdf=outfile + ".piechart.pdf" | |
EXON_OUT = open(exon_count,'w') | |
INTRON_OUT =open(intron_count,'w') | |
R_OUT = open(rscript,'w') | |
ranges={} | |
intronReads=0 | |
exonReads=0 | |
intergenicReads=0 | |
totalReads=0 | |
splicedReads=0 | |
#read SAM | |
print("reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=string.atoi(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmap reads | |
totalReads +=1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
if( len(comb)>1): | |
splicedReads +=1 | |
continue | |
else: | |
chrom=fields[2].upper() | |
#st=int(fields[3])-1 | |
#end= st +len(fields[9]) | |
mid = int(fields[3]) + int(len(fields[9])/2) | |
if chrom not in ranges: | |
ranges[chrom] = Intersecter() | |
ranges[chrom].add_interval( Interval( mid, mid ) ) | |
self.f.seek(0) | |
print("Done", file=sys.stderr) | |
#read refbed file | |
print("Assign reads to "+ refbed + '...', end=' ', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith('#'):continue | |
if line.startswith('track'):continue | |
if line.startswith('browser'):continue | |
# Parse fields from the gene table
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5].replace(" ","_") | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
intron_starts = exon_ends[:-1] | |
intron_ends=exon_starts[1:] | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, end=' ', file=sys.stderr) | |
continue | |
# assign reads to intron | |
if(strand == '-'): | |
intronNum=len(intron_starts) | |
exonNum=len(exon_starts) | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
intronReads += hits | |
INTRON_OUT.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\n') | |
intronNum -= 1 | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
exonReads += hits | |
EXON_OUT.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\n') | |
exonNum -= 1 | |
elif(strand == '+'): | |
intronNum=1 | |
exonNum=1 | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
intronReads += hits | |
INTRON_OUT.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\n') | |
intronNum += 1 | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
exonReads += hits | |
EXON_OUT.write(chrom + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\n') | |
exonNum += 1 | |
intergenicReads=totalReads-exonReads-intronReads-splicedReads | |
print("Done." + '\n', file=sys.stderr) | |
print("Total reads:\t" + str(totalReads), file=sys.stderr) | |
print("Exonic reads:\t" + str(exonReads), file=sys.stderr) | |
print("Intronic reads:\t" + str(intronReads), file=sys.stderr) | |
print("Splicing reads:\t" + str(splicedReads), file=sys.stderr) | |
print("Intergenic reads:\t" + str(intergenicReads), file=sys.stderr) | |
print("writing R script ...", end=' ', file=sys.stderr) | |
totalReads=float(totalReads) | |
print("pdf('%s')" % rpdf, file=R_OUT) | |
print("dat=c(%d,%d,%d,%d)" % (exonReads,splicedReads,intronReads,intergenicReads), file=R_OUT) | |
print("lb=c('exon(%.2f)','junction(%.2f)','intron(%.2f)','intergenic(%.2f)')" % (exonReads/totalReads,splicedReads/totalReads,intronReads/totalReads,intergenicReads/totalReads), file=R_OUT) | |
print("pie(dat,labels=lb,col=rainbow(4),clockwise=TRUE,main='Total reads = %d')" % int(totalReads), file=R_OUT) | |
print("dev.off()", file=R_OUT) | |
print("Done.", file=sys.stderr) | |
def coverageGeneBody(self,refbed,outfile=None): | |
'''Calculate read coverage over the gene body, from 5' to 3'. Each gene will be equally divided
into 100 regions'''
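# Outline (derived from the code below): every alignment block of every mapped read is stored in a
# per-chromosome Intersecter; for each transcript of at least 100 exonic bases, mystat.percentile_list
# picks 101 evenly spaced exonic coordinates (reversed for '-' strand genes), and the read blocks
# overlapping each percentile position are summed across all genes to give the 5'->3' coverage profile.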
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
if outfile is None: | |
outfile1 = self.fileName + ".geneBodyCoverage_plot.r" | |
outfile2 = self.fileName + ".geneBodyCoverage.txt" | |
else: | |
outfile1 = outfile + ".geneBodyCoverage_plot.r" | |
outfile2 = outfile + ".geneBodyCoverage.txt" | |
OUT1 = open(outfile1,'w') | |
OUT2 = open(outfile2,'w') | |
ranges={} | |
totalReads=0 | |
fragment_num=0 #spliced reads will be counted twice
rpkm={} | |
#read SAM | |
print("reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=string.atoi(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmap reads | |
totalReads +=1 | |
chrom = fields[2].upper() | |
chromStart = string.atoi(fields[3])-1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
fragment_num += (len(comb) +1)/2 | |
blockStart=[] | |
blockSize=[] | |
for i in range(0,len(comb),2): | |
blockStart.append(chromStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
for st,size in zip(blockStart,blockSize): | |
if chrom not in ranges: | |
ranges[chrom] = Intersecter() | |
ranges[chrom].add_interval( Interval( st, st+size ) ) | |
print("Done", file=sys.stderr) | |
print("calculating coverage over gene body ...", file=sys.stderr) | |
coverage=collections.defaultdict(int) | |
flag=0 | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from the gene table
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5] | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, end=' ', file=sys.stderr) | |
continue | |
gene_all_base=[] | |
percentile_base=[] | |
mRNA_len =0 | |
flag=0 | |
for st,end in zip(exon_starts,exon_ends): | |
gene_all_base.extend(list(range(st+1,end+1))) #0-based coordinates on genome | |
mRNA_len = len(gene_all_base) | |
if mRNA_len <100: | |
flag=1 | |
break | |
if flag==1: continue | |
if strand == '-': | |
gene_all_base.sort(reverse=True) #deal with gene on minus stand | |
else: | |
gene_all_base.sort(reverse=False) | |
percentile_base = mystat.percentile_list (gene_all_base) #get 101 points from each gene's coordinates | |
for i in range(0,len(percentile_base)): | |
if chrom in ranges: | |
coverage[i] += len(ranges[chrom].find(percentile_base[i], percentile_base[i]+1)) | |
x_coord=[] | |
y_coord=[] | |
print("Total reads: " + str(totalReads), file=OUT2) | |
print("Fragment number: " + str(fragment_num), file=OUT2) | |
print("percentile\tcount", file=OUT2) | |
for i in coverage: | |
x_coord.append(str(i)) | |
y_coord.append(str(coverage[i])) | |
print(str(i) + '\t' + str(coverage[i]), file=OUT2) | |
print("pdf('geneBody_coverage.pdf')", file=OUT1) | |
print("x=0:100", file=OUT1) | |
print("y=c(" + ','.join(y_coord) + ')', file=OUT1) | |
print("plot(x,y,xlab=\"percentile of gene body (5'->3')\",ylab='read number',type='s')", file=OUT1) | |
print("dev.off()", file=OUT1) | |
def calculateRPKM(self,refbed,outfile=None): | |
'''Calculate RPKM values for each gene in refbed. Only uniquely aligned reads are used.
Spliced reads are split. Output the raw read count and eRPKM (eRPKM = exon Represented times Per Kb
exon per Million mapped reads) for each exon, intron and mRNA'''
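# Formula sketch (as implemented below, shown here for orientation only): for a feature of length L
# bases containing `hits` block midpoints of uniquely mapped reads,
#     eRPKM = hits * 1e9 / (L * cUR)
# where cUR is the splice-corrected count of uniquely mapped reads (each spliced read contributes one
# count per aligned block); for the mRNA row, L is the summed exon length and hits the total over its exons.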
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
if outfile is None: | |
rpkm_file = self.fileName + ".rpkm.xls" | |
else: | |
rpkm_file = outfile + ".rpkm.xls" | |
RPKM_OUT=open(rpkm_file,'w') | |
ranges={} | |
totalReads=0 | |
cUR=0 | |
sR=0 | |
multiMapReads=0 | |
rpkm={} | |
#read SAM | |
print("reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=string.atoi(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmap reads | |
totalReads +=1 | |
if not ParseSAM._uniqueHit_pat.search(line): #skip multiple mapped reads | |
multiMapReads +=1 | |
continue | |
chrom = fields[2].upper() | |
chromStart = string.atoi(fields[3])-1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
cUR += (len(comb) +1)/2 | |
if(len(comb)>1): | |
sR+=1 | |
blockStart=[] | |
blockSize=[] | |
for i in range(0,len(comb),2): | |
blockStart.append(chromStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
for st,size in zip(blockStart,blockSize): | |
mid = int(st) + (size/2) | |
if chrom not in ranges: | |
ranges[chrom] = Intersecter() | |
ranges[chrom].add_interval( Interval( mid, mid ) ) | |
self.f.seek(0) | |
print("Done", file=sys.stderr) | |
print("Total mapped reads (TR): " + str(totalReads), file=RPKM_OUT) | |
print("Multiple mapped reads (MR): " + str(multiMapReads), file=RPKM_OUT) | |
print("Uniquely mapped reads (UR): " + str(totalReads - multiMapReads), file=RPKM_OUT) | |
print("Spliced mapped reads (SR): " + str(sR), file=RPKM_OUT) | |
print("Corrected uniquely mapped reads (cUR): " + str(cUR), file=RPKM_OUT) | |
if totalReads ==0: | |
sys.exit(1) | |
#read refbed file | |
print("Assign reads to "+ refbed + '...', end=' ', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith('#'):continue | |
if line.startswith('track'):continue | |
if line.startswith('browser'):continue | |
# Parse fields from the gene table
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5].replace(" ","_") | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)) | |
exon_sizes = list(map(int,fields[10].rstrip(',\n').split(','))) | |
intron_starts = exon_ends[:-1] | |
intron_ends=exon_starts[1:] | |
key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, end=' ', file=sys.stderr) | |
continue | |
# assign reads to intron | |
mRNA_count=0 | |
mRNA_len=sum(exon_sizes) | |
if(strand == '-'): | |
intronNum=len(intron_starts) | |
exonNum=len(exon_starts) | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(cUR))) +'\n') | |
intronNum -= 1 | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(cUR))) +'\n') | |
exonNum -= 1 | |
mRNA_count += hits | |
try: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(mRNA_count) + "\t" + strand + '\t' + str(mRNA_count*1000000000.0/(mRNA_len*cUR)) +'\n') | |
rpkm[key] = mRNA_count*1000000000.0/(mRNA_len*cUR) | |
except: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(0) + "\t" + strand + '\t' + str(0) +'\n') | |
rpkm[key] = 0 | |
elif(strand == '+'): | |
intronNum=1 | |
exonNum=1 | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(cUR))) +'\n') | |
intronNum += 1 | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(cUR))) +'\n') | |
exonNum += 1 | |
mRNA_count += hits | |
try: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(mRNA_count) + "\t" + strand + '\t' + str(mRNA_count*1000000000.0/(mRNA_len*cUR)) +'\n') | |
rpkm[key] = mRNA_count*1000000000.0/(mRNA_len*cUR) | |
except: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(0) + "\t" + strand + '\t' + str(0) +'\n') | |
rpkm[key] = 0 | |
print("Done", file=sys.stderr) | |
return rpkm | |
self.f.seek(0) | |
def calculateRPKM2(self,refbed,outfile=None): | |
'''Calculate RPKM values for each gene in refbed. Only uniquely aligned reads are used.
Spliced reads are split. Output the raw read count and eRPKM (eRPKM = exon Represented times Per Kb
exon per Million mapped reads) for each exon, intron and mRNA
NOTE: intronic reads are not counted as part of total reads'''
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
if outfile is None: | |
rpkm_file = self.fileName + ".rpkm.xls" | |
else: | |
rpkm_file = outfile + ".rpkm.xls" | |
RPKM_OUT=open(rpkm_file,'w') | |
ranges={} | |
exon_ranges={} | |
totalReads=0 | |
#intronic=0 | |
cUR=0 | |
sR=0 | |
multiMapReads=0 | |
rpkm={} | |
#read gene model file, the purpose is to remove intronic reads | |
print("Reading reference gene model "+ refbed + '...', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from gene table | |
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5].replace(" ","_") | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, end=' ', file=sys.stderr) | |
continue | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom not in exon_ranges: | |
exon_ranges[chrom] = Intersecter() | |
exon_ranges[chrom].add_interval( Interval( st, end ) ) | |
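# Note on the BED12 parsing above: column 12 (blockStarts, fields[11]) is relative | |
# to tx_start and column 11 (blockSizes, fields[10]) gives exon lengths, so | |
#     exon_starts = blockStarts + tx_start ;  exon_ends = exon_starts + blockSizes | |
# Illustrative example with made-up coordinates: tx_start=100, blockSizes="10,20,", | |
# blockStarts="0,50," -> exons [100,110) and [150,170), with one intron [110,150). | |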
#read SAM | |
print("reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
totalReads +=1 | |
if not ParseSAM._uniqueHit_pat.search(line): #skip multiple mapped reads | |
multiMapReads +=1 | |
continue | |
chrom = fields[2].upper() | |
chromStart = int(fields[3])-1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
#cUR += (len(comb) +1)/2 | |
if(len(comb)>1): | |
sR+=1 | |
blockStart=[] | |
blockSize=[] | |
for i in range(0,len(comb),2): | |
blockStart.append(chromStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
#build bitset only for exonic reads | |
for st,size in zip(blockStart,blockSize): | |
if (chrom in exon_ranges) and (len(exon_ranges[chrom].find(st,st+size)) >0): | |
cUR += 1 | |
mid = int(st) + (size/2) | |
if chrom not in ranges: | |
ranges[chrom] = Intersecter() | |
ranges[chrom].add_interval( Interval( mid, mid ) ) | |
else: | |
continue | |
self.f.seek(0) | |
print("Done", file=sys.stderr) | |
print("Total mapped reads (TR): " + str(totalReads), file=RPKM_OUT) | |
print("Multiple mapped reads (MR): " + str(multiMapReads), file=RPKM_OUT) | |
print("Uniquely mapped reads (UR): " + str(totalReads - multiMapReads), file=RPKM_OUT) | |
print("Spliced mapped reads (SR): " + str(sR), file=RPKM_OUT) | |
print("Corrected uniquely mapped reads (cUR, non-intronic fragments): " + str(cUR), file=RPKM_OUT) | |
#print >>RPKM_OUT, "Intronic Fragments (IF): " + str(intronic) | |
if totalReads ==0: | |
sys.exit(1) | |
#read refbed file | |
print("Assign reads to "+ refbed + '...', end=' ', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith('#'):continue | |
if line.startswith('track'):continue | |
if line.startswith('browser'):continue | |
# Parse fields from gene table | |
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5].replace(" ","_") | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)) | |
exon_sizes = list(map(int,fields[10].rstrip(',\n').split(','))) | |
intron_starts = exon_ends[:-1] | |
intron_ends=exon_starts[1:] | |
key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, end=' ', file=sys.stderr) | |
continue | |
# assign reads to intron | |
mRNA_count=0 | |
mRNA_len=sum(exon_sizes) | |
if(strand == '-'): | |
intronNum=len(intron_starts) | |
exonNum=len(exon_starts) | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(cUR))) +'\n') | |
intronNum -= 1 | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(cUR))) +'\n') | |
exonNum -= 1 | |
mRNA_count += hits | |
try: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(mRNA_count) + "\t" + strand + '\t' + str(mRNA_count*1000000000.0/(mRNA_len*cUR)) +'\n') | |
rpkm[key] = mRNA_count*1000000000.0/(mRNA_len*cUR) | |
except: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(0) + "\t" + strand + '\t' + str(0) +'\n') | |
rpkm[key] = 0 | |
elif(strand == '+'): | |
intronNum=1 | |
exonNum=1 | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(cUR))) +'\n') | |
intronNum += 1 | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in ranges: | |
hits= len(ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(cUR))) +'\n') | |
exonNum += 1 | |
mRNA_count += hits | |
try: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(mRNA_count) + "\t" + strand + '\t' + str(mRNA_count*1000000000.0/(mRNA_len*cUR)) +'\n') | |
rpkm[key] = mRNA_count*1000000000.0/(mRNA_len*cUR) | |
except: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(0) + "\t" + strand + '\t' + str(0) +'\n') | |
rpkm[key] = 0 | |
print("Done", file=sys.stderr) | |
return rpkm | |
self.f.seek(0) | |
def filterKnownReads(self,refbed,outfile=None): | |
'''Compare SAM files with reference gene model; all reads mapped to the gene model will be filtered | |
out. The remaining unknown reads will be written to a new SAM file''' | |
totalReads=0 #total mapped reads | |
unknownReads=0 | |
ranges={} | |
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
if outfile is None: | |
out_file = self.fileName + ".unknownReads.SAM" | |
else: | |
out_file = outfile + ".unknownReads.SAM" | |
OUT=open(out_file,'w') | |
print("Reading reference gene model "+ refbed + '...', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from gene table | |
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5].replace(" ","_") | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, end=' ', file=sys.stderr) | |
continue | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom not in ranges: | |
ranges[chrom] = Intersecter() | |
ranges[chrom].add_interval( Interval( st, end ) ) | |
print("Processing SAM file "+ self.fileName + '...', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
if not ParseSAM._uniqueHit_pat.search(line): #skip multiple mapped reads | |
continue | |
blockStart=[] | |
blockSize=[] | |
totalReads +=1 | |
chrom = fields[2].upper() | |
chromStart = int(fields[3])-1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
for i in range(0,len(comb),2): | |
blockStart.append(chromStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
for st,size in zip(blockStart,blockSize): | |
if (chrom in ranges) and (len(ranges[chrom].find(st,st+size)) >0): #if we found this read is overlapped with known gene | |
break | |
else: | |
OUT.write(line) | |
unknownReads +=1 | |
OUT.close() | |
print("Total reads mapped to genome: " + str(totalReads), file=sys.stderr) | |
print("Total reads not overlapped with any exon: " + str(unknownReads), file=sys.stderr) | |
self.f.seek(0) | |
def genomicFragSize(self,outfile=None,low_bound=0,up_bound=1000,step=10): | |
'''estimate the genomic fragment size of mRNA experiment. fragment size = insert_size + 2 x read_length''' | |
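# Sketch of the fragment-size calculation below (illustrative, made-up numbers): | |
# for a non-spliced pair, frag_size = (mate_pos - pos + 1) + read_len; for a | |
# spliced pair the intron lengths (the 'N' operations in the CIGAR) are subtracted. | |
# E.g. pos=1001, mate_pos=1201, read_len=100 -> 301 bp; with a 150 bp intron -> 151 bp. | |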
if outfile is None: | |
out_file1 = self.fileName + ".fragSize.txt" | |
out_file2 = self.fileName + ".fragSize.Freq.txt" | |
out_file3 = self.fileName + ".fragSize_plot.r" | |
else: | |
out_file1 = outfile + ".fragSize.txt" | |
out_file2 = outfile + ".fragSize.Freq.txt" | |
out_file3 = outfile + ".fragSize_plot.r" | |
FO=open(out_file1,'w') | |
FQ=open(out_file2,'w') | |
RS=open(out_file3,'w') | |
chrom="chr100" #this is the fake chromosome | |
ranges={} | |
ranges[chrom]=Intersecter() | |
window_left_bound = list(range(low_bound,up_bound,step)) | |
frag_size=0 | |
pair_num=0.0 | |
ultra_low=0.0 | |
ultra_high=0.0 | |
size=[] | |
counts=[] | |
count=0 | |
print("Reading SAM file "+ self.fileName + ' ... ', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
#if fields[0] in pairRead_info: | |
# continue | |
flagCode=int(fields[1]) | |
if (flagCode & 0x0001) ==0: | |
print("NOT paired-end sequencing", file=sys.stderr) | |
sys.exit(1) | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
if not ParseSAM._uniqueHit_pat.search(line): #skip multiple mapped reads | |
continue | |
if (flagCode & 0x0008 !=0): #skip single-end mapped reads | |
continue | |
if (fields[7] =='0'): | |
continue | |
if (int(fields[3]) > int(fields[7])): #left < right | |
continue | |
pair_num +=1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
read_len = len(fields[9]) | |
if (len(comb)==1): # this read is NOT spliced | |
frag_size = (int(fields[7]) - int(fields[3]) +1) + read_len | |
elif (len(comb) >1): # this read is spliced | |
frag_size = (int(fields[7]) - int(fields[3]) +1) + read_len - sum(comb[1::2]) | |
FO.write(fields[0] + '\t' + str(frag_size) + '\n') | |
if frag_size <= low_bound: | |
ultra_low+=1 | |
continue | |
elif frag_size > up_bound: | |
ultra_high +=1 | |
continue | |
ranges[chrom].add_interval( Interval( frag_size-1, frag_size ) ) | |
print("Done", file=sys.stderr) | |
if pair_num==0: | |
print("Cannot find paired reads", file=sys.stderr) | |
sys.exit(0) | |
print("Total paired read " + str(pair_num), file=FQ) | |
print("<=" + str(low_bound) + "\t"+ str(ultra_low), file=FQ) | |
for st in window_left_bound: | |
size.append(str(st + step/2)) | |
count = str(len(ranges[chrom].find(st,st + step))) | |
counts.append(count) | |
print(str(st) + '\t' + str(st+step) +'\t' + count, file=FQ) | |
print(">" + str(up_bound) + "\t"+ str(ultra_high), file=FQ) | |
print("pdf('gFragSize.pdf')", file=RS) | |
print("par(mfrow=c(2,1),cex.main=0.8,cex.lab=0.8,cex.axis=0.8,mar=c(4,4,4,1))", file=RS) | |
print('pie(c(%d,%d,%d),col=rainbow(3),cex=0.5,radius=1,main="Total %d fragments",labels=c("fragSize <= %d\\n(%4.2f%%)","fragSize > %d\\n(%4.2f%%)","%d < fragSize <= %d\\n(%4.2f%%)"), density=rep(80,80,80),angle=c(90,140,170))' % (ultra_low, ultra_high, pair_num -ultra_low -ultra_high, pair_num, low_bound, ultra_low*100/pair_num, up_bound, ultra_high*100/pair_num, low_bound, up_bound, 100-ultra_low*100/pair_num - ultra_high*100/pair_num), file=RS) | |
print('fragsize=rep(c(' + ','.join(size) + '),' + 'times=c(' + ','.join(counts) + '))', file=RS) | |
print('frag_sd = round(sd(fragsize))', file=RS) | |
print('frag_mean = round(mean(fragsize))', file=RS) | |
print('hist(fragsize,probability=T,breaks=%d,xlab="Fragment size (bp)",main=paste(c("Mean=",frag_mean,";","SD=",frag_sd),collapse=""),border="blue")' % len(window_left_bound), file=RS) | |
print("lines(density(fragsize,bw=%d),col='red')" % (2*step), file=RS) | |
print("dev.off()", file=RS) | |
FO.close() | |
FQ.close() | |
RS.close() | |
#self.f.seek(0) | |
def saturation_RPKM(self,refbed,outfile=None,sample_start=5,sample_step=5,sample_end=100): | |
'''for each gene, check if its RPKM (expression level) has already been saturated or not''' | |
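# How the saturation check below works (sketch): the midpoint of every alignment | |
# block (spliced reads contribute one midpoint per block) is collected, shuffled, | |
# and accumulated in steps of sample_step percent; eRPKM is then recomputed for | |
# every gene at each step. With illustrative numbers, frag_total=1,000,000 and | |
# sample_step=5, the 10% column adds block_list[50000:100000] on top of the 5% subsample. | |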
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
if outfile is None: | |
rpkm_file = self.fileName + ".eRPKM.xls" | |
raw_file = self.fileName + ".rawCount.xls" | |
else: | |
rpkm_file = outfile + ".eRPKM.xls" | |
raw_file = outfile + ".rawCount.xls" | |
RPKM_OUT = open(rpkm_file,'w') | |
RAW_OUT = open(raw_file ,'w') | |
ranges={} | |
totalReads=0 | |
cUR_num = 0 #number of aligned blocks (a spliced read contributes one count per block) | |
block_list=[] #non-spliced read AS IS, splicing reads were counted multiple times | |
#read SAM | |
my_pat = re.compile(r'NH:i:(\d+)\b') | |
NH_tag=0 | |
print("Reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
totalReads +=1 | |
hitNum =[int(i) for i in my_pat.findall(line)] | |
if len(hitNum) ==0: | |
NH_tag=1 #cannot determine uniqueness without NH tag | |
elif len(hitNum) ==1: | |
if int(hitNum[0])>1: continue #skip multiple mapped reads | |
else: | |
print("More than 1 NH tag found within a single line. Incorrect SAM format!", file=sys.stderr) | |
sys.exit(1) | |
chrom = fields[2].upper() | |
chromStart = int(fields[3])-1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
cUR_num += (len(comb) +1)//2 #integer division; one count per aligned block of this read | |
blockStart=[] | |
blockSize=[] | |
for i in range(0,len(comb),2): | |
blockStart.append(chromStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
for st,size in zip(blockStart,blockSize): | |
mid = int(st) + (size/2) | |
block_list.append(chrom + ":" + str(mid)) | |
if NH_tag==1: | |
print("Warn: NO NH tag found. Cannot determine uniqueness of alignment. All alignments will be used", file=sys.stderr) | |
print("Done", file=sys.stderr) | |
print("shuffling alignments ...", end=' ', file=sys.stderr) | |
random.shuffle(block_list) | |
print("Done", file=sys.stderr) | |
ranges={} | |
sample_size=0 | |
frag_total = cUR_num | |
RPKM_table=collections.defaultdict(list) | |
rawCount_table=collections.defaultdict(list) | |
RPKM_head=['chr','start','end','name','score','strand'] | |
tmp=list(range(sample_start,sample_end,sample_step)) | |
tmp.append(100) | |
#=========================sampling uniquely mapped reads from population | |
for pertl in tmp: #[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95,100] | |
index_st = int(frag_total * (pertl-sample_step)/100.0) | |
index_end = int(frag_total * pertl/100.0) | |
if index_st < 0: index_st = 0 | |
sample_size += index_end -index_st | |
RPKM_head.append(str(pertl) + '%') | |
print("sampling " + str(pertl) +"% (" + str(sample_size) + ") fragments ...", end=' ', file=sys.stderr) | |
for i in range(index_st, index_end): | |
(chr,coord) = block_list[i].split(':') | |
if chr not in ranges: | |
ranges[chr] = Intersecter() | |
ranges[chr].add_interval( Interval( int(coord), int(coord)+1 ) ) | |
#========================= calculating RPKM based on sub-population | |
#print >>sys.stderr, "assign reads to "+ refbed + '...', | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from gene table | |
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5].replace(" ","_") | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)) | |
exon_sizes = list(map(int,fields[10].rstrip(',\n').split(','))) | |
key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, file=sys.stderr) | |
continue | |
mRNA_count=0 #we need to initialize it to 0 for each gene | |
mRNA_len=sum(exon_sizes) | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in ranges: | |
mRNA_count += len(ranges[chrom].find(st,end)) | |
if mRNA_len ==0: | |
print(geneName + " has 0 nucleotides. Exit!", file=sys.stderr) | |
sys.exit(1) | |
if sample_size == 0: | |
print("Too few reads to sample. Exit!", file=sys.stderr) | |
sys.exit(1) | |
mRNA_RPKM = (mRNA_count * 1000000000.0)/(mRNA_len * sample_size) | |
RPKM_table[key].append(str(mRNA_RPKM)) | |
rawCount_table[key].append(str(mRNA_count)) | |
print("Done", file=sys.stderr) | |
#self.f.seek(0) | |
print('\t'.join(RPKM_head), file=RPKM_OUT) | |
print('\t'.join(RPKM_head), file=RAW_OUT) | |
for key in RPKM_table: | |
print(key + '\t', end=' ', file=RPKM_OUT) | |
print('\t'.join(RPKM_table[key]), file=RPKM_OUT) | |
print(key + '\t', end=' ', file=RAW_OUT) | |
print('\t'.join(rawCount_table[key]), file=RAW_OUT) | |
def saturation_junction(self,refgene,outfile=None,sample_start=5,sample_step=5,sample_end=100,min_intron=50,recur=1): | |
'''check if an RNA-seq experiment is saturated in terms of detecting known splicing junctions''' | |
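# Sketch of the resampling below: every spliced alignment is reduced to a | |
# "chrom:intron_start-intron_end" string, the list is shuffled, and cumulative | |
# subsamples (sample_start..100%) are compared against the reference junctions; | |
# a known junction is counted as detected once it has been observed at least 'recur' times. | |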
if outfile is None: | |
out_file = self.fileName + ".junctionSaturation_plot.r" | |
else: | |
out_file = outfile + ".junctionSaturation_plot.r" | |
if refgene is None: | |
print("You must provide reference gene model in bed format.", file=sys.stderr) | |
sys.exit(1) | |
OUT = open(out_file,'w') | |
#reading reference gene | |
knownSpliceSites= set() | |
print("reading reference bed file: ",refgene, " ... ", end=' ', file=sys.stderr) | |
for line in open(refgene,'r'): | |
if line.startswith(('#','track','browser')):continue | |
fields = line.split() | |
if(len(fields)<12): | |
print("Invalid bed line (skipped):",line, end=' ', file=sys.stderr) | |
continue | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
if int(fields[9]) == 1: #skip single-exon entries (no introns) | |
continue | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
intron_start = exon_ends[:-1] | |
intron_end=exon_starts[1:] | |
for st,end in zip (intron_start, intron_end): | |
knownSpliceSites.add(chrom + ":" + str(st) + "-" + str(end)) | |
print("Done! Total "+str(len(knownSpliceSites)) + " known splicing sites", file=sys.stderr) | |
#read SAM file | |
samSpliceSites=[] | |
intron_start=[] | |
intron_end=[] | |
uniqSpliceSites=collections.defaultdict(int) | |
print("Reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
chrom = fields[2].upper() | |
chromStart = int(fields[3])-1 | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
if not ParseSAM._uniqueHit_pat.search(line): #skip multiple mapped reads | |
continue | |
if (len(ParseSAM._splicedHit_pat.findall(fields[5]))==1): #skip non-spliced reads | |
continue | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
blockStart=[] | |
blockSize=[] | |
blockEnd=[] | |
#if intron size < min_intron, skip. | |
flag=0 | |
for i in range(1,len(comb),2): | |
if comb[i] < min_intron: | |
flag=1 | |
break | |
if flag ==1: | |
continue | |
for i in range(0,len(comb),2): | |
blockStart.append(chromStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
for st,size in zip(blockStart,blockSize): | |
end = st + size | |
blockEnd.append(end) | |
intron_st = blockEnd[:-1] | |
intron_end = blockStart[1:] | |
for st,end in zip(intron_st, intron_end): | |
samSpliceSites.append(chrom + ":" + str(st) + "-" + str(end)) | |
#self.f.seek(0) | |
print("Done", file=sys.stderr) | |
print("shuffling alignments ...", end=' ', file=sys.stderr) | |
random.shuffle(samSpliceSites) | |
print("Done", file=sys.stderr) | |
#resampling | |
SR_num = len(samSpliceSites) | |
sample_size=0 | |
knownSpliceSites_num = 0 | |
known_junc=[] | |
all_junc=[] | |
#=========================sampling uniquely mapped reads from population | |
tmp=list(range(sample_start,sample_end,sample_step)) | |
tmp.append(100) | |
for pertl in tmp: #[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95,100] | |
knownSpliceSites_num = 0 | |
index_st = int(SR_num * ((pertl - sample_step)/100.0)) | |
index_end = int(SR_num * (pertl/100.0)) | |
if index_st < 0: index_st = 0 | |
sample_size += index_end -index_st | |
print("sampling " + str(pertl) +"% (" + str(sample_size) + ") unique splicing alignments ...", end=' ', file=sys.stderr) | |
for i in range(index_st, index_end): | |
uniqSpliceSites[samSpliceSites[i]] +=1 | |
all_junc.append(str(len(list(uniqSpliceSites.keys())))) | |
for sj in uniqSpliceSites: | |
if sj in knownSpliceSites and uniqSpliceSites[sj] >= recur: | |
knownSpliceSites_num +=1 | |
print(str(knownSpliceSites_num) + " known splicing junctions", file=sys.stderr) | |
known_junc.append(str(knownSpliceSites_num)) | |
#for j in uniq_SJ: | |
#print >>OUT, j + "\t" + str(uniq_SJ[j]) | |
print("pdf('junction_saturation.pdf')", file=OUT) | |
print("x=c(" + ','.join([str(i) for i in tmp]) + ')', file=OUT) | |
print("y=c(" + ','.join(known_junc) + ')', file=OUT) | |
print("z=c(" + ','.join(all_junc) + ')', file=OUT) | |
print("plot(x,z/1000,xlab='percent of total reads',ylab='Number of splicing junctions (x1000)',type='o',col='blue',ylim=c(%d,%d))" % (int(int(known_junc[0])/1000), int(int(all_junc[-1])/1000)), file=OUT) | |
print("points(x,y/1000,type='o',col='red')", file=OUT) | |
print('legend(5,%d, legend=c("All detected junction","Annotated junction"),col=c("blue","red"),lwd=1,pch=1)' % int(int(all_junc[-1])/1000), file=OUT) | |
print("dev.off()", file=OUT) | |
def annotate_junction(self,refgene,outfile=None,min_intron=50): | |
'''Annotate splicing junctions in SAM file. Note that a (long) read might have multiple splicing | |
events (splice multiple times), and the same splicing events can be consolidated into a single | |
junction''' | |
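# Classification rule applied below: a junction whose intron start AND end both | |
# match the reference model is "annotated"; if neither boundary matches it is | |
# "complete_novel"; if exactly one matches it is "partial_novel". | |
# Illustrative example: with a reference intron chr1:1000-2000, an observed | |
# junction chr1:1000-2000 is annotated, chr1:1000-2500 is partial_novel, and | |
# chr1:1200-1800 is complete_novel. | |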
if outfile is None: | |
out_file = self.fileName + ".junction.xls" | |
out_file2 = self.fileName + ".junction_plot.r" | |
else: | |
out_file = outfile + ".junction.xls" | |
out_file2 = outfile + ".junction_plot.r" | |
if refgene is None: | |
print("You must provide reference gene model in bed format.", file=sys.stderr) | |
sys.exit(1) | |
OUT = open(out_file,'w') | |
ROUT = open(out_file2,'w') | |
#reading reference gene model | |
refIntronStarts=collections.defaultdict(dict) | |
refIntronEnds=collections.defaultdict(dict) | |
total_junc =0 | |
novel35_junc =0 | |
novel3or5_junc =0 | |
known_junc =0 | |
splicing_events=collections.defaultdict(int) | |
print("\treading reference bed file: ",refgene, " ... ", end=' ', file=sys.stderr) | |
for line in open(refgene,'r'): | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from gene table | |
fields = line.split() | |
if(len(fields)<12): | |
print("Invalid bed line (skipped):",line, end=' ', file=sys.stderr) | |
continue | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
if int(fields[9]) == 1: #skip single-exon entries (no introns) | |
continue | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
intron_start = exon_ends[:-1] | |
intron_end=exon_starts[1:] | |
for i_st,i_end in zip (intron_start, intron_end): | |
refIntronStarts[chrom][i_st] =i_st | |
refIntronEnds[chrom][i_end] =i_end | |
print("Done", file=sys.stderr) | |
#reading input SAM file | |
print("\tProcessing "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
chrom = fields[2].upper() | |
chromStart = int(fields[3])-1 | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
if not ParseSAM._uniqueHit_pat.search(line): #skip multiple mapped reads | |
continue | |
if (len(ParseSAM._splicedHit_pat.findall(fields[5]))==1): #skip non-spliced reads | |
continue | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
blockStart=[] | |
blockSize=[] | |
blockEnd=[] | |
#if intron size < min_intron, skip. | |
flag=0 | |
for i in range(1,len(comb),2): | |
if comb[i] < min_intron: | |
flag=1 | |
break | |
if flag ==1: | |
continue | |
total_junc += (len(comb) -1)//2 #integer division; number of splicing events in this read | |
for i in range(0,len(comb),2): | |
blockStart.append(chromStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
for st,size in zip(blockStart,blockSize): | |
end = st + size | |
blockEnd.append(end) | |
intron_st = blockEnd[:-1] | |
intron_end = blockStart[1:] | |
for i_st,i_end in zip(intron_st, intron_end): | |
splicing_events[chrom + ":" + str(i_st) + ":" + str(i_end)] += 1 | |
if (i_st in refIntronStarts[chrom] and i_end in refIntronEnds[chrom]): | |
known_junc +=1 #known both | |
elif (i_st not in refIntronStarts[chrom] and i_end not in refIntronEnds[chrom]): | |
novel35_junc +=1 | |
else: | |
novel3or5_junc +=1 | |
#self.f.seek(0) | |
print("Done", file=sys.stderr) | |
print('pdf("splicing_events_pie.pdf")', file=ROUT) | |
print("events=c(" + ','.join([str(i*100.0/total_junc) for i in (novel3or5_junc,novel35_junc,known_junc)])+ ')', file=ROUT) | |
print('pie(events,col=c(2,3,4),init.angle=30,angle=c(60,120,150),density=c(70,70,70),main="splicing events",labels=c("partial_novel %d%%","complete_novel %d%%","known %d%%"))' % (round(novel3or5_junc*100.0/total_junc),round(novel35_junc*100.0/total_junc),round(known_junc*100.0/total_junc)), file=ROUT) | |
print("dev.off()", file=ROUT) | |
print("\n===================================================================", file=sys.stderr) | |
print("Total splicing Events:\t" + str(total_junc), file=sys.stderr) | |
print("Known Splicing Events:\t" + str(known_junc), file=sys.stderr) | |
print("Partial Novel Splicing Events:\t" + str(novel3or5_junc), file=sys.stderr) | |
print("Novel Splicing Events:\t" + str(novel35_junc), file=sys.stderr) | |
#reset variables | |
total_junc =0 | |
novel35_junc =0 | |
novel3or5_junc =0 | |
known_junc =0 | |
print("chrom\tintron_st(0-based)\tintron_end(1-based)\tread_count\tannotation", file=OUT) | |
for i in splicing_events: | |
total_junc += 1 | |
(chrom, i_st, i_end) = i.split(":") | |
print('\t'.join([chrom.replace("CHR","chr"),i_st,i_end]) + '\t' + str(splicing_events[i]) + '\t', end=' ', file=OUT) | |
i_st = int(i_st) | |
i_end = int(i_end) | |
if (i_st in refIntronStarts[chrom] and i_end in refIntronEnds[chrom]): | |
print("annotated", file=OUT) | |
known_junc +=1 | |
elif (i_st not in refIntronStarts[chrom] and i_end not in refIntronEnds[chrom]): | |
print('complete_novel', file=OUT) | |
novel35_junc +=1 | |
else: | |
print('partial_novel', file=OUT) | |
novel3or5_junc +=1 | |
print("\nTotal splicing Junctions:\t" + str(total_junc), file=sys.stderr) | |
print("Known Splicing Junctions:\t" + str(known_junc), file=sys.stderr) | |
print("Partial Novel Splicing Junctions:\t" + str(novel3or5_junc), file=sys.stderr) | |
print("Novel Splicing Junctions:\t" + str(novel35_junc), file=sys.stderr) | |
print("\n===================================================================", file=sys.stderr) | |
print('pdf("splicing_junction_pie.pdf")', file=ROUT) | |
print("junction=c(" + ','.join([str(i*100.0/total_junc) for i in (novel3or5_junc,novel35_junc,known_junc,)])+ ')', file=ROUT) | |
print('pie(junction,col=c(2,3,4),init.angle=30,angle=c(60,120,150),density=c(70,70,70),main="splicing junctions",labels=c("partial_novel %d%%","complete_novel %d%%","known %d%%"))' % (round(novel3or5_junc*100.0/total_junc),round(novel35_junc*100.0/total_junc),round(known_junc*100.0/total_junc)), file=ROUT) | |
print("dev.off()", file=ROUT) | |
#print >>ROUT, "mat=matrix(c(events,junction),byrow=T,ncol=3)" | |
#print >>ROUT, 'barplot(mat,beside=T,ylim=c(0,100),names=c("known","partial\nnovel","complete\nnovel"),legend.text=c("splicing events","splicing junction"),ylab="Percent")' | |
def mRNA_RPKM(self,refbed,outfile=None): | |
'''calculate mRNA's RPKM value''' | |
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
if outfile is None: | |
rpkm_file = self.fileName + ".RPKM.xls" | |
else: | |
rpkm_file = outfile + ".RPKM.xls" | |
RPKM_OUT = open(rpkm_file,'w') | |
ranges={} | |
totalReads=0 | |
cUR_num = 0 #number of aligned blocks (a spliced read contributes one count per block) | |
RPKM_table={} | |
rawCount_table={} | |
mRNAlen_table={} | |
RPKM_head=['chr','start','end','name','score','strand','length','rawCount','RPKM'] | |
#read SAM | |
print("Reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
totalReads +=1 | |
if not ParseSAM._uniqueHit_pat.search(line): #skip multiple mapped reads | |
continue | |
chrom = fields[2].upper() | |
chromStart = int(fields[3])-1 | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
cUR_num += (len(comb) +1)//2 #integer division; one count per aligned block of this read | |
blockStart=[] | |
blockSize=[] | |
for i in range(0,len(comb),2): | |
blockStart.append(chromStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
for st,size in zip(blockStart,blockSize): | |
mid = int(st) + (size/2) | |
if chrom not in ranges: | |
ranges[chrom] = Intersecter() | |
ranges[chrom].add_interval( Interval( mid, mid ) ) | |
print("Done", file=sys.stderr) | |
print("Calculating RPKM ...", end=' ', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from gene table | |
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5].replace(" ","_") | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)) | |
exon_sizes = list(map(int,fields[10].rstrip(',\n').split(','))) | |
key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, file=sys.stderr) | |
continue | |
mRNA_count=0 #we need to initialize it to 0 for each gene | |
mRNA_len=sum(exon_sizes) | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in ranges: | |
mRNA_count += len(ranges[chrom].find(st,end)) | |
mRNA_RPKM = (mRNA_count * 1000000000.0)/(mRNA_len * cUR_num) | |
mRNAlen_table[key] = mRNA_len | |
RPKM_table[key] = str(mRNA_RPKM) | |
rawCount_table[key] = str(mRNA_count) | |
print("Done", file=sys.stderr) | |
print('\t'.join(RPKM_head), file=RPKM_OUT) | |
for k in RPKM_table: | |
print(k + '\t', end=' ', file=RPKM_OUT) | |
print(str(mRNAlen_table[k]) + '\t', end=' ', file=RPKM_OUT) | |
print(str(rawCount_table[k]) + '\t', end=' ', file=RPKM_OUT) | |
print(str(RPKM_table[k]) + '\t', file=RPKM_OUT) | |
return RPKM_table | |
self.f.seek(0) | |
def strand_specificity(self,refbed,genome,outfile=None,sp="GTAG,GCAG,ATAC"): | |
'''Measure the strand specificity of a strand-specific RNA-seq protocol. For non-spliced reads, | |
use the parental gene as the standard; for spliced reads, use the splicing motif as the standard''' | |
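# Strand inference for spliced reads (sketch): the 2 bp at each end of every | |
# intron are concatenated and compared to the splice motifs in 'sp'. A motif | |
# such as GT..AG read on the forward strand gives "GTAG" (-> '+'), while the | |
# same intron on the reverse strand yields the reverse complement "CTAC" (-> '-'). | |
# Unrecognized 4-mers are reported as 'unknown motif'. | |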
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
if genome is None: | |
print("You must specify genome sequence in fasta format\n", file=sys.stderr) | |
exit(0) | |
if outfile is None: | |
strand_file = self.fileName + ".strand.infor" | |
else: | |
strand_file = outfile + ".strand.infor" | |
OUT = open(strand_file,'w') | |
print("read_type\tread_id\tread_seq\tchr\tStart\tCigar\tprotocol_strand\tgene_strand", file=OUT) | |
transtab = str.maketrans("ACGTNX","TGCANX") | |
motif=sp.upper().split(',') | |
motif_rev = [m.translate(transtab)[::-1] for m in motif] | |
#load genome | |
print("\tloading "+genome+'...', file=sys.stderr) | |
tmp=fasta.Fasta(genome) | |
#load reference gene model | |
gene_ranges={} | |
print("reading reference gene model ...", end=' ', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from gene table | |
fields = line.split() | |
chrom = fields[0] | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5] | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, file=sys.stderr) | |
continue | |
if chrom not in gene_ranges: | |
gene_ranges[chrom]=Intersecter() | |
gene_ranges[chrom].insert(tx_start,tx_end,strand) | |
print("Done", file=sys.stderr) | |
#read SAM | |
read_type="unknown" | |
strand_from_protocol = 'unknown' | |
strand_from_gene='unknown' | |
strand_stat=collections.defaultdict(int) | |
print("Reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
if not ParseSAM._uniqueHit_pat.search(line):continue #skip multiple mapped reads | |
if (flagCode & 0x0100 !=0): continue #skip non primary hit | |
if (flagCode & 0x0200 !=0): continue #skip QC-failed | |
if (flagCode & 0x0400 !=0): continue #skip PCR artifact | |
if (flagCode & 0x0010 !=0): strand_from_protocol = '-' | |
if (flagCode & 0x0010 ==0): strand_from_protocol = '+' | |
if (flagCode & 0x0040 !=0): read_type="read_1" | |
if (flagCode & 0x0080 !=0): read_type="read_2" | |
chrom = fields[2] | |
comb=[int(i) for i in ParseSAM._splicedHit_pat.findall(fields[5])] #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8'] | |
readStart = int(fields[3])-1 | |
#for non spliced read | |
if len(comb)==1: | |
readEnd = readStart + len(fields[9]) | |
if chrom in gene_ranges: | |
if len(set(gene_ranges[chrom].find(readStart,readEnd)))>1: | |
strand_from_gene="overlap" | |
elif len(set(gene_ranges[chrom].find(readStart,readEnd)))==1: | |
strand_from_gene = set(gene_ranges[chrom].find(readStart,readEnd)).pop() | |
else: | |
strand_from_gene="intergenic" | |
print(read_type + '\t' + fields[0] + '\t' + fields[9] + '\t' + fields[2] + '\t' + fields[3] + '\t' + fields[5] +'\t', end=' ', file=OUT) | |
print(strand_from_protocol + '\t' + strand_from_gene, file=OUT) | |
strand_stat[read_type + '\t' + strand_from_protocol +'\t' + strand_from_gene] +=1 | |
#for spliced read | |
if len(comb)>=3: | |
splice_strand=[] | |
blockStart=[] | |
blockSize=[] | |
blockEnd =[] | |
for i in range(0,len(comb),2): | |
blockStart.append(readStart + sum(comb[:i]) ) | |
for i in range(0,len(comb),2): | |
blockSize.append(comb[i]) | |
blockEnd=list(map((lambda x,y:x+y),blockStart,blockSize)) | |
intron_start=blockEnd[:-1] | |
intron_end=blockStart[1:] | |
for st,end in zip(intron_start,intron_end): | |
try: | |
splice_motif = str(tmp.fetchSeq(chrom, st, st+2)) + str(tmp.fetchSeq(chrom, end-2,end)) | |
except: | |
print(line) | |
if splice_motif in motif: | |
splice_strand.append('+') | |
elif splice_motif in motif_rev: | |
splice_strand.append('-') | |
else: | |
splice_strand.append('unknown motif') | |
if len(set(splice_strand))>1: | |
strand_from_splice = 'unknown motif' | |
else: | |
strand_from_splice = set(splice_strand).pop() | |
print(read_type + '\t' + fields[0] + '\t' + fields[9] + '\t' + fields[2] + '\t' + fields[3] + '\t' + fields[5] +'\t', end=' ', file=OUT) | |
print(strand_from_protocol + '\t' + strand_from_splice, file=OUT) | |
strand_stat[read_type + '\t' + strand_from_protocol +'\t' + strand_from_splice] +=1 | |
print("Done", file=sys.stderr) | |
print("read_type\tstrand_expected\tstrand_observed\tcount") | |
for i in sorted(strand_stat): | |
print(str(i) +'\t' + str(strand_stat[i])) | |
def clipping_profile(self,outfile=None): | |
'''calculate profile of soft clipping''' | |
if outfile is None: | |
out_file1 = self.fileName + ".clipping_profile.xls" | |
out_file2 = self.fileName + ".clipping_profile.r" | |
else: | |
out_file1 = outfile + ".clipping_profile.xls" | |
out_file2 = outfile + ".clipping_profile.r" | |
OUT=open(out_file1,'w') | |
ROUT=open(out_file2,'w') | |
print("Position\tRead_Total\tRead_clipped", file=OUT) | |
soft_p = re.compile(r'(.*?)(\d+)S') | |
read_part = re.compile(r'(\d+)[MIS=X]') | |
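# Illustrative example of the two regexes above: for CIGAR "5S65M3S", | |
# soft_p.findall() returns [('', '5'), ('65M', '3')]; the first clip covers read | |
# positions 0-4 and, after skipping the 65 M bases, the second covers 70-72, | |
# so soft_clip_profile is incremented at those positions. | |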
total_read =0 | |
skip_part_of_read =0 | |
soft_clip_profile=collections.defaultdict(int) | |
read_pos=[] | |
clip_count=[] | |
print("Reading "+ self.fileName + '...', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
if not ParseSAM._uniqueHit_pat.search(line):continue #skip multiple mapped reads | |
if (flagCode & 0x0100 !=0): continue #skip non primary hit | |
if (flagCode & 0x0200 !=0): continue #skip QC-failed | |
if (flagCode & 0x0400 !=0): continue #skip PCR artifact | |
total_read +=1 | |
m = soft_p.findall(fields[5]) | |
skip_part_of_read =0 | |
if len(m)==0: #NO soft clip | |
continue | |
else: | |
for j in m: | |
skip_part_of_read += sum([int(i) for i in read_part.findall(j[0])]) | |
for n in range(skip_part_of_read,(skip_part_of_read + int(j[1]))): | |
soft_clip_profile[n]+=1 | |
skip_part_of_read += int(j[1]) | |
for i in soft_clip_profile: | |
read_pos.append(str(i)) | |
clip_count.append(str(soft_clip_profile[i])) | |
print(str(i) + '\t' + str(total_read) + '\t' + str(soft_clip_profile[i]), file=OUT) | |
print("pdf(\"%s\")" % outfile + '.clipping_profile.pdf', file=ROUT) | |
print("read_pos=c(" + ','.join(read_pos) + ')', file=ROUT) | |
print("count=c(" + ','.join(clip_count) + ')', file=ROUT) | |
print('plot(read_pos,1-(count/%d),col="blue",main="clipping profile",xlab="Position of reads",ylab="Mappability",type="b")' % total_read, file=ROUT) | |
print("dev.off()", file=ROUT) | |
def insertion_profile(self,read_len,outfile=None): | |
'''calculate profile of insertion (insertion means insertion to the reference)''' | |
if outfile is None: | |
out_file1 = self.fileName + ".insertion_profile.xls" | |
out_file2 = self.fileName + ".insertion_profile.r" | |
else: | |
out_file1 = outfile + ".insertion_profile.xls" | |
out_file2 = outfile + ".insertion_profile.r" | |
OUT=open(out_file1,'w') | |
ROUT=open(out_file2,'w') | |
print("Position\tRead_Total\tRead_clipped", file=OUT) | |
soft_p = re.compile(r'(.*?)(\d+)I') | |
read_part = re.compile(r'(\d+)[MIS=X]') | |
total_read =0 | |
skip_part_of_read =0 | |
soft_clip_profile=collections.defaultdict(int) | |
print("Reading "+ self.fileName + '...', end=' ', file=sys.stderr) | |
for line in self.f: | |
if line.startswith("@"):continue | |
fields=line.rstrip('\n ').split() | |
flagCode=int(fields[1]) | |
if (flagCode & 0x0004) != 0: continue #skip unmapped reads | |
if not ParseSAM._uniqueHit_pat.search(line):continue #skip multiple mapped reads | |
if (flagCode & 0x0100 !=0): continue #skip non primary hit | |
if (flagCode & 0x0200 !=0): continue #skip QC-failed | |
if (flagCode & 0x0400 !=0): continue #skip PCR artifact | |
total_read +=1 | |
m = soft_p.findall(fields[5]) | |
skip_part_of_read =0 | |
if len(m)==0: #NO insertion | |
continue | |
else: | |
for j in m: | |
skip_part_of_read += sum([int(i) for i in read_part.findall(j[0])]) | |
for n in range(skip_part_of_read,(skip_part_of_read + int(j[1]))): | |
soft_clip_profile[n]+=1 | |
skip_part_of_read += int(j[1]) | |
for i in range(0,read_len): | |
print(str(i) + '\t' + str(total_read) + '\t' + str(soft_clip_profile[i]), file=OUT) | |
class ParseBAM(object): | |
'''This class provides functions for parsing/processing/transforming SAM or BAM files. The input | |
file can be in either SAM or BAM format''' | |
multi_hit_tags=['H0','H1','H2','IH','NH'] | |
def __init__(self,inputFile): | |
'''constructor. input could be bam or sam''' | |
try: | |
self.samfile = pysam.Samfile(inputFile,'rb') | |
if len(self.samfile.header) ==0: | |
print("BAM/SAM file has no header section. Exit!", file=sys.stderr) | |
sys.exit(1) | |
self.bam_format = True | |
except: | |
self.samfile = pysam.Samfile(inputFile,'r') | |
if len(self.samfile.header) ==0: | |
print("BAM/SAM file has no header section. Exit!", file=sys.stderr) | |
sys.exit(1) | |
self.bam_format = False | |
def stat (self,q_cut=30): | |
'''Calculate mapping statistics''' | |
R_total=0 | |
R_qc_fail=0 | |
R_duplicate=0 | |
R_nonprimary=0 | |
R_unmap =0 | |
R_multipleHit=0 | |
R_uniqHit=0 #all the following counts should sum to uniqHit | |
R_read1=0 | |
R_read2=0 | |
R_reverse =0 | |
R_forward=0 | |
R_nonSplice=0 | |
R_splice=0 | |
R_properPair =0 | |
R_pair_diff_chrom = 0 | |
R_mitochondria = 0 | |
if self.bam_format: | |
print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else: | |
print("Load SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
flag=0 | |
aligned_read = next(self.samfile) | |
R_total +=1 | |
if aligned_read.is_qcfail: #skip QC fail read | |
R_qc_fail +=1 | |
continue | |
if aligned_read.is_duplicate: #skip duplicate read | |
R_duplicate +=1 | |
continue | |
if aligned_read.is_secondary: #skip non primary hit | |
R_nonprimary +=1 | |
continue | |
if aligned_read.is_unmapped: #skip unmap read | |
R_unmap +=1 | |
continue | |
if aligned_read.mapq < q_cut: | |
R_multipleHit +=1 | |
continue #skip multiple map read | |
if aligned_read.mapq >= q_cut: | |
R_uniqHit +=1 | |
if aligned_read.is_read1: | |
R_read1 +=1 | |
if aligned_read.is_read2: | |
R_read2 +=1 | |
if aligned_read.is_reverse: | |
R_reverse +=1 | |
else: | |
R_forward +=1 | |
introns = bam_cigar.fetch_intron('chr1', aligned_read.pos, aligned_read.cigar) #chrom name is irrelevant here; only the number of introns is used | |
if len(introns)==0: | |
R_nonSplice +=1 | |
if len(introns)>=1: | |
R_splice +=1 | |
if aligned_read.is_proper_pair: | |
R_properPair +=1 | |
R_read1_ref = self.samfile.getrname(aligned_read.tid) | |
R_read2_ref = self.samfile.getrname(aligned_read.rnext) | |
if R_read1_ref != R_read2_ref: | |
#print aligned_read.qname | |
R_pair_diff_chrom +=1 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
#self.samfile.seek(current_pos) | |
print("\n#==================================================", file=sys.stdout) | |
print("#All numbers are READ count", file=sys.stdout) | |
print("#==================================================\n", file=sys.stdout) | |
print("%-40s%d" % ("Total records:",R_total), file=sys.stdout) | |
print("\n", end='', file=sys.stdout) | |
print("%-40s%d" % ("QC failed:",R_qc_fail), file=sys.stdout) | |
print("%-40s%d" % ("Optical/PCR duplicate:", R_duplicate), file=sys.stdout) | |
print("%-40s%d" % ("Non primary hits", R_nonprimary), file=sys.stdout) | |
print("%-40s%d" % ("Unmapped reads:",R_unmap), file=sys.stdout) | |
print("%-40s%d" % ("mapq < mapq_cut (non-unique):",R_multipleHit), file=sys.stdout) | |
print("\n", end='', file=sys.stdout) | |
print("%-40s%d" % ("mapq >= mapq_cut (unique):",R_uniqHit), file=sys.stdout) | |
print("%-40s%d" % ("Read-1:",R_read1), file=sys.stdout) | |
print("%-40s%d" % ("Read-2:",R_read2), file=sys.stdout) | |
print("%-40s%d" % ("Reads map to '+':",R_forward), file=sys.stdout) | |
print("%-40s%d" % ("Reads map to '-':",R_reverse), file=sys.stdout) | |
print("%-40s%d" % ("Non-splice reads:",R_nonSplice), file=sys.stdout) | |
print("%-40s%d" % ("Splice reads:",R_splice), file=sys.stdout) | |
print("%-40s%d" % ("Reads mapped in proper pairs:",R_properPair), file=sys.stdout) | |
print("%-40s%d" % ("Proper-paired reads map to different chrom:",R_pair_diff_chrom), file=sys.stdout) | |
def configure_experiment(self,refbed,sample_size, q_cut = 30): | |
'''Given a BAM/SAM file, this function will try to guess the RNA-seq experiment: | |
1) single-end or paired-end | |
2) strand-specific or not | |
3) if it is strand-specific, what is the strandedness of the protocol | |
''' | |
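# Key format used below (sketch): for paired-end data the key is | |
# read_number + map_strand + gene_strand, e.g. '1++' means read 1 mapped to '+' | |
# over a gene annotated on '+'. spec1 sums the keys consistent with | |
# "1++,1--,2+-,2-+" and spec2 those consistent with "1+-,1-+,2++,2--"; when | |
# neither fraction dominates, the library is effectively non-strand-specific. | |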
#how many reads you want to sample | |
count =0 | |
p_strandness=collections.defaultdict(int) | |
s_strandness=collections.defaultdict(int) | |
#load reference gene model | |
gene_ranges={} | |
print("Reading reference gene model " + refbed + ' ...', end=' ', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from gene table | |
fields = line.split() | |
chrom = fields[0] | |
txStart = int( fields[1] ) | |
txEnd = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5] | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, file=sys.stderr) | |
continue | |
if chrom not in gene_ranges: | |
gene_ranges[chrom]=Intersecter() | |
gene_ranges[chrom].insert(txStart,txEnd,strand) | |
print("Done", file=sys.stderr) | |
#read SAM/BAM file | |
#current_pos = self.samfile.tell() | |
print("Loading SAM/BAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
if count >= sample_size: | |
break | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail: #skip low quality | |
continue | |
if aligned_read.is_duplicate: #skip duplicate read | |
continue | |
if aligned_read.is_secondary: #skip non primary hit | |
continue | |
if aligned_read.is_unmapped: #skip unmap read | |
continue | |
if aligned_read.mapq < q_cut: | |
continue | |
chrom = self.samfile.getrname(aligned_read.tid) | |
if aligned_read.is_paired: | |
if aligned_read.is_read1: | |
read_id = '1' | |
if aligned_read.is_read2: | |
read_id = '2' | |
if aligned_read.is_reverse: | |
map_strand = '-' | |
else: | |
map_strand = '+' | |
readStart = aligned_read.pos | |
readEnd = readStart + aligned_read.qlen | |
if chrom in gene_ranges: | |
tmp = set(gene_ranges[chrom].find(readStart,readEnd)) | |
if len(tmp) == 0: continue | |
strand_from_gene = ':'.join(tmp) | |
p_strandness[read_id + map_strand + strand_from_gene]+=1 | |
count += 1 | |
else: | |
if aligned_read.is_reverse: | |
map_strand = '-' | |
else: | |
map_strand = '+' | |
readStart = aligned_read.pos | |
readEnd = readStart + aligned_read.qlen | |
if chrom in gene_ranges: | |
tmp = set(gene_ranges[chrom].find(readStart,readEnd)) | |
if len(tmp) == 0: continue | |
strand_from_gene = ':'.join(tmp) | |
s_strandness[map_strand + strand_from_gene]+=1 | |
count += 1 | |
except StopIteration: | |
print("Finished", file=sys.stderr) | |
#self.samfile.seek(current_pos) | |
print("Total " + str(count) + " usable reads were sampled", file=sys.stderr) | |
protocol="unknown" | |
strandness=None | |
spec1=0.0 | |
spec2=0.0 | |
other=0.0 | |
if len(p_strandness) >0 and len(s_strandness) ==0 : | |
protocol="PairEnd" | |
#for k,v in p_strandness.items(): | |
# print >>sys.stderr, k + '\t' + str(v) | |
spec1= (p_strandness['1++'] + p_strandness['1--'] + p_strandness['2+-'] + p_strandness['2-+'])/float(sum(p_strandness.values())) | |
spec2= (p_strandness['1+-'] + p_strandness['1-+'] + p_strandness['2++'] + p_strandness['2--'])/float(sum(p_strandness.values())) | |
other = 1-spec1-spec2 | |
elif len(s_strandness) >0 and len(p_strandness) ==0 : | |
protocol="SingleEnd" | |
#for k,v in s_strandness.items(): | |
# print >>sys.stderr, k + '\t' + str(v) | |
spec1 = (s_strandness['++'] + s_strandness['--'])/float(sum(s_strandness.values())) | |
spec2 = (s_strandness['+-'] + s_strandness['-+'])/float(sum(s_strandness.values())) | |
other = 1-spec1-spec2 | |
else: | |
protocol="Mixture" | |
spec1 = 0 | |
spec2 = 0 | |
other = 0 | |
return [protocol,spec1,spec2,other] | |
def bamTowig(self,outfile,chrom_sizes, chrom_file,skip_multi=True,strand_rule=None,WigSumFactor=None,q_cut=30): | |
"""Convert BAM/SAM file to wig file. chrom_size is dict with chrom as key and chrom_size as value | |
strandRule should be determined from \"infer_experiment\". such as \"1++,1--,2+-,2-+\". When | |
WigSumFactor is provided, output wig file will be normalized to this number """ | |
#strand_rule={'1+':'-','1-':'+','2+':'+','2-':'-'} | |
strandRule={} | |
if strand_rule is None: # Not strand-specific | |
pass | |
elif len(strand_rule.split(',')) ==4: #PairEnd, strand-specific | |
for i in strand_rule.split(','):strandRule[i[0]+i[1]]=i[2] | |
elif len(strand_rule.split(',')) ==2: #singleEnd, strand-specific | |
for i in strand_rule.split(','):strandRule[i[0]]=i[1] | |
else: | |
print("Unknown value of option :'strand_rule' " + strand_rule, file=sys.stderr) | |
sys.exit(1) | |
if len(strandRule) == 0: | |
FWO = open(outfile + '.wig','w') | |
else: | |
FWO = open(outfile + '.Forward.wig','w') | |
RVO = open(outfile + '.Reverse.wig','w') | |
read_id='' | |
for chr_name, chr_size in list(chrom_sizes.items()): #iterate each chrom | |
try: | |
self.samfile.fetch(chr_name,0,chr_size) | |
except: | |
print("No alignments for " + chr_name + '. skipped', file=sys.stderr) | |
continue | |
print("Processing " + chr_name + " ...", file=sys.stderr) | |
if len(strandRule) == 0: FWO.write('variableStep chrom='+chr_name+'\n') | |
else: | |
FWO.write('variableStep chrom='+chr_name+'\n') | |
RVO.write('variableStep chrom='+chr_name+'\n') | |
Fwig = collections.defaultdict(int) | |
Rwig = collections.defaultdict(int) | |
alignedReads = self.samfile.fetch(chr_name,0,chr_size) | |
for aligned_read in alignedReads: | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.is_duplicate:continue #skip duplicate read | |
if aligned_read.is_secondary:continue #skip non primary hit | |
if aligned_read.is_unmapped:continue #skip unmap read | |
if skip_multi: | |
if aligned_read.mapq < q_cut: | |
continue | |
if aligned_read.is_paired: | |
if aligned_read.is_read1:read_id = '1' | |
if aligned_read.is_read2:read_id = '2' | |
if aligned_read.is_reverse:map_strand = '-' | |
else:map_strand = '+' | |
key = read_id + map_strand | |
hit_st = aligned_read.pos | |
for block in bam_cigar.fetch_exon(chr_name, hit_st, aligned_read.cigar): | |
for pos in range(block[1]+1,block[2]+1): | |
if len(strandRule) == 0: Fwig[pos] +=1.0 #this is NOT strand specific. everything into Fwig | |
else: #this is strand specific. separate Fwig and Rwig | |
if strandRule[key] == '+':Fwig[pos] +=1.0 | |
if strandRule[key] == '-':Rwig[pos] -=1.0 | |
if WigSumFactor is None: #not normalize | |
if len(strandRule) == 0: #this is NOT strand specific. | |
for pos in sorted (Fwig.keys()): | |
print("%d\t%.2f" % (pos,Fwig[pos]), file=FWO) | |
else: | |
for pos in sorted (Fwig.keys()): | |
print("%d\t%.2f" % (pos,Fwig[pos]), file=FWO) | |
for pos in sorted (Rwig.keys()): | |
print("%d\t%.2f" % (pos,Rwig[pos]), file=RVO) | |
else: #normalize wig signal to WigSumFactor | |
if len(strandRule) == 0: #this is NOT strand specific. | |
for pos in sorted (Fwig.keys()): | |
print("%d\t%.2f" % (pos,Fwig[pos]*WigSumFactor), file=FWO) | |
else: | |
for pos in sorted (Fwig.keys()): | |
print("%d\t%.2f" % (pos,Fwig[pos]*WigSumFactor), file=FWO) | |
for pos in sorted (Rwig.keys()): | |
print("%d\t%.2f" % (pos,Rwig[pos]*WigSumFactor), file=RVO) | |
FWO.close() | |
if len(strandRule) != 0: | |
RVO.close() | |
if len(strandRule) == 0: | |
try: | |
import subprocess | |
print("Run " + "wigToBigWig " + outfile + '.wig ' + chrom_file + ' ' + outfile + ".bw ") | |
subprocess.call("wigToBigWig -clip " + outfile + '.wig ' + chrom_file + ' ' + outfile + ".bw ",shell=True) | |
except: | |
print("Failed to call \"wigToBigWig\".", file=sys.stderr) | |
pass | |
else: | |
try: | |
import subprocess | |
subprocess.call("wigToBigWig -clip " + outfile + '.Forward.wig ' + chrom_file + ' ' + outfile + ".Forward.bw ",shell=True) | |
subprocess.call("wigToBigWig -clip " + outfile + '.Reverse.wig ' + chrom_file + ' ' + outfile + ".Reverse.bw ",shell=True) | |
except: | |
print("Failed to call \"wigToBigWig\".", file=sys.stderr) | |
pass | |
def calWigSum(self,chrom_sizes, skip_multi=True): | |
"""Calculate wigsum from BAM file""" | |
print("Calcualte wigsum ... ", file=sys.stderr) | |
wigsum = 0.0 | |
read_id='' | |
for chr_name, chr_size in list(chrom_sizes.items()): #iterate each chrom | |
try: | |
self.samfile.fetch(chr_name,0,chr_size) | |
except: | |
print("No alignments for " + chr_name + '. skipped', file=sys.stderr) | |
continue | |
print("Processing " + chr_name + " ...", file=sys.stderr) | |
alignedReads = self.samfile.fetch(chr_name,0,chr_size) | |
for aligned_read in alignedReads: | |
flag=0 | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.is_duplicate:continue #skip duplicate read | |
if aligned_read.is_secondary:continue #skip non primary hit | |
if aligned_read.is_unmapped:continue #skip unmap read | |
if skip_multi: | |
if len(aligned_read.tags)>0: #( ("NM", 1),("RG", "L1") ) | |
for i in aligned_read.tags: | |
if i[0] in ParseBAM.multi_hit_tags and i[1] >1: | |
flag=1 #multiple hit read | |
break | |
if flag==1:continue #skip multiple map read | |
if aligned_read.is_paired: | |
if aligned_read.is_read1:read_id = '1' | |
if aligned_read.is_read2:read_id = '2' | |
if aligned_read.is_reverse:map_strand = '-' | |
else:map_strand = '+' | |
key = read_id + map_strand | |
hit_st = aligned_read.pos | |
for block in bam_cigar.fetch_exon(chr_name, hit_st, aligned_read.cigar): | |
wigsum += (block[2] - block[1]) | |
return wigsum | |
def bam2fq(self,prefix, paired = True): | |
"""Convert BAM/SAM into fastq files""" | |
transtab = str.maketrans("ACGTNX","TGCANX") | |
if paired: | |
OUT1 = open(prefix + '.R1.fastq','w') | |
OUT2 = open(prefix + '.R2.fastq','w') | |
read1_count = 0 | |
read2_count = 0 | |
else: | |
OUT = open(prefix + '.fastq','w') | |
read_count = 0 | |
read_name = '' | |
read_seq = '' | |
read_qual = '' | |
print("Convert BAM/SAM file into fastq format ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
read_name = aligned_read.qname | |
read_seq = aligned_read.seq.upper() | |
read_qual = aligned_read.qual | |
if aligned_read.is_reverse: | |
read_seq = read_seq.translate(transtab)[::-1] | |
read_qual = read_qual[::-1] | |
if paired: | |
if aligned_read.is_read1: | |
read1_count += 1 | |
if not read_name.endswith('/1'): | |
print('@' + read_name + '/1', file=OUT1) | |
else: | |
print('@' + read_name, file=OUT1) | |
print(read_seq, file=OUT1) | |
print('+', file=OUT1) | |
print(read_qual, file=OUT1) | |
if aligned_read.is_read2: | |
read2_count += 1 | |
if not read_name.endswith('/2'): | |
print('@' + read_name + '/2', file=OUT2) | |
else: | |
print('@' + read_name, file=OUT2) | |
print(read_seq, file=OUT2) | |
print('+', file=OUT2) | |
print(read_qual, file=OUT2) | |
else: #single end | |
read_count += 1 | |
print('@' + read_name, file=OUT) | |
print(read_seq, file=OUT) | |
print('+', file=OUT) | |
print(read_qual, file=OUT) | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
if paired: | |
print("read_1 count: %d" % read1_count, file=sys.stderr) | |
print("read_2 count: %d" % read2_count, file=sys.stderr) | |
else: | |
print("read count: %d" % read_count, file=sys.stderr) | |
def calculate_rpkm(self,geneFile,outfile,strand_rule=None): | |
'''calculate RPKM values. For single-end, strand-specific RNA-seq, we assume that | |
a read mapped to the plus strand indicates a gene on the plus strand (and similarly for minus). | |
Advantages: works for both SAM and BAM | |
works for both sorted and unsorted BAM/SAM files | |
works for both indexed and unindexed BAM/SAM files | |
much faster than indexing the BAM file | |
Disadvantage: random access to the BAM file is disabled, so a large amount of RAM is required | |
strand_rule: could be one of the following values: | |
'1++,1--,2+-,2-+' | |
'1+-,1-+,2++,2--' | |
'++,--' | |
'+-,-+' | |
None | |
''' | |
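# Illustrative sketch of the strand_rule parsing below: a paired-end rule such as '1++,1--,2+-,2-+' | |
# yields {i[0]+i[1]: i[2] for i in strand_rule.split(',')} == {'1+': '+', '1-': '-', '2+': '-', '2-': '+'}, | |
# i.e. read-1 mapped to '+' supports a gene on the '+' strand. The values written later follow | |
# RPKM = hits * 1e9 / (feature_length * total_tags), where total_tags counts aligned exon blocks. | |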
strandRule={} | |
if strand_rule is None: # Not strand-specific | |
pass | |
elif len(strand_rule.split(',')) ==4: #PairEnd, strand-specific | |
for i in strand_rule.split(','):strandRule[i[0]+i[1]]=i[2] | |
elif len(strand_rule.split(',')) ==2: #singeEnd, strand-specific | |
for i in strand_rule.split(','):strandRule[i[0]]=i[1] | |
else: | |
print("Unknown value of option :'strand_rule' " + strand_rule, file=sys.stderr) | |
sys.exit(1) | |
uniq_read=0 | |
total_tags=0 | |
plus_ranges={} | |
minus_ranges={} | |
unstrand_ranges={} | |
rpkm_value={} | |
RPKM_OUT = open(outfile,'w') | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
#current_pos = self.samfile.tell() | |
try: | |
while(1): | |
flag=0 | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.is_duplicate:continue #skip duplicate read | |
if aligned_read.is_secondary:continue #skip non primary hit | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if len(aligned_read.tags)>0: #( ("NM", 1),("RG", "L1") ) | |
for i in aligned_read.tags: | |
if i[0] in ParseBAM.multi_hit_tags and i[1] >1: | |
flag=1 #multiple hit read | |
break | |
if flag==1:continue #skip multiple map read | |
uniq_read +=1 | |
if aligned_read.is_paired: | |
if aligned_read.is_read1:read_id = '1' | |
if aligned_read.is_read2:read_id = '2' | |
else: | |
read_id = '' | |
if aligned_read.is_reverse:map_strand = '-' | |
else:map_strand = '+' | |
strand_key = read_id + map_strand | |
chrom = self.samfile.getrname(aligned_read.tid).upper() | |
hit_st = aligned_read.pos | |
exon_blocks = bam_cigar.fetch_exon(chrom, hit_st, aligned_read.cigar) | |
total_tags += len(exon_blocks) | |
#construct bitset | |
if strand_rule is not None: | |
if strandRule[strand_key] == '+': | |
for block in exon_blocks: | |
mid = block[1] + int((block[2] - block[1])/2) | |
if chrom not in plus_ranges:plus_ranges[chrom] = Intersecter() | |
plus_ranges[chrom].add_interval( Interval( mid,mid+1 ) ) | |
elif strandRule[strand_key] == '-': | |
for block in exon_blocks: | |
mid = block[1] + int((block[2] - block[1])/2) | |
if chrom not in minus_ranges:minus_ranges[chrom] = Intersecter() | |
minus_ranges[chrom].add_interval( Interval( mid,mid+1 ) ) | |
elif strand_rule is None: | |
for block in exon_blocks: | |
mid = block[1] + int((block[2] - block[1])/2) | |
if chrom not in unstrand_ranges:unstrand_ranges[chrom] = Intersecter() | |
unstrand_ranges[chrom].add_interval( Interval( mid,mid+1 ) ) | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
#self.samfile.seek(current_pos) | |
print("#Total uniquely mapped reads = " + str(uniq_read), file=RPKM_OUT) | |
print("#Total fragments = " + str(total_tags), file=RPKM_OUT) | |
print("Assign reads to "+ geneFile + '...', end=' ', file=sys.stderr) | |
for line in open(geneFile,'r'): | |
try: | |
if line.startswith('#'):continue | |
if line.startswith('track'):continue | |
if line.startswith('browser'):continue | |
# Parse fields from the gene table | |
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5].replace(" ","_") | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)) | |
exon_sizes = list(map(int,fields[10].rstrip(',\n').split(','))) | |
intron_starts = exon_ends[:-1] | |
intron_ends=exon_starts[1:] | |
key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, end=' ', file=sys.stderr) | |
continue | |
mRNA_count=0 | |
mRNA_len=sum(exon_sizes) | |
if (strand_rule is not None) and (strand == '-'): | |
intronNum=len(intron_starts) | |
exonNum=len(exon_starts) | |
# assign reads to intron | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in minus_ranges: | |
hits= len(minus_ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(total_tags))) +'\n') | |
intronNum -= 1 | |
# assign reads to exon | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in minus_ranges: | |
hits= len(minus_ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(total_tags))) +'\n') | |
exonNum -= 1 | |
mRNA_count += hits | |
try: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(mRNA_count) + "\t" + strand + '\t' + str(mRNA_count*1000000000.0/(mRNA_len*total_tags)) +'\n') | |
except: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(0) + "\t" + strand + '\t' + str(0) +'\n') | |
elif (strand_rule is not None) and (strand == '+'): | |
intronNum=1 | |
exonNum=1 | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in plus_ranges: | |
hits= len(plus_ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(total_tags))) +'\n') | |
intronNum += 1 | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in plus_ranges: | |
hits= len(plus_ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(total_tags))) +'\n') | |
exonNum += 1 | |
mRNA_count += hits | |
try: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(mRNA_count) + "\t" + strand + '\t' + str(mRNA_count*1000000000.0/(mRNA_len*total_tags)) +'\n') | |
except: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(0) + "\t" + strand + '\t' + str(0) +'\n') | |
elif strand_rule is None: | |
intronNum=1 | |
exonNum=1 | |
for st,end in zip(intron_starts,intron_ends): | |
if chrom in unstrand_ranges: | |
hits= len(unstrand_ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_intron_" + str(intronNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(total_tags))) +'\n') | |
intronNum += 1 | |
for st,end in zip(exon_starts,exon_ends): | |
if chrom in unstrand_ranges: | |
hits= len(unstrand_ranges[chrom].find(st,end)) | |
RPKM_OUT.write(chrom.lower() + "\t" + str(st) + "\t" + str(end) + "\t" + geneName + "_exon_" + str(exonNum) + "\t" + str(hits) + "\t" + strand + '\t' + str(hits*1000000000.0/((end-st)*(total_tags))) +'\n') | |
exonNum += 1 | |
mRNA_count += hits | |
try: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(mRNA_count) + "\t" + strand + '\t' + str(mRNA_count*1000000000.0/(mRNA_len*total_tags)) +'\n') | |
except: | |
RPKM_OUT.write(chrom.lower() + "\t" + str(tx_start) + "\t" + str(tx_end) + "\t" + geneName + "_mRNA" + "\t" + str(0) + "\t" + strand + '\t' + str(0) +'\n') | |
print("Done", file=sys.stderr) | |
def readsNVC(self,outfile=None,nx=True, q_cut = 30): | |
'''for each read, calculate nucleotide frequency vs position''' | |
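# Illustrative note: base_freq is keyed by read position plus nucleotide, so processing the read | |
# "ACG" increments base_freq['0A'], base_freq['1C'] and base_freq['2G']. Reverse-strand reads are | |
# reverse-complemented first so positions follow the sequenced read's 5'->3' orientation. | |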
if outfile is None: | |
outfile1 = self.fileName + ".NVC.xls" | |
outfile2 = self.fileName +".NVC_plot.r" | |
else: | |
outfile1 = outfile + ".NVC.xls" | |
outfile2 = outfile +".NVC_plot.r" | |
FO=open(outfile1,'w') | |
RS=open(outfile2,'w') | |
PPcount=0 | |
transtab = str.maketrans("ACGTNX","TGCANX") | |
base_freq=collections.defaultdict(int) | |
a_count=[] | |
c_count=[] | |
g_count=[] | |
t_count=[] | |
n_count=[] | |
x_count=[] | |
if self.bam_format:print("Read BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Read SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.mapq < q_cut: continue | |
#if aligned_read.is_unmapped:continue #skip unmapped read | |
#if aligned_read.is_qcfail:continue #skip low quality | |
RNA_read = aligned_read.seq.upper() | |
if aligned_read.is_reverse: | |
RNA_read = RNA_read.translate(transtab)[::-1] | |
for i,j in enumerate(RNA_read): | |
key = str(i) + j | |
base_freq[key] += 1 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("generating data matrix ...", file=sys.stderr) | |
print("Position\tA\tC\tG\tT\tN\tX", file=FO) | |
for i in range(len(RNA_read)): | |
print(str(i) + '\t', end=' ', file=FO) | |
print(str(base_freq[str(i) + "A"]) + '\t', end=' ', file=FO) | |
a_count.append(str(base_freq[str(i) + "A"])) | |
print(str(base_freq[str(i) + "C"]) + '\t', end=' ', file=FO) | |
c_count.append(str(base_freq[str(i) + "C"])) | |
print(str(base_freq[str(i) + "G"]) + '\t', end=' ', file=FO) | |
g_count.append(str(base_freq[str(i) + "G"])) | |
print(str(base_freq[str(i) + "T"]) + '\t', end=' ', file=FO) | |
t_count.append(str(base_freq[str(i) + "T"])) | |
print(str(base_freq[str(i) + "N"]) + '\t', end=' ', file=FO) | |
n_count.append(str(base_freq[str(i) + "N"])) | |
print(str(base_freq[str(i) + "X"]) + '\t', file=FO) | |
x_count.append(str(base_freq[str(i) + "X"])) | |
FO.close() | |
#generating R scripts | |
print("generating R script ...", file=sys.stderr) | |
print("position=c(" + ','.join([str(i) for i in range(len(RNA_read))]) + ')', file=RS) | |
print("A_count=c(" + ','.join(a_count) + ')', file=RS) | |
print("C_count=c(" + ','.join(c_count) + ')', file=RS) | |
print("G_count=c(" + ','.join(g_count) + ')', file=RS) | |
print("T_count=c(" + ','.join(t_count) + ')', file=RS) | |
print("N_count=c(" + ','.join(n_count) + ')', file=RS) | |
print("X_count=c(" + ','.join(x_count) + ')', file=RS) | |
if nx: | |
print("total= A_count + C_count + G_count + T_count + N_count + X_count", file=RS) | |
print("ym=max(A_count/total,C_count/total,G_count/total,T_count/total,N_count/total,X_count/total) + 0.05", file=RS) | |
print("yn=min(A_count/total,C_count/total,G_count/total,T_count/total,N_count/total,X_count/total)", file=RS) | |
print('pdf(\"%s\")' % (outfile +".NVC_plot.pdf"), file=RS) | |
print('plot(position,A_count/total,type="o",pch=20,ylim=c(yn,ym),col="dark green",xlab="Position of Read",ylab="Nucleotide Frequency")', file=RS) | |
print('lines(position,T_count/total,type="o",pch=20,col="red")', file=RS) | |
print('lines(position,G_count/total,type="o",pch=20,col="blue")', file=RS) | |
print('lines(position,C_count/total,type="o",pch=20,col="cyan")', file=RS) | |
print('lines(position,N_count/total,type="o",pch=20,col="black")', file=RS) | |
print('lines(position,X_count/total,type="o",pch=20,col="grey")', file=RS) | |
print('legend('+ str(len(RNA_read)-10) + ',ym,legend=c("A","T","G","C","N","X"),col=c("dark green","red","blue","cyan","black","grey"),lwd=2,pch=20,text.col=c("dark green","red","blue","cyan","black","grey"))', file=RS) | |
print("dev.off()", file=RS) | |
else: | |
print("total= A_count + C_count + G_count + T_count", file=RS) | |
print("ym=max(A_count/total,C_count/total,G_count/total,T_count/total) + 0.05", file=RS) | |
print("yn=min(A_count/total,C_count/total,G_count/total,T_count/total)", file=RS) | |
print('pdf(\"%s\")' % (outfile +".NVC_plot.pdf"), file=RS) | |
print('plot(position,A_count/total,type="o",pch=20,ylim=c(yn,ym),col="dark green",xlab="Position of Read",ylab="Nucleotide Frequency")', file=RS) | |
print('lines(position,T_count/total,type="o",pch=20,col="red")', file=RS) | |
print('lines(position,G_count/total,type="o",pch=20,col="blue")', file=RS) | |
print('lines(position,C_count/total,type="o",pch=20,col="cyan")', file=RS) | |
print('legend('+ str(len(RNA_read)-10) + ',ym,legend=c("A","T","G","C"),col=c("dark green","red","blue","cyan"),lwd=2,pch=20,text.col=c("dark green","red","blue","cyan"))', file=RS) | |
print("dev.off()", file=RS) | |
RS.close() | |
#self.f.seek(0) | |
def readsQual_boxplot(self,outfile,shrink=1000, q_cut=30): | |
'''calculate phred quality score for each base in read (5->3)''' | |
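# Illustrative note: the per-base quality is decoded below as Phred = ord(char) - 33 (the +33 | |
# ASCII offset of Sanger/Illumina 1.8+ encoding), e.g. 'I' -> 73 - 33 = 40; quality[pos][q] then | |
# counts how many reads carry quality score q at read position pos. | |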
output = outfile + ".qual.r" | |
FO=open(output,'w') | |
if self.bam_format:print("Read BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Read SAM file ... ", end=' ', file=sys.stderr) | |
quality = collections.defaultdict(dict) #read_pos=>quality score=>count | |
q_max = -1 | |
q_min = 10000 | |
q_list=[] | |
i_box={} #key is read position, value is an R expression encoding the quality-score distribution at that position | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.mapq < q_cut: continue | |
#if aligned_read.is_unmapped:continue #skip unmapped read | |
#if aligned_read.is_qcfail:continue #skip low quality | |
qual_str = aligned_read.qqual | |
read_len = aligned_read.rlen | |
if aligned_read.is_reverse: | |
qual_str = qual_str[::-1] | |
for i,j in enumerate(qual_str): | |
q=ord(j)-33 | |
if q > q_max: q_max = q | |
if q < q_min: q_min = q | |
try: | |
quality[i][q] += 1 | |
except: | |
quality[i][q] = 1 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
for p in range(0,read_len): | |
#print str(p) + ':', | |
val=[] | |
occurrence=[] | |
for q in range(q_min,q_max+1): | |
if p in quality and q in quality[p]: | |
val.append(str(q)) | |
occurrence.append(str(quality[p][q])) | |
q_list.append(str(quality[p][q])) | |
else: | |
q_list.append(str(0)) | |
i_box[p] = 'rep(c(' + ','.join(val) + '),times=c(' + ','.join(occurrence) + ')/' + str(shrink)+ ')' | |
#generate R script for boxplot | |
print("pdf(\'%s\')" % (outfile + ".qual.boxplot.pdf"), file=FO) | |
for i in sorted(i_box): | |
print('p'+str(i) + '<-' + i_box[i], file=FO) | |
print('boxplot(' + ','.join(['p'+str(i) for i in i_box]) + ',xlab=\"Position of Read(5\'->3\')\",ylab=\"Phred Quality Score\",outline=F' + ')', file=FO) | |
print("dev.off()", file=FO) | |
#generate R script for heatmap | |
print('\n', file=FO) | |
print("pdf(\'%s\')" % (outfile + ".qual.heatmap.pdf"), file=FO) | |
print("qual=c(" + ','.join(q_list) + ')', file=FO) | |
print("mat=matrix(qual,ncol=%s,byrow=F)" % (read_len), file=FO) | |
print('Lab.palette <- colorRampPalette(c("blue", "orange", "red3","red2","red1","red"), space = "rgb",interpolate=c(\'spline\'))', file=FO) | |
print("heatmap(mat,Rowv=NA,Colv=NA,xlab=\"Position of Read\",ylab=\"Phred Quality Score\",labRow=seq(from=%s,to=%s),col = Lab.palette(256),scale=\"none\" )" % (q_min,q_max), file=FO) | |
print('dev.off()', file=FO) | |
def readGC(self,outfile=None, q_cut=30): | |
'''GC content distribution of reads''' | |
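# Illustrative example of the GC calculation below: for a read "GCGCAT", | |
# (count('C') + count('G')) / len * 100 = 4/6 * 100, formatted with "%4.2f" as '66.67', | |
# and gc_hist['66.67'] is incremented by one. | |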
if outfile is None: | |
outfile1 = self.fileName + ".GC.xls" | |
outfile2 = self.fileName +".GC_plot.r" | |
else: | |
outfile1 = outfile + ".GC.xls" | |
outfile2 = outfile + ".GC_plot.r" | |
FO=open(outfile1,'w') | |
RS=open(outfile2,'w') | |
gc_hist=collections.defaultdict(int) #key is GC percent, value is count of reads | |
if self.bam_format:print("Read BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Read SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.mapq < q_cut: continue | |
RNA_read = aligned_read.seq.upper() | |
gc_percent = "%4.2f" % ((RNA_read.count('C') + RNA_read.count('G'))/(len(RNA_read)+0.0)*100) | |
#print gc_percent | |
gc_hist[gc_percent] += 1 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("writing GC content ...", file=sys.stderr) | |
print("GC%\tread_count", file=FO) | |
for i in list(gc_hist.keys()): | |
print(i + '\t' + str(gc_hist[i]), file=FO) | |
print("writing R script ...", file=sys.stderr) | |
print("pdf(\"%s\")" % (outfile + ".GC_plot.pdf"), file=RS) | |
print('gc=rep(c(' + ','.join([i for i in list(gc_hist.keys())]) + '),' + 'times=c(' + ','.join([str(i) for i in list(gc_hist.values())]) + '))', file=RS) | |
print('hist(gc,probability=T,breaks=%d,xlab="GC content (%%)",ylab="Density of Reads",border="blue",main="")' % 100, file=RS) | |
#print >>RS, "lines(density(gc),col='red')" | |
print("dev.off()", file=RS) | |
#self.f.seek(0) | |
def readDupRate(self,q_cut, outfile=None,up_bound=500): | |
'''Calculate the read duplication rate''' | |
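# Illustrative note: duplication is assessed in two ways. seqDup is keyed by the read sequence | |
# itself, while posDup is keyed by "chrom:start:" plus the aligned exon blocks, e.g. a spliced | |
# read may produce the key "chr1:1000:1000-1060:1200-1240:". | |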
if outfile is None: | |
outfile1 = self.fileName + ".seq.DupRate.xls" | |
outfile2 = self.fileName + ".pos.DupRate.xls" | |
outfile3 = self.fileName + ".DupRate_plot.r" | |
else: | |
outfile1 = outfile + ".seq.DupRate.xls" | |
outfile2 = outfile + ".pos.DupRate.xls" | |
outfile3 = outfile +".DupRate_plot.r" | |
SEQ=open(outfile1,'w') | |
POS=open(outfile2,'w') | |
RS=open(outfile3,'w') | |
seqDup=collections.defaultdict(int) | |
posDup=collections.defaultdict(int) | |
seqDup_count=collections.defaultdict(int) | |
posDup_count=collections.defaultdict(int) | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
exon_boundary="" | |
aligned_read = next(self.samfile) | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.mapq < q_cut: continue | |
RNA_read = aligned_read.seq.upper() | |
seqDup[RNA_read] +=1 #key is read sequence | |
chrom = self.samfile.getrname(aligned_read.tid) | |
hit_st = aligned_read.pos | |
exon_blocks = bam_cigar.fetch_exon(chrom, hit_st, aligned_read.cigar) | |
for ex in exon_blocks: | |
exon_boundary += str(ex[1]) + '-' + str(ex[2]) + ":" | |
key = chrom + ":" + str(hit_st) + ":" + exon_boundary | |
posDup[key] +=1 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("report duplicte rate based on sequence ...", file=sys.stderr) | |
print("Occurrence\tUniqReadNumber", file=SEQ) | |
for i in list(seqDup.values()): #key is occurence, value is uniq reads number (based on seq) | |
seqDup_count[i] +=1 | |
for k in sorted(seqDup_count.keys()): | |
print(str(k) +'\t'+ str(seqDup_count[k]), file=SEQ) | |
SEQ.close() | |
print("report duplicte rate based on mapping ...", file=sys.stderr) | |
print("Occurrence\tUniqReadNumber", file=POS) | |
for i in list(posDup.values()): #key is occurence, value is uniq reads number (based on coord) | |
posDup_count[i] +=1 | |
for k in sorted(posDup_count.keys()): | |
print(str(k) +'\t'+ str(posDup_count[k]), file=POS) | |
POS.close() | |
print("generate R script ...", file=sys.stderr) | |
print("pdf(\'%s\')" % (outfile +".DupRate_plot.pdf"), file=RS) | |
print("par(mar=c(5,4,4,5),las=0)", file=RS) | |
print("seq_occ=c(" + ','.join([str(i) for i in sorted(seqDup_count.keys()) ]) + ')', file=RS) | |
print("seq_uniqRead=c(" + ','.join([str(seqDup_count[i]) for i in sorted(seqDup_count.keys()) ]) + ')', file=RS) | |
print("pos_occ=c(" + ','.join([str(i) for i in sorted(posDup_count.keys()) ]) + ')', file=RS) | |
print("pos_uniqRead=c(" + ','.join([str(posDup_count[i]) for i in sorted(posDup_count.keys()) ]) + ')', file=RS) | |
print("plot(pos_occ,log10(pos_uniqRead),ylab='Number of Reads (log10)',xlab='Occurrence of read',pch=4,cex=0.8,col='blue',xlim=c(1,%d),yaxt='n')" % up_bound, file=RS) | |
print("points(seq_occ,log10(seq_uniqRead),pch=20,cex=0.8,col='red')", file=RS) | |
print('ym=floor(max(log10(pos_uniqRead)))', file=RS) | |
print("legend(%d,ym,legend=c('Sequence-based','Mapping-based'),col=c('blue','red'),pch=c(4,20))" % max(up_bound-200,1), file=RS) | |
print('axis(side=2,at=0:ym,labels=0:ym)', file=RS) | |
print('axis(side=4,at=c(log10(pos_uniqRead[1]),log10(pos_uniqRead[2]),log10(pos_uniqRead[3]),log10(pos_uniqRead[4])), labels=c(round(pos_uniqRead[1]*100/sum(pos_uniqRead*pos_occ)),round(pos_uniqRead[2]*100/sum(pos_uniqRead*pos_occ)),round(pos_uniqRead[3]*100/sum(pos_uniqRead*pos_occ)),round(pos_uniqRead[4]*100/sum(pos_uniqRead*pos_occ))))', file=RS) | |
print('mtext(4, text = "Reads %", line = 2)', file=RS) | |
print('dev.off()', file=RS) | |
#self.f.seek(0) | |
def clipping_profile(self,outfile, q_cut, PE, type="S"): | |
'''calculate the per-position profile of clipped bases (CIGAR operation given by "type", soft clipping "S" by default)''' | |
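# Illustrative note: bam_cigar.list2longstr() expands CIGAR tuples into one character per base, | |
# e.g. [(0, 9), (4, 1)] -> 'MMMMMMMMMS'. The loops below tally, for each read position, how many | |
# reads carry the clipping operation (given by "type", 'S' by default) at that position, after | |
# reversing the string for reverse-strand reads. | |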
out_file1 = outfile + ".clipping_profile.xls" | |
out_file2 = outfile + ".clipping_profile.r" | |
OUT = open(out_file1,'w') | |
ROUT = open(out_file2,'w') | |
print("Position\tClipped_nt\tNon_clipped_nt", file=OUT) | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
cigar_str = "" | |
#single end sequencing | |
if PE is False: | |
total_read = 0.0 | |
soft_clip_profile = collections.defaultdict(int) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.mapq < q_cut: continue | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.is_qcfail:continue #skip low quality | |
total_read +=1 | |
cigar_str = bam_cigar.list2longstr(aligned_read.cigar) # ([(0, 9), (4, 1)] ==> MMMMMMMMMS | |
if type not in cigar_str: # no clipping | |
continue | |
if aligned_read.is_reverse: | |
cigar_str = cigar_str[::-1] | |
for indx,symbl in enumerate(cigar_str): | |
if symbl == type: | |
soft_clip_profile[indx] += 1.0 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("Totoal reads used: %d" % int(total_read), file=sys.stderr) | |
read_pos = list(range(0,len(cigar_str))) | |
clip_count = [] | |
for i in read_pos: | |
print(str(i) + '\t' + str(soft_clip_profile[i]) + '\t' + str(total_read - soft_clip_profile[i]), file=OUT) | |
clip_count.append(soft_clip_profile[i]) | |
print("pdf(\"%s\")" % (outfile + '.clipping_profile.pdf'), file=ROUT) | |
print("read_pos=c(%s)" % ','.join([str(i) for i in read_pos]), file=ROUT) | |
print("clip_count=c(%s)" % ','.join([str(i) for i in clip_count]), file=ROUT) | |
print("nonclip_count= %d - clip_count" % (total_read), file=ROUT) | |
print('plot(read_pos, nonclip_count*100/(clip_count+nonclip_count),col="blue",main="clipping profile",xlab="Position of read",ylab="Non-clipped %",type="b")', file=ROUT) | |
print("dev.off()", file=ROUT) | |
if PE is True: | |
total_read1 = 0.0 | |
total_read2 = 0.0 | |
r1_soft_clip_profile = collections.defaultdict(int) | |
r2_soft_clip_profile = collections.defaultdict(int) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.mapq < q_cut: continue | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.is_qcfail:continue #skip low quality | |
if not aligned_read.is_paired: continue | |
if aligned_read.is_read1: | |
total_read1 += 1 | |
if aligned_read.is_read2: | |
total_read2 += 1 | |
cigar_str = bam_cigar.list2longstr(aligned_read.cigar) # ([(0, 9), (4, 1)] ==> MMMMMMMMMS | |
if aligned_read.is_reverse: | |
cigar_str = cigar_str[::-1] | |
if type not in cigar_str: # no clipping | |
continue | |
if aligned_read.is_read1: | |
for indx,symbl in enumerate(cigar_str): | |
if symbl == type: | |
r1_soft_clip_profile[indx] += 1.0 | |
if aligned_read.is_read2: | |
for indx,symbl in enumerate(cigar_str): | |
if symbl == type: | |
r2_soft_clip_profile[indx] += 1.0 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
read_pos = list(range(0,len(cigar_str))) | |
r1_clip_count = [] | |
r2_clip_count = [] | |
print("Totoal read-1 used: %d" % int(total_read1), file=sys.stderr) | |
print("Totoal read-2 used: %d" % int(total_read2), file=sys.stderr) | |
print("Read-1:", file=OUT) | |
for i in read_pos: | |
print(str(i) + '\t' + str(r1_soft_clip_profile[i]) + '\t' + str(total_read1 - r1_soft_clip_profile[i]), file=OUT) | |
r1_clip_count.append(r1_soft_clip_profile[i]) | |
print("Read-2:", file=OUT) | |
for i in read_pos: | |
print(str(i) + '\t' + str(r2_soft_clip_profile[i]) + '\t' + str(total_read2 - r2_soft_clip_profile[i]), file=OUT) | |
r2_clip_count.append(r2_soft_clip_profile[i]) | |
print("pdf(\"%s\")" % (outfile + '.clipping_profile.R1.pdf'), file=ROUT) | |
print("read_pos=c(%s)" % ','.join([str(i) for i in read_pos]), file=ROUT) | |
print("r1_clip_count=c(%s)" % ','.join([str(i) for i in r1_clip_count]), file=ROUT) | |
print("r1_nonclip_count = %d - r1_clip_count" % (total_read1), file=ROUT) | |
print('plot(read_pos, r1_nonclip_count*100/(r1_clip_count + r1_nonclip_count),col="blue",main="clipping profile",xlab="Position of read (read-1)",ylab="Non-clipped %",type="b")', file=ROUT) | |
print("dev.off()", file=ROUT) | |
print("pdf(\"%s\")" % (outfile + '.clipping_profile.R2.pdf'), file=ROUT) | |
print("read_pos=c(%s)" % ','.join([str(i) for i in read_pos]), file=ROUT) | |
print("r2_clip_count=c(%s)" % ','.join([str(i) for i in r2_clip_count]), file=ROUT) | |
print("r2_nonclip_count = %d - r2_clip_count" % (total_read2), file=ROUT) | |
print('plot(read_pos, r2_nonclip_count*100/(r2_clip_count + r2_nonclip_count),col="blue",main="clipping profile",xlab="Position of read (read-2)",ylab="Non-clipped %",type="b")', file=ROUT) | |
print("dev.off()", file=ROUT) | |
def insertion_profile(self,outfile, q_cut, PE, type="I"): | |
'''calculate profile of insertion''' | |
out_file1 = outfile + ".insertion_profile.xls" | |
out_file2 = outfile + ".insertion_profile.r" | |
OUT = open(out_file1,'w') | |
ROUT = open(out_file2,'w') | |
print("Position\tInsert_nt\tNon_insert_nt", file=OUT) | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
cigar_str = "" | |
#single end sequencing | |
if PE is False: | |
total_read = 0.0 | |
soft_clip_profile = collections.defaultdict(int) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.mapq < q_cut: continue | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.is_qcfail:continue #skip low quality | |
total_read +=1 | |
cigar_str = bam_cigar.list2longstr(aligned_read.cigar) # ([(0, 9), (4, 1)] ==> MMMMMMMMMS | |
if type not in cigar_str: # no insertion | |
continue | |
if aligned_read.is_reverse: | |
cigar_str = cigar_str[::-1] | |
for indx,symbl in enumerate(cigar_str): | |
if symbl == type: | |
soft_clip_profile[indx] += 1.0 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("Totoal reads used: %d" % int(total_read), file=sys.stderr) | |
read_pos = list(range(0,len(cigar_str))) | |
clip_count = [] | |
for i in read_pos: | |
print(str(i) + '\t' + str(soft_clip_profile[i]) + '\t' + str(total_read - soft_clip_profile[i]), file=OUT) | |
clip_count.append(soft_clip_profile[i]) | |
print("pdf(\"%s\")" % (outfile + '.insertion_profile.pdf'), file=ROUT) | |
print("read_pos=c(%s)" % ','.join([str(i) for i in read_pos]), file=ROUT) | |
print("insert_count=c(%s)" % ','.join([str(i) for i in clip_count]), file=ROUT) | |
print("noninsert_count= %d - insert_count" % (total_read), file=ROUT) | |
print('plot(read_pos, insert_count*100/(insert_count+noninsert_count),col="blue",main="Insertion profile",xlab="Position of read",ylab="Insertion %",type="b")', file=ROUT) | |
print("dev.off()", file=ROUT) | |
if PE is True: | |
total_read1 = 0.0 | |
total_read2 = 0.0 | |
r1_soft_clip_profile = collections.defaultdict(int) | |
r2_soft_clip_profile = collections.defaultdict(int) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.mapq < q_cut: continue | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.is_qcfail:continue #skip low quality | |
if not aligned_read.is_paired: continue | |
if aligned_read.is_read1: | |
total_read1 += 1 | |
if aligned_read.is_read2: | |
total_read2 += 1 | |
cigar_str = bam_cigar.list2longstr(aligned_read.cigar) # ([(0, 9), (4, 1)] ==> MMMMMMMMMS | |
if aligned_read.is_reverse: | |
cigar_str = cigar_str[::-1] | |
if type not in cigar_str: # no clipping | |
continue | |
if aligned_read.is_read1: | |
for indx,symbl in enumerate(cigar_str): | |
if symbl == type: | |
r1_soft_clip_profile[indx] += 1.0 | |
if aligned_read.is_read2: | |
for indx,symbl in enumerate(cigar_str): | |
if symbl == type: | |
r2_soft_clip_profile[indx] += 1.0 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
read_pos = list(range(0,len(cigar_str))) | |
r1_clip_count = [] | |
r2_clip_count = [] | |
print("Totoal read-1 used: %d" % int(total_read1), file=sys.stderr) | |
print("Totoal read-2 used: %d" % int(total_read2), file=sys.stderr) | |
print("Read-1:", file=OUT) | |
for i in read_pos: | |
print(str(i) + '\t' + str(r1_soft_clip_profile[i]) + '\t' + str(total_read1 - r1_soft_clip_profile[i]), file=OUT) | |
r1_clip_count.append(r1_soft_clip_profile[i]) | |
print("Read-2:", file=OUT) | |
for i in read_pos: | |
print(str(i) + '\t' + str(r2_soft_clip_profile[i]) + '\t' + str(total_read2 - r2_soft_clip_profile[i]), file=OUT) | |
r2_clip_count.append(r2_soft_clip_profile[i]) | |
print("pdf(\"%s\")" % (outfile + '.insertion_profile.R1.pdf'), file=ROUT) | |
print("read_pos=c(%s)" % ','.join([str(i) for i in read_pos]), file=ROUT) | |
print("r1_insert_count=c(%s)" % ','.join([str(i) for i in r1_clip_count]), file=ROUT) | |
print("r1_noninsert_count = %d - r1_insert_count" % (total_read1), file=ROUT) | |
print('plot(read_pos, r1_insert_count*100/(r1_insert_count + r1_noninsert_count),col="blue",main="Insertion profile",xlab="Position of read (read-1)",ylab="Insertion %",type="b")', file=ROUT) | |
print("dev.off()", file=ROUT) | |
print("pdf(\"%s\")" % (outfile + '.insertion_profile.R2.pdf'), file=ROUT) | |
print("read_pos=c(%s)" % ','.join([str(i) for i in read_pos]), file=ROUT) | |
print("r2_insert_count=c(%s)" % ','.join([str(i) for i in r2_clip_count]), file=ROUT) | |
print("r2_noninsert_count = %d - r2_insert_count" % (total_read2), file=ROUT) | |
print('plot(read_pos, r2_insert_count*100/(r2_insert_count + r2_noninsert_count),col="blue",main="Insertion profile",xlab="Position of read (read-2)",ylab="Insertion %",type="b")', file=ROUT) | |
print("dev.off()", file=ROUT) | |
def coverageGeneBody(self,refbed,outfile): | |
'''Calculate read coverage over the gene body, from 5' to 3'. Each gene is equally divided | |
into 100 regions''' | |
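# Illustrative note: per the inline comment further below, mystat.percentile_list() returns 101 | |
# positions sampled along each mRNA (0th to 100th percentile, 5'->3'); coverage[i] counts the | |
# aligned read blocks overlapping the i-th percentile position. Transcripts shorter than 100 nt | |
# are skipped. | |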
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
OUT1 = open(outfile + ".geneBodyCoverage_plot.r",'w') | |
OUT2 = open(outfile + ".geneBodyCoverage.txt",'w') | |
ranges={} | |
totalReads=0 | |
fragment_num=0 #spliced reads are counted multiple times (once per aligned block) | |
rpkm={} | |
#read SAM | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.is_duplicate:continue #skip duplicate read | |
if aligned_read.is_secondary:continue #skip non primary hit | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
totalReads +=1 | |
chrom = self.samfile.getrname(aligned_read.tid).upper() | |
hit_st = aligned_read.pos | |
exon_blocks = bam_cigar.fetch_exon(chrom, hit_st, aligned_read.cigar) | |
fragment_num += len(exon_blocks) | |
for exon in exon_blocks: | |
if chrom not in ranges: | |
ranges[chrom] = Intersecter() | |
ranges[chrom].add_interval( Interval( exon[1], exon[2] ) ) | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("calculating coverage over gene body ...", file=sys.stderr) | |
coverage=collections.defaultdict(int) | |
flag=0 | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from the gene table | |
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5] | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, end=' ', file=sys.stderr) | |
continue | |
gene_all_base=[] | |
percentile_base=[] | |
mRNA_len =0 | |
flag=0 | |
for st,end in zip(exon_starts,exon_ends): | |
gene_all_base.extend(list(range(st+1,end+1))) #0-based coordinates on genome | |
mRNA_len = len(gene_all_base) | |
if mRNA_len <100: | |
continue | |
if strand == '-': | |
gene_all_base.sort(reverse=True) #deal with genes on the minus strand | |
else: | |
gene_all_base.sort(reverse=False) | |
percentile_base = mystat.percentile_list (gene_all_base) #get 101 points from each gene's coordinates | |
for i in range(0,len(percentile_base)): | |
if chrom in ranges: | |
coverage[i] += len(ranges[chrom].find(percentile_base[i], percentile_base[i]+1)) | |
x_coord=[] | |
y_coord=[] | |
print("Total reads: " + str(totalReads), file=OUT2) | |
print("Fragment number: " + str(fragment_num), file=OUT2) | |
print("percentile\tcount", file=OUT2) | |
for i in coverage: | |
x_coord.append(str(i)) | |
y_coord.append(str(coverage[i])) | |
print(str(i) + '\t' + str(coverage[i]), file=OUT2) | |
print("pdf(\'%s\')" % (outfile + ".geneBodyCoverage.pdf"), file=OUT1) | |
print("x=0:100", file=OUT1) | |
print("y=c(" + ','.join(y_coord) + ')', file=OUT1) | |
print("plot(x,y,xlab=\"percentile of gene body (5'->3')\",ylab='read number',type='s')", file=OUT1) | |
print("dev.off()", file=OUT1) | |
def mRNA_inner_distance(self,outfile,refbed,low_bound=0,up_bound=1000,step=10,sample_size=1000000, q_cut=30): | |
'''estimate the inner distance of mRNA paired-end fragments. fragment size = insert_size + 2 x read_length''' | |
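# Illustrative example: for a non-overlapping pair, inner_distance = read2_start - read1_end, | |
# where read1_end = read1_start + read1_len + total spliced intron length; a read-1 starting at | |
# 100 with CIGAR 50M and a mate starting at 180 gives 180 - 150 = 30. When the mates overlap, | |
# the value reported below is negative (minus the number of read-1 exonic bases past the mate's start). | |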
out_file1 = outfile + ".inner_distance.txt" | |
out_file2 = outfile + ".inner_distance_freq.txt" | |
out_file3 = outfile + ".inner_distance_plot.r" | |
FO=open(out_file1,'w') | |
FQ=open(out_file2,'w') | |
RS=open(out_file3,'w') | |
fchrom="chr100" #this is the fake chromosome | |
ranges={} | |
ranges[fchrom]=Intersecter() | |
window_left_bound = list(range(low_bound,up_bound,step)) | |
frag_size=0 | |
inner_distance_bitsets=BinnedBitSet() | |
tmp = BinnedBitSet() | |
tmp.set_range(0,0) | |
pair_num=0 | |
sizes=[] | |
counts=[] | |
count=0 | |
print("Get exon regions from " + refbed + " ...", file=sys.stderr) | |
bed_obj = BED.ParseBED(refbed) | |
ref_exons = [] | |
for exn in bed_obj.getExon(): | |
ref_exons.append([exn[0].upper(), exn[1], exn[2]]) | |
exon_bitsets = binned_bitsets_from_list(ref_exons) | |
transcript_ranges = {} | |
for i_chr, i_st, i_end, i_strand, i_name in bed_obj.getTranscriptRanges(): | |
i_chr = i_chr.upper() | |
if i_chr not in transcript_ranges: | |
transcript_ranges[i_chr] = Intersecter() | |
transcript_ranges[i_chr].add_interval(Interval(i_st, i_end, value=i_name)) #add every transcript, including the first one seen per chromosome | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
if pair_num >= sample_size: | |
break | |
splice_intron_size=0 | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.is_duplicate:continue #skip duplicate read | |
if aligned_read.is_secondary:continue #skip non primary hit | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if not aligned_read.is_paired: continue #skip unpaired (single-end) reads | |
if aligned_read.mate_is_unmapped:continue #skip reads whose mate is unmapped | |
if aligned_read.mapq < q_cut:continue #skip reads below the mapping-quality cutoff | |
read1_len = aligned_read.qlen | |
read1_start = aligned_read.pos | |
read2_start = aligned_read.mpos #0-based, not included | |
if read2_start < read1_start: | |
continue #because BAM file is sorted, mate_read is already processed if its coordinate is smaller | |
if read2_start == read1_start and aligned_read.is_read1: | |
inner_distance = 0 | |
continue | |
pair_num +=1 | |
# check if reads were mapped to diff chromsomes | |
R_read1_ref = self.samfile.getrname(aligned_read.tid) | |
R_read2_ref = self.samfile.getrname(aligned_read.rnext) | |
if R_read1_ref != R_read2_ref: | |
FO.write(aligned_read.qname + '\t' + 'NA' + '\tsameChrom=No\n') #reads mapped to different chromosomes | |
continue | |
chrom = self.samfile.getrname(aligned_read.tid).upper() | |
intron_blocks = bam_cigar.fetch_intron(chrom, read1_start, aligned_read.cigar) | |
for intron in intron_blocks: | |
splice_intron_size += intron[2] - intron[1] | |
read1_end = read1_start + read1_len + splice_intron_size | |
if read2_start >= read1_end: | |
inner_distance = read2_start - read1_end | |
else: | |
exon_positions = [] | |
exon_blocks = bam_cigar.fetch_exon(chrom, read1_start,aligned_read.cigar) | |
for ex in exon_blocks: | |
for i in range(ex[1]+1,ex[2]+1): | |
exon_positions.append(i) | |
inner_distance = -len([i for i in exon_positions if i > read2_start and i <= read1_end]) | |
#print aligned_read.qname,read1_end, read2_start | |
read1_gene_names = set() #read1_end | |
try: | |
for gene in transcript_ranges[chrom].find(read1_end-1, read1_end): #gene: Interval(0, 10, value=a) | |
read1_gene_names.add(gene.value) | |
except: | |
pass | |
read2_gene_names = set() #read2_start | |
try: | |
for gene in transcript_ranges[chrom].find(read2_start, read2_start +1): #gene: Interval(0, 10, value=a) | |
read2_gene_names.add(gene.value) | |
except: | |
pass | |
if len(read1_gene_names.intersection(read2_gene_names)) == 0: # no common gene | |
FO.write(aligned_read.qname + '\t' + str(inner_distance) + '\tsameTranscript=No,dist=genomic\n') #reads mapped to different gene | |
ranges[fchrom].add_interval( Interval( inner_distance-1, inner_distance ) ) | |
continue | |
if inner_distance > 0: | |
if chrom in exon_bitsets: | |
size =0 | |
inner_distance_bitsets.set_range(read1_end, read2_start-read1_end) | |
inner_distance_bitsets.iand(exon_bitsets[chrom]) | |
end=0 | |
while 1: | |
start = inner_distance_bitsets.next_set( end ) | |
if start == inner_distance_bitsets.size: break | |
end = inner_distance_bitsets.next_clear( start ) | |
size += (end - start) | |
inner_distance_bitsets.iand(tmp) #clear BinnedBitSet | |
if size == inner_distance: | |
FO.write(aligned_read.qname + '\t' + str(size) + '\tsameTranscript=Yes,sameExon=Yes,dist=mRNA\n') | |
ranges[fchrom].add_interval( Interval( size-1, size ) ) | |
elif size > 0 and size < inner_distance: | |
FO.write(aligned_read.qname + '\t' + str(size) + '\tsameTranscript=Yes,sameExon=No,dist=mRNA\n') | |
ranges[fchrom].add_interval( Interval( size-1, size ) ) | |
elif size <= 0: | |
FO.write(aligned_read.qname + '\t' + str(inner_distance) + '\tsameTranscript=Yes,nonExonic=Yes,dist=genomic\n') | |
ranges[fchrom].add_interval( Interval( inner_distance-1, inner_distance ) ) | |
else: | |
FO.write(aligned_read.qname + '\t' + str(inner_distance) + '\tunknownChromosome,dist=genomic\n') | |
ranges[fchrom].add_interval( Interval( inner_distance-1, inner_distance ) ) | |
else: | |
FO.write(aligned_read.qname + '\t' + str(inner_distance) + '\treadPairOverlap\n') | |
ranges[fchrom].add_interval( Interval( inner_distance-1, inner_distance ) ) | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("Total read pairs used " + str(pair_num), file=sys.stderr) | |
if pair_num==0: | |
print("Cannot find paired reads", file=sys.stderr) | |
sys.exit(0) | |
for st in window_left_bound: | |
sizes.append(str(st + step/2)) | |
count = str(len(ranges[fchrom].find(st,st + step))) | |
counts.append(count) | |
print(str(st) + '\t' + str(st+step) +'\t' + count, file=FQ) | |
print("out_file = \'%s\'" % outfile, file=RS) | |
print("pdf(\'%s\')" % (outfile + ".inner_distance_plot.pdf"), file=RS) | |
#print >>RS, "par(mfrow=c(2,1),cex.main=0.8,cex.lab=0.8,cex.axis=0.8,mar=c(4,4,4,1))" | |
#print >>RS, 'pie(c(%d,%d,%d),col=rainbow(3),cex=0.5,radius=1,main="Total %d fragments",labels=c("fraSize <= %d\\n(%4.2f%%)","fragSize > %d\\n(%4.2f%%)","%d < fragSize <= %d\\n(%4.2f%%)"), density=rep(80,80,80),angle=c(90,140,170))' % (ultra_low, ultra_high, pair_num -ultra_low -ultra_high, pair_num, low_bound, ultra_low*100/pair_num, up_bound, ultra_high*100/pair_num, low_bound, up_bound, 100-ultra_low*100/pair_num - ultra_high*100/pair_num) | |
print('fragsize=rep(c(' + ','.join(sizes) + '),' + 'times=c(' + ','.join(counts) + '))', file=RS) | |
print('frag_sd = sd(fragsize)', file=RS) | |
print('frag_mean = mean(fragsize)', file=RS) | |
print('frag_median = median(fragsize)', file=RS) | |
print('write(x=c("Name","Mean","Median","sd"), sep="\t", file=stdout(),ncolumns=4)', file=RS) | |
print('write(c(out_file,frag_mean,frag_median,frag_sd),sep="\t", file=stdout(),ncolumns=4)', file=RS) | |
print('hist(fragsize,probability=T,breaks=%d,xlab="mRNA insert size (bp)",main=paste(c("Mean=",frag_mean,";","SD=",frag_sd),collapse=""),border="blue")' % len(window_left_bound), file=RS) | |
print("lines(density(fragsize,bw=%d),col='red')" % (2*step), file=RS) | |
print("dev.off()", file=RS) | |
FO.close() | |
FQ.close() | |
RS.close() | |
#self.f.seek(0) | |
def annotate_junction(self,refgene,outfile,min_intron=50, q_cut=30): | |
'''Annotate splicing junctions in BAM or SAM file. Note that a (long) read might have multiple splicing | |
events (splice multiple times), and the same splicing events can be consolidated into a single | |
junction''' | |
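# Classification used below: a junction is "annotated" when both splice sites match reference | |
# intron boundaries, "complete_novel" when neither matches, and "partial_novel" otherwise; | |
# introns shorter than min_intron are counted as filtered and excluded. | |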
out_file = outfile + ".junction.xls" | |
out_file2 = outfile + ".junction_plot.r" | |
if refgene is None: | |
print("You must provide reference gene model in bed format.", file=sys.stderr) | |
sys.exit(1) | |
OUT = open(out_file,'w') | |
ROUT = open(out_file2,'w') | |
#reading reference gene model | |
refIntronStarts=collections.defaultdict(dict) | |
refIntronEnds=collections.defaultdict(dict) | |
total_junc = 0 | |
novel35_junc = 0 | |
novel3or5_junc = 0 | |
known_junc = 0 | |
filtered_junc = 0 | |
splicing_events=collections.defaultdict(int) | |
print("Reading reference bed file: ",refgene, " ... ", end=' ', file=sys.stderr) | |
for line in open(refgene,'r'): | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from the gene table | |
fields = line.split() | |
if(len(fields)<12): | |
print("Invalid bed line (skipped):",line, end=' ', file=sys.stderr) | |
continue | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
if int(fields[9]) == 1: #skip single-exon transcripts (no introns) | |
continue | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
intron_start = exon_ends[:-1] | |
intron_end=exon_starts[1:] | |
for i_st,i_end in zip (intron_start, intron_end): | |
refIntronStarts[chrom][i_st] =i_st | |
refIntronEnds[chrom][i_end] =i_end | |
print("Done", file=sys.stderr) | |
#reading input SAM file | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.is_duplicate:continue #skip duplicate read | |
if aligned_read.is_secondary:continue #skip non primary hit | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.mapq < q_cut:continue | |
chrom = self.samfile.getrname(aligned_read.tid).upper() | |
hit_st = aligned_read.pos | |
intron_blocks = bam_cigar.fetch_intron(chrom, hit_st, aligned_read.cigar) | |
if len(intron_blocks)==0: | |
continue | |
for intrn in intron_blocks: | |
total_junc +=1 | |
if intrn[2] - intrn[1] < min_intron: | |
filtered_junc += 1 | |
continue | |
splicing_events[intrn[0] + ":" + str(intrn[1]) + ":" + str(intrn[2])] += 1 | |
if (intrn[1] in refIntronStarts[chrom] and intrn[2] in refIntronEnds[chrom]): | |
known_junc +=1 #known both | |
elif (intrn[1] not in refIntronStarts[chrom] and intrn[2] not in refIntronEnds[chrom]): | |
novel35_junc +=1 | |
else: | |
novel3or5_junc +=1 | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("total = " + str(total_junc)) | |
if total_junc == 0: | |
print("No splice junction found.", file=sys.stderr) | |
sys.exit() | |
#self.f.seek(0) | |
print('pdf(\"%s\")' % (outfile + ".splice_events.pdf"), file=ROUT) | |
print("events=c(" + ','.join([str(i*100.0/total_junc) for i in (novel3or5_junc,novel35_junc,known_junc)])+ ')', file=ROUT) | |
print('pie(events,col=c(2,3,4),init.angle=30,angle=c(60,120,150),density=c(70,70,70),main="splicing events",labels=c("partial_novel %d%%","complete_novel %d%%","known %d%%"))' % (round(novel3or5_junc*100.0/total_junc),round(novel35_junc*100.0/total_junc),round(known_junc*100.0/total_junc)), file=ROUT) | |
print("dev.off()", file=ROUT) | |
print("\n===================================================================", file=sys.stderr) | |
print("Total splicing Events:\t" + str(total_junc), file=sys.stderr) | |
print("Known Splicing Events:\t" + str(known_junc), file=sys.stderr) | |
print("Partial Novel Splicing Events:\t" + str(novel3or5_junc), file=sys.stderr) | |
print("Novel Splicing Events:\t" + str(novel35_junc), file=sys.stderr) | |
print("Filtered Splicing Events:\t" + str(filtered_junc), file=sys.stderr) | |
#reset variables | |
total_junc =0 | |
novel35_junc =0 | |
novel3or5_junc =0 | |
known_junc =0 | |
print("chrom\tintron_st(0-based)\tintron_end(1-based)\tread_count\tannotation", file=OUT) | |
for i in splicing_events: | |
total_junc += 1 | |
(chrom, i_st, i_end) = i.split(":") | |
print('\t'.join([chrom.replace("CHR","chr"),i_st,i_end]) + '\t' + str(splicing_events[i]) + '\t', end=' ', file=OUT) | |
i_st = int(i_st) | |
i_end = int(i_end) | |
if (i_st in refIntronStarts[chrom] and i_end in refIntronEnds[chrom]): | |
print("annotated", file=OUT) | |
known_junc +=1 | |
elif (i_st not in refIntronStarts[chrom] and i_end not in refIntronEnds[chrom]): | |
print('complete_novel', file=OUT) | |
novel35_junc +=1 | |
else: | |
print('partial_novel', file=OUT) | |
novel3or5_junc +=1 | |
if total_junc ==0: | |
print("No splice read found", file=sys.stderr) | |
sys.exit(1) | |
print("\nTotal splicing Junctions:\t" + str(total_junc), file=sys.stderr) | |
print("Known Splicing Junctions:\t" + str(known_junc), file=sys.stderr) | |
print("Partial Novel Splicing Junctions:\t" + str(novel3or5_junc), file=sys.stderr) | |
print("Novel Splicing Junctions:\t" + str(novel35_junc), file=sys.stderr) | |
print("\n===================================================================", file=sys.stderr) | |
print('pdf(\"%s\")' % (outfile + ".splice_junction.pdf"), file=ROUT) | |
print("junction=c(" + ','.join([str(i*100.0/total_junc) for i in (novel3or5_junc,novel35_junc,known_junc,)])+ ')', file=ROUT) | |
print('pie(junction,col=c(2,3,4),init.angle=30,angle=c(60,120,150),density=c(70,70,70),main="splicing junctions",labels=c("partial_novel %d%%","complete_novel %d%%","known %d%%"))' % (round(novel3or5_junc*100.0/total_junc),round(novel35_junc*100.0/total_junc),round(known_junc*100.0/total_junc)), file=ROUT) | |
print("dev.off()", file=ROUT) | |
#print >>ROUT, "mat=matrix(c(events,junction),byrow=T,ncol=3)" | |
#print >>ROUT, 'barplot(mat,beside=T,ylim=c(0,100),names=c("known","partial\nnovel","complete\nnovel"),legend.text=c("splicing events","splicing junction"),ylab="Percent")' | |
def junction_freq(self, chrom, st, end, known_junctions, q_cut=30): | |
''' | |
return number of splicing reads for each known junction | |
''' | |
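# Note: junctions are keyed as "chrom:start-end"; after counting, any known junction supported | |
# by fewer than 2 spliced reads is reset to 0 (see the post-processing loop below). | |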
#reading input SAM file | |
#if self.bam_format:print >>sys.stderr, "Load BAM file ... ", | |
#else:print >>sys.stderr, "Load SAM file ... ", | |
junc_freq = collections.defaultdict(int) | |
try: | |
alignedReads = self.samfile.fetch(chrom,st,end) | |
except: | |
return junc_freq | |
for aligned_read in alignedReads: | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.is_duplicate:continue #skip duplicate read | |
if aligned_read.is_secondary:continue #skip non primary hit | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.mapq < q_cut:continue | |
intron_blocks = bam_cigar.fetch_intron(chrom, aligned_read.pos, aligned_read.cigar) | |
if len(intron_blocks)==0: | |
continue | |
for intrn in intron_blocks: | |
tmp = chrom + ":" + str(intrn[1]) + '-' + str(intrn[2]) | |
if tmp in known_junctions: | |
junc_freq[tmp] += 1 | |
else: | |
continue | |
for k in known_junctions: | |
if k not in list(junc_freq.keys()): | |
junc_freq[k] = 0 | |
elif junc_freq[k] < 2: junc_freq[k] = 0 | |
return junc_freq | |
def saturation_junction(self,refgene,outfile=None,sample_start=5,sample_step=5,sample_end=100,min_intron=50,recur=1, q_cut=30): | |
'''check whether an RNA-seq experiment is saturated in terms of detecting known splicing junctions''' | |
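# Sketch of the resampling scheme below: every observed splice event (one entry per intron in an | |
# alignment) is pooled, shuffled, and cumulatively sampled at sample_step increments (e.g. 5%, | |
# 10%, ..., 100%); at each level the number of all, known (requiring >= recur supporting reads) | |
# and novel junctions is recorded for the saturation curve. | |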
out_file = outfile + ".junctionSaturation_plot.r" | |
if refgene is None: | |
print("You must provide reference gene model in bed format.", file=sys.stderr) | |
sys.exit(1) | |
OUT = open(out_file,'w') | |
#reading reference gene | |
knownSpliceSites= set() | |
chrom_list=set() | |
print("reading reference bed file: ",refgene, " ... ", end=' ', file=sys.stderr) | |
for line in open(refgene,'r'): | |
if line.startswith(('#','track','browser')):continue | |
fields = line.split() | |
if(len(fields)<12): | |
print("Invalid bed line (skipped):",line, end=' ', file=sys.stderr) | |
continue | |
chrom = fields[0].upper() | |
chrom_list.add(chrom) | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
if int(fields[9]) == 1: #skip single-exon transcripts (no introns) | |
continue | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)); | |
intron_start = exon_ends[:-1] | |
intron_end=exon_starts[1:] | |
for st,end in zip (intron_start, intron_end): | |
knownSpliceSites.add(chrom + ":" + str(st) + "-" + str(end)) | |
print("Done! Total "+str(len(knownSpliceSites)) + " known splicing junctions.", file=sys.stderr) | |
#read SAM file | |
samSpliceSites=[] | |
intron_start=[] | |
intron_end=[] | |
uniqSpliceSites=collections.defaultdict(int) | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
try: | |
chrom = self.samfile.getrname(aligned_read.tid).upper() | |
except: | |
continue | |
if chrom not in chrom_list: | |
continue | |
if aligned_read.is_qcfail:continue #skip low quality | |
if aligned_read.is_duplicate:continue #skip duplicate read | |
if aligned_read.is_secondary:continue #skip non primary hit | |
if aligned_read.is_unmapped:continue #skip unmapped read | |
if aligned_read.mapq < q_cut: continue | |
hit_st = aligned_read.pos | |
intron_blocks = bam_cigar.fetch_intron(chrom, hit_st, aligned_read.cigar) | |
if len(intron_blocks)==0: | |
continue | |
for intrn in intron_blocks: | |
if intrn[2] - intrn[1] < min_intron:continue | |
samSpliceSites.append(intrn[0] + ":" + str(intrn[1]) + "-" + str(intrn[2])) | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("shuffling alignments ...", end=' ', file=sys.stderr) | |
random.shuffle(samSpliceSites) | |
print("Done", file=sys.stderr) | |
#resampling | |
SR_num = len(samSpliceSites) | |
sample_size=0 | |
all_junctionNum = 0 | |
known_junc=[] | |
all_junc=[] | |
unknown_junc=[] | |
#=========================sampling uniquely mapped reads from population | |
tmp=list(range(sample_start,sample_end,sample_step)) | |
tmp.append(100) | |
for pertl in tmp: #[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95,100] | |
knownSpliceSites_num = 0 | |
index_st = int(SR_num * ((pertl - sample_step)/100.0)) | |
index_end = int(SR_num * (pertl/100.0)) | |
if index_st < 0: index_st = 0 | |
sample_size += index_end -index_st | |
print("sampling " + str(pertl) +"% (" + str(sample_size) + ") splicing reads.", end=' ', file=sys.stderr) | |
#all splice junctions | |
for i in range(index_st, index_end): | |
uniqSpliceSites[samSpliceSites[i]] +=1 | |
all_junctionNum = len(list(uniqSpliceSites.keys())) | |
all_junc.append(str(all_junctionNum)) | |
print(str(all_junctionNum) + " splicing junctions.", end=' ', file=sys.stderr) | |
#known splice junction | |
known_junctionNum = 0 | |
for sj in uniqSpliceSites: | |
if sj in knownSpliceSites and uniqSpliceSites[sj] >= recur: | |
known_junctionNum +=1 | |
print(str(known_junctionNum) + " known splicing junctions.", end=' ', file=sys.stderr) | |
known_junc.append(str(known_junctionNum)) | |
#unknown splice junction | |
unknown_junctionNum = 0 | |
for sj in uniqSpliceSites: | |
if sj not in knownSpliceSites: | |
unknown_junctionNum +=1 | |
unknown_junc.append(str(unknown_junctionNum)) | |
print(str(unknown_junctionNum) + " novel splicing junctions.", file=sys.stderr) | |
#for j in uniq_SJ: | |
#print >>OUT, j + "\t" + str(uniq_SJ[j]) | |
print("pdf(\'%s\')" % (outfile + '.junctionSaturation_plot.pdf'), file=OUT) | |
print("x=c(" + ','.join([str(i) for i in tmp]) + ')', file=OUT) | |
print("y=c(" + ','.join(known_junc) + ')', file=OUT) | |
print("z=c(" + ','.join(all_junc) + ')', file=OUT) | |
print("w=c(" + ','.join(unknown_junc) + ')', file=OUT) | |
print("m=max(%d,%d,%d)" % (int(int(known_junc[-1])/1000), int(int(all_junc[-1])/1000),int(int(unknown_junc[-1])/1000)), file=OUT) | |
print("n=min(%d,%d,%d)" % (int(int(known_junc[0])/1000), int(int(all_junc[0])/1000),int(int(unknown_junc[0])/1000)), file=OUT) | |
print("plot(x,z/1000,xlab='percent of total reads',ylab='Number of splicing junctions (x1000)',type='o',col='blue',ylim=c(n,m))", file=OUT) | |
print("points(x,y/1000,type='o',col='red')", file=OUT) | |
print("points(x,w/1000,type='o',col='green')", file=OUT) | |
print('legend(5,%d, legend=c("All junctions","known junctions", "novel junctions"),col=c("blue","red","green"),lwd=1,pch=1)' % int(int(all_junc[-1])/1000), file=OUT) | |
print("dev.off()", file=OUT) | |
def saturation_RPKM(self,refbed,outfile,sample_start=5,sample_step=5,sample_end=100,skip_multi=True, strand_rule=None, q_cut=30): | |
'''for each gene, check whether its RPKM (expression level) has already saturated''' | |
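# Sketch of the resampling scheme below: every aligned exon block contributes one tag of the form | |
# "chrom:block_midpoint"; the tags are shuffled and cumulative fractions (5%, 10%, ..., 100%) are | |
# loaded into Intersecter objects, from which per-gene expression is presumably recomputed at each | |
# sampling level to assess saturation. | |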
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
rpkm_file = outfile + ".eRPKM.xls" | |
raw_file = outfile + ".rawCount.xls" | |
RPKM_OUT = open(rpkm_file,'w') | |
RAW_OUT = open(raw_file ,'w') | |
ranges={} | |
totalReads=0 | |
cUR_num = 0 #number of fragments (aligned exon blocks) | |
cUR_plus = 0 | |
cUR_minus = 0 | |
block_list_plus = [] #non-spliced reads contribute one block; spliced reads contribute one block per aligned segment | |
block_list_minus = [] | |
block_list = [] | |
strandRule = {} | |
if strand_rule is None: # Not strand-specific | |
pass | |
elif len(strand_rule.split(',')) ==4: #PairEnd, strand-specific | |
for i in strand_rule.split(','):strandRule[i[0]+i[1]]=i[2] | |
elif len(strand_rule.split(',')) ==2: #singeEnd, strand-specific | |
for i in strand_rule.split(','):strandRule[i[0]]=i[1] | |
else: | |
print("Unknown value of: 'strand_rule' " + strand_rule, file=sys.stderr) | |
sys.exit(1) | |
#read SAM or BAM | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail:continue #skip low quality reads
if aligned_read.is_duplicate:continue #skip duplicate reads
if aligned_read.is_secondary:continue #skip non-primary hits
if aligned_read.is_unmapped:continue #skip unmapped reads
if skip_multi: | |
if aligned_read.mapq < q_cut: | |
continue | |
chrom = self.samfile.getrname(aligned_read.tid).upper() | |
#determine read_id and read_strand | |
if aligned_read.is_paired: #pair end | |
if aligned_read.is_read1:read_id = '1' | |
if aligned_read.is_read2:read_id = '2' | |
else: | |
read_id = '' #single end | |
if aligned_read.is_reverse: | |
map_strand = '-' | |
else: | |
map_strand = '+' | |
strand_key = read_id + map_strand #used to decide whether a read is assigned to gene(+) or gene(-)
hit_st = aligned_read.pos | |
exon_blocks = bam_cigar.fetch_exon(chrom, hit_st, aligned_read.cigar) | |
cUR_num += len(exon_blocks) | |
#strand specific | |
if strand_rule is not None: | |
if strandRule[strand_key] == '+': cUR_plus += len(exon_blocks) | |
if strandRule[strand_key] == '-': cUR_minus += len(exon_blocks) | |
for exn in exon_blocks: | |
if strandRule[strand_key] == '+': block_list_plus.append(exn[0] + ":" + str(exn[1] + int((exn[2]-exn[1])/2) )) | |
if strandRule[strand_key] == '-': block_list_minus.append(exn[0] + ":" + str(exn[1] + int((exn[2]-exn[1])/2) )) | |
#Not strand specific | |
else: | |
for exn in exon_blocks: | |
block_list.append(exn[0] + ":" + str(exn[1] + int((exn[2]-exn[1])/2) )) | |
except StopIteration: | |
print("Done", file=sys.stderr) | |
print("shuffling alignments ...", end=' ', file=sys.stderr) | |
random.shuffle(block_list_plus) | |
random.shuffle(block_list_minus) | |
random.shuffle(block_list) | |
print("Done", file=sys.stderr) | |
ranges_plus={} | |
ranges_minus={} | |
ranges={} | |
sample_size=0 | |
RPKM_table=collections.defaultdict(list) | |
rawCount_table=collections.defaultdict(list) | |
RPKM_head=['#chr','start','end','name','score','strand'] | |
tmp=list(range(sample_start,sample_end,sample_step)) | |
tmp.append(100) | |
#=========================sampling uniquely mapped reads from population | |
for pertl in tmp: #[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95,100] | |
percent_st = (pertl-sample_step)/100.0 | |
percent_end = pertl/100.0 | |
if percent_st < 0: percent_st = 0 | |
sample_size = cUR_num * percent_end | |
RPKM_head.append(str(pertl) + '%') | |
if strand_rule is not None: | |
print("sampling " + str(pertl) +"% (" + str(int(cUR_plus * percent_end)) + ") forward strand fragments ...", file=sys.stderr) | |
for i in block_list_plus[int(cUR_plus*percent_st):int(cUR_plus*percent_end)]: | |
(chr,coord) = i.split(':') | |
if chr not in ranges_plus:ranges_plus[chr] = Intersecter() | |
ranges_plus[chr].add_interval( Interval( int(coord), int(coord)+1 ) ) | |
print("sampling " + str(pertl) +"% (" + str(int(cUR_minus * percent_end)) + ") reverse strand fragments ...", file=sys.stderr) | |
for i in block_list_minus[int(cUR_minus*percent_st):int(cUR_minus*percent_end)]: | |
(chr,coord) = i.split(':') | |
if chr not in ranges_minus:ranges_minus[chr] = Intersecter() | |
ranges_minus[chr].add_interval( Interval( int(coord), int(coord)+1 ) ) | |
else: | |
print("sampling " + str(pertl) + "% (" + str(int(sample_size)) + ") fragments ...", file=sys.stderr) | |
for i in block_list[int(cUR_num*percent_st):int(cUR_num*percent_end)]: | |
(chr,coord) = i.split(':') | |
if chr not in ranges:ranges[chr] = Intersecter() | |
ranges[chr].add_interval( Interval( int(coord), int(coord)+1 ) ) | |
#========================= calculating RPKM based on sub-population | |
print("assign reads to transcripts in " + refbed + ' ...', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from the gene table
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5] | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)) | |
exon_sizes = list(map(int,fields[10].rstrip(',\n').split(','))) | |
key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, file=sys.stderr) | |
continue | |
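#Illustrative BED12 line (not from this file): "chr1 1000 5000 geneA 0 + 1000 5000 0 2 400,600, 0,3400,"
#parses to exon_sizes=[400,600], exon_starts=[1000,4400] and exon_ends=[1400,5000] under the logic above.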
mRNA_count=0 #initialize to 0 for each gene
mRNA_len=sum(exon_sizes) | |
for st,end in zip(exon_starts,exon_ends): | |
#if chrom in ranges: | |
if strand_rule is not None: | |
if (strand == '+') and (chrom in ranges_plus): mRNA_count += len(ranges_plus[chrom].find(st,end)) | |
if (strand == '-') and (chrom in ranges_minus): mRNA_count += len(ranges_minus[chrom].find(st,end)) | |
else: | |
if chrom in ranges: | |
mRNA_count += len(ranges[chrom].find(st,end)) | |
if mRNA_len ==0: | |
print(geneName + " has 0 nucleotides. Exit!", file=sys.stderr) | |
sys.exit(1) | |
if sample_size == 0: | |
print("Too few reads to sample. Exit!", file=sys.stderr) | |
sys.exit(1) | |
mRNA_RPKM = (mRNA_count * 1000000000.0)/(mRNA_len * sample_size) | |
RPKM_table[key].append(str(mRNA_RPKM)) | |
rawCount_table[key].append(str(mRNA_count)) | |
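#Worked example with illustrative numbers: a gene with mRNA_len=2000 nt and mRNA_count=400
#block midpoints falling in its exons, at a sampling point with sample_size=10,000,000 blocks,
#gets RPKM = 400 * 1e9 / (2000 * 1e7) = 20.0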
print("", file=sys.stderr) | |
#self.f.seek(0) | |
print('\t'.join(RPKM_head), file=RPKM_OUT) | |
print('\t'.join(RPKM_head), file=RAW_OUT) | |
for key in RPKM_table: | |
print(key + '\t', end=' ', file=RPKM_OUT) | |
print('\t'.join(RPKM_table[key]), file=RPKM_OUT) | |
print(key + '\t', end=' ', file=RAW_OUT) | |
print('\t'.join(rawCount_table[key]), file=RAW_OUT) | |
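#Usage sketch (illustrative; the object construction is an assumption, not shown in this file):
#  obj = ParseBAM('sample.bam')   # ParseBAM wraps the pysam alignment iterator used above
#  obj.saturation_RPKM(refbed='genes.bed12', outfile='sample', sample_start=5, sample_step=5, sample_end=100)
#  # -> writes sample.eRPKM.xls and sample.rawCount.xls with one column per sampling percentage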
def shuffle_RPKM(self,refbed,outfile,sample_percentage=0.5,shuffle_times=50,skip_multi=True, strand_rule=None): | |
'''For each gene, check whether its RPKM (expression level) has already saturated with increasing sequencing depth'''
if refbed is None: | |
print("You must specify a bed file representing gene model\n", file=sys.stderr) | |
exit(0) | |
rpkm_file = outfile + ".eRPKM.xls" | |
raw_file = outfile + ".rawCount.xls" | |
RPKM_OUT = open(rpkm_file,'w') | |
RAW_OUT = open(raw_file ,'w') | |
ranges={} | |
totalReads=0 | |
cUR_num = 0 #number of fragments
cUR_plus = 0
cUR_minus = 0
block_list_plus = [] #non-spliced reads counted as is; spliced reads counted once per exon block
block_list_minus = [] | |
block_list = [] | |
strandRule = {} | |
if strand_rule is None: # Not strand-specific | |
pass | |
elif len(strand_rule.split(',')) ==4: #PairEnd, strand-specific | |
for i in strand_rule.split(','):strandRule[i[0]+i[1]]=i[2] | |
elif len(strand_rule.split(',')) ==2: #singleEnd, strand-specific
for i in strand_rule.split(','):strandRule[i[0]]=i[1] | |
else: | |
print("Unknown value of: 'strand_rule' " + strand_rule, file=sys.stderr) | |
sys.exit(1) | |
#read SAM or BAM | |
if self.bam_format:print("Load BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Load SAM file ... ", end=' ', file=sys.stderr) | |
try: | |
while(1): | |
flag=0 | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail:continue #skip low quality reads
if aligned_read.is_duplicate:continue #skip duplicate reads
if aligned_read.is_secondary:continue #skip non-primary hits
if aligned_read.is_unmapped:continue #skip unmapped reads
if skip_multi: | |
if len(aligned_read.tags)>0: #( ("NM", 1),("RG", "L1") ) | |
for i in aligned_read.tags: | |
if i[0] in ParseBAM.multi_hit_tags and i[1] >1: | |
flag=1 #multiple hit read | |
break | |
if flag==1:continue #skip multiple map read | |
chrom = self.samfile.getrname(aligned_read.tid).upper() | |
#determine read_id and read_strand | |
if aligned_read.is_paired: #pair end | |
if aligned_read.is_read1:read_id = '1' | |
if aligned_read.is_read2:read_id = '2' | |
else:read_id = '' #single end | |
if aligned_read.is_reverse:map_strand = '-' | |
else:map_strand = '+' | |
strand_key = read_id + map_strand #used to decide whether a read is assigned to gene(+) or gene(-)
hit_st = aligned_read.pos | |
exon_blocks = bam_cigar.fetch_exon(chrom, hit_st, aligned_read.cigar) | |
cUR_num += len(exon_blocks) | |
#strand specific | |
if strand_rule is not None: | |
if strandRule[strand_key] == '+': cUR_plus += len(exon_blocks) | |
if strandRule[strand_key] == '-': cUR_minus += len(exon_blocks) | |
for exn in exon_blocks: | |
if strandRule[strand_key] == '+': block_list_plus.append(exn[0] + ":" + str(exn[1] + int((exn[2]-exn[1])/2) )) #int() keeps the midpoint an integer under Python 3 division
if strandRule[strand_key] == '-': block_list_minus.append(exn[0] + ":" + str(exn[1] + int((exn[2]-exn[1])/2) ))
#Not strand specific | |
else: | |
for exn in exon_blocks: | |
block_list.append(exn[0] + ":" + str(exn[1] + int((exn[2]-exn[1])/2) )) #int() keeps the midpoint an integer under Python 3 division
except StopIteration: | |
print("Done", file=sys.stderr) | |
RPKM_table=collections.defaultdict(list) | |
rawCount_table=collections.defaultdict(list) | |
RPKM_head=['#chr','start','end','name','score','strand'] | |
iter_times=0 | |
#=========================sampling uniquely mapped reads from population | |
for x in range(0,shuffle_times+1): | |
print("Shuffle " + str(iter_times) + " times", file=sys.stderr) | |
iter_times += 1 | |
if iter_times == shuffle_times: | |
sample_percent = 1 | |
else: | |
sample_percent = sample_percentage | |
ranges_plus={} | |
ranges_minus={} | |
ranges={} | |
if strand_rule is not None: | |
for i in random.sample(block_list_plus, int(cUR_plus * sample_percent)): | |
(chr,coord) = i.split(':') | |
if chr not in ranges_plus:ranges_plus[chr] = Intersecter() | |
ranges_plus[chr].add_interval( Interval( int(coord), int(coord)+1 ) ) | |
for i in random.sample(block_list_minus, int(cUR_minus * sample_percent)): | |
(chr,coord) = i.split(':') | |
if chr not in ranges_minus:ranges_minus[chr] = Intersecter() | |
ranges_minus[chr].add_interval( Interval( int(coord), int(coord)+1 ) ) | |
else: | |
for i in random.sample(block_list,int(cUR_num * sample_percent)): | |
(chr,coord) = i.split(':') | |
if chr not in ranges:ranges[chr] = Intersecter() | |
ranges[chr].add_interval( Interval( int(coord), int(coord)+1 ) ) | |
#========================= calculating RPKM based on sub-population | |
print("assign reads to transcripts in " + refbed + ' ...', file=sys.stderr) | |
for line in open(refbed,'r'): | |
try: | |
if line.startswith(('#','track','browser')):continue | |
# Parse fields from the gene table
fields = line.split() | |
chrom = fields[0].upper() | |
tx_start = int( fields[1] ) | |
tx_end = int( fields[2] ) | |
geneName = fields[3] | |
strand = fields[5] | |
exon_starts = list(map( int, fields[11].rstrip( ',\n' ).split( ',' ) )) | |
exon_starts = list(map((lambda x: x + tx_start ), exon_starts)) | |
exon_ends = list(map( int, fields[10].rstrip( ',\n' ).split( ',' ) )) | |
exon_ends = list(map((lambda x, y: x + y ), exon_starts, exon_ends)) | |
exon_sizes = list(map(int,fields[10].rstrip(',\n').split(','))) | |
key='\t'.join((chrom.lower(),str(tx_start),str(tx_end),geneName,'0',strand)) | |
except: | |
print("[NOTE:input bed must be 12-column] skipped this line: " + line, file=sys.stderr) | |
continue | |
mRNA_count=0 #initialize to 0 for each gene
mRNA_len=sum(exon_sizes) | |
for st,end in zip(exon_starts,exon_ends): | |
#if chrom in ranges: | |
if strand_rule is not None: | |
if (strand == '+') and (chrom in ranges_plus): mRNA_count += len(ranges_plus[chrom].find(st,end)) | |
if (strand == '-') and (chrom in ranges_minus): mRNA_count += len(ranges_minus[chrom].find(st,end)) | |
else: | |
if chrom in ranges: | |
mRNA_count += len(ranges[chrom].find(st,end)) | |
if mRNA_len ==0: | |
print(geneName + " has 0 nucleotides. Exit!", file=sys.stderr) | |
sys.exit(1) | |
if cUR_num * sample_percentage == 0: | |
print("Too few reads to sample. Exit!", file=sys.stderr) | |
sys.exit(1) | |
mRNA_RPKM = (mRNA_count * 1000000000.0)/(mRNA_len * (cUR_num * sample_percentage)) | |
RPKM_table[key].append(str(mRNA_RPKM)) | |
rawCount_table[key].append(str(mRNA_count)) | |
print("", file=sys.stderr) | |
#self.f.seek(0) | |
print('\t'.join(RPKM_head), file=RPKM_OUT) | |
print('\t'.join(RPKM_head), file=RAW_OUT) | |
for key in RPKM_table: | |
print(key + '\t', end=' ', file=RPKM_OUT) | |
print('\t'.join(RPKM_table[key]), file=RPKM_OUT) | |
print(key + '\t', end=' ', file=RAW_OUT) | |
print('\t'.join(rawCount_table[key]), file=RAW_OUT) | |
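#Usage sketch (illustrative; names assumed as above): unlike saturation_RPKM, shuffle_RPKM
#resamples the same fraction of exon blocks on every pass (one pass is forced to 100%), so the
#output columns are technical replicates at a fixed depth rather than a saturation curve, e.g.
#  obj.shuffle_RPKM(refbed='genes.bed12', outfile='sample_shuffle', sample_percentage=0.5, shuffle_times=50)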
def fetchAlignments(self,chr,st,end): | |
'''fetch alignment from sorted BAM file based on chr, st, end | |
Note: BAM file must be indexed''' | |
try: | |
a=self.samfile.fetch(chr,st,end) | |
return a | |
except: | |
return None | |
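#Usage sketch (illustrative; assumes self.samfile is a pysam AlignmentFile over an indexed,
#coordinate-sorted BAM):
#  for read in obj.fetchAlignments('chr1', 1000000, 1001000) or []:
#      print(read.query_name)
#  # the `or []` guards against the None returned when the fetch fails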
def mismatchProfile(self,read_length,read_num, outfile, q_cut=30): | |
''' | |
Calculate mismatch profile. Note that the "MD" tag must exist. | |
''' | |
DOUT = open(outfile + '.mismatch_profile.xls','w') | |
ROUT = open(outfile + '.mismatch_profile.r','w') | |
#reading input SAM file | |
if self.bam_format:print("Process BAM file ... ", end=' ', file=sys.stderr) | |
else:print("Process SAM file ... ", end=' ', file=sys.stderr) | |
MD_pat =re.compile(r'(\d+)([A-Z]+)') | |
number_base = re.compile(r'([0-9]+)([A-Z]+)', re.I) | |
count = 0 | |
data = collections.defaultdict(dict) # data[read_coord][genotype] = geno_type_number | |
try: | |
while(1): | |
if count >= read_num: | |
print("Total reads used: " + str(count), file=sys.stderr) | |
break | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail:continue #skip low quality reads
if aligned_read.is_duplicate:continue #skip duplicate reads
if aligned_read.is_secondary:continue #skip non-primary hits
if aligned_read.is_unmapped:continue #skip unmapped reads
if aligned_read.mapq < q_cut: continue | |
if aligned_read.is_reverse: | |
strand = '-' | |
else: | |
strand = '+' | |
#Skip if there is no mismatch, or there is deletion | |
tags = aligned_read.tags | |
skip = False | |
for tag in tags: | |
if tag[0] == "NM" and tag[1] ==0: | |
skip = True #skip reads with no mismatches | |
if (tag[0] == "MD") and ('^' in tag[1]): | |
skip = True # skip as there is deletion from the reference | |
if skip is True: continue | |
# skip partially mapped read | |
read_seq = aligned_read.seq | |
if len(read_seq) != read_length: | |
continue | |
if 'N' in read_seq: | |
continue | |
matched_portion_size = 0 | |
for op,value in aligned_read.cigar: | |
if op == 0: matched_portion_size += value | |
if matched_portion_size != read_length: | |
continue | |
count += 1 | |
if strand == '+': | |
for tag in tags: | |
if tag[0] == "MD": | |
a = MD_pat.findall(tag[1]) #tag[1] = "5G19T75"; a = [('5', 'G'), ('19', 'T')] | |
read_coord = 0 | |
for (match_number, ref_base) in a: | |
read_coord += int(match_number) | |
read_base = read_seq[read_coord] | |
if read_base == ref_base: continue | |
genotype = ref_base + '2' + read_base | |
if genotype not in data[read_coord]: | |
data[read_coord][genotype] = 1 | |
else: | |
data[read_coord][genotype] += 1 | |
read_coord += 1 | |
if strand == '-': | |
for tag in tags: | |
if tag[0] == "MD": | |
a = MD_pat.findall(tag[1]) #tag[1] = "5G19T75"; a = [('5', 'G'), ('19', 'T')] | |
read_coord = 0 | |
for (match_number, ref_base) in a: | |
read_coord += int(match_number) | |
read_base = read_seq[read_coord] | |
if read_base == ref_base: continue | |
genotype = ref_base + '2' + read_base | |
if genotype not in data[read_length - read_coord -1]: | |
data[read_length - read_coord -1][genotype] = 1 | |
else: | |
data[read_length - read_coord -1][genotype] += 1 | |
read_coord += 1 | |
if read_base == ref_base: print(aligned_read) #leftover sanity check from debugging; matching bases are skipped above, so this should never fire
except StopIteration: | |
print("Total reads used: " + str(count), file=DOUT) | |
print('\n') | |
if len(data) == 0: | |
print("No mismatches found", file=sys.stderr) | |
sys.exit() | |
# write data out | |
all_genotypes = ['A2C','A2G','A2T','C2A','C2G','C2T','G2A','G2C','G2T','T2A','T2C','T2G'] | |
print("read_pos\tsum\t" + '\t'.join(all_genotypes), file=DOUT) | |
for indx in sorted(data): | |
tmp = [indx, sum(data[indx].values())] #read position and sum of mismatches | |
for i in all_genotypes: | |
if i in data[indx]: | |
tmp.append(data[indx][i]) | |
else: | |
tmp.append(0) | |
print('\t'.join([str(i) for i in tmp]), file=DOUT) | |
DOUT.close() | |
# write Rscript | |
r_data = collections.defaultdict(list) | |
for gt in all_genotypes: | |
for indx in sorted(data): | |
if gt in data[indx]: | |
r_data[gt].append(data[indx][gt]) | |
else: | |
r_data[gt].append(0) | |
for k in sorted(r_data): | |
print('%s=c(%s)' % (k, ','.join([str(i) for i in r_data[k]])), file=ROUT) | |
#print >>ROUT, 'color_code = c("#a6cee3","#1f78b4","#b2df8a","#33a02c","#fb9a99","#e31a1c","#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928")' | |
print('color_code = c("green","powderblue","lightseagreen","red","violetred4","mediumorchid1","blue","royalblue","steelblue1","orange","gold","black")', file=ROUT) | |
print('y_up_bound = max(c(%s))' % (','.join(["log10(" + str(i) + "+1)" for i in all_genotypes])), file=ROUT) | |
print('y_low_bound = min(c(%s))' % (','.join(["log10(" + str(i) + "+1)" for i in all_genotypes])), file=ROUT) | |
print("pdf(\"%s\")" % (outfile + '.mismatch_profile.pdf'), file=ROUT) | |
count=1 | |
for gt in all_genotypes: | |
if count == 1: | |
print("plot(log10(%s+1),type=\"l\",col=color_code[%d],ylim=c(y_low_bound,y_up_bound),ylab=\"log10(# of mismatch)\",xlab=\"Read position (5\'->3\')\")" % (gt,count), file=ROUT) | |
else: | |
print("lines(log10(%s+1), col=color_code[%d])" % (gt,count), file=ROUT) | |
count += 1 | |
print("legend(13,y_up_bound,legend=c(%s), fill=color_code, border=color_code, ncol=4)" % (','.join(['"' + i + '"' for i in all_genotypes])), file=ROUT) | |
print("dev.off()", file=ROUT) | |
def deletionProfile(self,read_length,read_num, outfile, q_cut=30): | |
'''
Calculate deletion profile.
Deletion: bases missing from the read relative to the reference (CIGAR operator 'D').
'''
DOUT = open(outfile + '.deletion_profile.txt','w') | |
ROUT = open(outfile + '.deletion_profile.r','w') | |
#reading input SAM file | |
if self.bam_format: | |
print("Process BAM file ... ", end=' ', file=sys.stderr) | |
else: | |
print("Process SAM file ... ", end=' ', file=sys.stderr) | |
count = 0 | |
del_postns = collections.defaultdict(int) #key: read position; value: number of deletions observed at that position
#del_sizes = collections.defaultdict(int) #key: deletion size. value: deletion frequency of this size | |
try: | |
while(1): | |
if count >= read_num: | |
print("Total reads used: " + str(count), file=sys.stderr) | |
break | |
aligned_read = next(self.samfile) | |
if aligned_read.is_qcfail:continue #skip low quality reads
if aligned_read.is_duplicate:continue #skip duplicate reads
if aligned_read.is_secondary:continue #skip non-primary hits
if aligned_read.is_unmapped:continue #skip unmapped reads
if aligned_read.mapq < q_cut: continue | |
if aligned_read.is_reverse: | |
strand = '-' | |
else: | |
strand = '+' | |
# skip if read doesn't have deletion | |
read_cigar = aligned_read.cigar | |
if 2 not in [i[0] for i in read_cigar]: #read contains no deletion | |
continue | |
# skip partially mapped read | |
read_seq = aligned_read.seq | |
if len(read_seq) != read_length: | |
continue | |
matched_portion_size = 0 | |
for op,value in aligned_read.cigar: | |
if op == 0: matched_portion_size += value #match | |
if op == 4: matched_portion_size += value #soft clip
if op == 1: matched_portion_size += value #insertion to the read
if matched_portion_size != read_length: | |
continue | |
count += 1 | |
del_positions = bam_cigar.fetch_deletion_range(read_cigar) #[(position, size),(position, size),...] | |
for (p,s) in del_positions: | |
if strand == '-': p = read_length - p | |
del_postns[p] += 1 | |
#del_sizes[s] += 1 | |
#print aligned_read | |
#print del_positions | |
except StopIteration: | |
print("Total reads used: " + str(count), file=sys.stderr) | |
print('\n') | |
del_count=[] | |
print("read_position\tdeletion_count", file=DOUT) | |
for k in range(0,read_length): | |
if k in del_postns: | |
print(str(k) + '\t' + str(del_postns[k]), file=DOUT) | |
del_count.append(str(del_postns[k])) | |
else: | |
print(str(k) + '\t0', file=DOUT) | |
del_count.append('0') | |
DOUT.close() | |
print("pdf(\"%s\")" % (outfile + '.deletion_profile.pdf'), file=ROUT) | |
print("pos=c(%s)" % ','.join([str(i) for i in range(0,read_length)]), file=ROUT) | |
print("value=c(%s)" % ','.join([i for i in del_count]), file=ROUT) | |
print("plot(pos,value,type='b', col='blue',xlab=\"Read position (5\'->3\')\", ylab='Deletion count')", file=ROUT) | |
print("dev.off()", file=ROUT) | |
def print_bits_as_bed( bits ): | |
end = 0 | |
while 1: | |
start = bits.next_set( end ) | |
if start == bits.size: break | |
end = bits.next_clear( start ) | |
print("%d\t%d" % ( start, end )) | |