|
#!/usr/bin/env python |
|
# Programmer : zhuxp |
|
# Date: |
|
# Last-modified: 05-07-2014, 16:25:22 EDT |
|
from __future__ import print_function |
|
import os,sys,argparse |
|
from bam2x import TableIO,Tools,DBI |
|
from bam2x import IO |
|
from collections import defaultdict |
|
from bam2x.Annotation import BED6 |
|
def set_parser(p): |
|
''' This Function Parse the Argument ''' |
|
p.add_argument('-s','--tss',dest="db_tss",type=str,help="tabix tss file (tabix bam2tss output file)") |
|
p.add_argument('-e','--tts',dest="db_tts",type=str,help="tabix tts file (tabix bam2tts output file)") |
|
p.add_argument('--flank',dest="flank",type=int,default=100,action="store",help="count flank region") |
|
p.add_argument('--cutoff',dest="cutoff",type=int,default=0.2,action="store",help="filter out the peaks are not comparable with the max one (0,1) , default: %(default)s ") |
|
def help(): |
|
return "query tss and tts tabix file, and report revised TSS and TTS in a bed region, report possible alternative TSS and TTS" |
|
def run(args): |
|
CUTOFF=args.cutoff |
|
fin=IO.fopen(args.input,"r") |
|
out=IO.fopen(args.output,"w") |
|
s=TableIO.parse(args.input,"bed12") |
|
db_tts=DBI.init(args.db_tts,"tabix",cls=BED6) |
|
db_tss=DBI.init(args.db_tss,"tabix",cls=BED6) |
|
genes=list() |
|
for i in s: |
|
flank=Tools.get_flank_region(i,args.flank,args.flank) |
|
|
|
tss_ends=[ j for j in db_tss.query(flank) if j.strand==i.strand] |
|
tss_ends.sort() |
|
tss_peaks=peak_calling(tss_ends,i.id+".tss") |
|
|
|
tts_ends=[ j for j in db_tts.query(flank) if j.strand==i.strand] |
|
tts_ends.sort() |
|
tts_peaks=peak_calling(tts_ends,i.id+".tts") |
|
|
|
peak_max=0.0 |
|
gene={} |
|
gene["gene"]=i; |
|
gene["tss"]=[] |
|
gene["tts"]=[] |
|
for j in sorted(tss_peaks, key = lambda x: x["meta"].score * x["peak"].score, reverse=True): |
|
if peak_max==0.0: peak_max=j["meta"].score*j["peak"].score |
|
if(j["meta"].score * j["peak"].score > CUTOFF*peak_max): |
|
gene["tss"].append(j) |
|
|
|
|
|
peak_max=0.0 |
|
for j in sorted(tts_peaks, key = lambda x: x["meta"].score * x["peak"].score, reverse=True): |
|
if peak_max==0.0: peak_max=j["meta"].score*j["peak"].score |
|
if(j["meta"].score * j["peak"].score > CUTOFF*peak_max): |
|
gene["tts"].append(j) |
|
#print("",file=out) |
|
genes.append(gene) |
|
for gene in genes: |
|
print(report_gene(gene),file=out) |
|
def format_gene(gene): |
|
s="GENE\t"+str(gene["gene"])+"\n" |
|
for i in gene["tss"]: |
|
s+="TSS\t"+record_to_bed12(i)+"\n" |
|
for i in gene["tts"]: |
|
s+="TTS\t"+record_to_bed12(i,True)+"\n" |
|
return s |
|
def report_gene(gene): |
|
s="" |
|
s0="GENE\t"+str(gene["gene"])+"\t" |
|
tss_number=len(gene["tss"]) |
|
tts_number=len(gene["tts"]) |
|
tss=gene["gene"].tss() |
|
tts=gene["gene"].tts() |
|
tssHit=False |
|
tssShift=False |
|
tssAltI=False |
|
tssAltE=False |
|
ttsHit=False |
|
ttsShift=False |
|
ttsAltI=False |
|
ttsAltE=False |
|
for i in gene["tss"]: |
|
distance=Tools.distance(i["peak"],tss) |
|
state="" |
|
if distance < 10: |
|
tssHit=True |
|
state="HIT" |
|
elif distance < 50: |
|
tssShift=True |
|
state="SHIFT" |
|
else: |
|
state="ALT_I" |
|
for e in gene["gene"].Exons(): |
|
if Tools.overlap(e,i["peak"]): |
|
state="ALT_E" |
|
if state=="ALT_I": |
|
tssAltI=True |
|
else: |
|
tssAltE=True |
|
|
|
|
|
start,end,strand=Tools.translate_coordinate(tss,i["peak"]) |
|
s+="TSS\t{state}\t{pos}\t{tss}\n".format(state=state,pos=start,tss=record_to_bed12(i)) |
|
for i in gene["tts"]: |
|
distance=Tools.distance(i["peak"],tts) |
|
state="" |
|
if distance < 10: |
|
ttsHit=True |
|
state="HIT" |
|
elif distance < 50: |
|
ttsShift=True |
|
state="SHIFT" |
|
else: |
|
state="ALT_I" |
|
for e in gene["gene"].Exons(): |
|
if Tools.overlap(e,i["peak"]): |
|
state="ALT_E" |
|
if state=="ALT_I": |
|
ttsAltI=True |
|
else: |
|
ttsAltE=True |
|
|
|
s+="TTS\t{state}\t{distance}\t{tts}\n".format(state=state,distance=distance,tts=record_to_bed12(i)) |
|
#retv = s0+"{}\t{}\t{}\t{}\t{}\t{}\n".format(tssHit,tssShift,tssAlt,ttsHit,ttsShift,ttsAlt)+s |
|
return"GENE\t{gene}\t{tss},{rep},{tts}\n{hits}".format(tss=tss_number,tts=tts_number,gene=gene["gene"],rep=rep(tssHit,tssShift,tssAltE,tssAltI,ttsAltI,ttsAltE,ttsShift,ttsHit),hits=s) |
|
|
|
def rep(*x): |
|
s=list("[(EIie)]") |
|
for i,y in enumerate(x): |
|
if not y: |
|
s[i]="_" |
|
return "".join(s) |
|
|
|
def record_to_bed12(record,red=False): |
|
x=record |
|
if red: |
|
g=0 |
|
r=int(record["gini"]*200) |
|
else: |
|
g=int(record["gini"]*200) |
|
r=0 |
|
b=int(record["peak"].score*200) |
|
if record["gini"]>0.5: |
|
meta=record["meta"]._replace(id=x["meta"].id+".end") |
|
else: |
|
meta=record["meta"] |
|
rgb="{r},{g},{b}".format(r=r,g=g,b=b) |
|
return "{bed6}\t{thickStart}\t{thickEnd}\t{itemRgb}\t{blockCount}\t{blockSizes}\t{blockStarts}".format(bed6=meta,thickStart=x["peak"].start,thickEnd=x["peak"].end,itemRgb=rgb,blockSizes=x["meta"].stop-x["meta"].start,blockCount=1,blockStarts=0) |
|
|
|
|
|
def peak_calling(iterator,PREFIX="peak",GAP=10, MIN_READS_NUMBER=20): |
|
def process(): |
|
max_score=0.0 |
|
total_score=0.0 |
|
e=[] |
|
for i in buff: |
|
total_score+=i.score |
|
e.append(i.score) |
|
e=[i/total_score for i in e] |
|
gini=Tools.gini_coefficient(e) |
|
if total_score < MIN_READS_NUMBER: |
|
return 0 |
|
record={} |
|
meta=BED6(buff[0].chr,buff[0].start,buff[-1].stop,PREFIX+"."+str(group_id),total_score,buff[0].strand) |
|
peak=max(buff,key=lambda x:x.score) |
|
record["peak"]=peak._replace(score=peak.score/total_score) |
|
record["meta"]=meta._replace(strand=peak.strand) |
|
record["gini"]=gini |
|
records.append(record) |
|
return 1 |
|
records=[] |
|
group_id=0 |
|
last_stop=-GAP |
|
buff=[] |
|
for x,i in enumerate(iterator): |
|
if i.start-last_stop > GAP: |
|
group_id+=process() |
|
buff=[i] |
|
last_stop=i.stop |
|
else: |
|
buff.append(i) |
|
if i.stop>last_stop: |
|
last_stop=i.stop |
|
if len(buff)>0: process() |
|
return records |
|
|
|
|
|
|
|
|
|
|
|
if __name__=="__main__": |
|
from bam2x.IO import parser_factory |
|
p=parser_factory(description=help()) |
|
set_parser(p) |
|
if len(sys.argv)==1: |
|
print(p.print_help()) |
|
exit(0) |
|
run(p.parse_args()) |
|
|
|
|
|
|