Skip to content

Instantly share code, notes, and snippets.

@nimezhu
Last active August 29, 2015 14:01
Show Gist options
  • Save nimezhu/44daca6fdaf1a7d1a0ee to your computer and use it in GitHub Desktop.
Save nimezhu/44daca6fdaf1a7d1a0ee to your computer and use it in GitHub Desktop.
query ends to report

##TSS(TTS) POSITION CLASSIFIED

HITS : distance < 10
SHIFT : 10<= distance < 50
ALT_E : 50 < distance and peak in Exon
ALT_E : 50 < distance and peak in Intron

COL14 for gene: number of tss, repr string, number of tts

##REPR STR

- TSS HIT   : [   ( tss within 10bp)
- TSS SHIFT : (   ( tss near 50bp , > 10bp)
- TSS ALT_E : E   ( tss in exon  , > 50bp)
- TSS ALT_I : I   ( tss in intron , >50bp)
- TTS ALT_I : i   ( tts in intron , >50bp)
- TTS ALT_E : e   ( tts in exon   , >50bp)
- TTS SHIFT : )   ( tts near 50bp , >10bp)
- TTS HIT   : ]   ( tts within 10bp)

##EXAMPLE

GENE	chr1	36619540	36740028	NM_001126046	0.0	-	36619832	36717792	0	15	325,153,78,126,88,98,177,85,115,85,106,153,62,315,316,	0,1634,2135,15823,23220,37427,37712,38171,45721,69724,72471,81514,96668,98039,120172,	2,_(_Ii__],2
TSS	SHIFT	45	chr1	36739826	36740028	NM_001126046.tss.4.end	555.0	-	36739982	36739983	0,122,20	1	202	0
TSS	ALT_I	10887	chr1	36729138	36729141	NM_001126046.tss.3	24.0	-	36729140	36729141	0,91,191	1	3	0
TTS	HIT	2	chr1	36619538	36619561	NM_001126046.tts.0.end	214.0	-	36619543	36619544	0,157,185	1	23	0
TTS	ALT_I	37279	chr1	36656814	36656851	NM_001126046.tts.1.end	223.0	-	36656820	36656821	0,157,119	1	37	0

2 TSS , 2,_(_Ii__],2

  • SHIFT TSS EXISTS 2,_(_Ii__],2
  • ALT_I TSS EXISTS 2,_(_I)i__],2

2 TTS , 2,_(_Ii__],2

  • HIT TTS EXISTS 2,_(_Ii__],2
  • ALT_I TTS EXISTS 2,_(_Ii__],2
#!/usr/bin/env python
# Programmer : zhuxp
# Date:
# Last-modified: 05-07-2014, 16:25:22 EDT
from __future__ import print_function
import os,sys,argparse
from bam2x import TableIO,Tools,DBI
from bam2x import IO
from collections import defaultdict
from bam2x.Annotation import BED6
def set_parser(p):
''' This Function Parse the Argument '''
p.add_argument('-s','--tss',dest="db_tss",type=str,help="tabix tss file (tabix bam2tss output file)")
p.add_argument('-e','--tts',dest="db_tts",type=str,help="tabix tts file (tabix bam2tts output file)")
p.add_argument('--flank',dest="flank",type=int,default=100,action="store",help="count flank region")
p.add_argument('--cutoff',dest="cutoff",type=int,default=0.2,action="store",help="filter out the peaks are not comparable with the max one (0,1) , default: %(default)s ")
def help():
return "query tss and tts tabix file, and report revised TSS and TTS in a bed region, report possible alternative TSS and TTS"
def run(args):
CUTOFF=args.cutoff
fin=IO.fopen(args.input,"r")
out=IO.fopen(args.output,"w")
s=TableIO.parse(args.input,"bed12")
db_tts=DBI.init(args.db_tts,"tabix",cls=BED6)
db_tss=DBI.init(args.db_tss,"tabix",cls=BED6)
genes=list()
for i in s:
flank=Tools.get_flank_region(i,args.flank,args.flank)
tss_ends=[ j for j in db_tss.query(flank) if j.strand==i.strand]
tss_ends.sort()
tss_peaks=peak_calling(tss_ends,i.id+".tss")
tts_ends=[ j for j in db_tts.query(flank) if j.strand==i.strand]
tts_ends.sort()
tts_peaks=peak_calling(tts_ends,i.id+".tts")
peak_max=0.0
gene={}
gene["gene"]=i;
gene["tss"]=[]
gene["tts"]=[]
for j in sorted(tss_peaks, key = lambda x: x["meta"].score * x["peak"].score, reverse=True):
if peak_max==0.0: peak_max=j["meta"].score*j["peak"].score
if(j["meta"].score * j["peak"].score > CUTOFF*peak_max):
gene["tss"].append(j)
peak_max=0.0
for j in sorted(tts_peaks, key = lambda x: x["meta"].score * x["peak"].score, reverse=True):
if peak_max==0.0: peak_max=j["meta"].score*j["peak"].score
if(j["meta"].score * j["peak"].score > CUTOFF*peak_max):
gene["tts"].append(j)
#print("",file=out)
genes.append(gene)
for gene in genes:
print(report_gene(gene),file=out)
def format_gene(gene):
s="GENE\t"+str(gene["gene"])+"\n"
for i in gene["tss"]:
s+="TSS\t"+record_to_bed12(i)+"\n"
for i in gene["tts"]:
s+="TTS\t"+record_to_bed12(i,True)+"\n"
return s
def report_gene(gene):
s=""
s0="GENE\t"+str(gene["gene"])+"\t"
tss_number=len(gene["tss"])
tts_number=len(gene["tts"])
tss=gene["gene"].tss()
tts=gene["gene"].tts()
tssHit=False
tssShift=False
tssAltI=False
tssAltE=False
ttsHit=False
ttsShift=False
ttsAltI=False
ttsAltE=False
for i in gene["tss"]:
distance=Tools.distance(i["peak"],tss)
state=""
if distance < 10:
tssHit=True
state="HIT"
elif distance < 50:
tssShift=True
state="SHIFT"
else:
state="ALT_I"
for e in gene["gene"].Exons():
if Tools.overlap(e,i["peak"]):
state="ALT_E"
if state=="ALT_I":
tssAltI=True
else:
tssAltE=True
start,end,strand=Tools.translate_coordinate(tss,i["peak"])
s+="TSS\t{state}\t{pos}\t{tss}\n".format(state=state,pos=start,tss=record_to_bed12(i))
for i in gene["tts"]:
distance=Tools.distance(i["peak"],tts)
state=""
if distance < 10:
ttsHit=True
state="HIT"
elif distance < 50:
ttsShift=True
state="SHIFT"
else:
state="ALT_I"
for e in gene["gene"].Exons():
if Tools.overlap(e,i["peak"]):
state="ALT_E"
if state=="ALT_I":
ttsAltI=True
else:
ttsAltE=True
s+="TTS\t{state}\t{distance}\t{tts}\n".format(state=state,distance=distance,tts=record_to_bed12(i))
#retv = s0+"{}\t{}\t{}\t{}\t{}\t{}\n".format(tssHit,tssShift,tssAlt,ttsHit,ttsShift,ttsAlt)+s
return"GENE\t{gene}\t{tss},{rep},{tts}\n{hits}".format(tss=tss_number,tts=tts_number,gene=gene["gene"],rep=rep(tssHit,tssShift,tssAltE,tssAltI,ttsAltI,ttsAltE,ttsShift,ttsHit),hits=s)
def rep(*x):
s=list("[(EIie)]")
for i,y in enumerate(x):
if not y:
s[i]="_"
return "".join(s)
def record_to_bed12(record,red=False):
x=record
if red:
g=0
r=int(record["gini"]*200)
else:
g=int(record["gini"]*200)
r=0
b=int(record["peak"].score*200)
if record["gini"]>0.5:
meta=record["meta"]._replace(id=x["meta"].id+".end")
else:
meta=record["meta"]
rgb="{r},{g},{b}".format(r=r,g=g,b=b)
return "{bed6}\t{thickStart}\t{thickEnd}\t{itemRgb}\t{blockCount}\t{blockSizes}\t{blockStarts}".format(bed6=meta,thickStart=x["peak"].start,thickEnd=x["peak"].end,itemRgb=rgb,blockSizes=x["meta"].stop-x["meta"].start,blockCount=1,blockStarts=0)
def peak_calling(iterator,PREFIX="peak",GAP=10, MIN_READS_NUMBER=20):
def process():
max_score=0.0
total_score=0.0
e=[]
for i in buff:
total_score+=i.score
e.append(i.score)
e=[i/total_score for i in e]
gini=Tools.gini_coefficient(e)
if total_score < MIN_READS_NUMBER:
return 0
record={}
meta=BED6(buff[0].chr,buff[0].start,buff[-1].stop,PREFIX+"."+str(group_id),total_score,buff[0].strand)
peak=max(buff,key=lambda x:x.score)
record["peak"]=peak._replace(score=peak.score/total_score)
record["meta"]=meta._replace(strand=peak.strand)
record["gini"]=gini
records.append(record)
return 1
records=[]
group_id=0
last_stop=-GAP
buff=[]
for x,i in enumerate(iterator):
if i.start-last_stop > GAP:
group_id+=process()
buff=[i]
last_stop=i.stop
else:
buff.append(i)
if i.stop>last_stop:
last_stop=i.stop
if len(buff)>0: process()
return records
if __name__=="__main__":
from bam2x.IO import parser_factory
p=parser_factory(description=help())
set_parser(p)
if len(sys.argv)==1:
print(p.print_help())
exit(0)
run(p.parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment