Skip to content

Instantly share code, notes, and snippets.

@NDHall
Created October 11, 2017 20:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NDHall/da9f9b9b3825bac7f1cb7508d4fec86e to your computer and use it in GitHub Desktop.
Save NDHall/da9f9b9b3825bac7f1cb7508d4fec86e to your computer and use it in GitHub Desktop.
Sample script for counting GO terms in a Trinotate report.
# First-generation script for parsing out GO terms. We wrote this to simplify working with the
# regular expressions and delimiters that are inherent to the Trinotate file.
# A couple of things to note:
# Although we are parsing canonical GO terms here, they are not in the default columns,
# because we chose to drop some columns from the Trinotate output.
#
# If you use this script, make sure that it is reading the correct columns!
# Also, to ease parsing of the output after this Python script has run, every " " in the name
# of each GO term is replaced with "_".
# Finally, note the GoTerm class below:
# TERM=TERM
# CAT=Namespace
# SubCAT=Name
class GoTerm(object):
    """One GO annotation plus the number of times it has been seen.

    Inherits from ``object`` so that it is a new-style class on Python 2 as
    well as Python 3.

    Attributes:
        TERM: the GO identifier field of the annotation.
        CAT: the namespace field of the annotation.
        SubCAT: the name field of the annotation.
        COUNT: running occurrence count, always stored as an ``int``.
    """

    def __init__(self, TERM, CAT, SubCAT, COUNT):
        self.TERM = TERM
        self.CAT = CAT
        self.SubCAT = SubCAT
        # Coerce so a count supplied as text still supports "+=" accumulation.
        self.COUNT = int(COUNT)

    def __repr__(self):
        # Makes dictionaries of GoTerm objects readable when printed/debugged.
        return "GoTerm(TERM=%r, CAT=%r, SubCAT=%r, COUNT=%r)" % (
            self.TERM, self.CAT, self.SubCAT, self.COUNT)
import re
import argparse
# house keeping
def getKey0(item):
    """Sort-key helper: return the first element of *item*."""
    leading = item[0]
    return leading
def clean_for_append(out):
    """Leave *out* as an empty file, creating it if it does not exist.

    Any previous contents are discarded, so that a later open in append
    mode starts from a clean file.

    Args:
        out: path of the file to empty.
    """
    # Opening in "w" mode truncates the file; the context manager guarantees
    # the handle is closed even if the open/flush raises part-way.
    with open(out, "w"):
        pass
def TrinontateGoTermParser(BlastGo, GoTERMS):
    """Tally the entries of one GO-term cell into *BlastGo* (modified in place).

    Args:
        BlastGo: dict of running totals. The "UI" key holds an integer that
            is incremented for every "." entry; every other key is a GO
            identifier mapped to a GoTerm whose COUNT grows by one each time
            that identifier is seen again.
        GoTERMS: iterable of entry strings. An entry is either "." or a
            string containing "GO:" made of three "^"-separated fields,
            which are passed to GoTerm as TERM, CAT and SubCAT.

    Any entry that fits neither form is printed to stdout prefixed with
    "Scary" and otherwise ignored, so malformed input is never dropped
    silently.
    """
    for TERM in GoTERMS:
        if TERM == ".":
            BlastGo["UI"] += 1
            continue

        # Only entries that mention "GO:" are candidates for splitting.
        fields = TERM.split("^") if "GO:" in TERM else ()
        if len(fields) == 3:
            accession = fields[0]
            if accession in BlastGo:
                BlastGo[accession].COUNT += 1
            else:
                BlastGo[accession] = GoTerm(fields[0], fields[1], fields[2], 1)
        else:
            # Formatted as a single string so the output is the same under
            # Python 2 and Python 3.
            print("Scary %s" % TERM)
def WriteToFile(Added, BlastGo, B_out):
    """Write a tab-separated count table for one annotation dictionary.

    One line is written per key in *Added*, in order, with the columns
    count, name, namespace, GO identifier:

    * "UI" (or "."): the integer stored in *BlastGo* with the fixed labels
      "UI" and "Unidentified".
    * a key present in *BlastGo*: the GoTerm's COUNT, SubCAT, CAT and TERM,
      with spaces in SubCAT and CAT replaced by "_".
    * a key absent from *BlastGo*: a "0 / Null / Null" placeholder line.

    Args:
        Added: list of keys to report.
        BlastGo: dict as filled by TrinontateGoTermParser.
        B_out: path of the output file; existing contents are replaced.
    """
    # A single "w" open both empties the file and writes it, and the context
    # manager closes the handle even when a write raises.
    with open(B_out, "w") as table:
        for G in Added:
            if G == "UI" or G == ".":
                table.write(str(BlastGo[G]) + "\tUI\tUnidentified\t" + G + "\n")
            elif G in BlastGo:
                hit = BlastGo[G]
                table.write(
                    str(hit.COUNT) + "\t"
                    + hit.SubCAT.replace(" ", "_") + "\t"
                    + hit.CAT.replace(" ", "_") + "\t"
                    + hit.TERM + "\n")
            else:
                table.write("0\tNull\tNull\t" + G + "\n")
def _master_row(G, BlastGo, PfamGo):
    """Build the [greatest, blast, pfam, name, namespace, term] row for key *G*."""
    if G == "UI" or G == ".":
        # These entries are plain integer counters, not GoTerm objects.
        blast_total = BlastGo[G]
        pfam_total = PfamGo[G]
        name = "UI"
        namespace = "UI"
        term = G
    else:
        in_blast = G in BlastGo
        in_pfam = G in PfamGo
        if not (in_blast or in_pfam):
            raise Exception(str(G) + " not included in either dictionary")
        # Descriptive fields come from the BLAST entry when there is one,
        # otherwise from the Pfam entry.
        described = BlastGo[G] if in_blast else PfamGo[G]
        blast_total = BlastGo[G].COUNT if in_blast else 0
        pfam_total = PfamGo[G].COUNT if in_pfam else 0
        name = described.SubCAT.replace(" ", "_")
        namespace = described.CAT.replace(" ", "_")
        term = described.TERM
    return [max(blast_total, pfam_total), blast_total, pfam_total,
            name, namespace, term]


def WriteToFileMaster(Added, BlastGo, PfamGo, M_out):
    """Write the combined BLAST/Pfam GO count tables.

    Four tab-separated files are produced, each starting with the header
    "GreatestTotal, BlastTotal, PfamTotal, Name, NameSpace, GoTerm":

    * ``<M_out>_master.tsv``  - every row
    * ``<M_out>_BioProc.tsv`` - rows whose namespace is "biological_process"
    * ``<M_out>_CelCom.tsv``  - rows whose namespace is "cellular_component"
    * ``<M_out>_MolFun.tsv``  - rows whose namespace is "molecular_function"

    Rows are sorted in ascending order of GreatestTotal, the larger of the
    BLAST and Pfam counts. Spaces in Name and NameSpace are replaced by "_".

    Args:
        Added: list of keys (GO identifiers and "UI") to report.
        BlastGo: dict of BLAST-derived totals ("UI" -> int, GO id -> GoTerm).
        PfamGo: dict of Pfam-derived totals with the same layout.
        M_out: output stem the four file names are derived from. The stem is
            taken from this argument rather than from a module-level global,
            so the function works wherever it is called from.

    Raises:
        Exception: if a key in *Added* is in neither dictionary.
    """
    # Build and sort every row before touching the disk, so a bad key cannot
    # leave truncated, header-only output files behind.
    rows = sorted((_master_row(G, BlastGo, PfamGo) for G in Added),
                  key=lambda row: int(row[0]))
    header = "GreatestTotal\tBlastTotal\tPfamTotal\tName\tNameSpace\tGoTerm\n"

    with open(M_out + "_master.tsv", "w") as master, \
            open(M_out + "_BioProc.tsv", "w") as bio_proc, \
            open(M_out + "_CelCom.tsv", "w") as cel_com, \
            open(M_out + "_MolFun.tsv", "w") as mol_fun:
        by_namespace = {
            "biological_process": bio_proc,
            "cellular_component": cel_com,
            "molecular_function": mol_fun,
        }
        for handle in (master, bio_proc, cel_com, mol_fun):
            handle.write(header)
        for row in rows:
            # Only the three counts need converting; the rest are text already.
            line = "\t".join([str(total) for total in row[:3]] + row[3:]) + "\n"
            master.write(line)
            # "UI" rows match no namespace and so appear in the master file only.
            subset = by_namespace.get(row[4])
            if subset is not None:
                subset.write(line)
#----------------------------------------------------------------------------------------------------------------
# Command-line argument parsing and main script
if __name__ == "__main__":
    # Guarded so that importing this file (e.g. to reuse the functions above)
    # does not parse the command line or read/write any files. The names
    # below deliberately stay module-level globals.
    parser = argparse.ArgumentParser()
    parser.add_argument("HANDLE0", help="Tab delimited Trinontate file here\n\tDo not add csv. Since parsing is based on commas and tabs. ")
    parser.add_argument("STEM", help="Stem for output GO term files.")
    # The GO columns depend on which report columns were kept, so they can be
    # overridden; the defaults are the positions this script always used.
    parser.add_argument("--blast-col", type=int, default=13,
                        help="0-based column holding the BLAST GO terms (default: 13).")
    parser.add_argument("--pfam-col", type=int, default=14,
                        help="0-based column holding the Pfam GO terms (default: 14).")
    args = parser.parse_args()
    HANDLE = args.HANDLE0
    STEM = args.STEM

    BlastGo = {"UI": 0}
    PfamGO = {"UI": 0}
    columns_needed = max(args.blast_col, args.pfam_col) + 1
    with open(HANDLE, "r") as f:
        for line_number, Line in enumerate(f, 1):
            # Drop the line terminator so it cannot end up inside the last
            # column (where "." would otherwise be read as ".\n").
            Line = Line.rstrip("\r\n")
            if not Line:
                continue  # tolerate blank lines, e.g. at the end of the file
            SLine = Line.split("\t")
            if len(SLine) < columns_needed:
                raise IndexError(
                    "%s line %d has %d tab-separated columns; at least %d are needed"
                    % (HANDLE, line_number, len(SLine), columns_needed))
            # Entries inside a GO cell are separated by backticks.
            BGoTERMS = SLine[args.blast_col].split("`")
            PGoTERMS = SLine[args.pfam_col].split("`")
            TrinontateGoTermParser(BlastGo, BGoTERMS)
            TrinontateGoTermParser(PfamGO, PGoTERMS)

    B_out = STEM + "_blast.table"
    P_out = STEM + "_pfam.table"
    M_out = STEM

    # Union of the keys of both dictionaries: BLAST keys first, then any
    # Pfam-only keys. The set makes each membership test O(1).
    Added = list(BlastGo)
    already_added = set(Added)
    for G in PfamGO:
        if G not in already_added:
            already_added.add(G)
            Added.append(G)

    # These earlier per-source tables required too much post-script parsing,
    # but can still be enabled if you want them.
    #WriteToFile(Added,BlastGo,B_out)
    #WriteToFile(Added,PfamGO,P_out)
    WriteToFileMaster(Added, BlastGo, PfamGO, M_out)  # (listOfGOTerms, BlastDictionary, PfamDictionary, output stem)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment