Created
October 11, 2017 20:38
-
-
Save NDHall/da9f9b9b3825bac7f1cb7508d4fec86e to your computer and use it in GitHub Desktop.
Sample script for counting GO terms in a Trinotate report.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First-generation script for parsing out GO: terms. We used it to simplify working with the regular
# expressions and delimiters that are inherent to the Trinotate file.
# A couple of things to note:
# Though we are parsing canonical GO terms here, they are not in the default column.
# We have chosen to drop some columns from the Trinotate output.
#
# If you are using this, make sure that the script is taking the correct column!
# Also, for ease of parsing downstream of this Python script, the name of each GO term has " " replaced with "_".
# Finally, notice the GoTerm class below:
# TERM=TERM
# CAT=Namespace
# SubCAT=Name
class GoTerm(object):
    """One GO term together with the number of times it has been seen.

    Attributes (names kept as the rest of the script uses them):
        TERM:   the GO identifier, stored as given.
        CAT:    the namespace, stored as given.
        SubCAT: the name, stored as given.
        COUNT:  the tally, always stored as an int.
    """

    def __init__(self, TERM, CAT, SubCAT, COUNT):
        self.TERM = TERM
        self.CAT = CAT
        self.SubCAT = SubCAT
        # Coerce so that callers may pass the count as text.
        self.COUNT = int(COUNT)

    def __repr__(self):
        return "GoTerm(TERM=%r, CAT=%r, SubCAT=%r, COUNT=%r)" % (
            self.TERM, self.CAT, self.SubCAT, self.COUNT)
import re | |
import argparse | |
# house keeping | |
def getKey0(item):  # for sorting
    """Return the element at index 0 of `item`, for use as a sort key."""
    first = item[0]
    return first
def clean_for_append(out):
    """Create the file at path `out`, or empty it if it already exists.

    Args:
        out: path of the file to reset.
    """
    # Opening in "w" mode truncates on its own, so no write is needed; the
    # context manager closes the handle on every exit path.
    with open(out, 'w'):
        pass
def TrinontateGoTermParser(BlastGo, GoTERMS):
    """Tally the GO entries of one report cell into `BlastGo`, in place.

    Args:
        BlastGo: dict of tallies. It must already hold an integer under the
            key "UI"; every other key is the first "^"-separated field of an
            entry, mapped to a GoTerm object.
        GoTERMS: iterable of raw entries. Each is either "." (counted under
            "UI") or three "^"-separated fields containing "GO:", which are
            handed to GoTerm as TERM, CAT and SubCAT.

    Returns:
        None. Entries matching neither shape are skipped after printing
        "Scary <entry>" to stdout.
    """
    for TERM in GoTERMS:
        # An entry taken from the end of an unstripped line still carries the
        # newline, which would stop "." from matching and end up in the name.
        TERM = TERM.strip()
        if TERM == ".":
            BlastGo["UI"] += 1
            continue
        STerm = TERM.split("^")
        if "GO:" in TERM and len(STerm) == 3:
            KEY = STerm[0]
            if KEY in BlastGo:
                BlastGo[KEY].COUNT += 1
            else:
                # Only build the object the first time a term is seen.
                BlastGo[KEY] = GoTerm(KEY, STerm[1], STerm[2], 1)
        else:
            # Single-argument call form prints the same text on Python 2 and 3.
            print("Scary " + TERM)
def WriteToFile(Added, BlastGo, B_out):  # (listOfGOTerms, Dictionary, OUTfile)
    """Write one tab-separated tally line per identifier in `Added`.

    Args:
        Added: iterable of identifiers, written in the order given.
        BlastGo: dict mapping "UI" to an integer and every other key to an
            object with COUNT, SubCAT, CAT and TERM attributes.
        B_out: path of the output file; created, or overwritten if present.

    Line layout is count, name, namespace, identifier. Spaces in name and
    namespace become "_". Identifiers missing from `BlastGo` are written with
    a count of 0 and "Null" placeholders.
    """
    # "w" truncates any previous content, and the context manager closes the
    # handle even if a row fails to format.
    with open(B_out, "w") as B:
        for G in Added:
            if G == "UI" or G == ".":
                B.write(str(BlastGo[G]) + "\tUI\tUnidentified\t" + G + "\n")
            elif G in BlastGo:
                ENTRY = BlastGo[G]
                FIELDS = [
                    str(ENTRY.COUNT),
                    ENTRY.SubCAT.replace(" ", "_"),
                    ENTRY.CAT.replace(" ", "_"),
                    ENTRY.TERM,
                ]
                B.write("\t".join(FIELDS) + "\n")
            else:
                B.write("0\tNull\tNull\t" + G + "\n")
def WriteToFileMaster(Added, BlastGo, PfamGo, M_out):  # (listOfGOTerms, BlastDictionary, PFamDictionary, output stem)
    """Write the merged BLAST/Pfam tally tables.

    Four tab-separated files are created (or overwritten), each starting with
    the same header line:
        <M_out>_master.tsv   every row
        <M_out>_BioProc.tsv  rows whose namespace is "biological_process"
        <M_out>_CelCom.tsv   rows whose namespace is "cellular_component"
        <M_out>_MolFun.tsv   rows whose namespace is "molecular_function"

    Args:
        Added: iterable of identifiers to report.
        BlastGo: dict mapping "UI" to an integer and every other key to an
            object with COUNT, SubCAT, CAT and TERM attributes.
        PfamGo: second dict of the same shape.
        M_out: stem the four file names are built from. The stem comes from
            this argument rather than from a module-level STEM variable, so
            the function works when called from other code.

    Each row holds max(blast, pfam), blast count, pfam count, name,
    namespace and identifier. A term missing from one dict counts as 0 there,
    and name/namespace come from BlastGo when the term is in both. Spaces in
    name and namespace become "_". Rows are sorted ascending by the first
    column; ties keep their order in `Added`.

    Raises:
        Exception: if an identifier is in neither dict. All rows are built
            before any file is opened, so nothing is written in that case.
    """
    HEADER = "GreatestTotal\tBlastTotal\tPfamTotal\tName\tNameSpace\tGoTerm\n"

    FINAL = []
    for G in Added:
        if G == "UI" or G == ".":
            # These entries are plain integers, not term objects.
            BCOUNT = BlastGo[G]
            PCOUNT = PfamGo[G]
            NAME = "UI"
            SPACE = "UI"
            GOID = G
        else:
            if G in BlastGo:
                ENTRY = BlastGo[G]
            elif G in PfamGo:
                ENTRY = PfamGo[G]
            else:
                raise Exception(str(G) + " not included in either dictionary")
            BCOUNT = BlastGo[G].COUNT if G in BlastGo else 0
            PCOUNT = PfamGo[G].COUNT if G in PfamGo else 0
            NAME = ENTRY.SubCAT.replace(" ", "_")
            SPACE = ENTRY.CAT.replace(" ", "_")
            GOID = ENTRY.TERM
        FINAL.append([max(BCOUNT, PCOUNT), BCOUNT, PCOUNT, NAME, SPACE, GOID])

    # list.sort is stable, so equal totals stay in their original order.
    FINAL.sort(key=lambda ROW: int(ROW[0]))

    # Pair every formatted line with its namespace for the per-namespace files.
    LINES = []
    for ROW in FINAL:
        TEXT = "\t".join([str(ROW[0]), str(ROW[1]), str(ROW[2])] + ROW[3:]) + "\n"
        LINES.append((ROW[4], TEXT))

    with open(M_out + "_master.tsv", "w") as MASTER:
        MASTER.write(HEADER)
        MASTER.writelines(TEXT for _, TEXT in LINES)

    SUFFIX_BY_NAMESPACE = (
        ("biological_process", "_BioProc.tsv"),
        ("cellular_component", "_CelCom.tsv"),
        ("molecular_function", "_MolFun.tsv"),
    )
    for WANTED, SUFFIX in SUFFIX_BY_NAMESPACE:
        with open(M_out + SUFFIX, "w") as SUBSET:
            SUBSET.write(HEADER)
            SUBSET.writelines(TEXT for SPACE, TEXT in LINES if SPACE == WANTED)
#---------------------------------------------------------------------------------------------------------------- | |
# XARGS parser | |
# Command line: the report to read and the stem for the output file names.
parser = argparse.ArgumentParser()
parser.add_argument("HANDLE0", help="Tab delimited Trinontate file here\n\tDo not add csv. Since parsing is based on commas and tabs. ")
parser.add_argument("STEM", help="Stem for output GO term files.")
args = parser.parse_args()
HANDLE = args.HANDLE0
STEM = args.STEM

# Tallies for the two GO columns; "UI" counts the "." entries.
BlastGo = {"UI": 0}
PfamGO = {"UI": 0}

# The context manager closes the report even if a line fails to parse.
with open(HANDLE, "r") as f:
    for LineNumber, Line in enumerate(f, 1):
        # Drop the terminator so the last column does not end in "\n".
        Line = Line.rstrip("\r\n")
        if not Line:
            continue  # tolerate blank lines, e.g. at end of file
        SLine = Line.split("\t")
        if len(SLine) < 15:
            raise ValueError(
                "line %d of %s has %d tab-separated columns; columns 14 and 15 "
                "are read as the GO columns" % (LineNumber, HANDLE, len(SLine)))
        # Column indices are hard-coded; entries within a cell are separated by "`".
        BGoTERMS = SLine[13].split("`")
        PGoTERMS = SLine[14].split("`")
        TrinontateGoTermParser(BlastGo, BGoTERMS)
        TrinontateGoTermParser(PfamGO, PGoTERMS)

B_out = STEM + "_blast.table"
P_out = STEM + "_pfam.table"
M_out = STEM

# Every key of BlastGo, then the PfamGO keys not already listed. Testing
# against the dict keeps this linear instead of rescanning a growing list.
Added = list(BlastGo)
Added.extend(G for G in PfamGO if G not in BlastGo)

# These earlier ones required too much post-script parsing, but could still be fine to use if you want them.
#WriteToFile(Added,BlastGo,B_out)
#WriteToFile(Added,PfamGO,P_out)
WriteToFileMaster(Added, BlastGo, PfamGO, M_out)  # (listOfGOTerms, BlastDictionary, PFamDictionary, output stem)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment