NDHall/GoTermParse.py

## GoTermParse.py
# First-generation script for parsing out GO: terms. We used this to simplify working with the regular
# expressions and delimiters that are inherent to the Trinotate file.
# A couple of things to note:
# Though we are parsing canonical GO terms here, they are not in the default column.
# We have chosen to drop some columns from the Trinotate output.
#
# If you are using this, make sure that the script is taking the correct column!
# Also, for ease of downstream (post-Python) parsing, the name of each GO term has " " replaced with "_".
# Finally, notice the GoTerm class below.
#	TERM=TERM
#	CAT=Namespace
#	SubCAT=Name


class GoTerm:
	"""Record for a single GO annotation and how often it has been seen.

	TERM   -- the GO accession string
	CAT    -- the namespace the term belongs to
	SubCAT -- the term's name
	COUNT  -- tally for the term, always stored as an int
	"""

	def __init__(self, TERM, CAT, SubCAT,COUNT ):
		self.TERM, self.CAT, self.SubCAT = TERM, CAT, SubCAT
		# Counts may be supplied as text; normalise so "+=" behaves numerically.
		self.COUNT = int(COUNT)


import re
import argparse


#	house keeping

def getKey0(item): # for sorting
	"""Sort-key helper: hand back the leading element of item."""
	leading = item[0]
	return leading


def clean_for_append(out):
	"""Create the file at path out, or empty it if it already exists.

	Opening in "w" mode already truncates, so nothing needs to be written;
	the with-block guarantees the handle is closed again.
	"""
	with open(out, 'w'):
		pass

def TrinontateGoTermParser(BlastGo, GoTERMS ):
	"""Tally the GO entries of one table cell into the dictionary BlastGo.

	BlastGo -- dict updated in place. Its "UI" entry is an integer that is
	           incremented once for every "." entry; every other key is a GO
	           accession mapped to a GoTerm whose COUNT is the number of
	           times that accession has been seen.
	GoTERMS -- iterable of entry strings, each either "." or
	           "<accession>^<namespace>^<name>".

	Entries that are neither "." nor contain "GO:" are ignored. A "GO:" entry
	that does not split into exactly three "^" fields is reported on stdout
	and skipped.
	"""
	for TERM in GoTERMS:
		if TERM==".":
			BlastGo["UI"]+=1
			continue
		if "GO:" not in TERM:
			continue
		pieces=TERM.split("^")
		if len(pieces)!=3:
			# A single-argument print() emits the same text on Python 2 and 3;
			# the former print statement was a syntax error on Python 3.
			print("Scary "+TERM)
			continue
		accession, namespace, name = pieces
		if accession in BlastGo:
			BlastGo[accession].COUNT+=1
		else:
			BlastGo[accession]=GoTerm(accession, namespace, name, 1)


def WriteToFile(Added,BlastGo,B_out): #(listOfGOTerms, Dictionary, OUTfile)
	"""Write one tab-separated line per key in Added to the file B_out.

	Added   -- iterable of key strings, in the order they should be written.
	BlastGo -- dict holding integer tallies under "UI" / "." and GoTerm-like
	           objects (COUNT, SubCAT, CAT, TERM attributes) under other keys.
	B_out   -- output path; any existing content is replaced.

	Each line is: count, name, namespace, key. Spaces in name and namespace
	become "_". Keys missing from BlastGo are written as "0 Null Null key".
	Raises KeyError if "UI" or "." is requested but absent from BlastGo.
	"""
	# "w" truncates up front and the with-block closes the handle even when a
	# row fails, which the former truncate-then-append sequence did not.
	with open(B_out, "w") as table:
		for G in Added:
			if G=="UI" or G==".":
				# Unidentified tallies are stored as bare integers.
				fields=[str(BlastGo[G]), "UI", "Unidentified", G]
			elif G in BlastGo:
				hit=BlastGo[G]
				fields=[str(hit.COUNT), hit.SubCAT.replace(" ","_"), hit.CAT.replace(" ","_"), hit.TERM]
			else:
				fields=["0", "Null", "Null", G]
			table.write("\t".join(fields)+"\n")


def _master_row(G, BlastGo, PfamGo):
	"""Build the [GreatestTotal, BlastTotal, PfamTotal, Name, NameSpace, GoTerm] row for key G."""
	if G=="UI" or G==".":
		# Unidentified tallies are stored as bare integers, not GoTerm objects.
		blast_total, pfam_total = BlastGo[G], PfamGo[G]
		name, namespace, accession = "UI", "UI", G
	else:
		in_blast = G in BlastGo
		in_pfam = G in PfamGo
		if not in_blast and not in_pfam:
			raise Exception(str(G)+" not included in either dictionary" )
		blast_total = BlastGo[G].COUNT if in_blast else 0
		pfam_total = PfamGo[G].COUNT if in_pfam else 0
		# Descriptive fields come from the BLAST record whenever one exists.
		described = BlastGo[G] if in_blast else PfamGo[G]
		name = described.SubCAT.replace(" ","_")
		namespace = described.CAT.replace(" ","_")
		accession = described.TERM
	return [max(blast_total, pfam_total), blast_total, pfam_total, name, namespace, accession]


def WriteToFileMaster(Added,BlastGo,PfamGo,M_out): #(listOfGOTerms, BlastDictionary,PFamDictionary, output stem)
	"""Write the combined BLAST/Pfam GO table plus one table per namespace.

	Added   -- iterable of keys to report ("UI", "." or GO accessions).
	BlastGo -- dict of BLAST tallies: integers under "UI" / ".", GoTerm-like
	           objects (COUNT, SubCAT, CAT, TERM) under every other key.
	PfamGo  -- dict of Pfam tallies with the same layout.
	M_out   -- stem of the output paths. Four files are written:
	           M_out + "_master.tsv" (every row), "_BioProc.tsv",
	           "_CelCom.tsv" and "_MolFun.tsv" (rows whose namespace is
	           biological_process, cellular_component or molecular_function).

	Every file starts with a header line. Rows are sorted in ascending order
	of GreatestTotal, the larger of the two totals; spaces in names and
	namespaces are replaced with "_".

	Raises Exception when a key is in neither dictionary, and KeyError when
	"UI" or "." is requested but missing from either dictionary.
	"""
	# M_out used to be overwritten from the module-level STEM; the script
	# passes STEM for this argument, so honouring the parameter produces the
	# same paths there while removing the hidden global dependency.
	master_path=M_out+"_master.tsv"
	bio_path=M_out+"_BioProc.tsv"
	mol_path=M_out+"_MolFun.tsv"
	cel_path=M_out+"_CelCom.tsv"

	# Build every row before touching the disk so a bad key cannot leave
	# half-written tables behind. list.sort is stable, like sorted().
	rows=[_master_row(G, BlastGo, PfamGo) for G in Added]
	rows.sort(key=lambda row: int(row[0]))

	header="GreatestTotal\tBlastTotal\tPfamTotal\tName\tNameSpace\tGoTerm\n"
	with open(master_path, "w") as master, \
			open(bio_path, "w") as bio, \
			open(cel_path, "w") as cel, \
			open(mol_path, "w") as mol:
		by_namespace={
			"biological_process": bio,
			"cellular_component": cel,
			"molecular_function": mol,
		}
		for handle in (master, bio, cel, mol):
			handle.write(header)
		for row in rows:
			# Only the three totals are numeric; the rest are already text.
			line="\t".join([str(total) for total in row[:3]]+row[3:])+"\n"
			master.write(line)
			target=by_namespace.get(row[4])
			if target is not None:
				target.write(line)

#----------------------------------------------------------------------------------------------------------------
#	XARGS parser

if __name__ == "__main__":
	# Command line: the Trinotate table to read and the stem for the output files.
	parser=argparse.ArgumentParser()
	parser.add_argument("HANDLE0", help="Tab delimited Trinontate file here\n\tDo not add csv. Since parsing is based on commas and tabs. ")
	parser.add_argument("STEM",help="Stem for output GO term files.")

	args=parser.parse_args()
	HANDLE=args.HANDLE0
	STEM=args.STEM

	# Both tallies start with an integer counter for unidentified (".") entries.
	BlastGo={"UI":0}
	PfamGO={"UI":0}
	# The with-block closes the input even when a row fails to parse.
	with open(HANDLE,"r") as f:
		for Line in f:
			if not Line.strip():
				# Blank lines have no columns to index.
				continue
			# Drop the line terminator so it cannot leak into the last column.
			SLine=Line.rstrip("\r\n").split("\t")
			# Columns 13 and 14 (0-based) feed the BLAST and Pfam tallies;
			# entries within a cell are separated by "`".
			BGoTERMS=SLine[13].split("`")
			PGoTERMS=SLine[14].split("`")
			TrinontateGoTermParser(BlastGo, BGoTERMS )
			TrinontateGoTermParser(PfamGO, PGoTERMS )

	B_out=STEM+"_blast.table"
	P_out=STEM+"_pfam.table"
	M_out=STEM

	# Union of the keys: every BLAST key first, then the Pfam-only ones.
	# Membership is tested against the dict rather than the growing list.
	Added=list(BlastGo)
	Added.extend(G for G in PfamGO if G not in BlastGo)

	# These earlier writers required too much post-script parsing, but could still be fine to use if you want them.
	#WriteToFile(Added,BlastGo,B_out)
	#WriteToFile(Added,PfamGO,P_out)
	WriteToFileMaster(Added,BlastGo,PfamGO,M_out) #(listOfGOTerms, BlastDictionary,PFamDictionary, output stem)
	# First-generation script for parsing out GO: terms. We used this to simplify working with the regular
	# expressions and delimiters that are inherent to the Trinotate file.
	# A couple of things to note:
	# Though we are parsing canonical GO terms here, they are not in the default column.
	# We have chosen to drop some columns from the Trinotate output.
	#
	# If you are using this, make sure that the script is taking the correct column!
	# Also, for ease of downstream (post-Python) parsing, the name of each GO term has " " replaced with "_".
	# Finally, notice the GoTerm class below.
	# TERM=TERM
	# CAT=Namespace
	# SubCAT=Name




	class GoTerm:
		"""Record for a single GO annotation and how often it has been seen.

		TERM   -- the GO accession string
		CAT    -- the namespace the term belongs to
		SubCAT -- the term's name
		COUNT  -- tally for the term, always stored as an int
		"""

		def __init__(self, TERM, CAT, SubCAT,COUNT ):
			self.TERM=TERM
			self.CAT=CAT
			self.SubCAT=SubCAT
			# Counts may be supplied as text; normalise so "+=" behaves numerically.
			self.COUNT=int(COUNT)


	import re
	import argparse



	# house keeping

	def getKey0(item): # for sorting
		"""Sort-key helper: return the first element of item."""
		return item[0]


	def clean_for_append(out):
		"""Create the file at path out, or empty it if it already exists.

		Opening in "w" mode already truncates, so nothing needs to be written;
		the with-block guarantees the handle is closed again.
		"""
		with open(out, 'w'):
			pass

	def TrinontateGoTermParser(BlastGo, GoTERMS ):
		"""Tally the GO entries of one table cell into the dictionary BlastGo.

		BlastGo -- dict updated in place. Its "UI" entry is an integer that is
		           incremented once for every "." entry; every other key is a GO
		           accession mapped to a GoTerm whose COUNT is the number of
		           times that accession has been seen.
		GoTERMS -- iterable of entry strings, each either "." or
		           "<accession>^<namespace>^<name>".

		Entries that are neither "." nor contain "GO:" are ignored. A "GO:" entry
		that does not split into exactly three "^" fields is reported on stdout
		and skipped.
		"""
		for TERM in GoTERMS:
			if TERM==".":
				BlastGo["UI"]+=1
				continue
			if "GO:" not in TERM:
				continue
			pieces=TERM.split("^")
			if len(pieces)!=3:
				# A single-argument print() emits the same text on Python 2 and 3;
				# the former print statement was a syntax error on Python 3.
				print("Scary "+TERM)
				continue
			accession, namespace, name = pieces
			if accession in BlastGo:
				BlastGo[accession].COUNT+=1
			else:
				BlastGo[accession]=GoTerm(accession, namespace, name, 1)


	def WriteToFile(Added,BlastGo,B_out): #(listOfGOTerms, Dictionary, OUTfile)
		"""Write one tab-separated line per key in Added to the file B_out.

		Added   -- iterable of key strings, in the order they should be written.
		BlastGo -- dict holding integer tallies under "UI" / "." and GoTerm-like
		           objects (COUNT, SubCAT, CAT, TERM attributes) under other keys.
		B_out   -- output path; any existing content is replaced.

		Each line is: count, name, namespace, key. Spaces in name and namespace
		become "_". Keys missing from BlastGo are written as "0 Null Null key".
		Raises KeyError if "UI" or "." is requested but absent from BlastGo.
		"""
		# "w" truncates up front and the with-block closes the handle even when
		# a row fails, which the former truncate-then-append sequence did not.
		with open(B_out, "w") as table:
			for G in Added:
				if G=="UI" or G==".":
					# Unidentified tallies are stored as bare integers.
					fields=[str(BlastGo[G]), "UI", "Unidentified", G]
				elif G in BlastGo:
					hit=BlastGo[G]
					fields=[str(hit.COUNT), hit.SubCAT.replace(" ","_"), hit.CAT.replace(" ","_"), hit.TERM]
				else:
					fields=["0", "Null", "Null", G]
				table.write("\t".join(fields)+"\n")


	def _master_row(G, BlastGo, PfamGo):
		"""Build the [GreatestTotal, BlastTotal, PfamTotal, Name, NameSpace, GoTerm] row for key G."""
		if G=="UI" or G==".":
			# Unidentified tallies are stored as bare integers, not GoTerm objects.
			blast_total, pfam_total = BlastGo[G], PfamGo[G]
			name, namespace, accession = "UI", "UI", G
		else:
			in_blast = G in BlastGo
			in_pfam = G in PfamGo
			if not in_blast and not in_pfam:
				raise Exception(str(G)+" not included in either dictionary" )
			blast_total = BlastGo[G].COUNT if in_blast else 0
			pfam_total = PfamGo[G].COUNT if in_pfam else 0
			# Descriptive fields come from the BLAST record whenever one exists.
			described = BlastGo[G] if in_blast else PfamGo[G]
			name = described.SubCAT.replace(" ","_")
			namespace = described.CAT.replace(" ","_")
			accession = described.TERM
		return [max(blast_total, pfam_total), blast_total, pfam_total, name, namespace, accession]


	def WriteToFileMaster(Added,BlastGo,PfamGo,M_out): #(listOfGOTerms, BlastDictionary,PFamDictionary, output stem)
		"""Write the combined BLAST/Pfam GO table plus one table per namespace.

		Added   -- iterable of keys to report ("UI", "." or GO accessions).
		BlastGo -- dict of BLAST tallies: integers under "UI" / ".", GoTerm-like
		           objects (COUNT, SubCAT, CAT, TERM) under every other key.
		PfamGo  -- dict of Pfam tallies with the same layout.
		M_out   -- stem of the output paths. Four files are written:
		           M_out + "_master.tsv" (every row), "_BioProc.tsv",
		           "_CelCom.tsv" and "_MolFun.tsv" (rows whose namespace is
		           biological_process, cellular_component or molecular_function).

		Every file starts with a header line. Rows are sorted in ascending order
		of GreatestTotal, the larger of the two totals; spaces in names and
		namespaces are replaced with "_".

		Raises Exception when a key is in neither dictionary, and KeyError when
		"UI" or "." is requested but missing from either dictionary.
		"""
		# M_out used to be overwritten from the module-level STEM; the script
		# passes STEM for this argument, so honouring the parameter produces the
		# same paths there while removing the hidden global dependency.
		master_path=M_out+"_master.tsv"
		bio_path=M_out+"_BioProc.tsv"
		mol_path=M_out+"_MolFun.tsv"
		cel_path=M_out+"_CelCom.tsv"

		# Build every row before touching the disk so a bad key cannot leave
		# half-written tables behind. list.sort is stable, like sorted().
		rows=[_master_row(G, BlastGo, PfamGo) for G in Added]
		rows.sort(key=lambda row: int(row[0]))

		header="GreatestTotal\tBlastTotal\tPfamTotal\tName\tNameSpace\tGoTerm\n"
		with open(master_path, "w") as master, \
				open(bio_path, "w") as bio, \
				open(cel_path, "w") as cel, \
				open(mol_path, "w") as mol:
			by_namespace={
				"biological_process": bio,
				"cellular_component": cel,
				"molecular_function": mol,
			}
			for handle in (master, bio, cel, mol):
				handle.write(header)
			for row in rows:
				# Only the three totals are numeric; the rest are already text.
				line="\t".join([str(total) for total in row[:3]]+row[3:])+"\n"
				master.write(line)
				target=by_namespace.get(row[4])
				if target is not None:
					target.write(line)

	#----------------------------------------------------------------------------------------------------------------
	# XARGS parser

	if __name__ == "__main__":
		# Command line: the Trinotate table to read and the stem for the output files.
		parser=argparse.ArgumentParser()
		parser.add_argument("HANDLE0", help="Tab delimited Trinontate file here\n\tDo not add csv. Since parsing is based on commas and tabs. ")
		parser.add_argument("STEM",help="Stem for output GO term files.")

		args=parser.parse_args()
		HANDLE=args.HANDLE0
		STEM=args.STEM

		# Both tallies start with an integer counter for unidentified (".") entries.
		BlastGo={"UI":0}
		PfamGO={"UI":0}
		# The with-block closes the input even when a row fails to parse.
		with open(HANDLE,"r") as f:
			for Line in f:
				if not Line.strip():
					# Blank lines have no columns to index.
					continue
				# Drop the line terminator so it cannot leak into the last column.
				SLine=Line.rstrip("\r\n").split("\t")
				# Columns 13 and 14 (0-based) feed the BLAST and Pfam tallies;
				# entries within a cell are separated by "`".
				BGoTERMS=SLine[13].split("`")
				PGoTERMS=SLine[14].split("`")
				TrinontateGoTermParser(BlastGo, BGoTERMS )
				TrinontateGoTermParser(PfamGO, PGoTERMS )

		B_out=STEM+"_blast.table"
		P_out=STEM+"_pfam.table"
		M_out=STEM

		# Union of the keys: every BLAST key first, then the Pfam-only ones.
		# Membership is tested against the dict rather than the growing list.
		Added=list(BlastGo)
		Added.extend(G for G in PfamGO if G not in BlastGo)

		# These earlier writers required too much post-script parsing, but could still be fine to use if you want them.
		#WriteToFile(Added,BlastGo,B_out)
		#WriteToFile(Added,PfamGO,P_out)
		WriteToFileMaster(Added,BlastGo,PfamGO,M_out) #(listOfGOTerms, BlastDictionary,PFamDictionary, output stem)