Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 4, 2022 10:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/d32c5c0f845777b3b5b2ba367f131bf2 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/d32c5c0f845777b3b5b2ba367f131bf2 to your computer and use it in GitHub Desktop.
Parse BLAST output against ChEMBL, to have the top hits for each query protein
import os
import sys
from collections import defaultdict
import FiftyHG_Chembl
#====================================================================#
def main():
# find the blast output files:
blastoutput = defaultdict()
mydir = os.getcwd() # the current directory
subdirs = os.listdir(mydir) # there may be blast searches against ChEMBL for several query pathogen species
for subdir in subdirs:
species = subdir # e.g. schistosoma_mansoni
subdir = os.path.join(mydir,subdir)
myfile = os.path.join(subdir,'blast.out')
if os.path.exists(myfile):
assert(species not in blastoutput)
blastoutput[species] = myfile
# read the chembl fasta file of drug targets:
chembl_file = 'none'
# get a list of files in mydir:
files = os.listdir(mydir)
for file in files:
#file = os.path.join(mydir,file)
# check if it is the ChEMBL file (starting with chembl_ and ending with .fa):
if str(file).endswith('.fa') and str(file).startswith('chembl_'):
# check the file exists (and is not a broken symbolic link):
if os.path.exists(file):
chembl_file = file
assert(chembl_file != 'none')
(species_name,protein_seq) = FiftyHG_Chembl.read_chembl_file(chembl_file)
# format blastp output for each species:
for species in blastoutput:
myfile = blastoutput[species]
# parse and format this blast output file:
output_file = '%s.txt' % species
print('Making file',output_file)
if not os.path.exists(output_file):
FiftyHG_Chembl.reformat_blast_output(myfile,species_name,protein_seq,output_file)
print("FINISHED")
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment