Created
March 4, 2022 10:54
-
-
Save avrilcoghlan/d32c5c0f845777b3b5b2ba367f131bf2 to your computer and use it in GitHub Desktop.
Parse BLAST output against ChEMBL, to have the top hits for each query protein
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from collections import defaultdict | |
import FiftyHG_Chembl | |
#====================================================================# | |
def main(): | |
# find the blast output files: | |
blastoutput = defaultdict() | |
mydir = os.getcwd() # the current directory | |
subdirs = os.listdir(mydir) # there may be blast searches against ChEMBL for several query pathogen species | |
for subdir in subdirs: | |
species = subdir # e.g. schistosoma_mansoni | |
subdir = os.path.join(mydir,subdir) | |
myfile = os.path.join(subdir,'blast.out') | |
if os.path.exists(myfile): | |
assert(species not in blastoutput) | |
blastoutput[species] = myfile | |
# read the chembl fasta file of drug targets: | |
chembl_file = 'none' | |
# get a list of files in mydir: | |
files = os.listdir(mydir) | |
for file in files: | |
#file = os.path.join(mydir,file) | |
# check if it is the ChEMBL file (starting with chembl_ and ending with .fa): | |
if str(file).endswith('.fa') and str(file).startswith('chembl_'): | |
# check the file exists (and is not a broken symbolic link): | |
if os.path.exists(file): | |
chembl_file = file | |
assert(chembl_file != 'none') | |
(species_name,protein_seq) = FiftyHG_Chembl.read_chembl_file(chembl_file) | |
# format blastp output for each species: | |
for species in blastoutput: | |
myfile = blastoutput[species] | |
# parse and format this blast output file: | |
output_file = '%s.txt' % species | |
print('Making file',output_file) | |
if not os.path.exists(output_file): | |
FiftyHG_Chembl.reformat_blast_output(myfile,species_name,protein_seq,output_file) | |
print("FINISHED") | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment