Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/883d63f327d34b6bb57d8453d5497cd0 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/883d63f327d34b6bb57d8453d5497cd0 to your computer and use it in GitHub Desktop.
Python script that filters BLAST hits against ChEMBL, keeping only the hits to single-protein targets:
import os
import sys
from collections import defaultdict
import FiftyHG_Chembl
#====================================================================#
def main():
    """Reformat every BLAST output file found in the current directory.

    Each file in the current working directory whose name ends in '.txt2'
    (e.g. 'schistosoma_mansoni.txt2') is treated as a BLAST output file.
    The species name is the part of the file name before '.txt2', and the
    output path is the input path with 'b' appended ('....txt2b').  When
    that output path does not exist yet, the file is handed to
    FiftyHG_Chembl.reformat_blast_output_singleproteintargetsonly together
    with the UniProt ids of the ChEMBL single-protein targets, so that only
    BLAST matches to single-protein targets are kept.

    The UniProt ids are read from 'chembl_single_protein_targets_uniprot_ids'
    in the current directory.  The file is only required (and only read)
    when at least one '.txt2' file is present.

    Raises:
        FileNotFoundError: a '.txt2' file was found but the single-protein
            target file is missing from the current directory.
    """
    mydir = os.getcwd()  # the current directory

    # The target list is identical for every species, so read it at most
    # once, and only when the first BLAST output file is encountered.
    singleproteintargets = None

    for myfile in os.listdir(mydir):
        if not myfile.endswith('.txt2'):  # eg. schistosoma_mansoni.txt2
            continue

        # find the species name:
        species = myfile.split('.txt2')[0]  # eg. schistosoma_mansoni

        # read in the list of uniprot ids in chembl single-protein targets:
        if singleproteintargets is None:
            singleproteintargetfile = os.path.join(
                mydir, 'chembl_single_protein_targets_uniprot_ids')
            # Explicit check instead of `assert`, which is removed under -O.
            if not os.path.exists(singleproteintargetfile):
                raise FileNotFoundError(
                    'Cannot find single-protein target file %s'
                    % singleproteintargetfile)
            # returns a set of uniprot ids in targets
            singleproteintargets = FiftyHG_Chembl.read_single_protein_targets(
                singleproteintargetfile)

        # parse and format this blast output file, to just take the blast
        # matches to single-protein chembl targets:
        myfile = os.path.join(mydir, myfile)
        output_file = '%sb' % myfile
        print('Making file', output_file)
        # Never overwrite an output file left by a previous run.
        if not os.path.exists(output_file):
            FiftyHG_Chembl.reformat_blast_output_singleproteintargetsonly(
                myfile, species, singleproteintargets, output_file)
#====================================================================#
# Run the filtering only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment