Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save avrilcoghlan/c91ca2a4ece1d29c3246262ffd624725 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/c91ca2a4ece1d29c3246262ffd624725 to your computer and use it in GitHub Desktop.
Python script to parse BLAST output from comparing ChEMBL proteins to human proteins
import os
import sys
from collections import defaultdict
import FiftyHG_Chembl
#====================================================================#
def main():
# find the blast output files:
mydir = os.getcwd() # the current directory
blastoutput = os.path.join(mydir, 'blast_output/blast.out')
# read in the gene identifier for each human protein:
protein_file = os.path.join(mydir, 'human.pep')
human_geneid = defaultdict()
fileObj = open(protein_file,"r")
for line in fileObj:
line = line.rstrip()
if line.startswith('>'):
# >ENSP00000469080 pep:known chromosome:GRCh37:HG1443_HG1444_PATCH:134478825:134497181:1 gene:ENSG00000269327 transcript:ENST00000593526 gene_biotype:protein_coding transcript_biotype:protein_coding
temp = line.split()
protein_name = temp[0] # >ENSP00000469080
protein_name = protein_name[1:] # ENSP00000469080
temp = line.split('gene:')
gene_name = temp[1] # gene:ENSG00000269327 tr...
temp = gene_name.split()
gene_name = temp[0] # gene:ENSG00000269327
assert(protein_name not in human_geneid)
human_geneid[protein_name] = gene_name
fileObj.close()
# format blastp output for each chembl vs. human:
output_file = 'chembl_vs_human_blast.txt'
print('Making file',output_file)
if not os.path.exists(output_file):
FiftyHG_Chembl.reformat_human_blast_output(blastoutput,output_file,human_geneid)
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment