Last active
March 4, 2022 13:28
-
-
Save avrilcoghlan/c91ca2a4ece1d29c3246262ffd624725 to your computer and use it in GitHub Desktop.
Python script to parse BLAST output from comparing ChEMBL proteins to human proteins
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from collections import defaultdict | |
import FiftyHG_Chembl | |
#====================================================================# | |
def main(): | |
# find the blast output files: | |
mydir = os.getcwd() # the current directory | |
blastoutput = os.path.join(mydir, 'blast_output/blast.out') | |
# read in the gene identifier for each human protein: | |
protein_file = os.path.join(mydir, 'human.pep') | |
human_geneid = defaultdict() | |
fileObj = open(protein_file,"r") | |
for line in fileObj: | |
line = line.rstrip() | |
if line.startswith('>'): | |
# >ENSP00000469080 pep:known chromosome:GRCh37:HG1443_HG1444_PATCH:134478825:134497181:1 gene:ENSG00000269327 transcript:ENST00000593526 gene_biotype:protein_coding transcript_biotype:protein_coding | |
temp = line.split() | |
protein_name = temp[0] # >ENSP00000469080 | |
protein_name = protein_name[1:] # ENSP00000469080 | |
temp = line.split('gene:') | |
gene_name = temp[1] # gene:ENSG00000269327 tr... | |
temp = gene_name.split() | |
gene_name = temp[0] # gene:ENSG00000269327 | |
assert(protein_name not in human_geneid) | |
human_geneid[protein_name] = gene_name | |
fileObj.close() | |
# format blastp output for each chembl vs. human: | |
output_file = 'chembl_vs_human_blast.txt' | |
print('Making file',output_file) | |
if not os.path.exists(output_file): | |
FiftyHG_Chembl.reformat_human_blast_output(blastoutput,output_file,human_geneid) | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment