Created
November 28, 2018 10:04
-
-
Save avrilcoghlan/c9bc38e897b0e45a653b128ea75548f1 to your computer and use it in GitHub Desktop.
Script to correct EMBL files so that CDSs with >50% internal Ns are marked as /pseudo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from collections import defaultdict | |
#====================================================================# | |
# read in the line numbers of dodgy CDS: | |
def read_error_file(error_file): | |
# record a set of line numbers with dodgy CDSs: | |
line_nums_set = set() | |
# read in the file that has the line numbers of CDSs with errors: | |
fileObj = open(error_file, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
# Wed Nov 21 21:55:24 GMT 2018 USER ERROR: ERROR: Protein coding feature translation contains more than 50% X . [haemonchus_placei_new2.embl.gz, line: 1102966-1102969 | |
# Wed Nov 21 21:55:24 GMT 2018 USER ERROR: ERROR: Protein coding feature translation contains more than 50% X . [haemonchus_placei_new2.embl.gz, line: 1604928 of haem | |
if 'ERROR: Protein coding feature translation contains more than 50% X' in line: | |
temp = line.split("line: ") | |
line = temp[1] # '1102966-1102969' or '1604928 of haem' | |
temp = line.split() | |
line = temp[0] # '1102966-1102969' or '1604928' | |
temp = line.split('-') | |
line = int(temp[0]) # 1102966 or 1604928 | |
assert(line not in line_nums_set) | |
line_nums_set.add(line) | |
return line_nums_set | |
#====================================================================# | |
# define a function to read in the genes that are in families, for our species of interest: | |
def find_genes_in_families(families_file, our_species_name, locus_tag): | |
"""read in the genes that are in families, for our species of interest """ | |
# define a set to store the genes in families: | |
genes_in_families = set() | |
# read in the input file of genes in families: | |
fileObj = open(families_file, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
if line.startswith('family'): # eg. family 5168 : SPAL_0001096400.1:mRNA (strongyloides_papillosus) Bm11175 (brugia_malayi) HCOI_1164600.1 (haemonchus_contortus)... | |
temp = line.split() | |
family = temp[1] # eg. 4376 | |
temp = line.split(': ') | |
line = temp[1] # SPAL_0000608500.1:mRNA (strongyloides_papillosus) Bm10870f (brugia_malayi) F23B12.1 (caenorhabditis_elegans)... | |
temp = line.split() | |
# make a list of tuples, with gene names and species names | |
# taken from http://stackoverflow.com/questions/2631189/python-every-other-element-idiom | |
# uses the zip function: http://docs.python.org/2/library/functions.html#zip | |
# see also on zip: http://nbviewer.ipython.org/github/pycam/python-intro/blob/release/Introduction_to_python_session_2.ipynb | |
# note the array 'pairs' seems to be empty after we have looked at it once, this must be a feature of 'zip'. | |
set_of_species = set() # define the set of species we have seen | |
family_genes = set() # define the set of genes in the family | |
pairs = zip(temp[0::2], temp[1::2]) | |
for pair in pairs: | |
(gene, species) = pair # unpack the tuple, eg. gives gene= SPAL_0000608500.1:mRNA species= (strongyloides_papillosus) | |
species = species[1:-1] # remove the parentheses | |
# get the gene name: | |
# eg. SRAE_X000112100.t1:mRNA/SSTP_0000225500.1:mRNA/SPAL_0001294400.1:mRNA/SVEN_0373900.1/PTRK_0001428200.1:mRNA/RSKR_0000682500.1:mRNA | |
if ':mRNA' in gene: | |
temp2 = gene.split(':mRNA') # eg. SRAE_X000112100.t1:mRNA | |
gene = temp2[0] # eg. SRAE_X000112100.t1 | |
if '-mRNA-' in gene: # eg. ASIM_0000655901-mRNA-1 | |
temp2 = gene.split('-mRNA-') | |
gene = temp2[0] # eg. ASIM_0000655901 | |
if '_' in gene and '.' in gene: # eg. SRAE_X000112100.t1 or MhA1_Contig1285.frz3.fgene2 | |
temp2 = gene.split('.') | |
afterdot = temp2[1] # eg. t1 or fgene2 | |
if (afterdot.startswith('t') or afterdot.isdigit() == True) and species != 'caenorhabditis_elegans': | |
gene = temp2[0] # eg. SRAE_X000112100 | |
if '.t' in gene: # eg. HCOI02162100.t1 | |
temp2 = gene.split('.t') | |
afterdot = temp2[1] # eg. 1 | |
if afterdot.isdigit() == True: | |
gene = temp2[0] # eg. HCOI02162100 | |
if species == our_species_name: | |
if locus_tag == 'NOO' or locus_tag == 'NLS' or locus_tag == 'NAV': | |
gene = get_new_gene_name(gene, locus_tag) | |
genes_in_families.add(gene) | |
fileObj.close() | |
return genes_in_families | |
#====================================================================# | |
# get the gene name from a line of the embl file: | |
def get_gene_name2(line): | |
""" get the name of a gene from line of an embl file | |
>>> get_gene_name2('FT /note="ID:gene:NAV_0000008934"') | |
'NAV_0000008934' | |
>>> get_gene_name2('FT /note="ID:cds:SMUV_0000034201-mRNA-1"') | |
'SMUV_0000034201' | |
""" | |
temp = line.split() | |
gene = temp[1] # /note="ID:gene:NAV_0000008934" | |
temp = gene.split("\"") | |
gene = temp[1] # ID:gene:NAV_0000008934 or ID:cds:SMUV_0000034201-mRNA-1 | |
temp = gene.split(":") | |
gene = temp[2] # NAV_0000008934 or SMUV_0000034201-mRNA-1 | |
temp = gene.split("-mRNA-") | |
gene = temp[0] # NAV_0000008934 or SMUV_0000034201 | |
return gene | |
#====================================================================# | |
# read in the input embl file to find genes that are already labelled dodgy: | |
def find_genes_labelled_dodgy_already(input_embl): | |
# make a set of genes labelled dodgy already: | |
genes_labelled_dodgy_already_set = set() | |
# read in the input file: | |
is_pseudo = False | |
found_a_gene = False | |
fileObj = open(input_embl, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
if '/note="ID:gene:' in line: # FT /note="ID:gene:NAV_0000008934" | |
gene = get_gene_name2(line) # eg. NAV_0000008934 | |
found_a_gene = True | |
is_pseudo = False | |
if found_a_gene == True: | |
if '/pseudo' in line: | |
is_pseudo = True | |
if '/standard_name' in line: | |
if is_pseudo == True: | |
assert(gene not in genes_labelled_dodgy_already_set) | |
genes_labelled_dodgy_already_set.add(gene) | |
found_a_gene = False | |
is_pseudo = False | |
fileObj.close() | |
return genes_labelled_dodgy_already_set | |
#====================================================================# | |
# read in the input embl file, and write out a new embl file: | |
def read_input_embl_and_write_output_embl(input_embl, output_embl, dodgy_genes_set, genes_in_families, genes_labelled_dodgy_already_set): | |
# open an output file: | |
outputfileObj = open(output_embl,"w") | |
# read in the input file: | |
fileObj = open(input_embl, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
if 'source:WormBase_imported' in line: | |
pass # don't print this line to the output | |
else: | |
if line.startswith('AC'): # AC HPLM_contig0000001; | |
output_line = "AC XXX;\n" | |
outputfileObj.write(output_line) | |
output_line = "XX\n" | |
outputfileObj.write(output_line) | |
output_line = make_embl_output_line1(line) | |
output_line = "%s\n" % output_line | |
outputfileObj.write(output_line) | |
elif '/note="ID:gene:' in line or 'note="ID:cds:' in line or 'note="ID:exon:' in line : # FT /note="ID:gene:NAV_0000008934" | |
# FT /note="ID:cds:SMUV_0000034201-mRNA-1" | |
# FT /note="ID:exon:SMUV_0000587401-mRNA-1.1" | |
gene = get_gene_name2(line) # eg. NAV_0000008934 | |
output_line = "%s\n" % line | |
outputfileObj.write(output_line) # write out the current line | |
if gene in dodgy_genes_set and gene not in genes_labelled_dodgy_already_set: | |
output_line = "FT /pseudo\n" | |
outputfileObj.write(output_line) | |
output_line = "FT /note=\"odd gene structure probably caused by assembly error\"\n" | |
outputfileObj.write(output_line) | |
if gene in genes_in_families: | |
output_line = "FT /note=\"predicted protein belongs to a gene family in the Compara database of the International Helminth Genomes Consortium\"\n" | |
outputfileObj.write(output_line) | |
else: | |
output_line = "%s\n" % line | |
outputfileObj.write(output_line) | |
fileObj.close() | |
outputfileObj.close() | |
return | |
#====================================================================# | |
# make a new output line with the accession: | |
def make_embl_output_line1(line): | |
"""make a new output line with the accession | |
>>> make_embl_output_line1('AC HPLM_contig0000001;') | |
'AC * _HPLM_contig0000001' | |
""" | |
temp = line.split() | |
scaffold = temp[1] # HPLM_contig0000001; | |
scaffold = scaffold[:-1] # HPLM_contig0000001 | |
output_line = "AC * _%s" % scaffold | |
return output_line | |
#====================================================================# | |
# define a function to trim off '0's at the start of a scaffold name: | |
def trim_off_zeroes(scaffold_name): | |
""" trim off zeroes at the start of a scaffold name: | |
>>> trim_off_zeroes("00001") | |
'1' | |
>>> trim_off_zeroes("00010") | |
'10' | |
""" | |
# trim off zeroes at the start: | |
finished = False | |
while finished == False: | |
if scaffold_name.startswith('0'): | |
scaffold_name = scaffold_name[1:] | |
else: | |
finished = True | |
return scaffold_name | |
#====================================================================# | |
# read in the set of genes/CDSs that look dodgy: | |
def find_dodgy_genes_and_cds(input_embl, locus_tag, line_nums_set): | |
dodgy_genes_set = set() | |
linecnt = 0 | |
searching_for_cds = False | |
fileObj = open(input_embl, "r") | |
for line in fileObj: | |
linecnt += 1 | |
if linecnt in line_nums_set: | |
searching_for_cds = True | |
line = line.rstrip() | |
# FT CDS complement(join(40602..44150,44239..44536,44646..44849, | |
# FT 46859..46950,47022..47072,47138..47194,47316..47395, | |
# FT 48487..48832,49403..49555,49625..49723,50906..50995, | |
# FT 52710..52783,55438..55570)) | |
# FT /locus_tag="HPLM_LOCUS3124" | |
# FT /codon_start=1 | |
# FT /note="ID:cds:HPLM_0000313201-mRNA-1" | |
# FT /transl_table=1 | |
if 'ID:cds:' in line: | |
if searching_for_cds == True: | |
# FT /note="ID:cds:HPLM_0000313201-mRNA-1" | |
temp = line.split("\"") | |
cds = temp[1] # ID:cds:HPLM_0000313201-mRNA-1 | |
temp = cds.split(":") | |
cds = temp[2] # HPLM_0000313201-mRNA-1 | |
assert('-mRNA-' in cds) | |
temp = cds.split("-mRNA-") | |
gene = temp[0] # HPLM_0000313201 | |
assert(gene not in dodgy_genes_set) | |
dodgy_genes_set.add(gene) | |
searching_for_cds = False | |
return dodgy_genes_set | |
#====================================================================# | |
# find the new name for a gene: | |
def get_new_gene_name(old_gene_name, locus_tag): | |
"""find the new name for a gene | |
>>> get_new_gene_name('nAv.1.0.1.g01019', 'NOO') | |
'NOO_0000001019' | |
""" | |
temp = old_gene_name.split('.g') | |
gene_name = temp[1] # 08934 | |
gene_name = trim_off_zeroes(gene_name) # 8934 | |
gene_name_len = len(gene_name) | |
num_zeroes_to_add = 10 - gene_name_len | |
zeroes_to_add = "0" * num_zeroes_to_add | |
new_gene_name = "%s_%s%s" % (locus_tag, zeroes_to_add, gene_name) | |
return new_gene_name | |
#====================================================================# | |
def main(): | |
# check the command-line arguments: | |
if len(sys.argv) != 7 or os.path.exists(sys.argv[1]) == False or os.path.exists(sys.argv[3]) == False or os.path.exists(sys.argv[6]) == False: | |
print("Usage: %s input_embl output_embl error_file locus_tag our_species families_file" % sys.argv[0]) | |
sys.exit(1) | |
input_embl = sys.argv[1] # e.g. haemonchus_placei_new2.embl | |
output_embl = sys.argv[2] # e.g. haemonchus_placei_new3_27nov2018.embl | |
error_file = sys.argv[3] # file with line number of errors | |
locus_tag = sys.argv[4] # eg. HPLM | |
our_species = sys.argv[5] # eg. haemonchus_placei | |
families_file = sys.argv[6] # /nfs/helminths02/analysis/50HGP/00ANALYSES/final_families/complete_families.txt_27mar2017 | |
# read in the line numbers of dodgy CDS: | |
line_nums_set = read_error_file(error_file) | |
# read in the set of genes/CDSs that look dodgy: | |
dodgy_genes_set = find_dodgy_genes_and_cds(input_embl, locus_tag, line_nums_set) | |
# read in the genes that are in families, for our species of interest: | |
genes_in_families = find_genes_in_families(families_file, our_species, locus_tag) | |
# read in the input embl file to find genes that are already labelled dodgy: | |
genes_labelled_dodgy_already_set = find_genes_labelled_dodgy_already(input_embl) | |
# read in the input embl file, and write out a new embl file: | |
read_input_embl_and_write_output_embl(input_embl, output_embl, dodgy_genes_set, genes_in_families, genes_labelled_dodgy_already_set) | |
print("FINISHED\n") | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment