Skip to content

Instantly share code, notes, and snippets.

@philippbayer
Created January 3, 2019 02:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save philippbayer/0052f5ad56121cd2252a1c5b90154ed1 to your computer and use it in GitHub Desktop.
Save philippbayer/0052f5ad56121cd2252a1c5b90154ed1 to your computer and use it in GitHub Desktop.
A stab at classifying NLR-Annotator output
'''
Prints a tab-separated table with one row per input file and subset, for example:
File Subset TN TCNL NL CN N TCN TNL CNL
assembly.fa_chopped_out.xml_txt complete 0 0 0 0 0 0 29 17
assembly.fa_chopped_out.xml_txt complete (pseudogene) 0 0 0 0 0 0 16 7
assembly.fa_chopped_out.xml_txt partial 3 0 1 0 4 1 0 0
assembly.fa_chopped_out.xml_txt partial (pseudogene) 1 0 0 5 1 2 2 2
'''
# Motif number -> domain label, following the motifs table in
# https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-13-75
motifs_d = {
    1: 'nb_arc_cnl_or_tnl',
    2: 'nb_arc_cnl',
    3: 'nb_arc_cnl_or_tnl',
    4: 'nb_arc_cnl_or_tnl',
    5: 'nb_arc_cnl_or_tnl',
    6: 'nb_arc_cnl',
    7: 'linker_cnl_or_tnl',
    8: 'linker_cnl_or_tnl',
    9: 'lrr_cnl_or_tnl',
    10: 'nb_arc_cnl_or_tnl',
    11: 'lrr_cnl_or_tnl',
    12: 'nb_arc_cnl_or_tnl',
    13: 'tir_tnl',
    14: 'monocot',
    15: 'tir_tnl',
    16: 'prenb_cnl',
    17: 'prenb_cnl',
    18: 'tir_tnl',
    19: 'lrr_cnl_or_tnl',
    20: 'monocot',
}
# Hand-curated class assignment (the author's own scheme): each key is a
# frozenset of domain labels (the same labels that appear as values in
# motifs_d) and each value is the class label given to that combination
# (TN, TCNL, NL, CN, N, TCN, TNL or CNL).
# The table is looked up with the exact set of labels found on a line, so any
# combination that is not listed here raises KeyError at lookup time.
# NOTE(review): a few entries look inconsistent with their neighbours, e.g.
# {'linker_cnl_or_tnl', 'nb_arc_cnl'} -> 'N' while
# {'nb_arc_cnl_or_tnl', 'nb_arc_cnl'} -> 'CN', and the linker/nb_arc_cnl/tir/
# nb_arc/lrr combination -> 'TNL' while the same set without the linker ->
# 'TCNL'. Confirm whether these are intentional.
class_dict = {frozenset(['lrr_cnl_or_tnl', 'monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl']):'TNL', frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']):'CNL',frozenset(['lrr_cnl_or_tnl', 'monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'TCNL', frozenset(['lrr_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']):'CNL',frozenset(['nb_arc_cnl_or_tnl', 'linker_cnl_or_tnl', 'lrr_cnl_or_tnl', 'monocot', 'prenb_cnl', 'nb_arc_cnl']):'CNL', frozenset(['monocot', 'nb_arc_cnl_or_tnl']):'N', frozenset(['linker_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']):'CN', frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'lrr_cnl_or_tnl']):'CNL', frozenset(['lrr_cnl_or_tnl', 'tir_tnl', 'prenb_cnl', 'nb_arc_cnl_or_tnl']):'TCNL', frozenset(['monocot', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'CN', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']):'TNL', frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']):'NL', frozenset(['nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'CN', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'prenb_cnl', 'lrr_cnl_or_tnl']):'CNL', frozenset(['linker_cnl_or_tnl', 'monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl']):'TN',frozenset(['nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']):'CN', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'lrr_cnl_or_tnl']):'CNL', frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl']):'N', frozenset(['tir_tnl', 'prenb_cnl', 'nb_arc_cnl_or_tnl']):'TCN', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']):'NL', frozenset(['lrr_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'CNL', frozenset(['linker_cnl_or_tnl', 'tir_tnl', 'prenb_cnl', 'nb_arc_cnl_or_tnl']):'TCN', frozenset(['linker_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'TCN', frozenset(['tir_tnl', 'nb_arc_cnl_or_tnl']):'TN', frozenset(['lrr_cnl_or_tnl', 'monocot', 
'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']):'CNL', frozenset(['monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl']):'TN', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'tir_tnl', 'prenb_cnl', 'lrr_cnl_or_tnl']):'TCNL', frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl', 'lrr_cnl_or_tnl']):'CNL', frozenset(['nb_arc_cnl_or_tnl', 'prenb_cnl']):'CN', frozenset(['lrr_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'TCNL', frozenset(['lrr_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl']):'TNL', frozenset(['linker_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl']):'TN', frozenset(['linker_cnl_or_tnl', 'monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']):'TNL', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'lrr_cnl_or_tnl']):'CNL', frozenset(['tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'TCN', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']):'CNL', frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']):'CN', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl']):'N', frozenset(['lrr_cnl_or_tnl', 'nb_arc_cnl_or_tnl']):'NL', frozenset(['lrr_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl']):'NL', frozenset(['monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'TCN', frozenset(['linker_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']):'TNL', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'lrr_cnl_or_tnl']):'CNL', frozenset(['nb_arc_cnl_or_tnl']):'N', frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'CN', frozenset(['tir_tnl', 'nb_arc_cnl_or_tnl', 'linker_cnl_or_tnl', 'lrr_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']):'TCNL', frozenset(['lrr_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl']):'CNL', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']):'CN', frozenset(['monocot', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']):'CN', frozenset(['tir_tnl', 'nb_arc_cnl_or_tnl', 'linker_cnl_or_tnl', 
'lrr_cnl_or_tnl', 'monocot', 'nb_arc_cnl']):'TCNL', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl']):'N', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'CN', frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl']):'CN', frozenset(['lrr_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']):'CNL'}
# Input files (NLR-Parser txt output): replace XXX with the paths, one per line.
# Blank lines - e.g. a trailing newline after the last path - are dropped so
# that an empty string is never handed to open() as a file name.
files = [path for path in '''XXX'''.splitlines() if path.strip()]
from collections import defaultdict
# Class labels in the order their count columns appear in the printed table.
all_types = 'TN TCNL NL CN N TCN TNL CNL'.split()
# Two identifying columns first, then one count column per class.
header = ['File', 'Subset']
header.extend(all_types)
# The header row is emitted once, ahead of all per-file rows.
print('\t'.join(header))
# For every input file, tally how many lines fall into each class within each
# subset, then print one tab-separated row per (file, subset).
for f in files:
    # count_dict[subset][class label] -> number of lines seen so far
    count_dict = {}
    for a in ['complete', 'complete (pseudogene)', 'partial', 'partial (pseudogene)']:
        count_dict[a] = dict.fromkeys(all_types, 0)

    # 'with' guarantees the handle is closed even if a line fails to parse
    with open(f) as handle:
        for line_number, line in enumerate(handle, 1):
            stripped = line.rstrip()
            if not stripped:
                # blank line (e.g. at the end of the file): nothing to classify
                continue
            ll = stripped.split('\t')
            # the last column holds the comma-separated motif numbers;
            # translate them into the set of domain labels seen on this line
            this_domains = frozenset(motifs_d[int(m)] for m in ll[-1].split(','))
            try:
                this_class = class_dict[this_domains]
            except KeyError:
                # still a KeyError, but say which combination is missing
                raise KeyError(
                    '%s line %d: no class assigned to domain combination %s'
                    % (f, line_number, sorted(this_domains))
                )
            # the third column selects the subset the line is counted under
            count_dict[ll[2]][this_class] += 1

    # label the rows with the last three path components joined by '_'
    file_label = '_'.join(f.split('/')[-3:])
    for a in sorted(count_dict):
        thisll = [file_label, a] + [count_dict[a][x] for x in all_types]
        print('\t'.join(map(str, thisll)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment