Created
January 3, 2019 02:50
-
-
Save philippbayer/0052f5ad56121cd2252a1c5b90154ed1 to your computer and use it in GitHub Desktop.
A stab at classifying NLR-Annotator output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
This will print a tab-separated table like
File Subset TN TCNL NL CN N TCN TNL CNL
assembly.fa_chopped_out.xml_txt complete 0 0 0 0 0 0 29 17
assembly.fa_chopped_out.xml_txt complete (pseudogene) 0 0 0 0 0 0 16 7
assembly.fa_chopped_out.xml_txt partial 3 0 1 0 4 1 0 0
assembly.fa_chopped_out.xml_txt partial (pseudogene) 1 0 0 5 1 2 2 2
'''
# Motif number -> domain annotation, taken from the motifs table in
# https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-13-75
# Several motif numbers share one annotation, so a list of motifs collapses
# to a (smaller) set of domain names before classification.
motifs_d = {
    1: 'nb_arc_cnl_or_tnl',
    2: 'nb_arc_cnl',
    3: 'nb_arc_cnl_or_tnl',
    4: 'nb_arc_cnl_or_tnl',
    5: 'nb_arc_cnl_or_tnl',
    6: 'nb_arc_cnl',
    7: 'linker_cnl_or_tnl',
    8: 'linker_cnl_or_tnl',
    9: 'lrr_cnl_or_tnl',
    10: 'nb_arc_cnl_or_tnl',
    11: 'lrr_cnl_or_tnl',
    12: 'nb_arc_cnl_or_tnl',
    13: 'tir_tnl',
    14: 'monocot',
    15: 'tir_tnl',
    16: 'prenb_cnl',
    17: 'prenb_cnl',
    18: 'tir_tnl',
    19: 'lrr_cnl_or_tnl',
    20: 'monocot',
}
# This is my idea of class assignment: an explicit lookup table from the set
# of domain annotations found in one NLR to a class label (T = TIR,
# C = CNL-specific motif, N = NB-ARC, L = LRR).
# The table is hand-written, not derived from a rule; a domain combination
# that is not listed here has no class and a plain lookup raises KeyError.
class_dict = {
    frozenset(['lrr_cnl_or_tnl', 'monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl']): 'TNL',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']): 'CNL',
    frozenset(['lrr_cnl_or_tnl', 'monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'TCNL',
    frozenset(['lrr_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']): 'CNL',
    frozenset(['nb_arc_cnl_or_tnl', 'linker_cnl_or_tnl', 'lrr_cnl_or_tnl', 'monocot', 'prenb_cnl', 'nb_arc_cnl']): 'CNL',
    frozenset(['monocot', 'nb_arc_cnl_or_tnl']): 'N',
    frozenset(['linker_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']): 'CN',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'lrr_cnl_or_tnl']): 'CNL',
    frozenset(['lrr_cnl_or_tnl', 'tir_tnl', 'prenb_cnl', 'nb_arc_cnl_or_tnl']): 'TCNL',
    frozenset(['monocot', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'CN',
    # NOTE(review): TIR + nb_arc_cnl + LRR is 'TCNL' in the other entries of
    # this table but 'TNL' here - confirm this is intended.
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']): 'TNL',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']): 'NL',
    frozenset(['nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'CN',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'prenb_cnl', 'lrr_cnl_or_tnl']): 'CNL',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl']): 'TN',
    frozenset(['nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']): 'CN',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'lrr_cnl_or_tnl']): 'CNL',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl']): 'N',
    frozenset(['tir_tnl', 'prenb_cnl', 'nb_arc_cnl_or_tnl']): 'TCN',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']): 'NL',
    frozenset(['lrr_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'CNL',
    frozenset(['linker_cnl_or_tnl', 'tir_tnl', 'prenb_cnl', 'nb_arc_cnl_or_tnl']): 'TCN',
    frozenset(['linker_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'TCN',
    frozenset(['tir_tnl', 'nb_arc_cnl_or_tnl']): 'TN',
    frozenset(['lrr_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']): 'CNL',
    frozenset(['monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl']): 'TN',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'tir_tnl', 'prenb_cnl', 'lrr_cnl_or_tnl']): 'TCNL',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl', 'lrr_cnl_or_tnl']): 'CNL',
    frozenset(['nb_arc_cnl_or_tnl', 'prenb_cnl']): 'CN',
    frozenset(['lrr_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'TCNL',
    frozenset(['lrr_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl']): 'TNL',
    frozenset(['linker_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl']): 'TN',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']): 'TNL',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'lrr_cnl_or_tnl']): 'CNL',
    frozenset(['tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'TCN',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']): 'CNL',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']): 'CN',
    # NOTE(review): nb_arc_cnl yields a 'C' in the other entries of this table
    # but this combination is plain 'N' - confirm this is intended.
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl']): 'N',
    frozenset(['lrr_cnl_or_tnl', 'nb_arc_cnl_or_tnl']): 'NL',
    frozenset(['lrr_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl']): 'NL',
    frozenset(['monocot', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'TCN',
    frozenset(['linker_cnl_or_tnl', 'tir_tnl', 'nb_arc_cnl_or_tnl', 'lrr_cnl_or_tnl']): 'TNL',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'lrr_cnl_or_tnl']): 'CNL',
    frozenset(['nb_arc_cnl_or_tnl']): 'N',
    frozenset(['linker_cnl_or_tnl', 'monocot', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'CN',
    frozenset(['tir_tnl', 'nb_arc_cnl_or_tnl', 'linker_cnl_or_tnl', 'lrr_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']): 'TCNL',
    frozenset(['lrr_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl']): 'CNL',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']): 'CN',
    frozenset(['monocot', 'nb_arc_cnl_or_tnl', 'prenb_cnl', 'nb_arc_cnl']): 'CN',
    frozenset(['tir_tnl', 'nb_arc_cnl_or_tnl', 'linker_cnl_or_tnl', 'lrr_cnl_or_tnl', 'monocot', 'nb_arc_cnl']): 'TCNL',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl']): 'N',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'CN',
    frozenset(['linker_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'prenb_cnl']): 'CN',
    frozenset(['lrr_cnl_or_tnl', 'nb_arc_cnl_or_tnl', 'nb_arc_cnl']): 'CNL',
}
# A newline-separated string (or a list) of input files - NLR-Parser txt output.
# Blank entries, e.g. from a trailing newline in a pasted list, are dropped so
# they are never handed to open().
files = [name for name in '''XXX'''.split('\n') if name.strip()]
from collections import defaultdict  # not used below; kept from the original

# Column order of the printed table.
all_types = ['TN', 'TCNL', 'NL', 'CN', 'N', 'TCN', 'TNL', 'CNL']
header = ['File', 'Subset'] + all_types
# Row categories; matched against the third tab-separated field of each line.
subsets = ['complete', 'complete (pseudogene)', 'partial', 'partial (pseudogene)']


def classify_motifs(motif_field, motif_domains=None, domain_classes=None):
    """Return the class label for a comma-separated field of motif numbers.

    motif_field    -- e.g. '1,13,9'; every item must parse as an int.
    motif_domains  -- motif number -> domain name; defaults to the
                      module-level motifs_d.
    domain_classes -- frozenset of domain names -> class label; defaults to
                      the module-level class_dict.

    Raises ValueError for a non-numeric motif and KeyError for an unknown
    motif number or for a domain combination missing from domain_classes.
    """
    # Resolve the defaults at call time so the lookup tables only need to
    # exist when they are actually used.
    if motif_domains is None:
        motif_domains = motifs_d
    if domain_classes is None:
        domain_classes = class_dict
    # Repeated motifs and motif order are irrelevant: only the set of domains
    # they map to selects the class.
    domains = frozenset(motif_domains[int(m)] for m in motif_field.split(','))
    return domain_classes[domains]


def count_classes(lines, motif_domains=None, domain_classes=None, source='<input>'):
    """Tally class labels per subset over an iterable of tab-separated lines.

    For each non-blank line the third field selects the subset and the last
    field holds the comma-separated motif numbers. Returns a dict
    {subset: {class label: count}} with every subset/label pair present.

    source is only used to name the input in error messages.

    Raises KeyError (naming source and line number) for an unknown motif
    number, motif combination or subset.
    """
    counts = {subset: dict.fromkeys(all_types, 0) for subset in subsets}
    for line_number, line in enumerate(lines, start=1):
        # A blank line (typically the last one) carries no record.
        if not line.strip():
            continue
        fields = line.rstrip().split('\t')
        try:
            label = classify_motifs(fields[-1], motif_domains, domain_classes)
            counts[fields[2]][label] += 1
        except KeyError as err:
            raise KeyError(
                '{}, line {}: unknown motif number, motif combination or '
                'subset ({}): {!r}'.format(source, line_number, err, line.rstrip())
            ) from err
    return counts


def main():
    """Print the header, then one row of class counts per file and subset."""
    print('\t'.join(header))
    for path in files:
        # The context manager closes the file even if a line fails to parse.
        with open(path) as handle:
            counts = count_classes(handle, source=path)
        # Label the rows with the last three path components joined by '_'.
        label = '_'.join(path.split('/')[-3:])
        for subset in sorted(counts):
            row = [label, subset] + [counts[subset][t] for t in all_types]
            print('\t'.join(map(str, row)))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment