Last active
May 17, 2023 21:29
-
-
Save MrCreosote/925c721683b1fe59d76434bddc1bc4e4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Quick-and-dirty script that maps microtrait traits to the gene / HMM names found in their rules.
""" | |
# Input tables; each one is read line by line and split on tab characters.
_RULE2_TRAIT = "./microtrait/data-raw/microtrait_rule2trait.txt"
_SUBSTRATE2_RULE = "./microtrait/data-raw/microtrait_substrate2rule.txt"
_RULE_UNWRAPPED = "./microtrait/data-raw/microtrait_ruleunwrapped.txt"
# Output file: one tab-separated line per trait (trait name, then sorted names).
_OUT = "./traits2genes.tsv"
from collections import defaultdict | |
import json | |
# Lookup tables populated by the parsing passes of this script.
trait2rule = defaultdict(set)       # trait name -> set of rule names
substrate2rule = defaultdict(set)   # substrate   -> set of rule names
trait2substrate = defaultdict(set)  # trait name -> set of substrates
rule2unwrapped = {}                 # rule name  -> list of names in its unwrapped rule

# Rule table: seven tab-separated columns per line. Column 1 is the rule
# name, column 3 a ';'-separated substrate list (may be empty) and columns
# 4-6 hold up to three trait names; the remaining columns are ignored.
with open(_RULE2_TRAIT) as f:
    f.readline()  # discard the first line (presumably a header row)
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing the 7-way unpack below.
            continue
        rule_name, _, substrate, trait1, trait2, trait3, _ = line.split("\t")
        for t in (trait1, trait2, trait3):
            # Normalise whitespace so these keys agree with the stripped
            # trait names stored in trait2substrate.
            t = t.strip()
            if t:
                trait2rule[t].add(rule_name)
        if substrate:
            for s in substrate.split(";"):
                substrate2rule[s].add(rule_name)
# Substrate table: seven tab-separated columns per line. The first column is
# the substrate and the last three are trait names; every non-blank trait is
# recorded as trait -> substrate.
with open(_SUBSTRATE2_RULE) as handle:
    handle.readline()  # discard the first line (presumably a header row)
    for row in handle:
        substrate, _, _, _, trait1, trait2, trait3 = row.split("\t")
        # The last column still carries the newline, hence the strip.
        for trait in map(str.strip, (trait1, trait2, trait3)):
            if trait:
                trait2substrate[trait].add(substrate)
# Unwrapped-rule table: four tab-separated columns per line; the first is the
# rule name and the last is the unwrapped rule expression. Deleting the
# operator / quote characters leaves whitespace-separated names, which are
# stored as a list per rule.
# NOTE(review): unlike the other two inputs, no first line is skipped here --
# presumably this file has no header row; confirm.
_rule_syntax = str.maketrans("", "", "!()|&'")
with open(_RULE_UNWRAPPED) as handle:
    for row in handle:
        rule, _, _, unwrapped = row.split("\t")
        rule2unwrapped[rule] = unwrapped.translate(_rule_syntax).split()
def set_default(obj):
    """Serialize sets as lists; intended as the ``default`` hook of ``json.dumps``.

    Args:
        obj: The object the JSON encoder could not serialize by itself.

    Returns:
        A list with the members of ``obj``. The list is sorted when the
        members can be ordered, so repeated runs produce identical output;
        otherwise the members are returned in the set's own iteration order.

    Raises:
        TypeError: If ``obj`` is not a ``set``.
    """
    if isinstance(obj, set):
        try:
            return sorted(obj)
        except TypeError:
            # Members that cannot be compared with each other (e.g. int and
            # str mixed): fall back to plain set order rather than failing.
            return list(obj)
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(obj).__name__)
    )
# Debug output: for each lookup table print its name and entry count, then
# the table itself as indented JSON (sets are converted by set_default).
# A blank line separates consecutive dumps; none follows the last one.
_tables = (
    ("trait2rule", trait2rule),
    ("substrate2rule", substrate2rule),
    ("trait2substrate", trait2substrate),
    ("rule2unwrapped", rule2unwrapped),
)
for position, (label, table) in enumerate(_tables):
    if position:
        print()
    print(label, len(table))
    print(json.dumps(table, default=set_default, indent=2))
# For every trait linked to substrates, add the rules recorded for each of
# those substrates to the trait's own rule set.
for trait, substrates in trait2substrate.items():
    for sub in substrates:
        trait2rule[trait] |= substrate2rule[sub]

# Dump the merged trait -> rules table, preceded by a blank line.
print()
print("trait2rule updated", len(trait2rule))
print(json.dumps(trait2rule, default=set_default, indent=2))
# Write the result: one line per trait, holding the trait name followed by
# the sorted, de-duplicated names collected from all of its rules, separated
# by tabs. A rule missing from rule2unwrapped raises KeyError.
with open(_OUT, "w") as out:
    for trait, rule_names in trait2rule.items():
        names = set().union(*(rule2unwrapped[rule] for rule in rule_names))
        fields = [trait, *sorted(names)]
        out.write("\t".join(fields) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment