Skip to content

Instantly share code, notes, and snippets.

@MrCreosote
Last active May 17, 2023 21:29
Show Gist options
  • Save MrCreosote/925c721683b1fe59d76434bddc1bc4e4 to your computer and use it in GitHub Desktop.
Save MrCreosote/925c721683b1fe59d76434bddc1bc4e4 to your computer and use it in GitHub Desktop.
"""
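Quick-and-dirty script that maps microtrait traits to gene / HMM names.

Reads three tab-separated microtrait data files, links every trait to its rules
(directly and via the trait's substrates), and writes one line per trait to a
TSV file: the trait followed by the sorted names found in those rules.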
"""
# Input table read with its first line skipped as a header. Each row is split
# into 7 tab-separated fields: rule name, an ignored field, a ";"-separated
# substrate field, three trait fields, and a final ignored field.
_RULE2_TRAIT = "./microtrait/data-raw/microtrait_rule2trait.txt"
# Input table read with its first line skipped as a header. Each row is split
# into 7 tab-separated fields: substrate, three ignored fields, and three trait
# fields. Despite the constant's name, the script uses it to build the
# trait -> substrate mapping.
_SUBSTRATE2_RULE = "./microtrait/data-raw/microtrait_substrate2rule.txt"
# Input table read WITHOUT skipping a header line. Each row is split into 4
# tab-separated fields: rule name, two ignored fields, and the rule's unwrapped
# expression.
_RULE_UNWRAPPED = "./microtrait/data-raw/microtrait_ruleunwrapped.txt"
# Output file: one tab-separated line per trait, the trait name followed by the
# sorted names collected from all of the trait's rules.
_OUT = "./traits2genes.tsv"
from collections import defaultdict
import json
# trait name -> set of rule names. Filled from _RULE2_TRAIT and later extended
# with the rules of every substrate associated with the trait.
trait2rule = defaultdict(set)
# substrate name -> set of rule names whose _RULE2_TRAIT row lists the substrate.
substrate2rule = defaultdict(set)
# trait name -> set of substrate names, filled from _SUBSTRATE2_RULE.
trait2substrate = defaultdict(set)
# rule name -> list of whitespace-separated tokens left in the rule's unwrapped
# expression once the characters !()|&' are removed (presumably the gene / hmm
# names mentioned in the module docstring).
rule2unwrapped = {}
# Fill trait2rule and substrate2rule from the rule -> trait table.
with open(_RULE2_TRAIT) as f:
    f.readline()  # skip the header line
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a stray newline at the end of the file)
            # instead of failing in the tuple unpacking below.
            continue
        # Every data row must have exactly 7 tab-separated columns; a malformed
        # row still raises ValueError here rather than being silently dropped.
        rule_name, _, substrate, trait1, trait2, trait3, _ = line.split("\t")
        for t in (trait1, trait2, trait3):
            # Strip so these keys agree with the trait names stored in
            # trait2substrate, which are stripped when they are read; the two
            # mappings are joined on the trait name later in the script.
            t = t.strip()
            if t:
                trait2rule[t].add(rule_name)
        # The substrate column may list several substrates separated by ";".
        # NOTE(review): substrate names are deliberately left unstripped so they
        # keep matching the (also unstripped) substrate column read from
        # _SUBSTRATE2_RULE.
        if substrate:
            for s in substrate.split(";"):
                substrate2rule[s].add(rule_name)
# Fill trait2substrate from the substrate table.
with open(_SUBSTRATE2_RULE) as f:
    f.readline()  # skip the header line
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a stray newline at the end of the file)
            # instead of failing in the tuple unpacking below.
            continue
        # Every data row must have exactly 7 tab-separated columns; a malformed
        # row still raises ValueError here rather than being silently dropped.
        substrate, _, _, _, trait1, trait2, trait3 = line.split("\t")
        for t in (trait1, trait2, trait3):
            # trait3 is the last column and still carries the line terminator,
            # so every trait is stripped once before it is tested and stored.
            t = t.strip()
            if t:
                trait2substrate[t].add(substrate)
# Fill rule2unwrapped: rule name -> names appearing in its unwrapped expression.
# No header line is skipped for this file.
with open(_RULE_UNWRAPPED) as f:
    # Translation table that deletes the boolean-expression syntax characters
    # in a single pass (equivalent to removing each of them one by one).
    # NOTE(review): the characters are deleted, not replaced by spaces, so two
    # names joined only by an operator (e.g. "a&b") would fuse into one token;
    # this relies on the data separating names with whitespace - confirm.
    syntax_chars = str.maketrans("", "", "!()|&'")
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a stray newline at the end of the file)
            # instead of failing in the tuple unpacking below.
            continue
        # Every data row must have exactly 4 tab-separated columns; a malformed
        # row still raises ValueError here rather than being silently dropped.
        rule, _, _, unwrapped = line.split("\t")
        # split() with no argument already discards all surrounding whitespace,
        # including the trailing newline of this last column.
        rule2unwrapped[rule] = unwrapped.translate(syntax_chars).split()
def set_default(obj):
    """Serialize sets as lists; intended as the ``default=`` hook of ``json.dumps``.

    Args:
        obj: The object ``json`` could not serialize on its own.

    Returns:
        A list holding the elements of ``obj`` when it is a ``set`` or
        ``frozenset``. The list is sorted when the elements are mutually
        orderable, so the dumped JSON is reproducible between runs; otherwise
        the elements are returned in the set's iteration order.

    Raises:
        TypeError: If ``obj`` is not a set, mirroring what ``json`` itself
            raises for unsupported types.
    """
    if isinstance(obj, (set, frozenset)):
        try:
            return sorted(obj)
        except TypeError:
            # Elements of mixed, non-comparable types: fall back to plain
            # iteration order rather than failing the whole dump.
            return list(obj)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
# Debug output: print each intermediate mapping as a "<name> <entry count>"
# header followed by its pretty-printed JSON, with one blank line between
# consecutive mappings (and none after the last).
for position, (label, mapping) in enumerate(
    [
        ("trait2rule", trait2rule),
        ("substrate2rule", substrate2rule),
        ("trait2substrate", trait2substrate),
        ("rule2unwrapped", rule2unwrapped),
    ]
):
    if position:
        print()
    print(label, len(mapping))
    print(json.dumps(mapping, default=set_default, indent=2))
# Extend every trait's rule set with the rules of each substrate associated
# with that trait, then print the merged mapping for inspection.
for t, substrates in trait2substrate.items():
    for substrate in substrates:
        # Indexing trait2rule (a defaultdict) is intentional: traits that only
        # appear in trait2substrate get an entry here.
        # substrate2rule is read with .get() so that substrates without any
        # rule are not inserted into it as empty entries by the lookup.
        trait2rule[t].update(substrate2rule.get(substrate, ()))
print()
print("trait2rule updated", len(trait2rule))
print(json.dumps(trait2rule, default=set_default, indent=2))
# Write the final table: one line per trait, the trait name followed by the
# sorted, de-duplicated names collected from all of the trait's rules.
with open(_OUT, "w") as out:
    for t, rules in trait2rule.items():
        genes = set()
        for r in rules:
            try:
                genes.update(rule2unwrapped[r])
            except KeyError:
                # Still a KeyError, but say which rule / trait / file is
                # inconsistent instead of surfacing only the bare key.
                raise KeyError(
                    f"rule {r!r} (trait {t!r}) has no entry in {_RULE_UNWRAPPED}"
                ) from None
        out.write("\t".join([t] + sorted(genes)) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment