Skip to content

Instantly share code, notes, and snippets.

@a-recknagel
Created August 2, 2023 20:35
Show Gist options
  • Save a-recknagel/fe5edb85f4b2ec389dcd029826b04a88 to your computer and use it in GitHub Desktop.
Save a-recknagel/fe5edb85f4b2ec389dcd029826b04a88 to your computer and use it in GitHub Desktop.
popmap_deduplicator
from collections import defaultdict
from os import path
def split_off_duplicate_individuals(mapping):
    """Greedily partition groups so the kept ones share no individual.

    Groups are visited in the iteration order of ``mapping``. A group is
    kept when neither of its two individuals already belongs to a kept
    group; otherwise it is set aside untouched.

    Args:
        mapping: Dict-like object mapping a group key to exactly two
            individuals.

    Returns:
        A ``(kept, set_aside)`` tuple of dicts, each mapping a group key
        to a fresh two-element list ``[idv_a, idv_b]``.
    """
    taken = set()  # individuals already claimed by a kept group
    kept, set_aside = {}, {}
    for group, (first, second) in mapping.items():
        if first in taken or second in taken:
            # Individuals of a rejected group are deliberately NOT marked
            # as taken, so they stay available to later groups.
            set_aside[group] = [first, second]
            continue
        kept[group] = [first, second]
        taken.add(first)
        taken.add(second)
    return kept, set_aside
def main(source, sink, prefix):
    """Split a pairwise popmap file into files without repeated individuals.

    ``source`` is read as text with one ``individual<TAB>group`` record per
    line; blank lines are ignored. Records are collected by group, every
    group must end up with exactly two individuals, and the groups are then
    repeatedly passed through ``split_off_duplicate_individuals`` until none
    are left over. Each resulting subset is written to
    ``<sink>/<prefix><n>`` (``n`` counting up from 0) in the same
    ``individual<TAB>group`` format, two lines per group.

    Args:
        source: Path of the tab-separated input file.
        sink: Directory the output files are written into. It is not
            created here, so it has to exist already.
        prefix: File name prefix shared by all output files; the subset
            index is appended to it.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields, or if any group does not have exactly
            two individuals.
    """
    # read data and build a map keyed by group
    data = defaultdict(list)
    with open(source, "r") as f:
        for line_no, line in enumerate(f, start=1):
            record = line.strip()
            if not record:
                # tolerate empty lines, e.g. a trailing newline at EOF
                continue
            fields = record.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    "{}: line {}: expected 2 tab-separated fields, got {}".format(
                        source, line_no, len(fields)
                    )
                )
            idv, group = fields
            data[group].append(idv)

    # sanity check -- raised explicitly instead of asserted so that it
    # still runs when Python is started with -O
    malformed = [group for group, idvs in data.items() if len(idvs) != 2]
    if malformed:
        raise ValueError(
            "groups without exactly 2 individuals: {}".format(
                ", ".join(malformed)
            )
        )

    # generate subsets with no duplicate individuals; every pass keeps at
    # least the first remaining group, so the loop is guaranteed to end
    subsets = []
    while data:
        uniques, dupes = split_off_duplicate_individuals(data)
        subsets.append(uniques)
        data = dupes

    # transform subsets back into the original format and write them to disk
    for n, subset in enumerate(subsets):
        content = []
        for group, (idv_a, idv_b) in subset.items():
            content.append("{}\t{}\n".format(idv_a, group))
            content.append("{}\t{}\n".format(idv_b, group))
        with open(path.join(sink, prefix + str(n)), "w") as f:
            f.writelines(content)
if __name__ == "__main__":
    # Script entry point: run the split with fixed, machine-specific paths.
    main(
        source="/home/me/data/popmap_pairwise_combos.txt",
        # the output directory needs to exist before running the script
        sink="/home/me/data/popmap_combo_groups",
        # shared name of all resulting files, index is added automatically
        prefix="popmap_pairwise_unique_combos_",
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment