Skip to content

Instantly share code, notes, and snippets.

@Colelyman
Created April 8, 2019 23:30
Show Gist options
  • Save Colelyman/6ce2698899dd0ce7a232ddf757fc260c to your computer and use it in GitHub Desktop.
Save Colelyman/6ce2698899dd0ce7a232ddf757fc260c to your computer and use it in GitHub Desktop.
d_grimshawi
d_virilis
d_mojavensis
d_willistoni
d_persimilis
d_pseudoobscura
d_ananassae
d_erecta
d_yakuba
d_melanogaster
d_sechellia
d_simulans
      /-d_grimshawi
   /-|
  |  |   /-d_virilis
  |   \-|
--|      \-d_mojavensis
  |
  |   /-d_willistoni
  |  |
   \-|      /-d_persimilis
     |   /-|
     |  |   \-d_pseudoobscura
      \-|
        |   /-d_ananassae
        |  |
         \-|      /-d_erecta
           |   /-|
           |  |   \-d_yakuba
            \-|
              |   /-d_melanogaster
               \-|
                 |   /-d_sechellia
                  \-|
                     \-d_simulans
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1'] 4095
['1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0'] 3584
['0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0'] 1536
['0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1'] 511
['0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1'] 255
['0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0'] 192
['0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1'] 63
['0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1'] 31
['0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0'] 24
['0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1'] 7
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1'] 3
import argparse
from ete3 import Tree
def get_genome_ids(path):
    """Map each genome name listed in a file to its zero-based position.

    The file is expected to hold one genome name per line. Surrounding
    whitespace is stripped from every name. Blank lines are ignored so that
    they neither claim an id nor inflate ``len()`` of the returned mapping,
    which callers use as the width of their bit vectors.

    Args:
        path: Path to the genome list file.

    Returns:
        A dict mapping genome name -> consecutive integer id (0, 1, 2, ...),
        numbered in the order the names appear in the file.

    Raises:
        ValueError: If the same genome name is listed more than once. A
            repeated name would otherwise leave a gap in the ids, so the
            largest id could exceed ``len(result) - 1``.
    """
    genomes = {}
    with open(path) as fh:
        for line in fh:
            name = line.strip()
            if not name:
                # Blank (or whitespace-only) line: not a genome.
                continue
            if name in genomes:
                raise ValueError(
                    'Duplicate genome name in {!r}: {!r}'.format(path, name)
                )
            # Using the current size keeps ids dense even when lines are skipped.
            genomes[name] = len(genomes)
    return genomes
def get_clade_ints(path, genomes):
    """Encode every internal node of a tree as a genome-membership integer.

    The tree is loaded by passing ``path`` to ``Tree`` and printed. For each
    non-leaf node (visited in preorder) the non-empty names found in that
    node's own preorder traversal are collected. Each collection is turned
    into a string of '0'/'1' characters, one per entry of ``genomes``, where
    the character at index ``genomes[name]`` is '1' for every collected name.
    That string is read as a base-2 number, so the genome with id 0 is the
    most significant bit. Each character list and its integer are printed.

    Args:
        path: Value handed to ``Tree`` (the script passes the newick file
            path).
        genomes: Mapping of genome name -> index into the bit string.

    Returns:
        A set of the integers computed for the internal nodes.

    Raises:
        KeyError: If a collected name is missing from ``genomes``.
    """
    tree = Tree(path)
    print(tree)

    # One set of names per internal node, kept in preorder.
    # NOTE(review): the inner traversal includes the internal node itself and
    # any internal descendants, so a named internal node would also be looked
    # up in ``genomes`` -- confirm the input trees only name their leaves.
    clades = [
        {member.name
         for member in inner.traverse('preorder')
         if len(member.name) > 0}
        for inner in tree.traverse('preorder')
        if not inner.is_leaf()
    ]

    encoded = set()
    for members in clades:
        bits = ['0'] * len(genomes)
        for name in members:
            bits[genomes[name]] = '1'
        value = int(''.join(bits), 2)
        print(bits, value)
        encoded.add(value)
    return encoded
if __name__ == '__main__':
    # Script entry point: read the two positional paths, echo the genome list
    # path, load the genome ids, then print the clade encodings of the tree.
    # NOTE(review): the description below mentions building a CdBG with
    # kcollections, which nothing in this script does -- confirm whether it
    # was carried over from another tool.
    cli = argparse.ArgumentParser(
        description='Builds a CdBG using kcollections',
    )
    cli.add_argument(
        'newick_file',
        type=str,
        help='Path to the newick file.',
    )
    cli.add_argument(
        'genomes',
        type=str,
        help='Path to a file that lists the order of the genomes (one per line).',
    )
    options = cli.parse_args()

    print(options.genomes)
    genome_ids = get_genome_ids(options.genomes)
    # The returned set is not used here; the function prints its results.
    get_clade_ints(options.newick_file, genome_ids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment