Skip to content

Instantly share code, notes, and snippets.

@dogancankilment
Forked from meren/toNewick.py
Last active August 13, 2016 20:43
Show Gist options
  • Save dogancankilment/5f19281a0a89daa501b2 to your computer and use it in GitHub Desktop.
Save dogancankilment/5f19281a0a89daa501b2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8
# Copyright (C) 2014, A. Murat Eren
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.
import sys
import argparse
import hcluster
from ete2 import Tree
parser = argparse.ArgumentParser(description='Takes an observation matrix, returns a newick tree')
parser.add_argument('input_matrix', metavar='PATH', default=None,
help='Input matrix')
parser.add_argument('-o', '--output-file-name', metavar='PATH', default=None,
help='Output file name')
args = parser.parse_args()
def normalization(root):
farthest_node = root.get_farthest_node()
circle_radius = farthest_node[0].get_distance(root)
for cr_node in root.traverse("levelorder"):
if not cr_node.is_leaf():
farthest_leaf = cr_node.get_farthest_leaf()[0]
dist_to_root = farthest_leaf.get_distance(root)
diff = circle_radius - dist_to_root
cr_node.dist = cr_node.dist + diff
else:
cr_node_parent = cr_node.up
farthest_leaf = cr_node_parent.get_farthest_leaf()[0]
farthest_leaf_dist = farthest_leaf.get_distance(root)
cr_node_dist = cr_node.get_distance(root)
diff = farthest_leaf_dist - cr_node_dist
cr_node.dist = cr_node.dist + diff
def get_newick_tree_data(observation_matrix_path, output_file_name=None, clustering_distance='euclidean',
clustering_method='complete'):
vectors = []
id_to_sample_dict = {}
input_matrix = open(observation_matrix_path)
input_matrix.readline()
line_counter = 0
for line in input_matrix.readlines():
fields = line.strip().split('\t')
id_to_sample_dict[line_counter] = fields[0]
vector = [float(x) for x in fields[1:]]
denominator = sum(vector)
if denominator:
normalized_vector = [p * 1000 / denominator for p in vector]
vectors.append(normalized_vector)
else:
vectors.append(vector)
line_counter += 1
distance_matrix = hcluster.pdist(vectors, clustering_distance)
# clustering_result = hcluster.ward(distance_matrix)
clustering_result = hcluster.linkage(distance_matrix, method=clustering_method)
tree = hcluster.to_tree(clustering_result)
root = Tree()
root.dist = 0
root.name = "root"
item2node = {tree: root}
to_visit = [tree]
while to_visit:
node = to_visit.pop()
cl_dist = node.dist / 2.0
for ch_node in [node.left, node.right]:
if ch_node:
ch = Tree()
ch.dist = cl_dist
if ch_node.is_leaf():
ch.name = id_to_sample_dict[ch_node.id]
else:
ch.name = 'Int' + str(ch_node.id)
item2node[node].add_child(ch)
item2node[ch_node] = ch
to_visit.append(ch_node)
normalization(root)
if output_file_name:
root.write(format=1, outfile=output_file_name)
return root.write(format=1)
tree_data = get_newick_tree_data(args.input_matrix, args.output_file_name)
print tree_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment