Skip to content

Instantly share code, notes, and snippets.

@meren
Last active August 29, 2015 14:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save meren/b606e99f972d16898c65 to your computer and use it in GitHub Desktop.
Save meren/b606e99f972d16898c65 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8
# Copyright (C) 2014, A. Murat Eren
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.
import sys
import argparse
import hcluster
from ete2 import Tree
parser = argparse.ArgumentParser(description='Takes an observation matrix, returns a newick tree')
parser.add_argument('input_matrix', metavar = 'PATH', default = None,
help = 'Input matrix')
parser.add_argument('-o', '--output-file-name', metavar = 'PATH', default = None,
help = 'Output file name')
args = parser.parse_args()
def get_newick_tree_data(observation_matrix_path, output_file_name = None, clustering_distance='euclidean', clustering_method = 'complete'):
vectors = []
id_to_sample_dict = {}
input_matrix = open(observation_matrix_path)
input_matrix.readline()
line_counter = 0
for line in input_matrix.readlines():
fields = line.strip().split('\t')
id_to_sample_dict[line_counter] = fields[0]
vector = [float(x) for x in fields[1:]]
denominator = sum(vector)
if denominator:
normalized_vector = [p * 1000 / denominator for p in vector]
vectors.append(normalized_vector)
else:
vectors.append(vector)
line_counter += 1
distance_matrix = hcluster.pdist(vectors, clustering_distance)
#clustering_result = hcluster.ward(distance_matrix)
clustering_result = hcluster.linkage(distance_matrix, method = clustering_method)
tree = hcluster.to_tree(clustering_result)
root = Tree()
root.dist = 0
root.name = "root"
item2node = {tree: root}
to_visit = [tree]
while to_visit:
node = to_visit.pop()
cl_dist = node.dist / 2.0
for ch_node in [node.left, node.right]:
if ch_node:
ch = Tree()
ch.dist = cl_dist
if ch_node.is_leaf():
ch.name = id_to_sample_dict[ch_node.id]
else:
ch.name = 'Int' + str(ch_node.id)
item2node[node].add_child(ch)
item2node[ch_node] = ch
to_visit.append(ch_node)
if output_file_name:
root.write(format=1, outfile=output_file_name)
return root.write(format=1)
tree_data = get_newick_tree_data(args.input_matrix, args.output_file_name)
print tree_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment