meren/toNewick.py

## toNewick.py
#!/usr/bin/env python
# -*- coding: utf-8

# Copyright (C) 2014, A. Murat Eren
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.

import sys
import argparse
import hcluster
from ete2 import Tree


parser = argparse.ArgumentParser(description='Takes an observation matrix, returns a newick tree')
parser.add_argument('input_matrix', metavar = 'PATH', default = None,
                    help = 'Input matrix')
parser.add_argument('-o', '--output-file-name', metavar = 'PATH', default = None,
                    help = 'Output file name')

args = parser.parse_args()

def get_newick_tree_data(observation_matrix_path, output_file_name = None, clustering_distance='euclidean', clustering_method = 'complete'):
    vectors = []
    id_to_sample_dict = {}

    input_matrix = open(observation_matrix_path)
    input_matrix.readline()

    line_counter = 0
    for line in input_matrix.readlines():
        fields = line.strip().split('\t')
        id_to_sample_dict[line_counter] = fields[0]

        vector = [float(x) for x in fields[1:]]

        denominator = sum(vector)
        if denominator:
            normalized_vector = [p * 1000 / denominator for p in vector]
            vectors.append(normalized_vector)
        else:
            vectors.append(vector)

        line_counter += 1

    distance_matrix = hcluster.pdist(vectors, clustering_distance)

    #clustering_result = hcluster.ward(distance_matrix)
    clustering_result = hcluster.linkage(distance_matrix, method = clustering_method)

    tree = hcluster.to_tree(clustering_result)

    root = Tree()
    root.dist = 0
    root.name = "root"
    item2node = {tree: root}

    to_visit = [tree]
    while to_visit:
        node = to_visit.pop()
        cl_dist = node.dist / 2.0
        for ch_node in [node.left, node.right]:
            if ch_node:
                ch = Tree()
                ch.dist = cl_dist

                if ch_node.is_leaf():
                    ch.name = id_to_sample_dict[ch_node.id]
                else:
                    ch.name = 'Int' + str(ch_node.id)

                item2node[node].add_child(ch)
                item2node[ch_node] = ch
                to_visit.append(ch_node)

    if output_file_name:
        root.write(format=1, outfile=output_file_name)

    return root.write(format=1)

tree_data = get_newick_tree_data(args.input_matrix, args.output_file_name)
print tree_data
	#!/usr/bin/env python
	# -*- coding: utf-8

	# Copyright (C) 2014, A. Murat Eren
	#
	# This program is free software; you can redistribute it and/or modify it under
	# the terms of the GNU General Public License as published by the Free
	# Software Foundation; either version 2 of the License, or (at your option)
	# any later version.
	#
	# Please read the COPYING file.

	import sys
	import argparse
	import hcluster
	from ete2 import Tree


	parser = argparse.ArgumentParser(description='Takes an observation matrix, returns a newick tree')
	parser.add_argument('input_matrix', metavar = 'PATH', default = None,
	help = 'Input matrix')
	parser.add_argument('-o', '--output-file-name', metavar = 'PATH', default = None,
	help = 'Output file name')

	args = parser.parse_args()

	def get_newick_tree_data(observation_matrix_path, output_file_name = None, clustering_distance='euclidean', clustering_method = 'complete'):
	vectors = []
	id_to_sample_dict = {}

	input_matrix = open(observation_matrix_path)
	input_matrix.readline()

	line_counter = 0
	for line in input_matrix.readlines():
	fields = line.strip().split('\t')
	id_to_sample_dict[line_counter] = fields[0]

	vector = [float(x) for x in fields[1:]]

	denominator = sum(vector)
	if denominator:
	normalized_vector = [p * 1000 / denominator for p in vector]
	vectors.append(normalized_vector)
	else:
	vectors.append(vector)

	line_counter += 1

	distance_matrix = hcluster.pdist(vectors, clustering_distance)

	#clustering_result = hcluster.ward(distance_matrix)
	clustering_result = hcluster.linkage(distance_matrix, method = clustering_method)

	tree = hcluster.to_tree(clustering_result)

	root = Tree()
	root.dist = 0
	root.name = "root"
	item2node = {tree: root}

	to_visit = [tree]
	while to_visit:
	node = to_visit.pop()
	cl_dist = node.dist / 2.0
	for ch_node in [node.left, node.right]:
	if ch_node:
	ch = Tree()
	ch.dist = cl_dist

	if ch_node.is_leaf():
	ch.name = id_to_sample_dict[ch_node.id]
	else:
	ch.name = 'Int' + str(ch_node.id)

	item2node[node].add_child(ch)
	item2node[ch_node] = ch
	to_visit.append(ch_node)

	if output_file_name:
	root.write(format=1, outfile=output_file_name)

	return root.write(format=1)

	tree_data = get_newick_tree_data(args.input_matrix, args.output_file_name)
	print tree_data