Last active
August 29, 2015 14:06
-
-
Save meren/b606e99f972d16898c65 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 | |
# Copyright (C) 2014, A. Murat Eren | |
# | |
# This program is free software; you can redistribute it and/or modify it under | |
# the terms of the GNU General Public License as published by the Free | |
# Software Foundation; either version 2 of the License, or (at your option) | |
# any later version. | |
# | |
# Please read the COPYING file. | |
import sys | |
import argparse | |
import hcluster | |
from ete2 import Tree | |
parser = argparse.ArgumentParser(description='Takes an observation matrix, returns a newick tree') | |
parser.add_argument('input_matrix', metavar = 'PATH', default = None, | |
help = 'Input matrix') | |
parser.add_argument('-o', '--output-file-name', metavar = 'PATH', default = None, | |
help = 'Output file name') | |
args = parser.parse_args() | |
def get_newick_tree_data(observation_matrix_path, output_file_name = None, clustering_distance='euclidean', clustering_method = 'complete'): | |
vectors = [] | |
id_to_sample_dict = {} | |
input_matrix = open(observation_matrix_path) | |
input_matrix.readline() | |
line_counter = 0 | |
for line in input_matrix.readlines(): | |
fields = line.strip().split('\t') | |
id_to_sample_dict[line_counter] = fields[0] | |
vector = [float(x) for x in fields[1:]] | |
denominator = sum(vector) | |
if denominator: | |
normalized_vector = [p * 1000 / denominator for p in vector] | |
vectors.append(normalized_vector) | |
else: | |
vectors.append(vector) | |
line_counter += 1 | |
distance_matrix = hcluster.pdist(vectors, clustering_distance) | |
#clustering_result = hcluster.ward(distance_matrix) | |
clustering_result = hcluster.linkage(distance_matrix, method = clustering_method) | |
tree = hcluster.to_tree(clustering_result) | |
root = Tree() | |
root.dist = 0 | |
root.name = "root" | |
item2node = {tree: root} | |
to_visit = [tree] | |
while to_visit: | |
node = to_visit.pop() | |
cl_dist = node.dist / 2.0 | |
for ch_node in [node.left, node.right]: | |
if ch_node: | |
ch = Tree() | |
ch.dist = cl_dist | |
if ch_node.is_leaf(): | |
ch.name = id_to_sample_dict[ch_node.id] | |
else: | |
ch.name = 'Int' + str(ch_node.id) | |
item2node[node].add_child(ch) | |
item2node[ch_node] = ch | |
to_visit.append(ch_node) | |
if output_file_name: | |
root.write(format=1, outfile=output_file_name) | |
return root.write(format=1) | |
tree_data = get_newick_tree_data(args.input_matrix, args.output_file_name) | |
print tree_data |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment