Last active
February 17, 2022 15:57
-
-
Save jasonsahl/24c7cb0fb78b4769521752193a43b219 to your computer and use it in GitHub Desktop.
Create a distance dendrogram from MASH distances
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#A python implementation of building clusters from MASH distances | |
import optparse | |
import sys | |
from optparse import OptionParser | |
try: | |
from scipy.cluster.hierarchy import weighted | |
import scipy.spatial.distance as ssd | |
except: | |
print("scipy must be installed. try: conda install -c anaconda scipy") | |
sys.exit() | |
try: | |
import pandas as pd | |
except: | |
print("pandas must be installed. try: conda install -c conda-forge pandas") | |
sys.exit() | |
try: | |
from skbio.tree import TreeNode | |
except: | |
print("skbio must be installed. try: conda install -c anaconda scikit-bio") | |
sys.exit() | |
try: | |
import numpy as np | |
except: | |
print("numpy must be installed. try: conda install -c conda-forge numpy") | |
sys.exit() | |
import os | |
import subprocess | |
def is_tool(name): | |
from distutils.spawn import find_executable | |
return find_executable(name) is not None | |
def test_dir(option, opt_str, value, parser): | |
if os.path.exists(value): | |
setattr(parser.values, option.dest, value) | |
else: | |
print("%s directory cannot be found" % option) | |
sys.exit() | |
def write_tree(matrix): | |
dmx = pd.read_csv(matrix, index_col=0, sep="\t") | |
ids = dmx.index.tolist() | |
triu = np.square(dmx.values) | |
distArray = ssd.squareform(triu) | |
hclust = weighted(distArray) | |
t = TreeNode.from_linkage_matrix(hclust, ids) | |
nw = t.__str__().replace("'", "") | |
outfile = open("mashPy.tree", "w") | |
outfile.write(nw) | |
outfile.close() | |
def modify_matrix(in_matrix): | |
out_matrix = open("distance_matrix.txt", "w") | |
with open(in_matrix) as my_matrix: | |
for line in my_matrix: | |
newline = line.strip("\n") | |
if newline.startswith("#"): | |
fields = newline.split() | |
newlist = [] | |
for field in fields[1:]: | |
names = field.split("/") | |
fixed = names[-1].replace(".fasta","") | |
newlist.append(fixed) | |
newlist.insert(0," ") | |
out_matrix.write("\t".join(newlist)+"\n") | |
else: | |
fields = newline.split() | |
newlist = [] | |
names = fields[0].split("/") | |
fixed = names[-1].replace(".fasta","") | |
for field in fields[1:]: | |
newlist.append(field) | |
newlist.insert(0,fixed) | |
out_matrix.write("\t".join(newlist)+"\n") | |
out_matrix.close() | |
def main(fasta_dir,processors,sketch_size): | |
result = is_tool("mash") | |
if result is False: | |
print("mash isn't in your path, but needs to be!. Try: conda install -c bioconda mash") | |
sys.exit() | |
else: | |
print("citation: Ondov, B.D., Treangen, T.J., Melsted, P., Mallonee, A.B., Bergman, N.H., Koren, S., Phillippy, A.M. 2016. Mash: fast genome and metagenome distance estimation using MinHash. Genome Biology 17(132). PMID: 27323842") | |
subprocess.check_call("mash sketch -s %s -o reference %s/*fasta" % (sketch_size,fasta_dir),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True) | |
subprocess.check_call("mash dist -t reference.msh -p %s -s %s %s/*fasta > tmp.txt.xyx" % (processors,sketch_size,fasta_dir),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True) | |
modify_matrix("tmp.txt.xyx") | |
write_tree("distance_matrix.txt") | |
os.system("rm tmp.txt.xyx reference.msh") | |
if __name__ == "__main__": | |
parser = OptionParser(usage="usage: %prog [options]",version="%prog 0.0.2") | |
parser.add_option("-d", "--fasta_dir", dest="fasta_dir", | |
help="/path/to/FASTA dir [REQUIRED]", | |
type="string", action="callback", callback=test_dir) | |
parser.add_option("-p", "--processors", dest="processors", | |
help="number of processors to use, defaults to 2", | |
type="int", action="store", default=2) | |
parser.add_option("-s", "--sketch_size", dest="sketch_size", | |
help="mash sketch size to use, defaults to 10000", | |
type="int", action="store", default=10000) | |
options, args = parser.parse_args() | |
mandatories = ["fasta_dir","processors","sketch_size"] | |
for m in mandatories: | |
if not getattr(options, m, None): | |
print("\nMust provide %s.\n" %m) | |
parser.print_help() | |
exit(-1) | |
main(options.fasta_dir,options.processors,options.sketch_size) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment