Skip to content

Instantly share code, notes, and snippets.

@jasonsahl
Last active February 17, 2022 15:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jasonsahl/24c7cb0fb78b4769521752193a43b219 to your computer and use it in GitHub Desktop.
Save jasonsahl/24c7cb0fb78b4769521752193a43b219 to your computer and use it in GitHub Desktop.
Create a distance dendrogram from MASH distances
#!/usr/bin/env python
#A python implementation of building clusters from MASH distances
import optparse
import sys
from optparse import OptionParser
try:
from scipy.cluster.hierarchy import weighted
import scipy.spatial.distance as ssd
except:
print("scipy must be installed. try: conda install -c anaconda scipy")
sys.exit()
try:
import pandas as pd
except:
print("pandas must be installed. try: conda install -c conda-forge pandas")
sys.exit()
try:
from skbio.tree import TreeNode
except:
print("skbio must be installed. try: conda install -c anaconda scikit-bio")
sys.exit()
try:
import numpy as np
except:
print("numpy must be installed. try: conda install -c conda-forge numpy")
sys.exit()
import os
import subprocess
def is_tool(name):
from distutils.spawn import find_executable
return find_executable(name) is not None
def test_dir(option, opt_str, value, parser):
if os.path.exists(value):
setattr(parser.values, option.dest, value)
else:
print("%s directory cannot be found" % option)
sys.exit()
def write_tree(matrix):
dmx = pd.read_csv(matrix, index_col=0, sep="\t")
ids = dmx.index.tolist()
triu = np.square(dmx.values)
distArray = ssd.squareform(triu)
hclust = weighted(distArray)
t = TreeNode.from_linkage_matrix(hclust, ids)
nw = t.__str__().replace("'", "")
outfile = open("mashPy.tree", "w")
outfile.write(nw)
outfile.close()
def modify_matrix(in_matrix):
out_matrix = open("distance_matrix.txt", "w")
with open(in_matrix) as my_matrix:
for line in my_matrix:
newline = line.strip("\n")
if newline.startswith("#"):
fields = newline.split()
newlist = []
for field in fields[1:]:
names = field.split("/")
fixed = names[-1].replace(".fasta","")
newlist.append(fixed)
newlist.insert(0," ")
out_matrix.write("\t".join(newlist)+"\n")
else:
fields = newline.split()
newlist = []
names = fields[0].split("/")
fixed = names[-1].replace(".fasta","")
for field in fields[1:]:
newlist.append(field)
newlist.insert(0,fixed)
out_matrix.write("\t".join(newlist)+"\n")
out_matrix.close()
def main(fasta_dir,processors,sketch_size):
result = is_tool("mash")
if result is False:
print("mash isn't in your path, but needs to be!. Try: conda install -c bioconda mash")
sys.exit()
else:
print("citation: Ondov, B.D., Treangen, T.J., Melsted, P., Mallonee, A.B., Bergman, N.H., Koren, S., Phillippy, A.M. 2016. Mash: fast genome and metagenome distance estimation using MinHash. Genome Biology 17(132). PMID: 27323842")
subprocess.check_call("mash sketch -s %s -o reference %s/*fasta" % (sketch_size,fasta_dir),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
subprocess.check_call("mash dist -t reference.msh -p %s -s %s %s/*fasta > tmp.txt.xyx" % (processors,sketch_size,fasta_dir),stdout=open(os.devnull, 'wb'),stderr=open(os.devnull, 'wb'),shell=True)
modify_matrix("tmp.txt.xyx")
write_tree("distance_matrix.txt")
os.system("rm tmp.txt.xyx reference.msh")
if __name__ == "__main__":
parser = OptionParser(usage="usage: %prog [options]",version="%prog 0.0.2")
parser.add_option("-d", "--fasta_dir", dest="fasta_dir",
help="/path/to/FASTA dir [REQUIRED]",
type="string", action="callback", callback=test_dir)
parser.add_option("-p", "--processors", dest="processors",
help="number of processors to use, defaults to 2",
type="int", action="store", default=2)
parser.add_option("-s", "--sketch_size", dest="sketch_size",
help="mash sketch size to use, defaults to 10000",
type="int", action="store", default=10000)
options, args = parser.parse_args()
mandatories = ["fasta_dir","processors","sketch_size"]
for m in mandatories:
if not getattr(options, m, None):
print("\nMust provide %s.\n" %m)
parser.print_help()
exit(-1)
main(options.fasta_dir,options.processors,options.sketch_size)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment