Ken-Kuroki

## itol_binary.py
import numpy as np
import pandas as pd
from ete3 import PhyloTree

def generate_binary(df: pd.DataFrame, save_file: str) -> None:
    """Build the comma-joined per-field settings for the value columns of ``df``.

    ``df`` must be laid out as one label column followed by one or more
    value columns.  The visible part of this snippet only assembles the
    settings strings; ``save_file`` is not used in the code shown here.
    """
    # Everything after the leading label column counts as a data field.
    num_cols = df.shape[1] - 1
    field_numbers = range(1, num_cols + 1)

    # One shape code and one colour per field, and 1-based field numbers
    # as the field labels.
    shapes = ",".join("1" for _ in field_numbers)
    colors = ",".join("#ff0000" for _ in field_numbers)
    labels = ",".join(map(str, field_numbers))

## itol_simplebar.py
import numpy as np
import pandas as pd
from ete3 import PhyloTree

def generate_simplebar(df: pd.DataFrame, save_file: str) -> None:
    # df columns must be labels and values.
    output = """DATASET_SIMPLEBAR\n
SEPARATOR COMMA\n
DATASET_LABEL,label_simplebar\n
COLOR,#ff0000\n

## itol_basic.py
from itolapi import Itol

def itol(tree_file: str, save_file: str, save_format: str) -> str:
    """Validate the arguments and add ``tree_file`` to a fresh ``Itol`` uploader.

    Raises:
        Exception: if ``tree_file`` ends with neither ``.tree`` nor
            ``.tree.txt``, or if ``save_format`` is not one of the
            supported format names.

    Only the validation and the ``add_file`` call are visible in this
    snippet; ``save_file`` is not used and nothing is returned in the code
    shown here.
    """
    has_tree_suffix = tree_file[-5:] == ".tree" or tree_file[-9:] == ".tree.txt"
    if not has_tree_suffix:
        raise Exception("Input tree file name must end with .tree or .tree.txt")

    supported_formats = ['png', 'svg', 'eps', 'ps', 'pdf', 'nexus', 'newick']
    if save_format not in supported_formats:
        raise Exception("Unsupported save format")

    itol_uploader = Itol()
    itol_uploader.add_file(tree_file)

## randomforest.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score, learning_curve
import altair as alt

def learn(X, y, **kwargs):  # X needs to be a pandas dataframe
    r = RandomForestClassifier(n_estimators=100, random_state=123, class_weight="balanced", **kwargs)
    steps, curve_train, curve_test = learning_curve(r, X.sample(frac=1, random_state=123), y.sample(frac=1, random_state=123), cv=5, scoring="roc_auc", n_jobs=20, train_sizes=np.linspace(0.05,1,20))
    learning = (pd.concat([pd.DataFrame(curve_train, index=steps).apply(np.average, axis="columns").rename("train"),

## dist_unifrac.py
import numpy as np
from skbio import TreeNode
from skbio.diversity import beta_diversity

# Read the tree with skbio's TreeNode.read; the path is relative to the
# current working directory.
tree = TreeNode.read("gg_13_8_otus/trees/61_otus_unannotated.tree")
# Six placeholder sample names: sample0 .. sample5.
sample_ids = [f"sample{i}" for i in range(6)]
# OTU ids are taken as the first tab-separated field of every line in the
# taxonomy file.
with open("gg_13_8_otus/taxonomy/61_otu_taxonomy.txt", "r") as f:
    otu_ids = [each.strip().split("\t")[0] for each in f.readlines()]
# Unseeded random integers in [0, 100): one row per sample, one column per
# OTU id, converted to nested Python lists.
# NOTE(review): presumably the input for beta_diversity further down -- confirm.
data = np.random.randint(0, 100, size=(len(sample_ids), len(otu_ids))).tolist()

## dist_matrix.py
from itertools import product
import pandas as pd
from ete3 import Tree

# Load the tree from a placeholder path; replace it with a real tree file.
t = Tree("some_tree_file.nwk")

# Leaf node objects and leaf names of the tree.
# NOTE(review): presumably both come back in the same order -- confirm in ete3.
leaves = t.get_leaves()
leaf_names = t.get_leaf_names()
# Lazy, single-use iterator over every ordered (leaf, leaf) pair, including
# each leaf paired with itself.
pair = product(leaves, leaves)

## tfidf.py
import numpy as np
from sklearn.preprocessing import normalize

def tf_idf(X):
    """Return row-wise L2-normalised TF-IDF weights for ``X``.

    Rows of ``X`` are counted as documents (``len(X)``) and document
    frequencies are taken per column.  Corresponds to ``smooth_idf=True``
    and ``norm="l2"`` in ``sklearn.feature_extraction.text.TfidfVectorizer``.
    """
    # Term frequency: every row rescaled to unit L1 norm.
    term_freq = normalize(X, norm="l1", axis=1)

    n_docs = len(X)
    # Document frequency: number of rows with a nonzero entry in each column.
    doc_freq = np.count_nonzero(X, axis=0)

    # Smoothed IDF: add one to both counts before the log, then one to the result.
    smoothed_idf = 1 + np.log((n_docs + 1) / (doc_freq + 1))

    weighted = term_freq * smoothed_idf
    return normalize(weighted, norm="l2")

## altair_dark.py
# Plotting in dark theme with Altair

import numpy as np
import pandas as pd
import altair as alt

# Pass theme='dark' as an embed option to Altair's renderers.
alt.renderers.set_embed_options(theme='dark')

# Demo data: 100 rows made of two standard-normal columns plus one column of
# random integers in [0, 5), joined column-wise (so every column ends up float).
arr = np.concatenate([np.random.randn(100, 2), np.random.randint(0, 5, (100, 1))], axis=1)
df = pd.DataFrame(arr, columns=["X1", "X2", "Y"])

## plot.py
def plot(df, x, y, args, fill=False):
    df = df.reset_index()
    index = df.columns[0]

    if fill:
        chart = alt.Chart(df).mark_circle(size=30)
    else:
        chart = alt.Chart(df).mark_point(size=30)

    chart = chart.encode(

## get_taxonomy_hierarchy_ete3.py
# This is roughly 30-fold faster than my original implementation.
# Install ETE3 via pip and run ncbi.update_taxonomy_database() first.

from collections import defaultdict
import pandas as pd
from ete3 import NCBITaxa

ncbi = NCBITaxa()

def get_taxonomy_hierarchy(taxid):
	import numpy as np
	import pandas as pd
	from ete3 import PhyloTree

	def generate_binary(df: pd.DataFrame, save_file: str) -> None:
		"""Build the comma-joined per-field settings for the value columns of ``df``.

		``df`` must be laid out as one label column followed by one or more
		value columns.  The visible part of this snippet only assembles the
		settings strings; ``save_file`` is not used in the code shown here.
		"""
		# df columns must be labels, values, values, ..., values.
		num_cols = df.shape[1]-1
		# One shape code and one colour per value column, and 1-based
		# column numbers as the field labels.
		shapes = ",".join(["1"]*num_cols)
		colors = ",".join(["#ff0000"]*num_cols)
		labels = ",".join([f"{i+1}" for i in range(num_cols)])
	from itolapi import Itol

	def itol(tree_file: str, save_file: str, save_format: str) -> str:
		"""Validate the arguments and add ``tree_file`` to a fresh ``Itol`` uploader.

		Raises:
			Exception: if ``tree_file`` ends with neither ``.tree`` nor
				``.tree.txt``, or if ``save_format`` is not one of the
				supported format names.

		Only the validation and the ``add_file`` call are visible in this
		snippet; ``save_file`` is not used and nothing is returned in the
		code shown here.
		"""
		if tree_file[-5:] != ".tree" and tree_file[-9:] != ".tree.txt":
			raise Exception("Input tree file name must end with .tree or .tree.txt")
		if save_format not in ['png', 'svg', 'eps', 'ps', 'pdf', 'nexus', 'newick']:
			raise Exception("Unsupported save format")

		itol_uploader = Itol()
		itol_uploader.add_file(tree_file)
	import numpy as np
	import pandas as pd
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import cross_val_predict, cross_val_score, learning_curve
	import altair as alt

	def learn(X, y, **kwargs): # X needs to be a pandas dataframe
	r = RandomForestClassifier(n_estimators=100, random_state=123, class_weight="balanced", **kwargs)
	steps, curve_train, curve_test = learning_curve(r, X.sample(frac=1, random_state=123), y.sample(frac=1, random_state=123), cv=5, scoring="roc_auc", n_jobs=20, train_sizes=np.linspace(0.05,1,20))
	learning = (pd.concat([pd.DataFrame(curve_train, index=steps).apply(np.average, axis="columns").rename("train"),
	import numpy as np
	from skbio import TreeNode
	from skbio.diversity import beta_diversity

	# Read the tree with skbio's TreeNode.read; the path is relative to the
	# current working directory.
	tree = TreeNode.read("gg_13_8_otus/trees/61_otus_unannotated.tree")
	# Six placeholder sample names: sample0 .. sample5.
	sample_ids = [f"sample{i}" for i in range(6)]
	# OTU ids are taken as the first tab-separated field of every line in the
	# taxonomy file.
	with open("gg_13_8_otus/taxonomy/61_otu_taxonomy.txt", "r") as f:
		otu_ids = [each.strip().split("\t")[0] for each in f.readlines()]
	# Unseeded random integers in [0, 100): one row per sample, one column per
	# OTU id, converted to nested Python lists.
	data = np.random.randint(0, 100, size=(len(sample_ids), len(otu_ids))).tolist()
	from itertools import product
	import pandas as pd
	from ete3 import Tree

	# Load the tree from a placeholder path; replace it with a real tree file.
	t = Tree("some_tree_file.nwk")

	# Leaf node objects and leaf names of the tree.
	# NOTE(review): presumably both come back in the same order -- confirm in ete3.
	leaves = t.get_leaves()
	leaf_names = t.get_leaf_names()
	# Lazy, single-use iterator over every ordered (leaf, leaf) pair, including
	# each leaf paired with itself.
	pair = product(leaves, leaves)
	import numpy as np
	from sklearn.preprocessing import normalize

	def tf_idf(X):
		"""Return row-wise L2-normalised TF-IDF weights for ``X``.

		Rows of ``X`` are counted as documents (``len(X)``) and document
		frequencies are taken per column.  Corresponds to ``smooth_idf=True``
		and ``norm="l2"`` in ``sklearn.feature_extraction.text.TfidfVectorizer``.
		"""
		# Term frequency: every row rescaled to unit L1 norm.
		tf = normalize(X, norm="l1", axis=1)
		N = len(X)
		# Document frequency: number of rows with a nonzero entry in each column.
		df = np.count_nonzero(X, axis=0)
		# Smoothed IDF: add one to both counts before the log, then one to the result.
		idf = np.log((N+1)/(df+1))+1
		return normalize(tf * idf, norm="l2")
	# Plotting in dark theme with Altair

	import numpy as np
	import pandas as pd
	import altair as alt

	# Pass theme='dark' as an embed option to Altair's renderers.
	alt.renderers.set_embed_options(theme='dark')

	# Demo data: 100 rows made of two standard-normal columns plus one column of
	# random integers in [0, 5), joined column-wise (so every column ends up float).
	arr = np.concatenate([np.random.randn(100, 2), np.random.randint(0, 5, (100, 1))], axis=1)
	df = pd.DataFrame(arr, columns=["X1", "X2", "Y"])
	def plot(df, x, y, args, fill=False):
	df = df.reset_index()
	index = df.columns[0]

	if fill:
	chart = alt.Chart(df).mark_circle(size=30)
	else:
	chart = alt.Chart(df).mark_point(size=30)

	chart = chart.encode(
	# This is roughly 30-fold faster than my original implementation.
	# Install ETE3 via pip and run ncbi.update_taxonomy_database() first.

	from collections import defaultdict
	import pandas as pd
	from ete3 import NCBITaxa

	ncbi = NCBITaxa()

	def get_taxonomy_hierarchy(taxid):