Skip to content

Instantly share code, notes, and snippets.

@Ken-Kuroki
Ken-Kuroki / itol_binary.py
Created October 4, 2019 12:46
Generate iTOL "binary data" annotation file from pandas dataframe
import numpy as np
import pandas as pd
from ete3 import PhyloTree
def generate_binary(df: pd.DataFrame, save_file: str) -> None:
# df columns must be labels, values, values, ..., values.
num_cols = df.shape[1]-1
shapes = ",".join(["1"]*num_cols)
colors = ",".join(["#ff0000"]*num_cols)
labels = ",".join([f"{i+1}" for i in range(num_cols)])
@Ken-Kuroki
Ken-Kuroki / itol_simplebar.py
Last active October 4, 2019 12:46
Generate iTOL "simple bar" annotation file from pandas dataframe
import numpy as np
import pandas as pd
from ete3 import PhyloTree
def generate_simplebar(df: pd.DataFrame, save_file: str) -> None:
# df columns must be labels and values.
output = """DATASET_SIMPLEBAR\n
SEPARATOR COMMA\n
DATASET_LABEL,label_simplebar\n
COLOR,#ff0000\n
@Ken-Kuroki
Ken-Kuroki / itol_basic.py
Created October 4, 2019 11:02
Call iTOL API to draw and save a phylogenetic tree
from itolapi import Itol
def itol(tree_file: str, save_file: str, save_format: str) -> str:
if tree_file[-5:] != ".tree" and tree_file[-9:] != ".tree.txt":
raise Exception("Input tree file name must end with .tree or .tree.txt")
if save_format not in ['png', 'svg', 'eps', 'ps', 'pdf', 'nexus', 'newick']:
raise Exception("Unsupported save format")
itol_uploader = Itol()
itol_uploader.add_file(tree_file)
@Ken-Kuroki
Ken-Kuroki / randomforest.py
Last active April 21, 2020 04:17
Generic random forest classification to draw learning curve
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score, learning_curve
import altair as alt
def learn(X, y, **kwargs): # X needs to be a pandas dataframe
r = RandomForestClassifier(n_estimators=100, random_state=123, class_weight="balanced", **kwargs)
steps, curve_train, curve_test = learning_curve(r, X.sample(frac=1, random_state=123), y.sample(frac=1, random_state=123), cv=5, scoring="roc_auc", n_jobs=20, train_sizes=np.linspace(0.05,1,20))
learning = (pd.concat([pd.DataFrame(curve_train, index=steps).apply(np.average, axis="columns").rename("train"),
@Ken-Kuroki
Ken-Kuroki / dist_unifrac.py
Created August 31, 2019 14:03
Generate distance matrix among samples by unifrac distance
import numpy as np
from skbio import TreeNode
from skbio.diversity import beta_diversity
# Phylogenetic tree read from the bundled 61-OTU tree file.
tree = TreeNode.read("gg_13_8_otus/trees/61_otus_unannotated.tree")

# Six placeholder sample names: sample0 .. sample5.
sample_ids = [f"sample{idx}" for idx in range(6)]

# The first tab-separated field of every taxonomy line is taken as the OTU id.
with open("gg_13_8_otus/taxonomy/61_otu_taxonomy.txt", "r") as f:
    otu_ids = [line.strip().partition("\t")[0] for line in f]

# Random integer counts in [0, 100), one row per sample and one column per OTU.
table_shape = (len(sample_ids), len(otu_ids))
data = np.random.randint(0, 100, size=table_shape).tolist()
@Ken-Kuroki
Ken-Kuroki / dist_matrix.py
Last active August 30, 2019 16:00
Generate distance matrix from newick tree
from itertools import product
import pandas as pd
from ete3 import Tree
# Load the tree from a Newick file (the path is a placeholder to be replaced).
t = Tree("some_tree_file.nwk")
# Leaf node objects and leaf names, both taken from the same tree.
leaves = t.get_leaves()
leaf_names = t.get_leaf_names()
# Every ordered (leaf, leaf) pair, including each leaf paired with itself.
# NOTE: product() returns a one-shot iterator, so `pair` can be consumed only once.
pair = product(leaves, leaves)
@Ken-Kuroki
Ken-Kuroki / tfidf.py
Created August 26, 2019 02:39
Calculate TF-IDF from a count matrix
import numpy as np
from sklearn.preprocessing import normalize
def tf_idf(X):
    """Return the L2-normalized TF-IDF weighting of the count matrix ``X``.

    Per the original author's note, this corresponds to ``smooth=True`` and
    ``norm="l2"`` in ``sklearn.feature_extraction.text.TfidfVectorizer``.

    Steps:
      1. Term frequency: each row of ``X`` is L1-normalized.
      2. Document frequency: number of nonzero entries in each column.
      3. Smoothed IDF: ``log((N + 1) / (df + 1)) + 1`` with ``N = len(X)``.
      4. The product ``tf * idf`` is L2-normalized and returned.
    """
    term_freq = normalize(X, axis=1, norm="l1")
    n_docs = len(X)
    doc_freq = np.count_nonzero(X, axis=0)
    # The "+1" terms inside the log are the smoothing; the trailing "+1" keeps
    # the weight of a term that appears in every row from collapsing to zero.
    smoothed_idf = np.log((n_docs + 1) / (doc_freq + 1)) + 1
    weighted = term_freq * smoothed_idf
    return normalize(weighted, norm="l2")
# Plotting in dark theme with Altair
import numpy as np
import pandas as pd
import altair as alt
# Ask the active Altair renderer to embed charts with the 'dark' theme.
alt.renderers.set_embed_options(theme='dark')
# Demo data: 100 rows made of two standard-normal columns plus one column of
# random integers drawn from [0, 5), joined side by side (axis=1).
arr = np.concatenate([np.random.randn(100, 2), np.random.randint(0, 5, (100, 1))], axis=1)
# Name the columns so they can be referenced by field name when plotting.
df = pd.DataFrame(arr, columns=["X1", "X2", "Y"])
@Ken-Kuroki
Ken-Kuroki / plot.py
Created April 5, 2019 02:11
Generic scatterplot function for Altair
def plot(df, x, y, args, fill=False):
df = df.reset_index()
index = df.columns[0]
if fill:
chart = alt.Chart(df).mark_circle(size=30)
else:
chart = alt.Chart(df).mark_point(size=30)
chart = chart.encode(
@Ken-Kuroki
Ken-Kuroki / get_taxonomy_hierarchy_ete3.py
Last active December 7, 2018 09:48
Get Taxonomy Hierarchy using ETE Toolkit 3 and Apply to GenBank Assembly Summary
# This is roughly 30-fold faster than my original implementation.
# Install ETE3 via pip and run ncbi.update_taxonomy_database() first.
from collections import defaultdict
import pandas as pd
from ete3 import NCBITaxa
ncbi = NCBITaxa()
def get_taxonomy_hierarchy(taxid):