This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from ete3 import PhyloTree | |
def generate_binary(df: pd.DataFrame, save_file: str) -> None: | |
# df columns must be labels, values, values, ..., values. | |
num_cols = df.shape[1]-1 | |
shapes = ",".join(["1"]*num_cols) | |
colors = ",".join(["#ff0000"]*num_cols) | |
labels = ",".join([f"{i+1}" for i in range(num_cols)]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from ete3 import PhyloTree | |
def generate_simplebar(df: pd.DataFrame, save_file: str) -> None: | |
# df columns must be labels and values. | |
output = """DATASET_SIMPLEBAR\n | |
SEPARATOR COMMA\n | |
DATASET_LABEL,label_simplebar\n | |
COLOR,#ff0000\n |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itolapi import Itol | |
def itol(tree_file: str, save_file: str, save_format: str) -> str: | |
if tree_file[-5:] != ".tree" and tree_file[-9:] != ".tree.txt": | |
raise Exception("Input tree file name must end with .tree or .tree.txt") | |
if save_format not in ['png', 'svg', 'eps', 'ps', 'pdf', 'nexus', 'newick']: | |
raise Exception("Unsupported save format") | |
itol_uploader = Itol() | |
itol_uploader.add_file(tree_file) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.model_selection import cross_val_predict, cross_val_score, learning_curve | |
import altair as alt | |
def learn(X, y, **kwargs): # X needs to be a pandas dataframe | |
r = RandomForestClassifier(n_estimators=100, random_state=123, class_weight="balanced", **kwargs) | |
steps, curve_train, curve_test = learning_curve(r, X.sample(frac=1, random_state=123), y.sample(frac=1, random_state=123), cv=5, scoring="roc_auc", n_jobs=20, train_sizes=np.linspace(0.05,1,20)) | |
learning = (pd.concat([pd.DataFrame(curve_train, index=steps).apply(np.average, axis="columns").rename("train"), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from skbio import TreeNode | |
from skbio.diversity import beta_diversity | |
# Phylogeny from the gg_13_8_otus 61-OTU set.
tree = TreeNode.read("gg_13_8_otus/trees/61_otus_unannotated.tree")

# Six placeholder sample names: sample0 .. sample5.
sample_ids = [f"sample{i}" for i in range(6)]

# Each taxonomy line is tab-separated; its first field is kept as the OTU id.
with open("gg_13_8_otus/taxonomy/61_otu_taxonomy.txt", "r") as f:
    otu_ids = [line.strip().split("\t")[0] for line in f]

# Random integer counts in [0, 100): one row per sample, one column per OTU.
data = np.random.randint(
    0,
    100,
    size=(len(sample_ids), len(otu_ids)),
).tolist()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import product | |
import pandas as pd | |
from ete3 import Tree | |
# Parse the tree file and collect its leaf nodes together with their names.
t = Tree("some_tree_file.nwk")
leaves = t.get_leaves()
leaf_names = t.get_leaf_names()

# Lazy iterator over every ordered (leaf, leaf) combination, self-pairs
# included; it can be consumed only once.
pair = product(leaves, repeat=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.preprocessing import normalize | |
def tf_idf(X):
    """Return row-wise L2-normalised TF-IDF weights for a count matrix.

    Corresponds to ``smooth_idf=True`` and ``norm="l2"`` in
    ``sklearn.feature_extraction.text.TfidfVectorizer``.

    X is treated as a 2-D table whose rows are normalised (axis=1) and whose
    columns are counted (axis=0); ``len(X)`` must give the number of rows.
    """
    # Term frequency: every row rescaled to unit L1 norm.
    term_freq = normalize(X, norm="l1", axis=1)

    n_rows = len(X)
    # Document frequency: number of rows in which each column is non-zero.
    doc_freq = np.count_nonzero(X, axis=0)

    # Smoothed IDF: one is added to both counts (so the ratio is always
    # defined), and the trailing +1 keeps columns present in every row from
    # being weighted to zero.
    inverse_doc_freq = np.log((n_rows + 1) / (doc_freq + 1)) + 1

    weighted = term_freq * inverse_doc_freq
    return normalize(weighted, norm="l2")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plotting in dark theme with Altair | |
import numpy as np | |
import pandas as pd | |
import altair as alt | |
# Pass the dark theme to the Altair renderer as an embed option.
alt.renderers.set_embed_options(theme='dark')

# Toy data: 100 rows made of two standard-normal columns followed by one
# column of random integers in [0, 5). Stacking them side by side yields a
# single float array, so the integer column is stored as floats.
arr = np.hstack([
    np.random.randn(100, 2),
    np.random.randint(0, 5, (100, 1)),
])
df = pd.DataFrame(data=arr, columns=["X1", "X2", "Y"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot(df, x, y, args, fill=False): | |
df = df.reset_index() | |
index = df.columns[0] | |
if fill: | |
chart = alt.Chart(df).mark_circle(size=30) | |
else: | |
chart = alt.Chart(df).mark_point(size=30) | |
chart = chart.encode( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is roughly 30-fold faster than my original implementation. | |
# Install ETE3 via pip and run ncbi.update_taxonomy_database() first. | |
from collections import defaultdict | |
import pandas as pd | |
from ete3 import NCBITaxa | |
ncbi = NCBITaxa() | |
def get_taxonomy_hierarchy(taxid): |