## import modules
import os
import copy
import time
from tqdm import tqdm
import numpy as np
from skbio.stats.composition import clr_inv as softmax
from scipy.stats import spearmanr
import datetime
import pandas as pd
from skbio.stats.composition import ilr_inv, clr_inv, alr_inv, clr, centralize, closure
from scipy.sparse import coo_matrix, csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# see http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.multinomial import Multinomial
from torch.nn.utils import clip_grad_norm
%matplotlib inline
# create toy example
def bimodal(num_x, num_y, means=(-1, 1), sigma=0.1):
    # note: relies on the global RandomState `state` defined below
    beta = state.normal(0, 1, size=(2, num_y))
    beta = np.sort(beta, axis=1)
    X = np.vstack((np.ones(num_x),
                   np.linspace(means[0], means[1], num_x))).T
    Q = np.tanh(state.normal(X @ beta, sigma))
    return Q
num_microbes = 100
num_metabolites = 100
num_samples = 100
latent_dim = 2
low = -1
high = 1
microbe_total = 300
metabolite_total = 900
uB = 0
sigmaB = 1
sigmaQ = 2
uU = 0
sigmaU = 0.01
uV = 0
sigmaV = 0.01
s = 0.2
seed = 1
eUun = 0.2
eVun = 0.2
state = np.random.RandomState(seed)
# only have two coefficients
#microbes = state.multivariate_normal(
# mean=np.ones(num_microbes-1), cov=np.diag([sigmaQ] * np.exp(state.randn(num_microbes-1))),
# size=num_samples)
microbes = bimodal(num_samples, num_microbes, means=(-3, 3))
microbes = clr_inv(microbes)
eUmain = bimodal(num_microbes, latent_dim, means=(-3, 3))
eVmain = bimodal(num_metabolites, latent_dim, means=(-3, 3)).T
# double-center the factor matrices (subtract column and row means)
eUmain = - eUmain.mean(axis=0) + eUmain - eUmain.mean(axis=1).reshape(-1, 1)
eVmain = - eVmain.mean(axis=0) + eVmain - eVmain.mean(axis=1).reshape(-1, 1)
eUbias = state.normal(uU, sigmaU, size=(num_microbes, 1))
eVbias = state.normal(uV, sigmaV, size=(1, num_metabolites))
# TODO: force eUmain and eVmain to be centered?
U_ = np.hstack((np.ones((num_microbes, 1)), eUbias, eUmain))
V_ = np.vstack((eVbias, np.ones((1, num_metabolites)), eVmain))
phi = U_ @ V_
microbe_counts = np.zeros((num_samples, num_microbes))
metabolite_counts = np.zeros((num_samples, num_metabolites))
n1 = microbe_total
n2 = metabolite_total // microbe_total
for n in range(num_samples):
    u = state.normal(eUmain, eUun)
    v = state.normal(eVmain, eVun)
    ubias = state.normal(eUbias, eUun)
    vbias = state.normal(eVbias, eVun)
    u_ = np.hstack((np.ones((num_microbes, 1)), ubias, u))
    v_ = np.vstack((vbias, np.ones((1, num_metabolites)), v))
    # phi = np.hstack((np.zeros((num_microbes, 1)), u_ @ v_))
    probs = softmax(u_ @ v_)
    otu = state.multinomial(n1, microbes[n, :])  # drawn but not used below
    # draw n2 metabolites for each of microbe_total randomly chosen microbes
    for _ in range(microbe_total):
        i = state.choice(np.arange(num_microbes), p=microbes[n, :])
        metabolite_counts[n, :] += state.multinomial(n2, probs[i, :])
    microbe_counts[n, :] += state.multinomial(microbe_total, microbes[n])
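# Optional sanity check (not in the original gist): with the settings above,
# each sample's microbe counts should sum to microbe_total and, assuming
# metabolite_total is divisible by microbe_total, each sample's metabolite
# counts should sum to metabolite_total.
assert np.allclose(microbe_counts.sum(axis=1), microbe_total)
assert np.allclose(metabolite_counts.sum(axis=1),
                   (metabolite_total // microbe_total) * microbe_total)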
# visualize the simulations - just for fun
sns.heatmap(eUmain @ eVmain, cmap='seismic')
sns.heatmap(u_ @ v_, cmap='seismic')
sns.heatmap(microbe_counts)
sns.heatmap(metabolite_counts)
## Create mmvec classes
class GaussianEmbedding(torch.nn.Module):
    def __init__(self, in_features, out_features):
        """
        In the constructor we instantiate mean and log-variance embeddings
        (plus per-feature biases) and assign them as member variables.
        """
        super(GaussianEmbedding, self).__init__()
        self.embedding = torch.nn.Embedding(in_features, out_features)
        self.bias = torch.nn.Embedding(in_features, 1)
        self.embedding_var = torch.nn.Embedding(in_features, out_features)
        self.bias_var = torch.nn.Embedding(in_features, 1)
        self.in_features = in_features
        self.out_features = out_features

    def _divergence(self, mu, logvar):
        # closed-form negative KL(q || N(0, I)), so it can be added directly
        # to the log-likelihood when forming the ELBO
        return 0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    def divergence(self):
        """ Computes the (negative) KL divergence between posterior and prior. """
        w = self._divergence(self.embedding.weight, self.embedding_var.weight)
        b = self._divergence(self.bias.weight, self.bias_var.weight)
        return w + b

    def reparameterize(self, mu, logvar):
        """ Samples from the posterior distribution via
        reparameterization gradients. """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """
        In the forward function we accept a tensor of feature indices and
        return a sampled embedding (plus bias) for each index.
        """
        embeds = self.reparameterize(
            self.embedding(x),
            self.embedding_var(x)
        )
        biases = self.reparameterize(
            self.bias(x),
            self.bias_var(x)
        )
        pred = embeds + biases
        return pred
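# A minimal sketch (not part of the original gist) illustrating the
# reparameterization trick used above: forward() returns
# mu + eps * exp(0.5 * logvar) with eps ~ N(0, 1), so averaging many draws
# approximately recovers the mean embedding plus its bias.
with torch.no_grad():
    _demo_emb = GaussianEmbedding(5, 2)
    _idx = torch.LongTensor([0, 1, 2])
    _draws = torch.stack([_demo_emb(_idx) for _ in range(1000)])
    print(_draws.mean(dim=0))                                 # Monte Carlo mean
    print(_demo_emb.embedding(_idx) + _demo_emb.bias(_idx))   # analytic mean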
class GaussianDecoder(torch.nn.Module):
    def __init__(self, in_features, out_features):
        """
        In the constructor we instantiate two nn.Linear modules (mean and
        log-variance) and assign them as member variables.
        """
        super(GaussianDecoder, self).__init__()
        self.mean = torch.nn.Linear(in_features, out_features)
        self.var = torch.nn.Linear(in_features, out_features)
        self.in_features = in_features
        self.out_features = out_features

    def reparameterize(self, mu, logvar, mc_samples=10):
        """ Samples from the posterior distribution via
        reparameterization gradients. """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def _divergence(self, mu, logvar):
        # closed-form negative KL(q || N(0, I))
        return 0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    def divergence(self):
        """ Computes the (negative) KL divergence between posterior and prior.

        See Appendix B of the VAE paper:
        Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
        https://arxiv.org/abs/1312.6114
        """
        w = self._divergence(self.mean.weight, self.var.weight)
        b = self._divergence(self.mean.bias, self.var.bias)
        return w + b

    def forward(self, x):
        """
        In the forward function we accept a tensor of latent codes and return
        a sampled tensor of output logits.
        """
        pred = self.reparameterize(
            self.mean(x),
            self.var(x)
        )
        return pred
class MMvec(nn.Module):
    def __init__(self, num_samples, num_microbes, num_metabolites, microbe_total,
                 latent_dim, batch_size=10, subsample_size=100, mc_samples=10,
                 device='cpu'):
        super(MMvec, self).__init__()
        self.num_microbes = num_microbes
        self.num_metabolites = num_metabolites
        self.num_samples = num_samples
        self.device = device
        self.batch_size = batch_size
        self.subsample_size = subsample_size
        self.mc_samples = mc_samples
        self.microbe_total = microbe_total
        self.encoder = GaussianEmbedding(num_microbes, latent_dim)
        self.decoder = GaussianDecoder(latent_dim, num_metabolites)

    def forward(self, x):
        code = self.encoder(x)
        log_probs = self.decoder(code)
        # zeros = torch.zeros(self.batch_size * self.subsample_size, 1)
        # log_probs = torch.cat((zeros, alrs), dim=1)
        return log_probs

    def loss(self, pred, obs):
        """ Computes the loss function to be minimized. """
        # mean_like = torch.zeros(self.mc_samples, device=self.device)  # unused
        # divergence() returns the negative KL, so the ELBO is kld + likelihood
        kld = self.encoder.divergence() + self.decoder.divergence()
        # rescale the minibatch likelihood to the size of the full dataset
        n = self.microbe_total * self.num_samples
        likelihood = n * torch.mean(Multinomial(logits=pred).log_prob(obs))
        elbo = kld + likelihood
        # elbo = likelihood
        return -elbo, kld, likelihood
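# A minimal shape check (my own usage sketch, not in the original gist): a
# batch of microbe indices maps to one row of metabolite logits per index.
with torch.no_grad():
    _toy = MMvec(num_samples=3, num_microbes=7, num_metabolites=4,
                 microbe_total=30, latent_dim=2)
    print(_toy(torch.LongTensor([0, 1, 2])).shape)  # torch.Size([3, 4])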
def get_batch(X, Y, i, subsample_size, batch_size):
    """ Retrieves a minibatch

    Parameters
    ----------
    X : scipy.sparse.csr_matrix
        Input sparse matrix of abundances
    Y : np.array
        Output dense matrix of abundances
    i : int
        Sample index
    subsample_size : int
        Number of sequences to randomly draw per iteration
    batch_size : int
        Number of samples to load per minibatch

    TODO
    ----
    - It will be worth offloading this work to the torch.data.DataLoader class.
      One advantage is that the GPU work can be done in parallel
      to preparing the data for transfer. This could yield at least
      a 2x speedup.
    """
    Xs = []
    Ys = []
    for n in range(batch_size):
        k = (i + n) % Y.shape[0]
        row = X.getrow(k)
        cnts = np.hstack([row.data[row.indptr[j]:row.indptr[j + 1]]
                          for j in range(len(row.indptr) - 1)])
        feats = np.hstack([row.indices[row.indptr[j]:row.indptr[j + 1]]
                           for j in range(len(row.indptr) - 1)])
        inp = np.random.choice(feats, p=cnts / np.sum(cnts), size=subsample_size)
        Xs.append(inp)
        Ys.append(Y[k])
    Xs = np.hstack(Xs)
    Ys = np.repeat(np.vstack(Ys), subsample_size, axis=0)
    return torch.from_numpy(Xs).long(), torch.from_numpy(Ys).float()
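# Hypothetical usage sketch (not in the original gist): get_batch flattens
# batch_size samples x subsample_size microbe draws into one long index vector,
# repeating each sample's metabolite row once per draw.
_inp, _out = get_batch(csr_matrix(microbe_counts), metabolite_counts, 0,
                       subsample_size=50, batch_size=5)
print(_inp.shape, _out.shape)  # torch.Size([250]) torch.Size([250, 100])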
## preprocessing for mmvec
trainX = csr_matrix(microbe_counts)
trainY = metabolite_counts
d1 = microbe_counts.shape[1]
d2 = metabolite_counts.shape[1]
epochs = 100
mc_samples = 1
beta1 = 0.8
beta2 = 0.9
gamma = 0.1
step_size = 20
batch_size = 10
subsample_size = 500
microbe_total = microbe_counts.sum()
# optimization parameters
learning_rate = 1e-1
clipnorm = 100
iterations = 1000
# log path
basename = "logdir"
suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
save_path = "_".join([basename, suffix])
## actually fit the model
model = MMvec(num_samples=num_samples, num_microbes=num_microbes,
              num_metabolites=num_metabolites, microbe_total=microbe_total,
              latent_dim=latent_dim, batch_size=batch_size,
              subsample_size=subsample_size, device='cpu')
optimizer = optim.Adam(model.parameters(), betas=(beta1, beta2),
                       lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=step_size, gamma=gamma)
best_loss = np.inf
losses = []
klds = []
likes = []
errs = []
for ep in tqdm(range(0, epochs)):
    model.train()
    scheduler.step()
    # 2 rounds so that both matrices can be iterated
    for _ in range(2):
        for i in range(0, num_samples, model.batch_size):
            optimizer.zero_grad()
            inp, out = get_batch(trainX, trainY, i % num_samples,
                                 model.subsample_size, model.batch_size)
            pred = model.forward(inp)
            loss, kld, like = model.loss(pred, out)
            err = torch.mean(torch.abs(
                F.softmax(pred, dim=1) * metabolite_total - out))
            loss.backward()
            errs.append(err.item())
            losses.append(loss.item())
            klds.append(kld.item())
            likes.append(like.item())
            optimizer.step()
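# Note: `clipnorm` is defined above but never applied. If gradient clipping is
# desired, one option (my suggestion, not the original author's code) would be
# to insert the following between loss.backward() and optimizer.step() inside
# the training loop:
#
#     torch.nn.utils.clip_grad_norm_(model.parameters(), clipnorm)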
## diagnostics for the fit
plt.plot(errs)
plt.xlabel('iterations', fontsize=14)
plt.ylabel('errors', fontsize=14)
plt.plot(-np.array(likes))
plt.xlabel('iterations', fontsize=14)
plt.ylabel('likelihood', fontsize=14)
plt.yscale('log')
plt.plot(klds)
plt.xlabel('iterations', fontsize=14)
plt.ylabel('KL', fontsize=14)
plt.plot(losses)
plt.xlabel('iterations', fontsize=14)
plt.ylabel('elbo', fontsize=14)
plt.yscale('log')
u = np.array(model.encoder.embedding.weight.detach())
v = np.array(model.decoder.mean.weight.detach())
ubias = np.array(model.encoder.bias.weight.detach())
vbias = np.array(model.decoder.mean.bias.detach())
def kl(mu, logvar):
    # per-parameter term 1 + logvar - mu^2 - exp(logvar); this is (twice) the
    # per-element negative KL contribution used in _divergence above
    return 1 + logvar - mu**2 - np.exp(logvar)
sns.distplot(np.ravel(eUmain))
sns.distplot(np.ravel(u))
from scipy.spatial.distance import pdist
plt.scatter(pdist(eUmain), pdist(u))
mu = np.ravel(model.encoder.embedding.weight.detach())
logv = np.ravel(model.encoder.embedding_var.weight.detach())
print(kl(mu, logv).sum())
sns.distplot(kl(mu, logv))
mu = np.ravel(model.encoder.bias.weight.detach())
logv = np.ravel(model.encoder.bias_var.weight.detach())
print(kl(mu, logv).sum())
sns.distplot(kl(mu, logv))
mu = np.ravel(model.decoder.mean.weight.detach())
logv = np.ravel(model.decoder.var.weight.detach())
print(kl(mu, logv).sum())
sns.distplot(kl(mu, logv))
mu = np.ravel(model.decoder.mean.bias.detach())
logv = np.ravel(model.decoder.var.bias.detach())
print(kl(mu, logv).sum())
sns.distplot(kl(mu, logv))
sns.distplot(np.ravel(eVmain))
sns.distplot(np.ravel(v))
plt.scatter(pdist(eVmain.T), pdist(v))
modelU = np.hstack(
    (np.ones((u.shape[0], 1)), ubias, u))
modelV = np.hstack(
    (vbias.reshape(-1, 1), np.ones((v.shape[0], 1)), v)).T
exp = phi - phi.mean(axis=1).reshape(-1, 1)
exp = exp - exp.mean(axis=0)
res = modelU @ modelV
res = res - res.mean(axis=1).reshape(-1, 1)
res = res - res.mean(axis=0)
sns.distplot(exp.ravel())
sns.distplot(res.ravel())
plt.scatter(exp.ravel(), res.ravel())
plt.xlabel('simulated ranks')
plt.ylabel('predicted ranks')
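# spearmanr was imported at the top but never used; as a small addition (not in
# the original gist), a one-number summary of the rank agreement shown in the
# scatter plot above:
print(spearmanr(exp.ravel(), res.ravel()))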