# Created: February 23, 2020 17:55
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from umap.umap_ import fuzzy_simplicial_set, find_ab_params
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import SpectralEmbedding
from scipy.sparse import save_npz, load_npz
import random
from functools import partial
# Hyperparameters.
MIN_DIST = 0.1   # UMAP min_dist, used to fit the a, b curve parameters
SPREAD = 1.0     # UMAP spread, used to fit the a, b curve parameters
EPS = 1e-12      # numerical floor inside log()
N_EPOCHS = 50
NEG_RATE = 5     # negative samples per positive edge sample
BATCH_SIZE = 4096 * NEG_RATE
D_GRAD_CLIP = 19006880743424  # clamp for the distance-gradient hook; this huge value makes the clamp effectively a no-op
DATA_NPZ_PATH = 'mnist_70000.npz'
def get_activation(act):
    if act == 'lrelu':
        return nn.LeakyReLU(0.2, inplace=True)
    elif act == 'relu':
        return nn.ReLU(inplace=True)
    raise Exception('unsupported activation function')
class FCEncoder(nn.Module):
    def __init__(self, dim, num_layers=3, act='lrelu'):
        super(FCEncoder, self).__init__()
        self.dim = dim
        self.num_layers = num_layers
        self.act = partial(get_activation, act=act)
        hidden_dim = 256
        layers = [
            nn.Linear(dim, hidden_dim * 2),
            self.act(),
            nn.Linear(hidden_dim * 2, hidden_dim),
            self.act(),
        ]
        # Build num_layers distinct hidden blocks. (The original multiplied a
        # two-element list by num_layers, which registers the *same* Linear
        # module repeatedly and therefore shares its weights across depth.)
        for _ in range(num_layers):
            layers += [
                nn.Linear(hidden_dim, hidden_dim),
                self.act(),
            ]
        layers += [
            nn.Linear(hidden_dim, 2),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, X):
        return self.net(X)
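# Shape sanity check (illustrative, not part of the original gist):
# FCEncoder(784, num_layers=5)(torch.randn(8, 784)) has shape (8, 2),
# i.e. the network maps 784-dimensional MNIST vectors to a 2-D embedding.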
def make_graph(P, n_epochs=-1):
    # Prune weak edges, mirroring UMAP's own preprocessing: edges whose
    # membership strength is below max/n_epochs would never be sampled within
    # n_epochs, so they are dropped from the COO graph.
    graph = P.tocoo()
    graph.sum_duplicates()
    n_vertices = graph.shape[1]
    if n_epochs <= 0:
        # For smaller datasets we can use more epochs
        if graph.shape[0] <= 10000:
            n_epochs = 500
        else:
            n_epochs = 200
    graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0
    graph.eliminate_zeros()
    return graph

def make_epochs_per_sample(weights, n_epochs):
    # Edge i is sampled once every epochs_per_sample[i] epochs, so stronger
    # edges (larger membership weight) are sampled more often.
    result = -1.0 * np.ones(weights.shape[0], dtype=np.float64)
    n_samples = n_epochs * (weights / weights.max())
    result[n_samples > 0] = float(n_epochs) / n_samples[n_samples > 0]
    return result
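# Quick worked example of the sampling schedule (illustrative only):
#   make_epochs_per_sample(np.array([1.0, 0.5, 0.1]), 10) -> [1., 2., 10.]
# The strongest edge is sampled every epoch, the 0.5-weight edge every 2
# epochs, and the 0.1-weight edge only every 10 epochs.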
def neg_squared_euc_dists(X):
    # Negative pairwise squared Euclidean distances between the rows of X.
    sum_X = X.pow(2).sum(dim=1)
    D = (-2 * X @ X.transpose(1, 0) + sum_X).transpose(1, 0) + sum_X
    return -D

def w_tsne(Y, a, b):
    distances = neg_squared_euc_dists(Y)
    inv_distances = 1. / (1. - a * (distances))  # 1 / (1 + a*d^2); note the exponent b is not applied here
    return inv_distances

def KLD(P, Q):
    return P * torch.log((P + EPS) / Q)

def CE(V, W):
    return - V * torch.log(W + EPS) - (1 - V) * torch.log(1 - W + EPS)

def MXLK(P, w, gamma=7.0):
    return P * torch.log(w + EPS) + gamma * (1 - P) * torch.log(1 - w + EPS)
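# Note: these dense helpers (full pairwise weight matrix, KL / cross-entropy)
# are only referenced by the commented-out lines near the end of the training
# loop. The loop itself optimizes the same objective stochastically: for each
# sampled pair it uses the low-dimensional weight w = 1 / (1 + a * d^(2b)) and
# the per-pair log terms log(w) (attraction) and log(1 - w) (repulsion).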
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

print('load data')
mnist = np.load(DATA_NPZ_PATH)
data = mnist['data']

print('estimate a, b')
ua, ub = find_ab_params(SPREAD, MIN_DIST)
# ua, ub = 1.0, 1.0
# ub = 1.0
print('a:', ua, 'b:', ub)
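# For the defaults above (spread=1.0, min_dist=0.1), find_ab_params returns
# roughly a ~ 1.58 and b ~ 0.90 (the usual UMAP curve-fit values).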
print('calc V')
try:
    V_csc = load_npz('V_csc.npz')
    print('Use V cache')
except IOError:
    print('Use new V')
    # umap-learn 0.3.x: fuzzy_simplicial_set returns the sparse fuzzy graph
    # directly (later versions return a tuple).
    V_csc = fuzzy_simplicial_set(data, n_neighbors=15,
                                 random_state=np.random.RandomState(42), metric='euclidean')
    save_npz('V_csc', V_csc)
# V = torch.Tensor(V_csc.toarray())
print('make_graph')
graph = make_graph(V_csc, N_EPOCHS)

print('make_epochs_per_sample')
epochs_per_sample = make_epochs_per_sample(graph.data, N_EPOCHS)
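# epochs_per_sample[i] is the number of epochs between two samples of edge i;
# below, each positive sample of an edge triggers on average NEG_RATE negative
# samples, drawn uniformly at random from all points.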
print('Trying to put X into GPU')
X = torch.from_numpy(data).float()
X = X.to(device)
# X = X.float()

print('Constructing NN')
encoder = FCEncoder(784, num_layers=5)
encoder = encoder.to(device)
encoder = encoder.float()

init_lr = 1e-3
optimizer = optim.SGD(encoder.parameters(), lr=init_lr, weight_decay=0)

# Negative-sampling schedule bookkeeping, as in UMAP's edge-sampling optimizer.
epochs_per_negative_sample = epochs_per_sample / NEG_RATE
epoch_of_next_negative_sample = epochs_per_negative_sample.copy()
epoch_of_next_sample = epochs_per_sample.copy()

# head[i], tail[i] are the endpoints of edge i in the fuzzy graph.
head = graph.row
tail = graph.col
rnd_max_idx = X.shape[0]
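# Each epoch: (1) collect every edge that is "due" this epoch as a positive
# pair plus its uniformly-drawn negative pairs, (2) shuffle the pairs and run
# minibatch SGD on the sampled attraction/repulsion loss, (3) re-embed all
# points for checkpointing, and (4) decay the learning rate linearly to zero
# over N_EPOCHS.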
print('optimizing...')
for epoch in range(1, N_EPOCHS):
    # Collect positive edges due this epoch, plus their negative samples.
    batch_i = []
    batch_j = []
    batch_neg_i = []
    for i in range(epochs_per_sample.shape[0]):
        if epoch_of_next_sample[i] <= epoch:
            i_idx, j_idx = head[i], tail[i]
            batch_i.append(i_idx)
            batch_j.append(j_idx)
            epoch_of_next_sample[i] += epochs_per_sample[i]

            n_neg_samples = int(
                (epoch - epoch_of_next_negative_sample[i])
                / epochs_per_negative_sample[i]
            )
            for _ in range(n_neg_samples):
                batch_neg_i.append(i_idx)
            epoch_of_next_negative_sample[i] += (
                n_neg_samples * epochs_per_negative_sample[i]
            )
    # Negative "tails" are drawn uniformly at random from all points.
    batch_neg_j = torch.randint(0, rnd_max_idx, (len(batch_neg_i),)).tolist()
    # batch_r marks each pair: 0 = positive edge, 1 = negative sample.
    batch_r = torch.zeros(len(batch_i), dtype=torch.long).tolist() + torch.ones(len(batch_neg_i), dtype=torch.long).tolist()
    batch_i += batch_neg_i
    batch_j += batch_neg_j

    # Shuffle positives and negatives together before minibatching.
    rnd_perm = torch.randperm(len(batch_i))
    batch_i = torch.tensor(batch_i, dtype=torch.long)[rnd_perm]
    batch_j = torch.tensor(batch_j, dtype=torch.long)[rnd_perm]
    batch_r = torch.tensor(batch_r, dtype=torch.long)[rnd_perm]

    for i in range(0, len(batch_i), BATCH_SIZE):
        bi = batch_i[i:i+BATCH_SIZE]
        bj = batch_j[i:i+BATCH_SIZE]
        br = batch_r[i:i+BATCH_SIZE]
        optimizer.zero_grad()
        Y_bi = encoder(X[bi])
        Y_bj = encoder(X[bj])
        # Stop gradients flowing into the tail points of negative samples.
        Y_bj[br == 1] = Y_bj[br == 1].detach()
        d = (Y_bi - Y_bj).pow(2).sum(dim=1)
        d.register_hook(lambda grad: grad.clamp(min=-D_GRAD_CLIP, max=D_GRAD_CLIP))
        dp = d.pow(ub)
        w = (1 / (1 + ua * dp)).clamp(min=0, max=1)  # w = 1 / (1 + a * d^(2b))
        pw = w[br == 0]  # positive pairs: pulled together
        rw = w[br == 1]  # negative pairs: pushed apart
        loss = - (torch.log(pw + EPS)).sum()
        loss += - (torch.log(1 - rw + EPS)).sum()
        loss.backward()
        torch.nn.utils.clip_grad_value_(encoder.parameters(), 4)
        optimizer.step()

    # Embed the full dataset for checkpointing.
    with torch.no_grad():
        Y = encoder(X)
    # w = w_tsne(Y, ua, ub).clamp(min=0, max=1)
    # loss = CE(V, w).sum()

    # Linear learning-rate decay, as in UMAP's optimizer.
    new_lr = (1 - epoch / N_EPOCHS) * init_lr
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr

    np.savez_compressed('umap_fast_nn_Y', Y=Y.detach().cpu().numpy())
    np.savez_compressed('umap_nn/{:04d}'.format(epoch), Y=Y.detach().cpu().numpy())  # requires an existing ./umap_nn directory
    print("{:04d}".format(epoch), "{:.7f}".format(new_lr), "{:.2f}".format(loss.item()))
print('Done.')
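# --- Optional: visualize the final embedding --------------------------------
# A minimal sketch, not part of the original gist (which imports matplotlib but
# never plots): load the checkpoint written above and scatter-plot it. The
# output filename 'umap_fast_nn_Y.png' is an added assumption.
emb = np.load('umap_fast_nn_Y.npz')['Y']
plt.figure(figsize=(8, 8))
plt.scatter(emb[:, 0], emb[:, 1], s=0.5, alpha=0.5)
plt.title('NN-parameterized UMAP embedding of MNIST')
plt.savefig('umap_fast_nn_Y.png', dpi=150)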
# UMAP version: '0.3.10'