# Gist by @ypeleg (last active June 1, 2021).
import os
import math
import pickle
import shutil
import joblib
import warnings
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import cluster
from functools import reduce
from functools import partial
from joblib import dump, load
warnings.filterwarnings("ignore")
from tqdm.autonotebook import tqdm
import tensorflow_probability as tfp
from sklearn.preprocessing import OneHotEncoder
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import ignore_warnings
from sklearn.mixture import BayesianGaussianMixture
from tqdm.contrib.concurrent import process_map, thread_map
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
def pkl_dump(obj, path): pickle.dump(obj, open(path, 'wb'), protocol=4)
def pkl_load(path): return pickle.load(open(path, 'rb'))
class ConditionalGenerator:
@classmethod
def from_dict(cls, in_dict):
new_instance = ConditionalGenerator()
new_instance.__dict__ = in_dict
return new_instance
def __init__(self, data=None, output_info=None, log_frequency=None):
if data is None or output_info is None or log_frequency is None:
return
self._model = []
start = 0
skip = False
max_interval = 0
counter = 0
for item in output_info:
if item[1] == 'tanh':
start += item[0]
skip = True
continue
if item[1] == 'softmax':
if skip:
skip = False
start += item[0]
continue
end = start + item[0]
max_interval = max(max_interval, end - start)
counter += 1
self._model.append(np.argmax(data[:, start:end], axis=-1))
start = end
else:
assert 0
assert start == data.shape[1]
self._interval = []
self._n_col = 0
self.n_opt = 0
skip = False
start = 0
self._p = np.zeros((counter, max_interval))
for item in output_info:
if item[1] == 'tanh':
skip = True
start += item[0]
continue
if item[1] == 'softmax':
if skip:
start += item[0]
skip = False
continue
end = start + item[0]
tmp = np.sum(data[:, start:end], axis=0)
if log_frequency:
tmp = np.log(tmp + 1)
tmp = tmp / np.sum(tmp)
self._p[self._n_col, :item[0]] = tmp
self._interval.append((self.n_opt, item[0]))
self.n_opt += item[0]
self._n_col += 1
start = end
else:
assert 0
self._interval = np.asarray(self._interval)
def _random_choice_prob_index(self, idx):
prob = self._p[idx]
rand = np.expand_dims(np.random.rand(prob.shape[0]), axis=1)
return (prob.cumsum(axis=1) > rand).argmax(axis=1)
def sample(self, batch_size):
if self._n_col == 0: return None
col_idx = np.random.choice(np.arange(self._n_col), batch_size)
cond = np.zeros((batch_size, self.n_opt), dtype='float32')
mask = np.zeros((batch_size, self._n_col), dtype='float32')
mask[np.arange(batch_size), col_idx] = 1
opt_idx = self._random_choice_prob_index(col_idx)
opt = self._interval[col_idx, 0] + opt_idx
cond[np.arange(batch_size), opt] = 1
return cond, mask, col_idx, opt_idx
def sample_zero(self, batch_size):
if self._n_col == 0: return None
vec = np.zeros((batch_size, self.n_opt), dtype='float32')
idx = np.random.choice(np.arange(self._n_col), batch_size)
for i in range(batch_size):
col = idx[i]
pick = int(np.random.choice(self._model[col]))
vec[i, pick + self._interval[col, 0]] = 1
return vec
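# Worked example (sketch): for output_info like
# [(1, 'tanh', 1), (3, 'softmax', 1), (4, 'softmax', 0)], the tanh block and the 3-way
# softmax that follows it describe one continuous column and are skipped, so the
# conditional vector covers only the 4-category discrete column (n_opt == 4).
# sample(batch_size) then returns a one-hot `cond` of width 4, a per-discrete-column
# `mask`, and the sampled column/option indices used to pick matching real rows.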
class ResidualLayer(Layer):
def __init__(self, input_dim, output_dim):
super(ResidualLayer, self).__init__()
self._output_dim = output_dim
self._fc = Dense( self._output_dim, input_dim=(input_dim,), kernel_initializer=partial(init_bounded, dim=input_dim), bias_initializer=partial(init_bounded, dim=input_dim))
self._bn = BatchNormalization(epsilon=1e-5, momentum=0.9)
self._relu = ReLU()
def call(self, inputs, **kwargs):
outputs = self._fc(inputs, **kwargs)
outputs = self._bn(outputs, **kwargs)
outputs = self._relu(outputs, **kwargs)
return tf.concat([outputs, inputs], 1)
class GenActivation(Layer):
def __init__(self, input_dim, output_dim, transformer_info, tau):
super(GenActivation, self).__init__()
self._output_dim = output_dim
self._transformer_info = transformer_info
self._tau = tau
self._fc = Dense(
output_dim, input_dim=(input_dim,),
kernel_initializer=partial(init_bounded, dim=input_dim),
bias_initializer=partial(init_bounded, dim=input_dim))
def call(self, inputs, **kwargs):
outputs = self._fc(inputs, **kwargs)
data_t = tf.zeros(tf.shape(outputs))
for idx in self._transformer_info:
act = tf.where(
idx[2] == 0,
tf.math.tanh(outputs[:, idx[0]:idx[1]]),
self._gumbel_softmax(outputs[:, idx[0]:idx[1]], tau=self._tau))
data_t = tf.concat([data_t[:, :idx[0]], act, data_t[:, idx[1]:]], 1)
return outputs, data_t
@tf.function(experimental_relax_shapes=True)
def _gumbel_softmax(self, logits, tau=1.0, hard=False, dim=-1):
gumbel_dist = tfp.distributions.Gumbel(loc=0, scale=1)
gumbels = gumbel_dist.sample(tf.shape(logits))
gumbels = (logits + gumbels) / tau
output = tf.nn.softmax(gumbels, dim)
if hard:
index = tf.math.reduce_max(output, 1, keepdims=True)
output_hard = tf.cast(tf.equal(output, index), output.dtype)
output = tf.stop_gradient(output_hard - output) + output
return output
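# Note on _gumbel_softmax (a brief sketch of the technique): adding Gumbel(0, 1) noise to
# the logits and taking softmax(. / tau) gives a differentiable relaxation of categorical
# sampling; with hard=True the straight-through trick
# (stop_gradient(one_hot - soft) + soft) outputs a one-hot sample in the forward pass
# while gradients flow through the soft probabilities.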
class Generator(Model):
def __init__(self, input_dim, gen_dims, data_dim, transformer_info, tau):
super(Generator, self).__init__()
self._input_dim = input_dim
self._model = list()
dim = input_dim
for layer_dim in list(gen_dims):
self._model += [ResidualLayer(dim, layer_dim)]
dim += layer_dim
self._model += [GenActivation(dim, data_dim, transformer_info, tau)]
def call(self, inputs, **kwargs):
outputs = inputs
for layer in self._model:
outputs = layer(outputs, **kwargs)
return outputs
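# The Generator stacks ResidualLayer blocks, each of which concatenates its
# ReLU(BatchNorm(Dense)) output with its own input (so the feature width grows by
# layer_dim per block), and finishes with GenActivation, which returns both the raw
# logits and the activated data (tanh for continuous parts, Gumbel-softmax for
# categorical parts).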
class OHE(OneHotEncoder):
def __eq__(self, other):
try:
np.testing.assert_equal(self.__dict__, other.__dict__)
return True
except AssertionError:
return False
class BGM(BayesianGaussianMixture):
def __eq__(self, other):
try:
np.testing.assert_equal(self.__dict__, other.__dict__)
return True
except AssertionError: return False
def _initialize_parameters(self, X, random_state):
n_samples, _ = X.shape
if self.init_params == 'kmeans':
resp = np.zeros((n_samples, self.n_components))
try:
from cuml.cluster import KMeans
label = KMeans(n_clusters=self.n_components, n_init=1).fit(X).labels_
except Exception:  # fall back to scikit-learn's CPU KMeans when cuML (GPU) is unavailable
label = cluster.KMeans(n_clusters=self.n_components, n_init=1, random_state=random_state).fit(X).labels_
resp[np.arange(n_samples), label] = 1
elif self.init_params == 'random':
resp = random_state.rand(n_samples, self.n_components)
resp /= resp.sum(axis=1)[:, np.newaxis]
else: raise ValueError("Unimplemented initialization method '%s'" % self.init_params)
self._initialize(X, resp)
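# OHE and BGM only add an attribute-wise __eq__ (handy for comparing fitted transformers),
# and BGM overrides _initialize_parameters so that k-means initialization can use cuML's
# GPU KMeans when it is installed, falling back to scikit-learn's KMeans otherwise.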
class Sampler(object):
def __init__(self, data, output_info):
super(Sampler, self).__init__()
self.data = data
self.model = []
self.n = len(data)
st = 0
skip = False
for item in output_info:
if item[1] == 'tanh':
st += item[0]
skip = True
elif item[1] == 'softmax':
if skip:
skip = False
st += item[0]
continue
ed = st + item[0]
tmp = []
for j in range(item[0]):
tmp.append(np.nonzero(data[:, st + j])[0])
self.model.append(tmp)
st = ed
else:
assert 0
assert st == data.shape[1]
def sample(self, n, col, opt):
if col is None:
idx = np.random.choice(np.arange(self.n), n)
return self.data[idx]
idx = []
for c, o in zip(col, opt):
idx.append(np.random.choice(self.model[c][o]))
return self.data[idx]
class DataTransformer(object):
@classmethod
def from_dict(cls, in_dict):
new_instance = DataTransformer()
new_instance.__dict__ = in_dict
return new_instance
def __init__(self, n_clusters=10, epsilon=0.005, parallel_preprocess=None):
self._parallel_preprocess = parallel_preprocess
self._n_clusters = n_clusters
self._epsilon = epsilon
self._is_dataframe = None
self._meta = None
self._dtypes = None
self.output_info = None
self.output_dimensions = None
self.output_tensor = None
self.cond_tensor = None
def generate_tensors(self):
if self.output_info is None: raise AttributeError("Output info still not available")
output_info = []
cond_info = []
i = 0
st_idx = 0
st_c = 0
for item in self.output_info:
ed_idx = st_idx + item[0]
if not item[2]: # discrete column (flag 0): register one condition slot per category
ed_c = st_c + item[0]
cond_info.append(tf.constant([st_idx, ed_idx, st_c, ed_c, i], dtype=tf.int32))
st_c = ed_c
i += 1
output_info.append(tf.constant([st_idx, ed_idx, int(item[1] == 'softmax')], dtype=tf.int32))
st_idx = ed_idx
self.output_tensor = output_info
self.cond_tensor = cond_info
@ignore_warnings(category=ConvergenceWarning)
def _fit_continuous(self, column, data):
vgm = BGM(
self._n_clusters,
weight_concentration_prior_type='dirichlet_process',
weight_concentration_prior=0.001,
n_init=1
)
vgm.fit(data)
components = vgm.weights_ > self._epsilon
num_components = components.sum()
return {
'name': column,
'model': vgm,
'components': components,
'output_info': [(1, 'tanh', 1), (num_components, 'softmax', 1)],
'output_dimensions': 1 + num_components,
}
def _fit_discrete(self, column, data):
ohe = OHE(sparse=False)
ohe.fit(data)
categories = len(ohe.categories_[0])
return {
'name': column,
'encoder': ohe,
'output_info': [(categories, 'softmax', 0)],
'output_dimensions': categories
}
def fit(self, data, discrete_columns=tuple()):
self.output_info = []
self.output_dimensions = 0
if not isinstance(data, pd.DataFrame):
self._is_dataframe = False
data = pd.DataFrame(data)
else:
self._is_dataframe = True
self._dtypes = data.infer_objects().dtypes
self._meta = []
if self._parallel_preprocess:
args = []
for column in data.columns: args.append((self._n_clusters, self._epsilon, data, column, discrete_columns))
res = process_map(fit_component, args, max_workers=2)
for out_info, out_dim, mets in res:
self.output_info += out_info
self.output_dimensions += out_dim
self._meta.append(mets)
else:
for column in tqdm(data.columns, desc='fitting preprocessing', leave=False):
column_data = data[[column]].values
if column in discrete_columns:
meta = self._fit_discrete(column, column_data)
else:
meta = self._fit_continuous(column, column_data)
self.output_info += meta['output_info']
self.output_dimensions += meta['output_dimensions']
self._meta.append(meta)
def _transform_continuous(self, column_meta, data):
components = column_meta['components']
model = column_meta['model']
means = model.means_.reshape((1, self._n_clusters))
stds = np.sqrt(model.covariances_).reshape((1, self._n_clusters))
features = (data - means) / (4 * stds)
probs = model.predict_proba(data)
n_opts = components.sum()
features = features[:, components]
probs = probs[:, components]
opt_sel = np.zeros(len(data), dtype='int')
for i in range(len(data)):
norm_probs = probs[i] + 1e-6
norm_probs = norm_probs / norm_probs.sum()
opt_sel[i] = np.random.choice(np.arange(n_opts), p=norm_probs)
idx = np.arange(len(features))
features = features[idx, opt_sel].reshape([-1, 1])
features = np.clip(features, -.99, .99)
probs_onehot = np.zeros_like(probs)
probs_onehot[np.arange(len(probs)), opt_sel] = 1
return [features, probs_onehot]
def _transform_discrete(self, column_meta, data):
encoder = column_meta['encoder']
return encoder.transform(data)
def transform(self, data):
if not isinstance(data, pd.DataFrame):
data = pd.DataFrame(data)
values = []
args = []
for meta in tqdm(self._meta):
column_data = data[[meta['name']]].values
args.append((self, meta, column_data))
rets = process_map(_transform, args)
for ret in rets:
values += ret
# for meta in tqdm(self._meta):
# column_data = data[[meta['name']]].values
# if 'model' in meta:
# values += self._transform_continuous(meta, column_data)
# else:
# values.append(self._transform_discrete(meta, column_data))
return np.concatenate(values, axis=1).astype(float)
def _inverse_transform_continuous(self, meta, data, sigma=None):
model = meta['model']
components = meta['components']
mean = data[:, 0]
variance = data[:, 1:]
if sigma is not None:
mean = np.random.normal(mean, sigma)
mean = np.clip(mean, -1, 1)
v_t = np.ones((len(data), self._n_clusters)) * -100
v_t[:, components] = variance
variance = v_t
means = model.means_.reshape([-1])
stds = np.sqrt(model.covariances_).reshape([-1])
p_argmax = np.argmax(variance, axis=1)
std_t = stds[p_argmax]
mean_t = means[p_argmax]
column = mean * 4 * std_t + mean_t
return column
def _inverse_transform_discrete(self, meta, data):
encoder = meta['encoder']
return encoder.inverse_transform(data)
def inverse_transform(self, data, sigmas=None):
start = 0
output = []
column_names = []
for meta in self._meta:
dimensions = meta['output_dimensions']
columns_data = data[:, start:start + dimensions]
if 'model' in meta:
sigma = sigmas[start] if sigmas else None
inverted = self._inverse_transform_continuous(
meta, columns_data, sigma)
else:
inverted = self._inverse_transform_discrete(meta, columns_data)
output.append(inverted)
column_names.append(meta['name'])
start += dimensions
output = np.column_stack(output)
output = pd.DataFrame(output, columns=column_names)\
.astype(self._dtypes)
if not self._is_dataframe:
output = output.values
return output
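# Summary of the encoding (sketch): each continuous column becomes one tanh-scaled value,
# (x - mode_mean) / (4 * mode_std) clipped to (-0.99, 0.99), plus a one-hot over the
# Bayesian-GMM modes whose weights exceed epsilon, i.e. 1 + num_components output
# dimensions; each discrete column becomes a plain one-hot of its categories.
# inverse_transform reverses both mappings column by column.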
def _fit_discrete(column, data):
ohe = OHE(sparse=False)
ohe.fit(data)
categories = len(ohe.categories_[0])
return {
'name': column,
'encoder': ohe,
'output_info': [(categories, 'softmax', 0)],
'output_dimensions': categories
}
def _fit_continuous(_n_clusters, _epsilon, column, data):
vgm = BGM(
_n_clusters,
weight_concentration_prior_type='dirichlet_process',
weight_concentration_prior=0.001,
n_init=1
)
vgm.fit(data)
components = vgm.weights_ > _epsilon
num_components = components.sum()
return {
'name': column,
'model': vgm,
'components': components,
'output_info': [(1, 'tanh', 1), (num_components, 'softmax', 1)],
'output_dimensions': 1 + num_components,
}
def fit_component(args):
_n_clusters, _epsilon, data, column, discrete_columns = args
column_data = data[[column]].values
if column in discrete_columns:
meta = _fit_discrete(column, column_data)
else:
meta = _fit_continuous(_n_clusters, _epsilon, column, column_data)
return meta['output_info'], meta['output_dimensions'], meta
def _transform(args):
self, meta, column_data = args
if 'model' in meta:
return self._transform_continuous(meta, column_data)
else:
return [self._transform_discrete(meta, column_data)]
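# These module-level _fit_discrete/_fit_continuous/fit_component/_transform helpers appear
# to duplicate the DataTransformer methods so that process_map can ship the per-column
# work to worker processes during parallel preprocessing.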
class TabularGAN(object):
def __init__(self, file_path=None, log_dir=None, z_dim=128, pac=10, gen_dim=(256, 256), crt_dim=(256, 256), l2_scale=1e-6, batch_size=500, gp_lambda=10.0, tau=0.2, parallel=False, transformer=None):
if file_path is not None: self.load(file_path)
else:
if log_dir is not None and not os.path.exists(log_dir): raise NotADirectoryError("Log directory does not exist.")
if batch_size % 2 != 0 or batch_size % pac != 0: raise ValueError("batch_size needs to be an even value divisible by pac.")
self._parallel_preprocess = parallel
self._log_dir = log_dir
self._z_dim = z_dim
self._pac = pac
self._pac_dim = None
self._l2_scale = l2_scale
self._batch_size = batch_size
self._gp_lambda = gp_lambda
self._tau = tau
self._gen_dim = tuple(gen_dim)
self._crt_dim = tuple(crt_dim)
self._g_opt = Adam(learning_rate=2e-4, beta_1=0.5, beta_2=0.9, epsilon=1e-08)
self._c_opt = Adam(learning_rate=2e-4, beta_1=0.5, beta_2=0.9, epsilon=1e-08)
self._transformer = DataTransformer(parallel_preprocess = self._parallel_preprocess) if transformer is None else transformer
self._need_transform = transformer is None
self._data_sampler = None
self._cond_generator = None
self._generator = None
self._critic = None
self.fitted = False
def fit(self, train_data, categorical_cols=tuple(), epochs=300, log_frequency=True):
if categorical_cols is None: categorical_cols = [i for i in list(set(train_data.columns) - set(train_data._get_numeric_data().columns))]
if self._need_transform: self._transformer.fit(train_data, categorical_cols)
# pkl_dump(self._transformer, 'ctgan_dump')
train_data = self._transformer.transform(train_data)
self._transformer.generate_tensors()
self._data_sampler = DataSampler(train_data, self._transformer.output_info)
data_dim = self._transformer.output_dimensions
self._cond_generator = ConditionalGenerator(train_data, self._transformer.output_info, log_frequency)
self._generator = Generator( self._z_dim + self._cond_generator.n_opt, self._gen_dim, data_dim, self._transformer.output_tensor, self._tau)
self._critic = Critic( data_dim + self._cond_generator.n_opt, self._crt_dim, self._pac)
metrics = { 'g_loss': tf.metrics.Mean(), 'cond_loss': tf.metrics.Mean(), 'c_loss': tf.metrics.Mean(), 'gp': tf.metrics.Mean() }
if self._log_dir is not None:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = self._log_dir + '/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
self._generator.build((self._batch_size, self._generator._input_dim))
self._critic.build((self._batch_size, self._critic._input_dim))
steps_per_epoch = max(len(train_data) // self._batch_size, 1)
for epoch in range(epochs):
p_bar = ProgressBar(len(train_data), self._batch_size, epoch, epochs, metrics)
for _ in range(steps_per_epoch):
c_loss, g_p = self._train_c()
metrics['c_loss'](c_loss)
metrics['gp'](g_p)
g_loss, cond_loss = self._train_g()
metrics['g_loss'](g_loss)
metrics['cond_loss'](cond_loss)
p_bar._update(metrics)
if self._log_dir is not None:
with train_summary_writer.as_default():
for met in metrics:
tf.summary.scalar(met, metrics[met].result(), step=epoch)
metrics[met].reset_states()
# p_bar.close()
del p_bar
self.fitted = True
@tf.function
def train_c_step(self, fake_cat, real_cat):
with tf.GradientTape() as tape:
y_fake = self._critic(fake_cat, training=True)
y_real = self._critic(real_cat, training=True)
g_p = gradient_penalty( partial(self._critic, training=True), real_cat, fake_cat, self._pac, self._gp_lambda)
loss = -(tf.reduce_mean(y_real) - tf.reduce_mean(y_fake))
c_loss = loss + g_p
grad = tape.gradient(c_loss, self._critic.trainable_variables)
self._c_opt.apply_gradients( zip(grad, self._critic.trainable_variables))
return loss, g_p
def _train_c(self):
fake_z = tf.random.normal([self._batch_size, self._z_dim])
cond_vec = self._cond_generator.sample(self._batch_size)
if cond_vec is None:
_, _, col_idx, opt_idx = None, None, None, None
real = self._data_sampler.sample(self._batch_size, col_idx, opt_idx)
else:
cond, _, col_idx, opt_idx = cond_vec
cond = tf.convert_to_tensor(cond)
fake_z = tf.concat([fake_z, cond], 1)
perm = np.arange(self._batch_size)
np.random.shuffle(perm)
real = self._data_sampler.sample(self._batch_size, col_idx[perm], opt_idx[perm])
cond_perm = tf.gather(cond, perm)
fake, fake_act = self._generator(fake_z, training=True)
real = tf.convert_to_tensor(real.astype('float32'))
if cond_vec is not None:
fake_cat = tf.concat([fake_act, cond], 1)
real_cat = tf.concat([real, cond_perm], 1)
else:
fake_cat = fake
real_cat = real
return self.train_c_step(fake_cat, real_cat)
@tf.function
def train_g_step(self, fake_z):
with tf.GradientTape() as tape:
_, fake_act = self._generator(fake_z, training=True)
y_fake = self._critic(fake_act, training=True)
g_loss = -tf.reduce_mean(y_fake)
weights = self._generator.trainable_variables
grad = tape.gradient(g_loss, weights)
grad = [grad[i] + self._l2_scale * weights[i] for i in range(len(grad))]
self._g_opt.apply_gradients( zip(grad, self._generator.trainable_variables))
return g_loss, tf.constant(0, dtype=tf.float32)
@tf.function
def train_g_cond_step(self, fake_z, cond, mask, cond_info):
with tf.GradientTape() as tape:
fake, fake_act = self._generator(fake_z, training=True)
y_fake = self._critic(tf.concat([fake_act, cond], 1), training=True)
cond_loss = conditional_loss(cond_info, fake, cond, mask)
g_loss = -tf.reduce_mean(y_fake) + cond_loss
weights = self._generator.trainable_variables
grad = tape.gradient(g_loss, weights)
grad = [grad[i] + self._l2_scale * weights[i] for i in range(len(grad))]
self._g_opt.apply_gradients(zip(grad, self._generator.trainable_variables))
return g_loss, cond_loss
def _train_g(self):
fake_z = tf.random.normal([self._batch_size, self._z_dim])
cond_vec = self._cond_generator.sample(self._batch_size)
if cond_vec is None: return self.train_g_step(fake_z)
cond, mask, _, _ = cond_vec
cond = tf.convert_to_tensor(cond, name="c1")
mask = tf.convert_to_tensor(mask, name="m1")
fake_z = tf.concat([fake_z, cond], 1, name="fake_z")
return self.train_g_cond_step(fake_z, cond, mask, self._transformer.cond_tensor)
def predict(self, n_samples=1000):
if n_samples <= 0: raise ValueError("Invalid number of samples.")
steps = n_samples // self._batch_size + 1
data = []
for _ in tf.range(steps):
fake_z = tf.random.normal([self._batch_size, self._z_dim])
cond_vec = self._cond_generator.sample_zero(self._batch_size)
if cond_vec is not None:
cond = tf.constant(cond_vec)
fake_z = tf.concat([fake_z, cond], 1)
fake = self._generator(fake_z)[1]
data.append(fake.numpy())
data = np.concatenate(data, 0)
data = data[:n_samples]
return self._transformer.inverse_transform(data, None)
def save(self, file_path, overwrite=False):
if file_path is None or len(file_path) == 0: raise NameError("Invalid file_path.")
dir_name = os.path.dirname(file_path)
if len(dir_name) and not os.path.exists(dir_name): raise NotADirectoryError("The file directory does not exist.")
if not overwrite and os.path.exists(file_path): raise FileExistsError( "File already exists. If you wish to replace it, use overwrite=True")
class_dict = {k: v for k, v in self.__dict__.items() if type(v) in [int, float, tuple]}
class_dict['_cond_generator'] = self._cond_generator.__dict__
class_dict['_transformer'] = self._transformer.__dict__
class_dict['_gen_weights'] = self._generator.get_weights()
joblib.dump(class_dict, file_path)
del class_dict
def load(self, file_path):
if file_path is None or len(file_path) == 0: raise NameError("Invalid file_path.")
if not os.path.exists(file_path): raise FileNotFoundError("The provided file_path does not exist.")
class_dict = joblib.load(file_path)
if class_dict is None: raise AttributeError
for key, value in class_dict.items():
if type(value) in [int, float, tuple]: setattr(self, key, value)
self._transformer = DataTransformer.from_dict(class_dict['_transformer'])
self._cond_generator = ConditionalGenerator.from_dict(class_dict['_cond_generator'])
self._generator = Generator( self._z_dim + self._cond_generator.n_opt, self._gen_dim, self._transformer.output_dimensions, self._transformer.output_tensor, self._tau)
self._generator.build((self._batch_size, self._generator._input_dim))
self._generator.set_weights(class_dict['_gen_weights'])
def transform(self, X, augmentation_proba=0.2, concat=False):
X_aug = X.copy()
if not self.fitted: self.fit(X_aug)
auged_data = self.predict(n_samples = int(len(X_aug) * augmentation_proba))
aug_indices = np.random.choice(len(X_aug), len(auged_data), replace=False)  # one index per generated row so the assignment shapes match
if not concat: X_aug[aug_indices] = auged_data
else: X_aug = pd.concat([X_aug, auged_data])
return X_aug
def augment(self, X, augmentation_proba=0.2): return self.transform(X, augmentation_proba=augmentation_proba)
class DataSampler(object):
def __init__(self, data, output_info):
super(DataSampler, self).__init__()
self._data = data
self._model = []
self._n = len(data)
st_idx = 0
skip = False
for item in output_info:
if item[1] == 'tanh':
st_idx += item[0]
skip = True
elif item[1] == 'softmax':
if skip:
skip = False
st_idx += item[0]
continue
ed_idx = st_idx + item[0]
tmp = []
for j in range(item[0]):
tmp.append(np.nonzero(data[:, st_idx + j])[0])
self._model.append(tmp)
st_idx = ed_idx
else:
assert 0
assert st_idx == data.shape[1]
def sample(self, n_samples, col_idx, opt_idx):
if col_idx is None:
idx = np.random.choice(np.arange(self._n), n_samples)
return self._data[idx]
idx = []
for col, opt in zip(col_idx, opt_idx):
idx.append(np.random.choice(self._model[col][opt]))
return self._data[idx]
def init_bounded(shape, **kwargs):
if 'dim' not in kwargs: raise AttributeError('dim not passed as input')
if 'dtype' not in kwargs: raise AttributeError('dtype not passed as input')
dim = kwargs['dim']
d_type = kwargs['dtype']
bound = 1 / math.sqrt(dim)
return tf.random.uniform(shape, minval=-bound, maxval=bound, dtype=d_type)
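# init_bounded samples from U(-1/sqrt(dim), 1/sqrt(dim)); this matches the fan-in bound of
# PyTorch's default Linear initialization, presumably to mirror the reference (PyTorch)
# CTGAN implementation.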
class Critic(Model):
def __init__(self, input_dim, dis_dims, pac):
super(Critic, self).__init__()
self._pac = pac
self._input_dim = input_dim
self._model = [self._reshape_func]
dim = input_dim * self._pac
for layer_dim in list(dis_dims):
self._model += [Dense(layer_dim, input_dim=(dim,), kernel_initializer=partial(init_bounded, dim=dim), bias_initializer=partial(init_bounded, dim=dim)), LeakyReLU(0.2), Dropout(0.5)]
dim = layer_dim
layer_dim = 1
self._model += [Dense(layer_dim, input_dim=(dim,), kernel_initializer=partial(init_bounded, dim=dim), bias_initializer=partial(init_bounded, dim=dim))]
def _reshape_func(self, inputs, **kwargs):
dims = inputs.get_shape().as_list()
return tf.reshape(inputs, [-1, dims[1] * self._pac])
def call(self, inputs, **kwargs):
outputs = inputs
for layer in self._model:
outputs = layer(outputs, **kwargs)
return outputs
import keras
class ProgressBar(keras.utils.generic_utils.Progbar):
@classmethod
def _get_terminal_width(cls):
width = shutil.get_terminal_size(fallback=(200, 24))[0]
return width if width != 0 else 120
def __init__(self, total_samples, batch_size, epoch, num_epochs, metrics):
self._seen = 0
self.log_values = []
postfix = {m: f'{0:6.3f}' for m in metrics}
postfix[1] = 1
str_format = '{n_fmt}/{total_fmt} |{bar}| {rate_fmt} ' \
'ETA: {remaining} Elapsed Time: {elapsed} ' + \
reduce(lambda x, y: x + y,
["%s:{postfix[%s]} " % (m, m) for m in metrics],
"")
super(ProgressBar, self).__init__(
target=(total_samples // batch_size) * batch_size)
self._batch_size = batch_size
def _update(self, metrics):
new_metrics = {}
for met in metrics: new_metrics[met] = metrics[met].result()
for k in new_metrics:
self.log_values.append((k, new_metrics[k]))
self._seen += self._batch_size
super(ProgressBar, self).update(self._seen, self.log_values)
def gradient_penalty(func, real, fake, pac=10, gp_lambda=10.0):
alpha = tf.random.uniform([real.shape[0] // pac, 1, 1], 0., 1.)
alpha = tf.tile(alpha, tf.constant([1, pac, real.shape[1]], tf.int32))
alpha = tf.reshape(alpha, [-1, real.shape[1]])
interpolates = alpha * real + ((1 - alpha) * fake)
with tf.GradientTape() as tape:
tape.watch(interpolates)
pred = func(interpolates)
grad = tape.gradient(pred, [interpolates])[0]
grad = tf.reshape(grad, tf.constant([-1, pac * real.shape[1]], tf.int32))
slopes = tf.math.reduce_euclidean_norm(grad, axis=1)
return tf.reduce_mean((slopes - 1.) ** 2) * gp_lambda
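# gradient_penalty is the WGAN-GP term: real and fake batches are mixed with one random
# alpha per pac-sized group, the critic's gradient at the interpolates is reshaped to one
# row per group, and the penalty is gp_lambda * mean((||grad||_2 - 1)^2).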
def conditional_loss(cond_info, data, cond, mask):
shape = tf.shape(mask)
c_loss = tf.zeros(shape)
for item in cond_info:
data_logsoftmax = data[:, item[0]:item[1]]
cond_vec = tf.math.argmax(cond[:, item[2]:item[3]], 1)
loss = tf.reshape(tf.nn.sparse_softmax_cross_entropy_with_logits(cond_vec, data_logsoftmax), [-1, 1])
c_loss = tf.concat([c_loss[:, :item[-1]], loss, c_loss[:, item[-1]+1:]], 1)
return tf.reduce_sum(c_loss * mask) / tf.cast(shape[0], dtype=tf.float32)
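# Example usage (a minimal sketch; `train.csv` and the categorical column names below are
# hypothetical, not part of the gist):
#
#     import pandas as pd
#     df = pd.read_csv('train.csv')
#     gan = TabularGAN(batch_size=500, pac=10)
#     gan.fit(df, categorical_cols=('cat_col_a', 'cat_col_b'), epochs=300)
#     synthetic = gan.predict(n_samples=10_000)   # rows in the original schema
#     gan.save('tabular_gan.joblib', overwrite=True)
#
#     # Reload the saved generator and sample again:
#     gan2 = TabularGAN(file_path='tabular_gan.joblib')
#     more_rows = gan2.predict(n_samples=1_000)
#
#     # Or append ~20% synthetic rows to the original frame:
#     augmented = gan.transform(df, augmentation_proba=0.2, concat=True)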