Created
May 4, 2020 15:19
-
-
Save zredlined/e2f206a179006650e07fba4f3b0fb275 to your computer and use it in GitHub Desktop.
Gretel synthetic data configuration optimized for EHR datasets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gretel_synthetics.config import LocalConfig
from pathlib import Path  # fix: Path.cwd() is used below but was never imported

# Gretel synthetic-data configuration tuned for EHR (electronic health record)
# datasets. NOTE: these settings are optimized for training on a GPU.
#
# NOTE(review): `dest_file` is not defined in this snippet — it must be set by
# the surrounding notebook/script (a local filepath or S3 URI) before this runs.
config = LocalConfig(
    max_lines=0,              # 0 = read all lines of the training input
    epochs=30,                # 30 epochs for production-quality output
    vocab_size=25000,         # tokenizer vocabulary size
    character_coverage=1.0,   # tokenizer model character coverage percent
    gen_chars=0,              # 0 = no cap on characters per generated line
    gen_lines=0,              # 0 = generate a dataset equal in size to the training set
    rnn_units=256,            # dimensionality of the LSTM output space
    dropout_rate=0.2,         # fraction of the inputs to drop
    dp=True,                  # enable differential privacy during training
    dp_learning_rate=0.015,   # learning rate for the DP optimizer
    dp_noise_multiplier=1.1,  # controls how much noise is added to gradients
    dp_l2_norm_clip=1.0,      # bound optimizer sensitivity to individual training points
    dp_microbatches=256,      # split batches into microbatches for parallelism
    checkpoint_dir=(Path.cwd() / 'checkpoints').as_posix(),
    input_data_path=dest_file,  # filepath or S3 URI of the training data
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment