Skip to content

Instantly share code, notes, and snippets.

@KeitaTakenouchi
Last active April 1, 2021 14:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KeitaTakenouchi/7a9911f36aefbbd01860e7a5c3dce18a to your computer and use it in GitHub Desktop.
Save KeitaTakenouchi/7a9911f36aefbbd01860e7a5c3dce18a to your computer and use it in GitHub Desktop.
Pytorch Tabular Example
# !pip install torch torchvision torchaudio
# !pip install pytorch_tabular[all]
## Prepare utility functions
from sklearn.datasets import make_classification
def make_mixed_classification(n_samples, n_features, n_categories):
    """Build a synthetic classification dataset with mixed column types.

    Features are generated with ``make_classification`` (fixed
    ``random_state=42``, 5 informative features). A random subset of the
    columns is then replaced by its 4-quantile bin code so that those
    columns can be treated as categorical features.

    Note that the column subset is drawn from the global ``random`` module
    state, which this function does not seed; call ``random.seed(...)``
    beforehand if a reproducible column choice is needed.

    Args:
        n_samples: Number of rows to generate.
        n_features: Total number of feature columns.
        n_categories: Number of distinct columns to convert to categorical.
            Values above the number of features are capped to it, and
            negative values are treated as 0.

    Returns:
        A tuple ``(data, cat_col_names, num_col_names)``: ``data`` is a
        DataFrame with every feature column plus a ``"target"`` column;
        the two lists hold the ``cat_col_<i>`` and ``num_col_<i>`` column
        names, in column order.
    """
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=42,
        n_informative=5,
    )
    n_total = X.shape[-1]

    # Sample WITHOUT replacement: random.choices could return the same index
    # several times, silently yielding fewer categorical columns than asked.
    n_cat = max(0, min(n_categories, n_total))
    cat_cols = set(random.sample(range(n_total), k=n_cat))

    # Replace each selected column by its quantile-bin code (q=4 bins).
    for col in sorted(cat_cols):
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)

    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(n_total):
        # Every column is either categorical or numeric, never both.
        if i in cat_cols:
            name = f"cat_col_{i}"
            cat_col_names.append(name)
        else:
            name = f"num_col_{i}"
            num_col_names.append(name)
        col_names.append(name)

    X = pd.DataFrame(X, columns=col_names)
    y = pd.Series(y, name="target")
    data = X.join(y)
    return data, cat_col_names, num_col_names
## Obtain training data
from sklearn.model_selection import train_test_split
import random
import pandas as pd
# Generate a small synthetic dataset: 100 rows, 20 features, with 4 columns
# requested as categorical.
data, cat_col_names, num_col_names = make_mixed_classification(
    n_samples=100,
    n_features=20,
    n_categories=4,
)

# Hold out a test split first, then carve a validation split out of the
# remaining training rows. Both splits use the same fixed seed.
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)
## Define a machine learning model using Pytorch Tabular
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
# Describe the dataset: which column is the label and which feature columns
# are continuous vs. categorical.
data_config = DataConfig(
    # The target must always be given as a list. Multiple targets are only
    # supported for regression; multi-task classification is not implemented.
    target=['target'],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# Training-loop settings.
trainer_config = TrainerConfig(
    # Run the LR finder to derive a learning rate automatically.
    auto_lr_find=True,
    batch_size=1024,
    max_epochs=100,
    # Original note: "index of the GPU to use. 0, means CPU".
    # NOTE(review): confirm the meaning of `gpus` against the installed
    # pytorch_tabular version.
    gpus=1,
)

# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# Model definition for a classification task.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    # Number of nodes in each layer.
    layers="1024-512-512",
    # Activation applied between layers.
    activation="LeakyReLU",
    learning_rate=1e-2,
)

# Bundle the four configs into the model wrapper that is trained below.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
## Start learning
# If a Jupyter widget/JavaScript error occurs while running this, see
# https://stackoverflow.com/questions/43769068/jupyter-notebook-widget-javascript-not-detected
# Train on the training split, passing the validation split alongside it.
tabular_model.fit(train=train, validation=val)
# Evaluate the fitted model on the held-out test split.
result = tabular_model.evaluate(test)
# Run prediction on the same test split and keep the output.
pred_df = tabular_model.predict(test)
# Save the trained model under "examples/basic"
# (presumably a directory path relative to the working directory; confirm).
tabular_model.save_model("examples/basic")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment