@gildniy · Created April 15, 2021
Classify structured data using Keras Preprocessing Layers
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import \
    Normalization, CategoryEncoding, IntegerLookup
# Signed Kaggle download URL for the heart dataset archive; the signature is
# time-limited, so this exact link eventually expires and must be regenerated
# from the dataset page.
dataset_url = 'https://storage.googleapis.com/kaggle-data-sets/1226038/2047221/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210414%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210414T143749Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=6cefbf3a9411483ae20d76e70c31f465c92380544351145764bdfb678d88c27622dee6c47cb9d2504859c08df24648c576fb1b8d8af192263f8c2b6485157cca2751d166fdaf358243774c4eaa1677c561af42961344b5c011e8bbfdd8fc1237f1101658d29c57b10cd1e9a5bb816461e05ff2190b520a5ae115147203e2e9c2a042787e26e6c314431e9caad65c7203419c3c52097f1ab00416a928aecba4ee78d2b559d1ea29c0ba68736c7ed2c630ef4574092aec1825f36f228f7aacdc44122ec8bff18a6051d9d74a9445924ceec8d0edf1de4741b465766876f4e27a491a20aded988467b4cefa75661d79b0d322e49d6985887307459dae559af523e6'
tf.keras.utils.get_file('archive.zip', dataset_url, extract=True,
                        cache_dir='.')
dataframe = pd.read_csv('datasets/heart.csv')
print(dataframe.shape) # (303, 14)
print(dataframe.head())
# Hold out 20% of the rows for validation, then set aside ~1% of the
# training split as unseen samples for a demo prediction at the end.
train_df, val_df = train_test_split(dataframe, test_size=0.2)
train_df, pred_df = train_test_split(train_df, test_size=0.01)
def df_to_dataset(df, predictor, shuffle=True, batch_size=32):
    # Wrap a DataFrame in a tf.data.Dataset of (features dict, label) pairs.
    df = df.copy()
    labels = df.pop(predictor)
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    return ds
train_ds = df_to_dataset(train_df, 'output', batch_size=25)
val_ds = df_to_dataset(val_df, 'output', shuffle=False, batch_size=25)
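# Optional sanity check (illustrative only, safe to delete): take one batch
# to confirm the (features dict, labels) structure produced by df_to_dataset.
for feature_batch, label_batch in train_ds.take(1):
    print('Feature keys:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of labels:', label_batch)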
def get_normalization_layer(name, dataset):
    # Learn this feature's mean and variance from the training data.
    normalizer = Normalization()
    feature_ds = dataset.map(lambda x, y: x[name])
    normalizer.adapt(feature_ds)
    return normalizer
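# Illustrative sketch of what adapt() learns here: Normalization estimates
# the mean and variance of the values it sees, then standardizes inputs to
# roughly zero mean and unit variance. The toy values below are arbitrary.
_demo_norm = Normalization()
_demo_norm.adapt(tf.constant([[1.0], [2.0], [3.0]]))
print(_demo_norm(tf.constant([[2.0]])))  # ~[[0.]] since 2.0 is the sample mean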
def get_category_encoding_layer(name, dataset, max_tokens=None):
    # Map raw integer categories to vocabulary indices, then one-hot encode.
    index = IntegerLookup(max_tokens=max_tokens)
    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    # CategoryEncoding is sized from the learned vocabulary; it has no
    # adaptable state of its own, so no adapt() call is needed for it.
    encoder = CategoryEncoding(num_tokens=index.vocabulary_size())
    return lambda feature: encoder(index(feature))
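# Illustrative sketch of the two-step encoding above: IntegerLookup maps raw
# category values to vocabulary indices (with reserved slots for
# out-of-vocabulary and, depending on the TF version, mask values), and
# CategoryEncoding expands each index into a one-hot vector. Toy values only.
_demo_lookup = IntegerLookup()
_demo_lookup.adapt(tf.constant([10, 20, 30]))
_demo_encoder = CategoryEncoding(num_tokens=_demo_lookup.vocabulary_size())
print(_demo_encoder(_demo_lookup(tf.constant([[20]]))))  # one-hot row for 20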
all_inputs = []
encoded_features = []
numeric_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak', 'slp']
for header in numeric_cols:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    all_inputs.append(numeric_col)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    encoded_features.append(encoded_numeric_col)
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'caa', 'thall']
for header in categorical_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
    all_inputs.append(categorical_col)
    # max_tokens caps the learned vocabulary (the cap includes the reserved
    # out-of-vocabulary slot); rarer values fall back to the OOV index.
    encoding_layer = get_category_encoding_layer(header, train_ds,
                                                 max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    encoded_features.append(encoded_categorical_col)
def build_model(n_units):
    all_features = tf.keras.layers.concatenate(encoded_features)
    x = tf.keras.layers.Dense(n_units, activation="relu")(all_features)
    x = tf.keras.layers.Dropout(0.5)(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(all_inputs, output)
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=["accuracy"])
    return model
model = build_model(32)
# plot_model needs pydot and graphviz installed; skip it if they're missing.
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")
model.summary()
model.fit(train_ds, validation_data=val_ds, epochs=50)
loss, accuracy = model.evaluate(val_ds)
print("\nLoss: ", loss)
print("Accuracy: ", accuracy)
sample = list(pred_df.to_dict('index').values())[0]
input_dict = {name: tf.convert_to_tensor([value])
              for name, value in sample.items() if name != 'output'}
predictions = model.predict(input_dict)
print(f"\nThis particular patient had a {100 * predictions[0][0]:.1f}% "
f"probability of having a heart disease, as evaluated by our model.")