Skip to content

Instantly share code, notes, and snippets.

@helinwang
Last active October 31, 2021 01:27
Show Gist options
  • Save helinwang/52512c7574c47c56af77d9107385ae2e to your computer and use it in GitHub Desktop.
Save helinwang/52512c7574c47c56af77d9107385ae2e to your computer and use it in GitHub Desktop.
A survey of TensorFlow feature columns
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np
# Identity categorical column keyed on "video_id" with one million buckets.
video_id = tf.feature_column.categorical_column_with_identity(
    key="video_id",
    num_buckets=1000000,
    default_value=0,
)

# Dense id matrix that backs the sparse "video_id" feature.
# NOTE(review): the trailing zeros look like padding -- confirm.
video_id_rows = [[2, 85, 0, 0, 0], [33, 78, 2, 73, 1]]

# Raw input features shared by every feature-column demo in this script.
features = {
    "video_id": tf.sparse.from_dense(video_id_rows),
    "year": tf.constant([[1.0, 2.0], [3.0, 4.0]]),
    "number_str": tf.sparse.from_dense(
        [
            ["2", "85", "", "", ""],
            ["33", "78", "2", "73", "1"],
        ]
    ),
    "keywords": tf.constant(
        [
            ["Tensorflow", "Keras", "RNN"],
            ["LSTM", "CNN", "Tensorflow"],
        ]
    ),
    "latitude": tf.constant([33.7]),
    "longitude": tf.constant([-84.3]),
}

# Show the sparse form of the id matrix, then what the identity column
# produces from the feature dict and from the raw input tensor.
print(tf.sparse.from_dense(video_id_rows))
print(video_id._transform_feature(features))
print(video_id._transform_input_tensor(features["video_id"]))
# Numeric column over the dense float "year" feature.
numeric_feature_column = tf.feature_column.numeric_column("year")
# Print the result explicitly: outside a notebook/REPL a bare expression
# statement discards its value, so this step would otherwise show nothing.
print(numeric_feature_column._transform_feature(features))

# Bucketize "year" using the boundaries 0, 1 and 2.
bucketized_feature_column = tf.feature_column.bucketized_column(
    source_column=numeric_feature_column,
    boundaries=[0, 1, 2],
)
# The bucketized column looks up its source column's transformed tensor in
# the dict it is given, keyed by the source column object itself.
print(
    bucketized_feature_column._transform_feature(
        {numeric_feature_column: numeric_feature_column._transform_feature(features)}
    )
)
# Vocabulary column over the string feature "number_str"; the vocabulary is
# the decimal strings "0" through "99".
number_vocabulary = list(map(str, range(100)))
categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(
    "number_str", number_vocabulary
)
print(categorical_column._transform_feature(features))

# Hash-bucket column over the "keywords" strings, using 10000 buckets.
keywords = tf.feature_column.categorical_column_with_hash_bucket("keywords", 10000)
print(keywords._transform_feature(features))

# Indicator column wrapping the vocabulary column; it is fed the vocabulary
# column's transformed output, keyed by the column object.
indicator_column = tf.feature_column.indicator_column(categorical_column)
number_str_ids = categorical_column._transform_feature(features)
print(indicator_column._transform_feature({categorical_column: number_str_ids}))
# Bucketize latitude using 99 evenly spaced boundaries.
latitude_buckets = list(np.linspace(33.641336, 33.887157, 99))
la_fc = tf.feature_column.numeric_column("latitude")
latitude_fc = tf.feature_column.bucketized_column(la_fc, latitude_buckets)

# Do the same bucketization for longitude as done for latitude.
longitude_buckets = list(np.linspace(-84.558798, -84.287259, 99))
lo_fc = tf.feature_column.numeric_column("longitude")
longitude_fc = tf.feature_column.bucketized_column(lo_fc, longitude_buckets)

# Create a feature cross of latitude_fc x longitude_fc with 10000 hash buckets.
# NOTE(review): nothing visible here ties these bounds to San Francisco
# despite the variable name -- confirm the intended region.
fc_san_francisco_boxed = tf.feature_column.crossed_column(
    keys=[latitude_fc, longitude_fc], hash_bucket_size=10000
)


def _bucketize_latitude():
    """Return the bucketized "latitude" feature, recomputed on every call."""
    return latitude_fc._transform_feature({la_fc: la_fc._transform_feature(features)})


def _bucketize_longitude():
    """Return the bucketized "longitude" feature, recomputed on every call."""
    return longitude_fc._transform_feature({lo_fc: lo_fc._transform_feature(features)})


# The crossed column reads each bucketized input from a dict keyed by the
# bucketized column objects.
print(
    fc_san_francisco_boxed._transform_feature(
        {
            latitude_fc: _bucketize_latitude(),
            longitude_fc: _bucketize_longitude(),
        }
    )
)
print(_bucketize_latitude())
print(_bucketize_longitude())

# Cross two raw features by key, hashed into 5000 buckets.
crossed_column = tf.feature_column.crossed_column(
    keys=["keywords", "year"], hash_bucket_size=5000
)
print(crossed_column._transform_feature(features))
# 10-dimensional embedding column built on top of the "number_str"
# vocabulary column.
embedding_column = tf.feature_column.embedding_column(
    categorical_column, dimension=10
)

# The embedding column is fed the vocabulary column's transformed output,
# keyed by the column object.
embedding_inputs = {
    categorical_column: categorical_column._transform_feature(features)
}
print(embedding_column._transform_feature(embedding_inputs))
def print_output(feature_column, feature):
    """Print the dense output of ``feature_column`` applied to ``feature``.

    Args:
        feature_column: Column (or columns) handed to ``layers.DenseFeatures``.
        feature: Input passed to the resulting layer when it is called.
    """
    feature_layer = layers.DenseFeatures(feature_column)
    # .numpy() converts the layer output to a NumPy array before printing.
    print(feature_layer(feature).numpy())
# Run the embedding column through a DenseFeatures layer and print the result.
print_output(feature_column=embedding_column, feature=features)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment