Skip to content

Instantly share code, notes, and snippets.

View BryanCutler's full-sized avatar

Bryan Cutler BryanCutler

View GitHub Profile
@BryanCutler
BryanCutler / tf_arrow_blog_pt9.py
Last active August 5, 2019 17:36
TensorFlow Arrow Blog Part 9 - Read and Process Directory
def read_and_process_dir(directory):
    """Read a directory of CSV files and yield processed Arrow batches.

    Entries of ``directory`` whose names end in ".csv" are visited in sorted
    name order, so the sequence of yielded batches is reproducible across
    runs (``os.listdir`` on its own returns entries in arbitrary order).
    The scan is not recursive: only direct entries of ``directory`` are
    considered.

    Args:
        directory: Path of the directory to scan for ".csv" files.

    Yields:
        Every batch produced by ``read_and_process`` for each matching file,
        one file after another.
    """
    # Sort the listing so the file order (and thus batch order) is stable
    for f in sorted(os.listdir(directory)):
        if f.endswith(".csv"):
            filename = os.path.join(directory, f)
            for batch in read_and_process(filename):
                yield batch
@BryanCutler
BryanCutler / tf_arrow_blog_pt8.py
Last active August 5, 2019 17:36
TensorFlow Arrow Blog Part 8 - Run Training Local
# Build the dataset from the local file, then fit the model on it
ds = make_local_dataset(filename)
model = model_fit(ds)
# Pull the fitted weights out of the model and report them
weights = model.get_weights()
print("Fit model with weights: {}".format(weights))
# Example output:
# Fit model with weights:
# [array([[0.7793554 ], [0.61216295]], dtype=float32),
# array([0.03328196], dtype=float32)]
@BryanCutler
BryanCutler / tf_arrow_blog_pt7.py
Last active February 25, 2020 18:24
TensorFlow Arrow Blog Part 7 - Model Training Local Dataset
def make_local_dataset(filename):
"""Make a TensorFlow Arrow Dataset that reads from a local CSV file."""
# Read the local file and get a record batch iterator
batch_iter = read_and_process(filename)
# Create the Arrow Dataset as a stream from local iterator of record batches
ds = arrow_io.ArrowStreamDataset.from_record_batches(
batch_iter,
output_types=(tf.int64, tf.float64, tf.float64),
@BryanCutler
BryanCutler / tf_arrow_blog_pt6.py
Last active August 5, 2019 19:36
TensorFlow Arrow Blog Part 6 - Read and Process CSV File
def read_and_process(filename):
"""Read the given CSV file and yield processed Arrow batches."""
# Read a CSV file into an Arrow Table with threading enabled and
# set block_size in bytes to break the file into chunks for granularity,
# which determines the number of batches in the resulting pyarrow.Table
opts = pyarrow.csv.ReadOptions(use_threads=True, block_size=4096)
table = pyarrow.csv.read_csv(filename, opts)
# Fit the feature transform
@BryanCutler
BryanCutler / tf_arrow_blog_pt5.py
Last active August 5, 2019 17:37
TensorFlow Arrow Blog Part 5 - Model Definition
def model_fit(ds):
"""Create and fit a Keras logistic regression model."""
# Build the Keras model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(1, input_shape=(2,),
activation='sigmoid'))
model.compile(optimizer='sgd', loss='mean_squared_error',
metrics=['accuracy'])
@BryanCutler
BryanCutler / tf_arrow_blog_pt4.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 4 - ArrowStreamDataset
import tensorflow_io.arrow as arrow_io
# Create an ArrowStreamDataset directly from the pandas DataFrame `df`
ds = arrow_io.ArrowStreamDataset.from_pandas(df, batch_size=2, preserve_index=False)
@BryanCutler
BryanCutler / tf_arrow_blog_pt3.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 3 - ArrowFeatherDataset
import tensorflow_io.arrow as arrow_io
from pyarrow.feather import write_feather
# Write the Pandas DataFrame to a Feather file
write_feather(df, '/path/to/df.feather')
# Create the dataset with one or more filenames
ds = arrow_io.ArrowFeatherDataset(
['/path/to/df.feather'],
columns=(0, 1, 2),
@BryanCutler
BryanCutler / tf_arrow_blog_pt2.py
Last active August 5, 2019 19:34
TensorFlow Arrow Blog Part 2 - ArrowDataset
import tensorflow_io.arrow as arrow_io
# Create an ArrowDataset directly from the pandas DataFrame `df`
ds = arrow_io.ArrowDataset.from_pandas(df, batch_size=2, preserve_index=False)
# Obtain an iterator over the dataset
ds_iter = iter(ds)
@BryanCutler
BryanCutler / tf_arrow_blog_pt1.py
Last active August 5, 2019 19:15
TensorFlow Arrow Blog Part 1 - Create Sample DataFrame
import numpy as np
import pandas as pd
# Draw 10 binary labels first; both feature columns are standard-normal
# noise shifted by 5 wherever the label is 1.
labels = np.random.binomial(1, 0.5, 10)
data = {
    'label': labels,
    'x0': np.random.randn(10) + 5 * labels,
    'x1': np.random.randn(10) + 5 * labels,
}
# Assemble the columns into a DataFrame and show its first rows
df = pd.DataFrame(data)
print(df.head())
@BryanCutler
BryanCutler / tf_arrow_model_training.py
Last active June 28, 2021 16:13
TensorFlow Keras Model Training Example with Apache Arrow Dataset
from functools import partial
import multiprocessing
import os
import socket
import sys
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd