Bryan Cutler (BryanCutler)
@BryanCutler
BryanCutler / tep_extending_pandas_blog1_1.ipynb
Last active May 3, 2021 16:57
Text Extensions for Pandas: Tips and Techniques for Extending Pandas, Part 1 Blog
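The notebook itself isn't rendered here. As a taste of the kind of technique the post covers, below is a minimal sketch of one standard pandas extension mechanism, the registered DataFrame accessor; the accessor name and methods are hypothetical illustrations, not taken from the notebook.

import pandas as pd

# Hypothetical illustration: register a custom ".stats" accessor on DataFrame
@pd.api.extensions.register_dataframe_accessor("stats")
class StatsAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def zscore(self):
        # Standardize each numeric column to zero mean and unit variance
        num = self._obj.select_dtypes("number")
        return (num - num.mean()) / num.std()

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
print(df.stats.zscore())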
@BryanCutler
BryanCutler / PySpark_createDataFrame_with_Arrow.ipynb
Last active September 16, 2020 02:30
How to create a Spark DataFrame from Pandas or NumPy with Arrow
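This notebook can't be rendered here either. In outline, the gist's topic looks like the minimal sketch below, assuming Spark 2.3+, where Arrow-accelerated conversion is controlled by the spark.sql.execution.arrow.enabled flag (renamed spark.sql.execution.arrow.pyspark.enabled in Spark 3.x).

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow-demo").getOrCreate()

# Enable Arrow-based columnar transfer for Pandas <-> Spark conversions
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Build a Pandas DataFrame from NumPy data, then hand it to Spark;
# with Arrow enabled the conversion avoids per-row serialization
pdf = pd.DataFrame(np.random.rand(100, 3), columns=['a', 'b', 'c'])
df = spark.createDataFrame(pdf)
df.show(5)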
@BryanCutler
BryanCutler / tf_arrow_blog_pt7.py
Last active February 25, 2020 18:24
TensorFlow Arrow Blog Part 7 - Model Training Local Dataset
import tensorflow as tf
import tensorflow_io.arrow as arrow_io

def make_local_dataset(filename):
    """Make a TensorFlow Arrow Dataset that reads from a local CSV file."""
    # Read the local file and get a record batch iterator
    batch_iter = read_and_process(filename)
    # Create the Arrow Dataset as a stream from the local iterator of record batches
    ds = arrow_io.ArrowStreamDataset.from_record_batches(
        batch_iter,
        output_types=(tf.int64, tf.float64, tf.float64),
        batch_mode='auto')
    return ds
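A quick usage sketch; the filename train.csv is hypothetical and assumed to hold the (label, x0, x1) columns from the Part 1 sample data.

# Hypothetical CSV with the (label, x0, x1) schema
ds = make_local_dataset('train.csv')

# Each element is a tuple of batched tensors: (label, x0, x1)
for label, x0, x1 in ds.take(1):
    print(label.shape, x0.dtype, x1.dtype)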
@BryanCutler
BryanCutler / tf_arrow_blog_p11.py
Last active February 25, 2020 18:23
TensorFlow Arrow Blog Part 11 - Model Training Remote Dataset
import tensorflow as tf
import tensorflow_io.arrow as arrow_io

def make_remote_dataset(endpoint):
    """Make a TensorFlow Arrow Dataset that reads from a remote Arrow stream."""
    # Create the Arrow Dataset from a remote host serving a stream
    ds = arrow_io.ArrowStreamDataset(
        [endpoint],
        columns=(0, 1, 2),
        output_types=(tf.int64, tf.float64, tf.float64),
        output_shapes=(tf.TensorShape([]), tf.TensorShape([]), tf.TensorShape([])),
        batch_mode='auto')
    return ds
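Usage mirrors the local case; the endpoint below is hypothetical and assumes another process is serving the Arrow record batch stream at that host:port.

# Hypothetical endpoint serving (label, x0, x1) record batches
ds = make_remote_dataset('10.0.0.2:8888')
for label, x0, x1 in ds.take(1):
    print(label.numpy(), x0.numpy(), x1.numpy())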
@BryanCutler
BryanCutler / tf_arrow_blog_pt6.py
Last active August 5, 2019 19:36
TensorFlow Arrow Blog Part 6 - Read and Process CSV File
import pyarrow.csv

def read_and_process(filename):
    """Read the given CSV file and yield processed Arrow batches."""
    # Read a CSV file into an Arrow Table with threading enabled and
    # set block_size in bytes to break the file into chunks for granularity,
    # which determines the number of batches in the resulting pyarrow.Table
    opts = pyarrow.csv.ReadOptions(use_threads=True, block_size=4096)
    table = pyarrow.csv.read_csv(filename, opts)
    # Fit the feature transform (the gist is truncated at this step), then
    # yield each record batch from the table for downstream consumption
    for batch in table.to_batches():
        yield batch
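A small sanity check of the generator, again assuming a hypothetical train.csv with the sample schema:

batch_iter = read_and_process('train.csv')  # hypothetical file
first = next(batch_iter)                    # a pyarrow.RecordBatch
print(first.schema)
print(first.num_rows)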
@BryanCutler
BryanCutler / tf_arrow_blog_pt2.py
Last active August 5, 2019 19:34
TensorFlow Arrow Blog Part 2 - ArrowDataset
import tensorflow_io.arrow as arrow_io
ds = arrow_io.ArrowDataset.from_pandas(
    df,
    batch_size=2,
    preserve_index=False)

# Make an iterator to the dataset
ds_iter = iter(ds)
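With eager execution in TensorFlow 2.x, the iterator yields tuples of batched column tensors; continuing the snippet:

# Pull the first batch; columns come back as a (label, x0, x1) tuple
label, x0, x1 = next(ds_iter)
print(label.numpy(), x0.numpy(), x1.numpy())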
@BryanCutler
BryanCutler / tf_arrow_blog_pt1.py
Last active August 5, 2019 19:15
TensorFlow Arrow Blog Part 1 - Create Sample DataFrame
import numpy as np
import pandas as pd

# Two Gaussian features shifted by 5 * label, so the two classes are
# well separated for the logistic regression examples that follow
data = {'label': np.random.binomial(1, 0.5, 10)}
data['x0'] = np.random.randn(10) + 5 * data['label']
data['x1'] = np.random.randn(10) + 5 * data['label']
df = pd.DataFrame(data)
print(df.head())
@BryanCutler
BryanCutler / tf_arrow_blog_pt3.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 3 - ArrowFeatherDataset
import tensorflow as tf
import tensorflow_io.arrow as arrow_io
from pyarrow.feather import write_feather

# Write the Pandas DataFrame to a Feather file
write_feather(df, '/path/to/df.feather')

# Create the dataset with one or more filenames
ds = arrow_io.ArrowFeatherDataset(
    ['/path/to/df.feather'],
    columns=(0, 1, 2),
    output_types=(tf.int64, tf.float64, tf.float64),
    output_shapes=(tf.TensorShape([]), tf.TensorShape([]), tf.TensorShape([])))
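The Feather-backed dataset then iterates like any tf.data.Dataset; a minimal sketch:

# Each element is a (label, x0, x1) tuple of scalar tensors, one row
# at a time, since no batching was requested
for label, x0, x1 in ds:
    print(label.numpy(), x0.numpy(), x1.numpy())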
@BryanCutler
BryanCutler / tf_arrow_blog_pt4.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 4 - ArrowStreamDataset
import tensorflow_io.arrow as arrow_io
ds = arrow_io.ArrowStreamDataset.from_pandas(
    df,
    batch_size=2,
    preserve_index=False)
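Compared with the plain ArrowDataset in Part 2, the stream variant is designed to feed the DataFrame to TensorFlow chunk by chunk as Arrow record batches, which keeps memory overhead lower for large frames; iteration is unchanged:

# Batches of 2 rows, matching batch_size above
for label, x0, x1 in ds:
    print(label.shape)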
@BryanCutler
BryanCutler / tf_arrow_blog_pt5.py
Last active August 5, 2019 17:37
TensorFlow Arrow Blog Part 5 - Model Definition
import tensorflow as tf

def model_fit(ds):
    """Create and fit a Keras logistic regression model."""
    # Build the Keras model
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(1, input_shape=(2,),
                                    activation='sigmoid'))
    model.compile(optimizer='sgd', loss='mean_squared_error',
                  metrics=['accuracy'])
    # Adapt the (label, x0, x1) tuples from the Arrow datasets above into
    # the (features, label) structure Keras expects, then fit and return
    ds = ds.map(lambda label, x0, x1: (tf.stack([x0, x1], axis=1), label))
    model.fit(ds)
    return model
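Tying the pieces together, a hypothetical end-to-end run over the local CSV-backed dataset from Part 7:

# Train on the local dataset; 'train.csv' is hypothetical
model = model_fit(make_local_dataset('train.csv'))
print(model.get_weights())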