Bryan Cutler (BryanCutler)
import io.netty.buffer.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.file.ArrowWriter;
import org.apache.arrow.vector.schema.ArrowFieldNode;
import org.apache.arrow.vector.schema.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.Field;
BryanCutler / pandas_rdd.py
Last active March 14, 2018 05:47
Vectorized UDFs in Python SPARK-21190
class DataFrame(object):
    ...
    def asPandas(self):
        return ArrowDataFrame(self)

class ArrowDataFrame(object):
    """
    Wraps a Python DataFrame to group/window then apply using ``pandas.DataFrame``
    """
@BryanCutler
BryanCutler / PySpark_to_Pandas_with_Arrow.ipynb
Last active January 24, 2019 11:12
Spark to Pandas Conversion with Arrow Example
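The notebook itself is not rendered on this page; a hedged sketch of the conversion it demonstrates, using the Spark 2.3/2.4-era Arrow configuration flag, is:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

sdf = spark.range(1 << 20)
pdf = sdf.toPandas()  # uses Arrow for the JVM-to-Python transfer when enabled
print(type(pdf))      # pandas.core.frame.DataFrame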
BryanCutler / tf_arrow_blog_pt10.py
Last active August 5, 2019 17:36
TensorFlow Arrow Blog Part 10 - Serve CSV Data
import socket

def serve_csv_data(ip_addr, port_num, directory):
    """
    Create a socket and serve Arrow record batches as a stream read from the
    given directory containing CSV files.
    """
    # Create the socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((ip_addr, port_num))
    sock.listen(1)
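    # (Preview truncated here; a hedged sketch, not the original code, of how
    #  the body might continue: accept a connection and stream batches from
    #  read_and_process_dir (part 9, below) over the socket as Arrow IPC.
    #  Assumes pyarrow is imported as pa at module level.)
    conn, _ = sock.accept()
    outfile = conn.makefile(mode='wb')
    writer = None
    for batch in read_and_process_dir(directory):
        if writer is None:
            writer = pa.RecordBatchStreamWriter(outfile, batch.schema)
        writer.write_batch(batch)
    if writer is not None:
        writer.close()
    outfile.close()
    conn.close()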
BryanCutler / tf_arrow_blog_pt9.py
Last active August 5, 2019 17:36
TensorFlow Arrow Blog Part 9 - Read and Process Directory
import os

def read_and_process_dir(directory):
    """Read a directory of CSV files and yield processed Arrow batches."""
    for f in os.listdir(directory):
        if f.endswith(".csv"):
            filename = os.path.join(directory, f)
            for batch in read_and_process(filename):
                yield batch
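The read_and_process helper is defined in an earlier part of the series and is not shown on this page; a rough stand-in (an assumption, not the blog's code) that reads a CSV into Arrow record batches with pyarrow could be:

from pyarrow import csv

def read_and_process(filename):
    # Read the CSV into an Arrow Table, then yield it batch by batch
    table = csv.read_csv(filename)
    for batch in table.to_batches():
        yield batch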
BryanCutler / tf_arrow_blog_pt8.py
Last active August 5, 2019 17:36
TensorFlow Arrow Blog Part 8 - Run Training Local
ds = make_local_dataset(filename)
model = model_fit(ds)
print("Fit model with weights: {}".format(model.get_weights()))
# Fit model with weights:
# [array([[0.7793554 ], [0.61216295]], dtype=float32),
# array([0.03328196], dtype=float32)]
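The make_local_dataset helper comes from an earlier part of the series and is not shown on this page; a hedged stand-in (an assumption, not the original) could read the CSV with pandas and convert it using the same from_pandas call shown in part 4 below:

import pandas as pd
import tensorflow_io.arrow as arrow_io

def make_local_dataset(filename):
    # Load the CSV locally, then batch it as an Arrow stream dataset
    df = pd.read_csv(filename)
    return arrow_io.ArrowStreamDataset.from_pandas(
        df, batch_size=2, preserve_index=False)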
BryanCutler / tf_arrow_blog_pt5.py
Last active August 5, 2019 17:37
TensorFlow Arrow Blog Part 5 - Model Definition
import tensorflow as tf

def model_fit(ds):
    """Create and fit a Keras logistic regression model."""
    # Build the Keras model
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(1, input_shape=(2,),
                                    activation='sigmoid'))
    model.compile(optimizer='sgd', loss='mean_squared_error',
                  metrics=['accuracy'])
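    # (Preview truncated here; from the call in part 8 above, the function
    #  presumably fits on the dataset and returns the trained model. The
    #  epoch count below is an assumption.)
    model.fit(ds, epochs=10)
    return model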
BryanCutler / tf_arrow_blog_pt4.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 4 - ArrowStreamDataset
import tensorflow_io.arrow as arrow_io

ds = arrow_io.ArrowStreamDataset.from_pandas(
    df,
    batch_size=2,
    preserve_index=False)
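A quick way to inspect what the stream dataset yields (assuming eager execution and the sample DataFrame from part 1, shown further down):

for element in ds.take(1):
    # Each element is a tuple of batched tensors, one per DataFrame column
    print(element)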
BryanCutler / tf_arrow_blog_pt3.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 3 - ArrowFeatherDataset
import tensorflow_io.arrow as arrow_io
from pyarrow.feather import write_feather

# Write the Pandas DataFrame to a Feather file
write_feather(df, '/path/to/df.feather')

# Create the dataset with one or more filenames
ds = arrow_io.ArrowFeatherDataset(
    ['/path/to/df.feather'],
    columns=(0, 1, 2),
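    # (Preview truncated here; the constructor also takes output_types for the
    #  selected columns. The dtypes below are an assumption matching the sample
    #  DataFrame from part 1, with tf being tensorflow.)
    output_types=(tf.int64, tf.float64, tf.float64))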
BryanCutler / tf_arrow_blog_pt1.py
Last active August 5, 2019 19:15
TensorFlow Arrow Blog Part 1 - Create Sample DataFrame
import numpy as np
import pandas as pd
data = {'label': np.random.binomial(1, 0.5, 10)}
data['x0'] = np.random.randn(10) + 5 * data['label']
data['x1'] = np.random.randn(10) + 5 * data['label']
df = pd.DataFrame(data)
print(df.head())
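Not part of the gist, but a quick sanity check that this frame converts cleanly to Arrow, since the later parts stream it as Arrow record batches:

import pyarrow as pa

table = pa.Table.from_pandas(df, preserve_index=False)
print(table.schema)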