Bryan Cutler (BryanCutler)
@BryanCutler
BryanCutler / tep_extending_pandas_blog1_1.ipynb
Last active May 3, 2021 16:57
Text Extensions for Pandas: Tips and Techniques for Extending Pandas, Part 1 Blog
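The notebook itself isn't rendered here. As a taste of the kind of technique the post covers, below is a minimal sketch of one standard pandas extension mechanism, the registered DataFrame accessor; the accessor name and methods are hypothetical illustrations, not taken from the notebook.

import pandas as pd

# Hypothetical illustration: register a custom ".stats" accessor on DataFrame
@pd.api.extensions.register_dataframe_accessor("stats")
class StatsAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def zscore(self):
        # Standardize each numeric column to zero mean and unit variance
        num = self._obj.select_dtypes("number")
        return (num - num.mean()) / num.std()

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
print(df.stats.zscore())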
@BryanCutler
BryanCutler / PySpark_createDataFrame_with_Arrow.ipynb
Last active September 16, 2020 02:30
How to create a Spark DataFrame from Pandas or NumPy with Arrow
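This notebook can't be rendered here either. In outline, the gist's topic looks like the minimal sketch below, assuming Spark 2.3+, where Arrow-accelerated conversion is controlled by the spark.sql.execution.arrow.enabled flag (renamed spark.sql.execution.arrow.pyspark.enabled in Spark 3.x).

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow-demo").getOrCreate()

# Enable Arrow-based columnar transfer for Pandas <-> Spark conversions
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Build a Pandas DataFrame from NumPy data, then hand it to Spark;
# with Arrow enabled the conversion avoids per-row serialization
pdf = pd.DataFrame(np.random.rand(100, 3), columns=['a', 'b', 'c'])
df = spark.createDataFrame(pdf)
df.show(5)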
@BryanCutler
BryanCutler / tf_arrow_blog_pt7.py
Last active February 25, 2020 18:24
TensorFlow Arrow Blog Part 7 - Model Training Local Dataset
import tensorflow as tf
import tensorflow_io.arrow as arrow_io

def make_local_dataset(filename):
    """Make a TensorFlow Arrow Dataset that reads from a local CSV file."""
    # Read the local file and get a record batch iterator
    batch_iter = read_and_process(filename)
    # Create the Arrow Dataset as a stream from the local iterator of record batches
    ds = arrow_io.ArrowStreamDataset.from_record_batches(
        batch_iter,
        output_types=(tf.int64, tf.float64, tf.float64),
        batch_mode='auto')
    return ds
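A quick usage sketch; the filename train.csv is hypothetical and assumed to hold the (label, x0, x1) columns from the Part 1 sample data.

# Hypothetical CSV with the (label, x0, x1) schema
ds = make_local_dataset('train.csv')

# Each element is a tuple of batched tensors: (label, x0, x1)
for label, x0, x1 in ds.take(1):
    print(label.shape, x0.dtype, x1.dtype)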
@BryanCutler
BryanCutler / tf_arrow_blog_p11.py
Last active February 25, 2020 18:23
TensorFlow Arrow Blog Part 11 - Model Training Remote Dataset
import tensorflow as tf
import tensorflow_io.arrow as arrow_io

def make_remote_dataset(endpoint):
    """Make a TensorFlow Arrow Dataset that reads from a remote Arrow stream."""
    # Create the Arrow Dataset from a remote host serving a stream
    ds = arrow_io.ArrowStreamDataset(
        [endpoint],
        columns=(0, 1, 2),
        output_types=(tf.int64, tf.float64, tf.float64),
        output_shapes=(tf.TensorShape([]), tf.TensorShape([]), tf.TensorShape([])),
        batch_mode='auto')
    return ds
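Usage mirrors the local case; the endpoint below is hypothetical and assumes another process is serving the Arrow record batch stream at that host:port.

# Hypothetical endpoint serving (label, x0, x1) record batches
ds = make_remote_dataset('10.0.0.2:8888')
for label, x0, x1 in ds.take(1):
    print(label.numpy(), x0.numpy(), x1.numpy())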
@BryanCutler
BryanCutler / tf_arrow_blog_pt6.py
Last active August 5, 2019 19:36
TensorFlow Arrow Blog Part 6 - Read and Process CSV File
import pyarrow.csv

def read_and_process(filename):
    """Read the given CSV file and yield processed Arrow batches."""
    # Read a CSV file into an Arrow Table with threading enabled and
    # set block_size in bytes to break the file into chunks for granularity,
    # which determines the number of batches in the resulting pyarrow.Table
    opts = pyarrow.csv.ReadOptions(use_threads=True, block_size=4096)
    table = pyarrow.csv.read_csv(filename, opts)
    # Fit the feature transform (the gist is truncated at this step), then
    # yield each record batch from the table for downstream consumption
    for batch in table.to_batches():
        yield batch
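A small sanity check of the generator, again assuming a hypothetical train.csv with the sample schema:

batch_iter = read_and_process('train.csv')  # hypothetical file
first = next(batch_iter)                    # a pyarrow.RecordBatch
print(first.schema)
print(first.num_rows)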
@BryanCutler
BryanCutler / tf_arrow_blog_pt2.py
Last active August 5, 2019 19:34
TensorFlow Arrow Blog Part 2 - ArrowDataset
import tensorflow_io.arrow as arrow_io
ds = arrow_io.ArrowDataset.from_pandas(
    df,
    batch_size=2,
    preserve_index=False)

# Make an iterator to the dataset
ds_iter = iter(ds)
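With eager execution in TensorFlow 2.x, the iterator yields tuples of batched column tensors; continuing the snippet:

# Pull the first batch; columns come back as a (label, x0, x1) tuple
label, x0, x1 = next(ds_iter)
print(label.numpy(), x0.numpy(), x1.numpy())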
@BryanCutler
BryanCutler / tf_arrow_blog_pt1.py
Last active August 5, 2019 19:15
TensorFlow Arrow Blog Part 1 - Create Sample DataFrame
import numpy as np
import pandas as pd

# Two Gaussian features shifted by 5 * label, so the two classes are
# well separated for the logistic regression examples that follow
data = {'label': np.random.binomial(1, 0.5, 10)}
data['x0'] = np.random.randn(10) + 5 * data['label']
data['x1'] = np.random.randn(10) + 5 * data['label']
df = pd.DataFrame(data)
print(df.head())
@BryanCutler
BryanCutler / tf_arrow_blog_pt3.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 3 - ArrowFeatherDataset
import tensorflow as tf
import tensorflow_io.arrow as arrow_io
from pyarrow.feather import write_feather

# Write the Pandas DataFrame to a Feather file
write_feather(df, '/path/to/df.feather')

# Create the dataset with one or more filenames
ds = arrow_io.ArrowFeatherDataset(
    ['/path/to/df.feather'],
    columns=(0, 1, 2),
    output_types=(tf.int64, tf.float64, tf.float64),
    output_shapes=(tf.TensorShape([]), tf.TensorShape([]), tf.TensorShape([])))
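The Feather-backed dataset then iterates like any tf.data.Dataset; a minimal sketch:

# Each element is a (label, x0, x1) tuple of scalar tensors, one row
# at a time, since no batching was requested
for label, x0, x1 in ds:
    print(label.numpy(), x0.numpy(), x1.numpy())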
@BryanCutler
BryanCutler / tf_arrow_blog_pt4.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 4 - ArrowStreamDataset
import tensorflow_io.arrow as arrow_io
ds = arrow_io.ArrowStreamDataset.from_pandas(
    df,
    batch_size=2,
    preserve_index=False)
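Compared with the plain ArrowDataset in Part 2, the stream variant is designed to feed the DataFrame to TensorFlow chunk by chunk as Arrow record batches, which keeps memory overhead lower for large frames; iteration is unchanged:

# Batches of 2 rows, matching batch_size above
for label, x0, x1 in ds:
    print(label.shape)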
@BryanCutler
BryanCutler / tf_arrow_blog_pt5.py
Last active August 5, 2019 17:37
TensorFlow Arrow Blog Part 5 - Model Definition
import tensorflow as tf

def model_fit(ds):
    """Create and fit a Keras logistic regression model."""
    # Build the Keras model
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(1, input_shape=(2,),
                                    activation='sigmoid'))
    model.compile(optimizer='sgd', loss='mean_squared_error',
                  metrics=['accuracy'])
    # Adapt the (label, x0, x1) tuples from the Arrow datasets above into
    # the (features, label) structure Keras expects, then fit and return
    ds = ds.map(lambda label, x0, x1: (tf.stack([x0, x1], axis=1), label))
    model.fit(ds)
    return model
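Tying the pieces together, a hypothetical end-to-end run over the local CSV-backed dataset from Part 7:

# Train on the local dataset; 'train.csv' is hypothetical
model = model_fit(make_local_dataset('train.csv'))
print(model.get_weights())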