# BryanCutler/tf_arrow_blog_pt6.py

## tf_arrow_blog_pt6.py
def read_and_process(filename):
  """Stream feature-scaled Arrow record batches read from a CSV file.

  The file is first loaded in full so that a StandardScaler can be fitted on
  the complete 'x0' and 'x1' columns. The loaded table is then walked one
  batch at a time; each batch is yielded with 'x0' and 'x1' passed through
  the fitted scaler and the 'label' column carried over unchanged.

  Args:
    filename: CSV input handed straight to pyarrow.csv.read_csv. The data
      must provide the columns 'label', 'x0' and 'x1'.

  Yields:
    One record batch per batch of the loaded table, holding the columns
    'label', 'x0', 'x1' in that order and no pandas index column.
  """
  feature_cols = ['x0', 'x1']

  # Threaded read; block_size (in bytes) controls how finely the file is
  # chunked, which in turn sets how many batches the resulting table has.
  read_options = pyarrow.csv.ReadOptions(use_threads=True, block_size=4096)
  table = pyarrow.csv.read_csv(filename, read_options)

  # Fit once on the whole table so every batch is scaled by the same
  # fitted transform.
  scaler = StandardScaler().fit(table.to_pandas()[feature_cols])

  for batch in table.to_batches():
    frame = batch.to_pandas()
    scaled = scaler.transform(frame[feature_cols])

    # Reassemble the batch: untouched label plus the two scaled features.
    columns = {
        'label': frame['label'],
        'x0': scaled[:, 0],
        'x1': scaled[:, 1],
    }
    yield pa.RecordBatch.from_pandas(pd.DataFrame(columns),
                                     preserve_index=False)
	def read_and_process(filename):
	"""Read the given CSV file and yield processed Arrow batches."""

	# Read a CSV file into an Arrow Table with threading enabled and
	# set block_size in bytes to break the file into chunks for granularity,
	# which determines the number of batches in the resulting pyarrow.Table
	opts = pyarrow.csv.ReadOptions(use_threads=True, block_size=4096)
	table = pyarrow.csv.read_csv(filename, opts)

	# Fit the feature transform
	df = table.to_pandas()
	scaler = StandardScaler().fit(df[['x0', 'x1']])

	# Iterate over batches in the pyarrow.Table and apply processing
	for batch in table.to_batches():
	df = batch.to_pandas()

	# Process the batch and apply feature transform
	X_scaled = scaler.transform(df[['x0', 'x1']])
	df_scaled = pd.DataFrame({'label': df['label'],
	'x0': X_scaled[:, 0],
	'x1': X_scaled[:, 1]})
	batch_scaled = pa.RecordBatch.from_pandas(df_scaled, preserve_index=False)

	yield batch_scaled