Bryan Cutler (BryanCutler)
import io.netty.buffer.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.file.ArrowWriter;
import org.apache.arrow.vector.schema.ArrowFieldNode;
import org.apache.arrow.vector.schema.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.Field;
BryanCutler / pandas_rdd.py
Last active March 14, 2018 05:47
Vectorized UDFs in Python SPARK-21190
class DataFrame(object):
    ...
    def asPandas(self):
        return ArrowDataFrame(self)

class ArrowDataFrame(object):
    """
    Wraps a Python DataFrame to group/window then apply using ``pandas.DataFrame``
    """
@BryanCutler
BryanCutler / PySpark_to_Pandas_with_Arrow.ipynb
Last active January 24, 2019 11:12
Spark to Pandas Conversion with Arrow Example
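The notebook itself is not rendered on this page; a hedged sketch of the conversion it demonstrates, using the Spark 2.3/2.4-era Arrow configuration flag, is:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

sdf = spark.range(1 << 20)
pdf = sdf.toPandas()  # uses Arrow for the JVM-to-Python transfer when enabled
print(type(pdf))      # pandas.core.frame.DataFrame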
BryanCutler / tf_arrow_blog_pt10.py
Last active August 5, 2019 17:36
TensorFlow Arrow Blog Part 10 - Serve CSV Data
import socket

def serve_csv_data(ip_addr, port_num, directory):
    """
    Create a socket and serve Arrow record batches as a stream read from the
    given directory containing CSV files.
    """
    # Create the socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((ip_addr, port_num))
    sock.listen(1)
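    # (Preview truncated here; a hedged sketch, not the original code, of how
    #  the body might continue: accept a connection and stream batches from
    #  read_and_process_dir (part 9, below) over the socket as Arrow IPC.
    #  Assumes pyarrow is imported as pa at module level.)
    conn, _ = sock.accept()
    outfile = conn.makefile(mode='wb')
    writer = None
    for batch in read_and_process_dir(directory):
        if writer is None:
            writer = pa.RecordBatchStreamWriter(outfile, batch.schema)
        writer.write_batch(batch)
    if writer is not None:
        writer.close()
    outfile.close()
    conn.close()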
BryanCutler / tf_arrow_blog_pt9.py
Last active August 5, 2019 17:36
TensorFlow Arrow Blog Part 9 - Read and Process Directory
import os

def read_and_process_dir(directory):
    """Read a directory of CSV files and yield processed Arrow batches."""
    for f in os.listdir(directory):
        if f.endswith(".csv"):
            filename = os.path.join(directory, f)
            for batch in read_and_process(filename):
                yield batch
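The read_and_process helper is defined in an earlier part of the series and is not shown on this page; a rough stand-in (an assumption, not the blog's code) that reads a CSV into Arrow record batches with pyarrow could be:

from pyarrow import csv

def read_and_process(filename):
    # Read the CSV into an Arrow Table, then yield it batch by batch
    table = csv.read_csv(filename)
    for batch in table.to_batches():
        yield batch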
BryanCutler / tf_arrow_blog_pt8.py
Last active August 5, 2019 17:36
TensorFlow Arrow Blog Part 8 - Run Training Local
ds = make_local_dataset(filename)
model = model_fit(ds)
print("Fit model with weights: {}".format(model.get_weights()))
# Fit model with weights:
# [array([[0.7793554 ], [0.61216295]], dtype=float32),
# array([0.03328196], dtype=float32)]
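The make_local_dataset helper comes from an earlier part of the series and is not shown on this page; a hedged stand-in (an assumption, not the original) could read the CSV with pandas and convert it using the same from_pandas call shown in part 4 below:

import pandas as pd
import tensorflow_io.arrow as arrow_io

def make_local_dataset(filename):
    # Load the CSV locally, then batch it as an Arrow stream dataset
    df = pd.read_csv(filename)
    return arrow_io.ArrowStreamDataset.from_pandas(
        df, batch_size=2, preserve_index=False)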
BryanCutler / tf_arrow_blog_pt5.py
Last active August 5, 2019 17:37
TensorFlow Arrow Blog Part 5 - Model Definition
import tensorflow as tf

def model_fit(ds):
    """Create and fit a Keras logistic regression model."""
    # Build the Keras model
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(1, input_shape=(2,),
                                    activation='sigmoid'))
    model.compile(optimizer='sgd', loss='mean_squared_error',
                  metrics=['accuracy'])
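    # (Preview truncated here; from the call in part 8 above, the function
    #  presumably fits on the dataset and returns the trained model. The
    #  epoch count below is an assumption.)
    model.fit(ds, epochs=10)
    return model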
BryanCutler / tf_arrow_blog_pt4.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 4 - ArrowStreamDataset
import tensorflow_io.arrow as arrow_io

ds = arrow_io.ArrowStreamDataset.from_pandas(
    df,
    batch_size=2,
    preserve_index=False)
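A quick way to inspect what the stream dataset yields (assuming eager execution and the sample DataFrame from part 1, shown further down):

for element in ds.take(1):
    # Each element is a tuple of batched tensors, one per DataFrame column
    print(element)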
BryanCutler / tf_arrow_blog_pt3.py
Last active August 5, 2019 17:38
TensorFlow Arrow Blog Part 3 - ArrowFeatherDataset
import tensorflow_io.arrow as arrow_io
from pyarrow.feather import write_feather

# Write the Pandas DataFrame to a Feather file
write_feather(df, '/path/to/df.feather')

# Create the dataset with one or more filenames
ds = arrow_io.ArrowFeatherDataset(
    ['/path/to/df.feather'],
    columns=(0, 1, 2),
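    # (Preview truncated here; the constructor also takes output_types for the
    #  selected columns. The dtypes below are an assumption matching the sample
    #  DataFrame from part 1, with tf being tensorflow.)
    output_types=(tf.int64, tf.float64, tf.float64))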
BryanCutler / tf_arrow_blog_pt1.py
Last active August 5, 2019 19:15
TensorFlow Arrow Blog Part 1 - Create Sample DataFrame
import numpy as np
import pandas as pd
data = {'label': np.random.binomial(1, 0.5, 10)}
data['x0'] = np.random.randn(10) + 5 * data['label']
data['x1'] = np.random.randn(10) + 5 * data['label']
df = pd.DataFrame(data)
print(df.head())
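Not part of the gist, but a quick sanity check that this frame converts cleanly to Arrow, since the later parts stream it as Arrow record batches:

import pyarrow as pa

table = pa.Table.from_pandas(df, preserve_index=False)
print(table.schema)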