ianmcook/ibis_spark_pgsql.py

## ibis_spark_pgsql.py
import pandas as pd
import pyarrow as pa
import ibis
from pyspark.sql import SparkSession

# create example data in a pandas DataFrame
df = pd.DataFrame(data={'fruit': ['apple', 'apple', 'apple', 'orange', 'orange', 'orange'],
                        'variety': ['gala', 'honeycrisp', 'fuji', 'navel', 'valencia', 'cara cara'],
                        'weight': [134.2 , 158.6, None, 142.1, 96.7, None]})

# create an Ibis schema for the example data
sc = ibis.schema(names=['fruit', 'variety', 'weight'], types=['string', 'string', 'double'])

# start a Spark session and connect Ibis to it
spark = SparkSession.builder.getOrCreate()
con_spark = ibis.pyspark.connect(spark)

# load the example data into a Spark table
con_spark.create_table('fruit', df, schema=sc)

# connect Ibis to a PostgreSQL database
con_pgsql = ibis.connect('postgresql://localhost/ian')

# create an empty PostgreSQL table with the same schema as the Spark table
con_pgsql.create_table('fruit', schema=sc)

# load the data from the Spark table into the PostgreSQL table

# ideally this would work, but it does not yet
# (https://github.com/ibis-project/ibis/issues/8115)
#con_pgsql.insert('fruit', con_spark.tables.fruit)

# second best would be if this works, but it does not yet
# (https://github.com/ibis-project/ibis/issues/8133)
#for batch in con_spark.tables.fruit.to_pyarrow_batches():
#  con_pgsql.insert('fruit', batch)

# this works
for batch in con_spark.tables.fruit.to_pandas_batches():
  con_pgsql.insert('fruit', batch)
	import pandas as pd
	import pyarrow as pa
	import ibis
	from pyspark.sql import SparkSession

	# create example data in a pandas DataFrame
	df = pd.DataFrame(data={'fruit': ['apple', 'apple', 'apple', 'orange', 'orange', 'orange'],
	'variety': ['gala', 'honeycrisp', 'fuji', 'navel', 'valencia', 'cara cara'],
	'weight': [134.2 , 158.6, None, 142.1, 96.7, None]})

	# create an Ibis schema for the example data
	sc = ibis.schema(names=['fruit', 'variety', 'weight'], types=['string', 'string', 'double'])

	# start a Spark session and connect Ibis to it
	spark = SparkSession.builder.getOrCreate()
	con_spark = ibis.pyspark.connect(spark)

	# load the example data into a Spark table
	con_spark.create_table('fruit', df, schema=sc)

	# connect Ibis to a PostgreSQL database
	con_pgsql = ibis.connect('postgresql://localhost/ian')

	# create an empty PostgreSQL table with the same schema as the Spark table
	con_pgsql.create_table('fruit', schema=sc)

	# load the data from the Spark table into the PostgreSQL table

	# ideally this would work, but it does not yet
	# (https://github.com/ibis-project/ibis/issues/8115)
	#con_pgsql.insert('fruit', con_spark.tables.fruit)

	# second best would be if this works, but it does not yet
	# (https://github.com/ibis-project/ibis/issues/8133)
	#for batch in con_spark.tables.fruit.to_pyarrow_batches():
	# con_pgsql.insert('fruit', batch)

	# this works
	for batch in con_spark.tables.fruit.to_pandas_batches():
	con_pgsql.insert('fruit', batch)