A simple example of how to run PySpark locally with Iceberg. There is no need to spin up Docker containers or install additional packages (besides PySpark). The script shows how to create a Spark session with Iceberg support, create a sample DataFrame, and save it as an Iceberg table.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("IcebergExample")
    # Pull the Iceberg runtime jar from Maven Central at startup
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1")
    # Enable Iceberg's SQL extensions
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Use the built-in session catalog, backed by a Hadoop (filesystem) warehouse
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hadoop")
    .config("spark.sql.catalog.spark_catalog.warehouse", "spark-warehouse")
    .getOrCreate()
)
data = [
(1, "Alice", 29),
(2, "Bob", 31),
(3, "Cathy", 25)
]
columns = ["id", "name", "age"]
# Create a sample DataFrame and save it as an Iceberg table
(
    spark.createDataFrame(data, columns)
    .writeTo("default.original_ice")
    .using("iceberg")
    .createOrReplace()
)
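
To check that the write worked, you can read the table back and look at its commit history through Iceberg's metadata tables. A minimal sketch, assuming the same SparkSession is still active:

# Read the Iceberg table back into a DataFrame and print its rows
spark.table("default.original_ice").show()

# Iceberg exposes metadata tables alongside the data; `snapshots` lists every commit
spark.sql("SELECT snapshot_id, committed_at, operation FROM default.original_ice.snapshots").show()

After the script runs, the table's data and metadata files live under the local spark-warehouse directory configured above.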