Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save thanoojgithub/c6872b67635e60900029b9281ed3858e to your computer and use it in GitHub Desktop.
Save thanoojgithub/c6872b67635e60900029b9281ed3858e to your computer and use it in GitHub Desktop.
Create an empty Parquet file from an existing Parquet file
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
if __name__ == "__main__":
    # Demo script: write a small Parquet dataset, then create a second, empty
    # Parquet dataset that carries the same schema, and read it back to confirm
    # that the schema survives even though there are no rows.
    spark = SparkSession \
        .builder \
        .master('local') \
        .appName('pyspark-test-run') \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    # Always release the session, even if one of the steps below raises.
    try:
        ccData1 = [
            ('A', 'Due to heavy rainfall'),
            ('B', 'Due to heavy snowfall'),
            ('C', 'Due to some technical issue'),
            ('D', 'Due to bad weather'),
            ('D', None),
            ('A', 'Due to heavy rainfall'),
        ]

        # Explicit schema for the two source columns; both are nullable
        # because the sample data contains a None description.
        ccColumns = StructType([
            StructField('CancellationCode', StringType(), True),
            StructField('CancellationDesc', StringType(), True),
        ])

        # Add constant StartDate/EndDate timestamp columns to every row.
        ccDF1 = spark.createDataFrame(data=ccData1, schema=ccColumns) \
            .withColumn("StartDate", to_timestamp(lit("2021-12-31 23:59:59"), 'yyyy-MM-dd HH:mm:ss')) \
            .withColumn("EndDate", to_timestamp(lit("9999-12-31 23:59:59"), 'yyyy-MM-dd HH:mm:ss'))

        # Overwrite so the script can be re-run: the default save mode raises
        # when the target path already exists.
        ccDF1.write.mode("overwrite").parquet("/home/thanooj/dataParquet/data1/data")

        data1 = spark.read.parquet("/home/thanooj/dataParquet/data1/data")
        print("data1 : ")
        data1.printSchema()
        data1.show(truncate=False)

        # Build a zero-row DataFrame that reuses the schema read from data1,
        # so no hand-written copy of the schema is needed.
        rDD = spark.sparkContext.emptyRDD()
        data2 = spark.createDataFrame(data=rDD, schema=data1.schema)
        print("data2 : ")
        data2.printSchema()
        data2.show(truncate=False)
        data2.write.mode("overwrite").parquet("/home/thanooj/dataParquet/data2/data")

        # Read the empty dataset back to check what schema was persisted.
        data3 = spark.read.parquet("/home/thanooj/dataParquet/data2/data")
        print("data3 : ")
        data3.printSchema()
        data3.show(truncate=False)
    finally:
        spark.stop()
data1 :
root
|-- CancellationCode: string (nullable = true)
|-- CancellationDesc: string (nullable = true)
|-- StartDate: timestamp (nullable = true)
|-- EndDate: timestamp (nullable = true)
+----------------+---------------------------+-------------------+-------------------+
|CancellationCode|CancellationDesc           |StartDate          |EndDate            |
+----------------+---------------------------+-------------------+-------------------+
|A               |Due to heavy rainfall      |2021-12-31 23:59:59|9999-12-31 23:59:59|
|B               |Due to heavy snowfall      |2021-12-31 23:59:59|9999-12-31 23:59:59|
|C               |Due to some technical issue|2021-12-31 23:59:59|9999-12-31 23:59:59|
|D               |Due to bad weather         |2021-12-31 23:59:59|9999-12-31 23:59:59|
|D               |null                       |2021-12-31 23:59:59|9999-12-31 23:59:59|
|A               |Due to heavy rainfall      |2021-12-31 23:59:59|9999-12-31 23:59:59|
+----------------+---------------------------+-------------------+-------------------+
data2 :
root
|-- CancellationCode: string (nullable = true)
|-- CancellationDesc: string (nullable = true)
|-- StartDate: timestamp (nullable = true)
|-- EndDate: timestamp (nullable = true)
+----------------+----------------+---------+-------+
|CancellationCode|CancellationDesc|StartDate|EndDate|
+----------------+----------------+---------+-------+
+----------------+----------------+---------+-------+
data3 :
root
|-- CancellationCode: string (nullable = true)
|-- CancellationDesc: string (nullable = true)
|-- StartDate: timestamp (nullable = true)
|-- EndDate: timestamp (nullable = true)
+----------------+----------------+---------+-------+
|CancellationCode|CancellationDesc|StartDate|EndDate|
+----------------+----------------+---------+-------+
+----------------+----------------+---------+-------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment