Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save karpanGit/3ce5ed2eb6412a5af4605bcf038e04b3 to your computer and use it in GitHub Desktop.
Save karpanGit/3ce5ed2eb6412a5af4605bcf038e04b3 to your computer and use it in GitHub Desktop.
pyspark, read csv by specifying date and datetime format
# note the FAILFAST mode. It is much preferred to PERMISSIVE to catch errors early
# read dates and datetimes using the default ISO format
# date: yyyy-MM-dd
# datetime: yyyy-MM-ddTHH:mm:ss.SSS
import pyspark.sql.types as T
with open(r'D:/junk/tmp.csv', 'wt') as f:
f.write('1\t2022-10-03\t2022-10-03T06:02:01.657\n')
f.write('1\t2022-10-13\t2021-10-03T06:32:01.001')
schema = T.StructType([
T.StructField('id', T.LongType()),
T.StructField('my_date', T.DateType()),
T.StructField('my_datetime', T.TimestampType()),
])
df = spark.read.csv(r'D:/junk/tmp.csv', schema=schema, header=False, sep=r'\t', mode='FAILFAST')
df.printSchema()
# root
# |-- id: long (nullable = true)
# |-- my_date: date (nullable = true)
# |-- my_datetime: timestamp (nullable = true)
df.show()
# +---+----------+--------------------+
# | id| my_date| my_datetime|
# +---+----------+--------------------+
# | 1|2022-10-03|2022-10-03 06:02:...|
# | 1|2022-10-13|2021-10-03 06:32:...|
# +---+----------+--------------------+
# read dates and datetimes by specifying the format
# date: yyyy-MM-dd
# datetime: yyyy-MM-ddTHH:mm:ss.SSS
import pyspark.sql.types as T
with open(r'D:/junk/tmp.csv', 'wt') as f:
f.write('1\t22-10-03\t2022-10-03T06\n')
f.write('1\t22-10-13\t2021-10-03T06')
schema = T.StructType([
T.StructField('id', T.LongType()),
T.StructField('my_date', T.DateType()),
T.StructField('my_datetime', T.TimestampType())
])
df = spark.read.csv(r'D:/junk/tmp.csv', schema=schema, header=False, sep=r'\t', dateFormat='yy-MM-dd', timestampFormat='yyyy-MM-ddTHH', mode='FAILFAST')
df.printSchema()
# root
# |-- id: long (nullable = true)
# |-- my_date: date (nullable = true)
# |-- my_datetime: timestamp (nullable = true)
df.show()
# +---+----------+-------------------+
# | id| my_date| my_datetime|
# +---+----------+-------------------+
# | 1|2022-10-03|2022-10-03 06:00:00|
# | 1|2022-10-13|2021-10-03 06:00:00|
# +---+----------+-------------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment