Created
May 1, 2022 08:39
-
-
Save karpanGit/3ce5ed2eb6412a5af4605bcf038e04b3 to your computer and use it in GitHub Desktop.
pyspark, read csv by specifying date and datetime format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# note the FAILFAST mode. It is much preferred to PERMISSIVE to catch errors early | |
# read dates and datetimes using the default ISO format | |
# date: yyyy-MM-dd | |
# datetime: yyyy-MM-ddTHH:mm:ss.SSS | |
import pyspark.sql.types as T | |
with open(r'D:/junk/tmp.csv', 'wt') as f: | |
f.write('1\t2022-10-03\t2022-10-03T06:02:01.657\n') | |
f.write('1\t2022-10-13\t2021-10-03T06:32:01.001') | |
schema = T.StructType([ | |
T.StructField('id', T.LongType()), | |
T.StructField('my_date', T.DateType()), | |
T.StructField('my_datetime', T.TimestampType()), | |
]) | |
df = spark.read.csv(r'D:/junk/tmp.csv', schema=schema, header=False, sep=r'\t', mode='FAILFAST') | |
df.printSchema() | |
# root | |
# |-- id: long (nullable = true) | |
# |-- my_date: date (nullable = true) | |
# |-- my_datetime: timestamp (nullable = true) | |
df.show() | |
# +---+----------+--------------------+ | |
# | id| my_date| my_datetime| | |
# +---+----------+--------------------+ | |
# | 1|2022-10-03|2022-10-03 06:02:...| | |
# | 1|2022-10-13|2021-10-03 06:32:...| | |
# +---+----------+--------------------+ | |
# read dates and datetimes by specifying the format | |
# date: yyyy-MM-dd | |
# datetime: yyyy-MM-ddTHH:mm:ss.SSS | |
import pyspark.sql.types as T | |
with open(r'D:/junk/tmp.csv', 'wt') as f: | |
f.write('1\t22-10-03\t2022-10-03T06\n') | |
f.write('1\t22-10-13\t2021-10-03T06') | |
schema = T.StructType([ | |
T.StructField('id', T.LongType()), | |
T.StructField('my_date', T.DateType()), | |
T.StructField('my_datetime', T.TimestampType()) | |
]) | |
df = spark.read.csv(r'D:/junk/tmp.csv', schema=schema, header=False, sep=r'\t', dateFormat='yy-MM-dd', timestampFormat='yyyy-MM-ddTHH', mode='FAILFAST') | |
df.printSchema() | |
# root | |
# |-- id: long (nullable = true) | |
# |-- my_date: date (nullable = true) | |
# |-- my_datetime: timestamp (nullable = true) | |
df.show() | |
# +---+----------+-------------------+ | |
# | id| my_date| my_datetime| | |
# +---+----------+-------------------+ | |
# | 1|2022-10-03|2022-10-03 06:00:00| | |
# | 1|2022-10-13|2021-10-03 06:00:00| | |
# +---+----------+-------------------+ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment