Created
July 30, 2020 20:04
-
-
Save rikturr/fe6ccab1ef0c7c46c91c24e5e0d3f175 to your computer and use it in GitHub Desktop.
spark_csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools | |
from pyspark.sql.types import * | |
import pyspark.sql.functions as F | |
from pyspark.sql import DataFrame | |
# manually specify schema because inferSchema in read.csv is quite slow
# (schema inference forces Spark to make an extra full pass over each CSV file)
schema = StructType([
    StructField('VendorID', DoubleType()),
    StructField('tpep_pickup_datetime', TimestampType()),
    # NOTE(review): `...` below is a gist placeholder, not runnable code —
    # StructType will reject Ellipsis at runtime.
    ...
    # refer to notebook for full schema object
])
def read_csv(path, columns=None):
    """Read one taxi-trip CSV into a Spark DataFrame.

    Uses the module-level ``schema`` rather than ``inferSchema`` (which
    would force an extra full scan of the file), then projects down to
    the requested column list.

    Parameters
    ----------
    path : str
        Location of the CSV file (local path, S3/HDFS URI, ...).
    columns : list, optional
        Columns to keep. Defaults to the module-level ``cols`` list,
        preserving the original behavior.

    Returns
    -------
    pyspark.sql.DataFrame
    """
    # NOTE(review): relies on `spark` (a SparkSession) and, when
    # `columns` is not given, on `cols` — both defined at module level
    # but not visible in this snippet; confirm they exist before reuse.
    df = spark.read.csv(
        path,
        header=True,
        schema=schema,
        timestampFormat='yyyy-MM-dd HH:mm:ss',
    )
    if columns is None:
        columns = cols
    return df.select(columns)
# Load every per-file DataFrame, then stack them into one `taxi` frame.
dfs = [read_csv(tf) for tf in files]
taxi = functools.reduce(DataFrame.unionAll, dfs)
# Spark is lazy — count() forces the reads to actually run and returns
# the total number of rows across all input files.
taxi.count()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment