@rikturr
Created July 30, 2020 20:04
spark_csv
import functools

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

# the gist assumes a notebook-provided `spark` session;
# getOrCreate() also makes the snippet runnable standalone
spark = SparkSession.builder.getOrCreate()
# manually specify the schema because inferSchema in read.csv is quite slow
schema = StructType([
    StructField('VendorID', DoubleType()),
    StructField('tpep_pickup_datetime', TimestampType()),
    ...
    # refer to the notebook for the full schema object
])
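
# Aside (not in the original gist): read.csv also accepts a DDL-formatted
# string for `schema`, which is a more compact way to declare the same fields:
#   schema = 'VendorID DOUBLE, tpep_pickup_datetime TIMESTAMP, ...'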
def read_csv(path):
    # read one CSV file with the explicit schema, keeping only the shared columns
    df = spark.read.csv(path,
                        header=True,
                        schema=schema,
                        timestampFormat='yyyy-MM-dd HH:mm:ss',
                        )
    # `cols` is the list of column names to keep, defined earlier in the notebook
    df = df.select(cols)
    return df
# `files` is the list of CSV paths, defined earlier in the notebook
dfs = []
for tf in files:
    df = read_csv(tf)
    dfs.append(df)

# unionAll resolves columns by position; safe here because every frame
# was read with the same schema and column selection
taxi = functools.reduce(DataFrame.unionAll, dfs)

# count() is an action, so it forces Spark to actually read every file
taxi.count()
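
Note: `spark.read.csv` also accepts a list of paths, so the per-file loop and union can usually be collapsed into a single read. A minimal sketch, assuming the same `files`, `schema`, and `cols` as above:

taxi = (spark.read.csv(files,
                       header=True,
                       schema=schema,
                       timestampFormat='yyyy-MM-dd HH:mm:ss')
        .select(cols))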