Skip to content

Instantly share code, notes, and snippets.

@rikturr
Created July 21, 2020 14:37
Show Gist options
  • Save rikturr/3a413b32eea396f608484b90ae2e033d to your computer and use it in GitHub Desktop.
Save rikturr/3a413b32eea396f608484b90ae2e033d to your computer and use it in GitHub Desktop.
spark features
import datetime

import pyspark.sql.functions as F
import pyspark.sql.types as T
# Feature engineering on the `taxi` DataFrame: derive numeric pickup-time
# features from `tpep_pickup_datetime`, replace missing values, and expose
# `total_amount` under the name "label".
pickup_ts = F.col('tpep_pickup_datetime')
double = T.DoubleType()

# Reference instant for the "seconds elapsed since start of year" feature.
year_start = datetime.datetime(2019, 1, 1, 0, 0, 0)

# Calendar / clock components of the pickup time, each cast to double.
# F.dayofweek is 1-based (1 = Sunday ... 7 = Saturday).
taxi = taxi.withColumn('pickup_weekday', F.dayofweek(pickup_ts).cast(double))
taxi = taxi.withColumn('pickup_weekofyear', F.weekofyear(pickup_ts).cast(double))
taxi = taxi.withColumn('pickup_hour', F.hour(pickup_ts).cast(double))
taxi = taxi.withColumn('pickup_minute', F.minute(pickup_ts).cast(double))

# Seconds between the pickup time and `year_start`.
taxi = taxi.withColumn(
    'pickup_year_seconds',
    (F.unix_timestamp(pickup_ts) - F.unix_timestamp(F.lit(year_start))).cast(double),
)

# Hour-of-week style feature built from the two columns derived above.
# Because pickup_weekday starts at 1, the smallest possible value is 24.
taxi = taxi.withColumn(
    'pickup_week_hour',
    ((F.col('pickup_weekday') * 24) + F.col('pickup_hour')).cast(double),
)

# Null passenger counts become the sentinel -1 before the cast to double.
taxi = taxi.withColumn(
    'passenger_count',
    F.coalesce(F.col('passenger_count'), F.lit(-1)).cast(double),
)

# Replace nulls in the categorical columns with the literal 'missing'.
# NOTE(review): a string fill value only takes effect on string-typed
# columns; if VendorID / RatecodeID are numeric in this dataset their nulls
# will remain - confirm the schema.
taxi = taxi.fillna({
    'VendorID': 'missing',
    'RatecodeID': 'missing',
    'store_and_fwd_flag': 'missing',
})

# Spark ML expects a "label" column for the dependent variable
taxi = taxi.withColumn('label', F.col('total_amount'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment