Skip to content

Instantly share code, notes, and snippets.

@gidutz
Created June 6, 2019 12:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gidutz/86c701c621e48860c8ab135df56a8076 to your computer and use it in GitHub Desktop.
Save gidutz/86c701c621e48860c8ab135df56a8076 to your computer and use it in GitHub Desktop.
chicago taxi feature extraction
WITH dataset AS( SELECT
EXTRACT(HOUR FROM trip_start_timestamp) trip_start_hour
, EXTRACT(DAYOFWEEK FROM trip_start_timestamp) trip_start_weekday
, EXTRACT(WEEK FROM trip_start_timestamp) trip_start_week
, EXTRACT(DAYOFYEAR FROM trip_start_timestamp) trip_start_yearday
, EXTRACT(MONTH FROM trip_start_timestamp) trip_start_month
, (trip_miles * 1.60934 ) / ((trip_seconds + .01) / (60 * 60)) trip_speed_kmph
, trip_miles
, pickup_latitude
, pickup_longitude
, dropoff_latitude
, dropoff_longitude
, pickup_community_area
, dropoff_community_area
, ST_DISTANCE(
(ST_GEOGPOINT(pickup_longitude,pickup_latitude)),
(ST_GEOGPOINT(dropoff_longitude,dropoff_latitude))) air_distance
, CAST (trip_seconds AS FLOAT64) trip_seconds
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE RAND() < (3000000/112860054) --sample maximum ~3M records
AND trip_start_timestamp < '2016-01-01'
AND pickup_location IS NOT NULL
AND dropoff_location IS NOT NULL)
SELECT
trip_seconds
, air_distance
, pickup_latitude
, pickup_longitude
, dropoff_latitude
, dropoff_longitude
, pickup_community_area
, dropoff_community_area
, trip_start_hour
, trip_start_weekday
, trip_start_week
, trip_start_yearday
, trip_start_month
FROM dataset
WHERE trip_speed_kmph BETWEEN 5 AND 90
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment