Skip to content

Instantly share code, notes, and snippets.

@hack-c
Last active January 25, 2019 21:51
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hack-c/3e97490aa25db829a5d5 to your computer and use it in GitHub Desktop.
Save hack-c/3e97490aa25db829a5d5 to your computer and use it in GitHub Desktop.
a few taxi metrix
/$$$$$$$$ /$$$$$$ /$$ /$$ /$$$$$$ /$$$$$$$ /$$$$$$ /$$$$$$$$ /$$$$$$
|__ $$__//$$__ $$| $$ / $$|_ $$_/| $$__ $$ /$$__ $$|__ $$__//$$__ $$
| $$ | $$ \ $$| $$/ $$/ | $$ | $$ \ $$| $$ \ $$ | $$ | $$ \ $$
| $$ | $$$$$$$$ \ $$$$/ | $$ | $$ | $$| $$$$$$$$ | $$ | $$$$$$$$
| $$ | $$__ $$ >$$ $$ | $$ | $$ | $$| $$__ $$ | $$ | $$__ $$
| $$ | $$ | $$ /$$/\ $$ | $$ | $$ | $$| $$ | $$ | $$ | $$ | $$
| $$ | $$ | $$| $$ \ $$ /$$$$$$| $$$$$$$/| $$ | $$ | $$ | $$ | $$
|__/ |__/ |__/|__/ |__/|______/|_______/ |__/ |__/ |__/ |__/ |__/
-- by Charlie Hack
-- the data is waiting 4U: https://bigquery.cloud.google.com/table/833682135931:nyctaxi.trip_data
-- Some aggregate stats
-----------------------
SELECT
COUNT(*) AS trips,
ROUND(AVG(trip_fare.total_amount), 2) AS avg_fare,
ROUND(AVG(trip_fare.tip_amount), 2) AS avg_tip,
INTEGER(AVG((FLOAT(trip_fare.tip_amount) / FLOAT(trip_fare.total_amount))*100)) AS avg_tip_percent
FROM
[833682135931:nyctaxi.trip_fare] AS trip_fare
-- Total trips: 187287452
-- Trips with 0 tip: 89099496
-- Total CRD trips: 101089735
-- Total CRD trips with 0 tip: 3133890
--Most Winningest Cabbies
-------------------------
SELECT
earnings,
trips,
(earnings / trips) as avg_fare
medallion,
FROM
(SELECT
ROUND(SUM(FLOAT(total_amount)), 2) AS earnings,
medallion,
count(*) AS trips
FROM
[833682135931:nyctaxi.trip_fare]
GROUP EACH BY
medallion
ORDER BY
hustle DESC
LIMIT
10)
earnings,trips,avg_fare,medallion
894549.01,15151,59.04224209623127,89EDAF45090C74611B52AFFC3E10A69D
739888.6,12278,60.26132920671119,CC664699259C9867E735976611A82F64
427608.96,19744,21.657666126418153,155EBAC6C5A22D6CFD3518F6A0E9190C
324753.63,18895,17.187278645144218,D4CA68ECC21536DE406F3D58C7813241
316144.67,22735,13.9056375632285,19E063791B0DF5A558B8488180DDAB67
314569.01,21066,14.932545808411659,8DEB70907D00AA1D7FF5E2683240549B
314350.28,22460,13.996005342831703,C2A40A5B0F1B26BF17FD07F91CD6176C
310220.79,23227,13.356042106169543,B6585890F68EE02702F32DECDEABC2A8
309505.65,23011,13.450334622571814,DACFA6EF35923081481A22BE96339B6E
307047.84,20834,14.737824709609294,49618C1CB1B96F6884917A4FF2A849D2
305662.46,24536,12.457713563743072,20BA941F62CC07F1FA3EF3E122B1E9B2
304590.85,22607,13.47329809351086,9F1FEF916240E64AD4EC5B7883E6A435
304512.95,21363,14.254222253428827,2C159C8FCCDE50174CF6CFC07E75F1BA
303085.82,21563,14.055828038770116,E5D06DBAEF87F704EA5D8F5D584C22D4
303047.85,20729,14.619511312653769,6945300E90C69061B463CCDA370DE5D6
303043.32,22733,13.330546782210883,8CE240F0796D072D5DCFE06A364FB5A0
302885.46,22421,13.509007626778468,2F2F160D8428A7757500371F980468D5
301504.86,22150,13.611957562076748,33955A2FCAF62C6E91A11AE97D96C99A
301251.87,21868,13.775922352295591,BC9EE7F807E71ACA2AEA11CAB51604F8
300969.58,20488,14.690041975790708,F2967DBF707C06314CEB745A83332D62
--'Best' Cabbies (highest distance per fare)
----------------------------------------------
SELECT
ROUND((total_distance / earnings),2) AS miles_per_dolla,
total_distance,
earnings,
tips,
trips,
trip_data.medallion AS medallion
FROM
(SELECT
trip_data.medallion,
ROUND(SUM( (3959 * acos( cos(radians(FLOAT(trip_data.pickup_latitude)))*cos(radians(FLOAT(trip_data.dropoff_latitude)))*cos(radians(FLOAT(trip_data.dropoff_longitude))-radians(FLOAT(trip_data.pickup_longitude))) + sin(radians(FLOAT(trip_data.pickup_latitude)))*sin(radians(FLOAT(trip_data.dropoff_latitude))) )) ),2) AS total_distance, --haversine formula.
ROUND(SUM(FLOAT(trip_fare.total_amount)), 2) AS earnings,
ROUND(SUM(FLOAT(trip_fare.tip_amount)), 2) AS tips,
SUM(COUNT(*)) AS trips
FROM [833682135931:nyctaxi.trip_data] AS trip_data
JOIN EACH [833682135931:nyctaxi.trip_fare] AS trip_fare
ON trip_data.medallion = trip_fare.medallion
AND trip_data.pickup_datetime = trip_fare.pickup_datetime
GROUP EACH BY trip_data.medallion)
ORDER BY miles_per_dolla DESC
LIMIT 20
miles_per_dolla,total_distance,earnings,tips,trips,medallion
134.97,43062.99,319.05,2.0,48,08F04B9FDB246C105F9DADEB232C1C35
126.48,16148.83,127.68,3.02,24,28B646E3B0A34F8756AC60E2E7670F0D
100.45,1.137581447E7,113249.57,10130.52,7721,342381FAD34CA62E35CF71409DFEED43
95.0,80824.57,850.8,0.47,45,14B938A25E96E2928E3B167A038D17D5
94.31,16148.62,171.23,9.0,36,1CD95D7378F2999FE588A598A466ABC8
90.01,10765.77,119.6,0.0,12,AD8ABBDDFAF82EF07592BA9107B05D9F
78.47,269144.28,3430.07,327.5,429,4CE74C6E7448FF1D5BD4E54162508CF1
74.56,1.543145627E7,206974.76,19919.07,13387,DD9022A19340DE6C974072C5A6C8A732
68.57,5849870.86,85310.43,8145.66,6323,266CCA39BBE9D879BB5CF8DEA1B19871
62.11,22002.18,354.25,0.0,47,994D1CAD9132E48C993D58B492F71FC1
58.97,37360.54,633.5,127.0,156,220E43F9890CBB691F8348EA51B19F19
57.56,5385.09,93.55,6.25,2,0EE3FFCBDFD8B2979E87F38369A28FD9
54.88,3511595.83,63988.56,6608.52,2805,4EF7FFB140F99C849410B914939B4949
48.17,10765.79,223.5,0.0,51,3282D437BDE390612023C5237BE0FA6D
43.59,10765.78,246.99,0.0,8,7A0FB79952AD02B1AD496FB9BA9A61FB
42.89,1.006932368E7,234752.38,21797.37,16654,399DB6E048D813564DA0358503474A2C
41.69,2423260.9,58126.42,5635.17,3931,4CBBA37839940ADE243D1B4B2A4583C4
38.04,4154827.29,109226.34,10678.33,6884,C751E707D7D20AA0761EE82793EFC6BD
36.51,4699712.71,128717.79,12124.26,6795,D2D00AA857BA7917A99D1E1781A897D1
35.14,59212.35,1685.23,194.95,238,00C1977D0DFAA5D97557054EDF473017
-- Longest rides
-- note: the distance data is pretty messy it turns out, lots of zero values, truncated decimals
-- etc. that throw the distance calculation out of whack. sort by fare instead
----------------
SELECT
ROUND(3959 * acos( cos(radians(FLOAT(trip_data.pickup_latitude)))*cos(radians(FLOAT(trip_data.dropoff_latitude)))*cos(radians(FLOAT(trip_data.dropoff_longitude))-radians(FLOAT(trip_data.pickup_longitude))) + sin(radians(FLOAT(trip_data.pickup_latitude)))*sin(radians(FLOAT(trip_data.dropoff_latitude))) ) ,2) AS distance, --haversine formula.
trip_data.pickup_latitude AS pickup_latitude,
trip_data.pickup_longitude AS pickup_longitude,
trip_data.dropoff_latitude AS dropoff_latitude,
trip_data.dropoff_longitude AS dropoff_longitude,
trip_fare.total_amount AS total_fare,
trip_fare.tip_amount AS tip,
INTEGER((FLOAT(trip_fare.tip_amount) / FLOAT(trip_fare.total_amount))*100) AS tip_percent,
trip_data.medallion AS medallion
FROM
[833682135931:nyctaxi.trip_data] AS trip_data
JOIN EACH [833682135931:nyctaxi.trip_fare] AS trip_fare
ON trip_data.medallion = trip_fare.medallion AND trip_data.pickup_datetime = trip_fare.pickup_datetime
ORDER BY total_fare DESC
LIMIT 100
-- looks like it's popular to hand over a thou or a hunny. the first guy must have done a loop
distance,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,total_fare,tip,tip_percent,medallion
0.0,40.811169,-73.930168,40.811165,-73.930153,999.99,0,0,0E1DBB68725F8E2F36BCF596C2719561
6.45,40.773533,-73.870659,40.732693,-73.981476,995.59,0,0,08B54C66963A4F5894CC4EDA638FD27F
0.0,40.784782,-73.951828,40.784782,-73.951836,995,0,0,81D63C85750AF2C4D125380942446905
0.47,40.80946,-73.93795,40.812065,-73.929642,99.99,0,0,1D53A0A8DBFD2F3382F764180633C1EF
17.86,40.749981,-73.991196,40.751194,-73.650047,99.99,16.66,16,69D3ACF9CEAD3386114387EF57BB02DF
1.26,40.768665,-73.892746,40.772736,-73.86937,99.99,0,0,1D53A0A8DBFD2F3382F764180633C1EF
0.0,41.377293,-73.961311,41.3773,-73.961281,99.99,0,0,8E7A10A881DEF75BE6BA7AFF896610FD
12.45,40.758072,-73.99295,40.813076,-74.219482,99.99,19.99,19,14D8F54D4F64DB9B8C4D79EBC2DA8218
0.0,-180,-180,-180,-180,99.99,0,0,A485E6CAA482169A2A3837DEC32AEBAB
13.72,40.702736,-74.012192,40.541527,-74.164764,99.99,28,28,3F38E295118F0BFFEF93ED81A5EFB47D
19.83,40.646698,-73.777184,40.928585,-73.848419,99.99,16.66,16,60091B1217A9E629BEA6AF080AAB9BCD
18.01,40.731499,-74.001038,40.801723,-73.669609,99.99,16.66,16,AE9E14F1EBFECEFF6ADFD437CA64F957
7.08,40.751312,-73.897438,40.781467,-74.026772,99.99,16.66,16,D2A54CCEA77527E69E6C65125CACD01D
-- Tips
-------
SELECT
INTEGER(ROUND(FLOAT(tip_amount) / FLOAT(fare_amount) * 100)) tip_pct,
count(*) trips
FROM [833682135931:nyctaxi.trip_fare]
WHERE payment_type='CRD' and float(fare_amount) > 0.00
GROUP BY tip_pct
ORDER BY tip_pct
-- Tips grouped by geo blocks
-----------------------------
SELECT
CONCAT(STRING(ROUND(FLOAT(trip_data.pickup_latitude), 4)), ',', STRING(ROUND(FLOAT(trip_data.pickup_longitude), 4))) AS geobox,
ROUND(AVG(FLOAT(trip_data.pickup_latitude)), 4) as lat,
ROUND(AVG(FLOAT(trip_data.pickup_longitude)), 4) as lon,
COUNT(*) AS trips,
ROUND(AVG(trip_fare.total_amount), 2) AS avg_fare,
ROUND(AVG(trip_fare.tip_amount), 2) AS avg_tip,
INTEGER(AVG((FLOAT(trip_fare.tip_amount) / FLOAT(trip_fare.total_amount))*100)) AS avg_tip_percent
FROM
[833682135931:nyctaxi.trip_data] AS trip_data
JOIN EACH [833682135931:nyctaxi.trip_fare] AS trip_fare
ON trip_data.medallion = trip_fare.medallion AND trip_data.pickup_datetime = trip_fare.pickup_datetime
WHERE FLOAT(trip_data.pickup_longitude) != 0 AND FLOAT(trip_data.pickup_latitude) != 0
GROUP EACH BY geobox
ORDER BY trips DESC
-- Most ridiculous tips
-----------------------
SELECT
tip_pct,
hack_license,
pickup_datetime,
fare_amount,
payment_type,
tip_amount
FROM
(SELECT
hack_license,
INTEGER(ROUND(FLOAT(tip_amount) / FLOAT(fare_amount) * 100)) AS tip_pct,
pickup_datetime,
fare_amount,
payment_type,
tip_amount
FROM
[833682135931:nyctaxi.trip_fare]
WHERE
float(fare_amount) > 50.00)
WHERE hack_license != "CFCD208495D565EF66E7DFF9F98764DA"
ORDER BY 1 DESC
LIMIT 10
-- Laziest neighborhoods
------------------------
SELECT
CONCAT(STRING(ROUND(FLOAT(trip_data.pickup_latitude), 3)), '|', STRING(ROUND(FLOAT(trip_data.pickup_longitude), 3))) AS geobox,
ROUND(AVG(FLOAT(trip_data.pickup_latitude)), 3) as lat,
ROUND(AVG(FLOAT(trip_data.pickup_longitude)), 3) as lon,
COUNT(*) AS trips,
ROUND(AVG(3959 * acos( cos(radians(FLOAT(trip_data.pickup_latitude)))*cos(radians(FLOAT(trip_data.dropoff_latitude)))*cos(radians(FLOAT(trip_data.dropoff_longitude))-radians(FLOAT(trip_data.pickup_longitude))) + sin(radians(FLOAT(trip_data.pickup_latitude)))*sin(radians(FLOAT(trip_data.dropoff_latitude))) ) ,2)) AS distance, --haversine formula.
FROM
[833682135931:nyctaxi.trip_data] AS trip_data
WHERE FLOAT(trip_data.pickup_longitude) != 0 AND FLOAT(trip_data.pickup_latitude) != 0
GROUP EACH BY geobox
ORDER BY trips DESC
-- Pickup density
-----------------
SELECT
CONCAT(STRING(ROUND(FLOAT(trip_data.pickup_latitude), 4)), ',', STRING(ROUND(FLOAT(trip_data.pickup_longitude), 4))) AS geobox,
ROUND(AVG(FLOAT(trip_data.pickup_latitude)), 4) as lat,
ROUND(AVG(FLOAT(trip_data.pickup_longitude)), 4) as lon,
COUNT(*) AS trips,
ROUND(AVG(trip_fare.total_amount), 2) AS avg_fare,
ROUND(AVG(trip_fare.tip_amount), 2) AS avg_tip,
INTEGER(AVG((FLOAT(trip_fare.tip_amount) / FLOAT(trip_fare.total_amount))*100)) AS avg_tip_percent
FROM
[833682135931:nyctaxi.trip_data] AS trip_data
JOIN EACH [833682135931:nyctaxi.trip_fare] AS trip_fare
ON trip_data.medallion = trip_fare.medallion AND trip_data.pickup_datetime = trip_fare.pickup_datetime
WHERE FLOAT(trip_data.pickup_longitude) != 0 AND FLOAT(trip_data.pickup_latitude) != 0
GROUP EACH BY geobox
ORDER BY trips DESC
-- Dropoff density
------------------
SELECT
CONCAT(STRING(ROUND(FLOAT(trip_data.dropoff_latitude), 3)), '|', STRING(ROUND(FLOAT(trip_data.dropoff_longitude), 3))) AS geobox,
ROUND(AVG(FLOAT(trip_data.dropoff_latitude)), 3) as lat,
ROUND(AVG(FLOAT(trip_data.dropoff_longitude)), 3) as lon,
COUNT(*) AS trips,
ROUND(AVG(trip_fare.total_amount), 2) AS avg_fare,
ROUND(AVG(trip_fare.tip_amount), 2) AS avg_tip,
INTEGER(AVG((FLOAT(trip_fare.tip_amount) / FLOAT(trip_fare.total_amount))*100)) AS avg_tip_percent
FROM
[833682135931:nyctaxi.trip_data] AS trip_data
JOIN EACH [833682135931:nyctaxi.trip_fare] AS trip_fare
ON trip_data.medallion = trip_fare.medallion AND trip_data.pickup_datetime = trip_fare.pickup_datetime
WHERE FLOAT(trip_data.pickup_longitude) != 0 AND FLOAT(trip_data.pickup_latitude) != 0
GROUP EACH BY geobox
ORDER BY trips DESC
-- Most common trips
--------------------
SELECT
CONCAT(STRING(ROUND(FLOAT(trip_data.pickup_latitude), 3)), '|', STRING(ROUND(FLOAT(trip_data.pickup_longitude), 3)), '|', CONCAT(STRING(ROUND(FLOAT(trip_data.dropoff_latitude), 3)), '|', STRING(ROUND(FLOAT(trip_data.dropoff_longitude), 3)))) AS trip_string,
COUNT(*) AS trips,
ROUND(AVG(trip_fare.total_amount), 2) AS avg_fare,
ROUND(AVG(trip_fare.tip_amount), 2) AS avg_tip,
INTEGER(AVG((FLOAT(trip_fare.tip_amount) / FLOAT(trip_fare.total_amount))*100)) AS avg_tip_percent
FROM
[833682135931:nyctaxi.trip_data] AS trip_data
JOIN EACH [833682135931:nyctaxi.trip_fare] AS trip_fare
ON trip_data.medallion = trip_fare.medallion AND trip_data.pickup_datetime = trip_fare.pickup_datetime
WHERE FLOAT(trip_data.pickup_longitude) != 0 AND FLOAT(trip_data.pickup_latitude) != 0
GROUP EACH BY trip_string
ORDER BY trips DESC
LIMIT 20
-- Largest fares
----------------
SELECT
ROUND(3959 * acos( cos(radians(FLOAT(trip_data.pickup_latitude)))*cos(radians(FLOAT(trip_data.dropoff_latitude)))*cos(radians(FLOAT(trip_data.dropoff_longitude))-radians(FLOAT(trip_data.pickup_longitude))) + sin(radians(FLOAT(trip_data.pickup_latitude)))*sin(radians(FLOAT(trip_data.dropoff_latitude))) ) ,2) AS distance, --haversine formula.
trip_data.pickup_datetime,
trip_data.dropoff_datetime,
trip_data.trip_time_in_secs,
trip_data.pickup_latitude AS pickup_latitude,
trip_data.pickup_longitude AS pickup_longitude,
trip_data.dropoff_latitude AS dropoff_latitude,
trip_data.dropoff_longitude AS dropoff_longitude,
trip_fare.payment_type AS payment_type,
trip_fare.total_amount AS total_fare,
trip_fare.tip_amount AS tip,
INTEGER((FLOAT(trip_fare.tip_amount) / FLOAT(trip_fare.total_amount))*100) AS tip_percent,
trip_data.medallion AS medallion
FROM
[833682135931:nyctaxi.trip_data] AS trip_data
JOIN EACH [833682135931:nyctaxi.trip_fare] AS trip_fare
ON trip_data.medallion = trip_fare.medallion AND trip_data.pickup_datetime = trip_fare.pickup_datetime
ORDER BY total_fare DESC
LIMIT 100
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment