|
#!/bin/bash |
|
|
|
filename=$1 |
|
regex="([[:alnum:]]+)_tripdata_([[:digit:]]{4})-([[:digit:]]{2})" |
|
|
|
if [[ $filename =~ $regex ]]; then |
|
type=${BASH_REMATCH[1]} |
|
year=${BASH_REMATCH[2]} |
|
month=$((10#${BASH_REMATCH[3]})) |
|
else |
|
echo >&2 "Pattern unrecognized file ${filename}" |
|
exit 1 |
|
fi |
|
|
|
green_schema_pre_2015="(vendor_id,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,total_amount,payment_type,trip_type,junk1,junk2)" |
|
green_schema_2015_h1="(vendor_id,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,junk1,junk2)" |
|
green_schema_2015_h2_2016_h1="(vendor_id,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type)" |
|
green_schema_2016_h2="(vendor_id,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_location_id,dropoff_location_id,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,junk1,junk2)" |
|
green_schema_2017_h1="(vendor_id,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_location_id,dropoff_location_id,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type)" |
|
green_schema_2019_h1="(vendor_id,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_location_id,dropoff_location_id,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge)" |
|
|
|
yellow_schema_pre_2015="(vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount)" |
|
yellow_schema_2015_2016_h1="(vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount)" |
|
yellow_schema_2016_h2="(vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,junk1,junk2)" |
|
yellow_schema_2017_h1="(vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount)" |
|
yellow_schema_2019_h1="(vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge)" |
|
|
|
fhv_schema_pre_2017="(dispatching_base_num,pickup_datetime,pickup_location_id)" |
|
fhv_schema_2017_h1="(dispatching_base_num,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id)" |
|
fhv_schema_2017_h2="(dispatching_base_num,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,shared_ride)" |
|
fhv_schema_2018="(pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,shared_ride,dispatching_base_num,junk)" |
|
fhv_schema_2019="(dispatching_base_num,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,shared_ride)" |
|
|
|
fhvhv_schema="(hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,shared_ride)" |
|
|
|
################################################# |
|
function sql_green { |
|
local schema="" |
|
if [ $year -lt 2015 ]; then |
|
schema=$green_schema_pre_2015 |
|
elif [ $year -eq 2015 ] && [ $month -lt 7 ]; then |
|
schema=$green_schema_2015_h1 |
|
elif [ $year -eq 2015 ] || ([ $year -eq 2016 ] && [ $month -lt 7 ]); then |
|
schema=$green_schema_2015_h2_2016_h1 |
|
elif [ $year -eq 2016 ] && [ $month -gt 6 ]; then |
|
schema=$green_schema_2016_h2 |
|
elif [ $year -lt 2019 ]; then |
|
schema=$green_schema_2017_h1 |
|
else |
|
schema=$green_schema_2019_h1 |
|
fi |
|
SQL="COPY green_tripdata_staging ${schema} FROM stdin WITH CSV HEADER;" |
|
} |
|
|
|
function sql_yellow { |
|
local schema="" |
|
if [ $year -lt 2015 ]; then |
|
schema=$yellow_schema_pre_2015 |
|
elif [ $year -eq 2015 ] || ([ $year -eq 2016 ] && [ $month -lt 7 ]); then |
|
schema=$yellow_schema_2015_2016_h1 |
|
elif [ $year -eq 2016 ] && [ $month -gt 6 ]; then |
|
schema=$yellow_schema_2016_h2 |
|
elif [ $year -lt 2019 ]; then |
|
schema=$yellow_schema_2017_h1 |
|
else |
|
schema=$yellow_schema_2019_h1 |
|
fi |
|
SQL="COPY yellow_tripdata_staging ${schema} FROM stdin WITH CSV HEADER;" |
|
} |
|
|
|
function sql_fhv { |
|
local schema="" |
|
if [ $year -lt 2017 ]; then |
|
schema=$fhv_schema_pre_2017 |
|
elif [ $year -eq 2017 ] && [ $month -lt 7 ]; then |
|
schema=$fhv_schema_2017_h1 |
|
elif [ $year -eq 2017 ]; then |
|
schema=$fhv_schema_2017_h2 |
|
elif [ $year -eq 2018 ]; then |
|
schema=$fhv_schema_2018 |
|
else |
|
schema=$fhv_schema_2019 |
|
fi |
|
SQL="COPY fhv_trips_staging ${schema} FROM stdin WITH CSV HEADER;" |
|
} |
|
|
|
function sql_fhvhv { |
|
local schema="${fhvhv_schema}" |
|
SQL="COPY fhv_trips_staging ${schema} FROM stdin WITH CSV HEADER;" |
|
} |
|
################################################# |
|
SQL="" |
|
"sql_${type}" |
|
|
|
echo "$(date -Isecond): beginning raw load for ${filename}" |
|
sed $'s/\r$//' $filename | sed '/^\s*$/d' | psql "nyc-taxi-data" -c "${SQL}" |
|
echo "$(date -Isecond): finished raw load for ${filename}" |