Skip to content

Instantly share code, notes, and snippets.

@djouallah
Last active June 5, 2022 10:43
Show Gist options
  • Save djouallah/7adc5498cf4aaf55422491a6e12106cd to your computer and use it in GitHub Desktop.
Save djouallah/7adc5498cf4aaf55422491a6e12106cd to your computer and use it in GitHub Desktop.
let
parquetfunctin =(params) =>
let
Parquet = Parquet.Document(Binary.Buffer(Web.Contents("https://nyc-tlc.s3.amazonaws.com/trip+data/",[RelativePath=params])))
in
Parquet,
Source = Table.FromColumns({Lines.FromBinary(Web.Contents("https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"))}),
#"Filtered Rows" = Table.SelectRows(Source, each Text.StartsWith([Column1], "<li><a href=""https://s3.amazonaws.com/nyc-tlc/trip+data/")),
#"Split Column by Delimiter" = Table.SplitColumn(#"Filtered Rows", "Column1", Splitter.SplitTextByDelimiter("title=""", QuoteStyle.None), {"Column1.1", "Column1.2"}),
#"Removed Other Columns" = Table.SelectColumns(#"Split Column by Delimiter",{"Column1.1"}),
#"Replaced Value" = Table.ReplaceValue(#"Removed Other Columns","<li><a href=""https://s3.amazonaws.com/nyc-tlc/trip+data/","",Replacer.ReplaceText,{"Column1.1"}),
#"Replaced Value1" = Table.ReplaceValue(#"Replaced Value","""","",Replacer.ReplaceText,{"Column1.1"}),
#"Renamed Columns" = Table.RenameColumns(#"Replaced Value1",{{"Column1.1", "URL"}}),
#"Trimmed Text" = Table.TransformColumns(#"Renamed Columns",{{"URL", Text.Trim, type text}}),
#"Cleaned Text" = Table.TransformColumns(#"Trimmed Text",{{"URL", Text.Clean, type text}}),
#"Kept First Rows" = Table.FirstN(#"Cleaned Text",2),
#"Added Custom" = Table.AddColumn(#"Kept First Rows", "Custom", each parquetfunctin([URL])),
#"Expanded Custom" = Table.ExpandTableColumn(#"Added Custom", "Custom", {"VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "congestion_surcharge", "airport_fee"}, {"VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "congestion_surcharge", "airport_fee"})
in
#"Expanded Custom"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment