Skip to content

Instantly share code, notes, and snippets.

@thrasibule
Created March 17, 2017 01:19
Show Gist options
  • Save thrasibule/7587b15e5361850b1e4563bd4d3ab23e to your computer and use it in GitHub Desktop.
Save thrasibule/7587b15e5361850b1e4563bd4d3ab23e to your computer and use it in GitHub Desktop.
import dask.dataframe as dd
import numpy as np
from pathlib import Path
# Convert the NYC green-taxi CSV files found in ``data_dir`` into one HDF5
# file and one Parquet dataset, using compact dtypes for the numeric columns.

# Take the column names from the header row of one monthly file; the same
# names are then applied to every CSV matched by the glob below.
data_dir = Path("~/data/Datasets/nyc-taxi").expanduser()
with (data_dir / "green_tripdata_2014-01.csv").open() as fh:
    headers = next(fh).rstrip().split(",")

# Placeholder names for three extra trailing fields that are read and then
# discarded (presumably the data rows carry more fields than the header
# declares -- verify against the files). The names are spelled out
# explicitly because recent pandas versions reject duplicate entries in
# ``names`` instead of silently renaming them to "junk.1", "junk.2".
junk_columns = ["junk", "junk.1", "junk.2"]

df = dd.read_csv(
    str(data_dir / "*.csv"),
    names=headers + junk_columns,
    # Column names are supplied via ``names``, so the leading lines of each
    # file are skipped (presumably the header row plus one blank line --
    # TODO confirm against the raw files).
    skiprows=2,
    parse_dates=['lpep_pickup_datetime', 'Lpep_dropoff_datetime'],
    dtype={
        'Passenger_count': np.uint8,
        'Trip_distance': np.float32,
        'MTA_tax': np.float32,
        'Payment_type': np.uint8,
        'Tip_amount': np.float32,
        'Total_amount': np.float32,
        'RateCodeID': np.uint8,
        'Store_and_fwd_flag': 'category',
        'Trip_type': 'category',
    },
)
df = df.drop(junk_columns, axis=1)

# Write the same frame in both formats.
df.to_hdf('green_tripdata.hdf', 'green_2014_*', complevel=6, complib='blosc')
df.to_parquet("green_2014", compression="SNAPPY")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment