Skip to content

Instantly share code, notes, and snippets.

@usmcamp0811
Created September 25, 2018 14:45
Show Gist options
  • Save usmcamp0811/a6dd9e217f7e72fbe4e117ea725464c0 to your computer and use it in GitHub Desktop.
Save usmcamp0811/a6dd9e217f7e72fbe4e117ea725464c0 to your computer and use it in GitHub Desktop.
Quick test to see whether loading Feather files into Dask is any better or worse than loading Parquet files. Performance appears comparable, and the Feather path needs only a little extra code.
import os
import feather
from tqdm import tqdm
import dask
import datetime
from dask import delayed
from dask import visualize
import pandas as pd
import dask.dataframe as dd
""""
Example code showing how to load many binary files into Dask.
Loading custom file types such as Feather files takes a little more
boilerplate, but the result condenses down to the same Dask dataframe
object, so there is no real loss in using Feather or other custom data types.
"""
# Benchmark: build a Dask dataframe from a directory of Feather files (one
# delayed partition per file) and from a directory of Parquet files (via
# dd.read_parquet), then time the std/mean of the "Fp1" column for each.
#
# Reported timings are all measured from the section's own start (t1):
#   T2 - Dask dataframe object has been constructed
#   T3 - lazy std/mean reductions have been defined (nothing computed yet)
#   T4 - dask.compute() has returned the actual std/mean values
feather_dir = "/media/mcamp/HDD1/Datasets/bci_data/train/feather"
parquet_dir = "/media/mcamp/HDD1/Datasets/bci_data/train/parquet"

t1 = datetime.datetime.now()
# Sorted so that partition order, and the file used to derive `meta` below,
# do not depend on the arbitrary order os.listdir returns.
filepaths = sorted(os.listdir(feather_dir))
print("Starting the Feather files")
lazy_dataframes = []
for filename in filepaths:
    # Wrap the reader in `delayed` so each file becomes one lazy partition.
    partition = delayed(feather.read_dataframe)(
        os.path.join(feather_dir, filename)
    )
    # Add a "file" column holding the source file name without ".feather".
    partition = delayed(pd.DataFrame.assign)(
        partition, file=filename.replace(".feather", "")
    )
    lazy_dataframes.append(partition)

if not lazy_dataframes:
    # Fail with a clear message instead of an IndexError on lazy_dataframes[0].
    raise FileNotFoundError("No Feather files found in " + feather_dir)

# NOTE: `meta` is derived by eagerly computing the first partition, so one
# full file read is included in the Feather T2 figure.
df = dd.from_delayed(lazy_dataframes, meta=lazy_dataframes[0].compute())
t2 = datetime.datetime.now()
_std = df.Fp1.std()
_mean = df.Fp1.mean()
t3 = datetime.datetime.now()
# Compute both reductions in a single call so they are evaluated together.
_std1, _mean1 = dask.compute(_std, _mean)
t4 = datetime.datetime.now()
print(_std1, _mean1)
print("T2:", t2 - t1)
print("T3:", t3 - t1)
print("T4:", t4 - t1)
# Drop the Feather-backed dataframe before starting the Parquet run.
del df

print("Starting the Parquet files")
t1 = datetime.datetime.now()
df = dd.read_parquet(os.path.join(parquet_dir, '*.gzip'))
t2 = datetime.datetime.now()
_std = df.Fp1.std()
_mean = df.Fp1.mean()
t3 = datetime.datetime.now()
_std1, _mean1 = dask.compute(_std, _mean)
t4 = datetime.datetime.now()
print(_std1, _mean1)
print("T2:", t2 - t1)
print("T3:", t3 - t1)
print("T4:", t4 - t1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment