@DeastinY
Last active October 25, 2019 15:29
import timeit
from pathlib import Path

import numpy as np
import pandas as pd


def create_test_data(large=False, small=False):
    # Note that a large_number-by-1 DataFrame is not really representative.
    root = Path("tmp")
    root.mkdir(exist_ok=True)
    if large:
        large_df = pd.DataFrame(np.random.randint(0, 100, size=(800000000, 1)))
        # Feather requires string column names.
        large_df.columns = large_df.columns.astype(str)
        large_df.to_feather(root / "large_df.feather")
    if small:
        for i in range(20000):
            small_df = pd.DataFrame(np.random.randint(0, 100, size=(40000, 1)))
            small_df.columns = small_df.columns.astype(str)
            small_df.to_feather(root / f"small_df_{i}.feather")


def test_speed(large=False, small=False):
    if large:
        print(timeit.timeit(
            'pd.read_feather("tmp/large_df.feather")',
            setup="import pandas as pd",
            number=10,
        ))
    if small:
        print(timeit.timeit(
            'for f in Path("tmp").glob("small_*.feather"): pd.read_feather(f)',
            setup="import pandas as pd; from pathlib import Path",
            number=10,
        ))


if __name__ == '__main__':
    create_test_data(large=False, small=True)
    test_speed(large=False, small=True)
# Loading a single 6 GB file takes roughly 15.2 seconds,
# in contrast to loading 20,000 small (~300 KB) files, which takes
# roughly forever (296.8 seconds). (Totals over ten timeit runs.)
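One takeaway from the numbers above is that per-file overhead dominates when reading many small feather files. Below is a minimal sketch of a possible workaround, assuming the small files share a schema (as they do in this benchmark): concatenate them once into a single feather file and read that instead. The combine_small_files name and the combined_df.feather path are illustrative, not part of the original gist.

import pandas as pd
from pathlib import Path


def combine_small_files(root=Path("tmp"), out_name="combined_df.feather"):
    # Read every small file once and stack them; ignore_index gives the
    # combined frame a fresh RangeIndex, which feather requires.
    frames = [pd.read_feather(f) for f in sorted(root.glob("small_*.feather"))]
    combined = pd.concat(frames, ignore_index=True)
    combined.to_feather(root / out_name)

After this one-time merge, pd.read_feather("tmp/combined_df.feather") should hit the single-file fast path measured above.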
@jsbhat commented Oct 16, 2019

Just FYI, I get 8 seconds for large and 195.2 seconds for small.

@DeastinY (Author)

Thanks for running it! Did you change the timeit number? I messed that up in the original version ^^
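
(A side note on what these figures mean: timeit.timeit returns the total wall-clock time for all number executions, not a per-run average, so the values above can be divided by number=10 to get the time of a single run. A quick illustration:

import timeit

total = timeit.timeit("sum(range(1000))", number=10)
print(total)       # total seconds for 10 executions
print(total / 10)  # seconds per execution
)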

@jsbhat commented Oct 25, 2019

Nope, I ran it as-is, i.e. number=10. I only changed the boolean values.
