Skip to content

Instantly share code, notes, and snippets.

@vietvudanh
Created July 22, 2020 03:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vietvudanh/4d2881db833887b68304273453beb8b3 to your computer and use it in GitHub Desktop.
Save vietvudanh/4d2881db833887b68304273453beb8b3 to your computer and use it in GitHub Desktop.
Convert csv to hdf5
# -*- coding: utf-8 -*-
import h5py
import sys
import pandas as pd
import datetime
def read_header_columns(csv_path):
    """Return the column labels of ``csv_path`` as pandas parses them.

    Only the header is parsed (``nrows=0``), so this is cheap even for very
    large files. Using the pandas parser instead of a manual
    ``readline().split(",")`` keeps the labels identical to the ones each
    chunk's DataFrame will carry: no trailing newline on the last label and
    quoted headers containing commas stay in one piece.

    Args:
        csv_path: Path of the CSV file to inspect.

    Returns:
        A list with the column labels, in file order.
    """
    return pd.read_csv(csv_path, nrows=0).columns.tolist()


def csv_to_hdf5(csv_path, hdf_key='hdf_key', chunksize=500000, min_itemsize=None):
    """Convert a CSV file to ``<csv_path>.hdf5``, reading it chunk by chunk.

    Every column is stored as a data column. Indexes are not built while
    appending (``index=False``); they are created once at the end.

    Args:
        csv_path: Path of the CSV file to convert.
        hdf_key: Key under which the table is stored in the HDF5 file.
        chunksize: Number of CSV rows read and appended per iteration.
        min_itemsize: Optional mapping ``{column: minimum string width}``.
            Some errors might appear when a column holds a value longer than
            the longest one seen in the first chunk; provide the width here,
            e.g. ``{'col_0': 50}``.
    """
    # list of columns (labels) that should be indexed
    columns = read_header_columns(csv_path)
    print(f"file name:: {csv_path}")
    print(f"headers:: {columns}")

    item_size = dict(min_itemsize or {})

    # The context manager closes the store even if appending or indexing
    # raises, so the HDF5 file handle is never leaked.
    # NOTE(review): HDFStore is opened with its default mode, so running the
    # conversion twice adds to an existing output file - confirm intended.
    with pd.HDFStore(f"{csv_path}.hdf5") as store:
        for chunk in pd.read_csv(csv_path, chunksize=chunksize):
            # don't index data columns in each iteration - we'll do it later ...
            store.append(hdf_key, chunk, data_columns=columns, index=False,
                         min_itemsize=item_size)
            print(f"Done chunk:: {datetime.datetime.now()}")

        # index data columns in HDFStore; skipped when the CSV had no data
        # rows, because then nothing was ever written under hdf_key.
        if hdf_key in store:
            store.create_table_index(hdf_key, columns=columns, optlevel=9, kind='full')


def main(argv=None):
    """Command-line entry point: ``script.py <file.csv>``.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.

    Returns:
        Process exit code: 1 when no file was provided, 0 on success.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("No provided file")
        return 1
    csv_to_hdf5(argv[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment