# rikturr/csv_to_h5.py

## csv_to_h5.py
import pandas as pd
from datetime import datetime

# Split a large CSV into two HDF5 tables by the value of its class column.
# The CSV is streamed in chunks; rows whose class is 1 are appended under
# POS_KEY and rows whose class is 0 under NEG_KEY. Rows with any other class
# value are not written to either table.

CHUNK_SIZE = 1000000     # number of CSV rows read per chunk
POS_KEY = 'positive'     # HDF5 key for rows with class == 1
NEG_KEY = 'negative'     # HDF5 key for rows with class == 0
CLASS_COLUMN = 'class'   # CSV column compared against 1 / 0
FILE = '<FILEPATH>'      # input CSV path (placeholder)
OUTFILE = '<OUTPATH>'    # output HDF5 path (placeholder)

# Use the store as a context manager so the HDF5 file is closed even when
# reading the CSV or appending a chunk raises part-way through.
with pd.HDFStore(OUTFILE, complib='blosc', complevel=9) as store:
    chunks = pd.read_csv(FILE, chunksize=CHUNK_SIZE)
    for chunk_number, chunk in enumerate(chunks):
        # Progress line: chunk counter and wall-clock time.
        print('{}  {}'.format(chunk_number, datetime.now()))

        labels = chunk[CLASS_COLUMN]
        # index=False is passed on every append; the table indexes are
        # created explicitly once, after the loop.
        store.append(POS_KEY, chunk[labels == 1], index=False)
        store.append(NEG_KEY, chunk[labels == 0], index=False)

    # Build the table indexes in one pass now that all rows are written.
    store.create_table_index(POS_KEY, columns=True, optlevel=9, kind='full')
    store.create_table_index(NEG_KEY, columns=True, optlevel=9, kind='full')
	import pandas as pd
	from datetime import datetime

	# Split a large CSV into two HDF5 tables by the value of its class column.
	# The CSV is streamed in chunks; rows whose class is 1 are appended under
	# POS_KEY and rows whose class is 0 under NEG_KEY. Rows with any other
	# class value are not written to either table.

	CHUNK_SIZE = 1000000     # number of CSV rows read per chunk
	POS_KEY = 'positive'     # HDF5 key for rows with class == 1
	NEG_KEY = 'negative'     # HDF5 key for rows with class == 0
	CLASS_COLUMN = 'class'   # CSV column compared against 1 / 0
	FILE = '<FILEPATH>'      # input CSV path (placeholder)
	OUTFILE = '<OUTPATH>'    # output HDF5 path (placeholder)

	# Use the store as a context manager so the HDF5 file is closed even when
	# reading the CSV or appending a chunk raises part-way through.
	with pd.HDFStore(OUTFILE, complib='blosc', complevel=9) as store:
		chunks = pd.read_csv(FILE, chunksize=CHUNK_SIZE)
		for chunk_number, chunk in enumerate(chunks):
			# Progress line: chunk counter and wall-clock time.
			print('{} {}'.format(chunk_number, datetime.now()))

			labels = chunk[CLASS_COLUMN]
			# index=False is passed on every append; the table indexes are
			# created explicitly once, after the loop.
			store.append(POS_KEY, chunk[labels == 1], index=False)
			store.append(NEG_KEY, chunk[labels == 0], index=False)

		# Build the table indexes in one pass now that all rows are written.
		store.create_table_index(POS_KEY, columns=True, optlevel=9, kind='full')
		store.create_table_index(NEG_KEY, columns=True, optlevel=9, kind='full')