Skip to content

Instantly share code, notes, and snippets.

@wabu
Last active August 29, 2015 14:01
Show Gist options
  • Save wabu/79ffa83dd368b3a15438 to your computer and use it in GitHub Desktop.
Save wabu/79ffa83dd368b3a15438 to your computer and use it in GitHub Desktop.
Test for pandas HDF5 corruption (GH6505)
import numpy as np
import pandas as pd
pd.show_versions()

# Stress test: repeatedly round-trip random unicode strings through an HDF5
# table store ('test.h5', key 'bar') and report every iteration in which the
# values read back through pandas differ from the values that were written.
# NOTE(review): the pandas calls below (get_store, pd.lib, from_items) are the
# ones the script was written against; they are kept so the reproduction still
# targets that pandas generation -- confirm before porting to a newer pandas.
for i in range(100000):
    # '\r' makes the progress counter overwrite itself on a single line.
    print(i, end='\r')

    # 100 strings, each built from 20 random bytes decoded as UTF-8 with the
    # undecodable bytes dropped, plus their UTF-8 re-encoding for comparison.
    strings = [np.random.bytes(20).decode('utf8', errors='ignore') for _ in range(100)]
    encoded = [s.encode('utf8') for s in strings]

    strings = np.array(strings).astype(object)
    encoded = np.array(encoded)

    # put it in the store
    with pd.get_store('test.h5') as st:
        st.put('bar', pd.DataFrame(strings), format='t', append=False,
               data_columns=[], min_itemsize=20)

    # try pytables read: fetch the stored bytes straight from the table node,
    # bypassing pandas' own select path. `[:]` copies the column into memory,
    # so the store does not need to stay open for the checks below.
    with pd.get_store('test.h5') as st:
        raw = st.root.bar.table.cols.values_block_0[:][:, 0]
    assert (raw == encoded).all().all()

    # First decode attempt: cast the byte strings to a fixed-width unicode
    # dtype sized by the longest entry. A UnicodeDecodeError here is tolerated;
    # a mismatch (AssertionError) is not and aborts the run.
    try:
        max_len = pd.lib.max_len_string_array(pd.core.common._ensure_object(raw.ravel()))
        decoded = raw.astype('U%d' % max_len).astype(object)
        assert (decoded == strings).all().all()
    except UnicodeDecodeError:
        pass
    # The raw bytes must still match after the cast attempt above.
    assert (raw == encoded).all().all()

    # Second decode attempt: element-wise UTF-8 decode. The builtin `object`
    # is used as the output type; `np.object` was only an alias for it and no
    # longer exists in current NumPy.
    decoded = np.vectorize(lambda x: x.decode('utf8'), otypes=[object])(raw)
    assert (decoded == strings).all().all()

    # Finally read the column back through pandas itself.
    with pd.get_store('test.h5') as st:
        stored = st.select('bar')[0].values

    # Explicit comparison instead of assert/except AssertionError, so the
    # corruption report is still produced when asserts are stripped (-O).
    diff = stored != strings
    if diff.any():
        print("ERROR: data read corrupted (%d/%d)" % (diff.sum(), len(strings)))
        # Side-by-side view of the original vs. read-back values that differ.
        view = pd.DataFrame.from_items([
            ('orig', strings[diff]),
            ('h5ed', stored[diff])])
        print(view.applymap(repr).head())
@wabu
Copy link
Author

wabu commented May 27, 2014

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment