wabu/utf8-test.py

## utf8-test.py
import numpy as np
import pandas as pd

# Reproduction script: round-trip random UTF-8 strings through an HDF5 table
# store ('test.h5') and report every value that reads back differently from
# what was written.
pd.show_versions()

for i in range(100000):
    # Progress counter, rewritten in place on the same terminal line.
    print(i, end='\r')

    # 100 random strings; undecodable byte sequences are dropped, so each
    # string re-encodes to at most 20 bytes (the min_itemsize used below).
    strings = [np.random.bytes(20).decode('utf8', errors='ignore') for _ in range(100)]
    encoded = [s.encode('utf8') for s in strings]

    strings = np.array(strings).astype(object)
    encoded = np.array(encoded)

    # put it in the store
    # (pd.get_store was removed in pandas 1.0; HDFStore is itself a context
    # manager and closes the file on exit.)
    with pd.HDFStore('test.h5') as st:
        st.put('bar', pd.DataFrame(strings), format='t', append=False, data_columns=[], min_itemsize=20)

    # try pytables read
    with pd.HDFStore('test.h5') as st:
        raw = st.root.bar.table.cols.values_block_0[:][:, 0]
        # The bytes on disk must be exactly the UTF-8 encoding that was written.
        assert (raw == encoded).all()
        try:
            # Decode by casting the bytes array to a fixed-width unicode dtype
            # sized to the longest element (public replacement for the private
            # pd.lib.max_len_string_array helper).
            max_len = max((len(item) for item in raw.ravel()), default=0)
            decoded = raw.astype('U%d' % max_len).astype(object)
            assert (decoded == strings).all()
        except UnicodeDecodeError:
            # Deliberate best-effort: when the cast cannot decode the payload
            # (NOTE(review): presumably any non-ASCII bytes -- confirm), this
            # check is skipped and the explicit UTF-8 decode below still runs.
            pass

        # The cast above must not have modified the raw bytes.
        assert (raw == encoded).all()
        # Element-wise UTF-8 decode; `object` replaces the np.object alias
        # that was removed in NumPy 1.24.
        decoded = np.vectorize(lambda x: x.decode('utf8'), otypes=[object])(raw)
        assert (decoded == strings).all()

    # Read back through the pandas API and report mismatches instead of aborting.
    with pd.HDFStore('test.h5') as st:
        stored = st.select('bar')[0].values
        # Plain comparison rather than assert/except AssertionError, so the
        # check is not stripped when running under `python -O`.
        diff = stored != strings
        if diff.any():
            print("ERROR: data read corrupted (%d/%d)" % (diff.sum(), len(strings)))
            # DataFrame.from_items was removed in pandas 1.0; `columns` keeps
            # the original column order on every pandas version.
            view = pd.DataFrame(
                {'orig': strings[diff], 'h5ed': stored[diff]},
                columns=['orig', 'h5ed'])
            # Column-wise Series.map(repr) avoids the deprecated applymap.
            print(view.apply(lambda col: col.map(repr)).head())
	import numpy as np
	import pandas as pd

	# Reproduction script: round-trip random UTF-8 strings through an HDF5 table
	# store ('test.h5') and report every value that reads back differently from
	# what was written.
	pd.show_versions()

	for i in range(100000):
	    # Progress counter, rewritten in place on the same terminal line.
	    print(i, end='\r')

	    # 100 random strings; undecodable byte sequences are dropped, so each
	    # string re-encodes to at most 20 bytes (the min_itemsize used below).
	    strings = [np.random.bytes(20).decode('utf8', errors='ignore') for _ in range(100)]
	    encoded = [s.encode('utf8') for s in strings]

	    strings = np.array(strings).astype(object)
	    encoded = np.array(encoded)

	    # put it in the store
	    # (pd.get_store was removed in pandas 1.0; HDFStore is itself a context
	    # manager and closes the file on exit.)
	    with pd.HDFStore('test.h5') as st:
	        st.put('bar', pd.DataFrame(strings), format='t', append=False, data_columns=[], min_itemsize=20)

	    # try pytables read
	    with pd.HDFStore('test.h5') as st:
	        raw = st.root.bar.table.cols.values_block_0[:][:, 0]
	        # The bytes on disk must be exactly the UTF-8 encoding that was written.
	        assert (raw == encoded).all()
	        try:
	            # Decode by casting the bytes array to a fixed-width unicode dtype
	            # sized to the longest element (public replacement for the private
	            # pd.lib.max_len_string_array helper).
	            max_len = max((len(item) for item in raw.ravel()), default=0)
	            decoded = raw.astype('U%d' % max_len).astype(object)
	            assert (decoded == strings).all()
	        except UnicodeDecodeError:
	            # Deliberate best-effort: when the cast cannot decode the payload
	            # (NOTE(review): presumably any non-ASCII bytes -- confirm), this
	            # check is skipped and the explicit UTF-8 decode below still runs.
	            pass

	        # The cast above must not have modified the raw bytes.
	        assert (raw == encoded).all()
	        # Element-wise UTF-8 decode; `object` replaces the np.object alias
	        # that was removed in NumPy 1.24.
	        decoded = np.vectorize(lambda x: x.decode('utf8'), otypes=[object])(raw)
	        assert (decoded == strings).all()

	    # Read back through the pandas API and report mismatches instead of aborting.
	    with pd.HDFStore('test.h5') as st:
	        stored = st.select('bar')[0].values
	        # Plain comparison rather than assert/except AssertionError, so the
	        # check is not stripped when running under `python -O`.
	        diff = stored != strings
	        if diff.any():
	            print("ERROR: data read corrupted (%d/%d)" % (diff.sum(), len(strings)))
	            # DataFrame.from_items was removed in pandas 1.0; `columns` keeps
	            # the original column order on every pandas version.
	            view = pd.DataFrame(
	                {'orig': strings[diff], 'h5ed': stored[diff]},
	                columns=['orig', 'h5ed'])
	            # Column-wise Series.map(repr) avoids the deprecated applymap.
	            print(view.apply(lambda col: col.map(repr)).head())