Skip to content

Instantly share code, notes, and snippets.

@zmjjmz
Last active April 4, 2019 19:52
Show Gist options
  • Save zmjjmz/1bf738966d2df147a4fae7268ee3d812 to your computer and use it in GitHub Desktop.
Save zmjjmz/1bf738966d2df147a4fae7268ee3d812 to your computer and use it in GitHub Desktop.
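PyArrow repro: reading a Parquet file with a nested (list) column fails with "Nested data conversions not implemented for chunked array outputs"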
import os
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import numpy as np
from tqdm import tqdm
# Directory the generated Parquet files are written to.
TEST_DIR = 'jaggedbug_testpath'
# List lengths are drawn with np.random.randint(MIN_SIZE - 1, MAX_SIZE) in
# setup_test_df; MAX_SIZE is also embedded in the output file name.
MAX_SIZE = 15
MIN_SIZE = 1
# Half-open [low, high) range of the random integers generated for each list
# (matches the bounds used by setup_test_df).
RAND_RANGE = [2**60, 2**63]
# Type the generated integers are cast to before being stored in the frame.
POST_TYPE = str
# (start, stop, step) arguments unpacked into range() for the row counts to
# test. Built as a list: on Python 3 a bare map() is a one-shot iterator that
# would be empty after its first use.
N_ROWS_RANGE = [int(bound) for bound in (1e5, 2e5, 1e4)]
def setup_test_df(nrows):
    """Build a one-column DataFrame of jagged lists of random integers.

    Column ``'a'`` holds one Python list per row. Each list's length is drawn
    with ``np.random.randint(MIN_SIZE - 1, MAX_SIZE)`` and its elements are
    random integers from the half-open range given by ``RAND_RANGE``, cast to
    ``POST_TYPE``. Summary statistics of the list lengths are printed before
    the frame is returned.

    Args:
        nrows: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with ``nrows`` rows and the single column
        ``'a'``.
    """
    low, high = RAND_RANGE
    rows = []
    progress = tqdm(
        range(nrows),
        desc='Making a DF of {0} rows'.format(nrows),
        leave=False,
    )
    for _ in progress:
        # NOTE(review): the lower bound is MIN_SIZE - 1, so zero-length lists
        # can be generated -- confirm that is intended.
        length = np.random.randint(MIN_SIZE - 1, MAX_SIZE)
        # Request 64-bit integers explicitly: the bounds do not fit in a
        # 32-bit integer, which is the default randint dtype on some
        # platform/numpy combinations.
        values = np.random.randint(low, high, size=length, dtype=np.int64)
        rows.append(list(values.astype(POST_TYPE)))
    test_df = pd.DataFrame.from_dict({'a': rows})
    print(test_df.a.apply(len).describe())
    return test_df
if __name__ == "__main__":
    # For each row count: build a jagged DataFrame, write it to Parquet and
    # read it straight back. The read-back is the call that raised
    # ArrowNotImplementedError in the traceback recorded with this script.
    path_str = '{0}_rows_maxsize_{1}_{2}.parquet'
    # Make sure the output directory exists before any file is written into
    # it (explicit check rather than exist_ok, to stay Python 2 compatible).
    if not os.path.isdir(TEST_DIR):
        os.makedirs(TEST_DIR)
    for n_rows in tqdm(range(*N_ROWS_RANGE), desc="Running test"):
        df = setup_test_df(n_rows)
        path = path_str.format(
            n_rows,
            MAX_SIZE,
            # The file name only distinguishes 'str' from everything else.
            'str' if POST_TYPE == str else 'int',
        )
        fullpath = os.path.join(TEST_DIR, path)
        pa_table = pa.Table.from_pandas(df)
        pq.write_table(pa_table, fullpath)
        read_table = pq.read_table(fullpath)
Traceback (most recent call last):
File "parquet_jagged_test.py", line 44, in <module>
read_table = pq.read_table(fullpath)
File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/parquet.py", line 1107, in read_table
use_pandas_metadata=use_pandas_metadata)
File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/filesystem.py", line 181, in read_parquet
use_pandas_metadata=use_pandas_metadata)
File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/parquet.py", line 973, in read
use_pandas_metadata=use_pandas_metadata)
File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/parquet.py", line 529, in read
table = reader.read(**options)
File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/parquet.py", line 212, in read
use_threads=use_threads)
File "pyarrow/_parquet.pyx", line 722, in pyarrow._parquet.ParquetReader.read_all
File "pyarrow/error.pxi", line 89, in pyarrow.lib.check_status
pyarrow.lib.ArrowNotImplementedError: Nested data conversions not implemented for chunked array outputs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment