Last active
April 4, 2019 19:52
-
-
Save zmjjmz/1bf738966d2df147a4fae7268ee3d812 to your computer and use it in GitHub Desktop.
PyArrow chunked array output thingie
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
import pyarrow.parquet as pq | |
import pyarrow as pa | |
import numpy as np | |
from tqdm import tqdm | |
# Directory that the generated parquet files are written under.
TEST_DIR = 'jaggedbug_testpath'

# Bounds passed to np.random.randint when picking each row's list length.
MAX_SIZE = 15
MIN_SIZE = 1

# Intended [low, high) range of the random integer values in each list.
RAND_RANGE = [2**60, 2**63]

# Type every generated value is cast to before being stored; also selects
# the 'str'/'int' tag embedded in the output file name.
POST_TYPE = str

# (start, stop, step) unpacked into range() to choose the row counts tested.
# Materialised as a list: a bare map() is a one-shot iterator on Python 3
# and would be empty after its first use.
N_ROWS_RANGE = [int(bound) for bound in (1e5, 2e5, 1e4)]
def setup_test_df(nrows):
    """Build a one-column DataFrame of variable-length random lists.

    Each of the ``nrows`` rows in column ``'a'`` holds a list of random
    integers drawn from ``RAND_RANGE`` and cast to ``POST_TYPE``.  The list
    length is itself random, drawn with
    ``np.random.randint(MIN_SIZE - 1, MAX_SIZE)`` (upper bound exclusive).

    Summary statistics of the per-row list lengths are printed before
    returning.

    Args:
        nrows: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with a single column ``'a'``.
    """
    # Use the module-level bounds instead of repeating the literals here.
    low, high = RAND_RANGE[0], RAND_RANGE[1]

    rows = []
    progress = tqdm(
        range(nrows),
        desc='Making a DF of {0} rows'.format(nrows),
        leave=False,
    )
    for _ in progress:
        length = np.random.randint(MIN_SIZE - 1, MAX_SIZE)
        # Explicit int64 so the 2**63 (exclusive) upper bound is accepted
        # even where numpy's default integer type is 32-bit.
        values = np.random.randint(low, high, size=length, dtype=np.int64)
        rows.append(list(values.astype(POST_TYPE)))

    test_df = pd.DataFrame.from_dict({'a': rows})
    print(test_df.a.apply(len).describe())
    return test_df
if __name__ == "__main__":
    # For each row count in range(*N_ROWS_RANGE): build a DataFrame of
    # jagged lists, write it to a parquet file under TEST_DIR, and read the
    # file back.
    path_str = '{0}_rows_maxsize_{1}_{2}.parquet'

    # Nothing else creates the output directory, so make sure it exists
    # before the first write (guard form keeps Python 2 compatibility).
    if not os.path.isdir(TEST_DIR):
        os.makedirs(TEST_DIR)

    for n_rows in tqdm(range(*N_ROWS_RANGE), desc="Running test"):
        df = setup_test_df(n_rows)
        path = path_str.format(
            n_rows,
            MAX_SIZE,
            # bleh whatever
            'str' if POST_TYPE == str else 'int',
        )
        fullpath = os.path.join(TEST_DIR, path)
        pa_table = pa.Table.from_pandas(df)
        pq.write_table(pa_table, fullpath)
        # The result is intentionally unused: this read-back is the call
        # under test.
        read_table = pq.read_table(fullpath)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Traceback (most recent call last):
  File "parquet_jagged_test.py", line 44, in <module>
    read_table = pq.read_table(fullpath)
  File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/parquet.py", line 1107, in read_table
    use_pandas_metadata=use_pandas_metadata)
  File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/filesystem.py", line 181, in read_parquet
    use_pandas_metadata=use_pandas_metadata)
  File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/parquet.py", line 973, in read
    use_pandas_metadata=use_pandas_metadata)
  File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/parquet.py", line 529, in read
    table = reader.read(**options)
  File "/home/u1/zach/proj/dataplayground2/local/lib/python2.7/site-packages/pyarrow/parquet.py", line 212, in read
    use_threads=use_threads)
  File "pyarrow/_parquet.pyx", line 722, in pyarrow._parquet.ParquetReader.read_all
  File "pyarrow/error.pxi", line 89, in pyarrow.lib.check_status
pyarrow.lib.ArrowNotImplementedError: Nested data conversions not implemented for chunked array outputs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment