Created
October 30, 2019 14:40
-
-
Save jorisvandenbossche/d36bceb82fd2dda38ee419ba51dff5ed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Profile overhead of checking the column metadata and types for extension dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"import string\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import pyarrow as pa" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"N_columns = 100\n", | |
"N_rows = 1000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame(\n", | |
" np.random.randn(N_rows, N_columns),\n", | |
" columns=[''.join([random.choice(string.ascii_letters) for _ in range(5)]) for _ in range(N_columns)])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"table = pa.table(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3.18 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit table.to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def to_pandas(table, check_extension=True):\n", | |
" # Table.to_pandas equivalent with control whether\n", | |
" # to check for extension types or not\n", | |
" from pyarrow.pandas_compat import table_to_blockmanager\n", | |
"\n", | |
" options = dict(\n", | |
" pool=None,\n", | |
" strings_to_categorical=False,\n", | |
" zero_copy_only=False,\n", | |
" integer_object_nulls=False,\n", | |
" date_as_object=True,\n", | |
" use_threads=True,\n", | |
" deduplicate_objects=True)\n", | |
"\n", | |
" if check_extension:\n", | |
" extension_columns = None\n", | |
" else:\n", | |
" extension_columns = []\n", | |
"\n", | |
" mgr = table_to_blockmanager(\n", | |
" options, table, extension_columns=extension_columns)\n", | |
" return pd.DataFrame(mgr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"With `check_extension=True`, this should take about as long as `table.to_pandas()` above:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3.45 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit to_pandas(table, check_extension=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"And now skipping the extension type checks:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.73 ms ± 419 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit to_pandas(table, check_extension=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Long table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"N_columns = 100\n", | |
"N_rows = 100_000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame(\n", | |
" np.random.randn(N_rows, N_columns),\n", | |
" columns=[''.join([random.choice(string.ascii_letters) for _ in range(5)]) for _ in range(N_columns)])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"table = pa.table(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"28.7 ms ± 1.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit table.to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"29.2 ms ± 740 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit to_pandas(table, check_extension=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"26.4 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit to_pandas(table, check_extension=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Wide table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"N_columns = 1000\n", | |
"N_rows = 100" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame(\n", | |
" np.random.randn(N_rows, N_columns),\n", | |
" columns=[''.join([random.choice(string.ascii_letters) for _ in range(5)]) for _ in range(N_columns)])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"table = pa.table(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"22.4 ms ± 419 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit table.to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"22.6 ms ± 659 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit to_pandas(table, check_extension=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10.6 ms ± 415 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit to_pandas(table, check_extension=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Profile:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" " | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
" 83239 function calls (83218 primitive calls) in 0.061 seconds\n", | |
"\n", | |
" Ordered by: internal time\n", | |
"\n", | |
" ncalls tottime percall cumtime percall filename:lineno(function)\n", | |
" 1026 0.009 0.000 0.029 0.000 dtypes.py:82(find)\n", | |
" 10033 0.007 0.000 0.012 0.000 base.py:180(construct_from_string)\n", | |
" 1 0.005 0.005 0.005 0.005 {pyarrow.lib.table_to_blocks}\n", | |
" 12042 0.004 0.000 0.004 0.000 {method 'format' of 'str' objects}\n", | |
" 1013 0.003 0.000 0.033 0.000 common.py:2007(pandas_dtype)\n", | |
" 22394 0.003 0.000 0.004 0.000 {built-in method builtins.isinstance}\n", | |
" 1 0.003 0.003 0.003 0.003 decoder.py:343(raw_decode)\n", | |
" 1 0.002 0.002 0.003 0.003 pandas_compat.py:1098(<listcomp>)\n", | |
" 1 0.002 0.002 0.036 0.036 pandas_compat.py:756(_get_extension_dtypes)\n", | |
" 1000 0.002 0.000 0.003 0.000 {method 'get_field_index' of 'pyarrow.lib.Schema' objects}\n", | |
" 1004 0.001 0.000 0.003 0.000 dtypes.py:873(construct_from_string)\n", | |
" 1005 0.001 0.000 0.002 0.000 dtypes.py:1047(construct_from_string)\n", | |
" 1 0.001 0.001 0.005 0.005 pandas_compat.py:1104(_add_any_metadata)\n", | |
" 1007 0.001 0.000 0.002 0.000 dtypes.py:721(construct_from_string)\n", | |
" 1 0.001 0.001 0.001 0.001 pandas_compat.py:870(<dictcomp>)\n", | |
" 1000 0.001 0.000 0.001 0.000 compat.py:112(tobytes)\n", | |
" 4005 0.001 0.000 0.001 0.000 {method 'get' of 'dict' objects}\n", | |
" 1000 0.001 0.000 0.034 0.000 {method 'pandas_dtype' of 'pyarrow.lib._PandasAPIShim' objects}\n", | |
" 1000 0.001 0.000 0.001 0.000 base.py:4177(__getitem__)\n", | |
" 1003 0.001 0.000 0.002 0.000 string_.py:75(construct_from_string)\n", | |
" 1003 0.001 0.000 0.001 0.000 dtype.py:177(construct_from_string)\n", | |
" 3010 0.001 0.000 0.001 0.000 {method 'startswith' of 'str' objects}\n", | |
" 1 0.001 0.001 0.009 0.009 pandas_compat.py:825(_deserialize_column_index)\n", | |
" 1005 0.001 0.000 0.001 0.000 {method 'match' of 're.Pattern' objects}\n", | |
" 1189 0.000 0.000 0.001 0.000 generic.py:7(_check)\n", | |
" 1 0.000 0.000 0.006 0.006 pandas_compat.py:1075(_table_to_blocks)\n", | |
" 1 0.000 0.000 0.001 0.001 pandas_compat.py:828(<dictcomp>)\n", | |
" 1001 0.000 0.000 0.001 0.000 inference.py:382(is_hashable)\n", | |
" 1000 0.000 0.000 0.000 0.000 {method 'encode' of 'str' objects}\n", | |
" 2 0.000 0.000 0.000 0.000 {method 'argsort' of 'numpy.ndarray' objects}\n", | |
" 2000 0.000 0.000 0.001 0.000 compat.py:123(frombytes)\n", | |
" 1 0.000 0.000 0.060 0.060 pandas_compat.py:721(table_to_blockmanager)\n", | |
" 2000 0.000 0.000 0.000 0.000 {method 'decode' of 'bytes' objects}\n", | |
" 1003 0.000 0.000 0.000 0.000 {method 'search' of 're.Pattern' objects}\n", | |
" 1 0.000 0.000 0.060 0.060 <ipython-input-6-9d225b52ec17>:1(to_pandas)\n", | |
" 1001 0.000 0.000 0.000 0.000 {built-in method builtins.hash}\n", | |
" 3 0.000 0.000 0.000 0.000 base.py:1732(is_unique)\n", | |
" 1000 0.000 0.000 0.000 0.000 common.py:150(cast_scalar_indexer)\n", | |
" 1001 0.000 0.000 0.000 0.000 pandas_compat.py:820(<genexpr>)\n", | |
"1282/1277 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", | |
" 1000 0.000 0.000 0.000 0.000 pandas_compat.py:272(_column_name_to_strings)\n", | |
" 1003 0.000 0.000 0.000 0.000 {method 'lower' of 'str' objects}\n", | |
" 4 0.000 0.000 0.000 0.000 {pandas._libs.lib.infer_dtype}\n", | |
" 1000 0.000 0.000 0.000 0.000 compat.py:118(u_utf8)\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:826(<listcomp>)\n", | |
" 4/3 0.000 0.000 0.001 0.000 base.py:261(__new__)\n", | |
" 1 0.000 0.000 0.000 0.000 {method 'factorize' of 'pandas._libs.hashtable.StringHashTable' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:833(<listcomp>)\n", | |
" 1001 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_float}\n", | |
" 1007 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_scalar}\n", | |
" 1 0.000 0.000 0.060 0.060 <string>:1(<module>)\n", | |
" 3 0.000 0.000 0.000 0.000 {method 'astype' of 'numpy.ndarray' objects}\n", | |
" 12 0.000 0.000 0.000 0.000 {built-in method numpy.array}\n", | |
" 3 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", | |
" 2 0.000 0.000 0.000 0.000 {built-in method pandas._libs.missing.isnaobj}\n", | |
" 7 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", | |
" 1000 0.000 0.000 0.000 0.000 pandas_compat.py:841(<lambda>)\n", | |
" 1 0.000 0.000 0.061 0.061 {built-in method builtins.exec}\n", | |
" 34 0.000 0.000 0.000 0.000 base.py:231(is_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 multi.py:1356(_engine)\n", | |
" 1 0.000 0.000 0.000 0.000 cast.py:1351(construct_1d_object_array_from_listlike)\n", | |
" 1 0.000 0.000 0.000 0.000 {pandas._libs.lib.to_object_array_tuples}\n", | |
" 75 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", | |
" 87/72 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", | |
" 1 0.000 0.000 0.001 0.001 pandas_compat.py:864(_reconstruct_index)\n", | |
" 1 0.000 0.000 0.000 0.000 multi.py:317(_verify_integrity)\n", | |
" 1 0.000 0.000 0.000 0.000 managers.py:216(_rebuild_blknos_and_blklocs)\n", | |
" 15 0.000 0.000 0.000 0.000 base.py:509(_simple_new)\n", | |
" 29 0.000 0.000 0.000 0.000 common.py:1873(_is_dtype_type)\n", | |
" 6 0.000 0.000 0.000 0.000 {built-in method numpy.empty}\n", | |
" 8 0.000 0.000 0.000 0.000 _dtype.py:319(_name_get)\n", | |
" 28 0.000 0.000 0.000 0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)\n", | |
" 12 0.000 0.000 0.000 0.000 base.py:564(_shallow_copy)\n", | |
" 25 0.000 0.000 0.000 0.000 common.py:1730(is_extension_array_dtype)\n", | |
" 1 0.000 0.000 0.004 0.004 pandas_compat.py:1088(_flatten_single_level_multiindex)\n", | |
" 1 0.000 0.000 0.000 0.000 range.py:148(_simple_new)\n", | |
" 1 0.000 0.000 0.000 0.000 range.py:83(__new__)\n", | |
" 1 0.000 0.000 0.003 0.003 decoder.py:332(decode)\n", | |
" 2 0.000 0.000 0.000 0.000 multi.py:247(__new__)\n", | |
" 1 0.000 0.000 0.002 0.002 multi.py:455(from_tuples)\n", | |
" 1 0.000 0.000 0.000 0.000 base.py:750(astype)\n", | |
" 15 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_list_like}\n", | |
" 2 0.000 0.000 0.000 0.000 missing.py:220(_isna_ndarraylike)\n", | |
" 99 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", | |
" 1 0.000 0.000 0.000 0.000 common.py:182(all_none)\n", | |
" 21 0.000 0.000 0.000 0.000 {built-in method _abc._abc_instancecheck}\n", | |
" 1 0.000 0.000 0.001 0.001 categorical.py:332(__init__)\n", | |
" 1 0.000 0.000 0.001 0.001 algorithms.py:569(factorize)\n", | |
" 16 0.000 0.000 0.000 0.000 <frozen importlib._bootstrap>:416(parent)\n", | |
" 10 0.000 0.000 0.000 0.000 multi.py:640(levels)\n", | |
" 2 0.000 0.000 0.000 0.000 multi.py:701(_set_levels)\n", | |
" 1 0.000 0.000 0.001 0.001 pandas_compat.py:1018(_reconstruct_columns_from_metadata)\n", | |
" 1 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 algorithms.py:1551(take_nd)\n", | |
" 5 0.000 0.000 0.000 0.000 common.py:497(is_datetime64_dtype)\n", | |
" 2 0.000 0.000 0.000 0.000 dtypes.py:497(validate_categories)\n", | |
" 22 0.000 0.000 0.000 0.000 {method 'view' of 'numpy.ndarray' objects}\n", | |
" 8 0.000 0.000 0.000 0.000 numerictypes.py:365(issubdtype)\n", | |
" 1 0.000 0.000 0.000 0.000 {pandas._libs.algos.take_1d_int64_int64}\n", | |
" 3 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 managers.py:126(__init__)\n", | |
" 4 0.000 0.000 0.000 0.000 common.py:228(asarray_tuplesafe)\n", | |
" 7 0.000 0.000 0.000 0.000 dtypes.py:919(is_dtype)\n", | |
" 2 0.000 0.000 0.000 0.000 _dtype.py:46(__str__)\n", | |
" 1 0.000 0.000 0.002 0.002 multi.py:389(from_arrays)\n", | |
" 10 0.000 0.000 0.000 0.000 multi.py:643(<listcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 frame.py:403(__init__)\n", | |
" 3 0.000 0.000 0.000 0.000 frozen.py:73(__getitem__)\n", | |
" 17 0.000 0.000 0.000 0.000 base.py:3816(values)\n", | |
" 12 0.000 0.000 0.000 0.000 base.py:543(_get_attributes_dict)\n", | |
" 1 0.000 0.000 0.000 0.000 cast.py:337(maybe_promote)\n", | |
" 14 0.000 0.000 0.000 0.000 common.py:667(is_categorical_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:648(_reconstruct_block)\n", | |
" 16 0.000 0.000 0.000 0.000 numerictypes.py:293(issubclass_)\n", | |
" 1 0.000 0.000 0.000 0.000 managers.py:340(_verify_integrity)\n", | |
" 3 0.000 0.000 0.000 0.000 managers.py:167(shape)\n", | |
" 18 0.000 0.000 0.000 0.000 base.py:630(_reset_identity)\n", | |
" 1 0.000 0.000 0.001 0.001 sorting.py:394(safe_sort)\n", | |
" 1 0.000 0.000 0.001 0.001 categorical.py:2681(_factorize_from_iterable)\n", | |
" 9 0.000 0.000 0.000 0.000 common.py:631(is_interval_dtype)\n", | |
" 6 0.000 0.000 0.000 0.000 dtypes.py:1099(is_dtype)\n", | |
" 1 0.000 0.000 0.003 0.003 __init__.py:299(loads)\n", | |
" 1 0.000 0.000 0.000 0.000 blocks.py:2940(get_block_type)\n", | |
" 2 0.000 0.000 0.000 0.000 multi.py:1225(_set_names)\n", | |
" 3 0.000 0.000 0.000 0.000 common.py:181(ensure_python_int)\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:1050(<listcomp>)\n", | |
" 5 0.000 0.000 0.000 0.000 _internal.py:886(npy_ctypes_check)\n", | |
" 1 0.000 0.000 0.000 0.000 multi.py:74(_codes_to_ints)\n", | |
" 2 0.000 0.000 0.000 0.000 multi.py:825(_set_codes)\n", | |
" 3 0.000 0.000 0.000 0.000 dtypes.py:342(_finalize)\n", | |
" 1 0.000 0.000 0.000 0.000 numeric.py:676(require)\n", | |
" 18 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x560e1bc923c0}\n", | |
" 1 0.000 0.000 0.000 0.000 blocks.py:231(mgr_locs)\n", | |
" 2 0.000 0.000 0.000 0.000 base.py:640(_engine)\n", | |
" 22 0.000 0.000 0.000 0.000 common.py:212(<lambda>)\n", | |
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.arange}\n", | |
" 1 0.000 0.000 0.000 0.000 blocks.py:2986(make_block)\n", | |
" 4 0.000 0.000 0.000 0.000 multi.py:716(<genexpr>)\n", | |
" 1 0.000 0.000 0.000 0.000 generic.py:202(__init__)\n", | |
" 1 0.000 0.000 0.001 0.001 categorical.py:2719(_factorize_from_iterables)\n", | |
" 1 0.000 0.000 0.000 0.000 cast.py:989(maybe_infer_to_datetimelike)\n", | |
" 8 0.000 0.000 0.000 0.000 common.py:566(is_timedelta64_dtype)\n", | |
" 4 0.000 0.000 0.000 0.000 common.py:1609(is_bool_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 algorithms.py:440(_factorize_array)\n", | |
" 3 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_int16}\n", | |
" 2 0.000 0.000 0.000 0.000 {method 'fill' of 'numpy.ndarray' objects}\n", | |
" 8 0.000 0.000 0.000 0.000 numeric.py:469(asarray)\n", | |
" 1 0.000 0.000 0.000 0.000 blocks.py:112(__init__)\n", | |
" 4 0.000 0.000 0.000 0.000 multi.py:835(<genexpr>)\n", | |
" 4 0.000 0.000 0.000 0.000 multi.py:1222(_get_names)\n", | |
" 5 0.000 0.000 0.000 0.000 multi.py:1981(nlevels)\n", | |
" 11 0.000 0.000 0.000 0.000 base.py:653(__len__)\n", | |
" 3 0.000 0.000 0.001 0.000 _decorators.py:178(wrapper)\n", | |
" 3 0.000 0.000 0.000 0.000 cast.py:710(coerce_indexer_dtype)\n", | |
" 22 0.000 0.000 0.000 0.000 common.py:210(classes)\n", | |
" 10 0.000 0.000 0.000 0.000 common.py:1837(_get_dtype)\n", | |
" 21 0.000 0.000 0.000 0.000 abc.py:137(__instancecheck__)\n", | |
" 18 0.000 0.000 0.000 0.000 {method 'items' of 'dict' objects}\n", | |
" 4 0.000 0.000 0.000 0.000 range.py:701(__len__)\n", | |
" 1 0.000 0.000 0.000 0.000 base.py:1881(hasnans)\n", | |
" 1 0.000 0.000 0.000 0.000 algorithms.py:1424(_get_take_nd_function)\n", | |
" 5 0.000 0.000 0.000 0.000 common.py:226(is_object_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 common.py:256(is_sparse)\n", | |
" 9 0.000 0.000 0.000 0.000 common.py:528(is_datetime64tz_dtype)\n", | |
" 4 0.000 0.000 0.000 0.000 common.py:1575(is_float_dtype)\n", | |
" 4 0.000 0.000 0.000 0.000 common.py:1812(_is_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 dtypes.py:251(_from_values_or_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 dtypes.py:533(update_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.concatenate}\n", | |
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:69(_wrapreduction)\n", | |
" 1 0.000 0.000 0.000 0.000 blocks.py:335(ftype)\n", | |
" 1 0.000 0.000 0.000 0.000 multi.py:296(_validate_codes)\n", | |
" 12 0.000 0.000 0.000 0.000 base.py:547(<dictcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 cast.py:465(_ensure_dtype_type)\n", | |
" 2 0.000 0.000 0.000 0.000 common.py:186(<genexpr>)\n", | |
" 2 0.000 0.000 0.000 0.000 algorithms.py:59(_ensure_data)\n", | |
" 1 0.000 0.000 0.000 0.000 algorithms.py:166(_reconstruct_data)\n", | |
" 7 0.000 0.000 0.000 0.000 common.py:220(<lambda>)\n", | |
" 7 0.000 0.000 0.000 0.000 common.py:597(is_period_dtype)\n", | |
" 3 0.000 0.000 0.000 0.000 common.py:996(is_signed_integer_dtype)\n", | |
" 3 0.000 0.000 0.000 0.000 common.py:1053(is_unsigned_integer_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 {pandas._libs.lib.infer_datetimelike_array}\n", | |
" 3 0.000 0.000 0.000 0.000 missing.py:130(_isna_new)\n", | |
" 3 0.000 0.000 0.000 0.000 _methods.py:42(_any)\n", | |
" 16 0.000 0.000 0.000 0.000 {method 'rpartition' of 'str' objects}\n", | |
" 3 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", | |
" 12 0.000 0.000 0.000 0.000 {method 'update' of 'dict' objects}\n", | |
" 3 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 managers.py:655(<listcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 managers.py:654(_consolidate_check)\n", | |
" 3 0.000 0.000 0.000 0.000 blocks.py:227(mgr_locs)\n", | |
" 1 0.000 0.000 0.000 0.000 managers.py:132(<listcomp>)\n", | |
" 9 0.000 0.000 0.000 0.000 managers.py:169(<genexpr>)\n", | |
" 6 0.000 0.000 0.000 0.000 multi.py:812(codes)\n", | |
" 3 0.000 0.000 0.000 0.000 inference.py:120(is_iterator)\n", | |
" 2 0.000 0.000 0.000 0.000 common.py:346(is_categorical)\n", | |
" 1 0.000 0.000 0.000 0.000 algorithms.py:252(_get_data_algo)\n", | |
" 3 0.000 0.000 0.000 0.000 dtypes.py:224(__init__)\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:814(_check_data_column_metadata_consistency)\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:1084(<listcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:1112(<listcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 numeric.py:748(<setcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:54(_wrapfunc)\n", | |
" 2 0.000 0.000 0.000 0.000 {method 'end' of 're.Match' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", | |
" 3 0.000 0.000 0.000 0.000 {function FrozenList.__getitem__ at 0x7f480ea079d8}\n", | |
" 2 0.000 0.000 0.000 0.000 managers.py:342(<genexpr>)\n", | |
" 1 0.000 0.000 0.000 0.000 blocks.py:123(_check_ndim)\n", | |
" 1 0.000 0.000 0.000 0.000 generic.py:231(_init_mgr)\n", | |
" 1 0.000 0.000 0.000 0.000 base.py:1860(_isnan)\n", | |
" 4 0.000 0.000 0.000 0.000 base.py:5541(ensure_index)\n", | |
" 2 0.000 0.000 0.000 0.000 base.py:466(<genexpr>)\n", | |
" 1 0.000 0.000 0.000 0.000 categorical.py:699(_get_codes)\n", | |
" 2 0.000 0.000 0.000 0.000 base.py:1000(_ndarray_values)\n", | |
" 8 0.000 0.000 0.000 0.000 common.py:1153(is_datetime64_any_dtype)\n", | |
" 2 0.000 0.000 0.000 0.000 common.py:1543(is_string_like_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 algorithms.py:195(_ensure_arraylike)\n", | |
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:2083(any)\n", | |
" 1 0.000 0.000 0.000 0.000 _methods.py:30(_amin)\n", | |
" 1 0.000 0.000 0.000 0.000 {method 'max' of 'numpy.ndarray' objects}\n", | |
" 2 0.000 0.000 0.000 0.000 {method 'ravel' of 'numpy.ndarray' objects}\n", | |
" 2 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.can_cast}\n", | |
" 4 0.000 0.000 0.000 0.000 typing.py:890(cast)\n", | |
" 2 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", | |
" 1 0.000 0.000 0.000 0.000 {method 'upper' of 'str' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 managers.py:236(items)\n", | |
" 1 0.000 0.000 0.000 0.000 blocks.py:327(shape)\n", | |
" 1 0.000 0.000 0.000 0.000 blocks.py:331(dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 range.py:170(_validate_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 multi.py:384(<listcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 multi.py:1360(<listcomp>)\n", | |
" 2 0.000 0.000 0.000 0.000 frozen.py:140(_shallow_copy)\n", | |
" 2 0.000 0.000 0.000 0.000 frozen.py:182(_ensure_frozen)\n", | |
" 2 0.000 0.000 0.000 0.000 base.py:647(<lambda>)\n", | |
" 3 0.000 0.000 0.000 0.000 base.py:676(dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 categorical.py:416(categories)\n", | |
" 4 0.000 0.000 0.000 0.000 base.py:65(_reset_cache)\n", | |
" 1 0.000 0.000 0.000 0.000 construction.py:317(extract_array)\n", | |
" 1 0.000 0.000 0.000 0.000 inference.py:244(is_array_like)\n", | |
" 7 0.000 0.000 0.000 0.000 common.py:215(classes_and_not_datetimelike)\n", | |
" 2 0.000 0.000 0.000 0.000 common.py:730(condition)\n", | |
" 2 0.000 0.000 0.000 0.000 common.py:700(is_string_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 common.py:941(is_integer_dtype)\n", | |
" 1 0.000 0.000 0.000 0.000 algorithms.py:244(_get_values_for_rank)\n", | |
" 1 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_int64}\n", | |
" 3 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_object}\n", | |
" 3 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_platform_int}\n", | |
" 4 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_bool}\n", | |
" 1 0.000 0.000 0.000 0.000 {built-in method pandas._libs.missing.checknull}\n", | |
" 3 0.000 0.000 0.000 0.000 missing.py:50(isna)\n", | |
" 3 0.000 0.000 0.000 0.000 dtypes.py:478(validate_ordered)\n", | |
" 4 0.000 0.000 0.000 0.000 dtypes.py:584(categories)\n", | |
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:2252(cumsum)\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:988(_pandas_type_to_numpy_type)\n", | |
" 2 0.000 0.000 0.000 0.000 pandas_compat.py:1010(_get_multiindex_codes)\n", | |
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(<dictcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", | |
" 1 0.000 0.000 0.000 0.000 managers.py:171(ndim)\n", | |
" 1 0.000 0.000 0.000 0.000 categorical.py:466(dtype)\n", | |
" 2 0.000 0.000 0.001 0.001 categorical.py:2741(<genexpr>)\n", | |
" 1 0.000 0.000 0.000 0.000 common.py:851(is_dtype_equal)\n", | |
" 2 0.000 0.000 0.000 0.000 common.py:1572(<lambda>)\n", | |
" 1 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_integer}\n", | |
" 1 0.000 0.000 0.000 0.000 _methods.py:26(_amax)\n", | |
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:851(<listcomp>)\n", | |
" 1 0.000 0.000 0.000 0.000 {method 'min' of 'numpy.ndarray' objects}" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"%prun to_pandas(table, check_extension=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%load_ext line_profiler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pyarrow.pandas_compat import _get_extension_dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pandas_metadata = table.schema.pandas_metadata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Timer unit: 1e-06 s\n", | |
"\n", | |
"Total time: 0.062417 s\n", | |
"File: /home/joris/scipy/repos/arrow/python/pyarrow/pandas_compat.py\n", | |
"Function: _get_extension_dtypes at line 756\n", | |
"\n", | |
"Line # Hits Time Per Hit % Time Line Contents\n", | |
"==============================================================\n", | |
" 756 def _get_extension_dtypes(table, columns_metadata, extension_columns):\n", | |
" 757 \"\"\"\n", | |
" 758 Based on the stored column pandas metadata and the extension types\n", | |
" 759 in the arrow schema, infer which columns should be converted to a\n", | |
" 760 pandas extension dtype.\n", | |
" 761 \n", | |
" 762 The 'numpy_type' field in the column metadata stores the string\n", | |
" 763 representation of the original pandas dtype (and, despite its name,\n", | |
" 764 not the 'pandas_type' field).\n", | |
" 765 Based on this string representation, a pandas/numpy dtype is constructed\n", | |
" 766 and then we can check if this dtype supports conversion from arrow.\n", | |
" 767 \n", | |
" 768 \"\"\"\n", | |
" 769 1 7.0 7.0 0.0 ext_columns = {}\n", | |
" 770 \n", | |
" 771 # older pandas version that does not yet support extension dtypes\n", | |
" 772 1 7.0 7.0 0.0 if _pandas_api.extension_dtype is None:\n", | |
" 773 if extension_columns is not None:\n", | |
" 774 raise ValueError(\n", | |
" 775 \"Converting to pandas ExtensionDtypes is not supported\")\n", | |
" 776 return ext_columns\n", | |
" 777 \n", | |
" 778 1 3.0 3.0 0.0 if extension_columns is None:\n", | |
" 779 # infer the extension columns from the pandas metadata\n", | |
" 780 1001 845.0 0.8 1.4 for col_meta in columns_metadata:\n", | |
" 781 1000 985.0 1.0 1.6 name = col_meta['name']\n", | |
" 782 #if col_meta['numpy_type'] not in _pandas_supported_numpy_types:\n", | |
" 783 1000 55897.0 55.9 89.6 pandas_dtype = _pandas_api.pandas_dtype(col_meta['numpy_type'])\n", | |
" 784 1000 1256.0 1.3 2.0 if isinstance(pandas_dtype, _pandas_api.extension_dtype):\n", | |
" 785 if hasattr(pandas_dtype, \"__from_arrow__\"):\n", | |
" 786 ext_columns[name] = pandas_dtype\n", | |
" 787 # infer from extension type in the schema\n", | |
" 788 1001 1752.0 1.8 2.8 for field in table.schema:\n", | |
" 789 1000 780.0 0.8 1.2 typ = field.type\n", | |
" 790 1000 884.0 0.9 1.4 if isinstance(typ, pa.BaseExtensionType):\n", | |
" 791 try:\n", | |
" 792 pandas_dtype = typ.to_pandas_dtype()\n", | |
" 793 except NotImplementedError:\n", | |
" 794 pass\n", | |
" 795 else:\n", | |
" 796 ext_columns[field.name] = pandas_dtype\n", | |
" 797 \n", | |
" 798 else:\n", | |
" 799 # get the extension dtype for the specified columns\n", | |
" 800 for name in extension_columns:\n", | |
" 801 col_meta = [\n", | |
" 802 meta for meta in columns_metadata if meta['name'] == name][0]\n", | |
" 803 pandas_dtype = _pandas_api.pandas_dtype(col_meta['numpy_type'])\n", | |
" 804 if not isinstance(pandas_dtype, _pandas_api.extension_dtype):\n", | |
" 805 raise ValueError(\"not an extension dtype\")\n", | |
" 806 if not hasattr(pandas_dtype, \"__from_arrow__\"):\n", | |
" 807 raise ValueError(\"this column does not support to be \"\n", | |
" 808 \"converted to extension dtype\")\n", | |
" 809 ext_columns[name] = pandas_dtype\n", | |
" 810 \n", | |
" 811 1 1.0 1.0 0.0 return ext_columns" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"%lprun -f _get_extension_dtypes _get_extension_dtypes(table, pandas_metadata['columns'], None)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python (arrow-dev)", | |
"language": "python", | |
"name": "arrow-dev" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment