Skip to content

Instantly share code, notes, and snippets.

@jorisvandenbossche
Created October 30, 2019 14:40
Show Gist options
  • Save jorisvandenbossche/d36bceb82fd2dda38ee419ba51dff5ed to your computer and use it in GitHub Desktop.
Save jorisvandenbossche/d36bceb82fd2dda38ee419ba51dff5ed to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Profile overhead of checking the column metadata and types for extension dtypes"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import string\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import pyarrow as pa"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"N_columns = 100\n",
"N_rows = 1000"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(\n",
" np.random.randn(N_rows, N_columns),\n",
" columns=[''.join([random.choice(string.ascii_letters) for _ in range(5)]) for _ in range(N_columns)])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"table = pa.table(df)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.18 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit table.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def to_pandas(table, check_extension=True):\n",
"    \"\"\"Convert a pyarrow Table to a pandas DataFrame.\n",
"\n",
"    Equivalent of ``Table.to_pandas`` with control over whether to check\n",
"    for extension types or not, so the cost of that check can be timed.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    table : pyarrow.Table\n",
"        The table to convert.\n",
"    check_extension : bool, default True\n",
"        If True, ``extension_columns=None`` is passed on, which lets\n",
"        ``table_to_blockmanager`` infer the extension columns itself.\n",
"        If False, an empty list is passed so no column is looked up.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        DataFrame built from the block manager returned by pyarrow.\n",
"    \"\"\"\n",
"    # Private pyarrow helper; kept local so the dependency sits next to\n",
"    # its only use.\n",
"    from pyarrow.pandas_compat import table_to_blockmanager\n",
"\n",
"    options = dict(\n",
"        pool=None,\n",
"        strings_to_categorical=False,\n",
"        zero_copy_only=False,\n",
"        integer_object_nulls=False,\n",
"        date_as_object=True,\n",
"        use_threads=True,\n",
"        deduplicate_objects=True)\n",
"\n",
"    # None -> pyarrow infers the extension columns; [] -> nothing to check\n",
"    extension_columns = None if check_extension else []\n",
"\n",
"    mgr = table_to_blockmanager(\n",
"        options, table, extension_columns=extension_columns)\n",
"    return pd.DataFrame(mgr)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This should give roughly the same timing as `table.to_pandas()` above:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.45 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit to_pandas(table, check_extension=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And now the same conversion, skipping the extension dtype checks:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.73 ms ± 419 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit to_pandas(table, check_extension=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Long table"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"N_columns = 100\n",
"N_rows = 100_000"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(\n",
" np.random.randn(N_rows, N_columns),\n",
" columns=[''.join([random.choice(string.ascii_letters) for _ in range(5)]) for _ in range(N_columns)])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"table = pa.table(df)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"28.7 ms ± 1.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%timeit table.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"29.2 ms ± 740 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%timeit to_pandas(table, check_extension=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"26.4 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%timeit to_pandas(table, check_extension=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Wide table"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"N_columns = 1000\n",
"N_rows = 100"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(\n",
" np.random.randn(N_rows, N_columns),\n",
" columns=[''.join([random.choice(string.ascii_letters) for _ in range(5)]) for _ in range(N_columns)])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"table = pa.table(df)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22.4 ms ± 419 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%timeit table.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22.6 ms ± 659 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%timeit to_pandas(table, check_extension=True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10.6 ms ± 415 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit to_pandas(table, check_extension=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Profile the wide-table conversion with the extension dtype checks enabled:"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"text/plain": [
" 83239 function calls (83218 primitive calls) in 0.061 seconds\n",
"\n",
" Ordered by: internal time\n",
"\n",
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
" 1026 0.009 0.000 0.029 0.000 dtypes.py:82(find)\n",
" 10033 0.007 0.000 0.012 0.000 base.py:180(construct_from_string)\n",
" 1 0.005 0.005 0.005 0.005 {pyarrow.lib.table_to_blocks}\n",
" 12042 0.004 0.000 0.004 0.000 {method 'format' of 'str' objects}\n",
" 1013 0.003 0.000 0.033 0.000 common.py:2007(pandas_dtype)\n",
" 22394 0.003 0.000 0.004 0.000 {built-in method builtins.isinstance}\n",
" 1 0.003 0.003 0.003 0.003 decoder.py:343(raw_decode)\n",
" 1 0.002 0.002 0.003 0.003 pandas_compat.py:1098(<listcomp>)\n",
" 1 0.002 0.002 0.036 0.036 pandas_compat.py:756(_get_extension_dtypes)\n",
" 1000 0.002 0.000 0.003 0.000 {method 'get_field_index' of 'pyarrow.lib.Schema' objects}\n",
" 1004 0.001 0.000 0.003 0.000 dtypes.py:873(construct_from_string)\n",
" 1005 0.001 0.000 0.002 0.000 dtypes.py:1047(construct_from_string)\n",
" 1 0.001 0.001 0.005 0.005 pandas_compat.py:1104(_add_any_metadata)\n",
" 1007 0.001 0.000 0.002 0.000 dtypes.py:721(construct_from_string)\n",
" 1 0.001 0.001 0.001 0.001 pandas_compat.py:870(<dictcomp>)\n",
" 1000 0.001 0.000 0.001 0.000 compat.py:112(tobytes)\n",
" 4005 0.001 0.000 0.001 0.000 {method 'get' of 'dict' objects}\n",
" 1000 0.001 0.000 0.034 0.000 {method 'pandas_dtype' of 'pyarrow.lib._PandasAPIShim' objects}\n",
" 1000 0.001 0.000 0.001 0.000 base.py:4177(__getitem__)\n",
" 1003 0.001 0.000 0.002 0.000 string_.py:75(construct_from_string)\n",
" 1003 0.001 0.000 0.001 0.000 dtype.py:177(construct_from_string)\n",
" 3010 0.001 0.000 0.001 0.000 {method 'startswith' of 'str' objects}\n",
" 1 0.001 0.001 0.009 0.009 pandas_compat.py:825(_deserialize_column_index)\n",
" 1005 0.001 0.000 0.001 0.000 {method 'match' of 're.Pattern' objects}\n",
" 1189 0.000 0.000 0.001 0.000 generic.py:7(_check)\n",
" 1 0.000 0.000 0.006 0.006 pandas_compat.py:1075(_table_to_blocks)\n",
" 1 0.000 0.000 0.001 0.001 pandas_compat.py:828(<dictcomp>)\n",
" 1001 0.000 0.000 0.001 0.000 inference.py:382(is_hashable)\n",
" 1000 0.000 0.000 0.000 0.000 {method 'encode' of 'str' objects}\n",
" 2 0.000 0.000 0.000 0.000 {method 'argsort' of 'numpy.ndarray' objects}\n",
" 2000 0.000 0.000 0.001 0.000 compat.py:123(frombytes)\n",
" 1 0.000 0.000 0.060 0.060 pandas_compat.py:721(table_to_blockmanager)\n",
" 2000 0.000 0.000 0.000 0.000 {method 'decode' of 'bytes' objects}\n",
" 1003 0.000 0.000 0.000 0.000 {method 'search' of 're.Pattern' objects}\n",
" 1 0.000 0.000 0.060 0.060 <ipython-input-6-9d225b52ec17>:1(to_pandas)\n",
" 1001 0.000 0.000 0.000 0.000 {built-in method builtins.hash}\n",
" 3 0.000 0.000 0.000 0.000 base.py:1732(is_unique)\n",
" 1000 0.000 0.000 0.000 0.000 common.py:150(cast_scalar_indexer)\n",
" 1001 0.000 0.000 0.000 0.000 pandas_compat.py:820(<genexpr>)\n",
"1282/1277 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n",
" 1000 0.000 0.000 0.000 0.000 pandas_compat.py:272(_column_name_to_strings)\n",
" 1003 0.000 0.000 0.000 0.000 {method 'lower' of 'str' objects}\n",
" 4 0.000 0.000 0.000 0.000 {pandas._libs.lib.infer_dtype}\n",
" 1000 0.000 0.000 0.000 0.000 compat.py:118(u_utf8)\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:826(<listcomp>)\n",
" 4/3 0.000 0.000 0.001 0.000 base.py:261(__new__)\n",
" 1 0.000 0.000 0.000 0.000 {method 'factorize' of 'pandas._libs.hashtable.StringHashTable' objects}\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:833(<listcomp>)\n",
" 1001 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_float}\n",
" 1007 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_scalar}\n",
" 1 0.000 0.000 0.060 0.060 <string>:1(<module>)\n",
" 3 0.000 0.000 0.000 0.000 {method 'astype' of 'numpy.ndarray' objects}\n",
" 12 0.000 0.000 0.000 0.000 {built-in method numpy.array}\n",
" 3 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n",
" 2 0.000 0.000 0.000 0.000 {built-in method pandas._libs.missing.isnaobj}\n",
" 7 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n",
" 1000 0.000 0.000 0.000 0.000 pandas_compat.py:841(<lambda>)\n",
" 1 0.000 0.000 0.061 0.061 {built-in method builtins.exec}\n",
" 34 0.000 0.000 0.000 0.000 base.py:231(is_dtype)\n",
" 1 0.000 0.000 0.000 0.000 multi.py:1356(_engine)\n",
" 1 0.000 0.000 0.000 0.000 cast.py:1351(construct_1d_object_array_from_listlike)\n",
" 1 0.000 0.000 0.000 0.000 {pandas._libs.lib.to_object_array_tuples}\n",
" 75 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n",
" 87/72 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n",
" 1 0.000 0.000 0.001 0.001 pandas_compat.py:864(_reconstruct_index)\n",
" 1 0.000 0.000 0.000 0.000 multi.py:317(_verify_integrity)\n",
" 1 0.000 0.000 0.000 0.000 managers.py:216(_rebuild_blknos_and_blklocs)\n",
" 15 0.000 0.000 0.000 0.000 base.py:509(_simple_new)\n",
" 29 0.000 0.000 0.000 0.000 common.py:1873(_is_dtype_type)\n",
" 6 0.000 0.000 0.000 0.000 {built-in method numpy.empty}\n",
" 8 0.000 0.000 0.000 0.000 _dtype.py:319(_name_get)\n",
" 28 0.000 0.000 0.000 0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)\n",
" 12 0.000 0.000 0.000 0.000 base.py:564(_shallow_copy)\n",
" 25 0.000 0.000 0.000 0.000 common.py:1730(is_extension_array_dtype)\n",
" 1 0.000 0.000 0.004 0.004 pandas_compat.py:1088(_flatten_single_level_multiindex)\n",
" 1 0.000 0.000 0.000 0.000 range.py:148(_simple_new)\n",
" 1 0.000 0.000 0.000 0.000 range.py:83(__new__)\n",
" 1 0.000 0.000 0.003 0.003 decoder.py:332(decode)\n",
" 2 0.000 0.000 0.000 0.000 multi.py:247(__new__)\n",
" 1 0.000 0.000 0.002 0.002 multi.py:455(from_tuples)\n",
" 1 0.000 0.000 0.000 0.000 base.py:750(astype)\n",
" 15 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_list_like}\n",
" 2 0.000 0.000 0.000 0.000 missing.py:220(_isna_ndarraylike)\n",
" 99 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n",
" 1 0.000 0.000 0.000 0.000 common.py:182(all_none)\n",
" 21 0.000 0.000 0.000 0.000 {built-in method _abc._abc_instancecheck}\n",
" 1 0.000 0.000 0.001 0.001 categorical.py:332(__init__)\n",
" 1 0.000 0.000 0.001 0.001 algorithms.py:569(factorize)\n",
" 16 0.000 0.000 0.000 0.000 <frozen importlib._bootstrap>:416(parent)\n",
" 10 0.000 0.000 0.000 0.000 multi.py:640(levels)\n",
" 2 0.000 0.000 0.000 0.000 multi.py:701(_set_levels)\n",
" 1 0.000 0.000 0.001 0.001 pandas_compat.py:1018(_reconstruct_columns_from_metadata)\n",
" 1 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}\n",
" 1 0.000 0.000 0.000 0.000 algorithms.py:1551(take_nd)\n",
" 5 0.000 0.000 0.000 0.000 common.py:497(is_datetime64_dtype)\n",
" 2 0.000 0.000 0.000 0.000 dtypes.py:497(validate_categories)\n",
" 22 0.000 0.000 0.000 0.000 {method 'view' of 'numpy.ndarray' objects}\n",
" 8 0.000 0.000 0.000 0.000 numerictypes.py:365(issubdtype)\n",
" 1 0.000 0.000 0.000 0.000 {pandas._libs.algos.take_1d_int64_int64}\n",
" 3 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n",
" 1 0.000 0.000 0.000 0.000 managers.py:126(__init__)\n",
" 4 0.000 0.000 0.000 0.000 common.py:228(asarray_tuplesafe)\n",
" 7 0.000 0.000 0.000 0.000 dtypes.py:919(is_dtype)\n",
" 2 0.000 0.000 0.000 0.000 _dtype.py:46(__str__)\n",
" 1 0.000 0.000 0.002 0.002 multi.py:389(from_arrays)\n",
" 10 0.000 0.000 0.000 0.000 multi.py:643(<listcomp>)\n",
" 1 0.000 0.000 0.000 0.000 frame.py:403(__init__)\n",
" 3 0.000 0.000 0.000 0.000 frozen.py:73(__getitem__)\n",
" 17 0.000 0.000 0.000 0.000 base.py:3816(values)\n",
" 12 0.000 0.000 0.000 0.000 base.py:543(_get_attributes_dict)\n",
" 1 0.000 0.000 0.000 0.000 cast.py:337(maybe_promote)\n",
" 14 0.000 0.000 0.000 0.000 common.py:667(is_categorical_dtype)\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:648(_reconstruct_block)\n",
" 16 0.000 0.000 0.000 0.000 numerictypes.py:293(issubclass_)\n",
" 1 0.000 0.000 0.000 0.000 managers.py:340(_verify_integrity)\n",
" 3 0.000 0.000 0.000 0.000 managers.py:167(shape)\n",
" 18 0.000 0.000 0.000 0.000 base.py:630(_reset_identity)\n",
" 1 0.000 0.000 0.001 0.001 sorting.py:394(safe_sort)\n",
" 1 0.000 0.000 0.001 0.001 categorical.py:2681(_factorize_from_iterable)\n",
" 9 0.000 0.000 0.000 0.000 common.py:631(is_interval_dtype)\n",
" 6 0.000 0.000 0.000 0.000 dtypes.py:1099(is_dtype)\n",
" 1 0.000 0.000 0.003 0.003 __init__.py:299(loads)\n",
" 1 0.000 0.000 0.000 0.000 blocks.py:2940(get_block_type)\n",
" 2 0.000 0.000 0.000 0.000 multi.py:1225(_set_names)\n",
" 3 0.000 0.000 0.000 0.000 common.py:181(ensure_python_int)\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:1050(<listcomp>)\n",
" 5 0.000 0.000 0.000 0.000 _internal.py:886(npy_ctypes_check)\n",
" 1 0.000 0.000 0.000 0.000 multi.py:74(_codes_to_ints)\n",
" 2 0.000 0.000 0.000 0.000 multi.py:825(_set_codes)\n",
" 3 0.000 0.000 0.000 0.000 dtypes.py:342(_finalize)\n",
" 1 0.000 0.000 0.000 0.000 numeric.py:676(require)\n",
" 18 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x560e1bc923c0}\n",
" 1 0.000 0.000 0.000 0.000 blocks.py:231(mgr_locs)\n",
" 2 0.000 0.000 0.000 0.000 base.py:640(_engine)\n",
" 22 0.000 0.000 0.000 0.000 common.py:212(<lambda>)\n",
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.arange}\n",
" 1 0.000 0.000 0.000 0.000 blocks.py:2986(make_block)\n",
" 4 0.000 0.000 0.000 0.000 multi.py:716(<genexpr>)\n",
" 1 0.000 0.000 0.000 0.000 generic.py:202(__init__)\n",
" 1 0.000 0.000 0.001 0.001 categorical.py:2719(_factorize_from_iterables)\n",
" 1 0.000 0.000 0.000 0.000 cast.py:989(maybe_infer_to_datetimelike)\n",
" 8 0.000 0.000 0.000 0.000 common.py:566(is_timedelta64_dtype)\n",
" 4 0.000 0.000 0.000 0.000 common.py:1609(is_bool_dtype)\n",
" 1 0.000 0.000 0.000 0.000 algorithms.py:440(_factorize_array)\n",
" 3 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_int16}\n",
" 2 0.000 0.000 0.000 0.000 {method 'fill' of 'numpy.ndarray' objects}\n",
" 8 0.000 0.000 0.000 0.000 numeric.py:469(asarray)\n",
" 1 0.000 0.000 0.000 0.000 blocks.py:112(__init__)\n",
" 4 0.000 0.000 0.000 0.000 multi.py:835(<genexpr>)\n",
" 4 0.000 0.000 0.000 0.000 multi.py:1222(_get_names)\n",
" 5 0.000 0.000 0.000 0.000 multi.py:1981(nlevels)\n",
" 11 0.000 0.000 0.000 0.000 base.py:653(__len__)\n",
" 3 0.000 0.000 0.001 0.000 _decorators.py:178(wrapper)\n",
" 3 0.000 0.000 0.000 0.000 cast.py:710(coerce_indexer_dtype)\n",
" 22 0.000 0.000 0.000 0.000 common.py:210(classes)\n",
" 10 0.000 0.000 0.000 0.000 common.py:1837(_get_dtype)\n",
" 21 0.000 0.000 0.000 0.000 abc.py:137(__instancecheck__)\n",
" 18 0.000 0.000 0.000 0.000 {method 'items' of 'dict' objects}\n",
" 4 0.000 0.000 0.000 0.000 range.py:701(__len__)\n",
" 1 0.000 0.000 0.000 0.000 base.py:1881(hasnans)\n",
" 1 0.000 0.000 0.000 0.000 algorithms.py:1424(_get_take_nd_function)\n",
" 5 0.000 0.000 0.000 0.000 common.py:226(is_object_dtype)\n",
" 1 0.000 0.000 0.000 0.000 common.py:256(is_sparse)\n",
" 9 0.000 0.000 0.000 0.000 common.py:528(is_datetime64tz_dtype)\n",
" 4 0.000 0.000 0.000 0.000 common.py:1575(is_float_dtype)\n",
" 4 0.000 0.000 0.000 0.000 common.py:1812(_is_dtype)\n",
" 1 0.000 0.000 0.000 0.000 dtypes.py:251(_from_values_or_dtype)\n",
" 1 0.000 0.000 0.000 0.000 dtypes.py:533(update_dtype)\n",
" 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n",
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.concatenate}\n",
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:69(_wrapreduction)\n",
" 1 0.000 0.000 0.000 0.000 blocks.py:335(ftype)\n",
" 1 0.000 0.000 0.000 0.000 multi.py:296(_validate_codes)\n",
" 12 0.000 0.000 0.000 0.000 base.py:547(<dictcomp>)\n",
" 1 0.000 0.000 0.000 0.000 cast.py:465(_ensure_dtype_type)\n",
" 2 0.000 0.000 0.000 0.000 common.py:186(<genexpr>)\n",
" 2 0.000 0.000 0.000 0.000 algorithms.py:59(_ensure_data)\n",
" 1 0.000 0.000 0.000 0.000 algorithms.py:166(_reconstruct_data)\n",
" 7 0.000 0.000 0.000 0.000 common.py:220(<lambda>)\n",
" 7 0.000 0.000 0.000 0.000 common.py:597(is_period_dtype)\n",
" 3 0.000 0.000 0.000 0.000 common.py:996(is_signed_integer_dtype)\n",
" 3 0.000 0.000 0.000 0.000 common.py:1053(is_unsigned_integer_dtype)\n",
" 1 0.000 0.000 0.000 0.000 {pandas._libs.lib.infer_datetimelike_array}\n",
" 3 0.000 0.000 0.000 0.000 missing.py:130(_isna_new)\n",
" 3 0.000 0.000 0.000 0.000 _methods.py:42(_any)\n",
" 16 0.000 0.000 0.000 0.000 {method 'rpartition' of 'str' objects}\n",
" 3 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n",
" 12 0.000 0.000 0.000 0.000 {method 'update' of 'dict' objects}\n",
" 3 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n",
" 1 0.000 0.000 0.000 0.000 managers.py:655(<listcomp>)\n",
" 1 0.000 0.000 0.000 0.000 managers.py:654(_consolidate_check)\n",
" 3 0.000 0.000 0.000 0.000 blocks.py:227(mgr_locs)\n",
" 1 0.000 0.000 0.000 0.000 managers.py:132(<listcomp>)\n",
" 9 0.000 0.000 0.000 0.000 managers.py:169(<genexpr>)\n",
" 6 0.000 0.000 0.000 0.000 multi.py:812(codes)\n",
" 3 0.000 0.000 0.000 0.000 inference.py:120(is_iterator)\n",
" 2 0.000 0.000 0.000 0.000 common.py:346(is_categorical)\n",
" 1 0.000 0.000 0.000 0.000 algorithms.py:252(_get_data_algo)\n",
" 3 0.000 0.000 0.000 0.000 dtypes.py:224(__init__)\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:814(_check_data_column_metadata_consistency)\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:1084(<listcomp>)\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:1112(<listcomp>)\n",
" 1 0.000 0.000 0.000 0.000 numeric.py:748(<setcomp>)\n",
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:54(_wrapfunc)\n",
" 2 0.000 0.000 0.000 0.000 {method 'end' of 're.Match' objects}\n",
" 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n",
" 3 0.000 0.000 0.000 0.000 {function FrozenList.__getitem__ at 0x7f480ea079d8}\n",
" 2 0.000 0.000 0.000 0.000 managers.py:342(<genexpr>)\n",
" 1 0.000 0.000 0.000 0.000 blocks.py:123(_check_ndim)\n",
" 1 0.000 0.000 0.000 0.000 generic.py:231(_init_mgr)\n",
" 1 0.000 0.000 0.000 0.000 base.py:1860(_isnan)\n",
" 4 0.000 0.000 0.000 0.000 base.py:5541(ensure_index)\n",
" 2 0.000 0.000 0.000 0.000 base.py:466(<genexpr>)\n",
" 1 0.000 0.000 0.000 0.000 categorical.py:699(_get_codes)\n",
" 2 0.000 0.000 0.000 0.000 base.py:1000(_ndarray_values)\n",
" 8 0.000 0.000 0.000 0.000 common.py:1153(is_datetime64_any_dtype)\n",
" 2 0.000 0.000 0.000 0.000 common.py:1543(is_string_like_dtype)\n",
" 1 0.000 0.000 0.000 0.000 algorithms.py:195(_ensure_arraylike)\n",
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:2083(any)\n",
" 1 0.000 0.000 0.000 0.000 _methods.py:30(_amin)\n",
" 1 0.000 0.000 0.000 0.000 {method 'max' of 'numpy.ndarray' objects}\n",
" 2 0.000 0.000 0.000 0.000 {method 'ravel' of 'numpy.ndarray' objects}\n",
" 2 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n",
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.can_cast}\n",
" 4 0.000 0.000 0.000 0.000 typing.py:890(cast)\n",
" 2 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n",
" 1 0.000 0.000 0.000 0.000 {method 'upper' of 'str' objects}\n",
" 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects}\n",
" 1 0.000 0.000 0.000 0.000 managers.py:236(items)\n",
" 1 0.000 0.000 0.000 0.000 blocks.py:327(shape)\n",
" 1 0.000 0.000 0.000 0.000 blocks.py:331(dtype)\n",
" 1 0.000 0.000 0.000 0.000 range.py:170(_validate_dtype)\n",
" 1 0.000 0.000 0.000 0.000 multi.py:384(<listcomp>)\n",
" 1 0.000 0.000 0.000 0.000 multi.py:1360(<listcomp>)\n",
" 2 0.000 0.000 0.000 0.000 frozen.py:140(_shallow_copy)\n",
" 2 0.000 0.000 0.000 0.000 frozen.py:182(_ensure_frozen)\n",
" 2 0.000 0.000 0.000 0.000 base.py:647(<lambda>)\n",
" 3 0.000 0.000 0.000 0.000 base.py:676(dtype)\n",
" 1 0.000 0.000 0.000 0.000 categorical.py:416(categories)\n",
" 4 0.000 0.000 0.000 0.000 base.py:65(_reset_cache)\n",
" 1 0.000 0.000 0.000 0.000 construction.py:317(extract_array)\n",
" 1 0.000 0.000 0.000 0.000 inference.py:244(is_array_like)\n",
" 7 0.000 0.000 0.000 0.000 common.py:215(classes_and_not_datetimelike)\n",
" 2 0.000 0.000 0.000 0.000 common.py:730(condition)\n",
" 2 0.000 0.000 0.000 0.000 common.py:700(is_string_dtype)\n",
" 1 0.000 0.000 0.000 0.000 common.py:941(is_integer_dtype)\n",
" 1 0.000 0.000 0.000 0.000 algorithms.py:244(_get_values_for_rank)\n",
" 1 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_int64}\n",
" 3 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_object}\n",
" 3 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_platform_int}\n",
" 4 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_bool}\n",
" 1 0.000 0.000 0.000 0.000 {built-in method pandas._libs.missing.checknull}\n",
" 3 0.000 0.000 0.000 0.000 missing.py:50(isna)\n",
" 3 0.000 0.000 0.000 0.000 dtypes.py:478(validate_ordered)\n",
" 4 0.000 0.000 0.000 0.000 dtypes.py:584(categories)\n",
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:2252(cumsum)\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:988(_pandas_type_to_numpy_type)\n",
" 2 0.000 0.000 0.000 0.000 pandas_compat.py:1010(_get_multiindex_codes)\n",
" 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(<dictcomp>)\n",
" 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n",
" 1 0.000 0.000 0.000 0.000 managers.py:171(ndim)\n",
" 1 0.000 0.000 0.000 0.000 categorical.py:466(dtype)\n",
" 2 0.000 0.000 0.001 0.001 categorical.py:2741(<genexpr>)\n",
" 1 0.000 0.000 0.000 0.000 common.py:851(is_dtype_equal)\n",
" 2 0.000 0.000 0.000 0.000 common.py:1572(<lambda>)\n",
" 1 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_integer}\n",
" 1 0.000 0.000 0.000 0.000 _methods.py:26(_amax)\n",
" 1 0.000 0.000 0.000 0.000 pandas_compat.py:851(<listcomp>)\n",
" 1 0.000 0.000 0.000 0.000 {method 'min' of 'numpy.ndarray' objects}"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%prun to_pandas(table, check_extension=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"%load_ext line_profiler"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"from pyarrow.pandas_compat import _get_extension_dtypes"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"pandas_metadata = table.schema.pandas_metadata"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timer unit: 1e-06 s\n",
"\n",
"Total time: 0.062417 s\n",
"File: /home/joris/scipy/repos/arrow/python/pyarrow/pandas_compat.py\n",
"Function: _get_extension_dtypes at line 756\n",
"\n",
"Line # Hits Time Per Hit % Time Line Contents\n",
"==============================================================\n",
" 756 def _get_extension_dtypes(table, columns_metadata, extension_columns):\n",
" 757 \"\"\"\n",
" 758 Based on the stored column pandas metadata and the extension types\n",
" 759 in the arrow schema, infer which columns should be converted to a\n",
" 760 pandas extension dtype.\n",
" 761 \n",
" 762 The 'numpy_type' field in the column metadata stores the string\n",
" 763 representation of the original pandas dtype (and, despite its name,\n",
" 764 not the 'pandas_type' field).\n",
" 765 Based on this string representation, a pandas/numpy dtype is constructed\n",
" 766 and then we can check if this dtype supports conversion from arrow.\n",
" 767 \n",
" 768 \"\"\"\n",
" 769 1 7.0 7.0 0.0 ext_columns = {}\n",
" 770 \n",
" 771 # older pandas version that does not yet support extension dtypes\n",
" 772 1 7.0 7.0 0.0 if _pandas_api.extension_dtype is None:\n",
" 773 if extension_columns is not None:\n",
" 774 raise ValueError(\n",
" 775 \"Converting to pandas ExtensionDtypes is not supported\")\n",
" 776 return ext_columns\n",
" 777 \n",
" 778 1 3.0 3.0 0.0 if extension_columns is None:\n",
" 779 # infer the extension columns from the pandas metadata\n",
" 780 1001 845.0 0.8 1.4 for col_meta in columns_metadata:\n",
" 781 1000 985.0 1.0 1.6 name = col_meta['name']\n",
" 782 #if col_meta['numpy_type'] not in _pandas_supported_numpy_types:\n",
" 783 1000 55897.0 55.9 89.6 pandas_dtype = _pandas_api.pandas_dtype(col_meta['numpy_type'])\n",
" 784 1000 1256.0 1.3 2.0 if isinstance(pandas_dtype, _pandas_api.extension_dtype):\n",
" 785 if hasattr(pandas_dtype, \"__from_arrow__\"):\n",
" 786 ext_columns[name] = pandas_dtype\n",
" 787 # infer from extension type in the schema\n",
" 788 1001 1752.0 1.8 2.8 for field in table.schema:\n",
" 789 1000 780.0 0.8 1.2 typ = field.type\n",
" 790 1000 884.0 0.9 1.4 if isinstance(typ, pa.BaseExtensionType):\n",
" 791 try:\n",
" 792 pandas_dtype = typ.to_pandas_dtype()\n",
" 793 except NotImplementedError:\n",
" 794 pass\n",
" 795 else:\n",
" 796 ext_columns[field.name] = pandas_dtype\n",
" 797 \n",
" 798 else:\n",
" 799 # get the extension dtype for the specified columns\n",
" 800 for name in extension_columns:\n",
" 801 col_meta = [\n",
" 802 meta for meta in columns_metadata if meta['name'] == name][0]\n",
" 803 pandas_dtype = _pandas_api.pandas_dtype(col_meta['numpy_type'])\n",
" 804 if not isinstance(pandas_dtype, _pandas_api.extension_dtype):\n",
" 805 raise ValueError(\"not an extension dtype\")\n",
" 806 if not hasattr(pandas_dtype, \"__from_arrow__\"):\n",
" 807 raise ValueError(\"this column does not support to be \"\n",
" 808 \"converted to extension dtype\")\n",
" 809 ext_columns[name] = pandas_dtype\n",
" 810 \n",
" 811 1 1.0 1.0 0.0 return ext_columns"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%lprun -f _get_extension_dtypes _get_extension_dtypes(table, pandas_metadata['columns'], None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (arrow-dev)",
"language": "python",
"name": "arrow-dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment