Skip to content

Instantly share code, notes, and snippets.

@jjerphan
Last active June 28, 2023 09:37
Show Gist options
  • Save jjerphan/8c532ec65ed6e2110df0620786dcfa4f to your computer and use it in GitHub Desktop.
Save jjerphan/8c532ec65ed6e2110df0620786dcfa4f to your computer and use it in GitHub Desktop.
pyarrow to Arrow C Data API
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 0,
"id": "30103ds0",
"metadata": {},
"outputs": [],
"source": [
"# Use the `%pip` magic rather than `!pip` so that the package is installed into the\n",
"# environment of the running kernel.\n",
"%pip install cffi"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "47699ffe",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import pyarrow as pa\n",
"\n",
"from pyarrow.cffi import ffi\n",
"\n",
"NUM_COLUMNS = 11\n",
"NUM_ROWS = 7\n",
"# Fixed seed so that Restart & Run All regenerates the same random frame.\n",
"SEED = 0\n",
"\n",
"rng = np.random.default_rng(SEED)\n",
"\n",
"# Random integers in [0, 100), stored as pyarrow-backed float32 columns,\n",
"# indexed by an hourly date range starting in 2000.\n",
"df = pd.DataFrame(\n",
"    rng.integers(0, 100, size=(NUM_ROWS, NUM_COLUMNS)),\n",
"    columns=[f\"COL_{i}\" for i in range(NUM_COLUMNS)],\n",
"    index=pd.date_range('2000', periods=NUM_ROWS, freq='h'),\n",
"    dtype=\"float32[pyarrow]\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b1e0ea40",
"metadata": {},
"outputs": [],
"source": [
"# Pandas wrappers of several `pyarrow.lib.ChunkedArray`s, one per column, obtained through\n",
"# the public `Series.array` accessor rather than the private block manager (`df._mgr`).\n",
"arrow_extension_arrays = [df[column].array for column in df.columns]\n",
"\n",
"# Likely a list of `pyarrow.lib.FloatArray` (a subclass of `pyarrow.lib.Array`).\n",
"# Depending on the dtype, we might have other `pyarrow.lib.*Array` extending `pyarrow.lib.Array`.\n",
"#\n",
"# `__arrow_array__` is the Arrow array protocol: it hands back the wrapped pyarrow data without\n",
"# depending on the name of a private attribute (the `_data` attribute used previously is\n",
"# private and was renamed in later pandas releases).\n",
"#\n",
"# We need to call `combine_chunks` because `pyarrow.lib.ChunkedArray` has (for now) no way to\n",
"# export or import data (e.g. with `_import_from_c`/`_export_to_c`).\n",
"pyarrow_arrays = [\n",
"    extension_array.__arrow_array__().combine_chunks()\n",
"    for extension_array in arrow_extension_arrays\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e625ffd3",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[<pyarrow.lib.FloatArray object at 0x7fd645c57e80>\n",
" [\n",
" 81,\n",
" 14,\n",
" 11,\n",
" 98,\n",
" 27,\n",
" 39,\n",
" 22\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645c57fa0>\n",
" [\n",
" 38,\n",
" 67,\n",
" 11,\n",
" 30,\n",
" 30,\n",
" 61,\n",
" 45\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645ca4040>\n",
" [\n",
" 67,\n",
" 46,\n",
" 21,\n",
" 59,\n",
" 88,\n",
" 79,\n",
" 41\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645ca40a0>\n",
" [\n",
" 9,\n",
" 6,\n",
" 7,\n",
" 87,\n",
" 93,\n",
" 69,\n",
" 9\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd6d651b700>\n",
" [\n",
" 21,\n",
" 46,\n",
" 90,\n",
" 76,\n",
" 4,\n",
" 12,\n",
" 80\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645ca4160>\n",
" [\n",
" 99,\n",
" 36,\n",
" 20,\n",
" 88,\n",
" 2,\n",
" 89,\n",
" 80\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645ca4220>\n",
" [\n",
" 4,\n",
" 97,\n",
" 0,\n",
" 68,\n",
" 57,\n",
" 77,\n",
" 85\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645ca4280>\n",
" [\n",
" 56,\n",
" 64,\n",
" 96,\n",
" 9,\n",
" 55,\n",
" 87,\n",
" 33\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645ca42e0>\n",
" [\n",
" 92,\n",
" 79,\n",
" 83,\n",
" 8,\n",
" 35,\n",
" 47,\n",
" 90\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645ca4340>\n",
" [\n",
" 97,\n",
" 94,\n",
" 37,\n",
" 94,\n",
" 34,\n",
" 56,\n",
" 24\n",
" ],\n",
" <pyarrow.lib.FloatArray object at 0x7fd645ca43a0>\n",
" [\n",
" 60,\n",
" 45,\n",
" 41,\n",
" 16,\n",
" 22,\n",
" 18,\n",
" 46\n",
" ]]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the list of combined pyarrow arrays built in the previous cell\n",
"pyarrow_arrays"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "efa0aed2",
"metadata": {},
"outputs": [],
"source": [
"# Keep a single array (the first one) for the export/import round trip below\n",
"first_pyarrow_array = pyarrow_arrays[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ccd49550",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on built-in function _export_to_c:\n",
"\n",
"_export_to_c(...) method of pyarrow.lib.DataType instance\n",
" DataType._export_to_c(self, out_ptr)\n",
" \n",
" Export to a C ArrowSchema struct, given its pointer.\n",
" \n",
" Be careful: if you don't pass the ArrowSchema struct to a consumer,\n",
" its memory will leak. This is a low-level function intended for\n",
" expert users.\n",
"\n"
]
}
],
"source": [
"# Show the docstring of `DataType._export_to_c`, which exports the array's type\n",
"# to a C ArrowSchema struct given its pointer\n",
"help(first_pyarrow_array.type._export_to_c)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "43049565",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on built-in function _export_to_c:\n",
"\n",
"_export_to_c(...) method of pyarrow.lib.FloatArray instance\n",
" Array._export_to_c(self, out_ptr, out_schema_ptr=0)\n",
" \n",
" Export to a C ArrowArray struct, given its pointer.\n",
" \n",
" If a C ArrowSchema struct pointer is also given, the array type\n",
" is exported to it at the same time.\n",
" \n",
" Parameters\n",
" ----------\n",
" out_ptr: int\n",
" The raw pointer to a C ArrowArray struct.\n",
" out_schema_ptr: int (optional)\n",
" The raw pointer to a C ArrowSchema struct.\n",
" \n",
" Be careful: if you don't pass the ArrowArray struct to a consumer,\n",
" array memory will leak. This is a low-level function intended for\n",
" expert users.\n",
"\n"
]
}
],
"source": [
"# Show the docstring of `Array._export_to_c`, which exports the array data to a\n",
"# C ArrowArray struct (and optionally its type to a C ArrowSchema struct)\n",
"help(first_pyarrow_array._export_to_c)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b061cb6f",
"metadata": {},
"outputs": [],
"source": [
"# Allocate one C struct of each kind through cffi; the resulting pointers are usable in C and C++.\n",
"# See: https://cffi.readthedocs.io/en/latest/using.html#working-with-pointers-structures-and-arrays\n",
"c_schema = ffi.new(\"struct ArrowSchema*\")\n",
"c_array = ffi.new(\"struct ArrowArray*\")\n",
"\n",
"# `_export_to_c` documents its pointer arguments as `int`, hence the cast of each\n",
"# cffi pointer to `uintptr_t` and then to a Python integer.\n",
"c_schema_ptr = int(ffi.cast(\"uintptr_t\", c_schema))\n",
"c_array_ptr = int(ffi.cast(\"uintptr_t\", c_array))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ee41451a",
"metadata": {},
"outputs": [],
"source": [
"# Fill the C structs allocated with cffi: the array's type is exported to the\n",
"# ArrowSchema struct and the array data to the ArrowArray struct.\n",
"first_pyarrow_array.type._export_to_c(c_schema_ptr)\n",
"first_pyarrow_array._export_to_c(c_array_ptr)\n",
"\n",
"# Import the type back from the ArrowSchema struct\n",
"deserialized_schema = pa.DataType._import_from_c(c_schema_ptr)\n",
"\n",
"# Import the array back from the ArrowArray struct, using the re-imported type\n",
"deserialized_first_pyarrow_array = pa.Array._import_from_c(c_array_ptr, deserialized_schema)\n",
"\n",
"# The export/import round trip must give back an array equal to the original\n",
"assert deserialized_first_pyarrow_array.equals(first_pyarrow_array)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb4f412c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment