Skip to content

Instantly share code, notes, and snippets.

@rjzamora
Created May 30, 2024 18:24
Show Gist options
  • Save rjzamora/46a68955a0443790e3d8c972ec606d67 to your computer and use it in GitHub Desktop.
Save rjzamora/46a68955a0443790e3d8c972ec606d67 to your computer and use it in GitHub Desktop.
Multi-file json read experiments
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "9b86d06a-962e-4f04-9aab-55a6a49734dc",
"metadata": {},
"source": [
"# Multi-file JSON Read Experiments"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "4986df6e-fdb8-470c-9e87-c85def0ac47d",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import fsspec\n",
"import glob\n",
"from functools import partial\n",
"from concurrent.futures import ThreadPoolExecutor\n",
"\n",
"import cudf\n",
"\n",
"import pyarrow as pa\n",
"from pyarrow import json\n",
"from pyarrow.json import ReadOptions\n",
"\n",
"import dask\n",
"import dask.dataframe as dd\n",
"from dask.base import tokenize\n",
"from dask.distributed import get_worker\n",
"\n",
"\n",
"def _read_json_arrow(path):\n",
" # Wraps pyarrow's json reader.\n",
" # Seems like we need to use a \"large\" block_size to\n",
" # make sure each contiguous read captures a delimiter\n",
" read_options=ReadOptions(block_size=5*2**20)\n",
" return json.read_json(path, read_options=read_options)\n",
"\n",
"\n",
"def custom_read_pyarrow(paths):\n",
" # Use a local threaded scheduler to read json files\n",
" # with pyarrow, and then convert to cudf\n",
" token = tokenize(paths)\n",
" name = f\"read-{token}\"\n",
" chunk_name = f\"read-chunk-{token}\"\n",
" dsk = {\n",
" (chunk_name, i): (_read_json_arrow, path)\n",
" for i, path in enumerate(paths)\n",
" }\n",
" dsk[chunk_name] = (pa.concat_tables, list(dsk.keys()))\n",
"\n",
" try:\n",
" worker = get_worker()\n",
" except ValueError:\n",
" return cudf.DataFrame.from_arrow(dask.threaded.get(dsk, chunk_name))\n",
"\n",
" if not hasattr(worker, \"_rapids_executor\"):\n",
" num_threads = len(os.sched_getaffinity(0)) - 1\n",
" worker._rapids_executor = ThreadPoolExecutor(num_threads)\n",
" with dask.config.set(pool=worker._rapids_executor):\n",
" return cudf.DataFrame.from_arrow(dask.threaded.get(dsk, chunk_name))\n",
"\n",
"\n",
"def custom_read_bytes(paths):\n",
" # Use a local threaded scheduler to move json bytes\n",
" # inot host memory, and then read with cudf\n",
" fs = fsspec.core.get_fs_token_paths(paths[0])[0] \n",
" token = tokenize(paths)\n",
" chunk_name = f\"read-chunk-{token}\"\n",
" dsk = {\n",
" (chunk_name, i): (fs.cat_file, path)\n",
" for i, path in enumerate(paths)\n",
" }\n",
" dsk[chunk_name] = (partial(cudf.read_json, lines=True), list(dsk.keys()))\n",
"\n",
" try:\n",
" worker = get_worker()\n",
" except ValueError:\n",
" return dask.threaded.get(dsk, chunk_name)\n",
"\n",
" if not hasattr(worker, \"_rapids_executor\"):\n",
" num_threads = len(os.sched_getaffinity(0)) - 1\n",
" worker._rapids_executor = ThreadPoolExecutor(num_threads)\n",
" with dask.config.set(pool=worker._rapids_executor):\n",
" return dask.threaded.get(dsk, chunk_name)\n",
"\n",
"\n",
"def custom_read_ranges(path_list, blocksize=4_000_000):\n",
" # Use cat_ranges to transfer data for\n",
" # multiple files at once. Then read\n",
" # in-memory bytes with cudf. This \"should\"\n",
" # be the \"fastest\" way to use fsspec alone.\n",
" fs = fsspec.core.get_fs_token_paths(path_list[0])[0] \n",
" paths, starts, ends = [], [], []\n",
" for i, size in enumerate(fs.sizes(path_list)):\n",
" blocksize = blocksize or size\n",
" for j in range(0, size, blocksize):\n",
" paths.append(path_list[i])\n",
" starts.append(j)\n",
" ends.append(min(j + blocksize, size))\n",
" return cudf.read_json(\n",
" b\"\".join(fs.cat_ranges(paths, starts, ends)),\n",
" lines=True,\n",
" )\n",
"\n",
"\n",
"def custom_read_serial(paths):\n",
" # Serialized pyarrow reads, followed by cudf conversion\n",
" return cudf.DataFrame.from_arrow(\n",
" pa.concat_tables([_read_json(path) for path in paths])\n",
" )\n",
"\n",
"\n",
"def native_read(paths):\n",
" # Native multi-file read with cudf\n",
" return cudf.read_json(paths, lines=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9e0878b8-c45f-42c3-a5cb-d31184ca8b6d",
"metadata": {},
"outputs": [],
"source": [
"storage = \"local\"\n",
"if storage == \"local\":\n",
" # Read from fast local storage\n",
" file_per_partition = 32\n",
" root = \"/raid/rzamora/100gb/\"\n",
" paths = list(glob.glob(root + \"*.jsonl\"))\n",
"elif storage == \"s3\":\n",
" # Read from s3 storage\n",
" s3_root = \"s3://curator-data/1_tb_dataset\"\n",
" file_per_partition = 32\n",
" fs = fsspec.core.get_fs_token_paths(s3_root)[0]\n",
" # Only take the first 1000 files for the sake of testing\n",
" paths = [\"s3://\" + path for path in fs.ls(s3_root)[:1000]]\n",
"else:\n",
" raise ValueError"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6bef7e81-70d5-47fa-80ff-9ffebe6920e2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"388 ms ± 20.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"df = custom_read_bytes(paths[:file_per_partition])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89cf25b2-b3c4-45a8-82f9-980a9eb5eee0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.3 s ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"df = custom_read_ranges(paths[:file_per_partition])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d9eff806-f950-40d6-b754-7e83d347f005",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The slowest run took 4.89 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"1.45 s ± 1.12 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"df = native_read(paths[:file_per_partition])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a9824b58-c092-45fb-9fba-2cc36c1cd3be",
"metadata": {},
"outputs": [],
"source": [
"from dask_cuda import LocalCUDACluster\n",
"from distributed import LocalCluster, Client\n",
"\n",
"client = Client(LocalCUDACluster(rmm_pool_size=\"24GB\"))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fe9d6055-2c38-4be5-8592-e03330719356",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-82c55c4c-1eb0-11ef-92e2-d8c49764f70a</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
"\n",
" <tr>\n",
" \n",
" <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
" <td style=\"text-align: left;\"><strong>Cluster type:</strong> dask_cuda.LocalCUDACluster</td>\n",
" \n",
" </tr>\n",
"\n",
" \n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" \n",
"\n",
" </table>\n",
"\n",
" \n",
"\n",
" \n",
" <details>\n",
" <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
" <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
" </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCUDACluster</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">f752437b</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Workers:</strong> 8\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads:</strong> 8\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total memory:</strong> 0.98 TiB\n",
" </td>\n",
" </tr>\n",
" \n",
" <tr>\n",
" <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
" <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
"</tr>\n",
"\n",
" \n",
" </table>\n",
"\n",
" <details>\n",
" <summary style=\"margin-bottom: 20px;\">\n",
" <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
" </summary>\n",
"\n",
" <div style=\"\">\n",
" <div>\n",
" <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-214cc3f7-31ac-48b3-b86c-5303345b8bb2</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm:</strong> tcp://127.0.0.1:42589\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Workers:</strong> 8\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads:</strong> 8\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Started:</strong> Just now\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total memory:</strong> 0.98 TiB\n",
" </td>\n",
" </tr>\n",
" </table>\n",
" </div>\n",
" </div>\n",
"\n",
" <details style=\"margin-left: 48px;\">\n",
" <summary style=\"margin-bottom: 20px;\">\n",
" <h3 style=\"display: inline;\">Workers</h3>\n",
" </summary>\n",
"\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:43385\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 1\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37923/status\" target=\"_blank\">http://127.0.0.1:37923/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 125.97 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:44069\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /raid/dask-space/rzamora/dask-space/dask-scratch-space/worker-9y711t7a\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:34305\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 1\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37723/status\" target=\"_blank\">http://127.0.0.1:37723/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 125.97 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:33309\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /raid/dask-space/rzamora/dask-space/dask-scratch-space/worker-4q_ue4z2\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:43275\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 1\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33789/status\" target=\"_blank\">http://127.0.0.1:33789/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 125.97 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:36037\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /raid/dask-space/rzamora/dask-space/dask-scratch-space/worker-s8t1vz3a\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:41713\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 1\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33881/status\" target=\"_blank\">http://127.0.0.1:33881/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 125.97 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:36839\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /raid/dask-space/rzamora/dask-space/dask-scratch-space/worker-z5asdg4m\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 4</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:35989\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 1\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:45883/status\" target=\"_blank\">http://127.0.0.1:45883/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 125.97 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:41513\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /raid/dask-space/rzamora/dask-space/dask-scratch-space/worker-a8x3l6go\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 5</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:39481\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 1\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:35183/status\" target=\"_blank\">http://127.0.0.1:35183/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 125.97 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:39699\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /raid/dask-space/rzamora/dask-space/dask-scratch-space/worker-vbxlz471\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 6</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:33645\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 1\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:46299/status\" target=\"_blank\">http://127.0.0.1:46299/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 125.97 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:46747\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /raid/dask-space/rzamora/dask-space/dask-scratch-space/worker-dqiisl7y\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 7</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:40797\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 1\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33757/status\" target=\"_blank\">http://127.0.0.1:33757/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 125.97 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:41987\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /raid/dask-space/rzamora/dask-space/dask-scratch-space/worker-kvj8b20u\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
"\n",
" </details>\n",
"</div>\n",
"\n",
" </details>\n",
" </div>\n",
"</div>\n",
" </details>\n",
" \n",
"\n",
" </div>\n",
"</div>"
],
"text/plain": [
"<Client: 'tcp://127.0.0.1:42589' processes=8 threads=8, memory=0.98 TiB>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c167ec97-43ce-4be6-abc4-fae428667bbd",
"metadata": {},
"outputs": [],
"source": [
"inputs = [\n",
" paths[start:start + file_per_partition]\n",
" for start in range(0, len(paths), file_per_partition)\n",
"]\n",
"meta = native_read(paths[0])\n",
"df_custom_fsspec = dd.from_map(custom_read_bytes, inputs, meta=meta)\n",
"df_custom_ranges = dd.from_map(custom_read_ranges, inputs, meta=meta)\n",
"df_native_cudf = dd.from_map(native_read, inputs, meta=meta)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "12231207-a246-4769-b7c4-608d853c3ee8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.98 s, sys: 817 ms, total: 3.79 s\n",
"Wall time: 10.8 s\n"
]
},
{
"data": {
"text/plain": [
"21925118"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"len(df_custom_fsspec)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b7b746c9-7965-46cc-a3ce-3ed477b13af3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.55 s, sys: 980 ms, total: 5.53 s\n",
"Wall time: 25.6 s\n"
]
},
{
"data": {
"text/plain": [
"21925118"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"len(df_custom_ranges)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d6fc0741-c75f-4905-8237-2abd59cc791d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8.83 s, sys: 6.61 s, total: 15.4 s\n",
"Wall time: 1min 9s\n"
]
},
{
"data": {
"text/plain": [
"21925118"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"len(df_native_cudf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbeea4f3-bdfa-4f9f-8800-10d6b9e8aaea",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment