Created
October 12, 2021 17:34
-
-
Save fjetter/86d009f9365dbd4864aa13946f69b758 to your computer and use it in GitHub Desktop.
Parquet read+write example dask
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"id": "e57d7139-a175-4671-a294-c70cf7353b97", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from dask.datasets import timeseries\n", | |
"\n", | |
"ddf_jan = timeseries(\n", | |
" start='2000-01-01',\n", | |
" end='2000-02-01',\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"id": "6a4295b1-fd87-4202-88fa-772ea711f762", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>name</th>\n", | |
" <th>x</th>\n", | |
" <th>y</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>timestamp</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:00</th>\n", | |
" <td>1063</td>\n", | |
" <td>Hannah</td>\n", | |
" <td>0.276926</td>\n", | |
" <td>0.058821</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:01</th>\n", | |
" <td>1012</td>\n", | |
" <td>Yvonne</td>\n", | |
" <td>0.472681</td>\n", | |
" <td>-0.218207</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:02</th>\n", | |
" <td>971</td>\n", | |
" <td>Zelda</td>\n", | |
" <td>-0.364523</td>\n", | |
" <td>-0.519655</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:03</th>\n", | |
" <td>1023</td>\n", | |
" <td>Dan</td>\n", | |
" <td>-0.282011</td>\n", | |
" <td>0.794970</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:04</th>\n", | |
" <td>1030</td>\n", | |
" <td>Quinn</td>\n", | |
" <td>0.215620</td>\n", | |
" <td>0.043371</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id name x y\n", | |
"timestamp \n", | |
"2000-01-01 00:00:00 1063 Hannah 0.276926 0.058821\n", | |
"2000-01-01 00:00:01 1012 Yvonne 0.472681 -0.218207\n", | |
"2000-01-01 00:00:02 971 Zelda -0.364523 -0.519655\n", | |
"2000-01-01 00:00:03 1023 Dan -0.282011 0.794970\n", | |
"2000-01-01 00:00:04 1030 Quinn 0.215620 0.043371" | |
] | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ddf_jan.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"id": "5bf42ecd-da81-4220-a501-2ac7c2a63a2c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div><strong>Dask DataFrame Structure:</strong></div>\n", | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>name</th>\n", | |
" <th>x</th>\n", | |
" <th>y</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>npartitions=31</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2000-01-01</th>\n", | |
" <td>int64</td>\n", | |
" <td>object</td>\n", | |
" <td>float64</td>\n", | |
" <td>float64</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-02</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-31</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-02-01</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
"<div>Dask Name: make-timeseries, 31 tasks</div>" | |
], | |
"text/plain": [ | |
"Dask DataFrame Structure:\n", | |
" id name x y\n", | |
"npartitions=31 \n", | |
"2000-01-01 int64 object float64 float64\n", | |
"2000-01-02 ... ... ... ...\n", | |
"... ... ... ... ...\n", | |
"2000-01-31 ... ... ... ...\n", | |
"2000-02-01 ... ... ... ...\n", | |
"Dask Name: make-timeseries, 31 tasks" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ddf_jan" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "155251e1-e39a-4da7-8461-93482996a058", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ddf_jan.to_parquet(\n", | |
" path=\"my_first_dataset\", # File path\n", | |
" engine=\"pyarrow\", # parquet implementation\n", | |
" \n", | |
" # some file format specific stuff\n", | |
" compression={\"name\": \"snappy\"},\n", | |
" \n", | |
" # Format specific\n", | |
" partition_on=[\"name\"],\n", | |
" \n", | |
" # Compute engine specifics (i.e. stuff for basic dask)\n", | |
" compute=True,\n", | |
" \n", | |
" # storage backend stuff (S3, GCS, ...)\n", | |
" storage_options={},\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "9906910a-840e-4d03-aa3a-c9f0e014c936", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import dask.dataframe as dd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "859aad0d-5567-4ea8-adf2-95d8413274c3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>x</th>\n", | |
" <th>y</th>\n", | |
" <th>name</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>timestamp</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:07</th>\n", | |
" <td>1010</td>\n", | |
" <td>0.284375</td>\n", | |
" <td>-0.963815</td>\n", | |
" <td>Alice</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:13</th>\n", | |
" <td>1034</td>\n", | |
" <td>0.551905</td>\n", | |
" <td>0.383799</td>\n", | |
" <td>Alice</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:15</th>\n", | |
" <td>1025</td>\n", | |
" <td>-0.611963</td>\n", | |
" <td>0.314208</td>\n", | |
" <td>Alice</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:33</th>\n", | |
" <td>1042</td>\n", | |
" <td>0.075010</td>\n", | |
" <td>-0.713647</td>\n", | |
" <td>Alice</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2000-01-01 00:00:40</th>\n", | |
" <td>952</td>\n", | |
" <td>0.079303</td>\n", | |
" <td>0.389080</td>\n", | |
" <td>Alice</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id x y name\n", | |
"timestamp \n", | |
"2000-01-01 00:00:07 1010 0.284375 -0.963815 Alice\n", | |
"2000-01-01 00:00:13 1034 0.551905 0.383799 Alice\n", | |
"2000-01-01 00:00:15 1025 -0.611963 0.314208 Alice\n", | |
"2000-01-01 00:00:33 1042 0.075010 -0.713647 Alice\n", | |
"2000-01-01 00:00:40 952 0.079303 0.389080 Alice" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# minimal example\n", | |
"ddf = dd.read_parquet(\n", | |
" path=\"my_first_dataset\",\n", | |
")\n", | |
"ddf.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"id": "ab84529c-1e49-45d4-b86d-1153aadae59a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ddf = dd.read_parquet(\n", | |
" path=\"my_first_dataset\", # File path\n", | |
" columns=[\"name\", \"x\"],\n", | |
" # Fitlers have to be in disjunctive normal form\n", | |
" filters=[\n", | |
" [\n", | |
" (\"x\", \">\", 0.0), # OR\n", | |
" (\"y\", \"<\", 1.0),\n", | |
" ],\n", | |
" # AND\n", | |
" [\n", | |
" (\"x\", \">\", 5.0),\n", | |
" (\"name\", \"==\", \"Ingrid\"),\n", | |
" ]\n", | |
" ],\n", | |
" \n", | |
" # Hints for sizes of splits/tasks. Task per RG, per file, multiple files?\n", | |
" split_row_groups=None,\n", | |
" chunksize=None,\n", | |
" aggregate_files=None,\n", | |
" \n", | |
" # Other hints for query planner. Ugly implementation specifics\n", | |
" gather_statistics=None,\n", | |
" ignore_metadata_file=False,\n", | |
"\n", | |
"\n", | |
" # How to materialize. Impacts the output data dtype (e.g. dictionary encoding vs plain, etc.)\n", | |
" categories=None,\n", | |
" index=None,\n", | |
" \n", | |
" # storage backend stuff (S3, GCS, ...)\n", | |
" storage_options={},\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"id": "561e8fee-93dc-48ec-aff3-13b3e312897b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div><strong>Dask DataFrame Structure:</strong></div>\n", | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>x</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>npartitions=806</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <td>category[known]</td>\n", | |
" <td>float64</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
"<div>Dask Name: read-parquet, 806 tasks</div>" | |
], | |
"text/plain": [ | |
"Dask DataFrame Structure:\n", | |
" name x\n", | |
"npartitions=806 \n", | |
" category[known] float64\n", | |
" ... ...\n", | |
"... ... ...\n", | |
" ... ...\n", | |
" ... ...\n", | |
"Dask Name: read-parquet, 806 tasks" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ddf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"id": "4b4d1947-8595-47bd-a1c3-4db5d6cea993", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"timeseries(\n", | |
" start='2000-02-01',\n", | |
" end='2000-03-01',\n", | |
").to_parquet(\n", | |
" path=\"my_first_dataset\", # File path\n", | |
" engine=\"pyarrow\", # parquet implementation\n", | |
" # Format specific\n", | |
" partition_on=[\"name\"],\n", | |
" append=True,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"id": "591456d4-4c42-45a8-a099-307250434d8b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"5184000" | |
] | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(\n", | |
" dd.read_parquet(\n", | |
" path=\"my_first_dataset\",\n", | |
" )\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"id": "3cf6fa24-aca8-4198-9e16-2e67339ec10e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "ValueError", | |
"evalue": "Appended columns not the same.\nPrevious: ['id', 'x', 'y', 'timestamp'] | New: ['timestamp', 'name', 'x']", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m/var/folders/h0/kd1gdptx7gzb6cbywplfrx1r0000gn/T/ipykernel_12127/535303701.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m timeseries(\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'2000-02-01'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'2000-03-01'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"x\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"my_first_dataset\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# File path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/workspace/dask/dask/dataframe/core.py\u001b[0m in \u001b[0;36mto_parquet\u001b[0;34m(self, path, *args, **kwargs)\u001b[0m\n\u001b[1;32m 4558\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mio\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mto_parquet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4560\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mto_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4561\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4562\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mto_orc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/workspace/dask/dask/dataframe/io/parquet/core.py\u001b[0m in \u001b[0;36mto_parquet\u001b[0;34m(df, path, engine, compression, write_index, append, overwrite, ignore_divisions, partition_on, storage_options, custom_metadata, write_metadata_file, compute, compute_kwargs, schema, **kwargs)\u001b[0m\n\u001b[1;32m 642\u001b[0m \u001b[0;31m# Engine-specific initialization steps to write the dataset.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 643\u001b[0m \u001b[0;31m# Possibly create parquet metadata, and load existing stuff if appending\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 644\u001b[0;31m meta, schema, i_offset = engine.initialize_write(\n\u001b[0m\u001b[1;32m 645\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0mfs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/workspace/dask/dask/dataframe/io/parquet/arrow.py\u001b[0m in \u001b[0;36minitialize_write\u001b[0;34m(cls, df, fs, path, append, partition_on, ignore_divisions, division_info, schema, index_cols, **kwargs)\u001b[0m\n\u001b[1;32m 847\u001b[0m \u001b[0mdtypes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_pyarrow_dtypes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrow_schema\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcategories\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnames\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpartition_on\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 849\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 850\u001b[0m \u001b[0;34m\"Appended columns not the same.\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 851\u001b[0m \u001b[0;34m\"Previous: {} | New: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnames\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mValueError\u001b[0m: Appended columns not the same.\nPrevious: ['id', 'x', 'y', 'timestamp'] | New: ['timestamp', 'name', 'x']" | |
] | |
} | |
], | |
"source": [ | |
"# Value error due to schema errors\n", | |
"timeseries(\n", | |
" start='2000-02-01',\n", | |
" end='2000-03-01',\n", | |
")[[\"name\", \"x\"]].to_parquet(\n", | |
" path=\"my_first_dataset\", # File path\n", | |
" engine=\"pyarrow\", # parquet implementation\n", | |
" # Format specific\n", | |
" partition_on=[\"name\"],\n", | |
" append=True,\n", | |
")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment