fjetter/example_parquet_dask.ipynb

## example_parquet_dask.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "e57d7139-a175-4671-a294-c70cf7353b97",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.datasets import timeseries\n",
    "\n",
    "ddf_jan = timeseries(\n",
    "    start='2000-01-01',\n",
    "    end='2000-02-01',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "6a4295b1-fd87-4202-88fa-772ea711f762",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>timestamp</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:00</th>\n",
       "      <td>1063</td>\n",
       "      <td>Hannah</td>\n",
       "      <td>0.276926</td>\n",
       "      <td>0.058821</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:01</th>\n",
       "      <td>1012</td>\n",
       "      <td>Yvonne</td>\n",
       "      <td>0.472681</td>\n",
       "      <td>-0.218207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:02</th>\n",
       "      <td>971</td>\n",
       "      <td>Zelda</td>\n",
       "      <td>-0.364523</td>\n",
       "      <td>-0.519655</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:03</th>\n",
       "      <td>1023</td>\n",
       "      <td>Dan</td>\n",
       "      <td>-0.282011</td>\n",
       "      <td>0.794970</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:04</th>\n",
       "      <td>1030</td>\n",
       "      <td>Quinn</td>\n",
       "      <td>0.215620</td>\n",
       "      <td>0.043371</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       id    name         x         y\n",
       "timestamp                                            \n",
       "2000-01-01 00:00:00  1063  Hannah  0.276926  0.058821\n",
       "2000-01-01 00:00:01  1012  Yvonne  0.472681 -0.218207\n",
       "2000-01-01 00:00:02   971   Zelda -0.364523 -0.519655\n",
       "2000-01-01 00:00:03  1023     Dan -0.282011  0.794970\n",
       "2000-01-01 00:00:04  1030   Quinn  0.215620  0.043371"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ddf_jan.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "5bf42ecd-da81-4220-a501-2ac7c2a63a2c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>npartitions=31</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>int64</td>\n",
       "      <td>object</td>\n",
       "      <td>float64</td>\n",
       "      <td>float64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-31</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-02-01</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "<div>Dask Name: make-timeseries, 31 tasks</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       "                   id    name        x        y\n",
       "npartitions=31                                 \n",
       "2000-01-01      int64  object  float64  float64\n",
       "2000-01-02        ...     ...      ...      ...\n",
       "...               ...     ...      ...      ...\n",
       "2000-01-31        ...     ...      ...      ...\n",
       "2000-02-01        ...     ...      ...      ...\n",
       "Dask Name: make-timeseries, 31 tasks"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ddf_jan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "155251e1-e39a-4da7-8461-93482996a058",
   "metadata": {},
   "outputs": [],
   "source": [
    "ddf_jan.to_parquet(\n",
    "    path=\"my_first_dataset\", # File path\n",
    "    engine=\"pyarrow\", # parquet implementation\n",
    "    \n",
    "    # some file format specific stuff\n",
    "    compression={\"name\": \"snappy\"},\n",
    "    \n",
    "    # Format specific\n",
    "    partition_on=[\"name\"],\n",
    "    \n",
    "    # Compute engine specifics (i.e. stuff for basic dask)\n",
    "    compute=True,\n",
    "    \n",
    "    # storage backend stuff (S3, GCS, ...)\n",
    "    storage_options={},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9906910a-840e-4d03-aa3a-c9f0e014c936",
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask.dataframe as dd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "859aad0d-5567-4ea8-adf2-95d8413274c3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>timestamp</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:07</th>\n",
       "      <td>1010</td>\n",
       "      <td>0.284375</td>\n",
       "      <td>-0.963815</td>\n",
       "      <td>Alice</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:13</th>\n",
       "      <td>1034</td>\n",
       "      <td>0.551905</td>\n",
       "      <td>0.383799</td>\n",
       "      <td>Alice</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:15</th>\n",
       "      <td>1025</td>\n",
       "      <td>-0.611963</td>\n",
       "      <td>0.314208</td>\n",
       "      <td>Alice</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:33</th>\n",
       "      <td>1042</td>\n",
       "      <td>0.075010</td>\n",
       "      <td>-0.713647</td>\n",
       "      <td>Alice</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-01 00:00:40</th>\n",
       "      <td>952</td>\n",
       "      <td>0.079303</td>\n",
       "      <td>0.389080</td>\n",
       "      <td>Alice</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       id         x         y   name\n",
       "timestamp                                           \n",
       "2000-01-01 00:00:07  1010  0.284375 -0.963815  Alice\n",
       "2000-01-01 00:00:13  1034  0.551905  0.383799  Alice\n",
       "2000-01-01 00:00:15  1025 -0.611963  0.314208  Alice\n",
       "2000-01-01 00:00:33  1042  0.075010 -0.713647  Alice\n",
       "2000-01-01 00:00:40   952  0.079303  0.389080  Alice"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# minimal example\n",
    "ddf = dd.read_parquet(\n",
    "    path=\"my_first_dataset\",\n",
    ")\n",
    "ddf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "ab84529c-1e49-45d4-b86d-1153aadae59a",
   "metadata": {},
   "outputs": [],
   "source": [
    "ddf = dd.read_parquet(\n",
    "    path=\"my_first_dataset\", # File path\n",
    "    columns=[\"name\", \"x\"],\n",
    "    # Fitlers have to be in disjunctive normal form\n",
    "    filters=[\n",
    "        [\n",
    "            (\"x\", \">\", 0.0), # OR\n",
    "            (\"y\", \"<\", 1.0),\n",
    "        ],\n",
    "        # AND\n",
    "        [\n",
    "            (\"x\", \">\", 5.0),\n",
    "            (\"name\", \"==\", \"Ingrid\"),\n",
    "        ]\n",
    "    ],\n",
    "    \n",
    "    # Hints for sizes of splits/tasks. Task per RG, per file, multiple files?\n",
    "    split_row_groups=None,\n",
    "    chunksize=None,\n",
    "    aggregate_files=None,\n",
    "    \n",
    "    # Other hints for query planner. Ugly implementation specifics\n",
    "    gather_statistics=None,\n",
    "    ignore_metadata_file=False,\n",
    "\n",
    "\n",
    "    # How to materialize. Impacts the output data dtype (e.g. dictionary encoding vs plain, etc.)\n",
    "    categories=None,\n",
    "    index=None,\n",
    "     \n",
    "    # storage backend stuff (S3, GCS, ...)\n",
    "    storage_options={},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "561e8fee-93dc-48ec-aff3-13b3e312897b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>x</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>npartitions=806</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>category[known]</td>\n",
       "      <td>float64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "<div>Dask Name: read-parquet, 806 tasks</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       "                            name        x\n",
       "npartitions=806                          \n",
       "                 category[known]  float64\n",
       "                             ...      ...\n",
       "...                          ...      ...\n",
       "                             ...      ...\n",
       "                             ...      ...\n",
       "Dask Name: read-parquet, 806 tasks"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ddf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "4b4d1947-8595-47bd-a1c3-4db5d6cea993",
   "metadata": {},
   "outputs": [],
   "source": [
    "timeseries(\n",
    "    start='2000-02-01',\n",
    "    end='2000-03-01',\n",
    ").to_parquet(\n",
    "    path=\"my_first_dataset\", # File path\n",
    "    engine=\"pyarrow\", # parquet implementation\n",
    "    # Format specific\n",
    "    partition_on=[\"name\"],\n",
    "    append=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "591456d4-4c42-45a8-a099-307250434d8b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5184000"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(\n",
    "    dd.read_parquet(\n",
    "        path=\"my_first_dataset\",\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "3cf6fa24-aca8-4198-9e16-2e67339ec10e",
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Appended columns not the same.\nPrevious: ['id', 'x', 'y', 'timestamp'] | New: ['timestamp', 'name', 'x']",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m/var/folders/h0/kd1gdptx7gzb6cbywplfrx1r0000gn/T/ipykernel_12127/535303701.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m timeseries(\n\u001b[0m\u001b[1;32m      2\u001b[0m     \u001b[0mstart\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'2000-02-01'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0mend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'2000-03-01'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"x\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"my_first_dataset\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# File path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/workspace/dask/dask/dataframe/core.py\u001b[0m in \u001b[0;36mto_parquet\u001b[0;34m(self, path, *args, **kwargs)\u001b[0m\n\u001b[1;32m   4558\u001b[0m         \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mio\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mto_parquet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4560\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mto_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4561\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4562\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mto_orc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/workspace/dask/dask/dataframe/io/parquet/core.py\u001b[0m in \u001b[0;36mto_parquet\u001b[0;34m(df, path, engine, compression, write_index, append, overwrite, ignore_divisions, partition_on, storage_options, custom_metadata, write_metadata_file, compute, compute_kwargs, schema, **kwargs)\u001b[0m\n\u001b[1;32m    642\u001b[0m     \u001b[0;31m# Engine-specific initialization steps to write the dataset.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    643\u001b[0m     \u001b[0;31m# Possibly create parquet metadata, and load existing stuff if appending\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 644\u001b[0;31m     meta, schema, i_offset = engine.initialize_write(\n\u001b[0m\u001b[1;32m    645\u001b[0m         \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    646\u001b[0m         \u001b[0mfs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/workspace/dask/dask/dataframe/io/parquet/arrow.py\u001b[0m in \u001b[0;36minitialize_write\u001b[0;34m(cls, df, fs, path, append, partition_on, ignore_divisions, division_info, schema, index_cols, **kwargs)\u001b[0m\n\u001b[1;32m    847\u001b[0m             \u001b[0mdtypes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_pyarrow_dtypes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrow_schema\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcategories\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    848\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnames\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpartition_on\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 849\u001b[0;31m                 raise ValueError(\n\u001b[0m\u001b[1;32m    850\u001b[0m                     \u001b[0;34m\"Appended columns not the same.\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    851\u001b[0m                     \u001b[0;34m\"Previous: {} | New: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnames\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Appended columns not the same.\nPrevious: ['id', 'x', 'y', 'timestamp'] | New: ['timestamp', 'name', 'x']"
     ]
    }
   ],
   "source": [
    "# Raises ValueError: the appended columns do not match the schema of the existing dataset\n",
    "timeseries(\n",
    "    start='2000-02-01',\n",
    "    end='2000-03-01',\n",
    ")[[\"name\", \"x\"]].to_parquet(\n",
    "    path=\"my_first_dataset\", # File path\n",
    "    engine=\"pyarrow\", # parquet implementation\n",
    "    # Format specific\n",
    "    partition_on=[\"name\"],\n",
    "    append=True,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 32,
	"id": "e57d7139-a175-4671-a294-c70cf7353b97",
	"metadata": {},
	"outputs": [],
	"source": [
	"from dask.datasets import timeseries\n",
	"\n",
	"ddf_jan = timeseries(\n",
	" start='2000-01-01',\n",
	" end='2000-02-01',\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"id": "6a4295b1-fd87-4202-88fa-772ea711f762",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>name</th>\n",
	" <th>x</th>\n",
	" <th>y</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>timestamp</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:00</th>\n",
	" <td>1063</td>\n",
	" <td>Hannah</td>\n",
	" <td>0.276926</td>\n",
	" <td>0.058821</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:01</th>\n",
	" <td>1012</td>\n",
	" <td>Yvonne</td>\n",
	" <td>0.472681</td>\n",
	" <td>-0.218207</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:02</th>\n",
	" <td>971</td>\n",
	" <td>Zelda</td>\n",
	" <td>-0.364523</td>\n",
	" <td>-0.519655</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:03</th>\n",
	" <td>1023</td>\n",
	" <td>Dan</td>\n",
	" <td>-0.282011</td>\n",
	" <td>0.794970</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:04</th>\n",
	" <td>1030</td>\n",
	" <td>Quinn</td>\n",
	" <td>0.215620</td>\n",
	" <td>0.043371</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" id name x y\n",
	"timestamp \n",
	"2000-01-01 00:00:00 1063 Hannah 0.276926 0.058821\n",
	"2000-01-01 00:00:01 1012 Yvonne 0.472681 -0.218207\n",
	"2000-01-01 00:00:02 971 Zelda -0.364523 -0.519655\n",
	"2000-01-01 00:00:03 1023 Dan -0.282011 0.794970\n",
	"2000-01-01 00:00:04 1030 Quinn 0.215620 0.043371"
	]
	},
	"execution_count": 33,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"ddf_jan.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"id": "5bf42ecd-da81-4220-a501-2ac7c2a63a2c",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div><strong>Dask DataFrame Structure:</strong></div>\n",
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>name</th>\n",
	" <th>x</th>\n",
	" <th>y</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>npartitions=31</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2000-01-01</th>\n",
	" <td>int64</td>\n",
	" <td>object</td>\n",
	" <td>float64</td>\n",
	" <td>float64</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-02</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-31</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-02-01</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>\n",
	"<div>Dask Name: make-timeseries, 31 tasks</div>"
	],
	"text/plain": [
	"Dask DataFrame Structure:\n",
	" id name x y\n",
	"npartitions=31 \n",
	"2000-01-01 int64 object float64 float64\n",
	"2000-01-02 ... ... ... ...\n",
	"... ... ... ... ...\n",
	"2000-01-31 ... ... ... ...\n",
	"2000-02-01 ... ... ... ...\n",
	"Dask Name: make-timeseries, 31 tasks"
	]
	},
	"execution_count": 35,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"ddf_jan"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"id": "155251e1-e39a-4da7-8461-93482996a058",
	"metadata": {},
	"outputs": [],
	"source": [
	"ddf_jan.to_parquet(\n",
	" path=\"my_first_dataset\", # File path\n",
	" engine=\"pyarrow\", # parquet implementation\n",
	" \n",
	" # some file format specific stuff\n",
	" compression={\"name\": \"snappy\"},\n",
	" \n",
	" # Format specific\n",
	" partition_on=[\"name\"],\n",
	" \n",
	" # Compute engine specifics (i.e. stuff for basic dask)\n",
	" compute=True,\n",
	" \n",
	" # storage backend stuff (S3, GCS, ...)\n",
	" storage_options={},\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "9906910a-840e-4d03-aa3a-c9f0e014c936",
	"metadata": {},
	"outputs": [],
	"source": [
	"import dask.dataframe as dd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"id": "859aad0d-5567-4ea8-adf2-95d8413274c3",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>x</th>\n",
	" <th>y</th>\n",
	" <th>name</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>timestamp</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:07</th>\n",
	" <td>1010</td>\n",
	" <td>0.284375</td>\n",
	" <td>-0.963815</td>\n",
	" <td>Alice</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:13</th>\n",
	" <td>1034</td>\n",
	" <td>0.551905</td>\n",
	" <td>0.383799</td>\n",
	" <td>Alice</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:15</th>\n",
	" <td>1025</td>\n",
	" <td>-0.611963</td>\n",
	" <td>0.314208</td>\n",
	" <td>Alice</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:33</th>\n",
	" <td>1042</td>\n",
	" <td>0.075010</td>\n",
	" <td>-0.713647</td>\n",
	" <td>Alice</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2000-01-01 00:00:40</th>\n",
	" <td>952</td>\n",
	" <td>0.079303</td>\n",
	" <td>0.389080</td>\n",
	" <td>Alice</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" id x y name\n",
	"timestamp \n",
	"2000-01-01 00:00:07 1010 0.284375 -0.963815 Alice\n",
	"2000-01-01 00:00:13 1034 0.551905 0.383799 Alice\n",
	"2000-01-01 00:00:15 1025 -0.611963 0.314208 Alice\n",
	"2000-01-01 00:00:33 1042 0.075010 -0.713647 Alice\n",
	"2000-01-01 00:00:40 952 0.079303 0.389080 Alice"
	]
	},
	"execution_count": 29,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# minimal example\n",
	"ddf = dd.read_parquet(\n",
	" path=\"my_first_dataset\",\n",
	")\n",
	"ddf.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"id": "ab84529c-1e49-45d4-b86d-1153aadae59a",
	"metadata": {},
	"outputs": [],
	"source": [
	"ddf = dd.read_parquet(\n",
	" path=\"my_first_dataset\", # File path\n",
	" columns=[\"name\", \"x\"],\n",
	" # Filters have to be in disjunctive normal form: predicates inside an inner list are ANDed, the inner lists are ORed\n",
	" filters=[\n",
	" [\n",
	" (\"x\", \">\", 0.0), # AND\n",
	" (\"y\", \"<\", 1.0),\n",
	" ],\n",
	" # OR\n",
	" [\n",
	" (\"x\", \">\", 5.0),\n",
	" (\"name\", \"==\", \"Ingrid\"),\n",
	" ]\n",
	" ],\n",
	" \n",
	" # Hints for sizes of splits/tasks. Task per RG, per file, multiple files?\n",
	" split_row_groups=None,\n",
	" chunksize=None,\n",
	" aggregate_files=None,\n",
	" \n",
	" # Other hints for query planner. Ugly implementation specifics\n",
	" gather_statistics=None,\n",
	" ignore_metadata_file=False,\n",
	"\n",
	"\n",
	" # How to materialize. Impacts the output data dtype (e.g. dictionary encoding vs plain, etc.)\n",
	" categories=None,\n",
	" index=None,\n",
	" \n",
	" # storage backend stuff (S3, GCS, ...)\n",
	" storage_options={},\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"id": "561e8fee-93dc-48ec-aff3-13b3e312897b",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div><strong>Dask DataFrame Structure:</strong></div>\n",
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>name</th>\n",
	" <th>x</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>npartitions=806</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>category[known]</td>\n",
	" <td>float64</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>\n",
	"<div>Dask Name: read-parquet, 806 tasks</div>"
	],
	"text/plain": [
	"Dask DataFrame Structure:\n",
	" name x\n",
	"npartitions=806 \n",
	" category[known] float64\n",
	" ... ...\n",
	"... ... ...\n",
	" ... ...\n",
	" ... ...\n",
	"Dask Name: read-parquet, 806 tasks"
	]
	},
	"execution_count": 31,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"ddf"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"id": "4b4d1947-8595-47bd-a1c3-4db5d6cea993",
	"metadata": {},
	"outputs": [],
	"source": [
	"timeseries(\n",
	" start='2000-02-01',\n",
	" end='2000-03-01',\n",
	").to_parquet(\n",
	" path=\"my_first_dataset\", # File path\n",
	" engine=\"pyarrow\", # parquet implementation\n",
	" # Format specific\n",
	" partition_on=[\"name\"],\n",
	" append=True,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"id": "591456d4-4c42-45a8-a099-307250434d8b",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"5184000"
	]
	},
	"execution_count": 37,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(\n",
	" dd.read_parquet(\n",
	" path=\"my_first_dataset\",\n",
	" )\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"id": "3cf6fa24-aca8-4198-9e16-2e67339ec10e",
	"metadata": {},
	"outputs": [
	{
	"ename": "ValueError",
	"evalue": "Appended columns not the same.\nPrevious: ['id', 'x', 'y', 'timestamp'] \| New: ['timestamp', 'name', 'x']",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m/var/folders/h0/kd1gdptx7gzb6cbywplfrx1r0000gn/T/ipykernel_12127/535303701.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m timeseries(\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'2000-02-01'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'2000-03-01'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"x\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"my_first_dataset\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# File path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m~/workspace/dask/dask/dataframe/core.py\u001b[0m in \u001b[0;36mto_parquet\u001b[0;34m(self, path, args, kwargs)\u001b[0m\n\u001b[1;32m 4558\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mio\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mto_parquet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4560\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mto_parquet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4561\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4562\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mto_orc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m~/workspace/dask/dask/dataframe/io/parquet/core.py\u001b[0m in \u001b[0;36mto_parquet\u001b[0;34m(df, path, engine, compression, write_index, append, overwrite, ignore_divisions, partition_on, storage_options, custom_metadata, write_metadata_file, compute, compute_kwargs, schema, **kwargs)\u001b[0m\n\u001b[1;32m 642\u001b[0m \u001b[0;31m# Engine-specific initialization steps to write the dataset.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 643\u001b[0m \u001b[0;31m# Possibly create parquet metadata, and load existing stuff if appending\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 644\u001b[0;31m meta, schema, i_offset = engine.initialize_write(\n\u001b[0m\u001b[1;32m 645\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0mfs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m~/workspace/dask/dask/dataframe/io/parquet/arrow.py\u001b[0m in \u001b[0;36minitialize_write\u001b[0;34m(cls, df, fs, path, append, partition_on, ignore_divisions, division_info, schema, index_cols, **kwargs)\u001b[0m\n\u001b[1;32m 847\u001b[0m \u001b[0mdtypes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_pyarrow_dtypes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrow_schema\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcategories\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnames\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpartition_on\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 849\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 850\u001b[0m \u001b[0;34m\"Appended columns not the same.\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 851\u001b[0m \u001b[0;34m\"Previous: {} \| New: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnames\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mValueError\u001b[0m: Appended columns not the same.\nPrevious: ['id', 'x', 'y', 'timestamp'] \| New: ['timestamp', 'name', 'x']"
	]
	}
	],
	"source": [
	"# Value error due to schema errors\n",
	"timeseries(\n",
	" start='2000-02-01',\n",
	" end='2000-03-01',\n",
	")[[\"name\", \"x\"]].to_parquet(\n",
	" path=\"my_first_dataset\", # File path\n",
	" engine=\"pyarrow\", # parquet implementation\n",
	" # Format specific\n",
	" partition_on=[\"name\"],\n",
	" append=True,\n",
	")"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}