ihnorton/tiledb-df-demo.ipynb

## tiledb-df-demo.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction\n",
    "\n",
    "This notebook presents several examples of TileDB usage. First, we import a CSV dataset as a 1D dense array, with automatic TileDB schema creation based on inferred CSV column types (loading to/from a Pandas DataFrame in the process). Next, we build a multi-dimensional sparse array with heterogeneous columns manually, and demonstrate slicing along different axes.\n",
    " \n",
    "\n",
    "# Setup\n",
    "\n",
    "- install tiledb:\n",
    "```\n",
    "pip install tiledb\n",
    "```\n",
    "\n",
    "or\n",
    "\n",
    "```\n",
    "conda install tiledb-py\n",
    "```\n",
    "\n",
    "- (also install `pandas` if not already installed)\n",
    "\n",
    "- download data from UCI ML Repository\n",
    "  https://archive.ics.uci.edu/ml/datasets/Bank+Marketing\n",
    "- unzip to usable path\n",
    "  ```\n",
    "  unzip bank.zip\n",
    "  ```\n",
    "- update `csv_path` below if necessary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "import os\n",
    "import tiledb\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "if (os.name == 'nt'):\n",
    "    prefix = \"C:/tmp\"\n",
    "else:\n",
    "    prefix = \"/tmp\"\n",
    "\n",
    "array_uri1 = prefix + \"/tiledb_demo1\"\n",
    "array_uri2 = prefix + \"/tiledb_demo2\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_path = \"bank-full.csv\"\n",
    "\n",
    "# note that this array URI could also be an `s3://` URI with\n",
    "# no extra dependencies:\n",
    "\n",
    "tiledb.from_csv(array_uri1, csv_path, sep=';')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the dataset was saved with column types preserved, and can\n",
    "# be re-loaded as a dataframe\n",
    "df = tiledb.open_dataframe(array_uri1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.frame.DataFrame"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# it is a dataframe!\n",
    "type(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>job</th>\n",
       "      <th>marital</th>\n",
       "      <th>education</th>\n",
       "      <th>default</th>\n",
       "      <th>balance</th>\n",
       "      <th>housing</th>\n",
       "      <th>loan</th>\n",
       "      <th>contact</th>\n",
       "      <th>day</th>\n",
       "      <th>month</th>\n",
       "      <th>duration</th>\n",
       "      <th>campaign</th>\n",
       "      <th>pdays</th>\n",
       "      <th>previous</th>\n",
       "      <th>poutcome</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>58</td>\n",
       "      <td>management</td>\n",
       "      <td>married</td>\n",
       "      <td>tertiary</td>\n",
       "      <td>no</td>\n",
       "      <td>2143</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>261</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>44</td>\n",
       "      <td>technician</td>\n",
       "      <td>single</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>29</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>151</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>33</td>\n",
       "      <td>entrepreneur</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>2</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>76</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>47</td>\n",
       "      <td>blue-collar</td>\n",
       "      <td>married</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "      <td>1506</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>92</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33</td>\n",
       "      <td>unknown</td>\n",
       "      <td>single</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "      <td>1</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>198</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45206</th>\n",
       "      <td>51</td>\n",
       "      <td>technician</td>\n",
       "      <td>married</td>\n",
       "      <td>tertiary</td>\n",
       "      <td>no</td>\n",
       "      <td>825</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>977</td>\n",
       "      <td>3</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45207</th>\n",
       "      <td>71</td>\n",
       "      <td>retired</td>\n",
       "      <td>divorced</td>\n",
       "      <td>primary</td>\n",
       "      <td>no</td>\n",
       "      <td>1729</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>456</td>\n",
       "      <td>2</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45208</th>\n",
       "      <td>72</td>\n",
       "      <td>retired</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>5715</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>1127</td>\n",
       "      <td>5</td>\n",
       "      <td>184</td>\n",
       "      <td>3</td>\n",
       "      <td>success</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45209</th>\n",
       "      <td>57</td>\n",
       "      <td>blue-collar</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>668</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>telephone</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>508</td>\n",
       "      <td>4</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45210</th>\n",
       "      <td>37</td>\n",
       "      <td>entrepreneur</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>2971</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>361</td>\n",
       "      <td>2</td>\n",
       "      <td>188</td>\n",
       "      <td>11</td>\n",
       "      <td>other</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>45211 rows × 17 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       age           job   marital  education default  balance housing loan  \\\n",
       "0       58    management   married   tertiary      no     2143     yes   no   \n",
       "1       44    technician    single  secondary      no       29     yes   no   \n",
       "2       33  entrepreneur   married  secondary      no        2     yes  yes   \n",
       "3       47   blue-collar   married    unknown      no     1506     yes   no   \n",
       "4       33       unknown    single    unknown      no        1      no   no   \n",
       "...    ...           ...       ...        ...     ...      ...     ...  ...   \n",
       "45206   51    technician   married   tertiary      no      825      no   no   \n",
       "45207   71       retired  divorced    primary      no     1729      no   no   \n",
       "45208   72       retired   married  secondary      no     5715      no   no   \n",
       "45209   57   blue-collar   married  secondary      no      668      no   no   \n",
       "45210   37  entrepreneur   married  secondary      no     2971      no   no   \n",
       "\n",
       "         contact  day month  duration  campaign  pdays  previous poutcome    y  \n",
       "0        unknown    5   may       261         1     -1         0  unknown   no  \n",
       "1        unknown    5   may       151         1     -1         0  unknown   no  \n",
       "2        unknown    5   may        76         1     -1         0  unknown   no  \n",
       "3        unknown    5   may        92         1     -1         0  unknown   no  \n",
       "4        unknown    5   may       198         1     -1         0  unknown   no  \n",
       "...          ...  ...   ...       ...       ...    ...       ...      ...  ...  \n",
       "45206   cellular   17   nov       977         3     -1         0  unknown  yes  \n",
       "45207   cellular   17   nov       456         2     -1         0  unknown  yes  \n",
       "45208   cellular   17   nov      1127         5    184         3  success  yes  \n",
       "45209  telephone   17   nov       508         4     -1         0  unknown   no  \n",
       "45210   cellular   17   nov       361         2    188        11    other   no  \n",
       "\n",
       "[45211 rows x 17 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can preview the data as usual in the notebook display\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>job</th>\n",
       "      <th>marital</th>\n",
       "      <th>education</th>\n",
       "      <th>default</th>\n",
       "      <th>balance</th>\n",
       "      <th>housing</th>\n",
       "      <th>loan</th>\n",
       "      <th>contact</th>\n",
       "      <th>day</th>\n",
       "      <th>month</th>\n",
       "      <th>duration</th>\n",
       "      <th>campaign</th>\n",
       "      <th>pdays</th>\n",
       "      <th>previous</th>\n",
       "      <th>poutcome</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>58</td>\n",
       "      <td>management</td>\n",
       "      <td>married</td>\n",
       "      <td>tertiary</td>\n",
       "      <td>no</td>\n",
       "      <td>2143</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>261</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>58</td>\n",
       "      <td>retired</td>\n",
       "      <td>married</td>\n",
       "      <td>primary</td>\n",
       "      <td>no</td>\n",
       "      <td>121</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>53</td>\n",
       "      <td>technician</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>6</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>517</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>58</td>\n",
       "      <td>technician</td>\n",
       "      <td>married</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "      <td>71</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>71</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>57</td>\n",
       "      <td>services</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>162</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>174</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45204</th>\n",
       "      <td>73</td>\n",
       "      <td>retired</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>2850</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>300</td>\n",
       "      <td>1</td>\n",
       "      <td>40</td>\n",
       "      <td>8</td>\n",
       "      <td>failure</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45206</th>\n",
       "      <td>51</td>\n",
       "      <td>technician</td>\n",
       "      <td>married</td>\n",
       "      <td>tertiary</td>\n",
       "      <td>no</td>\n",
       "      <td>825</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>977</td>\n",
       "      <td>3</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45207</th>\n",
       "      <td>71</td>\n",
       "      <td>retired</td>\n",
       "      <td>divorced</td>\n",
       "      <td>primary</td>\n",
       "      <td>no</td>\n",
       "      <td>1729</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>456</td>\n",
       "      <td>2</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45208</th>\n",
       "      <td>72</td>\n",
       "      <td>retired</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>5715</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>1127</td>\n",
       "      <td>5</td>\n",
       "      <td>184</td>\n",
       "      <td>3</td>\n",
       "      <td>success</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45209</th>\n",
       "      <td>57</td>\n",
       "      <td>blue-collar</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>668</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>telephone</td>\n",
       "      <td>17</td>\n",
       "      <td>nov</td>\n",
       "      <td>508</td>\n",
       "      <td>4</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9255 rows × 17 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       age          job   marital  education default  balance housing loan  \\\n",
       "0       58   management   married   tertiary      no     2143     yes   no   \n",
       "8       58      retired   married    primary      no      121     yes   no   \n",
       "12      53   technician   married  secondary      no        6     yes   no   \n",
       "13      58   technician   married    unknown      no       71     yes   no   \n",
       "14      57     services   married  secondary      no      162     yes   no   \n",
       "...    ...          ...       ...        ...     ...      ...     ...  ...   \n",
       "45204   73      retired   married  secondary      no     2850      no   no   \n",
       "45206   51   technician   married   tertiary      no      825      no   no   \n",
       "45207   71      retired  divorced    primary      no     1729      no   no   \n",
       "45208   72      retired   married  secondary      no     5715      no   no   \n",
       "45209   57  blue-collar   married  secondary      no      668      no   no   \n",
       "\n",
       "         contact  day month  duration  campaign  pdays  previous poutcome    y  \n",
       "0        unknown    5   may       261         1     -1         0  unknown   no  \n",
       "8        unknown    5   may        50         1     -1         0  unknown   no  \n",
       "12       unknown    5   may       517         1     -1         0  unknown   no  \n",
       "13       unknown    5   may        71         1     -1         0  unknown   no  \n",
       "14       unknown    5   may       174         1     -1         0  unknown   no  \n",
       "...          ...  ...   ...       ...       ...    ...       ...      ...  ...  \n",
       "45204   cellular   17   nov       300         1     40         8  failure  yes  \n",
       "45206   cellular   17   nov       977         3     -1         0  unknown  yes  \n",
       "45207   cellular   17   nov       456         2     -1         0  unknown  yes  \n",
       "45208   cellular   17   nov      1127         5    184         3  success  yes  \n",
       "45209  telephone   17   nov       508         4     -1         0  unknown   no  \n",
       "\n",
       "[9255 rows x 17 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# this dataset easily fits in memory, and of course we can slice with\n",
    "# normal pandas operators:\n",
    "df[df.age>50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ArraySchema(\n",
       "  domain=Domain(*[\n",
       "    Dim(name='rows', domain=(0, 45210), tile=11, dtype='uint64'),\n",
       "  ]),\n",
       "  attrs=[\n",
       "    Attr(name='age', dtype='int64'),\n",
       "    Attr(name='job', dtype='<U0'),\n",
       "    Attr(name='marital', dtype='<U0'),\n",
       "    Attr(name='education', dtype='<U0'),\n",
       "    Attr(name='default', dtype='<U0'),\n",
       "    Attr(name='balance', dtype='int64'),\n",
       "    Attr(name='housing', dtype='<U0'),\n",
       "    Attr(name='loan', dtype='<U0'),\n",
       "    Attr(name='contact', dtype='<U0'),\n",
       "    Attr(name='day', dtype='int64'),\n",
       "    Attr(name='month', dtype='<U0'),\n",
       "    Attr(name='duration', dtype='int64'),\n",
       "    Attr(name='campaign', dtype='int64'),\n",
       "    Attr(name='pdays', dtype='int64'),\n",
       "    Attr(name='previous', dtype='int64'),\n",
       "    Attr(name='poutcome', dtype='<U0'),\n",
       "    Attr(name='y', dtype='<U0'),\n",
       "  ],\n",
       "  cell_order='row-major',\n",
       "  tile_order='row-major', sparse=False)\n",
       "# note: filters omitted"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# now we can look directly at the underlying TileDB array\n",
    "# and see that each column is a TileDB \"attribute\" with a\n",
    "# specific type.\n",
    "\n",
    "a = tiledb.open(array_uri1)\n",
    "a.schema"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Building a sparse array from scratch\n",
    "\n",
    "Now, let's use some of the data above and build a sparse TileDB array\n",
    "manually, in order to demonstrate TileDB's new heterogeneous dimensions and string-typed column slicing features.\n",
    "\n",
    "We use `id` and `job` below, but you can choose any other subset of columns as dimensions -- column choice should be determined by those you wish to slice mostly efficiently."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# first we need to create an array schema\n",
    "# we keep the first column as the row id, but use the\n",
    "# `job` description as the second column:\n",
    "\n",
    "# note that the \"domain\" of a string-typed dimension is\n",
    "# formed automatically as the data is written.\n",
    "\n",
    "from tiledb import *\n",
    "\n",
    "schema = ArraySchema(\n",
    "  domain=Domain(*[\n",
    "    Dim(name='id', domain=(0, 45211), tile=11, dtype='uint64'),\n",
    "    Dim(name='job', domain=(None,None), tile=None, dtype=np.bytes_),\n",
    "  ]),\n",
    "  attrs=[\n",
    "    Attr(name='age', dtype='int64'),\n",
    "    Attr(name='marital', dtype='<U0'),\n",
    "    Attr(name='education', dtype='<U0'),\n",
    "    Attr(name='default', dtype='<U0'),\n",
    "    Attr(name='balance', dtype='int64'),\n",
    "    Attr(name='housing', dtype='<U0'),\n",
    "    Attr(name='loan', dtype='<U0'),\n",
    "    Attr(name='contact', dtype='<U0'),\n",
    "    Attr(name='day', dtype='int64'),\n",
    "    Attr(name='month', dtype='<U0'),\n",
    "    Attr(name='duration', dtype='int64'),\n",
    "    Attr(name='campaign', dtype='int64'),\n",
    "    Attr(name='pdays', dtype='int64'),\n",
    "    Attr(name='previous', dtype='int64'),\n",
    "    Attr(name='poutcome', dtype='<U0'),\n",
    "    Attr(name='y', dtype='<U0'),\n",
    "  ],\n",
    "  cell_order='row-major',\n",
    "  tile_order='row-major', sparse=True)\n",
    "# note: filters omitted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the array schema\n",
    "tiledb.SparseArray.create(array_uri2, schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the data to disk\n",
    "# we must provide coordinates for each item written\n",
    "\n",
    "with tiledb.open(array_uri2, 'w') as A:\n",
    "    df_dict = {k: v.values for k,v in df.items()}\n",
    "    c_id = df.index.values\n",
    "    c_job = df_dict.pop('job')\n",
    "    A[c_id, c_job] = df_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "A = tiledb.open(array_uri2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "OrderedDict([('age', array([35, 28, 58], dtype=int64)),\n",
       "             ('balance', array([231, 447, 121], dtype=int64)),\n",
       "             ('campaign', array([1, 1, 1], dtype=int64)),\n",
       "             ('contact',\n",
       "              array(['unknown', 'unknown', 'unknown'], dtype=object)),\n",
       "             ('day', array([5, 5, 5], dtype=int64)),\n",
       "             ('default', array(['no', 'no', 'no'], dtype=object)),\n",
       "             ('duration', array([139, 217,  50], dtype=int64)),\n",
       "             ('education',\n",
       "              array(['tertiary', 'tertiary', 'primary'], dtype=object)),\n",
       "             ('housing', array(['yes', 'yes', 'yes'], dtype=object)),\n",
       "             ('id', array([5, 6, 8], dtype=uint64)),\n",
       "             ('job',\n",
       "              array([b'management', b'management', b'retired'], dtype=object)),\n",
       "             ('loan', array(['no', 'yes', 'no'], dtype=object)),\n",
       "             ('marital',\n",
       "              array(['married', 'single', 'married'], dtype=object)),\n",
       "             ('month', array(['may', 'may', 'may'], dtype=object)),\n",
       "             ('pdays', array([-1, -1, -1], dtype=int64)),\n",
       "             ('poutcome',\n",
       "              array(['unknown', 'unknown', 'unknown'], dtype=object)),\n",
       "             ('previous', array([0, 0, 0], dtype=int64)),\n",
       "             ('y', array(['no', 'no', 'no'], dtype=object))])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can slice on each dimension separately, returning a dictionary:\n",
    "\n",
    "# note that if the data was huge or stored on S3, we would efficiently pull\n",
    "# only tiles matching the requested subset, and end with `val` in-memory locally:\n",
    "\n",
    "val = A.multi_index[1:10, ['retired', 'management']]\n",
    "\n",
    "val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>balance</th>\n",
       "      <th>campaign</th>\n",
       "      <th>contact</th>\n",
       "      <th>day</th>\n",
       "      <th>default</th>\n",
       "      <th>duration</th>\n",
       "      <th>education</th>\n",
       "      <th>housing</th>\n",
       "      <th>id</th>\n",
       "      <th>job</th>\n",
       "      <th>loan</th>\n",
       "      <th>marital</th>\n",
       "      <th>month</th>\n",
       "      <th>pdays</th>\n",
       "      <th>poutcome</th>\n",
       "      <th>previous</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>35</td>\n",
       "      <td>231</td>\n",
       "      <td>1</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>no</td>\n",
       "      <td>139</td>\n",
       "      <td>tertiary</td>\n",
       "      <td>yes</td>\n",
       "      <td>5</td>\n",
       "      <td>b'management'</td>\n",
       "      <td>no</td>\n",
       "      <td>married</td>\n",
       "      <td>may</td>\n",
       "      <td>-1</td>\n",
       "      <td>unknown</td>\n",
       "      <td>0</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>28</td>\n",
       "      <td>447</td>\n",
       "      <td>1</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>no</td>\n",
       "      <td>217</td>\n",
       "      <td>tertiary</td>\n",
       "      <td>yes</td>\n",
       "      <td>6</td>\n",
       "      <td>b'management'</td>\n",
       "      <td>yes</td>\n",
       "      <td>single</td>\n",
       "      <td>may</td>\n",
       "      <td>-1</td>\n",
       "      <td>unknown</td>\n",
       "      <td>0</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>58</td>\n",
       "      <td>121</td>\n",
       "      <td>1</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>no</td>\n",
       "      <td>50</td>\n",
       "      <td>primary</td>\n",
       "      <td>yes</td>\n",
       "      <td>8</td>\n",
       "      <td>b'retired'</td>\n",
       "      <td>no</td>\n",
       "      <td>married</td>\n",
       "      <td>may</td>\n",
       "      <td>-1</td>\n",
       "      <td>unknown</td>\n",
       "      <td>0</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age  balance  campaign  contact  day default  duration education housing  \\\n",
       "0   35      231         1  unknown    5      no       139  tertiary     yes   \n",
       "1   28      447         1  unknown    5      no       217  tertiary     yes   \n",
       "2   58      121         1  unknown    5      no        50   primary     yes   \n",
       "\n",
       "   id            job loan  marital month  pdays poutcome  previous   y  \n",
       "0   5  b'management'   no  married   may     -1  unknown         0  no  \n",
       "1   6  b'management'  yes   single   may     -1  unknown         0  no  \n",
       "2   8     b'retired'   no  married   may     -1  unknown         0  no  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can also view the slice result as a dataframe\n",
    "\n",
    "pd.DataFrame(val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# clean up steps if necessary\n",
    "if False:\n",
    "    if tiledb.VFS().is_dir(array_uri1):\n",
    "        tiledb.VFS().remove_dir(array_uri1)\n",
    "    if tiledb.VFS().is_dir(array_uri2):\n",
    "        tiledb.VFS().remove_dir(array_uri2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (tdb6)",
   "language": "python",
   "name": "tdb6"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}