# Db2AnalyticsAcceleratorGist
First, set up the JDBC connection options for Db2. The user ID and password are pulled from the Livy session's Spark configuration rather than hard-coded into the notebook:

```python
jdbcOptions = {
    "driver": "com.ibm.db2.jcc.DB2Driver",
    "user": sc.getConf().get("spark.__livy__.jdbcUserid"),
    "password": sc.getConf().get("spark.__livy__.jdbcPassword"),
}
url = "jdbc:db2://idaasim-148.svl.ibm.com:8030/DWHDB0A"
```

Output:

```
Starting Spark application
```

| ID | YARN Application ID | Kind | State | Spark UI | Driver log | Current session? |
|----|---------------------|------|-------|----------|------------|------------------|
| 1 | None | pyspark | idle | | | ✔ |

```
SparkSession available as 'spark'.
```
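The connection settings could be sanity-checked by reading an existing Db2 table back through the same JDBC options. This is a hypothetical check, not part of the original run; it assumes the TPCH.NATION table mentioned later in this notebook exists and is readable:

```python
# Hypothetical connectivity check -- assumes TPCH.NATION exists and is readable
dfNation = spark.read.jdbc(url, "TPCH.NATION", properties=jdbcOptions)
dfNation.printSchema()
```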
Read a CSV file from the scratch directory on the accelerator. Jupyter connects to the Livy REST interface as user 'acceladm', so the CSV file must be available in that user's home directory or in another directory readable by acceladm inside the accelerator's Docker container.
```python
dfNationCSV = spark.read.format("csv") \
    .option("header", "false") \
    .load("/scratch/home/acceladm/NATION.csv") \
    .toDF("N_NATIONKEY", "N_NAME", "N_REGIONKEY", "N_COMMENT")
```
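Without an explicit schema, every column of the CSV is read as a string. If typed columns are needed, a schema matching the target table could be supplied instead; the sketch below derives its types from the NATIONAOT DDL shown further down and is an assumption, not part of the original notebook:

```python
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Assumed schema, mirroring the TPCH.NATIONAOT DDL below
nationSchema = StructType([
    StructField("N_NATIONKEY", IntegerType(), False),
    StructField("N_NAME", StringType(), False),
    StructField("N_REGIONKEY", IntegerType(), False),
    StructField("N_COMMENT", StringType(), True),
])

dfNationTyped = spark.read.format("csv") \
    .option("header", "false") \
    .schema(nationSchema) \
    .load("/scratch/home/acceladm/NATION.csv")
```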
```python
dfNationCSV.show()
```

Output:

```
+-----------+--------------------+-----------+--------------------+
|N_NATIONKEY|              N_NAME|N_REGIONKEY|           N_COMMENT|
+-----------+--------------------+-----------+--------------------+
|          0|ALGERIA          ...|          0|final accounts wa...|
|          1|ARGENTINA        ...|          1|idly final instru...|
|          2|BRAZIL           ...|          1|always pending pi...|
|          3|CANADA           ...|          1|foxes among the b...|
|          4|EGYPT            ...|          4|pending accounts ...|
|          5|ETHIOPIA         ...|          0|fluffily ruthless...|
|          6|FRANCE           ...|          3|even requests det...|
|          7|GERMANY          ...|          3|blithely ironic f...|
|          8|INDIA            ...|          2|ironic packages s...|
|          9|INDONESIA        ...|          2|unusual excuses a...|
|         10|IRAN             ...|          4|blithely even acc...|
|         11|IRAQ             ...|          4|express, pending ...|
|         12|JAPAN            ...|          2|blithely final pa...|
|         13|JORDAN           ...|          4|blithe, express d...|
|         14|KENYA            ...|          0|ironic requests b...|
|         15|MOROCCO          ...|          0|ideas according t...|
|         16|MOZAMBIQUE       ...|          0|ironic courts wak...|
|         17|PERU             ...|          1|final, final acco...|
|         18|CHINA            ...|          2|bold accounts are...|
|         19|ROMANIA          ...|          3|deposits boost ag...|
+-----------+--------------------+-----------+--------------------+
only showing top 20 rows
```
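The N_NAME values carry trailing blanks because the source column is fixed-width CHAR(25), as the padding in the output above shows. If trimmed names are wanted downstream, a sketch using Spark's standard trim function would look like this (dfNationTrimmed is a hypothetical name, not from the original notebook):

```python
from pyspark.sql.functions import trim

# Strip the CHAR(25) padding from N_NAME for downstream use
dfNationTrimmed = dfNationCSV.withColumn("N_NAME", trim("N_NAME"))
dfNationTrimmed.select("N_NATIONKEY", "N_NAME").show(5)
```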
Now we want to write the dfNationCSV DataFrame back into the database. For that we created an accelerator-only table (AOT) in IDAA with the same structure as the TPCH.NATION table, using the following DDL statement:

```sql
CREATE TABLE "TPCH"."NATIONAOT" (
    "N_NATIONKEY" INTEGER NOT NULL,
    "N_NAME" CHAR(25) FOR SBCS DATA NOT NULL,
    "N_REGIONKEY" INTEGER NOT NULL,
    "N_COMMENT" VARCHAR(152) FOR SBCS DATA
)
IN ACCELERATOR SIM148
IN DATABASE PETER
CCSID EBCDIC;
```

Then we can append the rows of our DataFrame to the AOT:
```python
dfNationCSV.write.mode("append") \
    .jdbc(url, "TPCH.NATIONAOT", properties=jdbcOptions)
```
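To confirm that the appended rows reached the AOT, the table could be read back over the same JDBC connection and counted. This verification step is a sketch, not output captured from the original run:

```python
# Hypothetical verification: count the rows now present in the AOT
dfCheck = spark.read.jdbc(url, "TPCH.NATIONAOT", properties=jdbcOptions)
print("Rows in TPCH.NATIONAOT: %d" % dfCheck.count())
```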
],
"metadata": {
"kernelspec": {
"display_name": "PySpark",
"language": "",
"name": "pysparkkernel"
},
"language_info": {
"codemirror_mode": {
"name": "python",
"version": 2
},
"mimetype": "text/x-python",
"name": "pyspark",
"pygments_lexer": "python2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}