@rmoff, last active April 8, 2018 19:22
Import all sheets of an XLS into Oracle Big Data Discovery
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Initialise the BDD Shell environment (sets up the contexts used below, e.g. sqlContext)\n",
"execfile('ipython/00-bdd-shell-init.py')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import the whole workbook\n",
"By specifying `sheetname=None`, all sheets are imported as a dict of DataFrames keyed by sheet name. \n",
"*See [documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html) for details*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"xls = pd.read_excel(io='file:///home/oracle/custom/Data-Guide.xls',sheetname=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### List the sheets in the workbook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"xls.keys()"
]
},
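{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (illustrative only; assumes `xls` was loaded above), each sheet name can be printed alongside its row count:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Print each sheet name with its row count\n",
"for name, df in xls.iteritems():\n",
"    print '%s: %d rows' % (name, len(df))"
]
},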
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Number of sheets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"total_count = len(xls.keys())\n",
"print total_count"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Show contents of a sheet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Replace 'Sheet name' with one of the keys returned by xls.keys()\n",
"xls['Sheet name']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert the sheet to a Spark dataframe, write it to Hive, and add it to BDD"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from subprocess import call\n",
"\n",
"def slugify(value):\n",
"    # Strip characters that are not going to work in Hive table names\n",
"    # Based on http://stackoverflow.com/a/295146/350613\n",
"    import string\n",
"    valid_chars = \"_.%s%s\" % (string.ascii_letters, string.digits)\n",
"    value = value.replace(' ', '_')\n",
"    return ''.join(c for c in value if c in valid_chars)\n",
"\n",
"tables_written = 0\n",
"failed = 0\n",
"for title, df in xls.iteritems():\n",
"    print 'Processing %s...' % title\n",
"    # Derive the table name up front so the error handlers below can reference it\n",
"    tablename = slugify(title)\n",
"    try:\n",
"        print '\\tConvert to Spark data frame'\n",
"        spark_df = sqlContext.createDataFrame(df)\n",
"        try:\n",
"            print '\\tWrite to Hive'\n",
"            qualified_tablename = 'default.' + tablename\n",
"            spark_df.write.mode('Overwrite').saveAsTable(qualified_tablename)\n",
"            tables_written += 1\n",
"            try:\n",
"                #print '\\tAdd the table to BDD'\n",
"                #call([\"/u01/bdd/v1.2.0/BDD-1.2.0.31.813/dataprocessing/edp_cli/data_processing_CLI\",\"--table\",tablename])\n",
"                print '/u01/bdd/v1.2.0/BDD-1.2.0.31.813/dataprocessing/edp_cli/data_processing_CLI --table %s' % tablename\n",
"            except Exception as e:\n",
"                print 'Failed in calling data_processing_CLI to process HDFS table %s into BDD\\n\\tError: %s' % (tablename, e)\n",
"        except Exception as e:\n",
"            print 'Failed to write %s to HDFS table %s\\n\\tError: %s' % (title, tablename, e)\n",
"            failed += 1\n",
"    except Exception as e:\n",
"        print 'Failed to convert %s to Spark dataframe\\n\\tError: %s' % (title, e)\n",
"        failed += 1\n",
"\n",
"print \"Of %s sheets, %s tables successfully written and %s failed\" % (total_count, tables_written, failed)"
]
},
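{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional check (a sketch; the exact table names depend on the sheet titles after `slugify`, and assumes `sqlContext` is Hive-enabled), the tables just written can be listed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# List the tables now present in the default Hive database\n",
"sqlContext.sql('SHOW TABLES IN default').show()"
]
},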
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To get the inline `data_processing_CLI` to run, I had to update `/etc/hadoop/conf/yarn-site.xml` and add a configuration for `yarn.nodemanager.resource.memory-mb` which I set to `16000`. Without this, the `data_processing_CLI` job could be seen in YARN as stuck at ACCEPTED.\n",
"\n",
"**NB** This currently fails when invoking `data_processing_CLI` - see https://gist.github.com/rmoff/f7e484b0c94c4c7f14e9a680c445af6b for details. Run from the bash shell, it works just fine."
]
}
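,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `yarn-site.xml` change described above is, as a sketch, a property block like:\n",
"\n",
"    <property>\n",
"      <name>yarn.nodemanager.resource.memory-mb</name>\n",
"      <value>16000</value>\n",
"    </property>"
]
}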
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}