steveoni/Resource_translation.ipynb

## Resource_translation.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Main Resource Translation Functionality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import openpyxl\n",
    "from openpyxl import load_workbook\n",
    "from io import BytesIO\n",
    "import urllib.request"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'3.0.5'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "openpyxl.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch Uploaded file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"http://ckan-dev:5000/dataset/934b159c-56e7-4aa2-99dc-8228b609ee31/resource/72047c27-f5e7-4781-9f78-f9b2b2369b91/download/cdc-sample.xlsx\"\n",
    "api_key = \"037a0c03-b6c0-4be3-97a0-c86683d7697c\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "req = urllib.request.Request(url)\n",
    "req.add_header('X-CKAN-API-KEY', api_key)\n",
    "xfile = urllib.request.urlopen(req).read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "wb = load_workbook(filename = BytesIO(xfile))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<openpyxl.workbook.workbook.Workbook at 0x7fac4a8ec0a0>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from werkzeug.datastructures import FileStorage\n",
    "import tempfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _cell2str(cell):\n",
    "    \"\"\"[summary]\n",
    "    Transforms an XLSX cell into string\n",
    "    Args:\n",
    "        cell ([type]): [description]\n",
    "    \"\"\"\n",
    "    c = cell.value or ''  # Deal with None value in the cell\n",
    "    if type(c) != str:\n",
    "        return str(c)  # Deal with any type that should be convertible\n",
    "    # WARNING that there are many more error sources when converting from excel,\n",
    "    # this deals with some of them but not all possible instances\n",
    "    try:\n",
    "        sc = str(c)  # most well-encoded text should work correctly here\n",
    "        return sc\n",
    "    except:\n",
    "        # try to do some magic to deal with invalid encoding\n",
    "        # converting each character to raw string and printing that\n",
    "        return r''.join(c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Obtain the content of the file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sheet1\n",
      "Processing sheet: Sheet1\n"
     ]
    }
   ],
   "source": [
    "sheets = []\n",
    "separator=','\n",
    "for sheet in wb.worksheets:\n",
    "    row_len = 0\n",
    "    sname = sheet.title\n",
    "    print(sname)\n",
    "    print(\"Processing sheet: {}\".format(sname))\n",
    "    rows = []\n",
    "    for row in sheet.rows:\n",
    "        row_items = [_cell2str(c) for c in row]\n",
    "        # clean intermediate newline chars, join by separator and add a newline only at the end\n",
    "        rows.append(separator.join(row_items).replace('\\n', '')+'\\n')\n",
    "    rows = [r for r in rows if r]\n",
    "    sheets.append((sname, rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Topic,Group,Sex,Age,Area,Year,Number,Percentage,ci_high,ci_low,citext,data,data2,,\\n',\n",
       " 'Tobacco Smokers,Current tobacco smoker,All,All,,2016,867000,13.1,14.1,12.2,,,,,\\n',\n",
       " 'Tobacco Smokers,Current tobacco smoker,All,18-29,,2016,101000,11.8,15.1,9.2,,,,,\\n',\n",
       " 'Tobacco Smokers,Current tobacco smoker,All,30-44,,2016,414000,15.8,17.7,14.1,,,,,\\n',\n",
       " 'Tobacco Smokers,Current tobacco smoker,All,45-59,,2016,291000,14.1,15.9,12.5,,,,,\\n',\n",
       " 'Tobacco Smokers,Current tobacco smoker,All,60 years and older,,2016,61000,6.1,7.6,5,,,,,\\n',\n",
       " 'Tobacco Smokers,Current daily tobacco smoker,All,All,,2016,516000,14.700000000000001,22.9,9.675,,,,,\\n',\n",
       " 'Tobacco Smokers,Current daily tobacco smoker,All,18-29,,2016,58000,13.7,27.5,7,,,,,\\n',\n",
       " 'Tobacco Smokers,Current daily tobacco smoker,All,30-44,,2016,215000,16.6,22.9,12.1,,,,,\\n',\n",
       " 'Tobacco Smokers,Current daily tobacco smoker,All,45-59,,2016,200000,19.6,26.4,14.399999999999999,,,,,\\n']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sheets[0][1][:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## create a temporary file storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file = tempfile.NamedTemporaryFile('w+')\n",
    "file.writelines(sheets[0][1])\n",
    "file.seek(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[b'Topic,Group,Sex,Age,Area,Year,Number,Percentage,ci_high,ci_low,citext,data,data2,,\\n',\n",
       " b'Tobacco Smokers,Current tobacco smoker,All,All,,2016,867000,13.1,14.1,12.2,,,,,\\n',\n",
       " b'Tobacco Smokers,Current tobacco smoker,All,18-29,,2016,101000,11.8,15.1,9.2,,,,,\\n',\n",
       " b'Tobacco Smokers,Current tobacco smoker,All,30-44,,2016,414000,15.8,17.7,14.1,,,,,\\n',\n",
       " b'Tobacco Smokers,Current tobacco smoker,All,45-59,,2016,291000,14.1,15.9,12.5,,,,,\\n',\n",
       " b'Tobacco Smokers,Current tobacco smoker,All,60 years and older,,2016,61000,6.1,7.6,5,,,,,\\n',\n",
       " b'Tobacco Smokers,Current daily tobacco smoker,All,All,,2016,516000,14.700000000000001,22.9,9.675,,,,,\\n',\n",
       " b'Tobacco Smokers,Current daily tobacco smoker,All,18-29,,2016,58000,13.7,27.5,7,,,,,\\n',\n",
       " b'Tobacco Smokers,Current daily tobacco smoker,All,30-44,,2016,215000,16.6,22.9,12.1,,,,,\\n',\n",
       " b'Tobacco Smokers,Current daily tobacco smoker,All,45-59,,2016,200000,19.6,26.4,14.399999999999999,,,,,\\n']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "open(file.name, \"rb\").readlines()[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_file = FileStorage(filename=\"name.csv\", content_type=u'CSV', stream=BytesIO(open(file.name, \"rb\").read()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'name.csv'"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csv_file.filename"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## File is then re-uploaded with `resource_create` action"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Main Resource Translation Functionality"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import openpyxl\n",
	"from openpyxl import load_workbook\n",
	"from io import BytesIO\n",
	"import urllib.request"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'3.0.5'"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"openpyxl.__version__"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Fetch Uploaded file"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"url = \"http://ckan-dev:5000/dataset/934b159c-56e7-4aa2-99dc-8228b609ee31/resource/72047c27-f5e7-4781-9f78-f9b2b2369b91/download/cdc-sample.xlsx\"\n",
	"api_key = \"037a0c03-b6c0-4be3-97a0-c86683d7697c\"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"req = urllib.request.Request(url)\n",
	"req.add_header('X-CKAN-API-KEY', api_key)\n",
	"xfile = urllib.request.urlopen(req).read()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"wb = load_workbook(filename = BytesIO(xfile))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<openpyxl.workbook.workbook.Workbook at 0x7fac4a8ec0a0>"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"wb"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"from werkzeug.datastructures import FileStorage\n",
	"import tempfile"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"def _cell2str(cell):\n",
	" \"\"\"[summary]\n",
	" Transforms an XLSX cell into string\n",
	" Args:\n",
	" cell ([type]): [description]\n",
	" \"\"\"\n",
	" c = cell.value or '' # Deal with None value in the cell\n",
	" if type(c) != str:\n",
	" return str(c) # Deal with any type that should be convertible\n",
	" # WARNING that there are many more error sources when converting from excel,\n",
	" # this deals with some of them but not all possible instances\n",
	" try:\n",
	" sc = str(c) # most well-encoded text should work correctly here\n",
	" return sc\n",
	" except:\n",
	" # try to do some magic to deal with invalid encoding\n",
	" # converting each character to raw string and printing that\n",
	" return r''.join(c)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Obtain the content of the file"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Sheet1\n",
	"Processing sheet: Sheet1\n"
	]
	}
	],
	"source": [
	"sheets = []\n",
	"separator=','\n",
	"for sheet in wb.worksheets:\n",
	" row_len = 0\n",
	" sname = sheet.title\n",
	" print(sname)\n",
	" print(\"Processing sheet: {}\".format(sname))\n",
	" rows = []\n",
	" for row in sheet.rows:\n",
	" row_items = [_cell2str(c) for c in row]\n",
	" # clean intermediate newline chars, join by separator and add a newline only at the end\n",
	" rows.append(separator.join(row_items).replace('\\n', '')+'\\n')\n",
	" rows = [r for r in rows if r]\n",
	" sheets.append((sname, rows))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['Topic,Group,Sex,Age,Area,Year,Number,Percentage,ci_high,ci_low,citext,data,data2,,\\n',\n",
	" 'Tobacco Smokers,Current tobacco smoker,All,All,,2016,867000,13.1,14.1,12.2,,,,,\\n',\n",
	" 'Tobacco Smokers,Current tobacco smoker,All,18-29,,2016,101000,11.8,15.1,9.2,,,,,\\n',\n",
	" 'Tobacco Smokers,Current tobacco smoker,All,30-44,,2016,414000,15.8,17.7,14.1,,,,,\\n',\n",
	" 'Tobacco Smokers,Current tobacco smoker,All,45-59,,2016,291000,14.1,15.9,12.5,,,,,\\n',\n",
	" 'Tobacco Smokers,Current tobacco smoker,All,60 years and older,,2016,61000,6.1,7.6,5,,,,,\\n',\n",
	" 'Tobacco Smokers,Current daily tobacco smoker,All,All,,2016,516000,14.700000000000001,22.9,9.675,,,,,\\n',\n",
	" 'Tobacco Smokers,Current daily tobacco smoker,All,18-29,,2016,58000,13.7,27.5,7,,,,,\\n',\n",
	" 'Tobacco Smokers,Current daily tobacco smoker,All,30-44,,2016,215000,16.6,22.9,12.1,,,,,\\n',\n",
	" 'Tobacco Smokers,Current daily tobacco smoker,All,45-59,,2016,200000,19.6,26.4,14.399999999999999,,,,,\\n']"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sheets[0][1][:10]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## create a temporary file storage"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"file = tempfile.NamedTemporaryFile('w+')\n",
	"file.writelines(sheets[0][1])\n",
	"file.seek(0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[b'Topic,Group,Sex,Age,Area,Year,Number,Percentage,ci_high,ci_low,citext,data,data2,,\\n',\n",
	" b'Tobacco Smokers,Current tobacco smoker,All,All,,2016,867000,13.1,14.1,12.2,,,,,\\n',\n",
	" b'Tobacco Smokers,Current tobacco smoker,All,18-29,,2016,101000,11.8,15.1,9.2,,,,,\\n',\n",
	" b'Tobacco Smokers,Current tobacco smoker,All,30-44,,2016,414000,15.8,17.7,14.1,,,,,\\n',\n",
	" b'Tobacco Smokers,Current tobacco smoker,All,45-59,,2016,291000,14.1,15.9,12.5,,,,,\\n',\n",
	" b'Tobacco Smokers,Current tobacco smoker,All,60 years and older,,2016,61000,6.1,7.6,5,,,,,\\n',\n",
	" b'Tobacco Smokers,Current daily tobacco smoker,All,All,,2016,516000,14.700000000000001,22.9,9.675,,,,,\\n',\n",
	" b'Tobacco Smokers,Current daily tobacco smoker,All,18-29,,2016,58000,13.7,27.5,7,,,,,\\n',\n",
	" b'Tobacco Smokers,Current daily tobacco smoker,All,30-44,,2016,215000,16.6,22.9,12.1,,,,,\\n',\n",
	" b'Tobacco Smokers,Current daily tobacco smoker,All,45-59,,2016,200000,19.6,26.4,14.399999999999999,,,,,\\n']"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"open(file.name, \"rb\").readlines()[:10]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Create a csv file"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [],
	"source": [
	"csv_file = FileStorage(filename=\"name.csv\", content_type=u'CSV', stream=BytesIO(open(file.name, \"rb\").read()))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'name.csv'"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"csv_file.filename"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## File is then re-uploaded with `resource_create` action"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}