Skip to content

Instantly share code, notes, and snippets.

@steveoni
Created January 29, 2023 16:14
Show Gist options
  • Save steveoni/2e6be8dcd9dc735bd9549b7e1942758a to your computer and use it in GitHub Desktop.
Save steveoni/2e6be8dcd9dc735bd9549b7e1942758a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Main Resource Translation Functionality"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openpyxl\n",
"from openpyxl import load_workbook\n",
"from io import BytesIO\n",
"import urllib.request"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'3.0.5'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"openpyxl.__version__"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fetch Uploaded file"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"url = \"http://ckan-dev:5000/dataset/934b159c-56e7-4aa2-99dc-8228b609ee31/resource/72047c27-f5e7-4781-9f78-f9b2b2369b91/download/cdc-sample.xlsx\"\n",
"api_key = \"037a0c03-b6c0-4be3-97a0-c86683d7697c\"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"req = urllib.request.Request(url)\n",
"req.add_header('X-CKAN-API-KEY', api_key)\n",
"xfile = urllib.request.urlopen(req).read()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"wb = load_workbook(filename = BytesIO(xfile))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<openpyxl.workbook.workbook.Workbook at 0x7fac4a8ec0a0>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wb"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from werkzeug.datastructures import FileStorage\n",
"import tempfile"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def _cell2str(cell):\n",
" \"\"\"[summary]\n",
" Transforms an XLSX cell into string\n",
" Args:\n",
" cell ([type]): [description]\n",
" \"\"\"\n",
" c = cell.value or '' # Deal with None value in the cell\n",
" if type(c) != str:\n",
" return str(c) # Deal with any type that should be convertible\n",
" # WARNING that there are many more error sources when converting from excel,\n",
" # this deals with some of them but not all possible instances\n",
" try:\n",
" sc = str(c) # most well-encoded text should work correctly here\n",
" return sc\n",
" except:\n",
" # try to do some magic to deal with invalid encoding\n",
" # converting each character to raw string and printing that\n",
" return r''.join(c)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Obtain the content of the file"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sheet1\n",
"Processing sheet: Sheet1\n"
]
}
],
"source": [
"sheets = []\n",
"separator=','\n",
"for sheet in wb.worksheets:\n",
" row_len = 0\n",
" sname = sheet.title\n",
" print(sname)\n",
" print(\"Processing sheet: {}\".format(sname))\n",
" rows = []\n",
" for row in sheet.rows:\n",
" row_items = [_cell2str(c) for c in row]\n",
" # clean intermediate newline chars, join by separator and add a newline only at the end\n",
" rows.append(separator.join(row_items).replace('\\n', '')+'\\n')\n",
" rows = [r for r in rows if r]\n",
" sheets.append((sname, rows))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Topic,Group,Sex,Age,Area,Year,Number,Percentage,ci_high,ci_low,citext,data,data2,,\\n',\n",
" 'Tobacco Smokers,Current tobacco smoker,All,All,,2016,867000,13.1,14.1,12.2,,,,,\\n',\n",
" 'Tobacco Smokers,Current tobacco smoker,All,18-29,,2016,101000,11.8,15.1,9.2,,,,,\\n',\n",
" 'Tobacco Smokers,Current tobacco smoker,All,30-44,,2016,414000,15.8,17.7,14.1,,,,,\\n',\n",
" 'Tobacco Smokers,Current tobacco smoker,All,45-59,,2016,291000,14.1,15.9,12.5,,,,,\\n',\n",
" 'Tobacco Smokers,Current tobacco smoker,All,60 years and older,,2016,61000,6.1,7.6,5,,,,,\\n',\n",
" 'Tobacco Smokers,Current daily tobacco smoker,All,All,,2016,516000,14.700000000000001,22.9,9.675,,,,,\\n',\n",
" 'Tobacco Smokers,Current daily tobacco smoker,All,18-29,,2016,58000,13.7,27.5,7,,,,,\\n',\n",
" 'Tobacco Smokers,Current daily tobacco smoker,All,30-44,,2016,215000,16.6,22.9,12.1,,,,,\\n',\n",
" 'Tobacco Smokers,Current daily tobacco smoker,All,45-59,,2016,200000,19.6,26.4,14.399999999999999,,,,,\\n']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sheets[0][1][:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## create a temporary file storage"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"file = tempfile.NamedTemporaryFile('w+')\n",
"file.writelines(sheets[0][1])\n",
"file.seek(0)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[b'Topic,Group,Sex,Age,Area,Year,Number,Percentage,ci_high,ci_low,citext,data,data2,,\\n',\n",
" b'Tobacco Smokers,Current tobacco smoker,All,All,,2016,867000,13.1,14.1,12.2,,,,,\\n',\n",
" b'Tobacco Smokers,Current tobacco smoker,All,18-29,,2016,101000,11.8,15.1,9.2,,,,,\\n',\n",
" b'Tobacco Smokers,Current tobacco smoker,All,30-44,,2016,414000,15.8,17.7,14.1,,,,,\\n',\n",
" b'Tobacco Smokers,Current tobacco smoker,All,45-59,,2016,291000,14.1,15.9,12.5,,,,,\\n',\n",
" b'Tobacco Smokers,Current tobacco smoker,All,60 years and older,,2016,61000,6.1,7.6,5,,,,,\\n',\n",
" b'Tobacco Smokers,Current daily tobacco smoker,All,All,,2016,516000,14.700000000000001,22.9,9.675,,,,,\\n',\n",
" b'Tobacco Smokers,Current daily tobacco smoker,All,18-29,,2016,58000,13.7,27.5,7,,,,,\\n',\n",
" b'Tobacco Smokers,Current daily tobacco smoker,All,30-44,,2016,215000,16.6,22.9,12.1,,,,,\\n',\n",
" b'Tobacco Smokers,Current daily tobacco smoker,All,45-59,,2016,200000,19.6,26.4,14.399999999999999,,,,,\\n']"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"open(file.name, \"rb\").readlines()[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a csv file"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"csv_file = FileStorage(filename=\"name.csv\", content_type=u'CSV', stream=BytesIO(open(file.name, \"rb\").read()))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'name.csv'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csv_file.filename"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## File is then re-uploaded with `resource_create` action"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment