Created
January 29, 2023 16:14
-
-
Save steveoni/2e6be8dcd9dc735bd9549b7e1942758a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Main Resource Translation Functionality" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import openpyxl\n", | |
"from openpyxl import load_workbook\n", | |
"from io import BytesIO\n", | |
"import urllib.request" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'3.0.5'" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"openpyxl.__version__" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Fetch Uploaded file" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"url = \"http://ckan-dev:5000/dataset/934b159c-56e7-4aa2-99dc-8228b609ee31/resource/72047c27-f5e7-4781-9f78-f9b2b2369b91/download/cdc-sample.xlsx\"\n", | |
"api_key = \"037a0c03-b6c0-4be3-97a0-c86683d7697c\"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"req = urllib.request.Request(url)\n", | |
"req.add_header('X-CKAN-API-KEY', api_key)\n", | |
"xfile = urllib.request.urlopen(req).read()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"wb = load_workbook(filename = BytesIO(xfile))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<openpyxl.workbook.workbook.Workbook at 0x7fac4a8ec0a0>" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"wb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from werkzeug.datastructures import FileStorage\n", | |
"import tempfile" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def _cell2str(cell):\n", | |
" \"\"\"[summary]\n", | |
" Transforms an XLSX cell into string\n", | |
" Args:\n", | |
" cell ([type]): [description]\n", | |
" \"\"\"\n", | |
" c = cell.value or '' # Deal with None value in the cell\n", | |
" if type(c) != str:\n", | |
" return str(c) # Deal with any type that should be convertible\n", | |
" # WARNING that there are many more error sources when converting from excel,\n", | |
" # this deals with some of them but not all possible instances\n", | |
" try:\n", | |
" sc = str(c) # most well-encoded text should work correctly here\n", | |
" return sc\n", | |
" except:\n", | |
" # try to do some magic to deal with invalid encoding\n", | |
" # converting each character to raw string and printing that\n", | |
" return r''.join(c)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Obtain the content of the file" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Sheet1\n", | |
"Processing sheet: Sheet1\n" | |
] | |
} | |
], | |
"source": [ | |
"sheets = []\n", | |
"separator=','\n", | |
"for sheet in wb.worksheets:\n", | |
" row_len = 0\n", | |
" sname = sheet.title\n", | |
" print(sname)\n", | |
" print(\"Processing sheet: {}\".format(sname))\n", | |
" rows = []\n", | |
" for row in sheet.rows:\n", | |
" row_items = [_cell2str(c) for c in row]\n", | |
" # clean intermediate newline chars, join by separator and add a newline only at the end\n", | |
" rows.append(separator.join(row_items).replace('\\n', '')+'\\n')\n", | |
" rows = [r for r in rows if r]\n", | |
" sheets.append((sname, rows))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['Topic,Group,Sex,Age,Area,Year,Number,Percentage,ci_high,ci_low,citext,data,data2,,\\n',\n", | |
" 'Tobacco Smokers,Current tobacco smoker,All,All,,2016,867000,13.1,14.1,12.2,,,,,\\n',\n", | |
" 'Tobacco Smokers,Current tobacco smoker,All,18-29,,2016,101000,11.8,15.1,9.2,,,,,\\n',\n", | |
" 'Tobacco Smokers,Current tobacco smoker,All,30-44,,2016,414000,15.8,17.7,14.1,,,,,\\n',\n", | |
" 'Tobacco Smokers,Current tobacco smoker,All,45-59,,2016,291000,14.1,15.9,12.5,,,,,\\n',\n", | |
" 'Tobacco Smokers,Current tobacco smoker,All,60 years and older,,2016,61000,6.1,7.6,5,,,,,\\n',\n", | |
" 'Tobacco Smokers,Current daily tobacco smoker,All,All,,2016,516000,14.700000000000001,22.9,9.675,,,,,\\n',\n", | |
" 'Tobacco Smokers,Current daily tobacco smoker,All,18-29,,2016,58000,13.7,27.5,7,,,,,\\n',\n", | |
" 'Tobacco Smokers,Current daily tobacco smoker,All,30-44,,2016,215000,16.6,22.9,12.1,,,,,\\n',\n", | |
" 'Tobacco Smokers,Current daily tobacco smoker,All,45-59,,2016,200000,19.6,26.4,14.399999999999999,,,,,\\n']" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sheets[0][1][:10]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## create a temporary file storage" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"file = tempfile.NamedTemporaryFile('w+')\n", | |
"file.writelines(sheets[0][1])\n", | |
"file.seek(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[b'Topic,Group,Sex,Age,Area,Year,Number,Percentage,ci_high,ci_low,citext,data,data2,,\\n',\n", | |
" b'Tobacco Smokers,Current tobacco smoker,All,All,,2016,867000,13.1,14.1,12.2,,,,,\\n',\n", | |
" b'Tobacco Smokers,Current tobacco smoker,All,18-29,,2016,101000,11.8,15.1,9.2,,,,,\\n',\n", | |
" b'Tobacco Smokers,Current tobacco smoker,All,30-44,,2016,414000,15.8,17.7,14.1,,,,,\\n',\n", | |
" b'Tobacco Smokers,Current tobacco smoker,All,45-59,,2016,291000,14.1,15.9,12.5,,,,,\\n',\n", | |
" b'Tobacco Smokers,Current tobacco smoker,All,60 years and older,,2016,61000,6.1,7.6,5,,,,,\\n',\n", | |
" b'Tobacco Smokers,Current daily tobacco smoker,All,All,,2016,516000,14.700000000000001,22.9,9.675,,,,,\\n',\n", | |
" b'Tobacco Smokers,Current daily tobacco smoker,All,18-29,,2016,58000,13.7,27.5,7,,,,,\\n',\n", | |
" b'Tobacco Smokers,Current daily tobacco smoker,All,30-44,,2016,215000,16.6,22.9,12.1,,,,,\\n',\n", | |
" b'Tobacco Smokers,Current daily tobacco smoker,All,45-59,,2016,200000,19.6,26.4,14.399999999999999,,,,,\\n']" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"open(file.name, \"rb\").readlines()[:10]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Create a csv file" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"csv_file = FileStorage(filename=\"name.csv\", content_type=u'CSV', stream=BytesIO(open(file.name, \"rb\").read()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'name.csv'" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"csv_file.filename" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## File is then re-uploaded with `resource_create` action" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment