Created
March 8, 2022 12:36
-
-
Save lorinc/cd8bbc376753b60325c4c61424005eaf to your computer and use it in GitHub Desktop.
IPython notebook that I used to prototype the generation of the 3200 formal delegation letters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "f4f25983-fff1-43b6-967b-8ba3bf5ec285", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"## 1. installing required python libraries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "cdf271cd-aca3-414c-acb2-bf1963911292", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%capture\n", | |
"%pip install jinja2 jsonschema" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "94c0206b-b530-46a8-8633-6343a8293d01", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"## 2. generating documents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "6365bc98-5dd2-413d-af60-375bc9775db4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# merging user data and template into html files\n", | |
"\n", | |
"import json\n", | |
"from jinja2 import Template\n", | |
"\n", | |
"################################################################# read\n", | |
"\n", | |
def get_delegation_data(file_str='../input/delegate.json'):
    """Load the JSON dump at ``file_str`` and return its delegation processes.

    The file is read as UTF-8; the returned value is whatever the parsed
    document stores under ``data`` -> ``delegation_processes``.
    """
    with open(file_str, mode='r', encoding='utf-8') as dump_file:
        parsed_dump = json.load(dump_file)
    return parsed_dump['data']['delegation_processes']
"\n", | |
"\n", | |
def drop_old_runs(delegation_data):
    """Yield only the delegation runs that belong to the latest round.

    Args:
        delegation_data: iterable of run dicts, each carrying a comparable
            value under ``'round'``.

    Yields:
        The runs whose ``'round'`` equals the highest round present, in
        their original order. Nothing is yielded for empty input.
    """
    # Materialise first: the data is scanned twice (max, then filter), and a
    # one-shot iterator would be exhausted after the first scan.
    runs = list(delegation_data)
    if not runs:
        # max() raises ValueError on an empty sequence.
        return
    latest_round = max(run['round'] for run in runs)
    for run in runs:
        if run['round'] == latest_round:
            yield run
"\n", | |
"################################################################# validate\n", | |
"\n", | |
"def test(json_ref):\n", | |
" if json_ref not in [None, '', [], {}]:\n", | |
" return json_ref\n", | |
" else:\n", | |
" return None\n", | |
"\n", | |
"\n", | |
def is_valid_user(user_json):
    """Very forgiving data quality check for a registered user record.

    A user is valid when the legal name, the personal identity number and
    the postal code, city name and street name of the first address are all
    present and non-empty.

    Args:
        user_json: the user dict from the delegation dump (may be ``None``).

    Returns:
        ``True`` when every required field is filled in, ``False`` otherwise,
        including when a key, the address list or the record itself is
        missing.
    """
    try:
        detail = user_json['user_detail']
        first_address = user_json['user_addresses'][0]
        required = (
            detail['legal_name'],
            detail['personal_identity_num'],
            first_address,
            first_address['postal_cd'],
            first_address['city_name'],
            first_address['street_name'],
        )
    except (KeyError, IndexError, TypeError):
        # Missing key, empty address list, or a None/non-dict level in the
        # record. Only lookup failures are treated as "invalid"; a bare
        # except would also hide KeyboardInterrupt and programming errors.
        return False
    # Truthiness flags None, '', [] and {} (the empty JSON values) as missing.
    return all(required)
"\n", | |
" \n", | |
def is_valid_analog(analog_json):
    """Very forgiving data quality check for an analog user record.

    The record is valid when ``'full_name'``, ``'identity_num'`` and
    ``'address'`` are all present and non-empty.

    Args:
        analog_json: the analog user dict from the delegation dump (may be
            ``None``).

    Returns:
        ``True`` when every required field is filled in, ``False`` otherwise,
        including when a key or the record itself is missing.
    """
    try:
        required = (
            analog_json['full_name'],
            analog_json['identity_num'],
            analog_json['address'],
        )
    except (KeyError, TypeError):
        # A missing key or a None/non-dict record is simply an invalid
        # record, the same way is_valid_user treats incomplete users.
        return False
    # Truthiness flags None, '', [] and {} (the empty JSON values) as missing.
    return all(required)
" \n", | |
"\n", | |
def user_address_builder(user_json):
    """Build a one-line postal address from the user's first address record.

    The parts are joined with single spaces in the order postal code, city
    name, a literal comma, street name, street type and street number. A
    part that is missing or empty is replaced by an empty string, so the
    separating spaces around it stay in the result.

    Args:
        user_json: user dict with a non-empty ``'user_addresses'`` list.

    Returns:
        The address as a single string.

    Raises:
        KeyError, IndexError: when the user has no address record at all.
    """
    address = user_json['user_addresses'][0]

    def part(value):
        # Empty values become ''; anything else is stringified so that a
        # numeric postal code or house number cannot break str.join.
        return str(value) if value else ''

    # 'street_type' is an optional nested object whose 'comment' value is
    # used as the street type text; the key may be absent or hold None.
    street_type = address.get('street_type')
    street_type_text = street_type.get('comment') if isinstance(street_type, dict) else None

    return ' '.join([
        part(address.get('postal_cd')),
        part(address.get('city_name')),
        ",",
        part(address.get('street_name')),
        part(street_type_text),
        part(address.get('street_num')),
    ])
"\n", | |
"################################################################# build\n", | |
"\n", | |
def doc_builder(delegation_run_data):
    """Yield one letter payload per town, listing its valid delegees.

    For every town the voting locations are scanned and each assignment in
    ``'user_2_voting_locations'`` is turned into a delegee entry, taken from
    the registered ``'user'`` when it passes ``is_valid_user`` and otherwise
    from the ``'analog_user'`` when it passes ``is_valid_analog``.
    Assignments with neither are skipped and reported with a log warning.

    Args:
        delegation_run_data: iterable of run dicts with ``'town_id'`` and a
            ``'town'`` dict holding ``'town_name'`` and ``'voting_locations'``.

    Yields:
        dict with ``'filename'`` (``<town_id>.html``), ``'name'`` (the town
        name) and ``'delegees'``, a list of dicts with the keys ``name``,
        ``id``, ``address``, ``email_address``, ``phone_num`` and ``zone``.
        Towns without any valid delegee are yielded with an empty list.
    """
    import logging

    log = logging.getLogger(__name__)

    for run_town in delegation_run_data:
        fileload = {
            'filename': f"{run_town['town_id']}.html",
            'name': run_town['town']['town_name'],
            'delegees': [],
        }

        for voting_location in run_town['town']['voting_locations']:
            # A location without assignments holds None or an empty list.
            assignments = voting_location.get('user_2_voting_locations') or []
            for user_location in assignments:
                user = user_location.get('user')
                analog = user_location.get('analog_user')

                if is_valid_user(user):
                    fileload['delegees'].append({
                        'name': user['user_detail']['legal_name'],
                        'id': user['user_detail']['personal_identity_num'],
                        'address': user_address_builder(user),
                        # Contact details are not part of the validity check,
                        # so they may be absent from the record.
                        'email_address': user.get('email_address'),
                        'phone_num': user.get('phone_num'),
                        'zone': voting_location['location_number'],
                    })
                elif analog and is_valid_analog(analog):
                    fileload['delegees'].append({
                        'name': analog['full_name'],
                        'id': analog['identity_num'],
                        'address': analog['address'],
                        'email_address': analog.get('email_address'),
                        'phone_num': analog.get('phone_num'),
                        'zone': voting_location['location_number'],
                    })
                else:
                    # Keep the run going, but leave a trace of the dropped
                    # record; no personal data is written to the log.
                    log.warning(
                        "skipping delegee without valid user or analog data "
                        "(town_id=%s, location_number=%s)",
                        run_town['town_id'],
                        voting_location.get('location_number'),
                    )

        yield fileload
"\n", | |
"################################################################# merge\n", | |
"\n", | |
def read_letter_template(file_str='../input/megbizolevel_jinja_template.html'):
    """Read the letter template at ``file_str`` and compile it with Jinja2.

    Args:
        file_str: path of the UTF-8 encoded HTML template.

    Returns:
        A ``jinja2.Template`` ready to be rendered.
    """
    with open(file_str, 'r', encoding='utf-8') as f:
        # The template is filled with personal data (names, addresses,
        # e-mail addresses) and the result is HTML, so characters such as
        # '&' or '<' in the data must be escaped instead of being emitted
        # as markup.
        template = Template(f.read(), autoescape=True)
    return template
"\n", | |
"\n", | |
def generate_documents(fileload, jinja_template, path="../output/"):
    """Render one HTML letter per town and write it into ``path``.

    Args:
        fileload: iterable of town payload dicts with ``'filename'`` and
            ``'delegees'``; each dict is passed to the template as ``town``.
        jinja_template: object with a ``render(town=...)`` method returning
            the document text.
        path: folder the files are written to; created when missing.

    Towns whose ``'delegees'`` list is empty are skipped, so no file is
    written for them.
    """
    import os

    # open() does not create folders; make sure the target exists first.
    os.makedirs(path, exist_ok=True)
    for town in fileload:
        if not town['delegees']:
            continue
        document = jinja_template.render(town=town)
        with open(os.path.join(path, town['filename']), 'w', encoding='utf-8') as f:
            f.write(document)
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "ee212b73-77e3-4f43-8c3e-4ee2b2044728", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Pipeline: latest-round runs -> per-town payloads -> rendered HTML letters.
latest_runs = drop_old_runs(get_delegation_data())
town_payloads = doc_builder(latest_runs)
generate_documents(town_payloads, read_letter_template())
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "a1727c07-4bdb-4fc8-9589-06c708708dce", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# transforming the json array to a tabular report

import pandas as pd

# One row per town; the 'delegees' column still holds the list of delegee dicts.
towns = pd.json_normalize(
    list(
        doc_builder(
            drop_old_runs(
                get_delegation_data()
            )
        )
    )
)

# One entry per delegee, labelled with the row index of its town. A town
# with an empty delegee list contributes a single NaN entry, which becomes
# a row with empty delegee columns.
delegees = towns['delegees'].explode()
delegee_columns = pd.json_normalize(
    delegees.tolist()
).set_index(
    delegees.index
)

# Joining the unique town index against the per-delegee index yields exactly
# one row per delegee. The delegee's own 'name' collides with the town
# 'name' and is therefore stored as 'name_usr'. drop_duplicates collapses
# identical delegee records that are listed more than once for a town.
report = towns.drop(
    columns=['delegees']
).join(
    delegee_columns, rsuffix='_usr'
).drop_duplicates()

report.to_csv('../output/delegalas_report.csv')
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "887fc299-7ca0-4b20-b641-5aff5f97b5a6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# expect fewer files than the 399 entries: entries without delegees get no letter\n", | |
"%ls /home/jupyter/20k_docgen/output/ | wc -l" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c18b5333-c620-4113-80d2-6f9a4c32f700", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"## 3. installing libreoffice writer for pdf conversion" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "4a282c4c-69c1-4930-b847-7a36eef4a0ce", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%capture\n", | |
"!apt-get -qq install -y libreoffice-writer" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "50bc0d4c-dc6a-4ddb-b181-8c2efa96e33d", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"## 4. converting HTMLs to PDFs and zipping them for download\n", | |
"*(Clean the output folder before running the generation script: every HTML file found there is converted and zipped.)*" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "ea44aa72-9766-4e20-9340-91130dd8bd57", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
%%bash
# Convert every generated HTML letter to PDF and bundle the PDFs for download.
# Abort on the first error so a failed `cd` cannot run the conversion in
# whatever directory the cell happened to start in.
set -euo pipefail

cd /home/jupyter/20k_docgen/output/

# Without nullglob an empty folder would hand the literal "*.html" to lowriter.
shopt -s nullglob
html_files=(*.html)
if (( ${#html_files[@]} == 0 )); then
    echo "no HTML files found in $PWD" >&2
    exit 1
fi

for f in "${html_files[@]}" ; do lowriter --headless --convert-to pdf "$f" ; done

# zip updates an existing archive in place; remove the old one so PDFs from
# earlier runs do not stay in the download.
rm -f /home/jupyter/20k_docgen/pub/500_megbizo_html.zip
zip /home/jupyter/20k_docgen/pub/500_megbizo_html.zip /home/jupyter/20k_docgen/output/*.pdf
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment