lorinc/docugen.ipynb

## docugen.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f4f25983-fff1-43b6-967b-8ba3bf5ec285",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 1. installing required python libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdf271cd-aca3-414c-acb2-bf1963911292",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "%pip install jinja2 jsonschema"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94c0206b-b530-46a8-8633-6343a8293d01",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 2. generating documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6365bc98-5dd2-413d-af60-375bc9775db4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# merging user data and template into html files\n",
    "\n",
    "import json\n",
    "from jinja2 import Template\n",
    "\n",
    "################################################################# read\n",
    "\n",
    "def get_delegation_data(file_str='../input/delegate.json'):\n",
    "    \"\"\"reads the json dump file and returns as json variable\"\"\"\n",
    "    with open(file_str, 'r', encoding='utf-8') as f:\n",
    "        return json.load(f)['data']['delegation_processes']\n",
    "\n",
    "\n",
    "def drop_old_runs(delegation_data):\n",
    "    max_delegation_round = max([e['round'] for e in delegation_data])\n",
    "    for run in delegation_data:\n",
    "        if run['round'] == max_delegation_round:\n",
    "            yield run\n",
    "\n",
    "################################################################# validate\n",
    "\n",
    "def test(json_ref):\n",
    "    if json_ref not in [None, '', [], {}]:\n",
    "        return json_ref\n",
    "    else:\n",
    "        return None\n",
    "\n",
    "\n",
    "def is_valid_user(user_json):\n",
    "    \"\"\"very-very forgiving data quality check\"\"\"\n",
    "    try:\n",
    "        if all([\n",
    "            test(user_json['user_detail']['legal_name']),\n",
    "            test(user_json['user_detail']['personal_identity_num']),\n",
    "            test(user_json['user_addresses'][0]),\n",
    "            test(user_json['user_addresses'][0]['postal_cd']),\n",
    "            test(user_json['user_addresses'][0]['city_name']),\n",
    "            test(user_json['user_addresses'][0]['street_name'])       \n",
    "        ]):\n",
    "            return True\n",
    "        else:\n",
    "            return False\n",
    "    except:\n",
    "        return False\n",
    "\n",
    "        \n",
    "def is_valid_analog(analog_json):\n",
    "    \"\"\"very-very forgiving data quality check\"\"\"\n",
    "    if all([\n",
    "        test(analog_json['full_name']),\n",
    "        test(analog_json['identity_num']),\n",
    "        test(analog_json['address'])\n",
    "    ]):\n",
    "        return True\n",
    "    else:\n",
    "        return False\n",
    "    \n",
    "\n",
    "def user_address_builder(user_json):\n",
    "    \"\"\"building a string from things that might or might not be there\"\"\"\n",
    "    return ' '.join([\n",
    "        test(user_json['user_addresses'][0]['postal_cd']) or '',\n",
    "        test(user_json['user_addresses'][0]['city_name']) or '',\n",
    "        \",\",\n",
    "        test(user_json['user_addresses'][0]['street_name']) or '',\n",
    "        (\n",
    "            test(user_json['user_addresses'][0]['street_type']['comment'])\n",
    "            if (\n",
    "                test(user_json['user_addresses'][0]['street_type']\n",
    "                and test(user_json['user_addresses'][0]['street_type']['comment']))\n",
    "            ) else ''\n",
    "        ),\n",
    "        test(user_json['user_addresses'][0]['street_num']) or ''\n",
    "    ])\n",
    "\n",
    "################################################################# build\n",
    "\n",
    "def doc_builder(delegation_run_data):\n",
    "    for run_town in delegation_run_data:\n",
    "        fileload = {'filename':f\"{run_town['town_id']}.html\",\n",
    "                    'name' : run_town['town']['town_name'], \n",
    "                    'delegees' : []}\n",
    "\n",
    "        for voting_location in run_town['town']['voting_locations']:\n",
    "            if voting_location['user_2_voting_locations']:\n",
    "                for user_location in voting_location['user_2_voting_locations']:\n",
    "\n",
    "                    if is_valid_user(user_location['user']):\n",
    "                        fileload['delegees'].append(\n",
    "                            {\n",
    "                                'name':user_location['user']['user_detail']['legal_name'],\n",
    "                                'id':user_location['user']['user_detail']['personal_identity_num'],\n",
    "                                'address':user_address_builder(user_location['user']),\n",
    "                                'email_address':user_location['user']['email_address'],\n",
    "                                'phone_num':user_location['user']['phone_num'],\n",
    "                                'zone':voting_location['location_number']\n",
    "                            }\n",
    "                        )\n",
    "\n",
    "                    elif is_valid_analog(user_location['analog_user']):\n",
    "                         fileload['delegees'].append(\n",
    "                             {\n",
    "                                'name':user_location['analog_user']['full_name'],\n",
    "                                'id':user_location['analog_user']['identity_num'],\n",
    "                                'address':user_location['analog_user']['address'],\n",
    "                                'email_address':user_location['analog_user']['email_address'],\n",
    "                                'phone_num':user_location['analog_user']['phone_num'],\n",
    "                                'zone':voting_location['location_number']\n",
    "                             }\n",
    "                         )                       \n",
    "                    else:\n",
    "                        pass # this should be error handling\n",
    "            else:\n",
    "                pass # empty delegation letter\n",
    "            \n",
    "        yield fileload\n",
    "\n",
    "################################################################# merge\n",
    "\n",
    "def read_letter_template(file_str='../input/megbizolevel_jinja_template.html'):\n",
    "    with open(file_str, 'r', encoding='utf-8') as f:\n",
    "        template = Template(f.read())\n",
    "    return template\n",
    "\n",
    "\n",
    "def generate_documents(fileload, jinja_template):\n",
    "    path = \"../output/\"    \n",
    "    for town in fileload:\n",
    "        if town['delegees']:\n",
    "            document = jinja_template.render(town = town)\n",
    "            with open(path+town['filename'], 'w', encoding='utf-8') as f:\n",
    "                    f.write(document)\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee212b73-77e3-4f43-8c3e-4ee2b2044728",
   "metadata": {},
   "outputs": [],
   "source": [
    "generate_documents(\n",
    "    doc_builder(\n",
    "        drop_old_runs(\n",
    "            get_delegation_data()\n",
    "        )\n",
    "    ),\n",
    "    read_letter_template()\n",
    ") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1727c07-4bdb-4fc8-9589-06c708708dce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# transforming the json array to a tabular report\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "report = pd.json_normalize(\n",
    "    doc_builder(\n",
    "        drop_old_runs(\n",
    "            get_delegation_data()\n",
    "        )\n",
    "    )\n",
    ")\n",
    "\n",
    "report = report.join(\n",
    "    pd.DataFrame(\n",
    "        report.delegees.explode()\n",
    "    ), rsuffix='_exploded'\n",
    ").drop(\n",
    "    columns=['delegees']\n",
    ")\n",
    "\n",
    "report = report.join(\n",
    "    pd.json_normalize(\n",
    "        report.delegees_exploded\n",
    "    ).set_index(\n",
    "        report.index\n",
    "    ), rsuffix='_usr'\n",
    ").drop(\n",
    "    columns=['delegees_exploded']\n",
    ").drop_duplicates()\n",
    "\n",
    "report.to_csv('../output/delegalas_report.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "887fc299-7ca0-4b20-b641-5aff5f97b5a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 399 entries, but not all of them have delegees\n",
    "%ls /home/jupyter/20k_docgen/output/ | wc -l"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c18b5333-c620-4113-80d2-6f9a4c32f700",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 3. installing libreoffice writer for pdf conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a282c4c-69c1-4930-b847-7a36eef4a0ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "!apt-get -qq install -y libreoffice-writer"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50bc0d4c-dc6a-4ddb-b181-8c2efa96e33d",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 4. converting HTMLs to PDFs and zipping them for download\n",
    "*(did you clean the output folder before running the generation script??!)*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea44aa72-9766-4e20-9340-91130dd8bd57",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "cd /home/jupyter/20k_docgen/output/\n",
    "for f in *.html ; do lowriter --headless --convert-to pdf \"$f\" ; done\n",
    "zip /home/jupyter/20k_docgen/pub/500_megbizo_html.zip /home/jupyter/20k_docgen/output/*.pdf"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "f4f25983-fff1-43b6-967b-8ba3bf5ec285",
	"metadata": {
	"tags": []
	},
	"source": [
	"## 1. installing required python libraries"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "cdf271cd-aca3-414c-acb2-bf1963911292",
	"metadata": {},
	"outputs": [],
	"source": [
	"%%capture\n",
	"%pip install jinja2 jsonschema"
	]
	},
	{
	"cell_type": "markdown",
	"id": "94c0206b-b530-46a8-8633-6343a8293d01",
	"metadata": {
	"tags": []
	},
	"source": [
	"## 2. generating documents"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "6365bc98-5dd2-413d-af60-375bc9775db4",
	"metadata": {},
	"outputs": [],
	"source": [
	"# merging user data and template into html files\n",
	"\n",
	"import json\n",
	"from jinja2 import Template\n",
	"\n",
	"################################################################# read\n",
	"\n",
	"def get_delegation_data(file_str='../input/delegate.json'):\n",
	" \"\"\"reads the json dump file and returns as json variable\"\"\"\n",
	" with open(file_str, 'r', encoding='utf-8') as f:\n",
	" return json.load(f)['data']['delegation_processes']\n",
	"\n",
	"\n",
	"def drop_old_runs(delegation_data):\n",
	" max_delegation_round = max([e['round'] for e in delegation_data])\n",
	" for run in delegation_data:\n",
	" if run['round'] == max_delegation_round:\n",
	" yield run\n",
	"\n",
	"################################################################# validate\n",
	"\n",
	"def test(json_ref):\n",
	" if json_ref not in [None, '', [], {}]:\n",
	" return json_ref\n",
	" else:\n",
	" return None\n",
	"\n",
	"\n",
	"def is_valid_user(user_json):\n",
	" \"\"\"very-very forgiving data quality check\"\"\"\n",
	" try:\n",
	" if all([\n",
	" test(user_json['user_detail']['legal_name']),\n",
	" test(user_json['user_detail']['personal_identity_num']),\n",
	" test(user_json['user_addresses'][0]),\n",
	" test(user_json['user_addresses'][0]['postal_cd']),\n",
	" test(user_json['user_addresses'][0]['city_name']),\n",
	" test(user_json['user_addresses'][0]['street_name']) \n",
	" ]):\n",
	" return True\n",
	" else:\n",
	" return False\n",
	" except:\n",
	" return False\n",
	"\n",
	" \n",
	"def is_valid_analog(analog_json):\n",
	" \"\"\"very-very forgiving data quality check\"\"\"\n",
	" if all([\n",
	" test(analog_json['full_name']),\n",
	" test(analog_json['identity_num']),\n",
	" test(analog_json['address'])\n",
	" ]):\n",
	" return True\n",
	" else:\n",
	" return False\n",
	" \n",
	"\n",
	"def user_address_builder(user_json):\n",
	" \"\"\"building a string from things that might or might not be there\"\"\"\n",
	" return ' '.join([\n",
	" test(user_json['user_addresses'][0]['postal_cd']) or '',\n",
	" test(user_json['user_addresses'][0]['city_name']) or '',\n",
	" \",\",\n",
	" test(user_json['user_addresses'][0]['street_name']) or '',\n",
	" (\n",
	" test(user_json['user_addresses'][0]['street_type']['comment'])\n",
	" if (\n",
	" test(user_json['user_addresses'][0]['street_type']\n",
	" and test(user_json['user_addresses'][0]['street_type']['comment']))\n",
	" ) else ''\n",
	" ),\n",
	" test(user_json['user_addresses'][0]['street_num']) or ''\n",
	" ])\n",
	"\n",
	"################################################################# build\n",
	"\n",
	"def doc_builder(delegation_run_data):\n",
	" for run_town in delegation_run_data:\n",
	" fileload = {'filename':f\"{run_town['town_id']}.html\",\n",
	" 'name' : run_town['town']['town_name'], \n",
	" 'delegees' : []}\n",
	"\n",
	" for voting_location in run_town['town']['voting_locations']:\n",
	" if voting_location['user_2_voting_locations']:\n",
	" for user_location in voting_location['user_2_voting_locations']:\n",
	"\n",
	" if is_valid_user(user_location['user']):\n",
	" fileload['delegees'].append(\n",
	" {\n",
	" 'name':user_location['user']['user_detail']['legal_name'],\n",
	" 'id':user_location['user']['user_detail']['personal_identity_num'],\n",
	" 'address':user_address_builder(user_location['user']),\n",
	" 'email_address':user_location['user']['email_address'],\n",
	" 'phone_num':user_location['user']['phone_num'],\n",
	" 'zone':voting_location['location_number']\n",
	" }\n",
	" )\n",
	"\n",
	" elif is_valid_analog(user_location['analog_user']):\n",
	" fileload['delegees'].append(\n",
	" {\n",
	" 'name':user_location['analog_user']['full_name'],\n",
	" 'id':user_location['analog_user']['identity_num'],\n",
	" 'address':user_location['analog_user']['address'],\n",
	" 'email_address':user_location['analog_user']['email_address'],\n",
	" 'phone_num':user_location['analog_user']['phone_num'],\n",
	" 'zone':voting_location['location_number']\n",
	" }\n",
	" ) \n",
	" else:\n",
	" pass # this should be error handling\n",
	" else:\n",
	" pass # empty delegation letter\n",
	" \n",
	" yield fileload\n",
	"\n",
	"################################################################# merge\n",
	"\n",
	"def read_letter_template(file_str='../input/megbizolevel_jinja_template.html'):\n",
	" with open(file_str, 'r', encoding='utf-8') as f:\n",
	" template = Template(f.read())\n",
	" return template\n",
	"\n",
	"\n",
	"def generate_documents(fileload, jinja_template):\n",
	" path = \"../output/\" \n",
	" for town in fileload:\n",
	" if town['delegees']:\n",
	" document = jinja_template.render(town = town)\n",
	" with open(path+town['filename'], 'w', encoding='utf-8') as f:\n",
	" f.write(document)\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ee212b73-77e3-4f43-8c3e-4ee2b2044728",
	"metadata": {},
	"outputs": [],
	"source": [
	"generate_documents(\n",
	" doc_builder(\n",
	" drop_old_runs(\n",
	" get_delegation_data()\n",
	" )\n",
	" ),\n",
	" read_letter_template()\n",
	") "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "a1727c07-4bdb-4fc8-9589-06c708708dce",
	"metadata": {},
	"outputs": [],
	"source": [
	"# transforming the json array to a tabular report\n",
	"\n",
	"import pandas as pd\n",
	"\n",
	"report = pd.json_normalize(\n",
	" doc_builder(\n",
	" drop_old_runs(\n",
	" get_delegation_data()\n",
	" )\n",
	" )\n",
	")\n",
	"\n",
	"report = report.join(\n",
	" pd.DataFrame(\n",
	" report.delegees.explode()\n",
	" ), rsuffix='_exploded'\n",
	").drop(\n",
	" columns=['delegees']\n",
	")\n",
	"\n",
	"report = report.join(\n",
	" pd.json_normalize(\n",
	" report.delegees_exploded\n",
	" ).set_index(\n",
	" report.index\n",
	" ), rsuffix='_usr'\n",
	").drop(\n",
	" columns=['delegees_exploded']\n",
	").drop_duplicates()\n",
	"\n",
	"report.to_csv('../output/delegalas_report.csv')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "887fc299-7ca0-4b20-b641-5aff5f97b5a6",
	"metadata": {},
	"outputs": [],
	"source": [
	"# 399 entries, but not all of them have delegees\n",
	"%ls /home/jupyter/20k_docgen/output/ \| wc -l"
	]
	},
	{
	"cell_type": "markdown",
	"id": "c18b5333-c620-4113-80d2-6f9a4c32f700",
	"metadata": {
	"tags": []
	},
	"source": [
	"## 3. installing libreoffice writer for pdf conversion"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4a282c4c-69c1-4930-b847-7a36eef4a0ce",
	"metadata": {},
	"outputs": [],
	"source": [
	"%%capture\n",
	"!apt-get -qq install -y libreoffice-writer"
	]
	},
	{
	"cell_type": "markdown",
	"id": "50bc0d4c-dc6a-4ddb-b181-8c2efa96e33d",
	"metadata": {
	"tags": []
	},
	"source": [
	"## 4. converting HTMLs to PDFs and zipping them for download\n",
	"(did you clean the output folder before running the generation script??!)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ea44aa72-9766-4e20-9340-91130dd8bd57",
	"metadata": {},
	"outputs": [],
	"source": [
	"%%bash\n",
	"cd /home/jupyter/20k_docgen/output/\n",
	"for f in *.html ; do lowriter --headless --convert-to pdf \"$f\" ; done\n",
	"zip /home/jupyter/20k_docgen/pub/500_megbizo_html.zip /home/jupyter/20k_docgen/output/*.pdf"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}