Skip to content

Instantly share code, notes, and snippets.

@versae
Created March 5, 2018 20:52
Show Gist options
  • Save versae/b4631fda466670206f240b7eb2712d45 to your computer and use it in GitHub Desktop.
Save versae/b4631fda466670206f240b7eb2712d45 to your computer and use it in GitHub Desktop.
NREGA scraping
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NREGA Scrapping\n",
"\n",
"## Setup the driver"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import time\n",
"import os\n",
"\n",
"import numpy as np\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.support.ui import Select\n",
"from selenium.webdriver.chrome.options import Options\n",
"from tqdm import tqdm\n",
"\n",
"download_directory = os.path.join(os.path.expanduser(\"~\"), \"Downloads\", \"nrega\")\n",
"\n",
"options = webdriver.ChromeOptions()\n",
"options.set_headless(headless=True)\n",
"options.add_argument(\"--incognito\")\n",
"options.add_argument(\"--disable-extensions\")\n",
"options.add_experimental_option(\"prefs\", {\n",
" \"download.default_directory\": download_directory,\n",
" \"download.prompt_for_download\": False,\n",
" \"download.directory_upgrade\": True,\n",
" \"safebrowsing.enabled\": True\n",
"})\n",
"# FirefoxProfile fxProfile = new FirefoxProfile();\n",
"# fxProfile.setPreference(\"browser.download.folderList\",2);\n",
"# fxProfile.setPreference(\"browser.download.manager.showWhenStarting\",false);\n",
"# fxProfile.setPreference(\"browser.download.dir\",\"c:\\\\mydownloads\");\n",
"# fxProfile.setPreference(\"browser.helperApps.neverAsk.saveToDisk\",\"text/csv\");"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def click_all(elements, delay=0):\n",
" for element in elements:\n",
" try:\n",
" element.location_once_scrolled_into_view # also scrolls to element\n",
" element.click()\n",
" except:\n",
" pass\n",
" time.sleep(delay)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def expand_click(lis, delay=0):\n",
" for li in lis:\n",
" click_all((element for element in li.find_elements_by_css_selector(\".accordion\")\n",
" if element.text.strip() != \"-\"), delay)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def select_by_id(element_id, value=None, text=None):\n",
" select_element = browser.find_element_by_id(element_id)\n",
" select_element.location_once_scrolled_into_view\n",
" if value:\n",
" Select(select_element).select_by_value(str(value))\n",
" if text:\n",
" Select(select_element).select_by_visible_text(str(text))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def enable_download_in_headless_chrome(browser, download_dir):\n",
" # https://bugs.chromium.org/p/chromium/issues/detail?id=696481#c86\n",
" # add missing support for chrome \"send_command\" to selenium webdriver\n",
" browser.command_executor._commands[\"send_command\"] = (\"POST\", '/session/$sessionId/chromium/send_command')\n",
" params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': download_dir}}\n",
" browser.execute(\"send_command\", params)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"browser = webdriver.Chrome(chrome_options=options)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Regions"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def select_regions(state_names=None, delay=0):\n",
" select_by_id(\"regionselect\", text=\"GP\")\n",
" container = browser.find_element_by_id(\"middlecontainer\")\n",
" if state_names:\n",
" states = [li for li in container.find_elements_by_class_name(\"statebox\") if li.text[2:] in state_names]\n",
" else:\n",
" states = [li for li in container.find_elements_by_class_name(\"statebox\")]\n",
" expand_click(states, delay)\n",
" # run it twice each since sometimes the requests fail\n",
" expand_click(container.find_elements_by_class_name(\"districtbox\"), delay)\n",
" expand_click(container.find_elements_by_class_name(\"districtbox\"), delay)\n",
" expand_click(container.find_elements_by_class_name(\"blockbox\"), delay)\n",
" expand_click(container.find_elements_by_class_name(\"blockbox\"), delay)\n",
" for li in states:\n",
" state_input = li.find_element_by_tag_name(\"input\")\n",
" state_input.click()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Indicators, years, and download"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def select_indicators(person_days=None, age_range=None, month=None):\n",
" for panel in (\"UpdatePanelmiddle\", \"UpdatePanelleft\", \"UpdatePanelright\"):\n",
" click_all(browser.find_element_by_id(panel).find_elements_by_css_selector(\"input\"))\n",
" if person_days:\n",
" select_by_id(\"DdlstTxtBox1\", text=person_days)\n",
" else:\n",
" click_all(browser.find_elements_by_id(\"TxtBox1\"))\n",
" if age_range:\n",
" select_by_id(\"DdlstTxtBox9\", text=age_range)\n",
" else:\n",
" click_all(browser.find_elements_by_id(\"TxtBox9\"))\n",
" if month:\n",
" for month_select in (\"DdlstTxtBox5\", \"DdlstTxtBox6\", \"DdlstTxtBox7\", \"DdlstTxtBox8\"):\n",
" select_by_id(month_select, text=month)\n",
" else:\n",
" for month_select in (\"TxtBox5\", \"TxtBox6\", \"TxtBox7\", \"TxtBox8\"):\n",
" click_all(browser.find_elements_by_id(month_select))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def select_year(year):\n",
" select_by_id(\"DdlstFinYear\", value=year)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Main"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total form submissions to make: 234234\n",
" 1. Andaman and Nicobar, 2011... "
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-10-081051bce1c9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 50\u001b[0;31m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindicators\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtotal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mm\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mm\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindicator_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclick_delay\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_delay\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-10-081051bce1c9>\u001b[0m in \u001b[0;36mmain\u001b[0;34m(indicators, url, total, click_delay, download_delay)\u001b[0m\n\u001b[1;32m 24\u001b[0m )\n\u001b[1;32m 25\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0mselect_regions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstate_name\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdelay\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mclick_delay\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0mlogs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbrowser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_log\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"browser\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0mis_severe\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-7-6ebc710968d3>\u001b[0m in \u001b[0;36mselect_regions\u001b[0;34m(state_names, delay)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mexpand_click\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_elements_by_class_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"districtbox\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdelay\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mexpand_click\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_elements_by_class_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"districtbox\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdelay\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mexpand_click\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_elements_by_class_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"blockbox\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdelay\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mexpand_click\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_elements_by_class_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"blockbox\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdelay\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mli\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstates\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-3-4bdfea6cc853>\u001b[0m in \u001b[0;36mexpand_click\u001b[0;34m(lis, delay)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mli\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m click_all((element for element in li.find_elements_by_css_selector(\".accordion\")\n\u001b[0;32m----> 4\u001b[0;31m if element.text.strip() != \"-\"), delay)\n\u001b[0m",
"\u001b[0;32m<ipython-input-2-25ea49488784>\u001b[0m in \u001b[0;36mclick_all\u001b[0;34m(elements, delay)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelay\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"url = \"http://nregarep2.nic.in/netnrega/dynamic2/DynamicReport_new4.aspx\"\n",
"state_names = ('Andaman and Nicobar', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Dadra & Nagar Haveli', 'Daman & Diu', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal')\n",
"years = [str(y) for y in range(2011, 2018)] # 2011 doesn't quite work\n",
"persons_days = (\"\", \">14\", \"1-10\", \"11-20\", \"21-30\", \"31-40\", \"41-50\", \"51-60\", \"61-70\", \"71-80\", \"81-99\", \"100\", \">100\")\n",
"age_ranges = (\"\", \"18-30 yrs\", \"30-40 yrs\", \"40-50 yrs\", \"50-60 yrs\", \">60 yrs\")\n",
"months = (\"\", \"April\", \"May\", \"June\", \"July\", \"August\", \"September\", \"October\", \"November\", \"December\", \"January\", \"February\", \"March\")\n",
"indicator_list = [state_names, years, months, persons_days, age_ranges]\n",
"indicators = itertools.product(*indicator_list)\n",
"\n",
"def main(indicators, url, total=None, click_delay=0.5, download_delay=5):\n",
" count = 0\n",
" print(f\"Total form submissions to make: {total or 0}\")\n",
" for indicator in indicators:\n",
" count += 1\n",
" log_set = set()\n",
" browser.get(url)\n",
" state_name, year, month, persons_day, age_range, *_ = indicator\n",
" enable_download_in_headless_chrome(browser, download_directory)\n",
" select_year(year)\n",
" select_indicators(persons_day, age_range, month)\n",
" header = \"{count:5}. {indicator}... \".format(\n",
" count=count,\n",
" indicator=\", \".join(filter(bool, indicator))\n",
" )\n",
" print(header, end=\"\")\n",
" select_regions([state_name], delay=click_delay)\n",
" logs = browser.get_log(\"browser\")\n",
" is_severe = False\n",
" if logs:\n",
" for log in logs:\n",
" if (log[\"level\"] == \"SEVERE\"\n",
" and \"favicon\" not in log[\"message\"]\n",
" and log[\"message\"] not in log_set):\n",
" is_severe = True\n",
" print(f\"\\n\\tERROR: {log['message']}\", end=\"\")\n",
" log_set.add(log[\"message\"])\n",
" if is_severe:\n",
" print(\"\\n\\t\", \"-\" * len(header), sep=\"\")\n",
" # download\n",
" browser.find_element_by_id(\"dwnldDummy\").click()\n",
" time.sleep(download_delay)\n",
" filepath = os.path.join(download_directory, \"report.xls\")\n",
" if os.path.isfile(filepath):\n",
" os.rename(filepath, os.path.join(download_directory, f\"{header[:-4].strip()}.xls\"))\n",
" print(\"OK\")\n",
" elif not is_severe:\n",
" print(\"FAIL\")\n",
" print()\n",
" \n",
"main(indicators, url, total=np.prod([m for m in map(len, indicator_list)]), click_delay=0.5, download_delay=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"browser.quit()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# coding: utf-8
# # NREGA Scraping
#
# ## Setup the driver
# In[1]:
import itertools
import time
import os
import numpy as np
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
# Folder Chrome is told to save downloaded reports into.
download_directory = os.path.join(os.path.expanduser("~"), "Downloads", "nrega")

# Chrome profile preferences: save into `download_directory` without prompting.
_chrome_prefs = {
    "download.default_directory": download_directory,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}

# Headless, incognito Chrome with extensions disabled.
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", _chrome_prefs)
options.set_headless(headless=True)
options.add_argument("--incognito")
options.add_argument("--disable-extensions")

# Firefox profile settings (Java API snippet), kept for reference only:
# FirefoxProfile fxProfile = new FirefoxProfile();
# fxProfile.setPreference("browser.download.folderList",2);
# fxProfile.setPreference("browser.download.manager.showWhenStarting",false);
# fxProfile.setPreference("browser.download.dir","c:\\mydownloads");
# fxProfile.setPreference("browser.helperApps.neverAsk.saveToDisk","text/csv");
# In[2]:
def click_all(elements, delay=0):
    """Scroll to and click each element, sleeping ``delay`` seconds after each.

    Clicking is best-effort: an element that raises while being scrolled to
    or clicked is skipped so the remaining elements are still processed.

    Args:
        elements: Iterable of objects exposing the
            ``location_once_scrolled_into_view`` attribute and ``click()``.
        delay: Seconds to sleep after every element, whether or not the
            click succeeded.
    """
    for element in elements:
        try:
            element.location_once_scrolled_into_view  # also scrolls to element
            element.click()
        except Exception:
            # Deliberately ignore per-element failures, but catch Exception
            # (not a bare except) so KeyboardInterrupt/SystemExit still
            # abort a long scraping run.
            pass
        time.sleep(delay)
# In[3]:
def expand_click(lis, delay=0):
    """Click the ``.accordion`` toggles found inside each given item.

    Toggles whose stripped text is exactly "-" are skipped (presumably they
    are already expanded -- confirm against the page). ``delay`` is passed
    through to ``click_all``.
    """
    for item in lis:
        toggles = item.find_elements_by_css_selector(".accordion")
        pending = (toggle for toggle in toggles if toggle.text.strip() != "-")
        click_all(pending, delay)
# In[4]:
def select_by_id(element_id, value=None, text=None):
    """Pick an option in the ``<select>`` element with the given id.

    Uses the module-level ``browser``. The element is scrolled into view
    first; then the option is chosen by ``value`` and/or by visible ``text``.
    Falsy arguments (``None``, ``""``, ``0``) are ignored, so calling with
    neither only looks the element up and scrolls to it.
    """
    dropdown = browser.find_element_by_id(element_id)
    dropdown.location_once_scrolled_into_view  # property access scrolls to it
    if value:
        by_value = Select(dropdown)
        by_value.select_by_value(str(value))
    if text:
        by_text = Select(dropdown)
        by_text.select_by_visible_text(str(text))
# In[5]:
def enable_download_in_headless_chrome(browser, download_dir):
    """Send ``Page.setDownloadBehavior`` so downloads go to ``download_dir``.

    Registers a ``send_command`` entry on the driver's command executor
    (selenium lacks it out of the box) and then executes it with behavior
    ``allow`` and the given download path.
    See https://bugs.chromium.org/p/chromium/issues/detail?id=696481#c86
    """
    endpoint = ("POST", '/session/$sessionId/chromium/send_command')
    browser.command_executor._commands["send_command"] = endpoint
    download_behavior = {'behavior': 'allow', 'downloadPath': download_dir}
    browser.execute(
        "send_command",
        {'cmd': 'Page.setDownloadBehavior', 'params': download_behavior},
    )
# In[6]:
# Start the Chrome session with the options built earlier; the helper
# functions in this file read this module-level `browser` directly.
browser = webdriver.Chrome(chrome_options=options)
# ## Regions
# In[7]:
def select_regions(state_names=None, delay=0):
    """Expand the region tree for the chosen states and tick their checkboxes.

    Sets the ``regionselect`` dropdown to "GP", expands the matching
    ``statebox`` items, then the ``districtbox`` and ``blockbox`` items, and
    finally clicks the first ``<input>`` of every selected state.

    Args:
        state_names: Collection of state names to keep; a state matches when
            its text minus the first two characters is in the collection.
            When falsy, every state is used.
        delay: Seconds to pause after each expansion click.
    """
    select_by_id("regionselect", text="GP")
    container = browser.find_element_by_id("middlecontainer")

    states = container.find_elements_by_class_name("statebox")
    if state_names:
        states = [item for item in states if item.text[2:] in state_names]
    else:
        states = list(states)
    expand_click(states, delay)

    # Each level is expanded twice since sometimes the requests fail; the
    # elements are looked up again on every pass.
    for level_class in ("districtbox", "blockbox"):
        for _ in range(2):
            expand_click(container.find_elements_by_class_name(level_class), delay)

    for item in states:
        item.find_element_by_tag_name("input").click()
# ## Indicators, years, and download
# In[8]:
def select_indicators(person_days=None, age_range=None, month=None):
    """Click every indicator input, then apply or click off the three filters.

    First clicks all ``<input>`` elements in the middle, left and right
    update panels. Then, for each filter, either picks the given visible
    text in its dropdown(s) or, when the argument is falsy, clicks the
    matching ``TxtBox*`` element(s) again (presumably un-ticking them --
    confirm against the page).

    Args:
        person_days: Visible text for the ``DdlstTxtBox1`` dropdown.
        age_range: Visible text for the ``DdlstTxtBox9`` dropdown.
        month: Visible text applied to ``DdlstTxtBox5`` .. ``DdlstTxtBox8``.
    """
    for panel_id in ("UpdatePanelmiddle", "UpdatePanelleft", "UpdatePanelright"):
        panel = browser.find_element_by_id(panel_id)
        click_all(panel.find_elements_by_css_selector("input"))

    # (chosen text, dropdown ids to set, element ids to click when unset)
    filters = (
        (person_days, ("DdlstTxtBox1",), ("TxtBox1",)),
        (age_range, ("DdlstTxtBox9",), ("TxtBox9",)),
        (
            month,
            ("DdlstTxtBox5", "DdlstTxtBox6", "DdlstTxtBox7", "DdlstTxtBox8"),
            ("TxtBox5", "TxtBox6", "TxtBox7", "TxtBox8"),
        ),
    )
    for chosen, dropdown_ids, checkbox_ids in filters:
        if chosen:
            for dropdown_id in dropdown_ids:
                select_by_id(dropdown_id, text=chosen)
        else:
            for checkbox_id in checkbox_ids:
                click_all(browser.find_elements_by_id(checkbox_id))
# In[9]:
def select_year(year):
    """Choose ``year`` by option value in the ``DdlstFinYear`` dropdown."""
    year_dropdown_id = "DdlstFinYear"
    select_by_id(year_dropdown_id, value=year)
# ## Main
# In[10]:
# Report form to drive.
url = "http://nregarep2.nic.in/netnrega/dynamic2/DynamicReport_new4.aspx"

# State names to iterate over, one form submission per state per combination.
state_names = (
    'Andaman and Nicobar', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam',
    'Bihar', 'Chhattisgarh', 'Dadra & Nagar Haveli', 'Daman & Diu', 'Goa',
    'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir',
    'Jharkhand', 'Karnataka', 'Kerala', 'Lakshadweep', 'Madhya Pradesh',
    'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha',
    'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Tripura',
    'Uttar Pradesh', 'Uttarakhand', 'West Bengal',
)

# Years 2011..2017 as strings. 2011 doesn't quite work.
years = list(map(str, range(2011, 2018)))

# In the three filter tuples the leading "" stands for "no filter chosen".
persons_days = (
    "", ">14", "1-10", "11-20", "21-30", "31-40", "41-50",
    "51-60", "61-70", "71-80", "81-99", "100", ">100",
)
age_ranges = ("", "18-30 yrs", "30-40 yrs", "40-50 yrs", "50-60 yrs", ">60 yrs")
months = (
    "", "April", "May", "June", "July", "August", "September",
    "October", "November", "December", "January", "February", "March",
)

# Order matters: main() unpacks each combination as
# (state, year, month, persons_day, age_range).
indicator_list = [state_names, years, months, persons_days, age_ranges]
# Lazy cartesian product of all the lists above (one-shot iterator).
indicators = itertools.product(*indicator_list)
def main(indicators, url, total=None, click_delay=0.5, download_delay=5):
    """Submit the report form once per indicator combination and save each file.

    For every combination the page is reloaded, the year, filters and regions
    are selected, SEVERE browser-console messages are printed, and the
    download button is clicked. A ``report.xls`` found in
    ``download_directory`` after ``download_delay`` seconds is renamed after
    the combination's printed header and "OK" is printed; otherwise "FAIL"
    is printed unless severe errors were already reported.

    Uses the module-level ``browser`` and ``download_directory``.

    Args:
        indicators: Iterable of tuples whose first five items are
            (state_name, year, month, persons_day, age_range).
        url: Address of the report form.
        total: Number of combinations, used only for the opening message.
        click_delay: Seconds to pause after each region-tree click.
        download_delay: Seconds to wait for the download after clicking.
    """
    print(f"Total form submissions to make: {total or 0}")
    filepath = os.path.join(download_directory, "report.xls")
    for count, indicator in enumerate(indicators, start=1):
        log_set = set()
        browser.get(url)
        state_name, year, month, persons_day, age_range, *_ = indicator
        enable_download_in_headless_chrome(browser, download_directory)
        select_year(year)
        select_indicators(persons_day, age_range, month)
        header = "{count:5}. {indicator}... ".format(
            count=count,
            # Empty filter values are left out of the label.
            indicator=", ".join(filter(bool, indicator))
        )
        print(header, end="")
        select_regions([state_name], delay=click_delay)

        # Print each distinct SEVERE console message once, ignoring favicon noise.
        is_severe = False
        for log in browser.get_log("browser") or ():
            message = log["message"]
            if (log["level"] == "SEVERE"
                    and "favicon" not in message
                    and message not in log_set):
                is_severe = True
                print(f"\n\tERROR: {message}", end="")
                log_set.add(message)
        if is_severe:
            print("\n\t", "-" * len(header), sep="")

        # A report.xls left over from an earlier download that finished after
        # its wait expired would otherwise be renamed under this
        # combination's name, so remove it before downloading.
        try:
            os.remove(filepath)
        except FileNotFoundError:
            pass

        # download
        browser.find_element_by_id("dwnldDummy").click()
        time.sleep(download_delay)
        if os.path.isfile(filepath):
            # header[:-4] drops the trailing "... " from the printed label.
            os.rename(filepath, os.path.join(download_directory, f"{header[:-4].strip()}.xls"))
            print("OK")
        elif not is_severe:
            print("FAIL")
        print()
# Run the whole scrape. `total` is the size of the cartesian product, i.e. the
# product of the lengths of the individual indicator lists.
try:
    main(
        indicators,
        url,
        total=np.prod(list(map(len, indicator_list))),
        click_delay=0.5,
        download_delay=5,
    )
finally:
    # In[ ]:
    # Always shut the browser down, even when the run is interrupted or
    # fails, so no headless Chrome process is left behind.
    browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment