Skip to content

Instantly share code, notes, and snippets.

@tdsmith
Created February 26, 2024 01:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tdsmith/60006dfc03cf63a5756c76048be8ad5e to your computer and use it in GitHub Desktop.
Save tdsmith/60006dfc03cf63a5756c76048be8ad5e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "28a9d190-a0db-4587-8cb3-a771c37ba7b6",
"metadata": {},
"outputs": [],
"source": [
"from typing import Iterable, Mapping\n",
"\n",
"import attrs\n",
"from frozendict import frozendict\n",
"from selenium import webdriver\n",
"from selenium.webdriver.remote.webdriver import BaseWebDriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support import expected_conditions\n",
"from selenium.webdriver.support.wait import WebDriverWait"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b50f217-0086-4656-ae4d-a637b2d03935",
"metadata": {},
"outputs": [],
"source": [
"@attrs.frozen\n",
"class CellValue:\n",
"    \"\"\"Immutable record of one table cell: its visible text and, if present, its link target.\"\"\"\n",
"\n",
"    # Text content of the cell\n",
"    text: str\n",
"    # href of the first link inside the cell, or None when the cell has no link\n",
"    href: str | None\n",
"\n",
"\n",
"@attrs.define()\n",
"class CouncilMeetingSite:\n",
"    \"\"\"Page object that drives the council meeting search site at covapp.vancouver.ca.\n",
"\n",
"    Creating an instance immediately navigates ``driver`` to the site.\n",
"    \"\"\"\n",
"\n",
"    driver: BaseWebDriver\n",
"    wait: WebDriverWait = attrs.field()\n",
"\n",
"    @wait.default\n",
"    def _default_wait(self) -> WebDriverWait:\n",
"        \"\"\"Default for ``wait``: an explicit wait on this instance's driver with a 10 second timeout.\"\"\"\n",
"        return WebDriverWait(driver=self.driver, timeout=10)\n",
"\n",
"    def __attrs_post_init__(self) -> None:\n",
"        \"\"\"Load the council meeting landing page once the instance has been built.\"\"\"\n",
"        # Always go through self.driver; a module-level `driver` may not exist or may be a different browser\n",
"        self.driver.get(\"https://covapp.vancouver.ca/councilMeetingPublic/\")\n",
"\n",
"    def set_date_range(self, begin: str, end: str) -> None:\n",
"        \"\"\"Open the \"By Date\" view, enter a date range and wait for the results table to reload.\n",
"\n",
"        Args:\n",
"            begin: Text typed into the first date box (the placeholder asks for YYYY-MM-DD).\n",
"            end: Text typed into the second date box (same format).\n",
"        \"\"\"\n",
"        # Selector for the submit button to execute the date range query\n",
"        SUBMIT_SELECTOR = 'input[type=\"submit\"][value=\"Display\"]'\n",
"\n",
"        self.driver.find_element(By.LINK_TEXT, \"By Date\").click()\n",
"        self.wait.until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, SUBMIT_SELECTOR)))\n",
"\n",
"        # detect when the table is rewritten by grabbing a reference to the first row\n",
"        old_td = self.driver.find_element(By.CSS_SELECTOR, \"table.TableRecords tr td\")\n",
"\n",
"        date_range_boxes = self.driver.find_elements(By.CSS_SELECTOR, 'input[placeholder=\"YYYY-MM-DD\"]')\n",
"        date_range_boxes[0].click()\n",
"        date_range_boxes[0].clear()\n",
"        date_range_boxes[0].send_keys(begin)\n",
"        date_range_boxes[1].click()\n",
"        date_range_boxes[1].clear()\n",
"        date_range_boxes[1].send_keys(end)\n",
"\n",
"        self.driver.find_element(By.CSS_SELECTOR, SUBMIT_SELECTOR).click()\n",
"        # the old cell going stale means the table has been replaced with the new results\n",
"        self.wait.until(expected_conditions.staleness_of(old_td))\n",
"\n",
"    @property\n",
"    def table_contents(self) -> list[Mapping[str, CellValue]]:\n",
"        \"\"\"Rows of the results table currently shown in the browser.\n",
"\n",
"        Returns:\n",
"            One frozendict per data row, mapping each column header text to the\n",
"            CellValue of that row's cell. Empty when the table has no rows at all.\n",
"        \"\"\"\n",
"        rows = self.driver.find_elements(By.CSS_SELECTOR, 'table.TableRecords tr')\n",
"        if not rows:\n",
"            # no table rows matched: nothing to unpack a header from\n",
"            return []\n",
"        header, *rows = rows\n",
"        labels = [th.text for th in header.find_elements(By.TAG_NAME, \"th\")]\n",
"\n",
"        table = []\n",
"        for row in rows:\n",
"            kvs = []\n",
"            for label, td in zip(labels, row.find_elements(By.TAG_NAME, \"td\")):\n",
"                href = None\n",
"                # only the first link in a cell is recorded\n",
"                if a := td.find_elements(By.TAG_NAME, \"a\"):\n",
"                    href = a[0].get_attribute(\"href\")\n",
"                kvs.append((label, CellValue(td.text, href)))\n",
"            table.append(frozendict(kvs))\n",
"\n",
"        return table\n",
"\n",
"    @property\n",
"    def is_last_page(self) -> bool:\n",
"        \"\"\"True when the pager shows a disabled \"next\" control.\"\"\"\n",
"        next_disabled = self.driver.find_elements(By.CSS_SELECTOR, \"nav.ListNavigation_Wrapper span.ListNavigation_DisabledNext\")\n",
"        return bool(next_disabled)\n",
"\n",
"    def next_page(self):\n",
"        \"\"\"Click the pager's \"next\" link and wait for the results table to be replaced.\n",
"\n",
"        Raises:\n",
"            AssertionError: If the site is already on the last page.\n",
"        \"\"\"\n",
"        assert not self.is_last_page, \"already on the last page\"\n",
"\n",
"        # detect when the table is rewritten by grabbing a reference to the first row\n",
"        old_td = self.driver.find_element(By.CSS_SELECTOR, \"table.TableRecords tr td\")\n",
"\n",
"        link = self.driver.find_element(By.CSS_SELECTOR, \"nav.ListNavigation_Wrapper a.ListNavigation_Next\")\n",
"        link.click()\n",
"\n",
"        self.wait.until(expected_conditions.staleness_of(old_td))\n",
"\n",
"    def iter_pages(self) -> Iterable[None]:\n",
"        \"\"\"Step through the result pages, yielding once per page.\n",
"\n",
"        The first yield is for the page already displayed; after each later yield the\n",
"        browser has advanced one page. Nothing is yielded but None: read\n",
"        ``table_contents`` inside the loop body to get the page's rows.\n",
"        \"\"\"\n",
"        yield\n",
"        while not self.is_last_page:\n",
"            self.next_page()\n",
"            yield"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e355d8b2-fd10-4f2d-a732-5efe8e8f2965",
"metadata": {},
"outputs": [],
"source": [
"driver = webdriver.Chrome()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4efe0a4e-28d9-4c70-a49e-442f5a4336c7",
"metadata": {},
"outputs": [],
"source": [
"site = CouncilMeetingSite(driver)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b87c2fa1-f7e4-4bc6-aad7-bec3e31fed40",
"metadata": {},
"outputs": [],
"source": [
"# Gather the rows of every results page for the chosen date range\n",
"extracted = []\n",
"site.set_date_range(\"2024-01-01\", \"2024-02-29\")\n",
"for _ in site.iter_pages():\n",
"    # each step of iter_pages() leaves the browser on one page; read its table now\n",
"    extracted += site.table_contents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91f54fd3-594b-4561-849c-21da7bfd5625",
"metadata": {},
"outputs": [],
"source": [
"driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abd2f171-ed6b-45d5-87ba-cd4a8af8287f",
"metadata": {},
"outputs": [],
"source": [
"extracted"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50e44e77-bb35-4dcd-bf87-21681a56b9ca",
"metadata": {},
"outputs": [],
"source": [
"len(set(extracted))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment