"""demo.py: working with the Legistar scraper."""
from legistar.base import LegistarScraper, LegistarSession
from legistar.events import LegistarEventsScraper
from requests import Session
import scrapelib
import esprima
import pdb
from datetime import datetime
# https://miro.com/app/board/o9J_kguby6k=/
# https://github.com/opencivicdata/python-legistar-scraper/tree/8a362c7e838d523545484c06d91aa766e64d5817
# https://github.com/opencivicdata/scrapers-us-municipal/blob/9270a958249789e3be47a974f79a2de6297dddd5/sacramento/people.py
# https://github.com/datamade/scrapers-us-municipal/blob/27df297174b169127e808dfb94bd3ce5a650c981/lametro/events.py
class SonomaScraper(LegistarScraper):
    # Instead of this, make a general scraper class that
    # matches the civicplus scraper interface.
    BASE_URL = 'https://sonoma-county.legistar.com/Calendar.aspx'


class SonomaEvents(LegistarEventsScraper):
    EVENTSPAGE = 'https://sonoma-county.legistar.com/Calendar.aspx'

# TODO:
#
# pip install it into our repo
# add this as a dependency to civicscraper
# pipenv install with the GitHub URL - check syntax (see the note below)
# get paging working
# annotate what's going on here
# use the class via composition, maybe, not inheritance
# What's the unit of search when we go to a page?
# Look at the current year and the year ahead?
# Need: the ability to backfill (to initially populate) and the ability to
#   update what we've already scraped (see the sketches below).
# Need: the ability to grab new things on a subset of data
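
# The TODO list above asks for the pipenv install syntax and for backfill and
# update capabilities. The examples below are sketches under assumptions, not
# settled choices for this repo.
#
# Likely pipenv syntax for installing straight from the GitHub URL (the
# egg/package name "legistar" is assumed from the import path above; the
# commit hash is the one linked at the top of this file):
#
#   pipenv install -e "git+https://github.com/opencivicdata/python-legistar-scraper.git@8a362c7e838d523545484c06d91aa766e64d5817#egg=legistar"


def backfill_events(scraper, start_year, end_year):
    """A minimal backfill sketch: walk a range of years through
    LegistarEventsScraper.events(since=...), the same call __main__ makes
    below for the current year, and collect every row.

    Assumes `since` takes a calendar year, as in the call below.
    """
    collected = []
    for year in range(start_year, end_year + 1):
        for event, _ in scraper.events(since=year):
            collected.append(dict(event))
    return collected
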
if __name__ == "__main__":
    # To scrape a table (with info about meetings) from a Calendar.aspx page:
    # l = SonomaScraper()
    # lxml = l.lxmlize(l.BASE_URL)
    # table = l.parseDataTable(lxml)

    # To navigate to a different page of results on a Calendar.aspx page,
    # see the commented-out pages() example at the bottom of this block.

    l = SonomaEvents()
    # event_pages = l.eventPages(since=2019)
    # for event in event_pages:
    #     print(event)

    events = []
    for event, _ in l.events(since=datetime.today().year):  # Only grabs info for the next upcoming event.
        events.append((dict(event), None))
    pdb.set_trace()
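
    # A minimal sketch of the "update what we've already scraped" TODO above:
    # build a key per row and only keep rows we haven't seen before. The
    # 'Name' and 'Meeting Date' keys are assumptions -- they come from the
    # Calendar.aspx table headers and can differ (or hold link dicts rather
    # than plain strings) depending on the Legistar site, hence the str().
    seen = set()
    new_rows = []
    for row, _ in events:
        key = (str(row.get('Name')), str(row.get('Meeting Date')))
        if key not in seen:
            seen.add(key)
            new_rows.append(row)
    print(f"{len(new_rows)} unique meeting rows scraped")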
    # pages = l.pages(l.BASE_URL)  # Returns a generator yielding items like: <Element html at 0x7fcab9b8ed60>
    # for page in pages:
    #     print("page")
    #     table = l.parseDataTable(page)
    #     for i in table:
    #         print("table entry")
# https://sonoma-county.legistar.com/Calendar.aspx