-
-
Gist: Working with the Legistar scraper. Save DiPierro/ffa84cd4348492eb0c52bd3759eaf8fc to your computer and use it in GitHub Desktop.
Note: this file contains bidirectional Unicode text that may be interpreted or compiled differently from what appears below. To review it, open the file in an editor that reveals hidden Unicode characters.
"""demo.py""" | |
from legistar.base import LegistarScraper, LegistarSession | |
from legistar.events import * | |
from requests import Session | |
import scrapelib | |
import esprima | |
import pdb | |
from datetime import datetime | |
# https://miro.com/app/board/o9J_kguby6k=/ | |
# https://github.com/opencivicdata/python-legistar-scraper/tree/8a362c7e838d523545484c06d91aa766e64d5817 | |
# https://github.com/opencivicdata/scrapers-us-municipal/blob/9270a958249789e3be47a974f79a2de6297dddd5/sacramento/people.py | |
# https://github.com/datamade/scrapers-us-municipal/blob/27df297174b169127e808dfb94bd3ce5a650c981/lametro/events.py | |
class SonomaScraper(LegistarScraper):
    """Page scraper for the Sonoma County Legistar calendar.

    NOTE(review): instead of a per-county subclass, consider a general
    scraper class that matches the civicplus scraper interface.
    """

    BASE_URL = 'https://sonoma-county.legistar.com/Calendar.aspx'
class SonomaEvents(LegistarEventsScraper):
    """Event scraper pointed at the Sonoma County Legistar calendar page."""

    EVENTSPAGE = 'https://sonoma-county.legistar.com/Calendar.aspx'
# TODO:
#   - pip install it into our repo
#   - add this as a dependency to civicscraper
#   - pipenv install with the GitHub URL — check syntax
#   - get paging working
#   - annotate what's going on here
#   - use the class via composition, maybe, not inheritance
#
# Open questions:
#   - What's the unit of search when we go to a page?
#     Look at the current year and the year ahead?
#   - Need: the ability to backfill — to initially populate —
#     and the ability to update what we've already scraped
#   - Need: the ability to grab new things on a subset of data
if __name__ == "__main__": | |
from legistar.base import LegistarScraper, LegistarSession | |
from requests import Session | |
import scrapelib | |
import esprima | |
# To scrape a table (with info about meetings) from a Calendar.aspx page | |
# l = SonomaScraper() | |
# lxml = l.lxmlize(l.BASE_URL) | |
# table = l.parseDataTable(lxml) | |
# To navigate to a different page of results on a Calendar.aspx page: | |
# TODO | |
l = SonomaEvents() | |
# event_pages = l.eventPages(since=2019) | |
# for event in event_pages: | |
# print(event) | |
events = [] | |
for event, _ in l.events(since=datetime.today().year): # Only grabs info for the next upcoming event. | |
events.append((dict(event), None)) | |
pdb.set_trace() | |
# pages = l.pages(l.BASE_URL) # Returns a generator containing items like this: <Element html at 0x7fcab9b8ed60> | |
# for page in pages: | |
# print("page") | |
# table = l.parseDataTable(page) | |
# for i in table: | |
# print("table entry") | |
# https://sonoma-county.legistar.com/Calendar.aspx | |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.