Python script to scrape public SAPD data
from bs4 import BeautifulSoup
import requests
import logging
import time
import json
from datetime import timedelta, datetime
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl
import sys
# This script will attempt to download the PUBLIC sapd call records available.
# The end goal is to perform analysis that can give the public better information on
# what kind of calls are being placed, and possibly give SAPD better analysis/tools/etc
# to assist them in their duties.
# This is needed because the site has terrible SSL support :(
class TLSv1Adapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)
# Setup logging config
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
# Set our BASE URL constant to be used in requests
BASE_URL = "https://webapps2.sanantonio.gov/policecalls/Reports.aspx"
if __name__ == "__main__":
    s = requests.Session()
    # Mount our SSL Hack
    s.mount('https://', TLSv1Adapter())
    logger = logging.getLogger(__name__)
    # Get the initial URL to grab the pesky VIEWSTATE, EVENTVALIDATION, etc.
    response = s.get(BASE_URL)
    if not response.ok:
        logger.info("Error getting URL: {0}".format(BASE_URL))
        logger.info("Status Code: {0}".format(str(response.status_code)))
        # Bail out early; without a valid page we can't pull the VIEWSTATE/EVENTVALIDATION below
        sys.exit(1)
    soup = BeautifulSoup(response.text, "html.parser")
    # Now let's set up our data to run through
    data = {}
    # Start date for this run
    currentDate = datetime.strptime("03/26/2014", "%m/%d/%Y")
    endDate = currentDate + timedelta(weeks=1)
    # This will be a placeholder in case we manually need to get a week by days
    daysToGet = 2
    # Max date allowed by the SAPD system (Nov. 30, 2015); used as an exclusive upper bound
    maxDate = datetime.strptime("12/01/2015", "%m/%d/%Y")
    output_file = open("records_2.json", "w")
    # A hacky way to write a JSON array without storing the whole object in memory
    output_file.write("[\n")
    # Tracks whether the next record is the first one, so commas land only between records
    first_record = True
    # Apparently, these only need to be set once?
    data["__EVENTVALIDATION"] = soup.find("input", {"id" : "__EVENTVALIDATION"})['value']
    data["__VIEWSTATE"] = soup.find("input", {"id" : "__VIEWSTATE"})['value']
    while currentDate < maxDate:
        # We need to set our parameters (date, category, etc.) for our search;
        # these keys mirror the ASP.NET form fields on the Reports.aspx page
        data["txtStart"] = currentDate.strftime("%m/%d/%Y")
        data["ddlCategory"] = "Other Calls"
        data["btnSearch"] = "View Data"
        data["cbxHOA$cbxHOA_HiddenField"] = "-1"
        data["cbxHOA$cbxHOA_TextBox"] = ""
        data["ddlSchoolDistrict"] = " "
        data["ddlCouncilDistrict"] = " "
        data["txtZipcode"] = ""
        # If we have less than a week until the end date, let's run it per day
        # Also, if we previously hit the max results limit, get the week by days
        if ((maxDate - currentDate) < timedelta(weeks=1)) or daysToGet > 0:
            data["rdbSearchRange"] = "day"
            endDate = currentDate + timedelta(days=1)
            if daysToGet > 0:
                daysToGet -= 1
        # Otherwise (99% of the queries) run the query by week
        else:
            data["rdbSearchRange"] = "week"
            endDate = currentDate + timedelta(weeks=1)
        data["txtEndData"] = endDate.strftime("%m/%d/%Y")
        # Now, we actually post the request to the server and get back the results
        response = s.post(BASE_URL, data=data)
        # If something went wrong, time to bail!
        if not response.ok or response.status_code == 500:
            logger.info("Error getting URL: {0}".format(BASE_URL))
            logger.info("Status Code: {0}".format(str(response.status_code)))
            logger.info(str(response.text))
            sys.exit(1)
        # Parse the HTML returned into another "soup" object
        soup = BeautifulSoup(response.text, "html.parser")
        # Grab all the rows in the table (skip the header)
        calls = soup.find("table", {"id" : "gvCFS"}).find_all("tr")[1:]
        # If we have 10k rows, we hit a max limit. Let's set a counter and get this week day by day
        if len(calls) >= 10000:
            logger.info("Reached max result limit - Getting by days instead of weeks.")
            daysToGet = 7
            time.sleep(10)
            continue
        logger.info("Sent: {0} - {1} - {2} - {3}".format(data["txtStart"], data["txtEndData"], data["rdbSearchRange"], str(len(calls))))
        logger.info("Found: {0} - {1}".format(soup.find("span", {"id" : "lblStartDate"}).text, soup.find("span", {"id" : "lblEndDate"}).text))
        for call in calls:
            # Let's extract all the columns from the row into an array called "fields"
            fields = call.find_all("td")
            # Create the JSON object from the table information
            # (u'\xa0' is the non-breaking space the site uses for empty cells)
            record = {
                "id" : fields[0].text.strip(),
                "category" : fields[1].text.strip() or "-",
                "type" : fields[2].text.strip() or "-",
                "@timestamp" : fields[3].text.strip() or "-",
                "address" : fields[4].text.strip(),
                "hoa" : fields[5].text.strip().replace(u'\xa0', "") or "-",
                "schoolDistrict" : fields[6].text.strip().replace(u'\xa0', "") or "-",
                "councilDistrict": fields[7].text.strip().replace(u'\xa0', "") or "-",
                "zipcode" : fields[8].text.strip().replace(u'\xa0', "") or "-"
            }
            # Write a comma before every record except the first so the file stays valid JSON
            if not first_record:
                output_file.write(",\n")
            first_record = False
            output_file.write(json.dumps(record, indent=4))
        # Little Timmy doesn't like it when people don't sleep between
        # calls to a public service!
        time.sleep(10)
        # Update the currentDate
        currentDate = endDate + timedelta(days=1)
    # Close out the JSON array; the last record has no trailing comma
    output_file.write("\n]")
    output_file.close()