Skip to content

Instantly share code, notes, and snippets.

Another nyc scraper error: ```00:09:55 INFO scrapelib: GET - http://legistar.council.nyc.gov/LegislationDetail.aspx?ID=2460504&GUID=0BE0D55D-995F-4256-B64E-35DD926ACB68
nyc (scrape, import, report)
people: {}
bills: {}
events: {}
Not checking sessions...
http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081
2002-11-05 2013-12-31
2003-03-12 2003-12-31
1998-01-01 2001-12-31
05:42:12 INFO pupa: save organization Committee on Parks and Recreation as organization_c0ff4aec-5d28-11e5-9429-122a3d729da3.json
*budget hearings -- chicago public library, office of emergency management & communications, dept. of fleet & facility management and office of inspector general*
*budget hearings -- city clerk, city treasurer, license appeal commission and department of innovation & technology*
*budget hearings -- budget overview (budget director, comptroller - dept. of finance and chief financial officer)*
*public hearings -- city of chicago -- special service areas 57 and 59*
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.6, contributor_name)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.8, contributor_name)
Traceback (most recent call last):
  File "/home/datamade/dedupe-api/api/queue.py", line 80, in processMessage
    upd_args['return_value'] = func(*args, **kwargs)
  File "/home/datamade/dedupe-api/api/utils/delayed_tasks.py", line 708, in dedupeRaw
    block_gen = blockDedupe(session_id)
  File "/home/datamade/dedupe-api/api/utils/delayed_tasks.py", line 542, in blockDedupe
    full_data = list(full_data)
[ (b'distinct', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.8092105388641357, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.
@evz
evz / data311.py
Last active August 29, 2015 14:20
Reimagining Jonathan's script
# This python file pulls the data for Chicago at a census tract level
# from plenario's api for the listed 311 call datasets below
import requests
import time
import sys
import csv
import os
datasets=['311_service_requests_abandoned_vehicles',
'311_service_requests_alley_lights_out', '311_service_requests_tree_trims',
@evz
evz / gist:4c5b055bef94aed5d526
Created April 13, 2015 21:21
address_matcher.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use dedupe to match messy records
against a deduplicated, canonical dataset. In this example, we'll be
matching messy address strings against a list of valid addresses in
Chicago.
"""
import os
@evz
evz / Makefile
Created February 14, 2015 15:58
Import plow data to postgis
include config.mk
plows.db:
@wget -O $@ https://s3.amazonaws.com/clearstreets-data/02-08-2015/plow-02-08-2015.db
route_points.table: plows.db
@createdb -U $(PG_USER) -h $(PG_HOST) -p $(PG_PORT) plows
@psql -U $(PG_USER) -h $(PG_HOST) -p $(PG_PORT) -d plows -c \
"CREATE EXTENSION postgis"
@psql -U $(PG_USER) -h $(PG_HOST) -p $(PG_PORT) -d plows -c \
@evz
evz / geoocoder.py
Created February 11, 2015 15:00
GeoCoder script
import requests
import json
import usaddress
import csv
def getRecords():
with open('addresses.csv', 'rb') as f:
reader = csv.reader(f)
for row in reader:
yield row
@evz
evz / review_machine.py
Last active August 29, 2015 14:14
Review Prediction machine
import numpy
import rlr
from collections import OrderedDict
class ReviewMachine(object):
def __init__(self, entity_examples):
"""
Entity examples should be a dict where the key is the entity_id
and the value is a dict like so:
@evz
evz / Makefile
Last active August 29, 2015 14:13
include config.mk
%.zip:
@wget -O $@ http://cdo.ncdc.noaa.gov/qclcd_ascii/QCLCD$(basename $@).zip
touch $@
%.csv: %.zip
@unzip -p $< $(basename $<)daily.txt | \
tail +2 > $(basename $<).csv