Skip to content

Instantly share code, notes, and snippets.

Another nyc scraper error: ```00:09:55 INFO scrapelib: GET - http://legistar.council.nyc.gov/LegislationDetail.aspx?ID=2460504&GUID=0BE0D55D-995F-4256-B64E-35DD926ACB68
nyc (scrape, import, report)
people: {}
bills: {}
events: {}
Not checking sessions...
http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081
2002-11-05 2013-12-31
2003-03-12 2003-12-31
1998-01-01 2001-12-31
05:42:12 INFO pupa: save organization Committee on Parks and Recreation as organization_c0ff4aec-5d28-11e5-9429-122a3d729da3.json
*budget hearings -- chicago public library, office of emergency management & communications, dept. of fleet & facility management and office of inspector general*
*budget hearings -- city clerk, city treasurer, license appeal commission and department of innovation & technology*
*budget hearings -- budget overview (budget director, comptroller - dept. of finance and chief financial officer)*
*public hearings -- city of chicago -- special service areas 57 and 59*
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.6, contributor_name)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.8, contributor_name)
Traceback (most recent call last):
  File "/home/datamade/dedupe-api/api/queue.py", line 80, in processMessage
    upd_args['return_value'] = func(*args, **kwargs)
  File "/home/datamade/dedupe-api/api/utils/delayed_tasks.py", line 708, in dedupeRaw
    block_gen = blockDedupe(session_id)
  File "/home/datamade/dedupe-api/api/utils/delayed_tasks.py", line 542, in blockDedupe
    full_data = list(full_data)
[ (b'distinct', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.8092105388641357, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.
@evz
evz / data311.py
Last active August 29, 2015 14:20
Reimagining Jonathan's script
# This python file pulls the data for Chicago at a census tract level
# from plenario's api for the listed 311 call datasets below
import requests
import time
import sys
import csv
import os
datasets=['311_service_requests_abandoned_vehicles',
'311_service_requests_alley_lights_out', '311_service_requests_tree_trims',
@evz
evz / gist:4c5b055bef94aed5d526
Created April 13, 2015 21:21
address_matcher.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates how to use dedupe to match messy records
against a deduplicated, canonical dataset. In this example, we'll be
matching messy address strings against a list of valid addresses in
Chicago.
"""
import os
@evz
evz / Makefile
Created February 14, 2015 15:58
Import plow data to postgis
include config.mk
plows.db:
@wget -O $@ https://s3.amazonaws.com/clearstreets-data/02-08-2015/plow-02-08-2015.db
route_points.table: plows.db
@createdb -U $(PG_USER) -h $(PG_HOST) -p $(PG_PORT) plows
@psql -U $(PG_USER) -h $(PG_HOST) -p $(PG_PORT) -d plows -c \
"CREATE EXTENSION postgis"
@psql -U $(PG_USER) -h $(PG_HOST) -p $(PG_PORT) -d plows -c \
@evz
evz / geoocoder.py
Created February 11, 2015 15:00
GeoCoder script
import requests
import json
import usaddress
import csv
def getRecords():
with open('addresses.csv', 'rb') as f:
reader = csv.reader(f)
for row in reader:
yield row
@evz
evz / review_machine.py
Last active August 29, 2015 14:14
Review Prediction machine
import numpy
import rlr
from collections import OrderedDict
class ReviewMachine(object):
def __init__(self, entity_examples):
"""
Entity examples should be a dict where the key is the entity_id
and the value is a dict like so:
@evz
evz / Makefile
Last active August 29, 2015 14:13
include config.mk
%.zip:
@wget -O $@ http://cdo.ncdc.noaa.gov/qclcd_ascii/QCLCD$(basename $@).zip
touch $@
%.csv: %.zip
@unzip -p $< $(basename $<)daily.txt | \
tail +2 > $(basename $<).csv