@cjdd3b
cjdd3b / gist:1714081
Created February 1, 2012 00:05
Super PAC IDs from OpenSecrets
name|id
1911 United|C00508200
50 State Strategy|C00502633
9-9-9 FUND|C00504241
Accountability 2010|C00489641
AFL-CIO Workers' Voices PAC|C00484287
Alaskans Standing Together|C00489385
America for the People|C00497081
America Get Up|C00494278
America Votes Action Fund|C00492520
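
The list is plain pipe-delimited text, so it loads with the standard library alone. A minimal sketch, assuming a local copy of the gist saved as superpacs.txt (a hypothetical filename):

import csv

with open('superpacs.txt') as f:
    reader = csv.DictReader(f, delimiter='|')
    pacs = {row['name']: row['id'] for row in reader}

print(pacs['9-9-9 FUND'])  # -> C00504241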
@cjdd3b
cjdd3b / pairwise.py
Created February 5, 2013 18:32
Using Gensim and heapq for scalable pairwise document comparisons.
'''
pairwise.py
This script uses the Python Gensim library and heapq from the standard library to make
fast, scalable pairwise comparisons between an arbitrarily large number of documents
using TF-IDF and cosine distance.

The script first generates a similarity matrix between all documents in a set, then uses
heapq to retrieve the top K most similar matches to each document in that set. It has been
tested on sets as large as 400,000 documents on a MacBook Air.
'''
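
The preview stops at the docstring, so here is a minimal sketch of the approach it describes, assuming pre-tokenized input; the function name and the /tmp/sims shard prefix are illustrative, not the gist's own:

import heapq
from gensim import corpora, models, similarities

def top_k_pairs(texts, k=10):
    # texts: a list of tokenized documents, e.g. [['loan', 'guarantee'], ...]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(doc) for doc in texts]

    # TF-IDF weighting, then a disk-backed cosine-similarity index
    tfidf = models.TfidfModel(corpus)
    index = similarities.Similarity('/tmp/sims', tfidf[corpus],
                                    num_features=len(dictionary))

    # Iterating the index yields each document's similarity to every other;
    # heapq.nlargest keeps only the top K matches per document.
    for doc_id, sims in enumerate(index):
        yield doc_id, heapq.nlargest(k, enumerate(sims), key=lambda pair: pair[1])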
@cjdd3b
cjdd3b / gist:5886658
Last active December 19, 2015 02:58
Simple S3 writes in Ruby
AMAZON_ACCESS_KEY = 'WHATEVER'
AMAZON_SECRET_KEY = 'SECRET_WHATEVER'

# I'm old-school, so I like the AWS-S3 gem. It's just a lightweight wrapper around Amazon's API.
# https://github.com/marcel/aws-s3
require "aws/s3"
include AWS::S3

def publish_json!(bucket='int.nyt.com', path='applications/represent-json/', filename='foo.json')
  # Preview truncated here; a minimal completion (an assumption, not the gist's own body)
  Base.establish_connection!(:access_key_id => AMAZON_ACCESS_KEY, :secret_access_key => AMAZON_SECRET_KEY)
  S3Object.store(path + filename, File.read(filename), bucket, :access => :public_read)
end
{
  "discipline_id": "AS",
  "discipline_name": "Alpine Skiing",
  "results": [
    {
      "id": "ASM010",
      "name": "Men's Downhill",
      "competitor_type": "ATH",
      "results":
@cjdd3b
cjdd3b / nnsearch.py
Last active December 20, 2015 12:49
import numpy

def get_similar(vec, matrix, K=10):
    # Set up the query vector and the whole dataset for K-nearest-neighbors query
    qvector = numpy.array([vec]).transpose()
    alldata = numpy.array(matrix).transpose()

    # You can't get more neighbors than there are entities
    ndata = alldata.shape[1]
    K = K if K < ndata else ndata

    # Preview truncated here. A minimal completion (an assumption, not the
    # gist's own code): brute-force Euclidean distance to every column, then
    # take the K nearest.
    dists = numpy.sqrt(((alldata - qvector) ** 2).sum(axis=0))
    nearest = numpy.argsort(dists)[:K]
    return nearest, dists[nearest]
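
A hypothetical call, treating each row of the input matrix as one entity's feature vector:

rows = [[0.0, 0.1], [0.9, 1.0], [0.1, 0.0], [1.0, 0.9]]
nearest, dists = get_similar([0.0, 0.0], rows, K=2)
print(nearest)  # indices of the two closest rows, here 0 and 2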
@cjdd3b
cjdd3b / compare.py
Last active December 24, 2015 07:49
Shows crude similarities of voting histories between members of Congress using roll call vote matrices from Poole, McCarty and Lewis: http://www.voteview.com/dwnl.htm. Uses vectorized operations to make similarity calculations happen super fast.
'''
compare.py
Quickly produces a pairwise similarity matrix of lawmakers' roll call votes, given
an input *.ord matrix file from Poole, McCarty and Lewis: http://www.voteview.com/dwnl.htm
'''
import numpy, string
from scipy.spatial.distance import cdist

def levenshtein(a, b):
    "Calculates the Levenshtein distance between a and b."
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n
    current = list(range(n + 1))
    for i in range(1, m + 1):
        # Preview truncated here; the rest follows the standard dynamic-programming recipe
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change += 1
            current[j] = min(add, delete, change)
    return current[n]
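
The gist's description promises vectorized similarity over the whole vote matrix, which is what the cdist import above is for. A minimal sketch of that step, assuming the votes have already been parsed out of the *.ord file as one row of numeric vote codes per lawmaker (the variable names are illustrative, not the gist's own):

votes = numpy.array([
    [1, 1, 6, 1],   # lawmaker A's coded roll-call votes
    [1, 6, 6, 1],   # lawmaker B
    [6, 6, 1, 6],   # lawmaker C
])

# One vectorized call computes every pairwise Hamming distance; subtracting
# from 1 gives the fraction of roll calls on which each pair voted identically.
similarity = 1 - cdist(votes, votes, 'hamming')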
<!DOCTYPE html>
<html>
<head>
  <title>Leaflet Example</title>
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <!-- Import Leaflet assets -->
  <link rel="stylesheet" href="http://leafletjs.com/dist/leaflet.css" />
  <script src="http://leafletjs.com/dist/leaflet.js"></script>
</head>
<body>
  <!-- Preview truncated; a minimal init (add a tile layer for a visible map) -->
  <div id="map" style="height: 400px"></div>
  <script>var map = L.map('map').setView([51.5, -0.09], 13);</script>
</body>
</html>
'''
graph-cluster.py

Some notes for doing graph clustering in a couple of different ways: simple spectral
partitioning based on the Fiedler vector, and density-based clustering using DBSCAN.

Why might this be useful? I'm using it to identify weakly connected (and therefore
probably false) graph components in my campaign finance standardization workflow, the
basic idea of which is here: https://github.com/cjdd3b/fec-standardizer/wiki
'''
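
The preview cuts off at the docstring, so here is a minimal sketch of the first technique the notes name, spectral partitioning on the Fiedler vector, assuming a dense symmetric adjacency matrix (the function name is illustrative):

import numpy as np

def fiedler_split(adjacency):
    # Unnormalized graph Laplacian: L = D - A
    laplacian = np.diag(adjacency.sum(axis=1)) - adjacency

    # For symmetric matrices, eigh returns eigenvalues in ascending order;
    # the Fiedler vector pairs with the second-smallest eigenvalue.
    _, eigenvectors = np.linalg.eigh(laplacian)
    fiedler = eigenvectors[:, 1]

    # The sign pattern of the Fiedler vector splits the nodes into two groups
    return fiedler >= 0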
import random

class MinHasher(object):
    def __init__(self, n, universe_size, seed=None):
        if seed is not None: random.seed(seed)
        self.hash_functions = [self._create_random_hash_function(universe_size) for i in range(n)]

    def _create_random_hash_function(self, universe_size):
        a = random.randint(0, universe_size)
        b = random.randint(0, universe_size)
        # Preview truncated here. A typical completion (an assumption, not
        # necessarily the gist's own line): a random affine hash over the universe.
        return lambda x: (a * x + b) % universe_size
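
A hypothetical usage of the class as completed above: two sets' minhash signatures estimate their Jaccard similarity by the fraction of matching minima (the signature helper is illustrative, not part of the gist):

hasher = MinHasher(n=128, universe_size=2 ** 20, seed=42)

def signature(items, hasher):
    # A set's minhash signature: the minimum of each hash function over its members
    return [min(h(x) for x in items) for h in hasher.hash_functions]

a = signature({1, 5, 9, 13}, hasher)
b = signature({1, 5, 9, 21}, hasher)

# The fraction of matching minima approximates the true Jaccard similarity, 3/5
print(sum(x == y for x, y in zip(a, b)) / float(len(a)))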