konklone/1_extract_names.rb

## 1_extract_names.rb
#!/usr/bin/env ruby

filename = ARGV.first

if filename.nil? or filename == ""
  puts "Provide the filename of the CSV file with disbursement details as an argument."
  exit
end

begin
  require 'fileutils'
  require 'rubygems'
  require 'fastercsv'
rescue
  puts "Couldn't load FasterCSV. Try running \"sudo gem install fastercsv\" and try again."
  exit
end

unless File.exists?(filename)
  puts "Couldn't locate #{filename}. Place it in the same directory as this script."
  exit
end


puts "Reading #{filename} for names..."
names = {}
i = 0
FasterCSV.foreach(filename) do |row|
  name = row[0]
  names[name] ||= 0
  names[name] += 1

  i += 1
  puts "Read #{i} rows..." if i % 50000 == 0
end

FileUtils.rm("all-names.csv") if File.exist?("all-names.csv")
FasterCSV.open("all-names.csv", "w") do |csv|
  csv << ['name', 'num_rows']
  names.keys.sort.each do |key|
    csv << [key, names[key]]
  end
end
puts "Wrote names to all-names.csv."

## 2_assign_bioguide_ids.rb
#!/usr/bin/env ruby

filename = ARGV.first

if filename.nil? or filename == ""
  filename = "all-names.csv"
end

begin
  require 'fileutils'
  require 'rubygems'
  require 'sunlight'
  require 'fastercsv'
rescue
  puts "Couldn't load dependencies. Try running two commands and try again:\n\nsudo gem install fastercsv\nsudo gem install sunlight"
  exit
end

unless File.exists?(filename)
  puts "Couldn't locate #{filename}. Place it in the same directory as this script."
  exit
end

@@misses = 0
@@duplicates = 0
Sunlight::Base.api_key = 'sunlight9'


def legislator_for_name(name)
  options = {}

  # get rid of "HON." prefix and split on spaces
  name = name.gsub /^HON\.\s?/i, ''
  pieces = name.split /\s+/

  # might be a state in parentheses at the end
  options[:state] = pieces.pop.gsub(/[\(\)]/, '') if pieces.last =~ /^\([a-zA-Z]+\)$/

  # might be a suffix at the end
  options[:name_suffix] = "#{pieces.pop.gsub(/\./, '')}." if pieces.last =~ /^Jr\.?$/i
  options[:name_suffix] = pieces.pop if pieces.last =~ /^I+$/i

  options[:lastname] = pieces.pop.gsub /,/, ''

  options[:firstname] = pieces.first

  results = Sunlight::Legislator.all_where options
  if results.size == 1
    results.first

  # no result, could be either the wrong first name, or out of office
  elsif results.size == 0
    # try the name as a nickname first
    options[:nickname] = options.delete :firstname

    results = Sunlight::Legislator.all_where options
    if results.size == 1
      results.first

    # must be out of office then?
    elsif results.size == 0
      options[:in_office] = 0

      # reset to doing firstname first
      options[:firstname] = options.delete :nickname

      results = Sunlight::Legislator.all_where options
      if results.size == 1
        results.first

      elsif results.size == 0
        # try as nickname again, this time out of office
        options[:nickname] = options.delete :firstname

        results = Sunlight::Legislator.all_where options
        if results.size == 1
          results.first

        # OK, we'll accept a result if it matches on last name only,
        # but only if there's only one result amongst both in and out of office legislators
        elsif results.size == 0
          options.delete :nickname
          options.delete :firstname

          if legislator = unique_for(options)
            legislator

          else
            # finally, try the combo last name
            options[:lastname] = "#{pieces.pop} #{options[:lastname]}"

            if legislator = unique_for(options)
              legislator

            else
              @@misses += 1
              puts "I GIVE UP. Couldn't match on options: #{options.merge(:pieces => pieces).inspect}"
            end
          end

        elsif results.size > 0
          @@duplicates += 1
          puts "Duplicates for options: #{options.inspect}"
        end

      elsif results.size > 0
        @@duplicates += 1
        puts "Duplicates for options: #{options.inspect}"
      end

    elsif results.size > 0
      @@duplicates += 1
      puts "Duplicates for options: #{options.inspect}"
    end

  # duplicate first name and last name of in-office legislator
  elsif results.size > 0
    @@duplicates += 1
    puts "Duplicates for options: #{options.inspect}"
  end

end


# need a unique result or nothing, across both in and out of office legislators, for the given options
# this is done for last name only checks
def unique_for(options)
  options[:in_office] = 1
  in_results = Sunlight::Legislator.all_where options
  return nil if in_results.size > 1

  options[:in_office] = 0
  out_results = Sunlight::Legislator.all_where options

  if in_results.size == 1 and out_results.size == 0
    in_results.first
  elsif in_results.size == 0 and out_results.size == 1
    out_results.first
  else
    nil
  end
end

def name_for(legislator)
  nickname = legislator.nickname && legislator.nickname != "" ? " \'#{legislator.nickname}\'" : ""
  firstname = "#{legislator.firstname}#{nickname}"
  lastname = legislator.name_suffix && legislator.name_suffix != "" ? "#{legislator.lastname} #{legislator.name_suffix}" : legislator.lastname

  "#{legislator.title}. #{firstname} #{lastname}".upcase
end


puts "Trying to match up names in #{filename}..."

names = {}
FasterCSV.foreach(filename) do |row|
  name = row[0]

  # Members' names will always start with "HON."
  if name =~ /HON\./
    legislator = legislator_for_name name

    if legislator
      names[name] = {:bioguide_id => legislator.bioguide_id, :name_check => name_for(legislator), :in_office => legislator.in_office}
    end
  end
end

FileUtils.rm("bioguide_ids.csv") if File.exist? "bioguide_ids.csv"
FasterCSV.open("bioguide_ids.csv", "w") do |csv|
  csv << ['bioguide_id', 'name', 'name_check', 'in_office']
  names.each do |name, values|
    csv << [values[:bioguide_id], name, values[:name_check], values[:in_office]]
  end
end

puts ""
puts "Out of #{names.keys.size} names:"
puts "#{@@misses} attempts failed to match a legislator entirely."
puts "#{@@duplicates} attempts matched too many legislators."
puts ""
puts "Wrote names and bioguide IDs out to bioguide_ids.csv."

## 3_update_expenditures.rb
#!/usr/bin/env ruby

filename = ARGV.first
bioguide_filename = "bioguide_ids.csv"

if filename.nil? or filename == ""
  puts "Provide the filename of the disbursements CSV file you want to add the bioguide IDs to."
  exit
end


begin
  require 'fileutils'
  require 'rubygems'
  require 'fastercsv'
rescue
  puts "Couldn't load dependencies. Try running:\nsudo gem install fastercsv"
  exit
end

[filename, bioguide_filename].each do |f|
  unless File.exists?(f)
    puts "Couldn't locate #{f}. Place it in the same directory as this script."
    exit
  end
end

# Read through the bioguide ID CSV and create a hash of names to bioguide_ids

legislators = {}
FasterCSV.foreach(bioguide_filename) do |row|
  # key is name, value is bioguide_id
  legislators[row[1]] = row[0]
end

# open up a file for writing, and in it:
  # go through the expenditures CSV line by line and find the bioguide_id for each name and re-write it out

FasterCSV.open("expenditures-updated.csv", "w") do |csv|
  i = 0

  FasterCSV.foreach(filename) do |row|
    name = row[0]
    row.unshift legislators[name]
    csv << row

    i += 1
    puts "Wrote #{i} rows..." if i % 50000 == 0
  end

end

puts ""
puts "Wrote out updated expenditure report to expenditures-updated.csv."

## README
This is a set of 3 scripts that will take the CSV that Sunlight prepares of the House expenditure reports, and attempt to match up all legislator names to bioguide IDs, for the purpose of making it easier to link expenditure reports to other information about a legislator.


= Requirements =

It requires Ruby to be installed, with the "fastercsv" and "sunlight" gems installed.

It uses the Sunlight Labs Congress API to do the name lookups, and requires the expenditures CSV to be exactly what we currently host on Socrata.


= Assumptions =

It assumes that all legislators in the CSV will have their names prepended with "HON." (with or without a space between "HON." and the rest of their name).  It also assumes that no non-legislators have name fields that begin with "HON.".

It assumes that if there is a case of two identically named legislators (i.e. Mike Rogers in Q3 and Q4 of 2009), that the state will be noted in the name field in parentheses, i.e. "HON. MIKE ROGERS (MI)" and "HON. MIKE ROGERS (AL)".


= Known Mistakes =

Mary Bono Mack is mistaken for Connie Mack.  There's no easy way to correct this without hardcoding Mack into the code.  Make sure to correct this during step 4.


= Instructions =

1) Place the expenditures CSV in this directory.  Open a terminal in this directory and run:

   ruby 1_extract_names.rb [expenditures.csv]

   Replace "[expenditures.csv]" with the name of the actual expenditures CSV file. Don't include the brackets.

2) This will produce a file called all-names.csv, containing a row with every unique name found in the name field of the expenditures CSV. Open it and verify that everything is correct, and that the assumptions listed in the "Assumptions" section above are met.

3) Run this command in the terminal:

   ruby 2_assign_bioguide_ids.rb

   As this runs, it will print out whenever it can't match a legislator to a bioguide ID, and when it's done it'll print a summary of how many legislators it failed to match.

4) This will produce a file called bioguide_ids.csv, containing a row with every legislator's name, a bioguide ID,  and then two fields to help verify the work. The first (under the heading "name_check") is the full standardized name of the legislator, including their nickname, of the legislator that the script believed was the match.  The second is whether or not the legislator is currently in office ("TRUE") or not in office ("FALSE").

   You need to look over this file and fill in the missing bioguide IDs for rows missing them.  The output of the prior script should have said how many missing bioguide IDs there are.

   You should also look for mistakes, such as if the two name columns obviously don't match, and replace the bioguide IDs manually.  Look in the "Known Mistakes" section above for any that you should expect to see.

5) Run this command in the terminal:

   ruby 3_update_expenditures.rb [expenditures.csv]

   Replace "[expenditures.csv]" with the name of the actual expenditures CSV file. Don't include the brackets.

6) This will produce a file called expenditures-updated.csv, which is a copy of the original expenditures CSV, with the bioguide ID column and data prepended to the beginning.  Give it a quick look to make sure everything looks right.


== Looking up Bioguide IDs ==

If you're a developer, the easiest way to find Bioguide IDs is by using the Sunlight Labs API, using a wrapper tool, such as the "sunlight" Ruby gem.

If you're not, the easiest way is to visit http://bioguide.congress.gov and use the search feature to find the page for the legislator in question. Once there, look at the URL, it will look something like this:

http://bioguide.congress.gov/scripts/biodisplay.pl?index=Y000031

In this URL, the bioguide ID is at the very end, after the equals sign: "Y000031".
	#!/usr/bin/env ruby

	filename = ARGV.first

	if filename.nil? or filename == ""
	puts "Provide the filename of the CSV file with disbursement details as an argument."
	exit
	end

	begin
	require 'fileutils'
	require 'rubygems'
	require 'fastercsv'
	rescue
	puts "Couldn't load FasterCSV. Try running \"sudo gem install fastercsv\" and try again."
	exit
	end

	unless File.exists?(filename)
	puts "Couldn't locate #{filename}. Place it in the same directory as this script."
	exit
	end


	puts "Reading #{filename} for names..."
	names = {}
	i = 0
	FasterCSV.foreach(filename) do \|row\|
	name = row[0]
	names[name] \|\|= 0
	names[name] += 1

	i += 1
	puts "Read #{i} rows..." if i % 50000 == 0
	end

	FileUtils.rm("all-names.csv") if File.exist?("all-names.csv")
	FasterCSV.open("all-names.csv", "w") do \|csv\|
	csv << ['name', 'num_rows']
	names.keys.sort.each do \|key\|
	csv << [key, names[key]]
	end
	end
	puts "Wrote names to all-names.csv."
	#!/usr/bin/env ruby

	filename = ARGV.first
	bioguide_filename = "bioguide_ids.csv"

	if filename.nil? or filename == ""
	puts "Provide the filename of the disbursements CSV file you want to add the bioguide IDs to."
	exit
	end


	begin
	require 'fileutils'
	require 'rubygems'
	require 'fastercsv'
	rescue
	puts "Couldn't load dependencies. Try running:\nsudo gem install fastercsv"
	exit
	end

	[filename, bioguide_filename].each do \|f\|
	unless File.exists?(f)
	puts "Couldn't locate #{f}. Place it in the same directory as this script."
	exit
	end
	end

	# Read through the bioguide ID CSV and create a hash of names to bioguide_ids

	legislators = {}
	FasterCSV.foreach(bioguide_filename) do \|row\|
	# key is name, value is bioguide_id
	legislators[row[1]] = row[0]
	end

	# open up a file for writing, and in it:
	# go through the expenditures CSV line by line and find the bioguide_id for each name and re-write it out

	FasterCSV.open("expenditures-updated.csv", "w") do \|csv\|
	i = 0

	FasterCSV.foreach(filename) do \|row\|
	name = row[0]
	row.unshift legislators[name]
	csv << row

	i += 1
	puts "Wrote #{i} rows..." if i % 50000 == 0
	end

	end

	puts ""
	puts "Wrote out updated expenditure report to expenditures-updated.csv."
	This is a set of 3 scripts that will take the CSV that Sunlight prepares of the House expenditure reports, and attempt to match up all legislator names to bioguide IDs, for the purpose of making it easier to link expenditure reports to other information about a legislator.


	= Requirements =

	It requires Ruby to be installed, with the "fastercsv" and "sunlight" gems installed.

	It uses the Sunlight Labs Congress API to do the name lookups, and requires the expenditures CSV to be exactly what we currently host on Socrata.


	= Assumptions =

	It assumes that all legislators in the CSV will have their names prepended with "HON." (with or without a space between "HON." and the rest of their name). It also assumes that no non-legislators have name fields that begin with "HON.".

	It assumes that if there is a case of two identically named legislators (i.e. Mike Rogers in Q3 and Q4 of 2009), that the state will be noted in the name field in parentheses, i.e. "HON. MIKE ROGERS (MI)" and "HON. MIKE ROGERS (AL)".


	= Known Mistakes =

	Mary Bono Mack is mistaken for Connie Mack. There's no easy way to correct this without hardcoding Mack into the code. Make sure to correct this during step 4.


	= Instructions =

	1) Place the expenditures CSV in this directory. Open a terminal in this directory and run:

	ruby 1_extract_names.rb [expenditures.csv]

	Replace "[expenditures.csv]" with the name of the actual expenditures CSV file. Don't include the brackets.

	2) This will produce a file called all-names.csv, containing a row with every unique name found in the name field of the expenditures CSV. Open it and verify that everything is correct, and that the assumptions listed in the "Assumptions" section above are met.

	3) Run this command in the terminal:

	ruby 2_assign_bioguide_ids.rb

	As this runs, it will print out whenever it can't match a legislator to a bioguide ID, and when it's done it'll print a summary of how many legislators it failed to match.

	4) This will produce a file called bioguide_ids.csv, containing a row with every legislator's name, a bioguide ID, and then two fields to help verify the work. The first (under the heading "name_check") is the full standardized name of the legislator, including their nickname, of the legislator that the script believed was the match. The second is whether or not the legislator is currently in office ("TRUE") or not in office ("FALSE").

	You need to look over this file and fill in the missing bioguide IDs for rows missing them. The output of the prior script should have said how many missing bioguide IDs there are.

	You should also look for mistakes, such as if the two name columns obviously don't match, and replace the bioguide IDs manually. Look in the "Known Mistakes" section above for any that you should expect to see.

	5) Run this command in the terminal:

	ruby 3_update_expenditures.rb [expenditures.csv]

	Replace "[expenditures.csv]" with the name of the actual expenditures CSV file. Don't include the brackets.

	6) This will produce a file called expenditures-updated.csv, which is a copy of the original expenditures CSV, with the bioguide ID column and data prepended to the beginning. Give it a quick look to make sure everything looks right.


	== Looking up Bioguide IDs ==

	If you're a developer, the easiest way to find Bioguide IDs is by using the Sunlight Labs API, using a wrapper tool, such as the "sunlight" Ruby gem.

	If you're not, the easiest way is to visit http://bioguide.congress.gov and use the search feature to find the page for the legislator in question. Once there, look at the URL, it will look something like this:

	http://bioguide.congress.gov/scripts/biodisplay.pl?index=Y000031

	In this URL, the bioguide ID is at the very end, after the equals sign: "Y000031".