Skip to content

Instantly share code, notes, and snippets.

@tylerpearson
Last active July 18, 2016 00:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tylerpearson/99bf4cdc038a6acca83596b641255a8c to your computer and use it in GitHub Desktop.
Save tylerpearson/99bf4cdc038a6acca83596b641255a8c to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'simhash'
require 'json'
require 'pp'
# Year to process, taken from the first command-line argument. It selects the
# input directory of bills and prefixes the name of the results file.
YEAR = ARGV.first

# A missing year is a usage error: `abort` writes the message to stderr and
# terminates with a non-zero exit status so callers can detect the failure.
abort 'Error: Pass in a year as an argument' if YEAR.nil?
###
# Reads a JSON document from disk and returns the parsed Ruby structure.
#
# File.read is used instead of IO.read: on Ruby versions that support it,
# IO.read treats a name beginning with "|" as a command to run in a
# subprocess, which must never happen for names coming from a directory
# listing. File.read always treats its argument as a file path.
#
# @param filename [String] path of the JSON file to read
# @return [Object] whatever JSON.parse produces for the file's content
# @raise [SystemCallError] if the file cannot be read (e.g. Errno::ENOENT)
# @raise [JSON::ParserError] if the content is not valid JSON
def load_json(filename)
  JSON.parse(File.read(filename))
end
# Counts the bit positions at which two integers differ.
#
# @param a [Integer] first value
# @param b [Integer] second value
# @return [Integer] number of '1' digits in the base-2 rendering of a XOR b
def hamming_distance(a, b)
  differing_bits = a ^ b
  binary_digits = differing_bits.to_s(2)
  binary_digits.count('1')
end
# Serializes +results+ to JSON and writes it to +filename+, replacing any
# existing content.
#
# @param results [Object] any value that responds to #to_json
# @param filename [String] path of the file to (over)write
# @return [Integer] number of bytes written
def save_results(results, filename)
  File.write(filename, results.to_json)
end
###
# Directory that holds the bill JSON files for the requested year.
path = "../bill-scraper/wv-bills/#{YEAR}"
results = []

# Load the ALEC model bills once up front. The list is the same for every
# bill being compared, so re-reading and re-parsing it per bill is wasted work.
alec_bills = load_json('../bill-analyzer/alec-simhashes.json')

# loop through each bill in the directory
Dir.foreach(path) do |item|
  # Dir.foreach also yields the "." and ".." entries; skip them.
  next if item == '.' || item == '..'

  # load the bill
  bill = load_json("#{path}/#{item}")

  # calculate the simhash
  simhash = bill['text'].simhash(split_by: / /, stop_words: true, hashbits: 512)

  # distance from this bill to every ALEC model bill
  bill_hashes = alec_bills.map do |alec_bill|
    {
      title: alec_bill['title'],
      hamming_distance: hamming_distance(alec_bill['simhash'], simhash)
    }
  end

  # keep only the three ALEC bills with the smallest distance
  results << {
    title: bill['title'],
    number: bill['number'],
    url: bill['url'],
    alec_similar: bill_hashes.sort_by { |k| k[:hamming_distance] }.take(3)
  }
end
# Order bills by the distance of their closest ALEC match, then flip the
# order so the largest distance comes first in the saved file.
results.sort_by! { |entry| entry[:alec_similar].first[:hamming_distance] }
results.reverse!
save_results(results, "#{YEAR}-wv-comparison-results.json")

# Print a pipe-delimited table of the bills whose closest ALEC match is within
# a distance of 22, walking the list backwards so the smallest distances are
# printed first.
puts "Bill # | WV bill name | ALEC Bill Name | Similarity"
puts "--- | --- | --- | --- "
results.reverse_each do |entry|
  closest = entry[:alec_similar].first
  next unless closest[:hamming_distance] <= 22

  puts "#{entry[:number]} | #{entry[:title]} | #{closest[:title]} | #{closest[:hamming_distance]}"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment