Last active
July 18, 2016 00:48
-
-
Save tylerpearson/99bf4cdc038a6acca83596b641255a8c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'simhash' | |
require 'json' | |
require 'pp' | |
# Year whose bills are analyzed, taken from the first command-line argument.
# It selects the input directory and prefixes the output file name.
YEAR = ARGV.first

# A missing (or empty) year is a usage error: report it on stderr and exit
# with a non-zero status so shells and calling scripts can detect the failure.
abort 'Error: Pass in a year as an argument' if YEAR.nil? || YEAR.empty?
### | |
# Reads the file at +filename+ and parses its contents as JSON.
#
# File.read is used instead of IO.read because IO.read interprets a name that
# begins with "|" as a command to execute, which is never wanted for a data file.
#
# @param filename [String] path to a JSON document
# @return [Object] the parsed JSON value (Hash, Array, ...)
# @raise [SystemCallError] (e.g. Errno::ENOENT) if the file cannot be read
# @raise [JSON::ParserError] if the contents are not valid JSON
def load_json(filename)
  JSON.parse(File.read(filename))
end
# Counts the bit positions at which two integers differ.
#
# @param a [Integer] first value
# @param b [Integer] second value
# @return [Integer] number of 1 bits in a XOR b
# @raise [ArgumentError] if the operands have opposite signs
def hamming_distance(a, b)
  # XOR leaves a 1 exactly where the two operands disagree.
  differing = a ^ b

  # Opposite signs make the XOR negative; to_s(2) would then render
  # "-<magnitude>" and the count below would be meaningless.
  if differing < 0
    raise ArgumentError, 'hamming distance is undefined for integers of opposite sign'
  end

  differing.to_s(2).count('1')
end
# Serializes +results+ with #to_json and writes the text to +filename+,
# replacing any existing contents of that file.
#
# @param results [#to_json] data to serialize
# @param filename [String] destination path
# @return [Integer] number of bytes written
def save_results(results, filename)
  # File.write opens, truncates, writes and closes in one call.
  File.write(filename, results.to_json)
end
### | |
# Largest Hamming distance to its closest ALEC model bill that a bill may have
# and still be listed in the printed table.
MAX_REPORTED_DISTANCE = 22
# Number of closest ALEC model bills recorded for each bill.
ALEC_MATCHES_PER_BILL = 3

path = "../bill-scraper/wv-bills/#{YEAR}"
alec_path = '../bill-analyzer/alec-simhashes.json'

abort "Error: bill directory not found: #{path}" unless Dir.exist?(path)

# Load the ALEC model bills once; the list is the same for every bill.
alec_bills = load_json(alec_path)
# The ranking below reads each bill's closest match, so at least one is required.
abort "Error: no ALEC model bills found in #{alec_path}" if alec_bills.empty?

results = []

# loop through each bill in the directory
Dir.foreach(path) do |item|
  bill_file = "#{path}/#{item}"
  # Skips '.' and '..' as well as any other subdirectory.
  next unless File.file?(bill_file)

  # load the bill
  bill = load_json(bill_file)

  # calculate the simhash of the bill text
  # (String#simhash is presumably provided by the required 'simhash' library)
  simhash = bill['text'].simhash(split_by: / /, stop_words: true, hashbits: 512)

  # distance from this bill to every ALEC model bill
  distances = alec_bills.map do |alec_bill|
    {
      title: alec_bill['title'],
      hamming_distance: hamming_distance(alec_bill['simhash'], simhash)
    }
  end

  results << {
    title: bill['title'],
    number: bill['number'],
    url: bill['url'],
    # closest ALEC model bills, smallest distance first
    alec_similar: distances.min_by(ALEC_MATCHES_PER_BILL) { |match| match[:hamming_distance] }
  }
end

# Order bills by the distance to their closest ALEC model bill, largest first.
results.sort_by! { |bill| bill[:alec_similar].first[:hamming_distance] }
results.reverse!

save_results(results, "#{YEAR}-wv-comparison-results.json")

# Print a pipe-delimited table of the closest matches, smallest distance first.
# The last column holds the Hamming distance, so lower values mean more similar.
puts "Bill # | WV bill name | ALEC Bill Name | Similarity"
puts "--- | --- | --- | --- "

close_matches = results.select do |bill|
  bill[:alec_similar].first[:hamming_distance] <= MAX_REPORTED_DISTANCE
end

close_matches.reverse_each do |bill|
  closest = bill[:alec_similar].first
  puts "#{bill[:number]} | #{bill[:title]} | #{closest[:title]} | #{closest[:hamming_distance]}"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment