Created
December 30, 2015 01:45
-
-
Save tylerpearson/ed2e47fed892e51a1cac to your computer and use it in GitHub Desktop.
Ruby script to convert bills into simhashes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'simhash' | |
require 'json' | |
require 'pp' | |
### Helper methods
# Reads the file at +filename+ and parses its contents as JSON.
#
# File.read is used instead of IO.read: IO.read has historically treated a
# path that begins with "|" as a command to execute, whereas File.read only
# ever opens a file.
#
# @param filename [String] path to a JSON file
# @return [Object] the parsed JSON value (Hash, Array, ...)
# @raise [Errno::ENOENT] if the file does not exist
# @raise [JSON::ParserError] if the contents are not valid JSON
def load_json(filename)
  JSON.parse(File.read(filename))
end
# Counts the bit positions at which the integers +a+ and +b+ differ.
#
# @param a [Integer]
# @param b [Integer]
# @return [Integer] number of "1" digits in the binary form of a XOR b
def hamming_distance(a, b)
  differing_bits = a ^ b
  differing_bits.to_s(2).count('1')
end
# Serializes +results+ with #to_json and writes it to +filename+, replacing
# any existing contents of that file.
#
# @param results [#to_json] data to persist
# @param filename [String] destination path
def save_results(results, filename)
  File.open(filename, 'w') { |out| out.write(results.to_json) }
end
# Builds a slug from +str+: lowercases it, trims surrounding whitespace,
# turns each space into a hyphen, then drops every character that is neither
# a word character nor a hyphen.
#
# @param str [String]
# @return [String]
def to_slug(str)
  hyphenated = str.downcase.strip.tr(' ', '-')
  hyphenated.gsub(/[^\w-]/, '')
end
### Scraper
# Accumulates one entry per processed bill; whichever mode runs below
# serializes this array to a JSON file when it finishes.
bill_hashes = []

# "wv" mode: ARGV[1] names a subdirectory of ../bill-scraper/wv-bills. Every
# entry in that directory is loaded as JSON and its 'text' value is hashed.
if ARGV.first == 'wv'
  # Core Ruby arrays have no #second (it is an ActiveSupport extension), so
  # the second command-line argument is read by index instead.
  subdir = ARGV[1]
  bills_path = "../bill-scraper/wv-bills/#{subdir}"

  Dir.foreach(bills_path) do |item|
    # Dir.foreach also yields the current/parent directory entries.
    next if item == '.' || item == '..'

    bill = load_json("#{bills_path}/#{item}")
    # The bill number is the file name up to its first dot.
    bill_number = item.split('.').first
    text = bill['text']
    simhash = text.simhash(:split_by => / /, :stop_words => true, :hashbits => 512)

    puts "#{bill_number.upcase}'s simhash is #{simhash}"

    bill_hashes << {
      number: bill_number,
      simhash: simhash
    }
  end

  save_results(bill_hashes, "wv-#{subdir}-bill-simhashes.json")
end
# "alec" mode: loads the list of model bills from a fixed JSON file, computes
# a simhash of each item's 'text', and records it alongside the item's title
# and a slug derived from that title. The collected entries are
# pretty-printed and then saved to alec-simhashes.json.
if ARGV.first == 'alec'
  alec_path = '../alec-finder/alec-model-bills.json'

  load_json(alec_path).each do |model_bill|
    title = model_bill['title']
    slug = to_slug(title)
    simhash = model_bill['text'].simhash(split_by: / /, stop_words: true, hashbits: 512)

    puts "#{title}'s simhash is #{simhash}"

    bill_hashes << { slug: slug, title: title, simhash: simhash }
  end

  pp bill_hashes
  save_results(bill_hashes, 'alec-simhashes.json')
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment