Skip to content

Instantly share code, notes, and snippets.

@lukas
Created February 21, 2010 00:18
Show Gist options
  • Save lukas/310002 to your computer and use it in GitHub Desktop.
Save lukas/310002 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'json'
require 'ruby-crowdflower'
require 'builder'
require 'sinatra'
# Authenticate against the CrowdFlower API for every call made below.
# NOTE(review): the API key is hard-coded in source; consider loading it from
# the environment instead.
CrowdFlower.connect!('6d96e6206e025396d62814c905eebc0687349b42')
#JobId = 4859
# The id of the job to serve is the first line of the "job_id" file that sits
# next to this script. The block form of File.open closes the handle as soon
# as the line has been read (the previous chained call left it open).
# NOTE(review): class-variable access at toplevel raises RuntimeError on
# Ruby 3.0+; the name is kept because the routes below read @@jobId.
@@jobId = File.open(File.dirname(__FILE__) + "/job_id") { |f| f.readline.chomp.to_i }
# Maps CrowdFlower judgment field names (keys) to the element names used for
# the same data in the generated feed (values). Entry order is the order in
# which the fields are copied by crowdflower_judgment_to_u_judgment.
# Frozen so this shared constant cannot be mutated at runtime.
Mapping = {
  "address"             => "address",
  "city"                => "city",
  "first_name"          => "firstname",
  "last_name"           => "lastname",
  "department"          => "department",
  "author"              => "author",
  "_updated_at"         => "updated",
  "categoryaid_request" => "categorization",
  "notes"               => "notes",
  "sms_translation"     => "summary",
  "gender"              => "gender",
  "carrier_id"          => "carrierid",
  "status"              => "status",
  "title"               => "sms"
}.freeze
# Extracts the single raw answer from one CrowdFlower judgment field.
#
# cf_j - the field value. Recognised shapes:
#          String                   -> returned unchanged
#          Time                     -> converted with to_s
#          Array                    -> its first element
#          Hash with a truthy "res" -> first element of cf_j["res"]
#
# Returns the extracted value.
# Raises RuntimeError naming the offending value for anything else
# (including nil, numbers and a Hash without "res"), so a malformed record
# is reported clearly instead of surfacing as a NoMethodError.
def get_raw_judgment(cf_j)
  case cf_j
  when String then return cf_j
  when Time   then return cf_j.to_s
  when Array  then return cf_j[0]
  when Hash
    return cf_j["res"][0] if cf_j["res"]
  end
  raise "Unrecognized judgment value: #{cf_j.inspect}"
end
# Converts one CrowdFlower judgment into the flat hash used to build a feed
# entry.
#
# id          - identifier of the judgment; stored under "id".
# cf_judgment - hash-like judgment indexed by CrowdFlower field name.
#
# Every field listed in Mapping is copied under its feed name; a field that
# is absent (nil/false) becomes "". "georss:point" is "<latitude> <longitude>"
# and, consistently with the other fields, "" when either coordinate is
# absent (previously a missing coordinate raised and aborted the conversion).
#
# Returns the new Hash. Also prints debugging output to stdout.
def crowdflower_judgment_to_u_judgment(id, cf_judgment)
  puts "ID: #{id}"
  #puts cf_judgment.inspect
  u_judgment = {}
  Mapping.each_pair do |cf_term, u_term|
    raw = cf_judgment[cf_term]
    if raw
      # Debug trace of each field that is present.
      puts cf_term
      puts raw
      puts raw.class
      u_judgment[u_term] = get_raw_judgment(raw)
    else
      u_judgment[u_term] = ""
    end
  end
  u_judgment["id"] = id

  latitude = cf_judgment["latitude"]
  longitude = cf_judgment["longitude"]
  u_judgment["georss:point"] =
    if latitude && longitude
      "#{get_raw_judgment(latitude)} #{get_raw_judgment(longitude)}"
    else
      ""
    end
  u_judgment
end
# Renders converted judgments as an Atom feed with GeoRSS points.
#
# u_judgments - Array of hashes as produced by
#               crowdflower_judgment_to_u_judgment.
#
# The feed-level <updated> is taken from the first judgment. Each judgment
# becomes one <entry> holding a title, a link and one element per key/value
# pair. "updated" values are reformatted with Time#xmlschema; a blank
# "updated" (the value a judgment gets when it has no timestamp) is emitted
# as-is instead of being passed to Time.parse, which would raise
# ArgumentError and abort the whole feed.
#
# Returns the XML document as a String.
def generate_feed(u_judgments)
  buffer = %(<?xml version="1.0" encoding="utf-8"?>\n) +
           %(<feed xmlns="http://www.w3.org/2005/Atom" xmlns:georss="http://www.georss.org/georss">\n)
  # The builder appends its output directly to `buffer`.
  xml = Builder::XmlMarkup.new(:indent => 2, :target => buffer)
  xml.title "4636.crowdflower.com"
  xml.link :href => "http://4636.crowdflower.com"
  xml.id "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
  unless u_judgments.empty?
    feed_updated = u_judgments[0]["updated"].to_s
    xml.updated Time.parse(feed_updated).xmlschema unless feed_updated.strip.empty?
  end
  xml.author { xml.name "CrowdFlower" }
  u_judgments.each do |u_judgment|
    xml.entry do
      xml.title "#{u_judgment["firstname"]} #{u_judgment["lastname"]} at #{u_judgment["georss:point"]}"
      xml.link :href => "http://4636.crowdflower.com"
      u_judgment.each_pair do |key, value|
        if key == "updated" && !value.to_s.strip.empty?
          xml.tag!(key, Time.parse(value).xmlschema)
        else
          xml.tag!(key, value)
        end
      end
    end
  end
  #puts buffer
  # The opening <feed> tag was written by hand above, so close it by hand.
  buffer << "</feed>"
  buffer
end
# Root route: answers with a fixed greeting.
get('/') { 'Hello world!' }
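# Query parameters the feed consumer expects, as agreed in the chat log below:
#   limit=OFFSET,COUNT  pagination (e.g. limit=0,10, then limit=10,10)
#   uptots=UNIXTIME     upper bound on the judgment timestamp
#   sincets=UNIXTIME    lower bound on the judgment timestamp
#   category=4,5,6      comma-separated categories (also e.g. 4a,4b)
#   carrierid=1         carrier id (1 or 2)
#
# [16:10:13] Robert Munro: Are they requesting all N records at once each time?
# [16:53:39] Brian Herbert: no
# [16:53:44] Brian Herbert: gotta paginate
# [16:53:51] Brian Herbert: &limit=0,10
# [16:53:55] Robert Munro: clear
# [16:53:56] Brian Herbert: &limit=10,10
# [16:53:58] Brian Herbert: etc
# [16:53:59] Brian Herbert: also
# [16:54:05] Brian Herbert: we have timestamp filtering
# [16:54:44] Brian Herbert: &uptots=[UNIXTIMESTAMP] and &sincets=[UNIXTIMESTAMP]
# [16:54:56] Brian Herbert: and &category=4,5,6
# [16:55:08] Brian Herbert: or &category=4a,4b
# [16:55:08] Brian Herbert: and &carrierid=1 or 2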
# Decides whether a converted judgment passes the request's filters.
#
# u_j    - judgment hash; reads "updated", "categorization" and "carrierid".
# params - request parameters; the optional filters are
#          :uptots    - Unix timestamp; the judgment must be strictly older.
#          :sincets   - Unix timestamp; the judgment must be strictly newer.
#          :category  - comma-separated list; at least one item must occur
#                       as a substring of the judgment's categorization.
#          :carrierid - must equal the judgment's carrier id. Compared as
#                       strings so a numeric id still matches the
#                       (always String) request parameter.
#
# A judgment with a blank "updated" fails any time filter rather than
# raising from Time.parse.
#
# Returns true when every supplied filter is satisfied, false otherwise.
def judgment_satisfy_params?(u_j, params)
  if params[:uptots] || params[:sincets]
    updated = u_j["updated"].to_s
    return false if updated.strip.empty?
    j_time = Time.parse(updated) # parsed once, shared by both bounds
    if params[:uptots]
      return false unless j_time < Time.at(params[:uptots].to_i)
    end
    if params[:sincets]
      return false unless j_time > Time.at(params[:sincets].to_i)
    end
  end

  if params[:category]
    categorization = u_j["categorization"]
    return false unless categorization
    wanted = params[:category].split(",")
    return false unless wanted.any? { |cat| categorization.to_s.include?(cat) }
  end

  if params[:carrierid]
    return false unless u_j["carrierid"].to_s == params[:carrierid].to_s
  end

  true
end
# Pages through a job's judgments, converts them and keeps those that pass
# the request filters, stopping once at least `limit` have been collected.
#
# jobId - id of the CrowdFlower job to read.
# limit - number of matching judgments after which paging stops; the result
#         may hold more, since whole pages are appended.
#
# NOTE: `params` is not an argument. It is resolved on the caller's `self`,
# so this method only works where `params` is available; in this file it is
# called from the '/feed' route.
#
# Returns an Array of converted judgment hashes.
# Raises RuntimeError when a non-empty page is not a Hash.
def get_crowdflower_results(jobId, limit)
  per_page = 30
  collected = []
  1.upto(1000) do |page_number|
    cf_job = CrowdFlower::Job.new(jobId)
    puts "-----"
    puts per_page
    puts page_number
    batch = CrowdFlower::Judgment.new(cf_job).all(page_number, per_page)
    break if batch.size == 0
    # Anything other than a Hash is some kind of error response - we need to
    # handle it better.
    unless batch.class == Hash
      raise("Couldn't load #{page_number} #{per_page}")
    end
    converted = batch.to_a.map { |id, j| crowdflower_judgment_to_u_judgment(id, j) }
    collected += converted.select { |u_j| judgment_satisfy_params?(u_j, params) }
    break if collected.size >= limit
  end
  collected
end
# Serves the filtered judgments as an Atom feed.
#
# Query parameters:
#   key   - must equal the shared secret, otherwise "invalid key" is returned.
#   limit - "OFFSET,COUNT" pagination (defaults to 0,10). COUNT is the number
#           of entries to return, so limit=10,10 yields entries 10..19.
#   uptots, sincets, category, carrierid - see judgment_satisfy_params?.
#
# Pagination fix: the previous version fetched only COUNT results and sliced
# with a length of COUNT-OFFSET, so every page after the first came back
# empty. OFFSET+COUNT results are now gathered and COUNT of them returned.
#
# NOTE(review): the key is hard-coded and compared with ==; consider moving
# it to configuration.
get '/feed' do
  content_type 'application/xml', :charset => 'utf-8'
  return "invalid key\n\n" unless params[:key] == "yqNm7FHSwfdRb8nC2653"

  offset = 0
  limit = 10
  if params[:limit]
    offset, limit = params[:limit].split(",")
    # Negative values make no sense for pagination; clamp them to zero.
    offset = [offset.to_i, 0].max
    limit = [limit.to_i, 0].max
  end

  # Enough matches are needed to skip `offset` and still fill `limit`.
  needed = offset + limit
  all_u_judgments = get_crowdflower_results(@@jobId, needed)
  if all_u_judgments.size < needed
    # Top up from the secondary job when the primary one runs short.
    all_u_judgments += get_crowdflower_results(4901, needed - all_u_judgments.size)
  end

  # slice returns nil when offset lies beyond the end of the array.
  page = all_u_judgments.slice(offset, limit) || []
  puts page.length
  generate_feed(page)
end
# Sends the visitor on to the CrowdFlower judgment page for job 4980.
get('/label') { redirect "http://crowdflower.com/judgments/mob/4980" }
# Small HTML status page: how many judgments the job still needs and how
# many messages have been classified.
#
# A missing 'all_judgments' count is treated as 0 (via to_i) so the page
# still renders instead of failing on nil arithmetic.
get '/status' do
  status = CrowdFlower::Job.new(@@jobId).status
  # Fixed amounts added to the live count - presumably judgments collected
  # under earlier jobs; TODO confirm.
  base_counts = 9856 + 1714
  classified = status['all_judgments'].to_i + base_counts
  queued = status['needed_judgments']
  <<END
<html>
<body>
<p>Number of messages in queue: <b>#{queued}</b> Number of messages classified: <b>#{classified}</b>
</body>
</html>
END
end
# Older variant of the feed: reads a single page from the job and applies
# no filters.
#
# Query parameters:
#   limit - "OFFSET,COUNT" pagination (defaults to 0,10).
#
# Fixes relative to the previous version:
#   * offset/limit are converted to integers (they stayed Strings when the
#     parameter was supplied, so the arithmetic on them raised);
#   * the judgments are turned into [id, judgment] pairs before slicing,
#     because slice(offset, length) is an Array operation while the API
#     result is a Hash (see the Hash check in get_crowdflower_results);
#   * OFFSET+COUNT records are requested so the requested window exists.
#
# NOTE(review): unlike '/feed', this route does not check params[:key] and
# does not set a content type.
get '/feedold' do
  offset = 0
  limit = 10
  if params[:limit]
    offset, limit = params[:limit].split(",")
    offset = [offset.to_i, 0].max
    limit = [limit.to_i, 0].max
  end
  job = CrowdFlower::Job.new(@@jobId)
  judgments = CrowdFlower::Judgment.new(job).all(1, offset + limit)
  # slice returns nil when offset lies beyond the end of the array.
  page = judgments.to_a.slice(offset, limit) || []
  u_judgments = page.map { |id, j| crowdflower_judgment_to_u_judgment(id, j) }
  generate_feed(u_judgments)
end
#puts u_judgments[0].inspect
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment