Skip to content

Instantly share code, notes, and snippets.

#! /bin/sh
### BEGIN INIT INFO
# Provides: cloud-crowd
# Required-Start: $all
# Required-Stop: $all
# Default-Start: 2 3 4 5
# Default-Stop: S 0 1 6
# Short-Description: starts a cloud-crowd node
# Description: starts a cloud-crowd node using start-stop-daemon
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>KeepAlive</key>
<false/>
<key>Label</key>
<string>org.documentcloud.cloudcrowd</string>
<key>Program</key>
<string>crowd</string>
# Create a <full_text> node attached to the `xml` document
# (`xml` and `path` are defined outside this excerpt).
full_text = Nokogiri::XML::Node.new('full_text', xml)
# Store the file's contents with every character outside the POSIX
# [:print:] class removed. Note that this also drops control characters
# such as newlines and tabs, not just binary junk.
full_text.content = File.read(path).gsub(/[^[:print:]]/, '')
# Create an <exists> node whose content is the literal string '1'.
existence = Nokogiri::XML::Node.new('exists', xml)
existence.content = '1'
# Alternate approaches that don't work...
# to_xs is way, way too slow for production -- especially if
# we're rebuilding the index all the time. At least parallelize it
# in CloudCrowd.
# full_text.content = File.read(path).to_xs
# Compute the Levenshtein (edit) distance between two sequences: the minimum
# number of single-element insertions, deletions and substitutions needed to
# turn +a+ into +b+.
#
# Works on anything that responds to +empty?+, +length+ and integer +[]+
# (Strings and Arrays in practice).
#
# The previous version used the Ruby 1.8-only `when cond: value` syntax, which
# is a SyntaxError on Ruby 1.9+, and recursed three ways per element, giving
# exponential running time. This version fills the classic dynamic-programming
# table one row at a time, so it runs in O(a.length * b.length).
#
# @param a [String, Array] source sequence
# @param b [String, Array] target sequence
# @return [Integer] the edit distance between +a+ and +b+
def levenshtein(a, b)
  return b.length if a.empty?
  return a.length if b.empty?

  # previous[j] is the distance between the first i elements of a and the
  # first j elements of b; row 0 is "insert j elements".
  previous = (0..b.length).to_a

  a.length.times do |i|
    # Column 0 of each row is "delete i + 1 elements".
    current = [i + 1]
    b.length.times do |j|
      substitution = previous[j] + (a[i] == b[j] ? 0 : 1)
      deletion     = previous[j + 1] + 1
      insertion    = current[j] + 1
      current << [substitution, deletion, insertion].min
    end
    previous = current
  end

  previous.last
end
// Javascript can be so much more pleasant when it's functional -- re-implement
// a bunch of utility methods from Prototype and Steele's Functional...
window._ = {
// The centerpiece, an each implementation.
// Handles objects implementing forEach, _each, arrays, and raw objects.
each : function(obj, iterator, context) {
var index = 0;
try {
if (obj.forEach) {
// Extend the JQuery namespace with core utility methods for DOM manipulation.
$.extend({
// Quick-create a dom element with attributes.
el : function(tagName, attributes) {
var el = document.createElement(tagName);
$(el).attr(attributes);
return el;
},
$.fn.extend({
// When the next click or keypress happens, anywhere on the screen, hide the
// element. 'clickable' makes the element and its contents clickable without
// hiding. The 'onHide' callback runs when the hide fires, and has a chance
// to cancel it.
autohide : function(options) {
var me = this;
options = _.extend({clickable : null, onHide : null}, options || {});
me._autoignore = true;
# Action that pulls the page title out of an HTML document.
class Scrape < CloudCrowd::Action
  # Extract the title from an HTML page.
  #
  # Reads the file at +input_path+ (not defined in this class; presumably
  # supplied by CloudCrowd::Action) and returns the text of the first
  # <title> element, or "Untitled" when the page has none.
  #
  # The pattern is deliberately:
  # * non-greedy (+.*?+), so a second </title> later on the same line (for
  #   example an inline SVG title in minified HTML) is not swallowed;
  # * multiline (+m+), so a title that wraps across lines is still found;
  # * tolerant of attributes on the opening tag (+<title lang="en">+).
  #
  # @return [String] the captured title text, or "Untitled"
  def process
    html = File.read(input_path)
    match = html.match(/<title\b[^>]*>(.*?)<\/title>/im)
    match ? match[1] : "Untitled"
  end
end
#!/usr/bin/env ruby -rubygems
require 'restclient'
require 'json'
RestClient.post('http://localhost:9173/jobs',
{:job => {
'action' => 'scrape',
-- Fetch documents that have BOTH a 'person' metadata entry matching "bush"
-- AND a 'city' metadata entry matching "washington".
-- DISTINCT ON (documents.id) keeps a single row per document id.
-- NOTE(review): to_tsvector is given the 'english' configuration explicitly,
-- while plainto_tsquery uses its one-argument form and therefore the
-- server's default_text_search_config -- confirm the two agree.
SELECT distinct on (documents.id) documents.*
FROM "documents"
WHERE (documents.id in (
-- Ids of documents with a person metadata value matching 'bush'.
(select document_id from metadata where
(metadata.kind = E'person' and
to_tsvector('english', metadata.value) @@ plainto_tsquery(E'bush')))
-- INTERSECT keeps only the ids present in both sub-selects.
intersect
-- Ids of documents with a city metadata value matching 'washington'.
(select document_id from metadata where
(metadata.kind = E'city' and
to_tsvector('english', metadata.value) @@ plainto_tsquery(E'washington')))))