Skip to content

Instantly share code, notes, and snippets.

@leeomara
Forked from jashkenas/document_import.rb
Created August 28, 2011 05:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leeomara/1176275 to your computer and use it in GitHub Desktop.
Save leeomara/1176275 to your computer and use it in GitHub Desktop.
require File.dirname(__FILE__) + '/support/setup'
class DocumentImport < CloudCrowd::Action
def split
inputs_for_processing(document, 'images', 'text')
end
def process
@pdf = document.slug + '.pdf'
File.open(@pdf, 'w+') {|f| f.write(asset_store.read_pdf(document)) }
case input['task']
when 'text' then process_text
when 'images' then process_images
end
end
def merge
document.update_attributes :access => access
document.id
end
def process_images
Docsplit.extract_images(@pdf, :format => :gif, :size => Page::IMAGE_SIZES.values, :output => 'images')
Dir['images/700x/*.gif'].length.times do |i|
image = "#{document.slug}_#{i + 1}.gif"
asset_store.save_page_images(
Page.new(:document_id => document.id, :page_number => i + 1),
{'normal' => "images/700x/#{image}",
'large' => "images/1000x/#{image}",
'thumbnail' => "images/60x75!/#{image}"},
access
)
end
end
def process_text
pages = []
extractor = DC::Import::TextExtractor.new(@pdf)
if extractor.contains_text?
Docsplit.extract_text(@pdf, :pages => :all, :output => 'text')
Dir['text/*.txt'].length.times do |i|
save_page_text(File.read("text/#{document.slug}_#{i + 1}.txt"), i + 1)
end
else
Docsplit.extract_pages(@pdf, :output => 'text')
Dir['text/*.pdf'].length.times do |i|
text = DC::Import::TextExtractor.new("text/#{document.slug}_#{i + 1}.pdf").text_from_ocr
save_page_text(text, i + 1)
end
end
text = document.combined_page_text
document.full_text = FullText.new(:text => text, :document => document)
# document.description = document.full_text.summarize
Page.refresh_page_map(document)
MetadataDate.refresh(document)
document.save!
meta_extractor = DC::Import::MetadataExtractor.new
meta_extractor.extract_metadata(document)
# asset_store.save_rdf(document, meta_extractor.rdf, DC::Access::PRIVATE)
asset_store.save_full_text(document, access)
document.id
end
private
def save_page_text(text, page_number)
page = Page.create!(:document => document, :text => text, :page_number => page_number)
asset_store.save_page_text(page, access)
end
def document
return @document if @document
ActiveRecord::Base.establish_connection
@document = Document.find(options['id'])
end
def asset_store
@asset_store ||= DC::Store::AssetStore.new
end
def access
options['access'] || DC::Access::PRIVATE
end
def inputs_for_processing(doc, *tasks)
tasks.map {|t| {:task => t, :id => doc.id} }
end
end
# Inherit Rails environment from Sinatra.
RAILS_ROOT = File.expand_path(Dir.pwd)
RAILS_ENV = ENV['RAILS_ENV'] = ENV['RACK_ENV']
# Load the DocumentCloud environment if we're in a Node context.
if CloudCrowd.node?
require 'logger'
Object.const_set "RAILS_DEFAULT_LOGGER", Logger.new(STDOUT)
require 'rubygems'
gem 'rails', '~> 2.0'
require 'active_record'
require 'config/environment'
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment