Created
October 25, 2011 13:55
-
-
Save seanhandley/1312798 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Translate supplier content nodes.
# Looks horrible, but gets it done.
require 'google_fish' | |
# Helpers for cutting an HTML string into small chunks and sending each chunk
# through a translation client. Every method is a class-level method (T.foo).
class T
  # Default maximum number of characters per chunk.
  CHUNK_LIMIT = 100

  # Number of times one chunk is submitted before giving up on it.
  MAX_ATTEMPTS = 4

  # Characters treated as sentence separators, escaped for use inside a
  # regexp character class.
  SEPARATORS = '\.\?\!;:'.freeze

  # A separator that has no further separator after it, i.e. the last one.
  LAST_SEPARATOR = Regexp.new("[" + SEPARATORS + "](?![^" + SEPARATORS + "]*[" + SEPARATORS + "])")

  # A '<' with no '>' before the next '<' (or before the end of the string).
  UNCLOSED_TAG = /<(?![^<]*>)/

  # The string ends with '>' (optionally followed by whitespace). Anchored
  # with \z rather than $ so that only the real end of the string counts,
  # not the end of any line inside it.
  TRAILING_TAG_CLOSE = />\s*\z/

  # A '>' that has no '<' between it and the next '>' (or the end).
  LAST_TAG_CLOSE = />(?![^>]*<)/

  class << self
    # Translates every chunk in order and concatenates the results.
    #
    # google             - client object responding to #translate
    # target_lang        - language identifier handed through to the client
    # translation_chunks - Array of String chunks
    #
    # Returns the joined translated String ('' for an empty list).
    def translate_html_chunks(google, target_lang, translation_chunks)
      translation_chunks.map do |chunk|
        translate_html_chunk(google, target_lang, chunk)
      end.join
    end

    # Translates a single chunk, trying up to MAX_ATTEMPTS times.
    #
    # Each StandardError raised by the client is logged through Rails.logger
    # and the call is attempted again. If every attempt fails the chunk is
    # returned untranslated so the surrounding page stays complete.
    #
    # Returns whatever google.translate returns, or chunk_to_translate.
    def translate_html_chunk(google, target_lang, chunk_to_translate)
      1.upto(MAX_ATTEMPTS) do |attempt|
        begin
          return google.translate(:en, target_lang, chunk_to_translate, :html => true)
        rescue StandardError => e
          Rails.logger.warn "googlefish - attempt #{attempt} - translation exception - #{e.inspect}"
        end
      end
      chunk_to_translate
    end

    # Splits body_html into consecutive chunks which, joined back together,
    # reproduce the input exactly.
    #
    # body_html - String of HTML; nil is treated as an empty string
    # limit     - maximum chunk size in characters (default CHUNK_LIMIT)
    #
    # Chunks are normally at most `limit` characters long. The one exception
    # is a chunk that starts with a '<' which is not closed within `limit`
    # characters: it is extended to the next '>' so the tag is kept whole
    # (see #unsplittable_piece).
    #
    # Returns an Array of non-empty Strings.
    # Raises ArgumentError if limit is not positive.
    def prepare_translation_chunks(body_html, limit = CHUNK_LIMIT)
      raise ArgumentError, "limit must be positive" unless limit > 0

      html = body_html.to_s
      chunks = []
      position = 0
      while position < html.length
        remainder = html[position, html.length]
        piece = truncate(remainder, limit)
        # truncate yields '' when the remainder opens with an unclosed '<'.
        # Stopping there would silently discard the rest of the document, so
        # force progress instead.
        piece = unsplittable_piece(remainder, limit) if piece.empty?
        chunks << piece
        position += piece.length
      end
      chunks
    end

    # Returns a prefix of text, at most `limit` characters long, trimmed to a
    # convenient boundary:
    #
    # * if the window contains an unclosed '<', everything before that '<'
    #   (which is '' when the '<' is the very first character);
    # * otherwise, if the window ends with '>', the whole window;
    # * otherwise, if it contains a sentence separator, the window up to and
    #   including whichever comes later: the last separator or the last '>';
    # * otherwise the whole window.
    def truncate(text, limit)
      window = text.to_s[0, limit]

      broken_tag_at = window =~ UNCLOSED_TAG
      return window[0, broken_tag_at] if broken_tag_at

      return window if window =~ TRAILING_TAG_CLOSE

      separator_at = window =~ LAST_SEPARATOR
      return window unless separator_at

      tag_close_at = window =~ LAST_TAG_CLOSE
      cut_at = tag_close_at && tag_close_at > separator_at ? tag_close_at : separator_at
      window[0, cut_at + 1]
    end

    private

    # Fallback for a remainder that starts with a '<' left unclosed within
    # the limit: take everything up to and including the next '>', or a plain
    # `limit`-sized slice when there is no '>' at all.
    def unsplittable_piece(remainder, limit)
      close_at = remainder.index('>')
      close_at ? remainder[0, close_at + 1] : remainder[0, limit]
    end
  end
end
# Script body: for every English content node attached to a hotel, create a
# machine-translated copy in each other language, unless a node with the
# 'booking_conditions' identifier already exists there for the same hotel.
english = Language.find_by_code 'en'
languages = Language.all.reject { |l| l == english }

# Only referenced by the commented-out reporting lines below.
report_filename = "google_vs_human.csv"
error_filename = "hotel_errors.log"

puts "Please enter your google api key:"
api_key = STDIN.gets.chomp
google = GoogleFish.new(api_key)

['AvailabilitySearch::Hotel'].each do |attachable_type|
  english_phrases = ContentNode.find(:all, :conditions => { :language_id => english.id, :content_attachable_type => attachable_type })

  languages.each do |lang|
    puts "processing #{lang.name}"
    I18n.locale = lang.code

    english_phrases.each do |node|
      # NOTE(review): the lookup is hard-coded to 'booking_conditions' while
      # english_phrases is not filtered by identifier - confirm that is intended.
      existing = ContentNode.find(:first, :conditions => { :identifier => 'booking_conditions', :language_id => lang.id, :content_attachable_id => node.content_attachable_id, :content_attachable_type => attachable_type })
      if existing
        #File.open(report_filename, 'a') {|f| f.write("#{node.id},'#{lang.name}','human'\n") }
        puts "skipping"
        next
      end

      # Nothing to translate; do not store an empty translation.
      if node.content.to_s.strip.empty?
        puts "skipping #{node.id} - no content"
        next
      end

      begin
        puts "translating #{node.id}"
        # Build in memory and persist once at the end, so a failed translation
        # does not leave a blank record behind.
        new_node = ContentNode.new
        new_node.content_attachable_id = node.content_attachable_id
        new_node.content_attachable_type = node.content_attachable_type
        # Carry the identifier over so the existence check above can recognise
        # this node on a later run.
        # NOTE(review): assumes translated nodes share the source identifier - confirm.
        new_node.identifier = node.identifier
        new_node.language_id = lang.id

        # Translate the ENGLISH node's content; the new node has none yet.
        html_chunks = T.prepare_translation_chunks(node.content)
        new_node.content = T.translate_html_chunks(google, lang.code.to_sym, html_chunks)

        puts new_node.content[0..100]
        puts new_node.inspect
        new_node.save!
        #File.open(report_filename, 'a') {|f| f.write("#{node.id},'#{lang.name}','google'\n") }
        puts "saved #{node.id} in #{lang.name}"
        sleep 1
      rescue StandardError => e
        # StandardError rather than Exception, so Ctrl-C and exit still stop the script.
        #File.open(report_filename, 'a') {|f| f.write("#{node.id},'#{lang.name}','error'\n") }
        #File.open(error_filename, 'a') {|f| f.write("#{node.id},'#{lang.name}','error','#{e.backtrace}'\n") }
        puts "Something went wrong while processing id #{node.id} for #{lang.name}: #{e.backtrace}"
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment