Skip to content

Instantly share code, notes, and snippets.

@dgleich
Created September 23, 2009 18:33
Show Gist options
  • Save dgleich/192177 to your computer and use it in GitHub Desktop.
Save dgleich/192177 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Google Translate API code
Based on http://code.activestate.com/recipes/576890/ but heavily modified.
"""
__author__ = 'David F. Gleich'
__version__ = 1
import re
import sys
import urllib
import urllib2
import simplejson
import time
import pprint
baseTranslateUrl = "http://ajax.googleapis.com/ajax/services/language/translate"
""" Endpoint that translateRaw() posts its requests to. """

baseUserAgent = "Python-Google-Translate-%i" % (__version__,)
""" User-Agent header value sent with every request; embeds __version__. """

retryDelay = 10
""" The default retry delay when an AJAX call fails. """
htmlCodes = [
    ('&', '&amp;'),
    ('<', '&lt;'),
    ('>', '&gt;'),
    ('"', '&quot;'),
    ("'", '&#39;'),
]
""" A base set of html characters to encode and decode.

Each entry is (plain character, HTML entity). The ampersand comes first so
that an encoder walking the list front to back escapes '&' before it
introduces new ampersands of its own.
"""


def htmlDecode(s, codes=htmlCodes):
    """ Return `s` with the HTML entities listed in `codes` decoded.

    This does NOT remove normal HTML tags like <p>; it only substitutes
    entities. It is meant to be the inverse of an encoder that applies
    `codes` front to back.

    s     -- the string to decode.
    codes -- iterable of (character, entity) pairs; defaults to htmlCodes.

    Returns the decoded string.
    """
    # Undo the substitutions in the reverse of the encoding order so that
    # '&amp;' is handled last. Decoding it first would turn '&amp;lt;' into
    # '&lt;' and then, wrongly, into '<'.
    for code in reversed(list(codes)):
        s = s.replace(code[1], code[0])
    return s
def translateRaw(text,format='text',langpair='|en',
                 key=None,v='1.0',curReferrer=None,retry=5):
    """ Perform a raw call to the Google translate API.

    This function requires all attributes to be listed and only provides
    a simple retry option as a convenience.

    text        -- text to translate; it is encoded to UTF-8 before sending.
    format      -- value of the 'format' request parameter.
    langpair    -- value of the 'langpair' request parameter ('src|dst').
    key         -- optional API key; only sent when not None.
    v           -- value of the 'v' request parameter.
    curReferrer -- value for the Referer header. When None, a module-level
                   name `referrer` is used instead.
    retry       -- number of additional attempts made when the response
                   does not contain a translation.

    Returns the translated text, or:
      * None when no referrer is available or when the UTF-8 encoded text
        is longer than 4500 bytes (no request is sent in either case);
      * "" when every attempt came back without a translation.
    Errors raised while opening the URL or parsing the JSON propagate.
    """
    import zlib  # local import: only needed to inflate gzip responses

    params = {
        'q': text.encode('utf-8'),
        'format': format,
        'langpair': langpair,
        'v': v,
    }
    if key is not None:
        params['key'] = key
    if curReferrer is None:
        try:
            curReferrer = referrer  # load referrer from outer namespace
        except NameError:
            print("invalid referrer")
            return None
    if len(params['q']) > 4500:
        return None

    request = urllib2.Request(baseTranslateUrl, data=urllib.urlencode(params))
    request.add_header("Referer", curReferrer)
    request.add_header("Accept-encoding", "gzip")
    request.add_header("User-Agent", baseUserAgent)

    response = urllib2.urlopen(request)
    try:
        body = response.read()
        encoding = response.info().get('Content-Encoding') or ''
    finally:
        response.close()
    # We asked for gzip above and urllib2 does not inflate it for us, so
    # do it here before handing the payload to the JSON parser.
    if encoding.strip().lower() == 'gzip':
        body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
    resp = simplejson.loads(body)

    try:
        return resp['responseData']['translatedText']
    except (KeyError, TypeError):
        # KeyError: a field is missing; TypeError: 'responseData' (or the
        # whole reply) is not a mapping, e.g. null on an API error.
        pass

    if retry <= 0:
        print("**************** Error! ****************")
        pprint.pprint(resp)
        return ""

    # Use .get so a reply without 'responseDetails' cannot abort the retry.
    details = resp.get('responseDetails') if isinstance(resp, dict) else resp
    print("Warning (retry=%i): %s" % (retry, details))
    return translateRaw(text, format, langpair, key, v, curReferrer,
                        retry - 1)
def translateLines(lines,src='',to='en',delay=1,callback=None,
sentinal='___UNABLE_TO_TRANSLATE___'):
"""
Translate multiple lines of independent text.
This function takes a set of lines of text (non-html) and encodes them
into a single translate call using the maximum possible size.
If translation of any group of lines fail, they are translated with
a sentinal value.
"""
output = []
langpair = '%s|%s'%(src,to)
oldlen = 0
curlen = 7 # 7 extra chars on everything
curlines = []
for i,line in enumerate(lines):
linelen = len(line.encode('utf-8'))+7
if curlen + linelen > 4500:
# translate the current group
q = '<p>'+'</p><p>'.join(curlines)+'</p>'
resp = translateRaw(q,format='html',langpair=langpair)
if len(resp) == 0:
# this is an error, so append a sentinal
for line in curlines:
output.append(sentinal)
else:
for p in resp.split('</p>'):
output.append(htmlDecode(p[4:-1]))
output.pop() # we always get an extra
if callback:
callback(i,len(lines))
time.sleep(delay)
curlines = [line]
curlen = linelen+7 # first line has 7 extra chars
else:
curlines.append(line)
curlen += linelen
q = '<p>'+'</p><p>'.join(curlines)+'</p>'
resp = translateRaw(q,format='html',langpair=langpair)
if len(resp) == 0:
# this is an error, so append a sentinal
for line in curlines:
output.append(sentinal)
else:
for p in resp.split('</p>'):
output.append(htmlDecode(p[4:-1]))
output.pop() # we always get an extra
return output
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment