Created
September 23, 2009 18:33
-
-
Save dgleich/192177 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Google Translate API code
Based on http://code.activestate.com/recipes/576890/ but heavily modified.
"""
# Module metadata; __version__ is an integer because it is formatted with
# "%i" when the User-Agent string is built.
__author__ = 'David F. Gleich'
__version__ = 1
import gzip
import io
import pprint
import re
import sys
import time
import urllib
import urllib2

import simplejson
baseTranslateUrl = "http://ajax.googleapis.com/ajax/services/language/translate"
""" The URL for the Google translate API. """

baseUserAgent = "Python-Google-Translate-%i" % (__version__,)
""" The default User-Agent string for the browser. """

# NOTE(review): retryDelay is not referenced by any function visible in this
# file; its unit is presumably seconds -- confirm before relying on it.
retryDelay = 10
""" The default retry delay when an AJAX call fails. """
# Each pair is (literal character, HTML entity that stands for it).
# The ampersand pair is listed first.
htmlCodes = [
    ('&', '&amp;'),
    ('<', '&lt;'),
    ('>', '&gt;'),
    ('"', '&quot;'),
    ("'", '&#39;'),
]
""" A base set of html characters to encode and decode. """
def htmlDecode(s, codes=None):
    """ Returns the decoded version of the given HTML string. This does
    NOT remove normal HTML tags like <p>. It is the inverse of htmlEncode().
    (Ripped from somewhere on the internet.)

    s     -- the string whose entities should be replaced.
    codes -- a sequence of (character, entity) pairs; when omitted, the
             module-level htmlCodes table is used.

    The pairs are applied in reverse order, so a table that lists '&' first
    has its ampersand entity decoded last. Decoding it first would turn
    '&amp;lt;' into '&lt;' and then, wrongly, into '<'.
    """
    if codes is None:
        # Looked up at call time rather than bound as a shared default.
        codes = htmlCodes
    for code in reversed(list(codes)):
        s = s.replace(code[1], code[0])
    return s
def translateRaw(text, format='text', langpair='|en',
                 key=None, v='1.0', curReferrer=None, retry=5):
    """ Perform a raw call to the Google translate API.

    This function requires all attributes to be listed and only provides
    a simple retry option as a convenience.

    text        -- the text to translate; it is encoded as UTF-8 for the call.
    format      -- sent as the 'format' request parameter.
    langpair    -- sent as the 'langpair' request parameter.
    key         -- optional; sent as the 'key' request parameter when given.
    v           -- sent as the 'v' request parameter.
    curReferrer -- value for the Referer header; when None, the module-level
                   name 'referrer' is used instead.
    retry       -- how many additional attempts to make when the response
                   carries no translated text.

    Returns the translated text on success, "" when every attempt came back
    without a translation, and None when no referrer is available or the
    UTF-8 encoded text is longer than 4500 bytes (no request is made).
    Errors raised by urllib2.urlopen itself are not caught here.
    """
    params = {'q': text.encode('utf-8'), 'format': format,
              'langpair': langpair, 'v': v}
    if key is not None:
        params['key'] = key
    if curReferrer is None:
        try:
            curReferrer = referrer  # load referrer from outer namespace
        except NameError:
            print("invalid referrer")
            return None
    if len(params['q']) > 4500:
        return None
    postData = urllib.urlencode(params)

    # Attempts follow each other immediately; no delay is applied between them.
    while True:
        request = urllib2.Request(baseTranslateUrl, data=postData)
        request.add_header("Referer", curReferrer)
        request.add_header("Accept-encoding", "gzip")
        request.add_header("User-Agent", baseUserAgent)
        response = urllib2.urlopen(request)
        try:
            body = response.read()
            contentEncoding = response.info().get('Content-Encoding') or ''
        finally:
            # Release the connection even if reading fails.
            response.close()
        if contentEncoding.lower() == 'gzip':
            # The request advertises gzip support, so a compressed body has
            # to be inflated here before it can be parsed as JSON.
            body = gzip.GzipFile(fileobj=io.BytesIO(body)).read()
        resp = simplejson.loads(body)

        try:
            return resp['responseData']['translatedText']
        except (KeyError, TypeError):
            # Missing keys, or a container that cannot be indexed this way:
            # treat the response as a failed attempt.
            pass

        if retry <= 0:
            print("**************** Error! ****************")
            pprint.pprint(resp)
            return ""
        if isinstance(resp, dict):
            details = resp.get('responseDetails')
        else:
            details = resp
        print("Warning (retry=%i): %s" % (retry, details))
        retry -= 1
def _translateGroup(group, langpair, sentinal, translate):
    """ Translate one group of lines with a single call and return the
    resulting list of outputs (sentinal values when the call yields nothing).
    """
    if translate is None:
        translate = translateRaw
    q = '<p>' + '</p><p>'.join(group) + '</p>'
    resp = translate(q, format='html', langpair=langpair)
    if not resp:
        # None or an empty string both mean the call produced no translation.
        return [sentinal] * len(group)
    pieces = resp.split('</p>')
    pieces.pop()  # whatever follows the final '</p>' is not a paragraph
    # The slice drops the first four characters and the last one of every
    # piece -- presumably the service answers with '<p> text </p>'; confirm.
    return [htmlDecode(p[4:-1]) for p in pieces]


def translateLines(lines, src='', to='en', delay=1, callback=None,
                   sentinal='___UNABLE_TO_TRANSLATE___', translate=None):
    """
    Translate multiple lines of independent text.

    This function takes a set of lines of text (non-html) and encodes them
    into as few translate calls as possible, packing each call up to the
    maximum size. If translation of any group of lines fails, every line of
    that group is translated with the sentinal value.

    lines     -- the lines to translate; len(lines) is needed when a
                 callback is given.
    src, to   -- joined as 'src|to' to form the language pair.
    delay     -- passed to time.sleep() after each group translated inside
                 the loop (not after the final group).
    callback  -- optional; called as callback(i, len(lines)) after each such
                 group, where i is the index of the line that starts the
                 next group.
    sentinal  -- the value emitted for every line of a failed group.
    translate -- optional callable used instead of translateRaw; it is
                 called as translate(q, format='html', langpair=...).

    Returns the list of translated lines. An empty input returns an empty
    list without making any call.
    """
    maxBytes = 4500
    overhead = 7  # len('<p>') + len('</p>') wrapped around every line
    output = []
    langpair = '%s|%s' % (src, to)
    curlines = []
    curlen = overhead
    for i, line in enumerate(lines):
        linelen = len(line.encode('utf-8')) + overhead
        # Never flush an empty group: that would send '<p></p>' and add a
        # spurious entry to the output.
        if curlines and curlen + linelen > maxBytes:
            output.extend(
                _translateGroup(curlines, langpair, sentinal, translate))
            if callback:
                callback(i, len(lines))
            time.sleep(delay)
            curlines = []
            curlen = overhead
        curlines.append(line)
        curlen += linelen
    if curlines:
        output.extend(_translateGroup(curlines, langpair, sentinal, translate))
    return output
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment