dgleich/translate.py

## translate.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Google Translate API code
Based on http://code.activestate.com/recipes/576890/ but heavily modified.
"""

__author__ = 'David F. Gleich'
__version__ = 1

import re
import sys
import urllib
import urllib2
import simplejson
import time
import pprint

# Endpoint of the Google translate API; every request is sent here.
baseTranslateUrl = (
    "http://ajax.googleapis.com/ajax/services/language/translate"
)

# User-Agent header value sent with each request; embeds the module version.
baseUserAgent = "Python-Google-Translate-%i" % __version__

# Default retry delay when an AJAX call fails.
# NOTE(review): presumably seconds; nothing in the visible code reads this
# value yet -- confirm before relying on it.
retryDelay = 10

# (character, HTML entity) pairs. '&' comes first because that is the order
# an encoder has to apply them in; htmlDecode() walks the table backwards.
htmlCodes = [
    ('&', '&amp;'),
    ('<', '&lt;'),
    ('>', '&gt;'),
    ('"', '&quot;'),
    ("'", '&#39;'),
]


def htmlDecode(s, codes=htmlCodes):
    """Return ``s`` with the HTML entities listed in ``codes`` decoded.

    Only the entities in ``codes`` are touched; ordinary markup such as
    ``<p>`` is left alone.

    Parameters:
        s: the string to decode.
        codes: iterable of ``(character, entity)`` pairs, in encoding order.

    Returns:
        The decoded string.
    """
    # Undo the substitutions in the reverse of the encoding order. Decoding
    # '&amp;' first would turn '&amp;lt;' into '&lt;' and then into '<',
    # i.e. decode one level too many.
    for pair in reversed(list(codes)):
        s = s.replace(pair[1], pair[0])
    return s


def translateRaw(text, format='text', langpair='|en',
        key=None, v='1.0', curReferrer=None, retry=5):
    """Perform a raw call to the Google translate API.

    Every API attribute is passed explicitly; the only convenience offered
    is a bounded retry when the reply carries no translation.

    Parameters:
        text: the text to translate. Unicode text is sent UTF-8 encoded; a
            byte string is sent unchanged (it is assumed to be UTF-8).
        format: value of the request's 'format' field.
        langpair: value of the request's 'langpair' field, written as
            'source|target'.
        key: optional API key; sent as the 'key' field when not None.
        v: value of the request's 'v' field.
        curReferrer: value of the Referer header. When None, the
            module-level name ``referrer`` is used if it exists.
        retry: how many further attempts to make after a reply that has no
            translated text.

    Returns:
        The translated text; "" when every attempt came back without a
        translation; None when no referrer is available or the encoded
        query is longer than 4500 bytes.
    """
    # Byte strings pass through untouched: calling .encode() on a non-ASCII
    # Python 2 str triggers an implicit ASCII decode and raises.
    query = text if isinstance(text, bytes) else text.encode('utf-8')
    params = {'q': query, 'format': format, 'langpair': langpair, 'v': v}
    if key is not None:
        params['key'] = key

    if curReferrer is None:
        try:
            curReferrer = referrer  # fall back to the module-level global
        except NameError:
            print("invalid referrer")
            return None

    if len(query) > 4500:
        return None

    attemptsLeft = retry
    while True:
        resp = _fetchTranslateResponse(params, curReferrer)
        try:
            return resp['responseData']['translatedText']
        except (KeyError, TypeError):
            # Missing keys, or 'responseData' is not a mapping: the reply
            # holds no translation.
            pass

        if attemptsLeft <= 0:
            print("**************** Error! ****************")
            pprint.pprint(resp)
            return ""

        details = resp.get('responseDetails') if isinstance(resp, dict) else None
        print("Warning (retry=%i): %s" % (attemptsLeft, details))
        attemptsLeft -= 1


def _fetchTranslateResponse(params, curReferrer):
    """POST ``params`` to the translate endpoint and return the parsed JSON."""
    import zlib  # local import: only needed to inflate gzip-encoded replies

    request = urllib2.Request(baseTranslateUrl, data=urllib.urlencode(params))
    request.add_header("Referer", curReferrer)
    request.add_header("Accept-encoding", "gzip")
    request.add_header("User-Agent", baseUserAgent)

    response = urllib2.urlopen(request)
    try:
        body = response.read()
        contentEncoding = response.info().get('Content-Encoding', '') or ''
    finally:
        response.close()

    # urllib2 does not undo Content-Encoding itself, so a reply that honours
    # the "Accept-encoding: gzip" header has to be inflated before parsing.
    if 'gzip' in contentEncoding.lower():
        body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
    return simplejson.loads(body)

def translateLines(lines, src='', to='en', delay=1, callback=None,
    sentinal='___UNABLE_TO_TRANSLATE___'):
    """Translate multiple lines of independent text.

    The (non-HTML) lines are wrapped in ``<p>`` tags and packed into as few
    translate calls as the 4500-byte query budget allows.

    Parameters:
        lines: sized sequence of text lines to translate.
        src: source language code, used as the left side of the langpair.
        to: target language code, used as the right side of the langpair.
        delay: value passed to ``time.sleep`` after each full group.
        callback: optional ``callback(index, total)`` invoked after each
            full group is sent; ``index`` is the position of the line that
            starts the next group.
        sentinal: placeholder (sentinel) emitted for every line of a group
            whose translation failed.

    Returns:
        A list of translated strings. A failed group contributes one
        sentinel per input line.
    """
    langpair = '%s|%s' % (src, to)
    output = []
    group = []
    groupLen = 7  # same base overhead the size accounting has always used
    for i, line in enumerate(lines):
        lineLen = len(line.encode('utf-8')) + 7  # 7 == len('<p></p>')
        if groupLen + lineLen > 4500:
            # The group is full. An empty group (oversized first line) is
            # not sent: it would cost a call and add a bogus output entry.
            if group:
                output.extend(_translateGroup(group, langpair, sentinal))
                if callback:
                    callback(i, len(lines))
                time.sleep(delay)
            group = [line]
            groupLen = lineLen + 7
        else:
            group.append(line)
            groupLen += lineLen

    if group:
        output.extend(_translateGroup(group, langpair, sentinal))
    return output


def _translateGroup(group, langpair, sentinal):
    """Translate one batch of lines with a single call and return the results."""
    query = '<p>' + '</p><p>'.join(group) + '</p>'
    resp = translateRaw(query, format='html', langpair=langpair)
    if not resp:
        # translateRaw returns "" after exhausting its retries and None when
        # it rejects the request; either way the whole group failed.
        return [sentinal] * len(group)

    pieces = resp.split('</p>')
    pieces.pop()  # whatever follows the final '</p>' is not a paragraph
    # NOTE(review): [4:-1] drops one character more than '<p>' at the front
    # and one at the end -- presumably padding in the API reply; confirm
    # against a live response.
    return [htmlDecode(piece[4:-1]) for piece in pieces]
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	"""
	Google Translate API code
	Based on http://code.activestate.com/recipes/576890/ but heavily modified.
	"""

	__author__ = 'David F. Gleich'
	__version__ = 1

	import re
	import sys
	import urllib
	import urllib2
	import simplejson
	import time
	import pprint

	# Endpoint of the Google translate API; every request is sent here.
	baseTranslateUrl = (
	    "http://ajax.googleapis.com/ajax/services/language/translate"
	)

	# User-Agent header value sent with each request; embeds the module version.
	baseUserAgent = "Python-Google-Translate-%i" % __version__

	# Default retry delay when an AJAX call fails.
	# NOTE(review): presumably seconds; nothing in the visible code reads this
	# value yet -- confirm before relying on it.
	retryDelay = 10

	# (character, HTML entity) pairs. '&' comes first because that is the order
	# an encoder has to apply them in; htmlDecode() walks the table backwards.
	htmlCodes = [
	    ('&', '&amp;'),
	    ('<', '&lt;'),
	    ('>', '&gt;'),
	    ('"', '&quot;'),
	    ("'", '&#39;'),
	]


	def htmlDecode(s, codes=htmlCodes):
	    """Return ``s`` with the HTML entities listed in ``codes`` decoded.

	    Only the entities in ``codes`` are touched; ordinary markup such as
	    ``<p>`` is left alone.

	    Parameters:
	        s: the string to decode.
	        codes: iterable of ``(character, entity)`` pairs, in encoding order.

	    Returns:
	        The decoded string.
	    """
	    # Undo the substitutions in the reverse of the encoding order. Decoding
	    # '&amp;' first would turn '&amp;lt;' into '&lt;' and then into '<',
	    # i.e. decode one level too many.
	    for pair in reversed(list(codes)):
	        s = s.replace(pair[1], pair[0])
	    return s


	def translateRaw(text, format='text', langpair='|en',
	        key=None, v='1.0', curReferrer=None, retry=5):
	    """Perform a raw call to the Google translate API.

	    Every API attribute is passed explicitly; the only convenience offered
	    is a bounded retry when the reply carries no translation.

	    Parameters:
	        text: the text to translate. Unicode text is sent UTF-8 encoded; a
	            byte string is sent unchanged (it is assumed to be UTF-8).
	        format: value of the request's 'format' field.
	        langpair: value of the request's 'langpair' field, written as
	            'source|target'.
	        key: optional API key; sent as the 'key' field when not None.
	        v: value of the request's 'v' field.
	        curReferrer: value of the Referer header. When None, the
	            module-level name ``referrer`` is used if it exists.
	        retry: how many further attempts to make after a reply that has no
	            translated text.

	    Returns:
	        The translated text; "" when every attempt came back without a
	        translation; None when no referrer is available or the encoded
	        query is longer than 4500 bytes.
	    """
	    # Byte strings pass through untouched: calling .encode() on a non-ASCII
	    # Python 2 str triggers an implicit ASCII decode and raises.
	    query = text if isinstance(text, bytes) else text.encode('utf-8')
	    params = {'q': query, 'format': format, 'langpair': langpair, 'v': v}
	    if key is not None:
	        params['key'] = key

	    if curReferrer is None:
	        try:
	            curReferrer = referrer  # fall back to the module-level global
	        except NameError:
	            print("invalid referrer")
	            return None

	    if len(query) > 4500:
	        return None

	    attemptsLeft = retry
	    while True:
	        resp = _fetchTranslateResponse(params, curReferrer)
	        try:
	            return resp['responseData']['translatedText']
	        except (KeyError, TypeError):
	            # Missing keys, or 'responseData' is not a mapping: the reply
	            # holds no translation.
	            pass

	        if attemptsLeft <= 0:
	            print("**************** Error! ****************")
	            pprint.pprint(resp)
	            return ""

	        details = resp.get('responseDetails') if isinstance(resp, dict) else None
	        print("Warning (retry=%i): %s" % (attemptsLeft, details))
	        attemptsLeft -= 1


	def _fetchTranslateResponse(params, curReferrer):
	    """POST ``params`` to the translate endpoint and return the parsed JSON."""
	    import zlib  # local import: only needed to inflate gzip-encoded replies

	    request = urllib2.Request(baseTranslateUrl, data=urllib.urlencode(params))
	    request.add_header("Referer", curReferrer)
	    request.add_header("Accept-encoding", "gzip")
	    request.add_header("User-Agent", baseUserAgent)

	    response = urllib2.urlopen(request)
	    try:
	        body = response.read()
	        contentEncoding = response.info().get('Content-Encoding', '') or ''
	    finally:
	        response.close()

	    # urllib2 does not undo Content-Encoding itself, so a reply that honours
	    # the "Accept-encoding: gzip" header has to be inflated before parsing.
	    if 'gzip' in contentEncoding.lower():
	        body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
	    return simplejson.loads(body)

	def translateLines(lines, src='', to='en', delay=1, callback=None,
	    sentinal='___UNABLE_TO_TRANSLATE___'):
	    """Translate multiple lines of independent text.

	    The (non-HTML) lines are wrapped in ``<p>`` tags and packed into as few
	    translate calls as the 4500-byte query budget allows.

	    Parameters:
	        lines: sized sequence of text lines to translate.
	        src: source language code, used as the left side of the langpair.
	        to: target language code, used as the right side of the langpair.
	        delay: value passed to ``time.sleep`` after each full group.
	        callback: optional ``callback(index, total)`` invoked after each
	            full group is sent; ``index`` is the position of the line that
	            starts the next group.
	        sentinal: placeholder (sentinel) emitted for every line of a group
	            whose translation failed.

	    Returns:
	        A list of translated strings. A failed group contributes one
	        sentinel per input line.
	    """
	    langpair = '%s|%s' % (src, to)
	    output = []
	    group = []
	    groupLen = 7  # same base overhead the size accounting has always used
	    for i, line in enumerate(lines):
	        lineLen = len(line.encode('utf-8')) + 7  # 7 == len('<p></p>')
	        if groupLen + lineLen > 4500:
	            # The group is full. An empty group (oversized first line) is
	            # not sent: it would cost a call and add a bogus output entry.
	            if group:
	                output.extend(_translateGroup(group, langpair, sentinal))
	                if callback:
	                    callback(i, len(lines))
	                time.sleep(delay)
	            group = [line]
	            groupLen = lineLen + 7
	        else:
	            group.append(line)
	            groupLen += lineLen

	    if group:
	        output.extend(_translateGroup(group, langpair, sentinal))
	    return output


	def _translateGroup(group, langpair, sentinal):
	    """Translate one batch of lines with a single call and return the results."""
	    query = '<p>' + '</p><p>'.join(group) + '</p>'
	    resp = translateRaw(query, format='html', langpair=langpair)
	    if not resp:
	        # translateRaw returns "" after exhausting its retries and None when
	        # it rejects the request; either way the whole group failed.
	        return [sentinal] * len(group)

	    pieces = resp.split('</p>')
	    pieces.pop()  # whatever follows the final '</p>' is not a paragraph
	    # NOTE(review): [4:-1] drops one character more than '<p>' at the front
	    # and one at the end -- presumably padding in the API reply; confirm
	    # against a live response.
	    return [htmlDecode(piece[4:-1]) for piece in pieces]