jonathanschilling/translate_java_comments.py

## translate_java_comments.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 10 20:28:00 2020

Translate the comments found in a given (Java) source code file
from Japanese to English using Google Translate.
Run from bash as follows to iterate over all files found in a given directory:
> for i in `find src/jp/riken/kscope/ -name "*.java"` ; do python3 ../translate_java_comments.py $i ; done

@author: Jonathan Schilling (jonathan.schilling@mail.de)
"""

# https://pypi.org/project/googletrans/
from googletrans import Translator
translator = Translator()

import time
import sys

# ---------------------------------------------------------------------------
# Script body: read one Java file, translate every comment that contains
# non-ASCII characters from Japanese to English, and overwrite the file with
# the result. Comments that are already pure ASCII are copied unchanged.
#
# Relies on the module-level names `sys`, `time` and `translator`.
# ---------------------------------------------------------------------------

if len(sys.argv) > 1:
    currentFile = sys.argv[1]
else:
    # debugging
    currentFile = "/data/jonathan/work/code/K-scope/src/jp/riken/kscope/profiler/common/MagicKey.java"

# logging
print(currentFile)

# read source code file as one string
# NOTE(review): no explicit encoding is passed, so the platform default is
# used for both reading and writing -- confirm it matches the Java sources.
with open(currentFile, "r") as f:
    srcCode = f.read()

totalCodeLen = len(srcCode)

# pieces of the translated file; joined once at the very end
targetParts = []

# position in source code file up to which everything has been copied
currentStart = 0

# keep track of comments (only used for progress output)
numComment = 1

# loop over whole source code file
while currentStart < totalCodeLen:

    # find comments: /* ... */ or /** ... */
    # sys.maxsize marks "not found"; an unterminated comment keeps its start
    # but leaves the end at sys.maxsize, i.e. it extends to the end of file.
    commentStart1 = sys.maxsize
    commentEnd1 = sys.maxsize
    try:
        commentStart1 = srcCode.index("/*", currentStart)
        commentEnd1 = srcCode.index("*/", commentStart1)+2
    except ValueError:
        pass

    # find comments: // ... \n (the trailing newline is part of the comment)
    commentStart2 = sys.maxsize
    commentEnd2 = sys.maxsize
    try:
        commentStart2 = srcCode.index("//", currentStart)
        commentEnd2 = srcCode.index("\n", commentStart2)+1
    except ValueError:
        pass

    if commentStart1 == sys.maxsize and commentStart2 == sys.maxsize:
        # no comments found anymore --> done with this file
        break

    # look which of /* ... */ or // ... \n comment comes first
    if commentStart1 < commentStart2:
        commentStart = commentStart1
        commentEnd = commentEnd1
        needNewline = False
    else:
        commentStart = commentStart2
        commentEnd = commentEnd2
        # the newline that terminates a // comment is re-appended after
        # translation
        needNewline = True

    print("   comment %d from %d to %d"%(numComment, commentStart, commentEnd))

    # code before start of comment
    targetParts.append(srcCode[currentStart:commentStart])

    # extract comment
    comment = srcCode[commentStart:commentEnd]
    translatedComment = comment

    # With numRetries == 2 the first pass performs the (single) translation
    # attempt and the second pass only validates the result: if non-ASCII
    # characters are still present, a RuntimeError aborts the script before
    # the file is touched.
    numRetries = 2
    for i in range(numRetries):

        # check if comment contains only ASCII characters --> skip translation
        if all(ord(c) < 128 for c in translatedComment):
            break

        if i == numRetries-1:
            raise RuntimeError("Could not get translation. Try again!")
        elif i > 0:
            print("retry %d/%d"%(i,numRetries-1))

            # if a retry was necessary since translation did not remove
            # all non-ASCII characters, list the remaining ones here:
            for c in translatedComment:
                if ord(c) >= 128:
                    print("non-ascii: "+str(ord(c)))

            print("    comment: '"+translatedComment+"'")

        # translate; an AttributeError (also raised below for an empty
        # response) is treated as a transient failure
        try:
            # actually translate
            translationSuggestion = translator.translate(comment, src='ja').text

            # after each translate, wait just a little bit to prevent clobbering
            time.sleep(0.2)

            if translationSuggestion.strip() == "":
                print("   no translated response...")
                raise AttributeError("no response from translation!")
        except AttributeError:
            time.sleep(5)
            continue

        # if we got to this point, the translation call itself succeeded
        translatedComment = translationSuggestion

        # fix common translation errors;
        # seems like Google Translate is not so much optimized
        # for source code...
        translatedComment = translatedComment.replace("/ *", "/*") # also handles "/ **" --> "/**"
        translatedComment = translatedComment.replace("* /", "*/")
        translatedComment = translatedComment.replace("<br />", "<br/>")

        # drop zero-width spaces (8203) and map the dash-like characters
        # 8211, 8213 and 8722 to an ASCII hyphen
        translatedComment = translatedComment.replace(chr(8203), '')
        translatedComment = translatedComment.replace(chr(8211), '-')
        translatedComment = translatedComment.replace(chr(8213), '-')
        translatedComment = translatedComment.replace(chr(8722), '-')

        if needNewline:
            translatedComment += "\n"

    targetParts.append(translatedComment)

    numComment += 1
    currentStart = commentEnd

# code part after end of last comment; currentStart equals the end of the last
# processed comment and, unlike commentEnd, is also defined when the file is
# empty or contains no comment at all
targetParts.append(srcCode[currentStart:])
target = "".join(targetParts)

# replace source code file contents
with open(currentFile, "w") as f:
    f.write(target)
	#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 10 20:28:00 2020

Translate the comments found in a given (Java) source code file
from Japanese to English using Google Translate.
Run from bash as follows to iterate over all files found in a given directory:
> for i in `find src/jp/riken/kscope/ -name "*.java"` ; do python3 ../translate_java_comments.py $i ; done

@author: Jonathan Schilling (jonathan.schilling@mail.de)
"""

# https://pypi.org/project/googletrans/
from googletrans import Translator
translator = Translator()

import time
import sys

# ---------------------------------------------------------------------------
# Read one Java file, translate every comment that contains non-ASCII
# characters from Japanese to English, and overwrite the file with the result.
# Comments that are already pure ASCII are copied unchanged.
# ---------------------------------------------------------------------------

if len(sys.argv) > 1:
    currentFile = sys.argv[1]
else:
    # debugging
    currentFile = "/data/jonathan/work/code/K-scope/src/jp/riken/kscope/profiler/common/MagicKey.java"

# logging
print(currentFile)

# read source code file as one string
# NOTE(review): no explicit encoding is passed, so the platform default is
# used for both reading and writing -- confirm it matches the Java sources.
with open(currentFile, "r") as f:
    srcCode = f.read()

totalCodeLen = len(srcCode)

# pieces of the translated file; joined once at the very end
targetParts = []

# position in source code file up to which everything has been copied
currentStart = 0

# keep track of comments (only used for progress output)
numComment = 1

# loop over whole source code file
while currentStart < totalCodeLen:

    # find comments: /* ... */ or /** ... */
    # sys.maxsize marks "not found"; an unterminated comment keeps its start
    # but leaves the end at sys.maxsize, i.e. it extends to the end of file.
    commentStart1 = sys.maxsize
    commentEnd1 = sys.maxsize
    try:
        commentStart1 = srcCode.index("/*", currentStart)
        commentEnd1 = srcCode.index("*/", commentStart1)+2
    except ValueError:
        pass

    # find comments: // ... \n (the trailing newline is part of the comment)
    commentStart2 = sys.maxsize
    commentEnd2 = sys.maxsize
    try:
        commentStart2 = srcCode.index("//", currentStart)
        commentEnd2 = srcCode.index("\n", commentStart2)+1
    except ValueError:
        pass

    if commentStart1 == sys.maxsize and commentStart2 == sys.maxsize:
        # no comments found anymore --> done with this file
        break

    # look which of /* ... */ or // ... \n comment comes first
    if commentStart1 < commentStart2:
        commentStart = commentStart1
        commentEnd = commentEnd1
        needNewline = False
    else:
        commentStart = commentStart2
        commentEnd = commentEnd2
        # the newline that terminates a // comment is re-appended after
        # translation
        needNewline = True

    print(" comment %d from %d to %d"%(numComment, commentStart, commentEnd))

    # code before start of comment
    targetParts.append(srcCode[currentStart:commentStart])

    # extract comment
    comment = srcCode[commentStart:commentEnd]
    translatedComment = comment

    # With numRetries == 2 the first pass performs the (single) translation
    # attempt and the second pass only validates the result: if non-ASCII
    # characters are still present, a RuntimeError aborts the script before
    # the file is touched.
    numRetries = 2
    for i in range(numRetries):

        # check if comment contains only ASCII characters --> skip translation
        if all(ord(c) < 128 for c in translatedComment):
            break

        if i == numRetries-1:
            raise RuntimeError("Could not get translation. Try again!")
        elif i > 0:
            print("retry %d/%d"%(i,numRetries-1))

            # if a retry was necessary since translation did not remove
            # all non-ASCII characters, list the remaining ones here:
            for c in translatedComment:
                if ord(c) >= 128:
                    print("non-ascii: "+str(ord(c)))

            print(" comment: '"+translatedComment+"'")

        # translate; an AttributeError (also raised below for an empty
        # response) is treated as a transient failure
        try:
            # actually translate
            translationSuggestion = translator.translate(comment, src='ja').text

            # after each translate, wait just a little bit to prevent clobbering
            time.sleep(0.2)

            if translationSuggestion.strip() == "":
                print(" no translated response...")
                raise AttributeError("no response from translation!")
        except AttributeError:
            time.sleep(5)
            continue

        # if we got to this point, the translation call itself succeeded
        translatedComment = translationSuggestion

        # fix common translation errors;
        # seems like Google Translate is not so much optimized
        # for source code...
        # The pattern must include the "*": replacing a bare "/ " would strip
        # the space after every slash in the comment text.
        translatedComment = translatedComment.replace("/ *", "/*") # also handles "/ **" --> "/**"
        translatedComment = translatedComment.replace("* /", "*/")
        translatedComment = translatedComment.replace("<br />", "<br/>")

        # drop zero-width spaces (8203) and map the dash-like characters
        # 8211, 8213 and 8722 to an ASCII hyphen
        translatedComment = translatedComment.replace(chr(8203), '')
        translatedComment = translatedComment.replace(chr(8211), '-')
        translatedComment = translatedComment.replace(chr(8213), '-')
        translatedComment = translatedComment.replace(chr(8722), '-')

        if needNewline:
            translatedComment += "\n"

    targetParts.append(translatedComment)

    numComment += 1
    currentStart = commentEnd

# code part after end of last comment; currentStart equals the end of the last
# processed comment and, unlike commentEnd, is also defined when the file is
# empty or contains no comment at all
targetParts.append(srcCode[currentStart:])
target = "".join(targetParts)

# replace source code file contents
with open(currentFile, "w") as f:
    f.write(target)