Last active
November 11, 2020 15:33
-
-
Save jonathanschilling/01e07729cbc5ea872858f02d0f086fc3 to your computer and use it in GitHub Desktop.
Translate comments in (Java) source code using Google Translate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 10 20:28:00 2020

Translate the comments found in a given (Java) source code file
from Japanese to English using Google Translate.

Run from bash as follows to iterate over all files found in a given directory:
> for i in `find src/jp/riken/kscope/ -name "*.java"` ; do python3 ../translate_java_comments.py $i ; done

The file given on the command line is rewritten in place. Comments are located
by plain substring search for "/*" and "//", so a "//" or "/*" inside a string
literal (for example "http://...") is treated as the start of a comment, too.

@author: Jonathan Schilling (jonathan.schilling@mail.de)
"""

import sys
import time

# File used when no path is given on the command line (debugging aid).
DEBUG_FILE = "/data/jonathan/work/code/K-scope/src/jp/riken/kscope/profiler/common/MagicKey.java"

# Encoding used for both reading and writing the source file.
SOURCE_ENCODING = "utf-8"

# Maximum number of translation attempts per comment before giving up.
NUM_ATTEMPTS = 2

# Pause after every translation request, to avoid hammering the service.
PAUSE_AFTER_REQUEST = 0.2

# Pause before the next attempt when a request produced no usable response.
PAUSE_BEFORE_RETRY = 5

# Fix-ups applied to translated text, in this order; Google Translate is not
# optimized for source code and tends to insert blanks into comment markers.
TRANSLATION_FIXES = (
    ("/ *", "/*"),      # also handles "/ **" --> "/**"
    ("* /", "*/"),
    ("<br />", "<br/>"),
    (chr(8203), ""),    # zero-width space
    (chr(8211), "-"),   # en dash
    (chr(8213), "-"),   # horizontal bar
    (chr(8722), "-"),   # minus sign
)


def is_ascii(text):
    """Return True if every character of *text* has a code point below 128."""
    return all(ord(ch) < 128 for ch in text)


def find_next_comment(src, start):
    """Locate the first comment in *src* at or after index *start*.

    Both ``/* ... */`` (including ``/** ... */``) and ``// ... \\n`` comments
    are recognized; whichever starts first wins.

    Returns a ``(begin, end)`` pair such that ``src[begin:end]`` is the whole
    comment (a line comment includes its terminating newline), or ``None`` if
    there is no comment left. A comment that is not terminated before the end
    of *src* extends to ``len(src)``.
    """
    block_begin = src.find("/*", start)
    line_begin = src.find("//", start)
    if block_begin < 0 and line_begin < 0:
        return None

    if block_begin >= 0 and (line_begin < 0 or block_begin < line_begin):
        # Skip the two characters of the opener so that the "*" of "/*"
        # cannot be mistaken for the start of the closing "*/".
        close = src.find("*/", block_begin + 2)
        end = len(src) if close < 0 else close + 2
        return block_begin, end

    newline = src.find("\n", line_begin)
    end = len(src) if newline < 0 else newline + 1
    return line_begin, end


def fix_translation_artifacts(text):
    """Undo the common damage the translation does to comment markup."""
    for wrong, right in TRANSLATION_FIXES:
        text = text.replace(wrong, right)
    return text


def translate_comment(comment, translate, attempts=NUM_ATTEMPTS,
                      pause=PAUSE_AFTER_REQUEST,
                      retry_delay=PAUSE_BEFORE_RETRY):
    """Return *comment* translated via the callable *translate*.

    A comment consisting only of ASCII characters is returned unchanged and
    *translate* is not called. Otherwise ``translate(comment)`` is called up
    to *attempts* times; an attempt fails if it raises ``AttributeError``,
    returns nothing but whitespace, or still contains non-ASCII characters
    after the fix-ups of ``fix_translation_artifacts``.

    Raises ``RuntimeError`` if no attempt succeeded.
    """
    if is_ascii(comment):
        return comment

    # Only put a newline back if the original comment actually had one.
    had_newline = comment.endswith("\n")

    for attempt in range(1, attempts + 1):
        if attempt > 1:
            print("retry %d/%d" % (attempt - 1, attempts - 1))

        try:
            suggestion = translate(comment)
        except AttributeError:
            suggestion = None
        # after each translate, wait just a little bit to prevent clobbering
        time.sleep(pause)

        if not suggestion or not suggestion.strip():
            print(" no translated response...")
            if attempt < attempts:
                time.sleep(retry_delay)
            continue

        candidate = fix_translation_artifacts(suggestion)
        if had_newline and not candidate.endswith("\n"):
            candidate += "\n"
        if is_ascii(candidate):
            return candidate

        # The translation left non-ASCII characters behind; list them so the
        # fix-up table can be extended if they turn out to be harmless.
        for ch in candidate:
            if ord(ch) >= 128:
                print("non-ascii: " + str(ord(ch)))
        print(" comment: '" + candidate + "'")

    raise RuntimeError("Could not get translation. Try again!")


def translate_source(src, translate, attempts=NUM_ATTEMPTS,
                     pause=PAUSE_AFTER_REQUEST,
                     retry_delay=PAUSE_BEFORE_RETRY):
    """Return *src* with every comment passed through ``translate_comment``.

    Code between the comments is copied verbatim. A source text without any
    comment is returned unchanged.
    """
    pieces = []
    position = 0
    num_comment = 0

    while position < len(src):
        found = find_next_comment(src, position)
        if found is None:
            # no comments found anymore --> done with this file
            break
        begin, end = found
        num_comment += 1
        print(" comment %d from %d to %d" % (num_comment, begin, end))

        # code before start of comment, then the (translated) comment
        pieces.append(src[position:begin])
        pieces.append(translate_comment(src[begin:end], translate,
                                        attempts=attempts, pause=pause,
                                        retry_delay=retry_delay))
        position = end

    # code part after end of last comment (the whole file if there was none)
    pieces.append(src[position:])
    return "".join(pieces)


def main(argv=None):
    """Translate the comments of the file named in ``argv[1]`` in place.

    Falls back to ``DEBUG_FILE`` when no path is given. The file is only
    rewritten after all comments were translated successfully.
    """
    # https://pypi.org/project/googletrans/
    # Imported here so that the scanning logic above can be imported and
    # tested without the third-party package being installed.
    from googletrans import Translator

    argv = sys.argv if argv is None else argv
    current_file = argv[1] if len(argv) > 1 else DEBUG_FILE

    # logging
    print(current_file)

    # read source code file
    with open(current_file, "r", encoding=SOURCE_ENCODING) as f:
        src_code = f.read()

    translator = Translator()

    def translate(text):
        return translator.translate(text, src='ja').text

    target = translate_source(src_code, translate)

    # replace source code file contents
    with open(current_file, "w", encoding=SOURCE_ENCODING) as f:
        f.write(target)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment