Skip to content

Instantly share code, notes, and snippets.

@jonathanschilling
Last active November 11, 2020 15:33
Show Gist options
  • Save jonathanschilling/01e07729cbc5ea872858f02d0f086fc3 to your computer and use it in GitHub Desktop.
Save jonathanschilling/01e07729cbc5ea872858f02d0f086fc3 to your computer and use it in GitHub Desktop.
Translate comments in (Java) source code using Google Translate
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 10 20:28:00 2020
Translate the comments found in a given (Java) source code file
from Japanese to English using Google Translate.
Run from bash as follows to iterate over all files found in a given directory:
> for i in `find src/jp/riken/kscope/ -name "*.java"` ; do python3 ../translate_java_comments.py $i ; done
@author: Jonathan Schilling (jonathan.schilling@mail.de)
"""
# https://pypi.org/project/googletrans/
from googletrans import Translator
translator = Translator()
import time
import sys
if len(sys.argv) > 1:
currentFile = sys.argv[1]
else:
# debugging
currentFile = "/data/jonathan/work/code/K-scope/src/jp/riken/kscope/profiler/common/MagicKey.java"
# logging
print(currentFile)
# read source code file
srcCode = None
with open(currentFile, "r") as f:
srcCode = "".join(f.readlines())
totalCodeLen = len(srcCode)
# print("length of source code: %d"%(totalCodeLen))
# target for translated code
target = ""
# position in source code file
currentStart = 0
# keep track of comments
numComment = 1
# loop over whole source code file
while currentStart < totalCodeLen:
# find comments: /* ... */ or /** ... */
commentStart1 = sys.maxsize
commentEnd1 = sys.maxsize
try:
commentStart1 = srcCode.index("/*", currentStart)
commentEnd1 = srcCode.index("*/", commentStart1)+2
# print("found /* ... */ comment at %d"%(commentStart1,))
except ValueError:
pass
# find comments: // ... \n
commentStart2 = sys.maxsize
commentEnd2 = sys.maxsize
try:
commentStart2 = srcCode.index("//", currentStart)
commentEnd2 = srcCode.index("\n", commentStart2)+1
# print("found // ... \\n comment at %d"%(commentStart2,))
except ValueError:
pass
if commentStart1 != sys.maxsize or commentStart2 != sys.maxsize:
# print(" found any comment")
# look which of /*... */ or // ... \n comment comes first
needNewline = False
commentStart = sys.maxsize
commentEnd = sys.maxsize
if commentStart1 < commentStart2:
commentStart = commentStart1
commentEnd = commentEnd1
# print(" comment /* ... */ is first")
else:
commentStart = commentStart2
commentEnd = commentEnd2
needNewline = True
# print(" comment // ... \\n is first")
print(" comment %d from %d to %d"%(numComment, commentStart, commentEnd))
#print("'"+srcCode[commentStart:commentEnd]+"'")
# code before start of comment
target += srcCode[currentStart:commentStart]
# extract comment
comment = srcCode[commentStart:commentEnd]
translatedComment = comment
numRetries = 2
for i in range(numRetries):
# check if comment contains only ASCII characters --> skip translation
allAscii = all(ord(c) < 128 for c in translatedComment)
if not allAscii:
if i==numRetries-1:
raise RuntimeError("Could not get translation. Try again!")
elif i>0: # WTF? Why is this necessary?
print("retry %d/%d"%(i,numRetries-1))
# if a retry was necessary since translation did not remove
# all non-ASCII characters, list the remaining ones here:
for c in translatedComment:
if ord(c) >= 128:
print("non-ascii: "+str(ord(c)))
print(" comment: '"+translatedComment+"'")
# if i > 1:
# print("retry since still non-ascii characters present")
# translate; retry in case something weird happens
try:
# actually translate
translationSuggestion = translator.translate(comment, src='ja').text
# after each translate, wait just a little bit to prevent clobbering
time.sleep(0.2)
if translationSuggestion.strip() == "":
print(" no translated response...")
raise AttributeError("no response from translation!")
except AttributeError:
time.sleep(5)
#print(" retry %d/%d"%(i+1,numRetries))
continue
# if we got to this point, the translation succeeded eventually
translatedComment = translationSuggestion
# fix common translation errors;
# seems like Google Translate is not so much optimized
# for source code...
translatedComment = translatedComment.replace("/ *", "/*") # also handles "/ **" --> "/**"
translatedComment = translatedComment.replace("* /", "*/")
translatedComment = translatedComment.replace("<br />", "<br/>")
# zero-width space
translatedComment = translatedComment.replace(chr(8203), '')
translatedComment = translatedComment.replace(chr(8211), '-')
translatedComment = translatedComment.replace(chr(8213), '-')
translatedComment = translatedComment.replace(chr(8722), '-')
if needNewline:
translatedComment += "\n"
#print(" translated: '"+comment.replace("\n", "\\n")+"' --> '"+translatedComment.replace("\n", "\\n")+"'")
#print(" translated: '"+translatedComment+"'")
target += translatedComment
# print("'"+target+"'")
numComment += 1
currentStart = commentEnd
else:
# no comments found anymore --> done with this file
break
# code part after end of last comment
target += srcCode[commentEnd:]
# print(target)
# replace source code file contents
with open(currentFile, "w") as f:
f.write(target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment