Skip to content

Instantly share code, notes, and snippets.

@masa-ita
Created April 2, 2020 05:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save masa-ita/070d208fe63f9445697b7b44fc78f5df to your computer and use it in GitHub Desktop.
Save masa-ita/070d208fe63f9445697b7b44fc78f5df to your computer and use it in GitHub Desktop.
Translate Jupyter Notebook with Google Cloud Translation
#!/usr/bin/env python
# coding: utf-8
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# USAGE: translate_notebook.py [flags]
# flags:
# translate_notebook.py:
# --input_notebook: jupyter notebook to translate
# (default: '')
# --language: language code
# (default: 'ja')
# --output_notebook: translated notebook name
# (default: '')
# Try --helpfull to get a list of all flags.
# Before running, set the GOOGLE_APPLICATION_CREDENTIALS environment variable.
import json
import six
from google.cloud import translate_v2 as translate_v2
import re
import os
from absl import app
from absl import flags
from absl import logging
FLAGS = flags.FLAGS

# The two notebook paths default to None (not '') on purpose: absl's
# required-flag validator only rejects None, so an empty-string default
# would let the program start without the flag and fail later on open('').
flags.DEFINE_string('input_notebook', None, 'jupyter notebook to translate')
flags.DEFINE_string('output_notebook', None, 'translated notebook name')
# Target language code handed to the translation client.
flags.DEFINE_string('language', 'ja', 'language code')
flags.mark_flag_as_required('input_notebook')
flags.mark_flag_as_required('output_notebook')
class GoogleCloudTranslate:
    """Thin wrapper around a Google Cloud Translation (v2) client."""

    def __init__(self):
        """Create the underlying ``translate_v2.Client``."""
        self.client = translate_v2.Client()

    def translate(self, lang, text):
        """Translate ``text`` into ``lang`` and return the translated string.

        Args:
            lang: Target language code passed as ``target_language``.
            text: Text to translate; ``bytes`` input is decoded as UTF-8.

        Returns:
            The ``'translatedText'`` entry of the client's result, or
            ``text`` itself when it is empty or whitespace-only.
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        # Nothing to translate: skip the remote call and keep the
        # whitespace exactly as it was.
        if not text.strip():
            return text
        # Request plain-text handling; per the client documentation the
        # default format is HTML, which yields entity-escaped output
        # (e.g. &#39;) that would corrupt markdown and code comments.
        result = self.client.translate(
            text, target_language=lang, format_='text')
        return result['translatedText']
# Module-level translator instance used by the line-translation helpers
# in this file. It is constructed when the module is loaded, so
# GoogleCloudTranslate.__init__ (which builds the API client) runs at
# import time.
cloud_translator = GoogleCloudTranslate()
# Splits a line around its first markdown link:
#   (text before) [ (label) ] ( (target) ) (text after) (optional newline)
anchor_re = re.compile(r'(.*?)\[(.+?)\]\((.+?)\)(.*)(\n?)')


def recurse_anchor(lang, line):
    """Translate a markdown line while leaving link targets untouched.

    The text before the first ``[label](target)`` link and the label are
    translated, the target is copied verbatim, and the text after the link
    is processed recursively so that every link on the line is handled.
    A line without any link is translated as a whole.

    Args:
        lang: Target language code.
        line: A single line of markdown text.

    Returns:
        The translated line, including the trailing newline when the
        matched line had one.
    """
    def translate(segment):
        # Empty / whitespace-only pieces (e.g. a link at the very start or
        # end of the line) carry nothing to translate; keep them verbatim
        # instead of sending them to the translation service.
        if not segment.strip():
            return segment
        return cloud_translator.translate(lang, segment)

    anchor_match = anchor_re.match(line)
    if not anchor_match:
        return translate(line)

    before, label, target, after, newline = anchor_match.groups()
    return ''.join([
        translate(before),
        '[' + translate(label) + ']',
        '(' + target + ')',
        recurse_anchor(lang, after),
        # The regex captures the line terminator separately; re-attach it
        # so the caller gets the line ending back.
        newline,
    ])
# Pieces of a markdown line: leading whitespace, an optional block marker
# (heading hashes, "*", "-" or "1." followed by whitespace), the text
# itself, and an optional trailing newline.
md_line_re = re.compile(r'(\s*)((?:#+|\*|\-|\d+\.)\s+)?(.+)(\n?)')


def translate_md_line(lang, line):
    """Translate the text of one markdown line, keeping its markup prefix.

    Args:
        lang: Target language code.
        line: A single markdown line.

    Returns:
        ``''`` for a falsy ``line``; the line unchanged when it does not
        match ``md_line_re``; otherwise the leading whitespace, the
        optional block marker, the text as returned by ``recurse_anchor``
        and the trailing newline, concatenated in that order.
    """
    if not line:
        return ''

    match = md_line_re.match(line)
    if match is None:
        return line

    indent, marker, text, newline = match.groups()
    pieces = [indent]
    if marker:
        pieces.append(marker)
    pieces.append(recurse_anchor(lang, text))
    pieces.append(newline)
    return ''.join(pieces)
# Splits a source line into: code before the first '#', the comment marker
# (hashes plus following whitespace), the comment text, and an OPTIONAL
# trailing newline. The newline must be optional because the final line of
# a cell may not end with one.
prog_re = re.compile(r'(.*?)(#+\s*)(.*)(\n?)')


def translate_prog_line(lang, line):
    """Translate the ``#`` comment on a line of program text, if any.

    Args:
        lang: Target language code.
        line: A single line of source code.

    Returns:
        The line with its comment text translated. Lines without a ``#``
        or with an empty comment are returned unchanged.

    Note:
        The first ``#`` on the line is treated as the start of a comment,
        even when it occurs inside a string literal.
    """
    comment_match = prog_re.match(line)
    if not comment_match:
        return line

    code, marker, comment, newline = comment_match.groups()
    # A bare '#' (or one followed only by whitespace) has nothing to
    # translate; avoid a needless call to the translation service.
    if not comment.strip():
        return line
    return code + marker + cloud_translator.translate(lang, comment) + newline
def translate_json(lang, jn_json):
    """Translate the markdown text and code comments of a notebook in place.

    Markdown cells are translated line by line with ``translate_md_line``;
    lines inside a ``` fenced block, and every line of a code cell, go
    through ``translate_prog_line`` so that only comments are translated.
    Lines containing a ``` fence are copied verbatim. Cells of any other
    type are left untouched.

    Args:
        lang: Target language code.
        jn_json: Parsed notebook JSON (a dict with a ``'cells'`` list).

    Returns:
        The same ``jn_json`` object, with cell sources replaced.
    """
    cells = jn_json['cells']
    total_cells = len(cells)
    for i, cell in enumerate(cells):
        cell_type = cell['cell_type']
        logging.info('processing cell #{}/{} {}'.format(i + 1, total_cells, cell_type))
        if cell_type not in ('markdown', 'code'):
            continue

        source = cell['source']
        # The notebook format permits ``source`` to be one multi-line
        # string instead of a list of lines. Split it so we never iterate
        # character by character, and restore the original form afterwards.
        is_single_string = not isinstance(source, (list, tuple))
        if is_single_string:
            source = source.splitlines(True)

        lines = []
        in_fence = False
        for line in source:
            if cell_type == 'markdown' and '```' in line:
                # Fence markers are kept as-is and toggle code-block mode.
                lines.append(line)
                in_fence = not in_fence
            elif cell_type == 'code' or in_fence:
                lines.append(translate_prog_line(lang, line))
            else:
                lines.append(translate_md_line(lang, line))

        cell['source'] = ''.join(lines) if is_single_string else lines
    return jn_json
def main(argv):
    """Read the input notebook, translate it, and write the output notebook.

    The paths and target language come from ``FLAGS.input_notebook``,
    ``FLAGS.output_notebook`` and ``FLAGS.language``.

    Args:
        argv: Positional command-line arguments supplied by ``app.run``;
            not used.
    """
    del argv  # Unused; all inputs arrive through FLAGS.

    # Read and write explicitly as UTF-8 so the result does not depend on
    # the platform's locale default encoding.
    with open(FLAGS.input_notebook, 'r', encoding='utf-8') as f:
        jn_json = json.load(f)

    jn_json = translate_json(FLAGS.language, jn_json)

    with open(FLAGS.output_notebook, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps translated (non-ASCII) text readable
        # instead of \uXXXX escapes; indent keeps the file diff-friendly.
        json.dump(jn_json, f, ensure_ascii=False, indent=1)
        f.write('\n')


if __name__ == '__main__':
    app.run(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment