tomo-makes/jupyter_translate.md

## jupyter_translate.md

      
    Raw
  

              jupyter_translate.md
            
          
    Overview


Many Jupyter notebooks for machine learning tutorials are available in English.
Draft machine translations of their markdown cells help self-motivated learners who are non-native English speakers reach more of these resources.

Usage

generate translated notebook

$ ./jupyter_translate.py en ja source.ipynb translated.ipynb

generate html

$ jupyter nbconvert --to html translated.ipynb

Note: If a custom Jupyter theme is installed, it will be used for the generated HTML. Run $ jt -r first if you would like to use the default theme.
todo


 translate markdown cells
 specify source/target langs
 import successfully into Google Colab

 delete unnecessary metadata


 translate @title of source code cells
 rendering html using nbconvert

 MkDocsによるドキュメント作成 - Qiita


## jupyter_translate.py
#!/usr/bin/env python
# coding:utf-8
import requests
import json
import time
import mistune # markdown > html
#import tomd # html > markdown
import modified_tomd
import re

import sys  # sys.argv is read below; the import block above does not provide it

# Google Cloud Translation API key, sent with every request by google_translate().
google_apikey = "<YOUR API KEY>"

# Positional command-line arguments:
#   jupyter_translate.py SOURCE_LANG TARGET_LANG SOURCE.ipynb TRANSLATED.ipynb
args = sys.argv
if len(args) < 5:
    # Exit with a usage hint rather than an IndexError traceback.
    sys.exit(
        "usage: jupyter_translate.py SOURCE_LANG TARGET_LANG "
        "SOURCE.ipynb TRANSLATED.ipynb"
    )
source_lang = args[1]  # e.g. en
target_lang = args[2]  # e.g. ja
filename = args[3]  # e.g. source.ipynb
filename_translated = args[4]  # e.g. target_ja.ipynb

def google_translate(original_text):
    """Translate an HTML fragment with the Google Translation v2 REST endpoint.

    The request uses the module-level ``source_lang``, ``target_lang`` and
    ``google_apikey`` and asks for ``format="html"`` so markup is preserved.

    Args:
        original_text: HTML string to translate (sent as the ``q`` field).

    Returns:
        The decoded JSON response body. The caller reads the result from
        ``["data"]["translations"][0]["translatedText"]``.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within the timeout.
    """
    google_translate_url = "https://translation.googleapis.com/language/translate/v2"

    payload = {
        "q": original_text,
        "source": source_lang,
        "target": target_lang,
        "format": "html"
    }
    json_data = json.dumps(payload).encode("utf-8")
    headers = {"content-type": "application/json"}
    r = requests.post(
        google_translate_url,
        params={"key": google_apikey},  # let requests URL-encode the key
        headers=headers,
        data=json_data,
        timeout=60,  # without a timeout a stalled connection blocks forever
    )
    # Surface API errors here instead of as a KeyError in the caller.
    r.raise_for_status()
    return r.json()

# read ipynb and parse JSON
# Notebook files are UTF-8 JSON; do not depend on the platform default encoding.
with open(filename, "r", encoding="utf-8") as f:
    notebook = json.load(f)

# insert translated cells
new_notebook = notebook.copy()
cells = notebook["cells"].copy()
new_cells = cells.copy()
# index + offset is the slot right after the current source cell in new_cells;
# it grows by one for every translated cell already inserted.
offset = 1

for index, cell in enumerate(cells):
    print("===index: " + str(index) + " offset: " + str(offset) + "===")

    if cell["cell_type"] == "markdown":
        print("cell: " + str(cell["source"]))

        # "source" is either one string or a list of lines that normally
        # already end in "\n". Joining those with "\n" would double every
        # line break and change the markdown structure, so only add a
        # newline where a line lacks one.
        source = cell["source"]
        if isinstance(source, str):
            original_text = source
        else:
            original_text = "".join(
                line if line.endswith("\n") else line + "\n" for line in source
            )

        original_html = mistune.markdown(original_text)
        r = google_translate(original_html)
        translated_html = r["data"]["translations"][0]["translatedText"]
        translated_md = modified_tomd.Tomd(translated_html).markdown
        # Split into lines that keep their trailing "\n"; the second
        # alternative keeps a final line that has no newline terminator.
        new_source_list = re.findall(r".*\n|.+", translated_md)
        new_cell = {'cell_type': 'markdown', 'source': new_source_list}
        new_cells.insert(index + offset, new_cell)

        offset = offset + 1

new_notebook["cells"] = new_cells

# delete meta data (this also gives the inserted cells their "metadata" key)
for cell in new_notebook["cells"]:
    cell["metadata"] = {}

new_notebook["metadata"] = {}

# write .ipynb file
# ensure_ascii=False emits raw non-ASCII text, so the file must be opened as UTF-8.
with open(filename_translated, "w", encoding="utf-8") as f:
    json.dump(new_notebook, f, ensure_ascii=False, indent=2)

## modified_tomd.py
# https://github.com/gaojiuli/tomd
#  by gaojiuli, GNU General Public License v3.0

# coding: utf-8

import re
import os
import warnings

__all__ = ['Tomd', 'convert']

# Markdown wrappers keyed by tag name. Each value is a (prefix, suffix) pair
# that is placed around an element's converted content.
# Headings h1..h6: a leading newline, one '#' per level, then a space.
MARKDOWN = dict(
    ('h{}'.format(level), ('\n{} '.format('#' * level), '\n'))
    for level in range(1, 7)
)
MARKDOWN.update([
    ('code', ('`', '`')),
    ('ul', ('', '')),
    ('ol', ('', '')),
    ('li', ('- ', '\n')),  # modified here
    ('blockquote', ('\n> ', '\n')),
    ('em', ('*', '*')),
    ('strong', ('**', '**')),
    ('block_code', ('\n```\n', '\n```\n')),
    ('span', ('', '')),
    ('p', ('\n', '\n')),
    ('p_with_out_class', ('\n', '\n')),
    ('inline_p', ('', '')),
    ('inline_p_with_out_class', ('', '')),
    ('b', ('**', '**')),
    ('i', ('*', '*')),
    ('del', ('~~', '~~')),
    ('hr', ('\n---', '\n\n')),
    ('thead', ('\n', '|------\n')),
    ('tbody', ('\n', '\n')),
    ('td', ('|', '')),
    ('th', ('|', '')),
    ('tr', ('', '\n')),
    ('table', ('', '\n')),
    ('e_p', ('', '\n')),
])

# Regular expressions for block-level HTML elements, keyed by tag name.
# Tomd.convert() iterates this dict in insertion order and, where a pattern
# has a capture group, uses the captured text as the element's content.
# Raw strings are used so that regex escapes such as \s are not treated as
# (invalid) Python string escapes; the pattern text itself is unchanged.
BlOCK_ELEMENTS = {
    'h1': r'<h1.*?>(.*?)</h1>',
    'h2': r'<h2.*?>(.*?)</h2>',
    'h3': r'<h3.*?>(.*?)</h3>',
    'h4': r'<h4.*?>(.*?)</h4>',
    'h5': r'<h5.*?>(.*?)</h5>',
    'h6': r'<h6.*?>(.*?)</h6>',
    'hr': r'<hr/>',
    'blockquote': r'<blockquote.*?>(.*?)</blockquote>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    'block_code': r'<pre.*?><code.*?>(.*?)</code></pre>',
    'p': r'<p\s.*?>(.*?)</p>',  # <p> carrying attributes
    'p_with_out_class': r'<p>(.*?)</p>',  # bare <p>
    'thead': r'<thead.*?>(.*?)</thead>',
    'tr': r'<tr.*?>(.*?)</tr>',
}


# Regular expressions for inline HTML elements, keyed by tag name.
# Element.parse_inline() applies them in sorted key order.
# Patterns containing the regex escape \s are raw strings so Python does not
# see an invalid string escape. The three patterns that contain "\n" are kept
# as ordinary literals so their text stays exactly as before.
INLINE_ELEMENTS = {
    'td': '<td.*?>((.|\n)*?)</td>',  # td element may span lines
    'tr': '<tr.*?>((.|\n)*?)</tr>',
    'th': r'<th.*?>(.*?)</th>',
    'b': r'<b.*?>(.*?)</b>',
    'i': r'<i.*?>(.*?)</i>',
    'del': r'<del.*?>(.*?)</del>',
    'inline_p': r'<p\s.*?>(.*?)</p>',
    'inline_p_with_out_class': r'<p>(.*?)</p>',
    'code': r'<code.*?>(.*?)</code>',
    'span': r'<span.*?>(.*?)</span>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    'li': r'<li.*?>(.*?)</li>',
    'img': r'<img.*?src="(.*?)".*?>(.*?)</img>',
    'img_single': r'<img.*?src="(.*?)".*?/>',
    'img_single_no_close': r'<img.*?src="(.*?)".*?>',
    'a': r'<a.*?href="(.*?)".*?>(.*?)</a>',
    'em': r'<em.*?>(.*?)</em>',
    # group 2 is the text; groups 1 and 3 capture surrounding whitespace
    'strong': r'<strong.*?>(\s*)(.*?)(\s*)</strong>',
    'tbody': '<tbody.*?>((.|\n)*)</tbody>',
}

# Regex patterns for leftover tags that Tomd.convert() strips from the
# generated Markdown with re.sub().
DELETE_ELEMENTS = [
    '<span.*?>',
    '</span>',
    '<div.*?>',
    '</div>',
    '<br clear="none"/>',
    '<center.*?>',
    '</center>',
]


class Element:
    """One matched HTML fragment and its Markdown rendering.

    Attributes:
        start_pos, end_pos: offsets of the regex match in the source HTML.
        content: the captured inner text; rewritten to Markdown in place by
            parse_inline() when ``is_block`` is true.
        tag: key into MARKDOWN that selects the (prefix, suffix) wrapper.
        folder: directory prefixed to image paths for the Evernote
            checkbox detection in parse_inline().
        is_block: when true, parse_inline() runs during construction.
    """

    # Used by the Evernote branch when BlOCK_ELEMENTS has no 'table' entry
    # (a plain subscript would raise KeyError). Same shape as the other
    # block patterns.
    _TABLE_PATTERN = r'<table.*?>(.*?)</table>'

    def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.content = content
        self._elements = []
        self.is_block = is_block
        self.tag = tag
        self.folder = folder
        self._result = None  # last string produced by __str__

        if self.is_block:
            self.parse_inline()

    def __str__(self):
        """Return the content wrapped in the MARKDOWN prefix/suffix for ``tag``."""
        wrapper = MARKDOWN.get(self.tag)
        self._result = '{}{}{}'.format(wrapper[0], self.content, wrapper[1])
        return self._result

    def parse_inline(self):
        """Convert the inline HTML inside ``self.content`` to Markdown, in place."""
        self.content = self.content.replace('\r', '')  # windows \r character
        # No-break space: '\xc2\xa0' is its UTF-8 byte pair decoded one
        # character per byte; '\xa0' is the character itself in a str.
        self.content = self.content.replace('\xc2\xa0', ' ')
        self.content = self.content.replace('\xa0', ' ')
        self.content = self.content.replace('&quot;', '"')  # html quote mark

        for m in re.finditer("<img(.*?)en_todo.*?>", self.content):
            # remove img and change to [ ] and [x]
            # evernote specific parsing
            imgSrc = re.search('src=".*?"', m.group())
            imgLoc = imgSrc.group()[5:-1]  # strip the leading src=" and the closing "
            imgLoc = imgLoc.replace('\\', '/')  # backslash path separators -> '/'
            # The image file size decides between unchecked and checked.
            if os.stat(self.folder + "/" + imgLoc).st_size < 250:
                self.content = self.content.replace(m.group(), "[ ] ")
            else:
                self.content = self.content.replace(m.group(), "[x] ")

        if "e_" in self.tag:  # evernote-specific parsing
            table_pattern = BlOCK_ELEMENTS.get('table', self._TABLE_PATTERN)
            m = re.search(table_pattern, self.content, re.I | re.S | re.M)
            if m is not None:
                # Only the first table is converted; it replaces the content.
                inner = Element(start_pos=m.start(),
                                end_pos=m.end(),
                                content=''.join(m.groups()),
                                tag='table', folder=self.folder,
                                is_block=True)
                self.content = inner.content
                return  # no further parsing once a table was found

            # if no table, parse as usual
            self.content = self.content.replace('<hr/>', '\n---\n')
            self.content = self.content.replace('<br/>', '')

        if self.tag == "table":  # for removing tbody
            self.content = re.sub(INLINE_ELEMENTS['tbody'], r'\g<1>', self.content)

        # Patterns are applied in sorted key order; later substitutions see
        # the output of earlier ones, so the order is part of the behaviour.
        for tag in sorted(INLINE_ELEMENTS):
            pattern = INLINE_ELEMENTS[tag]

            if tag == 'a':
                self.content = re.sub(pattern, r'[\g<2>](\g<1>)', self.content)
            elif tag == 'img':
                self.content = re.sub(pattern, r'![\g<2>](\g<1>)', self.content)
            elif tag == 'img_single':
                self.content = re.sub(pattern, r'![](\g<1>)', self.content)
            elif tag == 'img_single_no_close':
                self.content = re.sub(pattern, r'![](\g<1>)', self.content)
            elif self.tag == 'ul' and tag == 'li':
                self.content = re.sub(pattern, r'- \g<1>\n', self.content)
            elif self.tag == 'ol' and tag == 'li':
                self.content = re.sub(pattern, r'1. \g<1>\n', self.content)
            elif self.tag == 'thead' and tag == 'tr':
                self.content = re.sub(pattern, r'\g<1>\n', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'th':
                self.content = re.sub(pattern, r'|\g<1>', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'td':
                self.content = re.sub(pattern, r'|\g<1>|', self.content.replace('\n', ''))
                self.content = self.content.replace("||", "|")  # end of column also needs a pipe
            elif self.tag == 'table' and tag == 'td':
                self.content = re.sub(pattern, r'|\g<1>|', self.content)
                self.content = self.content.replace("||", "|")  # end of column also needs a pipe
                self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
                self.construct_table()
            else:
                wrapper = MARKDOWN.get(tag)
                # 'strong' keeps its text in group 2 (groups 1 and 3 are whitespace).
                group = 2 if tag == "strong" else 1
                replacement = r'{}\g<{}>{}'.format(wrapper[0], group, wrapper[1])
                self.content = re.sub(pattern, replacement, self.content)

        if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2:
            # focusing on div, add new line if not there (and if content is long enough)
            self.content += '\n'

    def construct_table(self):
        """Prefix ``self.content`` with Markdown ``|---|`` separator rows.

        Called after the table cells have gained their ``|`` delimiters. The
        column count is taken from the first non-empty line among the first
        lines of the content. Content without any such line is left unchanged.
        """
        first_row = next(
            (row for row in self.content.split('\n', 3) if row != ""), None)
        if first_row is None:
            # No row to size the separator from; nothing to build.
            return
        count = first_row.count("|")  # count number of pipes

        # beginning \n for safety; one "---|" per column
        pipe = "\n|" + "---|" * (count - 1) + "\n"
        self.content = pipe + pipe + self.content + "\n"  # TODO: column titles?
        self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
        self.content = self.content.replace("<br/>\n", "<br/>")  # keep a <br/> and the following line together


class Tomd:
    """HTML-to-Markdown converter built on the block/inline regex tables.

    Args:
        html: HTML text to convert.
        folder: directory passed on to each Element and used by export().
        file: source file name; export() derives the output name from it.
        options: accepted but not used (haven't been implemented yet).
    """

    def __init__(self, html='', folder='', file='', options=None):
        self.html = html  # actual data
        self.folder = folder
        self.file = file
        self.options = options  # haven't been implemented yet
        self._markdown = self.convert(self.html, self.options)

    def convert(self, html="", options=None):
        """Convert ``html`` (or ``self.html`` when empty) to Markdown.

        Every BlOCK_ELEMENTS pattern is matched against the HTML. A match
        lying strictly inside an already collected element is skipped, and
        collected elements lying strictly inside a new match are dropped, so
        only outermost blocks are rendered. The result is also stored in
        ``self._markdown``.
        """
        if html == "":
            html = self.html
        # main function here
        elements = []
        for tag, pattern in BlOCK_ELEMENTS.items():
            for m in re.finditer(pattern, html, re.I | re.S | re.M):
                # now m contains the pattern without the tag
                element = Element(start_pos=m.start(),
                                  end_pos=m.end(),
                                  content=''.join(m.groups()),
                                  tag=tag,
                                  folder=self.folder,
                                  is_block=True)
                can_append = True
                # Build a new list instead of calling elements.remove()
                # during iteration, which skips the entry that follows each
                # removed one and leaves nested blocks behind.
                kept = []
                for e in elements:
                    if e.start_pos < m.start() and e.end_pos > m.end():
                        can_append = False  # the new match is nested in e
                    elif e.start_pos > m.start() and e.end_pos < m.end():
                        continue  # e is nested in the new match: drop it
                    kept.append(e)
                elements = kept
                if can_append:
                    elements.append(element)
        elements.sort(key=lambda element: element.start_pos)
        self._markdown = ''.join([str(e) for e in elements])

        for pattern in DELETE_ELEMENTS:
            self._markdown = re.sub(pattern, '', self._markdown)
        return self._markdown

    @property
    def markdown(self):
        """Markdown for ``self.html``; the conversion is re-run on every access."""
        self.convert(self.html, self.options)
        return self._markdown

    def export(self, folder=False):
        """Write the last conversion result to a ``.md`` file.

        The file name is ``self.file`` with ``.html`` replaced by ``.md``, or
        ``tmp.md`` when no file was given. It is written to ``folder`` when
        that is given, else to ``self.folder``, or to the current directory
        when ``self.folder`` is shorter than two characters.
        """
        if len(self.file) < 1:
            warnings.warn("file not specified, renamed to tmp.md")
            file = "tmp.md"
        else:
            file = self.file.replace('.html', '.md')  # rename to md
        if len(self.folder) < 2:
            warnings.warn("folder not specified, will save to pwd")
        elif not folder:
            file = self.folder + '/' + file
        else:  # if folder is specified
            file = folder + '/' + file
        # The context manager closes the handle even if the write fails.
        with open(file, 'w') as f:
            f.write(self._markdown)


# Module-level convenience API: a shared converter created with the default
# (empty) html, whose bound convert() method is exposed as convert(html).
_inst = Tomd()
convert = _inst.convert

## nbconvert_all_to_html.sh
#!/bin/bash
# Convert every translated notebook (*_ja.ipynb) in the current directory to HTML.
# IFS= stops read from trimming whitespace, and "$file" is quoted so that
# names containing spaces are passed to nbconvert as a single argument.
find . -maxdepth 1 -name '*_ja.ipynb' -print0 | while IFS= read -r -d '' file; do jupyter nbconvert --to html "$file"; done

## nbconvert_all_to_md.sh
#! /bin/bash
# Convert every translated notebook (*_ja.ipynb) in the current directory to Markdown.
# IFS= stops read from trimming whitespace, and "$file" is quoted so that
# names containing spaces are passed to nbconvert as a single argument.
find . -maxdepth 1 -name '*_ja.ipynb' -print0 | while IFS= read -r -d '' file; do jupyter nbconvert --to markdown "$file"; done

## translate_all_notebooks.sh
#! /bin/bash
# Translate every notebook in the current directory from English to Japanese,
# writing NAME_ja.ipynb next to NAME.ipynb.
# Files already ending in _ja.ipynb are skipped so that the output of an
# earlier run is not translated again into NAME_ja_ja.ipynb.
# IFS= and the quoted expansions keep file names with whitespace intact.
find . -maxdepth 1 -name '*.ipynb' ! -name '*_ja.ipynb' -print0 | while IFS= read -r -d '' file; do ./jupyter_translate.py en ja "$file" "${file%%.ipynb}_ja.ipynb"; done
	#!/usr/bin/env python
	# coding:utf-8
	import requests
	import json
	import time
	import mistune # markdown > html
	#import tomd # html > markdown
	import modified_tomd
	import re

	google_apikey = "<YOUR API KEY>"
	args = sys.argv
	source_lang = args[1] # e.g. en
	target_lang = args[2] # e.g. ja
	filename = args[3] # e.g. source.ipynb
	filename_translated = args[4] # e.g. target_ja.ipynb

	def google_translate(original_text):
	google_translate_url = "https://translation.googleapis.com/language/translate/v2?key=" + google_apikey

	payload = {
	"q": original_text,
	"source": source_lang,
	"target": target_lang,
	"format": "html"
	}
	json_data = json.dumps(payload).encode("utf-8")
	headers = {"content-type": "application/json"}
	r = requests.post(google_translate_url, headers=headers, data=json_data)
	translated = r.json()
	return translated

	# read ipynb and parse JSON
	with open(filename, "r") as f:
	notebook = json.load(f)

	# insert translated cells
	new_notebook = notebook.copy()
	cells = notebook["cells"].copy()
	new_cells = cells.copy()
	offset = 1

	for index, cell in enumerate(cells):
	print("===index: " + str(index) + " offset: " + str(offset) + "===")

	if cell["cell_type"] == "markdown":
	print("cell: " + str(cell["source"]))

	original_text = "\n".join(cell["source"])
	original_html = mistune.markdown(original_text)
	r = google_translate(original_html)
	translated_html = r["data"]["translations"][0]["translatedText"]
	# translated_md = tomd.convert(translated_html)
	translated_md = modified_tomd.Tomd(translated_html).markdown
	# new_source_list = translated_md.split("\n")
	new_source_list = re.findall(".*\n", translated_md)
	new_cell = { 'cell_type': 'markdown', 'source': new_source_list }
	new_cells.insert(index + offset, new_cell)

	offset = offset + 1

	new_notebook["cells"] = new_cells

	# delete meta data
	for index in range(0,len(new_notebook["cells"])):
	new_notebook["cells"][index]["metadata"] = {}

	new_notebook["metadata"] = {}

	# write .ipynb file
	with open(filename_translated, "w") as f:
	json.dump(new_notebook, f, ensure_ascii=False, indent=2)
	# https://github.com/gaojiuli/tomd
	# by gaojiuli, GNU General Public License v3.0

	# coding: utf-8

	import re
	import os
	import warnings

	__all__ = ['Tomd', 'convert']

	MARKDOWN = {
	'h1': ('\n# ', '\n'),
	'h2': ('\n## ', '\n'),
	'h3': ('\n### ', '\n'),
	'h4': ('\n#### ', '\n'),
	'h5': ('\n##### ', '\n'),
	'h6': ('\n###### ', '\n'),
	'code': ('`', '`'),
	'ul': ('', ''),
	'ol': ('', ''),
	'li': ('- ', '\n'), # modified here
	'blockquote': ('\n> ', '\n'),
	'em': ('', ''),
	'strong': ('', ''),
	'block_code': ('\n```\n', '\n```\n'),
	'span': ('', ''),
	'p': ('\n', '\n'),
	'p_with_out_class': ('\n', '\n'),
	'inline_p': ('', ''),
	'inline_p_with_out_class': ('', ''),
	'b': ('', ''),
	'i': ('', ''),
	'del': ('~~', '~~'),
	'hr': ('\n---', '\n\n'),
	'thead': ('\n', '\|------\n'),
	'tbody': ('\n', '\n'),
	'td': ('\|', ''),
	'th': ('\|', ''),
	'tr': ('', '\n'),
	'table': ('', '\n'),
	'e_p': ('', '\n')
	}

	BlOCK_ELEMENTS = {
	'h1': '<h1.?>(.?)</h1>',
	'h2': '<h2.?>(.?)</h2>',
	'h3': '<h3.?>(.?)</h3>',
	'h4': '<h4.?>(.?)</h4>',
	'h5': '<h5.?>(.?)</h5>',
	'h6': '<h6.?>(.?)</h6>',
	'hr': '<hr/>',
	'blockquote': '<blockquote.?>(.?)</blockquote>',
	'ul': '<ul.?>(.?)</ul>',
	'ol': '<ol.?>(.?)</ol>',
	'block_code': '<pre.?><code.?>(.*?)</code></pre>',
	'p': '<p\s.?>(.?)</p>',
	'p_with_out_class': '<p>(.*?)</p>',
	'thead': '<thead.?>(.?)</thead>',
	'tr': '<tr.?>(.?)</tr>'
	}


	INLINE_ELEMENTS = {
	'td': '<td.?>((.\|\n)?)</td>', # td element may span lines
	'tr': '<tr.?>((.\|\n)?)</tr>',
	'th': '<th.?>(.?)</th>',
	'b': '<b.?>(.?)</b>',
	'i': '<i.?>(.?)</i>',
	'del': '<del.?>(.?)</del>',
	'inline_p': '<p\s.?>(.?)</p>',
	'inline_p_with_out_class': '<p>(.*?)</p>',
	'code': '<code.?>(.?)</code>',
	'span': '<span.?>(.?)</span>',
	'ul': '<ul.?>(.?)</ul>',
	'ol': '<ol.?>(.?)</ol>',
	'li': '<li.?>(.?)</li>',
	'img': '<img.?src="(.?)".?>(.?)</img>',
	'img_single': '<img.?src="(.?)".*?/>',
	'img_single_no_close': '<img.?src="(.?)".*?>',
	'a': '<a.?href="(.?)".?>(.?)</a>',
	'em': '<em.?>(.?)</em>',
	'strong': '<strong.?>(\s)(.?)(\s)</strong>',
	'tbody': '<tbody.?>((.\|\n))</tbody>',
	}

	DELETE_ELEMENTS = ['<span.?>', '</span>', '<div.?>', '</div>', '<br clear="none"/>', '<center.*?>', '</center>']


	class Element:
	def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
	self.start_pos = start_pos
	self.end_pos = end_pos
	self.content = content
	self._elements = []
	self.is_block = is_block
	self.tag = tag
	self.folder = folder
	self._result = None

	if self.is_block:
	self.parse_inline()

	def __str__(self):
	wrapper = MARKDOWN.get(self.tag)
	self._result = '{}{}{}'.format(wrapper[0], self.content, wrapper[1])
	return self._result

	def parse_inline(self):
	self.content = self.content.replace('\r', '') # windows \r character
	self.content = self.content.replace('\xc2\xa0', ' ') # no break space
	self.content = self.content.replace('"', '\"') # html quote mark

	for m in re.finditer("<img(.?)en_todo.?>", self.content):
	# remove img and change to [ ] and [x]
	# evernote specific parsing
	imgSrc = re.search('src=".*?"', m.group())
	imgLoc = imgSrc.group()[5:-1] # remove source and " "
	imgLoc = imgLoc.replace('\\', '/') # \\ folder slash rotate
	if os.stat(self.folder + "/" + imgLoc).st_size < 250:
	self.content = self.content.replace(m.group(), "[ ] ")
	else:
	self.content = self.content.replace(m.group(), "[x] ")

	if "e_" in self.tag: # evernote-specific parsing
	for m in re.finditer(BlOCK_ELEMENTS['table'], self.content, re.I \| re.S \| re.M):
	# hmm can there only be one table?
	inner = Element(start_pos=m.start(),
	end_pos=m.end(),
	content=''.join(m.groups()),
	tag='table', folder=self.folder,
	is_block=True)
	self.content = inner.content
	return # no need for further parsing ?

	# if no table, parse as usual
	self.content = self.content.replace('<hr/>', '\n---\n')
	self.content = self.content.replace('<br/>', '')

	if self.tag == "table": # for removing tbody
	self.content = re.sub(INLINE_ELEMENTS['tbody'], '\g<1>', self.content)

	INLINE_ELEMENTS_LIST_KEYS = list(INLINE_ELEMENTS.keys())
	INLINE_ELEMENTS_LIST_KEYS.sort()
	for tag in INLINE_ELEMENTS_LIST_KEYS:
	pattern = INLINE_ELEMENTS[tag]

	if tag == 'a':
	self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content)
	elif tag == 'img':
	self.content = re.sub(pattern, '![\g<2>](\g<1>)', self.content)
	elif tag == 'img_single':
	self.content = re.sub(pattern, '![](\g<1>)', self.content)
	elif tag == 'img_single_no_close':
	self.content = re.sub(pattern, '![](\g<1>)', self.content)
	elif self.tag == 'ul' and tag == 'li':
	self.content = re.sub(pattern, '- \g<1>\n', self.content)
	elif self.tag == 'ol' and tag == 'li':
	self.content = re.sub(pattern, '1. \g<1>\n', self.content)
	elif self.tag == 'thead' and tag == 'tr':
	self.content = re.sub(pattern, '\g<1>\n', self.content.replace('\n', ''))
	elif self.tag == 'tr' and tag == 'th':
	self.content = re.sub(pattern, '\|\g<1>', self.content.replace('\n', ''))
	elif self.tag == 'tr' and tag == 'td':
	self.content = re.sub(pattern, '\|\g<1>\|', self.content.replace('\n', ''))
	self.content = self.content.replace("\|\|", "\|") # end of column also needs a pipe
	elif self.tag == 'table' and tag == 'td':
	self.content = re.sub(pattern, '\|\g<1>\|', self.content)
	self.content = self.content.replace("\|\|", "\|") # end of column also needs a pipe
	self.content = self.content.replace('\|\n\n', '\|\n') # replace double new line
	self.construct_table()
	else:
	wrapper = MARKDOWN.get(tag)
	if tag == "strong":
	self.content = re.sub(pattern, '{}\g<2>{}'.format(wrapper[0], wrapper[1]), self.content)
	else:
	self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)

	if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2:
	# focusing on div, add new line if not there (and if content is long enough)
	self.content += '\n'

	def construct_table(self):
	# this function, after self.content has gained \| for table entries,
	# adds the \|---\| in markdown to create a proper table

	temp = self.content.split('\n', 3)
	for elt in temp:
	if elt != "":
	count = elt.count("\|") # count number of pipes
	break
	pipe = "\n\|" # beginning \n for safety
	for i in range(count - 1):
	pipe += "---\|"
	pipe += "\n"
	self.content = pipe + pipe + self.content + "\n" # TODO: column titles?
	self.content = self.content.replace('\|\n\n', '\|\n') # replace double new line
	self.content = self.content.replace("<br/>\n", "<br/>") # end of column also needs a pipe


	class Tomd:
	def __init__(self, html='', folder='', file='', options=None):
	self.html = html # actual data
	self.folder = folder
	self.file = file
	self.options = options # haven't been implemented yet
	self._markdown = self.convert(self.html, self.options)

	def convert(self, html="", options=None):
	if html == "":
	html = self.html
	# main function here
	elements = []
	for tag, pattern in BlOCK_ELEMENTS.items():
	for m in re.finditer(pattern, html, re.I \| re.S \| re.M):
	# now m contains the pattern without the tag
	element = Element(start_pos=m.start(),
	end_pos=m.end(),
	content=''.join(m.groups()),
	tag=tag,
	folder=self.folder,
	is_block=True)
	can_append = True
	for e in elements:
	if e.start_pos < m.start() and e.end_pos > m.end():
	can_append = False
	elif e.start_pos > m.start() and e.end_pos < m.end():
	elements.remove(e)
	if can_append:
	elements.append(element)
	elements.sort(key=lambda element: element.start_pos)
	self._markdown = ''.join([str(e) for e in elements])

	for index, element in enumerate(DELETE_ELEMENTS):
	self._markdown = re.sub(element, '', self._markdown)
	return self._markdown

	@property
	def markdown(self):
	self.convert(self.html, self.options)
	return self._markdown

	def export(self, folder=False):
	if len(self.file) < 1:
	warnings.warn("file not specified, renamed to tmp.md")
	file = "tmp.md"
	else:
	file = self.file.replace('.html', '.md') # rename to md
	if len(self.folder) < 2:
	warnings.warn("folder not specified, will save to pwd")
	elif not folder:
	file = self.folder + '/' + file
	else: # if folder is specified
	file = folder + '/' + file
	f = open(file, 'w')
	f.write(self._markdown)
	f.close()


	_inst = Tomd()
	convert = _inst.convert
	#!/bin/bash
	find . -maxdepth 1 -name '*_ja.ipynb' -print0 \| while read -r -d '' file; do jupyter nbconvert --to html $file; done
	#! /bin/bash
	find . -maxdepth 1 -name '*_ja.ipynb' -print0 \| while read -r -d '' file; do jupyter nbconvert --to markdown $file; done
	#! /bin/bash
	find . -maxdepth 1 -name '*.ipynb' -print0 \| while read -r -d '' file; do ./jupyter_translate.py en ja $file ${file%%.ipynb}_ja.ipynb; done