aphlysia/paper2joplin.py

## paper2joplin.py
#coding: utf-8
'''
paper2joplin.py
Conversion from Dropbox Paper to Joplin
'''

import re, sys, argparse, pathlib

class Re:
    '''Thin wrapper around ``re`` that remembers the most recent result.

    Calling ``match`` or ``search`` stores the outcome in ``last_match`` so
    that a chain of ``if``/``elif`` tests can run the regex in the condition
    and read the match object afterwards.

    From https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
    '''

    def __init__(self):
        # Nothing has been matched yet.
        self.last_match = None

    def match(self, pattern, text):
        '''Run ``re.match(pattern, text)``, record the result and return it.'''
        result = re.match(pattern, text)
        self.last_match = result
        return result

    def search(self, pattern, text):
        '''Run ``re.search(pattern, text)``, record the result and return it.'''
        result = re.search(pattern, text)
        self.last_match = result
        return result

# Markdown image syntax: ![alt](target)
_IMAGE_RE = re.compile(r'[!]\[[^\]]*\]\([^\)]+\)')
# Markdown links that point at another Dropbox Paper document.
_PAPER_LINK_RE = re.compile(r'\[[^\]]*\]\(https://paper\.dropbox\.com/doc/[^\)]+\)')
# A line indented by four spaces; group 1 is the text after the indent.
_INDENTED_RE = re.compile(r'^    (.*)')
# Shapes of a line that consists of a single $$...$$ formula, tried in order.
# Each template is filled with the pattern's groups: {0} is the formula body
# and {1} (first rule only) is the parenthesised label after the formula.
_DISPLAY_MATH_RULES = (
    # "$$formula$$ (label)" -> the label becomes a \tag inside the formula
    (re.compile(r'^\s*\$\$([^$]*)\$\$\s*\(([^$]+)\)$'), '$$\\tag{{{1}}}{0}$$\n'),
    # "$$formula$$" alone on the line
    (re.compile(r'^\s*\$\$([^$]*)\$\$\s*$'), '$${0}$$\n'),
    # "$$formula$$" followed by a comma (ASCII or full-width variants)
    (re.compile(r'^\s*\$\$([^$]*)\$\$[,、，]$'), '$${0},$$\n'),
    # "$$formula$$" followed by a full stop (ASCII or full-width variants)
    (re.compile(r'^\s*\$\$([^$]*)\$\$[.。．]$'), '$${0}.$$\n'),
)


def _rewrite_display_math(line):
    '''Return the rewritten form of a whole-line ``$$...$$`` formula.

    Returns None when the line matches none of the display-math shapes.
    '''
    for pattern, template in _DISPLAY_MATH_RULES:
        found = pattern.match(line)
        if found:
            return template.format(*found.groups())
    return None


def convert(filepath, out_dir):
    '''Convert one Dropbox Paper Markdown file and write it into out_dir.

    The result is written to ``out_dir / filepath.name``; ``out_dir`` is
    created if it does not exist.  Per input line:

    * zero-width spaces (U+200B) are removed;
    * a line holding only a ``$$...$$`` formula stays display math, with a
      trailing ``(label)`` turned into a LaTeX tag command and trailing
      punctuation moved inside the formula;
    * lines indented by four spaces that follow a blank line or a heading
      are emitted as a fenced code block with the indent removed;
    * everywhere else ``$$`` is replaced by ``$``.

    The number of images and of links to other Dropbox Paper documents is
    reported on stderr when either is non-zero.

    Args:
        filepath: pathlib.Path of the Markdown file to read.
        out_dir: pathlib.Path of the directory receiving the converted file.

    Returns:
        None.
    '''
    out_dir.mkdir(parents=True, exist_ok=True)
    n_images = 0
    n_links = 0
    in_code_block = False
    # True right after a blank line or a heading: only then may an indented
    # line open a new fenced code block.
    code_block_may_start = False

    # Both files are opened explicitly as UTF-8 and closed by the with-block.
    with filepath.open(encoding='utf-8') as in_file, \
            (out_dir / filepath.name).open('w', encoding='utf-8') as out_file:
        for line in in_file:
            line = line.replace('\u200b', '')
            n_images += len(_IMAGE_RE.findall(line))
            n_links += len(_PAPER_LINK_RE.findall(line))

            if line in ('', '\n') or line.startswith('#'):
                # Blank lines and headings end a code block and allow the
                # next indented line to start a new one.
                if in_code_block:
                    out_file.write('```\n')
                    in_code_block = False
                code_block_may_start = True
                out_file.write(line.replace('$$', '$'))
                continue

            math_line = _rewrite_display_math(line)
            if math_line is None:
                indented = _INDENTED_RE.match(line)
                if indented:
                    if code_block_may_start:
                        out_file.write('```\n')
                        in_code_block = True
                        code_block_may_start = False
                    if in_code_block:
                        out_file.write(indented.group(1) + '\n')
                        if '$$' in line:
                            # Math markers inside code are left untouched;
                            # report the line so it can be checked by hand.
                            print(f'warn: {line}', file=sys.stderr)
                    else:
                        out_file.write(line.replace('$$', '$'))
                    continue

            # Display math or ordinary text: either one ends an open block.
            if in_code_block:
                out_file.write('```\n')
                in_code_block = False
            code_block_may_start = False
            out_file.write(line.replace('$$', '$') if math_line is None else math_line)

        # A code block still open at end of file needs its closing fence.
        if in_code_block:
            out_file.write('```\n')

    if n_images > 0 or n_links > 0:
        print(filepath, file=sys.stderr)
        if n_images > 0:
            print(f'{n_images} images', file=sys.stderr)
        if n_links > 0:
            print(f'{n_links} internal links', file=sys.stderr)
        print(file=sys.stderr)


def find(in_dir, out_dir):
    '''Convert every ``.md`` file below in_dir, mirroring the tree in out_dir.

    Sub-directories are walked recursively and mapped to the sub-directory
    of the same name under out_dir.  Entries whose name ends in ``.md`` and
    that are not directories are handed to ``convert``.

    Args:
        in_dir: pathlib.Path of the directory to scan.
        out_dir: pathlib.Path of the directory receiving converted files.

    Returns:
        None.
    '''
    for item in in_dir.glob('*'):
        if item.is_dir():
            find(item, out_dir / item.name)
        elif item.name.endswith('.md'):
            # elif: a directory that happens to be named "*.md" must only be
            # recursed into, never opened as a file.
            convert(item, out_dir)


if __name__ == '__main__':
    # Script entry point: the input and output directories are given as two
    # positional command-line arguments, and the whole tree is converted.
    cli = argparse.ArgumentParser()
    for positional in ('in_dir', 'out_dir'):
        cli.add_argument(positional)
    options = cli.parse_args()
    find(pathlib.Path(options.in_dir), pathlib.Path(options.out_dir))
	#coding: utf-8
	'''
	paper2joplin.py
	Conversion from Dropbox Paper to Joplin
	'''

	import re, sys, argparse, pathlib

	class Re:
	    '''Thin wrapper around ``re`` that remembers the most recent result.

	    Calling ``match`` or ``search`` stores the outcome in ``last_match`` so
	    that a chain of ``if``/``elif`` tests can run the regex in the condition
	    and read the match object afterwards.

	    From https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
	    '''

	    def __init__(self):
	        # Nothing has been matched yet.
	        self.last_match = None

	    def match(self, pattern, text):
	        '''Run ``re.match(pattern, text)``, record the result and return it.'''
	        self.last_match = re.match(pattern, text)
	        return self.last_match

	    def search(self, pattern, text):
	        '''Run ``re.search(pattern, text)``, record the result and return it.'''
	        self.last_match = re.search(pattern, text)
	        return self.last_match

	# Markdown image syntax: ![alt](target)
	_IMAGE_RE = re.compile(r'[!]\[[^\]]*\]\([^\)]+\)')
	# Markdown links that point at another Dropbox Paper document.
	_PAPER_LINK_RE = re.compile(r'\[[^\]]*\]\(https://paper\.dropbox\.com/doc/[^\)]+\)')
	# A line indented by four spaces; group 1 is the text after the indent.
	_INDENTED_RE = re.compile(r'^    (.*)')
	# Shapes of a line that consists of a single $$...$$ formula, tried in order.
	# Each template is filled with the pattern's groups: {0} is the formula body
	# and {1} (first rule only) is the parenthesised label after the formula.
	_DISPLAY_MATH_RULES = (
	    # "$$formula$$ (label)" -> the label becomes a \tag inside the formula
	    (re.compile(r'^\s*\$\$([^$]*)\$\$\s*\(([^$]+)\)$'), '$$\\tag{{{1}}}{0}$$\n'),
	    # "$$formula$$" alone on the line
	    (re.compile(r'^\s*\$\$([^$]*)\$\$\s*$'), '$${0}$$\n'),
	    # "$$formula$$" followed by a comma (ASCII or full-width variants)
	    (re.compile(r'^\s*\$\$([^$]*)\$\$[,、，]$'), '$${0},$$\n'),
	    # "$$formula$$" followed by a full stop (ASCII or full-width variants)
	    (re.compile(r'^\s*\$\$([^$]*)\$\$[.。．]$'), '$${0}.$$\n'),
	)


	def _rewrite_display_math(line):
	    '''Return the rewritten form of a whole-line ``$$...$$`` formula.

	    Returns None when the line matches none of the display-math shapes.
	    '''
	    for pattern, template in _DISPLAY_MATH_RULES:
	        found = pattern.match(line)
	        if found:
	            return template.format(*found.groups())
	    return None


	def convert(filepath, out_dir):
	    '''Convert one Dropbox Paper Markdown file and write it into out_dir.

	    The result is written to ``out_dir / filepath.name``; ``out_dir`` is
	    created if it does not exist.  Per input line:

	    * zero-width spaces (U+200B) are removed;
	    * a line holding only a ``$$...$$`` formula stays display math, with a
	      trailing ``(label)`` turned into a LaTeX tag command and trailing
	      punctuation moved inside the formula;
	    * lines indented by four spaces that follow a blank line or a heading
	      are emitted as a fenced code block with the indent removed;
	    * everywhere else ``$$`` is replaced by ``$``.

	    The number of images and of links to other Dropbox Paper documents is
	    reported on stderr when either is non-zero.

	    Args:
	        filepath: pathlib.Path of the Markdown file to read.
	        out_dir: pathlib.Path of the directory receiving the converted file.

	    Returns:
	        None.
	    '''
	    out_dir.mkdir(parents=True, exist_ok=True)
	    n_images = 0
	    n_links = 0
	    in_code_block = False
	    # True right after a blank line or a heading: only then may an indented
	    # line open a new fenced code block.
	    code_block_may_start = False

	    # Both files are opened explicitly as UTF-8 and closed by the with-block.
	    with filepath.open(encoding='utf-8') as in_file, \
	            (out_dir / filepath.name).open('w', encoding='utf-8') as out_file:
	        for line in in_file:
	            line = line.replace('\u200b', '')
	            n_images += len(_IMAGE_RE.findall(line))
	            n_links += len(_PAPER_LINK_RE.findall(line))

	            if line in ('', '\n') or line.startswith('#'):
	                # Blank lines and headings end a code block and allow the
	                # next indented line to start a new one.
	                if in_code_block:
	                    out_file.write('```\n')
	                    in_code_block = False
	                code_block_may_start = True
	                out_file.write(line.replace('$$', '$'))
	                continue

	            math_line = _rewrite_display_math(line)
	            if math_line is None:
	                indented = _INDENTED_RE.match(line)
	                if indented:
	                    if code_block_may_start:
	                        out_file.write('```\n')
	                        in_code_block = True
	                        code_block_may_start = False
	                    if in_code_block:
	                        out_file.write(indented.group(1) + '\n')
	                        if '$$' in line:
	                            # Math markers inside code are left untouched;
	                            # report the line so it can be checked by hand.
	                            print(f'warn: {line}', file=sys.stderr)
	                    else:
	                        out_file.write(line.replace('$$', '$'))
	                    continue

	            # Display math or ordinary text: either one ends an open block.
	            if in_code_block:
	                out_file.write('```\n')
	                in_code_block = False
	            code_block_may_start = False
	            out_file.write(line.replace('$$', '$') if math_line is None else math_line)

	        # A code block still open at end of file needs its closing fence.
	        if in_code_block:
	            out_file.write('```\n')

	    if n_images > 0 or n_links > 0:
	        print(filepath, file=sys.stderr)
	        if n_images > 0:
	            print(f'{n_images} images', file=sys.stderr)
	        if n_links > 0:
	            print(f'{n_links} internal links', file=sys.stderr)
	        print(file=sys.stderr)


	def find(in_dir, out_dir):
	    '''Convert every ``.md`` file below in_dir, mirroring the tree in out_dir.

	    Sub-directories are walked recursively and mapped to the sub-directory
	    of the same name under out_dir.  Entries whose name ends in ``.md`` and
	    that are not directories are handed to ``convert``.

	    Args:
	        in_dir: pathlib.Path of the directory to scan.
	        out_dir: pathlib.Path of the directory receiving converted files.

	    Returns:
	        None.
	    '''
	    for item in in_dir.glob('*'):
	        if item.is_dir():
	            find(item, out_dir / item.name)
	        elif item.name.endswith('.md'):
	            # elif: a directory that happens to be named "*.md" must only be
	            # recursed into, never opened as a file.
	            convert(item, out_dir)


	if __name__ == '__main__':
	    # Script entry point: the input and output directories are given as two
	    # positional command-line arguments, and the whole tree is converted.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('in_dir')
	    parser.add_argument('out_dir')
	    args = parser.parse_args()
	    in_dir = pathlib.Path(args.in_dir)
	    out_dir = pathlib.Path(args.out_dir)
	    find(in_dir, out_dir)