Skip to content

Instantly share code, notes, and snippets.

@nishio
Created February 14, 2019 12:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nishio/d4deea950939ba96b5c38f57a92d065a to your computer and use it in GitHub Desktop.
Save nishio/d4deea950939ba96b5c38f57a92d065a to your computer and use it in GitHub Desktop.
scrapbox parser
# -*- encoding: utf-8 -*-
import json
import re
import doctest
# Patterns are written as raw strings so that "\s", "\[" and "\]" reach the
# regex engine verbatim instead of being (invalid) Python string escapes.

# Splits a line into (leading whitespace, rest of the line).
INDENT = re.compile(r"(\s*)(.*)")
# Groups: (text before, "`code`" including backticks, code, text after).
# The lazy quantifiers make this match the first backtick pair only.
VERBATIM = re.compile(r"(.*?)(`(.*?)`)(.*)")
# Groups: (text before, "[inner]" including brackets, inner, text after).
# The lazy quantifiers make this match the first bracket pair only.
BRACKET = re.compile(r"(.*?)(\[(.*?)\])(.*)")
class Verbatim:
    """Inline code span: text that was wrapped in backticks."""

    def __init__(self, inner):
        # Text between the backticks, stored without the backticks.
        self.inner = inner

    def to_markdown(self):
        """Return the span wrapped in backticks (Markdown form)."""
        return "`{}`".format(self.inner)

    def to_scrapbox(self):
        """Return the span wrapped in backticks (Scrapbox form)."""
        return "`{}`".format(self.inner)

    def __repr__(self):
        # This exact format is relied on by the doctests in this file.
        return "Verbatim('{}')".format(self.inner)
class InnerLink:
    """Bracketed link whose text is kept as-is in ``inner``."""

    def __init__(self, inner):
        # Text that appeared between the square brackets.
        self.inner = inner

    def to_markdown(self):
        """Return a Markdown link with ``inner`` as its label.

        The link target is not resolved yet, so it is emitted empty.
        """
        target = ""  # FIXME: resolve the real link target
        return "[{}]({})".format(self.inner, target)

    def to_scrapbox(self):
        """Return the link in bracket notation."""
        return "[{}]".format(self.inner)

    def __repr__(self):
        return "InnerLink('{}')".format(self.inner)
class Image:
    """Image referenced by URL."""

    def __init__(self, url):
        # URL of the image.
        self.url = url

    def to_markdown(self):
        """Return a Markdown image with empty alt text."""
        return f"![]({self.url})"

    def to_scrapbox(self):
        """Return the image in bracket notation, like the other node types."""
        return f"[{self.url}]"

    def __repr__(self):
        # Same repr shape as Verbatim / InnerLink so parse trees print uniformly.
        return f"Image('{self.url}')"
class Strong:
    """Emphasised (bold) text."""

    def __init__(self, inner):
        # The emphasised text, without any markup.
        self.inner = inner

    def to_markdown(self):
        """Return the text wrapped in ``**`` (Markdown bold)."""
        return f"**{self.inner}**"

    def to_scrapbox(self):
        """Return the text in ``[* text]`` bracket notation."""
        return f"[* {self.inner}]"

    def __repr__(self):
        # Same repr shape as Verbatim / InnerLink so parse trees print uniformly.
        return f"Strong('{self.inner}')"
class Line:
    """A single line: an integer indentation level plus its parsed body."""

    def __init__(self, indent, body):
        assert isinstance(indent, int)
        self.indent = indent
        # Anything the module-level to_markdown() accepts.
        self.body = body

    def to_markdown(self):
        """Render the line, newline-terminated.

        Indent 0 is emitted as plain text; any other indent becomes a
        ``- `` list item preceded by ``indent - 1`` spaces.
        """
        text = to_markdown(self.body)
        if self.indent == 0:
            return "{}\n".format(text)
        padding = " " * (self.indent - 1)
        return "{}- {}\n".format(padding, text)
def parse_in_bracket(s):
    """Classify the text found between ``[`` and ``]``.

    Args:
        s: The bracket content, without the surrounding brackets.

    Returns:
        * ``Image`` when the content is a single token starting with
          ``https://gyazo.com/``.
        * ``Strong`` when the first of several tokens is ``*``; the marker is
          dropped and the remaining text is stripped.
        * ``InnerLink`` otherwise (a single token, or the whole multi-token
          content unchanged).

    Raises:
        NotImplementedError: for a single-token http(s) URL other than gyazo
            (external links are not supported yet), and for empty content.
    """
    items = s.split()
    if not items:
        # "[]" or whitespace-only content: nothing to classify.
        raise NotImplementedError("empty bracket")

    if len(items) == 1:
        token = items[0]
        if token.startswith("https://gyazo.com/"):
            return Image(token)
        if token.startswith(("https://", "http://")):
            # NotImplemented is a comparison sentinel, not an exception;
            # NotImplementedError is the one that can be raised.
            raise NotImplementedError("ExternalLink")
        return InnerLink(token)

    # Several tokens. The marker is the first *token*; comparing the whole
    # string against "*" could never succeed here.
    if items[0] == "*":
        # Split on the first whitespace run so leading blanks in `s` do not
        # leave the marker inside the emphasised text.
        return Strong(s.split(None, 1)[1].strip())
    return InnerLink(s)
def parse_bracket(s):
    """Split *s* around its first ``[...]`` and recurse on the remainder.

    Returns *s* unchanged when it contains no bracket pair. Otherwise returns
    a three-element list: the text before the bracket, the node produced by
    ``parse_in_bracket`` for the bracket content, and the parsed tail (which
    is itself either a string or another such list).
    """
    found = BRACKET.match(s)
    if found is None:
        return s
    before, _whole, content, after = found.groups()
    node = parse_in_bracket(content)
    rest = parse_bracket(after)
    return [before, node, rest]
def parse_verbatim(s):
    """Split *s* around its first backtick-quoted span and recurse.

    Text outside backticks is handed to ``parse_bracket``; the text after the
    span is parsed by this function again, so the result nests to the right.

    >>> parse_verbatim("aaa`bbb`ccc")
    ['aaa', Verbatim('bbb'), 'ccc']
    >>> parse_verbatim("aaa`bbb`ccc`ddd`eee")
    ['aaa', Verbatim('bbb'), ['ccc', Verbatim('ddd'), 'eee']]
    """
    found = VERBATIM.match(s)
    if found is None:
        # No code span at all: only brackets can remain.
        return parse_bracket(s)
    before, _whole, code, after = found.groups()
    head = parse_bracket(before)
    span = Verbatim(code)
    tail = parse_verbatim(after)
    return [head, span, tail]
def work(path="exported/intellitech-en-20190204.json",
         title="0.3: Structure of this book"):
    """Load one page from an exported JSON file and parse its lines.

    Args:
        path: Path of the JSON export to read. Defaults to the file this
            script was originally written for.
        title: Title of the page to parse. Defaults to the page this script
            was originally written for.

    Returns:
        A list of ``Line`` objects, one per line of the page, in order.

    Raises:
        OSError: if *path* cannot be opened.
        KeyError: if no page has the given *title*.
    """
    # Context manager closes the handle; JSON text is read as UTF-8 rather
    # than whatever the platform default encoding happens to be.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # Index pages by title; a later page with the same title wins.
    pages = {p["title"]: p for p in data["pages"]}
    page = pages[title]

    result = []
    for line in page["lines"]:
        if not isinstance(line, str):
            # Line entries may be objects carrying extra fields next to the
            # text; only the text is used here.
            line = line["text"]
        # INDENT is "(\s*)(.*)", which matches every string, so the match
        # object is never None.
        indent, body = INDENT.match(line).groups()
        result.append(Line(len(indent), parse_verbatim(body)))
    return result
def to_markdown(x):
    """Render a parse-tree element as Markdown text.

    Lists are rendered element by element and concatenated, strings are
    returned unchanged, and any other object is asked for its own
    ``to_markdown()``.
    """
    if isinstance(x, list):
        return "".join(map(to_markdown, x))
    if isinstance(x, str):
        return x
    return x.to_markdown()
def _test():
    """Run the doctests embedded in this module's docstrings."""
    doctest.testmod()
    return None
# Script body: run the doctests, then parse the exported page and print it
# as Markdown.
_test()

tree = work()
print(
    to_markdown(tree)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment