Created
February 14, 2019 12:48
-
-
Save nishio/d4deea950939ba96b5c38f57a92d065a to your computer and use it in GitHub Desktop.
scrapbox parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*-
import json | |
import re | |
import doctest | |
# Patterns are raw strings so that "\s", "\[" and "\]" reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.

# Group 1: leading whitespace of a line; group 2: the rest of the line.
INDENT = re.compile(r"(\s*)(.*)")
# Groups: text before the first `...` span, the whole span including the
# backticks, the text inside the backticks, and the text after the span.
VERBATIM = re.compile(r"(.*?)(`(.*?)`)(.*)")
# Groups: text before the first [...] span, the whole span including the
# brackets, the text inside the brackets, and the text after the span.
BRACKET = re.compile(r"(.*?)(\[(.*?)\])(.*)")
class Verbatim:
    """Inline code span: text that was wrapped in backticks."""

    def __init__(self, inner):
        # Text between the backticks, stored without the backticks.
        self.inner = inner

    def to_markdown(self):
        """Return the span wrapped in backticks."""
        return "`{}`".format(self.inner)

    def to_scrapbox(self):
        """Return the span wrapped in backticks."""
        return "`{}`".format(self.inner)

    def __repr__(self):
        return "Verbatim('{}')".format(self.inner)
class InnerLink:
    """Bracketed link whose text is stored in ``inner``."""

    def __init__(self, inner):
        # Text that appeared between the square brackets.
        self.inner = inner

    def to_markdown(self):
        """Return a Markdown link; the link target is currently left empty."""
        target = ""  # FIXME
        return "[{}]({})".format(self.inner, target)

    def to_scrapbox(self):
        """Return the link text wrapped in square brackets."""
        return "[{}]".format(self.inner)

    def __repr__(self):
        return "InnerLink('{}')".format(self.inner)
class Image:
    """Image node identified by its URL."""

    def __init__(self, url):
        # URL of the image, exactly as it appeared inside the brackets.
        self.url = url

    def to_markdown(self):
        """Return a Markdown image with empty alt text."""
        return f"![]({self.url})"

    def to_scrapbox(self):
        """Return the URL wrapped in square brackets."""
        return f"[{self.url}]"

    def __repr__(self):
        return f"Image('{self.url}')"
class Strong:
    """Bold (strong emphasis) text node."""

    def __init__(self, inner):
        # The emphasized text, without any markup.
        self.inner = inner

    def to_markdown(self):
        """Return the text wrapped in double asterisks."""
        return f"**{self.inner}**"

    def to_scrapbox(self):
        """Return the text in bracket notation with a leading ``*`` marker."""
        return f"[* {self.inner}]"

    def __repr__(self):
        return f"Strong('{self.inner}')"
class Line:
    """One line of a page: an integer indent depth plus its parsed body."""

    def __init__(self, indent, body):
        assert isinstance(indent, int)
        self.body = body
        self.indent = indent

    def to_markdown(self):
        """Render the line as Markdown text terminated by a newline.

        A line with indent 0 is emitted as plain text; any other line
        becomes a ``- `` list item.
        """
        rendered = to_markdown(self.body)
        if self.indent != 0:
            # NOTE(review): one space of padding per indent level beyond the
            # first, as written here; confirm this is the intended width.
            padding = " " * (self.indent - 1)
            return "{}- {}\n".format(padding, rendered)
        return "{}\n".format(rendered)
def parse_in_bracket(s):
    """Classify the text found between ``[`` and ``]``.

    Args:
        s: The text inside the brackets, without the brackets themselves.

    Returns:
        ``Image`` for a single ``https://gyazo.com/`` URL, ``Strong`` when
        the first whitespace-separated token is ``*``, otherwise
        ``InnerLink``.

    Raises:
        NotImplementedError: For any other single http(s) URL (external
            links are not supported yet) and for empty or whitespace-only
            brackets.
    """
    items = s.split()
    if not items:
        # Empty or whitespace-only brackets have no defined meaning yet.
        raise NotImplementedError
    if len(items) == 1:
        target = items[0]
        if target.startswith("https://gyazo.com/"):
            return Image(target)
        if target.startswith(("https://", "http://")):
            # NotImplemented is a non-callable constant; the exception
            # class is NotImplementedError.
            raise NotImplementedError("ExternalLink")
        return InnerLink(target)
    if items[0] == "*":
        # Compare the first token, not the whole string: with more than one
        # token the whole string can never equal "*".
        return Strong(s.split(None, 1)[1].strip())
    return InnerLink(s)
def parse_bracket(s):
    """Split ``s`` around its ``[...]`` spans.

    Returns ``s`` unchanged when it contains no bracket span. Otherwise
    returns a three-item list: the text before the first span, the node
    built by ``parse_in_bracket`` from the span's contents, and the result
    of parsing the text after the span in the same way.
    """
    found = BRACKET.match(s)
    if found is None:
        return s
    before, _span, inside, after = found.groups()
    node = parse_in_bracket(inside)
    return [before, node, parse_bracket(after)]
def parse_verbatim(s):
    """Split ``s`` around backtick spans, then parse brackets in the rest.

    Text inside backticks becomes a ``Verbatim`` node and is not searched
    for brackets; text before the span goes through ``parse_bracket`` and
    text after it is parsed recursively.

    >>> parse_verbatim("aaa`bbb`ccc")
    ['aaa', Verbatim('bbb'), 'ccc']
    >>> parse_verbatim("aaa`bbb`ccc`ddd`eee")
    ['aaa', Verbatim('bbb'), ['ccc', Verbatim('ddd'), 'eee']]
    """
    found = VERBATIM.match(s)
    if found is None:
        # No backtick span: only bracket notation can remain.
        return parse_bracket(s)
    before, _span, code, after = found.groups()
    return [parse_bracket(before), Verbatim(code), parse_verbatim(after)]
def work(path="exported/intellitech-en-20190204.json",
         title="0.3: Structure of this book"):
    """Load one page from an exported JSON file and parse its lines.

    Args:
        path: Path of the JSON export to read. It must contain a ``pages``
            list whose entries have ``title`` and ``lines`` keys.
        title: Title of the page to parse.

    Returns:
        A list of ``Line`` objects, one per line of the selected page.

    Raises:
        KeyError: If no page has the requested title.
    """
    # The context manager closes the file even if JSON decoding fails.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    # When several pages share a title, the last one wins.
    pages = {p["title"]: p for p in data["pages"]}
    page = pages[title]
    result = []
    for line in page["lines"]:
        if not isinstance(line, str):
            # Non-string entries are mappings that carry the text under
            # "text" (alongside mtime and ctime).
            line = line["text"]
        # INDENT accepts any string: both of its groups may be empty.
        indent, body = INDENT.match(line).groups()
        result.append(Line(len(indent), parse_verbatim(body)))
    return result
def to_markdown(x):
    """Render a parsed value as Markdown text.

    Lists are rendered item by item and concatenated, strings are returned
    unchanged, and anything else is asked to render itself through its own
    ``to_markdown()`` method.
    """
    if isinstance(x, list):
        return "".join(map(to_markdown, x))
    if isinstance(x, str):
        return x
    return x.to_markdown()
def _test():
    """Run the doctests embedded in this module's docstrings."""
    doctest.testmod()


# Script body: run the doctests first, then parse the page and print its
# Markdown rendering.
_test()
tree = work()
print(to_markdown(tree))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment