Skip to content

Instantly share code, notes, and snippets.

@hanya
Last active March 19, 2017 16:25
Show Gist options
  • Save hanya/3956e8b9c0f39f9a7ef124a83dc98e8d to your computer and use it in GitHub Desktop.
Save hanya/3956e8b9c0f39f9a7ef124a83dc98e8d to your computer and use it in GitHub Desktop.
Converts outline file between hierarchical text and OPML

Outline conversion script

Converts an outline file between hierarchical text and OPML. This Python script works only with Python 3.

Here is the list of options that can be specified.

python3 olconv.py -h
usage: olconv [-h] [--version] [--char CHAR] [-f F] [-o O] [-i {hier,opml}]
              [-x {hier,opml}] [--title TITLE] [--text TEXT]

Converts file between WZ and OPML for Python3

optional arguments:
  -h, --help      show this help message and exit
  --version       show program's version number and exit
  --char CHAR     Level specification in regular expression
  -f F            Input file
  -o O            Output file
  -i {hier,opml}  Input type
  -x {hier,opml}  Output type
  --title TITLE   Title attribute in OPML file, default is title
  --text TEXT     Text attribute in OPML file, default is text

Here is an example to convert a hierarchical text into OPML format.

python3 olconv.py -i hier -x opml -f hoge.txt -o fuga.opml
# -*- coding: utf-8 -*-
import argparse
import re
import sys
class Outline:
    """Root of the outline tree.

    ``children`` holds the level-1 items. Deeper items hang off the
    ``children`` list of the item one level above them.
    """

    def __init__(self):
        self.children = []

    # TODO: remember the last child per level so no deep search is needed.
    def append_child(self, item):
        """Insert *item* into the tree according to ``item.level``.

        A level-1 item is appended directly to this outline. Any deeper
        item is appended to the most recently added item of level
        ``item.level - 1``; missing intermediate levels are filled with
        empty placeholder items.
        """
        if item.level == 1:
            self.children.append(item)
            return
        parent = self._find_child(self.get_last_child(), item.level - 1)
        parent.children.append(item)

    def get_last_child(self):
        """Return the last level-1 item, creating an empty one if none exists."""
        if not self.children:
            self.children.append(Item(1, "", ""))
        return self.children[-1]

    def _find_child(self, parent, level):
        """Follow the last-child chain from *parent* down to *level*.

        The search always moves toward the requested *level*. The previous
        recursive version asked for ``item.level + 1`` on each step and
        therefore stopped too early for targets more than two levels below
        *parent*.

        Returns *parent* itself when it already sits at (or below) *level*.
        """
        node = parent
        # "<" rather than "!=" so an out-of-range target cannot loop forever.
        while node.level < level:
            child = node.children[-1] if node.children else None
            if child is None:
                # Fill the gap with an empty placeholder entry.
                child = Item(node.level + 1, "", "")
                node.children.append(child)
            node = child
        return node
class Item:
    """Individual entry in an outline.

    Attributes:
        level: depth of the entry as given by the caller.
        title: heading of the entry.
        text: body text of the entry, or None when not supplied.
        children: entries nested directly below this one.
    """

    def __init__(self, level, title, text=None):
        self.level = level
        self.title = title
        self.text = text
        self.children = []

    def append_child(self, child):
        """Append *child* to this entry's children."""
        self.children.append(child)

    def get_last_child(self):
        """Return the most recently appended child, or None if there is none."""
        if not self.children:
            return None
        return self.children[-1]

    def __str__(self):
        # text defaults to None, which cannot be sliced; show it as empty.
        preview = (self.text or "")[0:10]
        return "<Item lv=\"{}\" title=\"{}\" text=\"{}\">".format(
            self.level, self.title, preview)
class Writer:
    """Common base class of the outline writers; defines no behavior itself."""
class OPMLWriter(Writer):
    """Writes an outline as an OPML document through a SAX XMLGenerator."""

    def __init__(self, title_attr="title", text_attr="text"):
        """Remember which outline attributes carry the title and the text."""
        self.title_attr = title_attr
        self.text_attr = text_attr

    def open(self, f):
        """Bind to output file *f* and emit everything up to the opening body tag."""
        from xml.sax.saxutils import XMLGenerator
        from xml.sax.xmlreader import AttributesImpl

        self.f = f
        self.AttributesImpl = AttributesImpl
        self.x = XMLGenerator(self.f, encoding="utf-8")

        gen = self.x
        gen.startDocument()
        gen.startElement("opml", AttributesImpl({"version": "2.0"}))
        gen.startElement("head", {})
        # TODO: head contents (the title is written empty; dateCreated is
        # not written at all).
        gen.startElement("title", {})
        gen.endElement("title")
        gen.endElement("head")
        gen.startElement("body", {})

    def close(self):
        """Close the body and opml elements, finish the document, close the file."""
        gen = self.x
        for tag in ("body", "opml"):
            gen.endElement(tag)
        gen.endDocument()
        self.f.close()

    def write(self, outline):
        """Write every child of *outline* (an Outline or an Item) in order."""
        for child in outline.children:
            self._write_item(child)

    def _write_item(self, item):
        """Emit one outline element for *item*, with its children nested inside."""
        attrs = {self.title_attr: item.title}
        attrs[self.text_attr] = item.text
        self.x.startElement("outline", self.AttributesImpl(attrs))
        self.write(item)
        self.x.endElement("outline")
class HierWriter(Writer):
    """Writes an outline as hierarchical text."""

    def __init__(self, level_char="."):
        """Set up the writer.

        @param level_char marker repeated once per level in front of a title.
        """
        self.level_char = level_char

    def open(self, f):
        """Use *f* as the output file."""
        self.f = f

    def close(self):
        """Close the output file."""
        self.f.close()

    def write(self, outline):
        """Write every child of *outline* (an Outline or an Item) in order."""
        for child in outline.children:
            self._write_item(child)

    def _write_item(self, item):
        """Write the heading line and the text of *item*, then its children."""
        out = self.f
        pieces = (
            self.level_char * item.level,  # level marker
            item.title,
            "\n",
            item.text,
            "\n",
        )
        for piece in pieces:
            out.write(piece)
        self.write(item)
class Reader:
    """Common base class of the outline readers; defines no behavior itself."""
class OPMLReader(Reader):
    """Reader for OPML file.

    The parsed outline is available as ``self.contents`` (an Outline).
    """

    def __init__(self, f, title_attr="title", text_attr="text"):
        """Parse the OPML document *f* into ``self.contents``.

        @param f file name or file object accepted by xml.dom.minidom.parse.
        @param title_attr attribute of each outline element read as the title.
        @param text_attr attribute of each outline element read as the text.
        @raise Exception when the document has no body element.
        """
        self.contents = Outline()
        self.title_attr = title_attr
        self.text_attr = text_attr
        import xml.dom.minidom
        dom = xml.dom.minidom.parse(f)
        body = dom.getElementsByTagName("body")
        # getElementsByTagName returns a (possibly empty) node list, never
        # None, so test for emptiness to avoid an IndexError below.
        if not body:
            raise Exception("Empty document")
        for node in body[0].childNodes:
            if node.nodeType == node.ELEMENT_NODE:
                self._read_outline(node, 1)

    def _append_item(self, level, title, text):
        """Create an Item and add it to the outline."""
        item = Item(level, title, text)
        self.contents.append_child(item)

    def _read_outline(self, node, level):
        """Add *node* as an item of *level*, then recurse into its child elements."""
        title = node.getAttribute(self.title_attr)
        text = node.getAttribute(self.text_attr)
        self._append_item(level, title or "", text or "")
        for child in node.childNodes:
            # Skip whitespace text nodes, comments, etc.
            if child.nodeType == child.ELEMENT_NODE:
                self._read_outline(child, level + 1)
class HierReader(Reader):
    """Reader for hierarchical outline file.

    A line that starts with one or more level characters is a heading; its
    level is the length of that leading run. The lines that follow, up to
    the next heading, form the text of the entry. The parsed outline is
    available as ``self.contents`` (an Outline).
    """

    def __init__(self, f, level_char="."):
        """Read all lines from file object *f* and build the outline.

        @param f text file object providing readlines().
        @param level_char heading marker; it is escaped, so it is matched
            literally rather than as a regular expression.
        """
        # NOTE: with a multi-character level_char the "+" only repeats its
        # last character.
        self.level_exp = "(" + re.escape(level_char) + "+)"
        exp = re.compile(self.level_exp)
        self.contents = Outline()
        it = iter(f.readlines())
        # Skip everything before the first heading.
        line = self._skip_head(exp, it)
        # Empty input or input without any heading yields an empty outline.
        if line is not None:
            self._read_lines(exp, line, it)

    def _append_item(self, level, title, text):
        """Add an entry, dropping trailing newlines from title and text."""
        item = Item(level, title.rstrip("\n"), text.rstrip("\n"))
        self.contents.append_child(item)

    def _skip_head(self, exp, it):
        """Advance *it* to the first heading line and return that line.

        Returns None when the iterator is exhausted without finding a
        heading (including completely empty input, which used to raise
        UnboundLocalError).
        """
        for line in it:
            if exp.match(line):
                return line
        return None

    def _read_lines(self, exp, line, it):
        """Consume *line* and the rest of *it*, appending one item per heading."""
        lines = []
        title = None
        level = None
        while line is not None:
            m = exp.match(line)
            if m:
                # A new heading closes the entry collected so far.
                if title is not None:
                    self._append_item(level, title[level:], "".join(lines))
                lines = []
                level = len(m.group(1))
                title = line
            else:
                lines.append(line)
            line = next(it, None)
        # Flush the last entry, which has no following heading to close it.
        if title is not None:
            self._append_item(level, title[level:], "".join(lines))
def main():
    """Command-line entry point.

    Parses the command line, reads the input (file given by -f, otherwise
    stdin) with the reader selected by -i and writes it (file given by -o,
    otherwise stdout) with the writer selected by -x. Exceptions raised
    during the conversion are printed as a traceback instead of propagating.
    """
    version = "0.1.1"
    input_formats = ["hier", "opml"]
    output_formats = input_formats
    p = argparse.ArgumentParser(
        prog="olconv",
        description="Converts file between WZ and OPML for Python3")
    # "%(prog)s" needs the trailing "s"; without it --version raised
    # ValueError while formatting the message.
    p.add_argument("--version", action="version", version="%(prog)s " + version)
    p.add_argument("--char", default=".", help="Level specification in regular expression")
    p.add_argument("-f", help="Input file")
    p.add_argument("-o", help="Output file")
    # Both formats are looked up in a dict below, so they must be given;
    # let argparse report a missing one instead of failing with KeyError.
    p.add_argument("-i", choices=input_formats, required=True, help="Input type")
    p.add_argument("-x", choices=output_formats, required=True, help="Output type")
    p.add_argument("--title", default="title", help="Title attribute in OPML file, default is title")
    p.add_argument("--text", default="text", help="Text attribute in OPML file, default is text")
    args = p.parse_args()

    input_format = args.i
    output_format = args.x
    reader = {"hier": HierReader, "opml": OPMLReader}[input_format]
    writer = {"hier": HierWriter, "opml": OPMLWriter}[output_format]

    # Each reader/writer only accepts the options that apply to its format.
    reader_args = {}
    if input_format == "hier":
        reader_args["level_char"] = args.char
    elif input_format == "opml":
        reader_args["title_attr"] = args.title
        reader_args["text_attr"] = args.text
    writer_args = {}
    if output_format == "hier":
        writer_args["level_char"] = args.char
    elif output_format == "opml":
        writer_args["title_attr"] = args.title
        writer_args["text_attr"] = args.text

    input_path = args.f
    input_file = open(input_path, "r", encoding="utf-8") if input_path else sys.stdin
    out_path = args.o
    out_file = open(out_path, "w", encoding="utf-8") if out_path else sys.stdout
    try:
        r = reader(input_file, **reader_args)
        w = writer(**writer_args)
        w.open(out_file)
        w.write(r.contents)
        w.close()
    except Exception:
        import traceback
        traceback.print_exc()
    finally:
        # The writer's close() already closes out_file on success; closing
        # a file twice is harmless.
        out_file.close()
        input_file.close()
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment