Skip to content

Instantly share code, notes, and snippets.

@qzane
Created February 25, 2022 02:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save qzane/2605f712d9765380730b332e446e38b3 to your computer and use it in GitHub Desktop.
Save qzane/2605f712d9765380730b332e446e38b3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#-*- coding: UTF-8 -*-
import sys, os, re, time, pkgutil, logging, click, subprocess, zipfile
from jinja2 import Template, Environment, PackageLoader
from datetime import datetime
__version__ = "1.1"

# NOTE(review): several character classes below read "[  :]" (the same space
# twice) -- presumably one of them was a full-width space/colon before this
# copy of the file was made; confirm against the upstream source.

# Heading lines that carry both a chapter title (group 1) and a section
# title (group 2) on the same line.
RE_CHAPTER_AND_SECTIONS = [
    u'.*(第.*[卷章部分][  :].*)[  :](第.*[章节][  :]*.*)$',
    u'.*(第.*[卷章部分][  :].*)[  :](序[  :]*幕.*)$',
]

# Heading lines that only name a chapter (group 1).
RE_CHAPTERS = [
    u'.*(第.{1,8}[卷部分][  :].*)$',
]

# Heading lines that only name a section (group 1).
RE_SECTIONS = [
    # Backslashes are escaped explicitly: a bare "\s" inside a non-raw
    # literal is an invalid escape sequence that newer Pythons warn about.
    u'^\\s*(后记|番外.*)\\s*$',
    u'.*(第.*[章节][  :].*)$',
    u'.*(第.*[章节])$',
    u'(尾[  :]*声.*)$',
    u'(序[  :]*[章幕].*)$',
]

# A whole line that is just the book title in brackets (group 1 = title).
RE_TITLE_1 = u'^《([^》]*)》$'
RE_TITLE_2 = u'^<<([^》]*)>>$'

# (line prefix, tag) pairs: a line starting with the prefix is rewritten into
# the "#tag:" / "##tag:" markup that the parser understands.
# NOTE(review): the two author entries look identical here; presumably one of
# them used a full-width colon originally -- confirm.
TAG_REPLACE = [
    (u'简介', "##brief:"),
    (u'内容简介', "##brief:"),
    (u'封面', "#cover:"),
    (u'作者:', "#author:"),
    (u'作者:', "#author:"),
]

# How consecutive body lines are turned into paragraphs.
LINE_STYLE_AUTO = 0    # not decided yet; chosen from the next body line
LINE_STYLE_APPEND = 1  # every line is its own paragraph
LINE_STYLE_MERGE = 2   # unindented lines continue the previous paragraph
class App:
    """Convert lightly marked-up TXT books into EPUB archives.

    ``convert`` parses one text file into a ``meta`` dict (title, author,
    chapters, sections, ...) and ``build_book`` renders that dict through the
    bundled templates into a zip container.
    """

    def __init__(self):
        # Counter behind idx(); it keeps growing across every book converted
        # by this instance.
        self._idx = 0

    def get_tpl(self, filename):
        """Load the packaged resource *filename* and return it as a Template."""
        from pkg_resources import resource_string
        source = resource_string(__name__, filename).decode("UTF-8")
        return Template(source)

    def idx(self):
        """Return the next sequence number as a zero-padded 5-digit string."""
        self._idx += 1
        return "%05d" % self._idx

    @staticmethod
    def _zip_entry(name, compress_type=zipfile.ZIP_DEFLATED):
        """Build the ZipInfo header used for one archive member."""
        info = zipfile.ZipInfo(name)
        info.external_attr = 0o666 << 16  # rw-rw-rw- in the Unix mode bits
        info.compress_type = compress_type
        return info

    @staticmethod
    def _decode_line(raw):
        """Decode one raw line as UTF-8, falling back to GB18030."""
        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            return raw.decode('gb18030')

    @staticmethod
    def _clean_name(name):
        """Strip a heading and turn full-width / no-break spaces into spaces.

        The characters are written as escapes so that they stay visible; the
        replace() calls this helper supersedes read as space-to-space no-ops.
        """
        return name.strip().replace(u"\u3000", u" ").replace(u"\xa0", u" ")

    @staticmethod
    def _first_match(patterns, line):
        """Return the first successful re.match of *patterns* on *line*."""
        for pattern in patterns:
            found = re.match(pattern, line)
            if found:
                return found
        return None

    def _guess_heading(self, line):
        """Guess ``(chapter_name, section_name)`` from a heading-like line.

        Either element is None when the line does not provide it. Combined
        chapter+section patterns are tried first, then chapter-only, then
        section-only patterns.
        """
        found = self._first_match(RE_CHAPTER_AND_SECTIONS, line)
        if found is not None:
            return found.group(1), found.group(2)
        found = self._first_match(RE_CHAPTERS, line)
        if found is not None:
            return found.group(1), None
        found = self._first_match(RE_SECTIONS, line)
        if found is not None:
            return None, found.group(1)
        return None, None

    @staticmethod
    def _rewrite_shorthand(line, has_title):
        """Rewrite title lines and TAG_REPLACE prefixes into ``#tag:`` markup.

        Title detection is skipped once a title has been recorded
        (*has_title* is true).
        """
        if not has_title:
            if line.startswith(u'《'):
                line = line.split(u'》')[0].strip().replace(u'《', '#title:')
            for title_re in (RE_TITLE_1, RE_TITLE_2):
                found = re.match(title_re, line)
                if found is not None:
                    line = '#title:' + found.group(1)
        for tag_from, tag_to in TAG_REPLACE:
            if line.startswith(tag_from):
                if tag_to.startswith("##"):
                    # Block tags keep the original text after the marker.
                    line = tag_to + line
                else:
                    line = line.replace(tag_from, tag_to)
        return line

    def build_book(self, epub_file, meta):
        """Render *meta* through the templates and write the EPUB archive.

        Args:
            epub_file: path or writable binary file object for the archive.
            meta: book description; ``meta['chapters']`` (each with ``idx``
                and ``sections``) and ``meta['sections']`` (each with
                ``idx``) are iterated here, the rest is consumed by the
                templates.
        """
        def add(archive, name, text, compress_type=zipfile.ZIP_DEFLATED):
            archive.writestr(self._zip_entry(name, compress_type),
                             text.encode('utf-8'))

        # The context manager guarantees the central directory is written
        # and the handle released even if rendering fails half-way.
        with zipfile.ZipFile(epub_file, "w",
                             compression=zipfile.ZIP_DEFLATED) as epub:
            # The EPUB container format requires "mimetype" to be the first
            # member of the archive and to be stored uncompressed.
            mimetype = self.get_tpl("templates/mimetype").render(meta=meta)
            add(epub, 'mimetype', mimetype, zipfile.ZIP_STORED)

            for out in ('book.ncx', 'content.opf', 'META-INF/container.xml'):
                add(epub, out, self.get_tpl("templates/" + out).render(meta=meta))

            page = self.get_tpl('templates/book.html')
            add(epub, "welcome.html", page.render(meta=meta, action="welcome"))

            for chapter in meta['chapters']:
                add(epub, "text/book-chapter-%s.html" % chapter['idx'],
                    page.render(meta=meta, action="chapter", chapter=chapter))
                for section in chapter['sections']:
                    add(epub, "text/book-section-%s.html" % section['idx'],
                        page.render(meta=meta, action="section", section=section))

            # Sections that do not belong to any chapter.
            for section in meta['sections']:
                add(epub, "text/book-section-%s.html" % section['idx'],
                    page.render(meta=meta, action="section", section=section))

    def convert(self, txt_file):
        """Parse *txt_file* and write the EPUB next to it.

        Recognised markup (after shorthand rewriting):
          * ``#tag:value``      -- store ``value`` under ``meta[tag]``
          * ``##tag:``          -- following body lines go to ``meta[tag]``
          * ``#@chapter::name`` / ``#@section::name`` -- explicit headings
        Lines matching the RE_* heading patterns are treated as headings too.

        Args:
            txt_file: path of the text file (UTF-8, or GB18030 as fallback).

        Returns:
            The path of the generated EPUB file.
        """
        fsize = os.path.getsize(txt_file)
        logging.info("Input : %s (%.2fMB)", txt_file, fsize / 1048576.0)

        # Only a trailing ".txt" is swapped for ".epub"; a ".txt" elsewhere
        # in the path (e.g. in a directory name) is left alone.
        if txt_file.endswith('.txt'):
            epub_file = txt_file[:-len('.txt')] + ".epub"
        else:
            epub_file = txt_file + ".epub"

        now = datetime.now()
        meta = {
            'title': '', 'author': '',
            'isbn': int(time.mktime(now.timetuple())),
            'date': now.strftime("%Y-%m-%d"),
            'cover': None, 'chapters': [], 'sections': [],
        }

        # "paras" is the list that currently receives body text. The initial
        # chapter/section are placeholders named '_' and are not part of meta.
        paras = []
        section = {'idx': 0, 'name': '_', 'paras': paras}
        chapter = {'idx': 0, 'name': '_', 'sections': [], 'default': section}
        line_style = LINE_STYLE_AUTO

        with open(txt_file, 'rb') as source:
            for raw in source:
                raw = self._decode_line(raw)
                line = raw.replace('\r', '\n').replace('\t', ' ').strip()
                if len(line) < 2:
                    continue

                line = self._rewrite_shorthand(line, bool(meta['title']))

                # Single-value parameter.
                found = re.match(u'#([a-z]+):(.*)', line)
                if found is not None:
                    tag, val = found.groups()
                    meta[tag] = val
                    continue

                # Paragraph-valued parameter: redirect the body text.
                found = re.match(u'##([a-z]+):(.*)', line)
                if found is not None:
                    line_style = LINE_STYLE_AUTO  # reset line style
                    paras = []
                    meta[found.group(1)] = paras
                    continue

                chapter_name = None
                section_name = None

                # Explicit heading markup.
                found = re.match(u'#@([a-z]+)(::)(.*)', line)
                if found is not None:
                    tag, _, val = found.groups()
                    if tag == 'chapter':
                        chapter_name = val
                    if tag == 'section':
                        section_name = val

                # Guessed headings take precedence over the explicit markup.
                guessed_chapter, guessed_section = self._guess_heading(line)
                if guessed_chapter is not None:
                    chapter_name = guessed_chapter
                if guessed_section is not None:
                    section_name = guessed_section

                if chapter_name:
                    chapter_name = self._clean_name(chapter_name)
                    if chapter_name != chapter['name']:
                        logging.debug(u'chapter: %s', chapter_name)
                        # Text between the chapter heading and its first
                        # section is collected in the "default" section.
                        paras = []
                        section = {'idx': self.idx(), 'name': u'_', 'paras': paras}
                        chapter = {'idx': len(meta['chapters']), 'name': chapter_name,
                                   'sections': [], 'default': section}
                        meta['chapters'].append(chapter)
                        line_style = LINE_STYLE_AUTO  # reset line style

                if section_name:
                    section_name = self._clean_name(section_name)
                    if section_name != section['name']:
                        paras = []
                        section = {'idx': self.idx(), 'name': section_name, 'paras': paras}
                        chapter['sections'].append(section)
                        logging.debug("\t%s %s", len(chapter['sections']), section_name)

                if chapter_name or section_name:
                    continue

                # Body text, with paragraph-break detection.
                # NOTE(review): the full-width space is accepted as
                # indentation on the assumption that it is what the second,
                # redundant startswith() test was meant to check -- confirm.
                has_space = raw.startswith((u" ", u"\u3000"))
                has_special = line.startswith(("--", "=="))
                if line_style == LINE_STYLE_AUTO:
                    line_style = LINE_STYLE_APPEND if has_space else LINE_STYLE_MERGE
                if (line_style == LINE_STYLE_APPEND or has_space
                        or has_special or not paras):
                    paras.append(line)
                else:
                    # Merge mode: an unindented line continues the paragraph.
                    paras[-1] += line

        # No chapter heading was found: expose the collected sections as
        # stand-alone sections. Done before logging so the count is right.
        if chapter['name'] == '_' and chapter['sections']:
            meta['sections'].extend(chapter['sections'])

        # 'brief' only exists when the text contained a brief block.
        brief = meta.get('brief') or []
        if isinstance(brief, (list, tuple)):
            brief = "\n".join(brief)
        logging.info(
            "Result : %s. %d Chapters, %d single sections\nTitle: %s\nAuthor: %s\nBrief: \n%s",
            txt_file, len(meta['chapters']), len(meta['sections']),
            meta['title'], meta['author'], brief,
        )

        self.build_book(epub_file, meta)
        fsize = os.path.getsize(epub_file)
        logging.info("Output : %s(%.2fMB)", epub_file, fsize / 1048576.0)
        return epub_file
# Command-line entry point. The docstring of main() is left untouched because
# click prints it verbatim as the command's --help text; in English it reads:
# "Convert simply formatted TXT files into EPUB files with a table of
# contents and author information."
@click.command()
@click.option("--debug", is_flag=True, default=False, help="enable debug-level logging")
@click.argument("TXT_FILES", nargs=-1, required=True, type=click.Path(exists=True))
def main(debug, txt_files):
    '''将带有简单格式的TXT文件转换为带有目录、作者信息的epub文件。'''
    # --debug only raises the log verbosity; every file is still converted.
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)
    # One App is shared by all files, so the section numbering produced by
    # App.idx() keeps increasing from one book to the next.
    converter = App()
    for path in txt_files:
        converter.convert(path)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment