Created
February 25, 2022 02:35
-
-
Save qzane/2605f712d9765380730b332e446e38b3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#-*- coding: UTF-8 -*- | |
import sys, os, re, time, pkgutil, logging, click, subprocess, zipfile | |
from jinja2 import Template, Environment, PackageLoader | |
from datetime import datetime | |
__version__ = "1.1" | |
RE_CHAPTER_AND_SECTIONS=[ | |
u'.*(第.*[卷章部分][ :].*)[ :](第.*[章节][ :]*.*)$', | |
u'.*(第.*[卷章部分][ :].*)[ :](序[ :]*幕.*)$', | |
] | |
RE_CHAPTERS=[ | |
u'.*(第.{1,8}[卷部分][ :].*)$', | |
] | |
RE_SECTIONS=[ | |
u'^\s*(后记|番外.*)\s*$', | |
u'.*(第.*[章节][ :].*)$', | |
u'.*(第.*[章节])$', | |
u'(尾[ :]*声.*)$', | |
u'(序[ :]*[章幕].*)$', | |
] | |
RE_TITLE_1=u'^《([^》]*)》$' | |
RE_TITLE_2=u'^<<([^》]*)>>$' | |
TAG_REPLACE=[ | |
(u'简介', "##brief:"), | |
(u'内容简介', "##brief:"), | |
(u'封面', "#cover:"), | |
(u'作者:', "#author:"), | |
(u'作者:', "#author:"), | |
] | |
LINE_STYLE_AUTO=0 | |
LINE_STYLE_APPEND=1 | |
LINE_STYLE_MERGE=2 | |
class App:
    """Convert loosely formatted TXT novels into EPUB files.

    One instance may convert several files; the counter behind ``idx()`` keeps
    running across all of them.
    """

    def __init__(self):
        # Last id handed out by idx().
        self._idx = 0

    def get_tpl(self, filename):
        """Return the packaged resource ``filename`` as a jinja2 ``Template``.

        The resource is looked up relative to this module and decoded as UTF-8.
        """
        from pkg_resources import resource_string
        source = resource_string(__name__, filename).decode("UTF-8")
        return Template(source)

    def idx(self):
        """Return the next section id as a zero-padded five-digit string."""
        self._idx += 1
        return "%05d" % self._idx

    @staticmethod
    def _first_match(patterns, line):
        """Return the match of the first pattern that matches ``line``, else None."""
        for pattern in patterns:
            found = re.match(pattern, line)
            if found:
                return found
        return None

    @staticmethod
    def _read_lines(txt_file):
        """Yield the lines of ``txt_file`` decoded as UTF-8, falling back to GB18030."""
        with open(txt_file, 'rb') as src:
            for data in src:
                try:
                    # utf-8-sig also drops a leading BOM, which would otherwise
                    # hide a title or tag on the first line.
                    text = data.decode('utf-8-sig')
                except UnicodeDecodeError:
                    text = data.decode('gb18030')
                yield text

    @staticmethod
    def _clean_name(name):
        """Trim a heading and normalise the spaces inside it."""
        # NOTE(review): the reviewed copy showed two no-op replace(" ", " ")
        # calls; ideographic-space and double-space normalisation is presumed
        # to be the intent -- confirm against the upstream file.
        return name.strip().replace("\u3000", " ").replace("  ", " ")

    def build_book(self, epub_file, meta):
        """Render every template with ``meta`` and write the EPUB to ``epub_file``.

        Written entries: ``mimetype``, ``book.ncx``, ``content.opf``,
        ``META-INF/container.xml``, ``welcome.html``, one page per chapter,
        one page per section inside each chapter, and one page per entry of
        ``meta['sections']``.
        """
        def entry(name, compress_type=zipfile.ZIP_DEFLATED):
            info = zipfile.ZipInfo(name)
            info.external_attr = 0o666 << 16  # rw-rw-rw- in the Unix mode bits
            info.compress_type = compress_type
            return info

        # The context manager closes the archive, which writes the central
        # directory, before the caller measures the file size.
        with zipfile.ZipFile(epub_file, "w", compression=zipfile.ZIP_DEFLATED) as epub:
            def add(name, text, compress_type=zipfile.ZIP_DEFLATED):
                epub.writestr(entry(name, compress_type), text.encode('utf-8'))

            # EPUB OCF requires "mimetype" to be the first entry and stored
            # uncompressed.
            add('mimetype',
                self.get_tpl('templates/mimetype').render(meta=meta),
                zipfile.ZIP_STORED)
            for out in ('book.ncx', 'content.opf', 'META-INF/container.xml'):
                add(out, self.get_tpl("templates/" + out).render(meta=meta))

            tpl = self.get_tpl('templates/book.html')
            add("welcome.html", tpl.render(meta=meta, action="welcome"))
            for chapter in meta['chapters']:
                add("text/book-chapter-%s.html" % chapter['idx'],
                    tpl.render(meta=meta, action="chapter", chapter=chapter))
                for section in chapter['sections']:
                    add("text/book-section-%s.html" % section['idx'],
                        tpl.render(meta=meta, action="section", section=section))
            for section in meta['sections']:
                add("text/book-section-%s.html" % section['idx'],
                    tpl.render(meta=meta, action="section", section=section))

    def convert(self, txt_file):
        """Parse ``txt_file`` and write an EPUB next to it; return the EPUB path.

        Recognised line forms, checked in this order:

        * ``《title》`` / ``<<title>>`` while no title is known yet,
        * the prefixes listed in ``TAG_REPLACE``,
        * ``#key:value`` -- stores ``value`` in ``meta[key]``,
        * ``##key:`` -- following body lines are collected in ``meta[key]``,
        * ``#@chapter::name`` / ``#@section::name`` -- explicit headings,
        * headings guessed with the ``RE_*`` pattern lists,
        * anything else is body text of the current paragraph list.

        Lines shorter than two characters after stripping are skipped.
        """
        fsize = os.path.getsize(txt_file)
        logging.info("Input : %s (%.2fMB)" % (txt_file, fsize/1048576))

        # Drop only a trailing ".txt"; a plain str.replace would also rewrite
        # ".txt" occurring in a directory name.
        base = txt_file[:-len('.txt')] if txt_file.endswith('.txt') else txt_file
        epub_file = base + ".epub"

        now = datetime.now()
        meta = {
            'title': '', 'author': '',
            'isbn': int(time.mktime(now.timetuple())),
            'date': now.strftime("%Y-%m-%d"),
            'cover': None, 'chapters': [], 'sections': [],
            # Default so the result log below works for texts without a synopsis.
            'brief': [],
        }

        # Text seen before any heading lands in this placeholder section.
        paras = []
        section = {'idx': 0, 'name': '_', 'paras': paras}
        chapter = {'idx': 0, 'name': '_', 'sections': [], 'default': section}
        line_style = LINE_STYLE_AUTO

        for raw in self._read_lines(txt_file):
            line = raw.replace('\r', '\n').replace('\t', ' ').strip()
            if len(line) < 2:
                continue

            # Title line.  RE_TITLE_1 needs no separate check: every line it
            # could match starts with 《 and is rewritten by the first branch.
            if not meta['title']:
                if line.startswith('《'):
                    line = line.split('》')[0].strip().replace('《', '#title:')
                else:
                    m = re.match(RE_TITLE_2, line)
                    if m is not None:
                        line = '#title:' + m.group(1)

            for tag_from, tag_to in TAG_REPLACE:
                if line.startswith(tag_from):
                    if tag_to.startswith("##"):
                        line = tag_to + line
                    else:
                        line = line.replace(tag_from, tag_to)

            # Single value: "#key:value".
            m = re.match(r'#([a-z]+):(.*)', line)
            if m is not None:
                key, value = m.groups()
                meta[key] = value
                continue

            # Paragraph list: "##key:" redirects the following body lines.
            m = re.match(r'##([a-z]+):(.*)', line)
            if m is not None:
                line_style = LINE_STYLE_AUTO  # reset line style
                paras = []
                meta[m.group(1)] = paras
                continue

            chapter_name = None
            section_name = None

            # Explicit heading: "#@chapter::name" or "#@section::name".
            m = re.match(r'#@([a-z]+)(::)(.*)', line)
            if m is not None:
                kind, _, value = m.groups()
                if kind == 'chapter':
                    chapter_name = value
                if kind == 'section':
                    section_name = value

            # Guess headings from the text; the first pattern list that
            # matches wins, and a guess overrides an explicit tag.
            m = self._first_match(RE_CHAPTER_AND_SECTIONS, line)
            if m is not None:
                chapter_name, section_name = m.group(1), m.group(2)
            else:
                m = self._first_match(RE_CHAPTERS, line)
                if m is not None:
                    chapter_name = m.group(1)
                else:
                    # Equivalent to "#@section::".
                    m = self._first_match(RE_SECTIONS, line)
                    if m is not None:
                        section_name = m.group(1)

            if chapter_name:
                chapter_name = self._clean_name(chapter_name)
                if chapter_name != chapter['name']:
                    logging.debug('chapter: %s' % chapter_name)
                    paras = []
                    section = {'idx': self.idx(), 'name': '_', 'paras': paras}
                    chapter = {'idx': len(meta['chapters']), 'name': chapter_name,
                               'sections': [], 'default': section}
                    meta['chapters'].append(chapter)
                    # NOTE(review): the reviewed copy had lost its indentation;
                    # this reset is assumed to belong to the new-chapter branch.
                    line_style = LINE_STYLE_AUTO  # reset line style

            if section_name:
                section_name = self._clean_name(section_name)
                if section_name != section['name']:
                    paras = []
                    section = {'idx': self.idx(), 'name': section_name, 'paras': paras}
                    chapter['sections'].append(section)
                    logging.debug("\t%s %s" % (len(chapter['sections']), section_name))

            if chapter_name or section_name:
                continue

            # Body text, with paragraph-break detection: an indented line (or a
            # "--"/"==" rule) starts a paragraph; in MERGE style any other line
            # continues the previous one.
            has_space = raw.startswith((" ", "\u3000"))
            has_special = line.startswith(("--", "=="))
            if line_style == LINE_STYLE_AUTO:
                line_style = LINE_STYLE_APPEND if has_space else LINE_STYLE_MERGE

            if line_style == LINE_STYLE_APPEND or has_space or has_special or not paras:
                paras.append(line)
            else:
                paras[-1] += line

        # Sections are promoted to book level only while the current chapter is
        # still the placeholder, i.e. no real chapter heading was ever seen.
        # This happens before logging so the reported count is accurate.
        if chapter['name'] == '_' and chapter['sections']:  # no chapter
            meta['sections'].extend(chapter['sections'])

        # "#brief:text" stores a string, "##brief:" a list of paragraphs.
        brief = meta['brief']
        brief_text = brief if isinstance(brief, str) else "\n".join(brief)
        logging.info("Result : %s. %d Chapters, %d single sections\nTitle: %s\nAuthor: %s\nBrief: \n%s" % (
            txt_file, len(meta['chapters']), len(meta['sections']),
            meta['title'], meta['author'], brief_text,
        ))

        self.build_book(epub_file, meta)
        fsize = os.path.getsize(epub_file)
        logging.info("Output : %s(%.2fMB)" % (epub_file, fsize/1048576))
        return epub_file
# The CLI help text is passed via ``help=`` so the user-facing string stays
# exactly as before while the docstring can be in English.
@click.command(help='将带有简单格式的TXT文件转换为带有目录、作者信息的epub文件。')
@click.option("--debug", is_flag=True, default=False,
              help="enable debug-level logging")
@click.argument("TXT_FILES", nargs=-1, required=True,
                type=click.Path(exists=True, dir_okay=False))
def main(debug, txt_files):
    """Convert each given TXT file into an EPUB with a table of contents and author info.

    ``debug`` only raises the log level to DEBUG; every file is still converted.
    """
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)
    app = App()
    for path in txt_files:
        app.convert(path)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment