Skip to content

Instantly share code, notes, and snippets.

@digglife
Last active April 16, 2016 03:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save digglife/b0536bc9188f23de5fe005cc71de76c0 to your computer and use it in GitHub Desktop.
Save digglife/b0536bc9188f23de5fe005cc71de76c0 to your computer and use it in GitHub Desktop.
重新格式化经史子集中的正文和注释。脚本中以《庄子集释》为例。
#!/usr/bin/env python
# encoding:utf-8
import re
import os
import json
from bs4 import BeautifulSoup as bs
# Characters that may appear inside a bracketed annotation index such as
# 【一二】.  The position of a character in this string is looked up with
# ``chinese_number.find`` when footnote ids are generated.
chinese_number = u"○一二三四五六七八九"

# Annotation category marker -> short ASCII name.  The ASCII name is emitted
# as a CSS class by ``add_span_for_marker``.
annotation_cats = {
    u'注': 'zhu',
    u'疏': 'shu',
    u'釋': 'shi',
    u'校': 'jiao',
}
def convert_inline_annotations(soup):
    """Fold inline annotation paragraphs into the main text they follow.

    Every ``<p>`` in *soup* is classified by how its text starts:

    * ``【`` or ``◎``: an annotation paragraph belonging to the most recent
      main-text paragraph.  A leading ``【<chinese digits>】`` opens a new
      annotation stored under that index; any other annotation paragraph is
      appended to the annotation opened last.
    * anything else: a new main-text paragraph.

    Recognised paragraphs are extracted from the tree, the collected data is
    dumped to stdout as JSON, and the document is rebuilt by
    ``get_text_with_markers``.

    Args:
        soup: parsed BeautifulSoup document; it is modified in place.

    Returns:
        Whatever ``get_text_with_markers`` returns for the collected data.
    """
    index_pattern = re.compile(u'^【([○一二三四五六七八九]+)】')
    format_data = []
    annotation_index = None

    for p in soup.find_all('p'):
        text = p.text

        if not text.startswith((u'【', u'◎')):
            # Main text: start a new entry that later annotations attach to.
            format_data.append({'text': text, 'annotations': {}})
            p.extract()
            continue

        # Annotations belong to the latest main text.  If no main text has
        # been seen yet there is nothing to attach to, so the paragraph is
        # reported and left in place instead of raising IndexError.
        annotations = format_data[-1]['annotations'] if format_data else None

        match = index_pattern.match(text)
        if match and annotations is not None:
            annotation_index = match.group(1)
            annotations[annotation_index] = remove_index(text)
            p.extract()
        elif annotations:
            # Continuation (e.g. a following category marker or ◎ paragraph)
            # of the annotation opened last for this main text.
            annotations[annotation_index] += remove_index(text)
            p.extract()
        else:
            print(u"This is not a inline annotation. =>{}<=".format(text))

    print(json.dumps(format_data, indent=2, ensure_ascii=False))
    return get_text_with_markers(format_data, soup)
def get_text_with_annotations(format_data, soup):
    """Append each main text to ``soup.div`` with its annotations inlined.

    For every entry of *format_data*, each ``【index】`` marker in the main
    text is replaced by ``<span class='annotation'>…</span>`` holding the
    annotation text (after ``add_span_for_marker``), and the result is
    appended to ``soup.div`` as a new ``<p>``.

    Args:
        format_data: list of ``{'text': ..., 'annotations': {index: text}}``
            dicts.
        soup: BeautifulSoup document; assumed to contain a ``<div>``.

    Returns:
        The same *soup*, modified in place.
    """
    for data in format_data:
        main_text = data['text']
        for index, content in data['annotations'].items():
            span = u"<span class='annotation'>{}</span>".format(
                add_span_for_marker(content))
            main_text = main_text.replace(u'【{}】'.format(index), span)
        # Name the parser explicitly (the same one main() uses) so the result
        # does not depend on which parsers happen to be installed.
        paragraph = bs(u"<p>{}</p>".format(main_text), 'html.parser').p
        soup.div.append(paragraph)
    return soup
def get_text_with_markers(format_data, soup):
    """Append each main text to ``soup.div`` with footnote links and notes.

    For every entry of *format_data*, each ``【index】`` marker in the main
    text is replaced by the link built by ``add_footnote_link``; the text is
    appended to ``soup.div`` as a ``<p>``, followed by one footnote element
    (built by ``add_footnote_id``) per annotation.

    Args:
        format_data: list of ``{'text': ..., 'annotations': {index: text}}``
            dicts, where each index is a string of Chinese digits.
        soup: BeautifulSoup document; assumed to contain a ``<div>``.

    Returns:
        The same *soup*, modified in place.
    """
    def index_value(item):
        # Numeric value of the Chinese-digit key.  Plain string sorting would
        # order by code point, which is not numeric order (e.g. 三 < 二).
        value = 0
        for ch in item[0]:
            value = value * 10 + chinese_number.find(ch)
        return value

    for text_index, data in enumerate(format_data):
        main_text = data['text']
        footnotes = []
        for index, content in sorted(data['annotations'].items(),
                                     key=index_value):
            main_text = main_text.replace(
                u'【{}】'.format(index), add_footnote_link(text_index, index)
            )
            footnotes.append(add_footnote_id(content, text_index, index))

        # Name the parser explicitly (the same one main() uses) so fragments
        # are never wrapped in <html>/<body> by another installed parser.
        soup.div.append(bs(u"<p>{}</p>".format(main_text), 'html.parser').p)
        for footnote in footnotes:
            soup.div.append(bs(footnote, 'html.parser').aside)
    return soup
def _chinese_index_to_ascii(a_index):
    """Translate a Chinese-digit index into ASCII digits, one per character.

    Converting character by character keeps multi-digit indices correct:
    u'一二' becomes '12'.  Looking up the whole string at once would return
    its substring position in ``chinese_number`` (or -1) instead.
    """
    return u''.join(str(chinese_number.find(ch)) for ch in a_index)


def add_footnote_link(p_index, a_index):
    """Build the superscript note reference inserted into the main text.

    Args:
        p_index: index of the main-text paragraph.
        a_index: annotation index as a string of Chinese digits.

    Returns:
        An ``<a epub:type="noteref">`` element whose ``href`` targets the id
        produced by ``add_footnote_id`` for the same indices.
    """
    a_index_ascii = _chinese_index_to_ascii(a_index)
    return (u'<a epub:type="noteref" href="#p{0}a{1}" id="p{0}a{1}ref">'
            u'<sup>{1}</sup></a>').format(p_index, a_index_ascii)


def add_footnote_id(content, p_index, a_index):
    """Wrap annotation *content* in the footnote element a link points to.

    Args:
        content: annotation text placed inside the element.
        p_index: index of the main-text paragraph.
        a_index: annotation index as a string of Chinese digits.

    Returns:
        An ``<aside epub:type="footnote">`` element with id
        ``p<p_index>a<ascii index>``.
    """
    a_index_ascii = _chinese_index_to_ascii(a_index)
    return u'<aside epub:type="footnote" id="p{1}a{2}">{0}</aside>'.format(
        content, p_index, a_index_ascii)
def get_annotation_cat(text):
    """Return the category of the first non-index bracket marker in *text*.

    The first ``【…】`` group that contains no Chinese digit is passed through
    ``make_singular_character`` and returned without its brackets.

    Raises:
        AttributeError: if *text* contains no such marker.
    """
    marker = re.search(u'【[^○一二三四五六七八九】]+】', text).group(0)
    category = make_singular_character(marker)
    for bracket in (u'【', u'】'):
        category = category.replace(bracket, '')
    return category
def add_span_for_marker(text):
    """Replace category markers in *text* with styled ``<span>`` elements.

    The text is first passed through ``make_singular_character``; then every
    ``【X】`` whose ``X`` is a key of ``annotation_cats`` becomes
    ``<span class="common NAME">X</span>``, with ``NAME`` taken from the
    mapping.
    """
    result = make_singular_character(text)
    for marker, css_name in annotation_cats.items():
        wrapped = u'<span class="common {0}">{1}</span>'.format(css_name, marker)
        result = result.replace(u'【{}】'.format(marker), wrapped)
    return result
def remove_multiple_newines(text):
    """Delete every run of two or more consecutive newlines from *text*.

    Such runs are removed entirely (not collapsed to one newline); single
    newlines are left untouched.
    """
    newline_runs = re.compile(r'\n{2,}')
    return newline_runs.sub('', text)
def remove_bracket(text):
    """Return *text* with every ``【`` and ``】`` character removed."""
    stripped = text
    for bracket in (u'【', u'】'):
        stripped = stripped.replace(bracket, '')
    return stripped
def remove_index(text):
    """Normalise markers in *text*, then strip a leading ``【digits】`` index.

    The text is first passed through ``make_singular_character``; a bracketed
    run of Chinese digits at the very start of the result, if present, is
    removed.
    """
    normalised = make_singular_character(text)
    leading_index = re.compile(u'^【[○一二三四五六七八九]+】')
    return leading_index.sub('', normalised)
def change_shiwen_to_shi(text):
    """Shorten every ``【釋文】`` marker in *text* to ``【釋】``."""
    long_marker = u'【釋文】'
    short_marker = u'【釋】'
    return text.replace(long_marker, short_marker)
def make_singular_character(text):
    """Reduce multi-character category markers to a single character.

    Any bracket group ``【…】`` containing one of 注, 疏, 釋 or 校 is replaced
    by ``【X】``, where ``X`` is the last such character in the group (for
    example ``【釋文】`` becomes ``【釋】``).  Groups without a category
    character, such as an index ``【一】``, are left untouched.

    The match is confined to a single bracket group with ``[^【】]*``.  A
    greedy ``.*`` would span from the first ``【`` to the last ``】`` on the
    line and swallow everything in between, including the annotation text
    between two markers.
    """
    return re.sub(
        u'【[^【】]*([注疏釋校])[^【】]*】', u'【\\1】', text, flags=re.UNICODE)
def main():
    """Rewrite every chapter file under ``~/Downloads/zhuangzi/OPS`` in place.

    Each file in that directory whose name ends with ``html`` is parsed with
    ``html.parser``, passed through ``convert_inline_annotations``, given an
    ``xmlns:epub`` attribute on ``<html>`` when it lacks one, and written
    back to the same path with runs of blank lines removed.  Progress is
    printed to stdout.
    """
    ops_path = os.path.join(
        os.path.expanduser('~'), 'Downloads', 'zhuangzi', 'OPS')
    chapters = sorted(
        os.path.join(ops_path, name)
        for name in os.listdir(ops_path)
        if name.endswith('html')
    )

    for chapter in chapters:
        print("*" * 100)
        print(chapter)
        with open(chapter) as f:
            content = f.read()

        html = convert_inline_annotations(bs(content, 'html.parser'))
        # The generated footnotes use epub:type attributes, so the namespace
        # has to be declared on the root element.
        if 'xmlns:epub' not in html.html.attrs:
            html.html['xmlns:epub'] = "http://www.idpf.org/2007/ops"

        with open(chapter, 'wb') as f:
            f.write(remove_multiple_newines(str(html)))


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment