# digglife/reformat_annotation.py

## reformat_annotation.py
#!/usr/bin/env python
# encoding:utf-8
import re
import os
import json
from bs4 import BeautifulSoup as bs

# Chinese numerals in positional order: the offset of a character in this
# string is the decimal digit it is rendered as (○ -> 0, 一 -> 1, ... 九 -> 9).
chinese_number = u'○一二三四五六七八九'

# Single-character annotation category markers, mapped to the name that
# add_span_for_marker puts into the span's class attribute.
annotation_cats = dict([
    (u'注', 'zhu'),
    (u'疏', 'shu'),
    (u'釋', 'shi'),
    (u'校', 'jiao'),
])

def convert_inline_annotations(soup):
    """Regroup the ``<p>`` elements of *soup* into main texts and annotations.

    A paragraph whose text starts with ``【`` or ``◎`` is treated as an
    annotation of the most recent main-text paragraph; any other paragraph
    starts a new main text.  An annotation opening with a Chinese-numeral
    index (e.g. ``【一】``) is stored under that index; one without an index
    is appended to the most recently indexed annotation of the same main
    text.  Consumed paragraphs are removed from the tree, the collected data
    is printed as JSON, and the document is rebuilt by
    ``get_text_with_markers``.

    Args:
        soup: parsed document exposing ``findAll`` (a BeautifulSoup object).

    Returns:
        Whatever ``get_text_with_markers`` returns for the collected data.
    """
    index_pattern = re.compile(u'^【([○一二三四五六七八九]+)】')
    not_inline_message = u"This is not a inline annotation. =>{}<="
    format_data = []
    annotation_index = None

    for p in soup.findAll('p'):
        text = p.text
        if not text.startswith((u'【', u'◎')):
            # Main text: open a new entry that later annotations attach to.
            format_data.append({'text': text, 'annotations': {}})
            p.extract()
            continue

        if not format_data:
            # An annotation-like paragraph before any main text has nothing
            # to attach to (this used to raise IndexError); report it and
            # leave it in the document.
            print(not_inline_message.format(text))
            continue

        annotations = format_data[-1]['annotations']
        match = index_pattern.match(text)
        if match:
            annotation_index = match.group(1)
            annotations[annotation_index] = remove_index(text)
            p.extract()
        elif annotations:
            # Continuation of the last indexed annotation of this main text.
            annotations[annotation_index] += remove_index(text)
            p.extract()
        else:
            print(not_inline_message.format(text))

    print(json.dumps(format_data, indent=2, ensure_ascii=False))
    return get_text_with_markers(format_data, soup)

def get_text_with_annotations(format_data, soup):
    """Append every main text to ``soup.div`` with its annotations inlined.

    Each ``【index】`` marker in a main text is replaced by a
    ``<span class='annotation'>`` holding the annotation after it has been
    passed through ``add_span_for_marker``.

    Args:
        format_data: list of ``{'text': ..., 'annotations': {index: text}}``
            dicts, as built by ``convert_inline_annotations``.
        soup: document whose first ``<div>`` receives the new paragraphs.

    Returns:
        The same *soup* object.
    """
    for data in format_data:
        main_text = data['text']
        for index, items in data['annotations'].items():
            annotation = add_span_for_marker(items)
            annotations_span = u"<span class='annotation'>{}</span>".format(annotation)
            main_text = main_text.replace(u'【{}】'.format(index), annotations_span)
        # Name the parser explicitly (the same one main() uses) so the
        # fragment is parsed identically regardless of which optional
        # parsers are installed.
        main_text_p = bs(u"<p>{}</p>".format(main_text), 'html.parser').p
        soup.div.append(main_text_p)
    return soup

def _annotation_sort_key(index):
    """Return a key that orders Chinese-numeral indices by numeric value."""
    digits = [chinese_number.find(char) for char in index]
    if not digits or -1 in digits:
        # Not a pure numeral: sort after the numbered ones, by text.
        return (1, 0, index)
    return (0, int(''.join(str(digit) for digit in digits)), index)


def get_text_with_markers(format_data, soup):
    """Append main texts and their footnotes to ``soup.div``.

    Each ``【index】`` marker in a main text is replaced by the link from
    ``add_footnote_link``; the annotations follow the paragraph as elements
    built by ``add_footnote_id``.

    Args:
        format_data: list of ``{'text': ..., 'annotations': {index: text}}``
            dicts, as built by ``convert_inline_annotations``.
        soup: document whose first ``<div>`` receives the new elements.

    Returns:
        The same *soup* object.
    """
    for text_index, data in enumerate(format_data):
        main_text = data['text']
        footnotes = []
        # Plain sorted() would order the numeral keys by code point
        # (一, 七, 三, 二, ...); sort by the number they denote instead.
        ordered = sorted(data['annotations'].items(),
                         key=lambda pair: _annotation_sort_key(pair[0]))
        for index, items in ordered:
            main_text = main_text.replace(
                u'【{}】'.format(index), add_footnote_link(text_index, index))
            footnotes.append(add_footnote_id(items, text_index, index))

        # Name the parser explicitly (the same one main() uses) so fragments
        # are not wrapped in <html><body> by a different installed parser.
        soup.div.append(bs(u"<p>{}</p>".format(main_text), 'html.parser').p)
        for footnote in footnotes:
            # Append the <aside> tag itself, not the wrapping document object.
            soup.div.append(bs(footnote, 'html.parser').aside)

    return soup

def add_footnote_link(p_index, a_index):
    """Build the ``noteref`` link that replaces an annotation marker.

    Args:
        p_index: index of the main-text paragraph.
        a_index: annotation index written in Chinese numerals (e.g. ``一二``).

    Returns:
        Unicode markup for an ``<a>`` pointing at ``#p<p_index>a<digits>``.
    """
    # Convert character by character: a single find() on a multi-character
    # index gave wrong or colliding ids (一二 -> 1, 一○ -> -1).
    a_index_ascii = ''.join(str(chinese_number.find(char)) for char in a_index)
    footnote_link = u'<a epub:type="noteref" href="#p{0}a{1}" id="p{0}a{1}ref"><sup>{1}</sup></a>'.format(p_index, a_index_ascii)
    return footnote_link

def add_footnote_id(content, p_index, a_index):
    """Wrap an annotation in a footnote ``<aside>`` with a matching id.

    Args:
        content: annotation text placed inside the element.
        p_index: index of the main-text paragraph.
        a_index: annotation index written in Chinese numerals (e.g. ``一二``).

    Returns:
        Unicode markup for an ``<aside>`` whose id is ``p<p_index>a<digits>``.
    """
    # Convert character by character: a single find() on a multi-character
    # index gave wrong or colliding ids (一二 -> 1, 一○ -> -1).
    a_index_ascii = ''.join(str(chinese_number.find(char)) for char in a_index)
    footnote = u'<aside epub:type="footnote" id="p{1}a{2}">{0}</aside>'.format(content, p_index, a_index_ascii)
    return footnote

def get_annotation_cat(text):
    """Return the category character of the first non-numeric marker in *text*.

    The first ``【...】`` group that contains no Chinese numeral is reduced
    by ``make_singular_character`` and returned without its brackets.

    Returns:
        The marker content, or ``None`` when *text* has no such marker
        (previously this raised AttributeError on the failed match).
    """
    match = re.search(u'【[^○一二三四五六七八九】]+】', text)
    if match is None:
        return None
    marker = make_singular_character(match.group(0))
    return marker.replace(u'【', '').replace(u'】', '')

def add_span_for_marker(text):
    """Replace bracketed category markers in *text* with styled spans.

    Markers are first reduced by ``make_singular_character``; every
    ``【X】`` whose ``X`` is a key of ``annotation_cats`` then becomes
    ``<span class="common NAME">X</span>``, NAME being the mapped value.
    """
    result = make_singular_character(text)
    # items() behaves the same as iteritems() here and also exists on Python 3.
    for marker, class_name in annotation_cats.items():
        result = result.replace(
            u'【{}】'.format(marker),
            u'<span class="common {0}">{1}</span>'.format(class_name, marker)
        )
    return result


def remove_multiple_newines(text):
    """Delete every run of two or more consecutive newlines from *text*.

    Single newlines are left untouched.
    """
    newline_run = re.compile(r'\n{2,}')
    return newline_run.sub('', text)

def remove_bracket(text):
    """Return *text* with every ``【`` and ``】`` character removed."""
    for bracket in (u'【', u'】'):
        text = text.replace(bracket, '')
    return text

def remove_index(text):
    """Strip a leading Chinese-numeral index such as ``【一】`` from *text*.

    The text is first passed through ``make_singular_character``; only an
    index at the very start of the result is removed.
    """
    # Plain u'' literal: the ur'' prefix is a syntax error on Python 3, and
    # the pattern has no backslashes, so its meaning is unchanged.
    normalized = make_singular_character(text)
    return re.sub(u'^【[○一二三四五六七八九]+】', '', normalized)

def change_shiwen_to_shi(text):
    """Shorten every ``【釋文】`` marker in *text* to ``【釋】``."""
    long_marker, short_marker = u'【釋文】', u'【釋】'
    return text.replace(long_marker, short_marker)

def make_singular_character(text):
    """Reduce each bracketed category marker to its single category character.

    A ``【...】`` group containing one of 注, 疏, 釋 or 校 is rewritten as
    ``【X】`` (for example ``【釋文】`` becomes ``【釋】``); when a group
    holds several of them the last one is kept.  Groups without a category
    character, and all text outside brackets, are left unchanged.
    """
    # Matching is confined to one bracket pair.  The former greedy ``.*``
    # ran from the first 【 to the last 】 on a line and so deleted every
    # marker and all text in between.
    return re.sub(
        u'【[^【】\\n]*([注疏釋校])[^【】\\n]*】', u'【\\1】', text,
        flags=re.UNICODE)

def main(ops_path=None):
    """Rewrite the annotations of every chapter file in place.

    Every file in *ops_path* whose name ends with ``html`` is parsed, passed
    through ``convert_inline_annotations``, given an ``xmlns:epub`` attribute
    on its ``<html>`` element when that is missing, and written back over the
    original file with runs of blank lines removed.

    Args:
        ops_path: directory holding the chapter files; defaults to
            ``~/Downloads/zhuangzi/OPS``.
    """
    if ops_path is None:
        ops_path = os.path.join(
            os.path.expanduser('~'), 'Downloads', 'zhuangzi', 'OPS')

    chapters = sorted(
        os.path.join(ops_path, name)
        for name in os.listdir(ops_path)
        if name.endswith('html'))

    for c in chapters:
        print("*" * 100)
        print(c)
        with open(c) as f:
            content = f.read()

        html = bs(content, 'html.parser')
        html = convert_inline_annotations(html)

        # A document without an <html> element has nowhere to declare the
        # namespace; skip it instead of failing on None.
        if html.html is not None and 'xmlns:epub' not in html.html.attrs:
            html.html['xmlns:epub'] = "http://www.idpf.org/2007/ops"

        # Serialise to text, clean up, then encode explicitly.  On Python 2
        # this yields the same bytes as str(html); it also works where str()
        # returns text that cannot be written to a binary file.
        cleaned = remove_multiple_newines(html.decode())
        with open(c, 'wb') as f:
            f.write(cleaned.encode('utf-8'))

# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# encoding:utf-8
	import re
	import os
	import json
	from bs4 import BeautifulSoup as bs

	# Chinese numerals in positional order: the offset of a character in this
	# string is the decimal digit it is rendered as (○ -> 0, 一 -> 1, ... 九 -> 9).
	chinese_number = u'○一二三四五六七八九'

	# Single-character annotation category markers, mapped to the name that
	# add_span_for_marker puts into the span's class attribute.
	annotation_cats = dict([
	    (u'注', 'zhu'),
	    (u'疏', 'shu'),
	    (u'釋', 'shi'),
	    (u'校', 'jiao'),
	])

	def convert_inline_annotations(soup):
	    """Regroup the ``<p>`` elements of *soup* into main texts and annotations.

	    A paragraph whose text starts with ``【`` or ``◎`` is treated as an
	    annotation of the most recent main-text paragraph; any other paragraph
	    starts a new main text.  An annotation opening with a Chinese-numeral
	    index (e.g. ``【一】``) is stored under that index; one without an index
	    is appended to the most recently indexed annotation of the same main
	    text.  Consumed paragraphs are removed from the tree, the collected data
	    is printed as JSON, and the document is rebuilt by
	    ``get_text_with_markers``.

	    Args:
	        soup: parsed document exposing ``findAll`` (a BeautifulSoup object).

	    Returns:
	        Whatever ``get_text_with_markers`` returns for the collected data.
	    """
	    index_pattern = re.compile(u'^【([○一二三四五六七八九]+)】')
	    not_inline_message = u"This is not a inline annotation. =>{}<="
	    format_data = []
	    annotation_index = None

	    for p in soup.findAll('p'):
	        text = p.text
	        if not text.startswith((u'【', u'◎')):
	            # Main text: open a new entry that later annotations attach to.
	            format_data.append({'text': text, 'annotations': {}})
	            p.extract()
	            continue

	        if not format_data:
	            # An annotation-like paragraph before any main text has nothing
	            # to attach to (indexing the empty list would raise
	            # IndexError); report it and leave it in the document.
	            print(not_inline_message.format(text))
	            continue

	        annotations = format_data[-1]['annotations']
	        match = index_pattern.match(text)
	        if match:
	            annotation_index = match.group(1)
	            annotations[annotation_index] = remove_index(text)
	            p.extract()
	        elif annotations:
	            # Continuation of the last indexed annotation of this main text.
	            annotations[annotation_index] += remove_index(text)
	            p.extract()
	        else:
	            print(not_inline_message.format(text))

	    print(json.dumps(format_data, indent=2, ensure_ascii=False))
	    return get_text_with_markers(format_data, soup)

	def get_text_with_annotations(format_data, soup):
	    """Append every main text to ``soup.div`` with its annotations inlined.

	    Each ``【index】`` marker in a main text is replaced by a
	    ``<span class='annotation'>`` holding the annotation after it has been
	    passed through ``add_span_for_marker``.

	    Args:
	        format_data: list of ``{'text': ..., 'annotations': {index: text}}``
	            dicts, as built by ``convert_inline_annotations``.
	        soup: document whose first ``<div>`` receives the new paragraphs.

	    Returns:
	        The same *soup* object.
	    """
	    for data in format_data:
	        main_text = data['text']
	        for index, items in data['annotations'].items():
	            annotation = add_span_for_marker(items)
	            annotations_span = u"<span class='annotation'>{}</span>".format(annotation)
	            main_text = main_text.replace(u'【{}】'.format(index), annotations_span)
	        # Name the parser explicitly (the same one main() uses) so the
	        # fragment is parsed identically regardless of which optional
	        # parsers are installed.
	        main_text_p = bs(u"<p>{}</p>".format(main_text), 'html.parser').p
	        soup.div.append(main_text_p)
	    return soup

	def _annotation_sort_key(index):
	    """Return a key that orders Chinese-numeral indices by numeric value."""
	    digits = [chinese_number.find(char) for char in index]
	    if not digits or -1 in digits:
	        # Not a pure numeral: sort after the numbered ones, by text.
	        return (1, 0, index)
	    return (0, int(''.join(str(digit) for digit in digits)), index)


	def get_text_with_markers(format_data, soup):
	    """Append main texts and their footnotes to ``soup.div``.

	    Each ``【index】`` marker in a main text is replaced by the link from
	    ``add_footnote_link``; the annotations follow the paragraph as elements
	    built by ``add_footnote_id``.

	    Args:
	        format_data: list of ``{'text': ..., 'annotations': {index: text}}``
	            dicts, as built by ``convert_inline_annotations``.
	        soup: document whose first ``<div>`` receives the new elements.

	    Returns:
	        The same *soup* object.
	    """
	    for text_index, data in enumerate(format_data):
	        main_text = data['text']
	        footnotes = []
	        # Plain sorted() would order the numeral keys by code point
	        # (一, 七, 三, 二, ...); sort by the number they denote instead.
	        ordered = sorted(data['annotations'].items(),
	                         key=lambda pair: _annotation_sort_key(pair[0]))
	        for index, items in ordered:
	            main_text = main_text.replace(
	                u'【{}】'.format(index), add_footnote_link(text_index, index))
	            footnotes.append(add_footnote_id(items, text_index, index))

	        # Name the parser explicitly (the same one main() uses) so fragments
	        # are not wrapped in <html><body> by a different installed parser.
	        soup.div.append(bs(u"<p>{}</p>".format(main_text), 'html.parser').p)
	        for footnote in footnotes:
	            # Append the <aside> tag itself, not the wrapping document object.
	            soup.div.append(bs(footnote, 'html.parser').aside)

	    return soup

	def add_footnote_link(p_index, a_index):
	    """Build the ``noteref`` link that replaces an annotation marker.

	    Args:
	        p_index: index of the main-text paragraph.
	        a_index: annotation index written in Chinese numerals (e.g. ``一二``).

	    Returns:
	        Unicode markup for an ``<a>`` pointing at ``#p<p_index>a<digits>``.
	    """
	    # Convert character by character: a single find() on a multi-character
	    # index gives wrong or colliding ids (一二 -> 1, 一○ -> -1).
	    a_index_ascii = ''.join(str(chinese_number.find(char)) for char in a_index)
	    footnote_link = u'<a epub:type="noteref" href="#p{0}a{1}" id="p{0}a{1}ref"><sup>{1}</sup></a>'.format(p_index, a_index_ascii)
	    return footnote_link

	def add_footnote_id(content, p_index, a_index):
	    """Wrap an annotation in a footnote ``<aside>`` with a matching id.

	    Args:
	        content: annotation text placed inside the element.
	        p_index: index of the main-text paragraph.
	        a_index: annotation index written in Chinese numerals (e.g. ``一二``).

	    Returns:
	        Unicode markup for an ``<aside>`` whose id is ``p<p_index>a<digits>``.
	    """
	    # Convert character by character: a single find() on a multi-character
	    # index gives wrong or colliding ids (一二 -> 1, 一○ -> -1).
	    a_index_ascii = ''.join(str(chinese_number.find(char)) for char in a_index)
	    footnote = u'<aside epub:type="footnote" id="p{1}a{2}">{0}</aside>'.format(content, p_index, a_index_ascii)
	    return footnote

	def get_annotation_cat(text):
	    """Return the category character of the first non-numeric marker in *text*.

	    The first ``【...】`` group that contains no Chinese numeral is reduced
	    by ``make_singular_character`` and returned without its brackets.

	    Returns:
	        The marker content, or ``None`` when *text* has no such marker
	        (instead of failing with AttributeError on the missing match).
	    """
	    match = re.search(u'【[^○一二三四五六七八九】]+】', text)
	    if match is None:
	        return None
	    marker = make_singular_character(match.group(0))
	    return marker.replace(u'【', '').replace(u'】', '')

	def add_span_for_marker(text):
	    """Replace bracketed category markers in *text* with styled spans.

	    Markers are first reduced by ``make_singular_character``; every
	    ``【X】`` whose ``X`` is a key of ``annotation_cats`` then becomes
	    ``<span class="common NAME">X</span>``, NAME being the mapped value.
	    """
	    result = make_singular_character(text)
	    # items() behaves the same as iteritems() here and also exists on Python 3.
	    for marker, class_name in annotation_cats.items():
	        result = result.replace(
	            u'【{}】'.format(marker),
	            u'<span class="common {0}">{1}</span>'.format(class_name, marker)
	        )
	    return result


	def remove_multiple_newines(text):
	    """Delete every run of two or more consecutive newlines from *text*.

	    Single newlines are left untouched.
	    """
	    # Body indentation restored; the flattened original could not be parsed.
	    return re.sub(r'\n{2,}', '', text)

	def remove_bracket(text):
	    """Return *text* with every ``【`` and ``】`` character removed."""
	    # Body indentation restored; the flattened original could not be parsed.
	    return text.replace(u'【', '').replace(u'】', '')

	def remove_index(text):
	    """Strip a leading Chinese-numeral index such as ``【一】`` from *text*.

	    The text is first passed through ``make_singular_character``; only an
	    index at the very start of the result is removed.
	    """
	    # Body indentation restored.  Plain u'' literal: the ur'' prefix is a
	    # syntax error on Python 3, and the pattern has no backslashes, so its
	    # meaning is unchanged.
	    normalized = make_singular_character(text)
	    return re.sub(u'^【[○一二三四五六七八九]+】', '', normalized)

	def change_shiwen_to_shi(text):
	    """Shorten every ``【釋文】`` marker in *text* to ``【釋】``."""
	    # Body indentation restored; the flattened original could not be parsed.
	    return text.replace(u'【釋文】', u'【釋】')

	def make_singular_character(text):
	    """Reduce each bracketed category marker to its single category character.

	    A ``【...】`` group containing one of 注, 疏, 釋 or 校 is rewritten as
	    ``【X】`` (for example ``【釋文】`` becomes ``【釋】``); when a group
	    holds several of them the last one is kept.  Groups without a category
	    character, and all text outside brackets, are left unchanged.
	    """
	    # The flattened pattern had lost its repetition operators and only
	    # matched exactly three characters between the brackets.  Matching is
	    # now confined to one bracket pair of any length, so it can never
	    # swallow text lying between two separate markers.
	    return re.sub(
	        u'【[^【】\\n]*([注疏釋校])[^【】\\n]*】', u'【\\1】', text,
	        flags=re.UNICODE)

	def main(ops_path=None):
	    """Rewrite the annotations of every chapter file in place.

	    Every file in *ops_path* whose name ends with ``html`` is parsed, passed
	    through ``convert_inline_annotations``, given an ``xmlns:epub`` attribute
	    on its ``<html>`` element when that is missing, and written back over the
	    original file with runs of blank lines removed.

	    Args:
	        ops_path: directory holding the chapter files; defaults to
	            ``~/Downloads/zhuangzi/OPS``.
	    """
	    if ops_path is None:
	        ops_path = os.path.join(
	            os.path.expanduser('~'), 'Downloads', 'zhuangzi', 'OPS')

	    chapters = sorted(
	        os.path.join(ops_path, name)
	        for name in os.listdir(ops_path)
	        if name.endswith('html'))

	    for c in chapters:
	        # Separator line; the flattened text had lost the ``*`` characters.
	        print("*" * 100)
	        print(c)
	        with open(c) as f:
	            content = f.read()

	        html = bs(content, 'html.parser')
	        html = convert_inline_annotations(html)

	        # A document without an <html> element has nowhere to declare the
	        # namespace; skip it instead of failing on None.
	        if html.html is not None and 'xmlns:epub' not in html.html.attrs:
	            html.html['xmlns:epub'] = "http://www.idpf.org/2007/ops"

	        # Serialise to text, clean up, then encode explicitly.  On Python 2
	        # this yields the same bytes as str(html); it also works where str()
	        # returns text that cannot be written to a binary file.
	        cleaned = remove_multiple_newines(html.decode())
	        with open(c, 'wb') as f:
	            f.write(cleaned.encode('utf-8'))

	# Run the conversion only when this file is executed as a script
	# (indentation of the guarded call restored).
	if __name__ == '__main__':
	    main()