g0ddest/splitter.py

## splitter.py
from lxml import etree
import copy
from bs4 import BeautifulSoup
import re


class Splitter:
    """Split an FB2 book into one HTML file per ``section`` element.

    ``parse()`` reads ``xml_file``, visits every element whose local tag
    name is ``section`` (case-insensitively), strips the sections nested
    directly inside it, converts the remaining markup with
    ``fb2_to_html()`` and writes it to ``book/<index>.html``.
    """

    # Not read by the methods below (parse() uses a local ``ns``); kept so
    # that existing references to ``Splitter.ns`` keep working.
    ns = ''
    # Path of the FB2 document read by parse().
    xml_file = "book.fb2"

    @staticmethod
    def fb2_to_html(text):
        """Rewrite FB2 tag names in *text* to their HTML counterparts.

        Args:
            text: Serialized FB2 markup; it is handed unchanged to
                ``BeautifulSoup(text, 'lxml')``.

        Returns:
            bytes: The converted document, ``str(soup)`` encoded as UTF-8.
        """
        part = BeautifulSoup(text, 'lxml')

        for section in part.find_all('section'):
            # Drop every attribute before the section becomes a plain div.
            for attr in list(section.attrs):
                del section[attr]
            section.name = 'div'

        # Plain one-to-one tag renames (FB2 name -> HTML name).
        renames = (
            ('strong', 'b'),
            ('emphasis', 'i'),
            ('empty-line', 'br'),
            ('title', 'h1'),
        )
        for fb2_name, html_name in renames:
            for tag in part.find_all(fb2_name):
                tag.name = html_name

        for image in part.find_all('image'):
            image.name = 'img'
            # First attribute whose name matches the href pattern, if any.
            href = next(
                (name for name in image.attrs if re.match("(.+)?href", name)),
                None,
            )
            if href is not None:
                # The reference becomes a path under "i/", with any '#'
                # characters stripped from both ends of the value.
                image['src'] = 'i/' + image[href].strip("#")
                del image[href]

        return str(part).encode('utf-8')

    def parse(self):
        """Write every section of ``self.xml_file`` to ``book/<n>.html``.

        Sections are numbered from 0 in the order ``tree.iter()`` yields
        them. Each output file holds the section without its direct child
        sections, which are visited (and written) on their own.

        Raises:
            OSError: If the input cannot be read or an output file cannot
                be written.
            lxml.etree.XMLSyntaxError: If the input is not well-formed XML.
        """
        import os

        tree = etree.parse(self.xml_file, etree.XMLParser(ns_clean=True))
        ns = etree.QName(tree.getroot()).namespace

        # The original assumed this directory already existed.
        os.makedirs("book", exist_ok=True)

        i = 0
        for elem in tree.iter():
            # Comments and processing instructions are yielded too, but
            # their tag is not a string and etree.QName() rejects it.
            if not isinstance(elem.tag, str):
                continue
            if etree.QName(elem.tag).localname.lower() != 'section':
                continue

            # Work on a copy so removing nested sections does not alter
            # the tree that is still being iterated.
            sc = copy.copy(elem)
            for section in sc.findall(etree.QName(ns, 'section')):
                sc.remove(section)

            # NOTE(review): prints each section to stdout; looks like
            # leftover debug output - confirm before removing.
            etree.dump(sc)

            # Convert before opening the file so a conversion error does
            # not leave an empty output file behind.
            content = self.fb2_to_html(etree.tostring(sc, encoding='utf-8'))
            with open("book/{}.html".format(i), "wb") as f:
                f.write(content)
            i += 1


# Script entry: build a Splitter with its class-level defaults and split
# "book.fb2" into per-section HTML files right away.
# NOTE(review): this also runs whenever the module is imported; consider an
# `if __name__ == "__main__":` guard if it is ever used as a library.
splitter = Splitter()
splitter.parse()
	from lxml import etree
	import copy
	from bs4 import BeautifulSoup
	import re


	class Splitter:
	    """Split an FB2 book into one HTML file per ``section`` element.

	    ``parse()`` reads ``xml_file``, visits every element whose local tag
	    name is ``section`` (case-insensitively), strips the sections nested
	    directly inside it, converts the remaining markup with
	    ``fb2_to_html()`` and writes it to ``book/<index>.html``.
	    """

	    # Not read by the methods below (parse() uses a local ``ns``); kept so
	    # that existing references to ``Splitter.ns`` keep working.
	    ns = ''
	    # Path of the FB2 document read by parse().
	    xml_file = "book.fb2"

	    @staticmethod
	    def fb2_to_html(text):
	        """Rewrite FB2 tag names in *text* to their HTML counterparts.

	        Args:
	            text: Serialized FB2 markup; it is handed unchanged to
	                ``BeautifulSoup(text, 'lxml')``.

	        Returns:
	            bytes: The converted document, ``str(soup)`` encoded as UTF-8.
	        """
	        part = BeautifulSoup(text, 'lxml')

	        for section in part.find_all('section'):
	            # Drop every attribute before the section becomes a plain div.
	            for attr in list(section.attrs):
	                del section[attr]
	            section.name = 'div'

	        # Plain one-to-one tag renames (FB2 name -> HTML name).
	        renames = (
	            ('strong', 'b'),
	            ('emphasis', 'i'),
	            ('empty-line', 'br'),
	            ('title', 'h1'),
	        )
	        for fb2_name, html_name in renames:
	            for tag in part.find_all(fb2_name):
	                tag.name = html_name

	        for image in part.find_all('image'):
	            image.name = 'img'
	            # First attribute whose name matches the href pattern, if any.
	            href = next(
	                (name for name in image.attrs if re.match("(.+)?href", name)),
	                None,
	            )
	            if href is not None:
	                # The reference becomes a path under "i/", with any '#'
	                # characters stripped from both ends of the value.
	                image['src'] = 'i/' + image[href].strip("#")
	                del image[href]

	        return str(part).encode('utf-8')

	    def parse(self):
	        """Write every section of ``self.xml_file`` to ``book/<n>.html``.

	        Sections are numbered from 0 in the order ``tree.iter()`` yields
	        them. Each output file holds the section without its direct child
	        sections, which are visited (and written) on their own.

	        Raises:
	            OSError: If the input cannot be read or an output file cannot
	                be written.
	            lxml.etree.XMLSyntaxError: If the input is not well-formed XML.
	        """
	        import os

	        tree = etree.parse(self.xml_file, etree.XMLParser(ns_clean=True))
	        ns = etree.QName(tree.getroot()).namespace

	        # The original assumed this directory already existed.
	        os.makedirs("book", exist_ok=True)

	        i = 0
	        for elem in tree.iter():
	            # Comments and processing instructions are yielded too, but
	            # their tag is not a string and etree.QName() rejects it.
	            if not isinstance(elem.tag, str):
	                continue
	            if etree.QName(elem.tag).localname.lower() != 'section':
	                continue

	            # Work on a copy so removing nested sections does not alter
	            # the tree that is still being iterated.
	            sc = copy.copy(elem)
	            for section in sc.findall(etree.QName(ns, 'section')):
	                sc.remove(section)

	            # NOTE(review): prints each section to stdout; looks like
	            # leftover debug output - confirm before removing.
	            etree.dump(sc)

	            # Convert before opening the file so a conversion error does
	            # not leave an empty output file behind.
	            content = self.fb2_to_html(etree.tostring(sc, encoding='utf-8'))
	            with open("book/{}.html".format(i), "wb") as f:
	                f.write(content)
	            i += 1


	# Script entry: build a Splitter with its class-level defaults and split
	# "book.fb2" into per-section HTML files right away.
	# NOTE(review): these statements are indented by one tab without an
	# enclosing block, and they run on import as well - confirm the intended
	# nesting and consider an `if __name__ == "__main__":` guard.
	splitter = Splitter()
	splitter.parse()