Skip to content

Instantly share code, notes, and snippets.

@g0ddest
Created January 10, 2019 22:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save g0ddest/716aeffaadf87425dea621880eeba9d6 to your computer and use it in GitHub Desktop.
Save g0ddest/716aeffaadf87425dea621880eeba9d6 to your computer and use it in GitHub Desktop.
# Dependencies (third-party): lxml, beautifulsoup4
import copy
import os
import re

from bs4 import BeautifulSoup
from lxml import etree
class Splitter:
    """Split an FB2 book into one HTML file per ``<section>`` element.

    ``parse`` walks the XML document named by ``xml_file``; every element
    whose local tag name is ``section`` is written, without its directly
    nested sections, to ``<output_dir>/<n>.html`` where ``n`` counts the
    sections in document order starting at 0.
    """

    # Not read by ``parse`` (it takes the namespace from the document root);
    # kept so existing code that references ``Splitter.ns`` keeps working.
    ns = ''
    # Path of the FB2 (XML) file to split.
    xml_file = "book.fb2"
    # Directory that receives the numbered HTML files; created when missing.
    output_dir = "book"

    # FB2 tag name -> HTML tag name for the plain one-to-one renames.
    _TAG_RENAMES = (
        ('strong', 'b'),
        ('emphasis', 'i'),
        ('empty-line', 'br'),
        ('title', 'h1'),
    )

    @staticmethod
    def fb2_to_html(text):
        """Convert a serialized FB2 fragment to HTML.

        Args:
            text: Markup of the fragment, as accepted by ``BeautifulSoup``.

        Returns:
            The converted document as UTF-8 encoded ``bytes``.
        """
        part = BeautifulSoup(text, 'lxml')

        # Sections become attribute-free <div> containers.
        for section in part.find_all('section'):
            for attr in list(section.attrs):
                del section[attr]
            section.name = 'div'

        for fb2_name, html_name in Splitter._TAG_RENAMES:
            for tag in part.find_all(fb2_name):
                tag.name = html_name

        for image in part.find_all('image'):
            image.name = 'img'
            # The link attribute may carry a namespace prefix, so pick the
            # first attribute whose name matches the "...href" pattern.
            href = next(
                (name for name in image.attrs if re.match(r"(.+)?href", name)),
                None,
            )
            if href is not None:
                # Values look like "#cover.jpg"; drop the fragment marker and
                # point into the "i/" directory.
                image['src'] = 'i/' + image[href].strip("#")
                del image[href]

        return str(part).encode('utf-8')

    def parse(self):
        """Write every section of ``xml_file`` to its own HTML file.

        Creates ``output_dir`` if needed, then writes ``0.html``,
        ``1.html``, ... into it, one file per section in document order.

        Raises:
            OSError: if the input cannot be read or an output file cannot
                be written.
            lxml.etree.XMLSyntaxError: if the input is not well-formed XML.
        """
        tree = etree.parse(self.xml_file, etree.XMLParser(ns_clean=True))
        ns = etree.QName(tree.getroot()).namespace
        nested_section = etree.QName(ns, 'section')

        os.makedirs(self.output_dir, exist_ok=True)

        i = 0
        for elem in tree.iter():
            # Comments and processing instructions have a non-string tag
            # that QName() rejects; only real elements can be sections.
            if not isinstance(elem.tag, str):
                continue
            if etree.QName(elem.tag).localname.lower() != 'section':
                continue

            # Work on a copy so the nested sections can be stripped without
            # disturbing the tree that is still being iterated; they get
            # their own files when the iteration reaches them.
            sc = copy.copy(elem)
            for section in sc.findall(nested_section):
                sc.remove(section)

            # Echo the section to stdout, as the original script did.
            etree.dump(sc)

            # Convert before opening the file so a failed conversion does
            # not leave an empty output file behind.
            content = self.fb2_to_html(etree.tostring(sc, encoding='utf-8'))
            target = os.path.join(self.output_dir, "{}.html".format(i))
            with open(target, "wb") as f:
                f.write(content)
            i += 1
# Script driver: runs on execution (and on import, as there is no __main__
# guard). Splits the file named by Splitter.xml_file into per-section HTML files.
splitter = Splitter()
splitter.parse()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment