Skip to content

Instantly share code, notes, and snippets.

@57uff3r
Forked from yoihito/fb2transformer.py
Last active August 29, 2015 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 57uff3r/d518d900b85941794157 to your computer and use it in GitHub Desktop.
Save 57uff3r/d518d900b85941794157 to your computer and use it in GitHub Desktop.
import xml.sax
import xml.sax.handler
import re
class BodyNotFound(Exception):
    """Raised when a parsed document contains no ``<body>`` element."""
class BookHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects the character data inside ``<body>``.

    Attributes:
        inBody: True while the parser is between a ``<body>`` start tag and
            its matching end tag.
        text: ``None`` until the first ``<body>`` start tag is seen; after
            that, the concatenated character data of every ``<body>``
            element that has been closed so far.
    """

    def __init__(self):
        # Explicit base-class call (rather than super()) also works where
        # ContentHandler is an old-style class.
        xml.sax.handler.ContentHandler.__init__(self)
        self.inBody = False
        self.text = None
        # Character chunks of the body currently being read. Joined once per
        # body instead of growing ``self.text`` on every callback.
        self._chunks = []

    def startElement(self, name, attr):
        """Start collecting text when a ``<body>`` element opens."""
        if name == "body":
            if self.text is None:
                # Distinguishes "body present but empty" from "no body".
                self.text = ""
            self.inBody = True

    def characters(self, data):
        """Buffer character data that occurs inside a ``<body>`` element."""
        if self.inBody:
            self._chunks.append(data)

    def endElement(self, name):
        """Stop collecting at ``</body>`` and fold the buffer into ``text``."""
        if name == "body":
            self.inBody = False
            if self._chunks:
                self.text = (self.text or "") + "".join(self._chunks)
                self._chunks = []
class Fb2Transformer:
    """Extracts normalised body text from a book and substitutes words."""

    def read(self, filename):
        """Return the lower-cased, letters-only text of the ``<body>`` elements.

        The document is parsed with SAX using ``BookHandler``. Every run of
        characters other than ASCII letters is replaced by a single space
        before the result is lower-cased.

        Args:
            filename: Source handed to the SAX parser's ``parse()``.

        Returns:
            The normalised body text.

        Raises:
            BodyNotFound: If the document has no ``<body>`` element.
        """
        parser = xml.sax.make_parser()
        handler = BookHandler()
        parser.setContentHandler(handler)
        parser.parse(filename)
        if handler.text is None:
            raise BodyNotFound()
        # One pass; equivalent to blanking each non-letter and then
        # collapsing runs of spaces.
        return re.sub(r"[^a-zA-Z]+", " ", handler.text).lower()

    def replace(self, filename, dictionary):
        """Return the file's text with whole-word, case-insensitive swaps.

        Substitutions are applied one dictionary entry at a time, in the
        dictionary's iteration order, each over the result of the previous
        one.

        Args:
            filename: Path of the text file to read.
            dictionary: Mapping of word -> replacement text. Words are
                matched literally (regex metacharacters are escaped) and
                replacements are inserted literally (backslashes are not
                interpreted as group references or escapes).

        Returns:
            The file content after all substitutions.
        """
        with open(filename) as source:
            text = source.read()
        for word, trans in dictionary.items():
            # The flag is passed as an argument: an inline "(?i)" that is not
            # at the very start of the pattern is rejected by Python 3.11+.
            pattern = re.compile(r"\b" + re.escape(word) + r"\b", re.IGNORECASE)
            # A callable replacement keeps ``trans`` from being parsed as a
            # replacement template.
            text = pattern.sub(lambda _match, _trans=trans: _trans, text)
        return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment