Skip to content

Instantly share code, notes, and snippets.

@hamidfzm
Created October 7, 2015 19:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hamidfzm/429921604fefed7e71be to your computer and use it in GitHub Desktop.
Save hamidfzm/429921604fefed7e71be to your computer and use it in GitHub Desktop.
EPUB Persian normalizer script
# -*- coding: utf-8 -*-
import argparse
import binascii
import os
import re
import shutil
from tempfile import TemporaryDirectory
from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile

from hazm import Normalizer
# Matches file names ending in ".htm" or ".html"; only these archive members
# have their text rewritten, everything else is repacked untouched.
# NOTE(review): ".xhtml" members are not matched -- confirm whether they should be.
html_file_pattern = re.compile(r'\.html?$')

# Single-pass translation table for the character fix-ups applied after the
# normalizer has run.
_CHARACTER_FIXES = str.maketrans({
    '\u200F': '\u200C',  # RIGHT-TO-LEFT MARK -> ZERO WIDTH NON-JOINER
    '\u0643': '\u06A9',  # ARABIC LETTER KAF -> ARABIC LETTER KEHEH
    '\u064A': '\u06CC',  # ARABIC LETTER YEH -> ARABIC LETTER FARSI YEH
    '\u0649': '\u06CC',  # ARABIC LETTER ALEF MAKSURA -> ARABIC LETTER FARSI YEH
})


def replace_arabic_characters(text):
    """Return *text* with the characters in ``_CHARACTER_FIXES`` substituted."""
    return text.translate(_CHARACTER_FIXES)


def normalize_epub(epub_filename, normalize):
    """Rewrite the HTML members of an EPUB archive in place.

    The archive is unpacked into a temporary directory, every member whose
    name matches ``html_file_pattern`` is read as UTF-8, passed through
    *normalize* and then through ``replace_arabic_characters``, and all
    members are packed into a new archive that finally replaces the
    contents of *epub_filename*.

    Args:
        epub_filename: Path of the EPUB file to rewrite.
        normalize: Callable taking and returning ``str``; applied to the
            full text of each matching member.

    Raises:
        zipfile.BadZipFile: If *epub_filename* is not a valid ZIP archive.
        UnicodeDecodeError: If a matching member is not valid UTF-8.
        OSError: On file-system failures.
    """
    with TemporaryDirectory() as temp_path:
        content_dir = os.path.join(temp_path, 'content')
        rebuilt_path = os.path.join(temp_path, 'rebuilt.epub')
        os.mkdir(content_dir)

        with ZipFile(epub_filename) as epub:
            epub.extractall(content_dir)

        # Build the new archive next to the extracted tree instead of
        # reopening the input in 'w' mode: that would truncate the original
        # before any member has been processed, so a failure halfway through
        # would destroy the only copy of the book.
        with ZipFile(rebuilt_path, 'w', ZIP_DEFLATED) as epub:
            # The EPUB container format requires "mimetype" to be the first
            # entry of the archive and to be stored without compression.
            mimetype_path = os.path.join(content_dir, 'mimetype')
            if os.path.isfile(mimetype_path):
                epub.write(mimetype_path, 'mimetype', compress_type=ZIP_STORED)

            for root, _dirs, files in os.walk(content_dir):
                for name in files:
                    filename = os.path.join(root, name)
                    if filename == mimetype_path:
                        continue  # already written as the first entry
                    if html_file_pattern.search(name):
                        with open(filename, 'rt', encoding='utf-8') as f:
                            content = normalize(f.read())
                        content = replace_arabic_characters(content)
                        with open(filename, 'wt', encoding='utf-8') as f:
                            f.write(content)
                    # relpath yields the member name relative to the archive
                    # root; a plain str.replace on the temp path would leave a
                    # leading separator and touch any later occurrence too.
                    epub.write(filename, os.path.relpath(filename, content_dir))

        # Only now, with a complete archive on disk, overwrite the original.
        shutil.copyfile(rebuilt_path, epub_filename)


def main():
    """Parse the command line and normalize the given EPUB file."""
    parser = argparse.ArgumentParser(description='Normalizes input text files.')
    parser.add_argument('file', type=str, help='File path to normalize.')
    args = parser.parse_args()
    normalize_epub(args.file, Normalizer().normalize)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment