Skip to content

Instantly share code, notes, and snippets.

@martianboy
Last active August 29, 2015 14:06
Show Gist options
  • Save martianboy/57a0fe73dfea06f0b31a to your computer and use it in GitHub Desktop.
Save martianboy/57a0fe73dfea06f0b31a to your computer and use it in GitHub Desktop.
Normalize the HTML files inside an EPUB archive with the Hazm Normalizer, rewriting the archive in place.
import argparse
import binascii
import os
import re
import shutil
import tempfile
from zipfile import ZipFile

from hazm import Normalizer
# Archive members whose names end in ".htm" or ".html" are treated as UTF-8
# text to normalize; every other member is copied through untouched.
# NOTE(review): ".xhtml" members (common in EPUBs) are NOT matched by this
# pattern -- confirm whether that is intended before widening it.
html_file_pattern = re.compile(r'\.html?$')


def rewrite_epub(epub_filename, transform):
    """Rewrite the archive at *epub_filename* in place, transforming its HTML.

    Every member of the archive is copied, in its original order and with its
    original compression method, into a staging archive created next to the
    source. Members matching ``html_file_pattern`` are decoded as UTF-8,
    passed through *transform*, and re-encoded. Only once the staging archive
    is complete does it replace the source, so a failure part-way through
    (bad encoding, transform error, full disk) leaves the original untouched.

    Args:
        epub_filename: Path of the ZIP/EPUB archive to rewrite.
        transform: Callable taking the decoded text of one HTML member and
            returning the replacement text.

    Raises:
        UnicodeDecodeError: If an HTML member is not valid UTF-8.
        zipfile.BadZipFile: If *epub_filename* is not a readable ZIP archive.
        OSError: If the archive cannot be read or the replacement written.
    """
    # Stage in the same directory so the final os.replace() stays on one
    # filesystem and is therefore an atomic rename.
    target_dir = os.path.dirname(os.path.abspath(epub_filename))
    fd, staging_path = tempfile.mkstemp(suffix='.epub', dir=target_dir)
    os.close(fd)
    try:
        with ZipFile(epub_filename) as source, \
                ZipFile(staging_path, 'w') as staged:
            # infolist() yields members in archive order, so a leading
            # "mimetype" entry stays first.
            for info in source.infolist():
                data = source.read(info)
                if html_file_pattern.search(info.filename):
                    text = data.decode('utf-8')
                    # Fold CRLF / CR to LF, as text-mode reading did before.
                    text = text.replace('\r\n', '\n').replace('\r', '\n')
                    # U+200F (RIGHT-TO-LEFT MARK) becomes U+200C (ZERO WIDTH
                    # NON-JOINER); presumably the inputs misuse RLM where a
                    # ZWNJ was meant -- verify against real files.
                    text = transform(text).replace("\u200F", "\u200C")
                    data = text.encode('utf-8')
                # Passing the ZipInfo keeps the member's name, timestamp and
                # compression method.
                staged.writestr(info, data)
        # mkstemp creates the file with owner-only permissions; carry over
        # the permissions of the archive being replaced.
        shutil.copymode(epub_filename, staging_path)
        os.replace(staging_path, epub_filename)
    finally:
        # After a successful replace the staging path no longer exists; on
        # failure this removes the partial archive.
        if os.path.exists(staging_path):
            os.remove(staging_path)


def main():
    """Parse the command line and normalize the archive it names with Hazm."""
    parser = argparse.ArgumentParser(description='Normalizes input text files.')
    parser.add_argument('file', type=str, help='File path to normalize.')
    args = parser.parse_args()
    rewrite_epub(args.file, Normalizer().normalize)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment