Skip to content

Instantly share code, notes, and snippets.

@tos-kamiya
Last active May 29, 2023 12:13
Show Gist options
  • Save tos-kamiya/b752a963dd2623ec748936792f1e8751 to your computer and use it in GitHub Desktop.
Save tos-kamiya/b752a963dd2623ec748936792f1e8751 to your computer and use it in GitHub Desktop.
HyFi: A Hyphen Fixer for English text (CLI tool)
#!/usr/bin/env python3
import argparse
import re
import sys
from typing import Dict, Iterator, List, Optional, Set, Tuple
def load_words(file_path: str) -> Set[str]:
    """Read a word list (one entry per line) into an immutable set.

    Leading and trailing whitespace, including the line terminator, is
    stripped from every line. A blank line therefore contributes the empty
    string to the result.

    Args:
        file_path: Path of the word-list file to read.

    Returns:
        A frozenset holding the stripped content of every line.
    """
    with open(file_path, 'r') as word_file:
        # The map must be consumed while the file is still open.
        return frozenset(map(str.strip, word_file))
def load_user_rules(filepath: str) -> Dict[str, str]:
    """Load user rules from a file. The file format is 'original replacement' per line.

    Each line must consist of exactly two whitespace-separated tokens. When
    the same original appears more than once, the last occurrence wins.

    Args:
        filepath: Path of the rules file.

    Returns:
        A dict mapping each original token to its replacement.

    Raises:
        SystemExit: If any line (blank lines included) does not contain
            exactly two tokens. The exit message names the 1-based line
            number and shows the offending line.
    """
    rules = {}
    with open(filepath, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if len(parts) != 2:
                # The message must be an f-string so that the line number and
                # content are interpolated; the trailing newline is dropped to
                # keep the message on a single line.
                sys.exit(f"Error: line {line_no}: invalid user rule: {line.rstrip()}")
            original, replacement = parts
            rules[original] = replacement
    return rules
def generate_word_variants(word: str) -> List[str]:
    """List candidate spellings of *word* with edge punctuation peeled off.

    Up to three leading non-alphabetic characters are removed one at a time;
    for each of those forms, up to three trailing non-alphabetic characters
    are removed one at a time as well. Every intermediate form is collected.
    If *word* contains an uppercase character, a lowercased copy of every
    collected form is appended after the originals.

    Args:
        word: The token to derive variants from.

    Returns:
        The variants, starting with *word* itself, or an empty list when
        *word* contains no alphabetic character at all.
    """
    if not any(ch.isalpha() for ch in word):
        return []

    variants = []
    front_trimmed = word
    for _ in range(4):
        candidate = front_trimmed
        for _ in range(4):
            variants.append(candidate)
            # Stop peeling the tail once it is a letter (or nothing is left).
            if not candidate or candidate[-1].isalpha():
                break
            candidate = candidate[:-1]
        # Stop peeling the head once it is a letter (or nothing is left).
        if not front_trimmed or front_trimmed[0].isalpha():
            break
        front_trimmed = front_trimmed[1:]

    if any(ch.isupper() for ch in word):
        # The comprehension is fully built before extend() mutates the list.
        variants.extend([variant.lower() for variant in variants])
    return variants
def process_lines(line_it: Iterator[str], english_words: Set[str], user_rules: Optional[Dict[str, str]] = None, unrecognized_pairs_sink: Optional[List[Tuple[str, str]]] = None) -> Iterator[str]:
    """Rejoin words that were hyphenated across line breaks.

    Each input line is split on whitespace. When the last token of a line
    ends with a hyphen (ASCII '-' or U+2010..U+2015) and the following line
    is non-empty, that token and the first token of the following line are
    considered a split word:

    * If the concatenation (hyphen included) is a key of ``user_rules``, the
      mapped replacement is used.
    * Otherwise the hyphen is dropped, and the joined word is accepted if any
      of its variants from ``generate_word_variants`` is in ``english_words``.
    * Otherwise both lines are left unchanged, a warning is printed to
      stderr, and the pair is remembered as unrecognized.

    When a word is rejoined, it stays on the earlier line and the first token
    is removed from the later line. Output lines are the tokens joined with
    single spaces, so the original spacing is not preserved.

    Args:
        line_it: The input lines, as any iterable of strings.
        english_words: Collection of known words, used for membership tests.
        user_rules: Optional mapping from a hyphenated concatenation to its
            replacement. ``None`` means no rules.
        unrecognized_pairs_sink: Optional list that receives each distinct
            ``(concatenated, dehyphenated)`` pair that was not recognized, in
            first-seen order. It is filled only after the last line has been
            yielded, i.e. once the generator has been fully consumed.

    Yields:
        One processed line (without a line terminator) per input line.
    """
    # A None default avoids sharing one mutable dict across all calls.
    if user_rules is None:
        user_rules = {}

    trailing_hyphen = re.compile(r'[-\u2010-\u2015]$')

    # Accept any iterable (e.g. a list), not only a true iterator.
    line_it = iter(line_it)
    try:
        cur_line = next(line_it).split()
    except StopIteration:
        return  # empty file, nothing to process
    line_num = 1  # 1-based number of the line currently held in cur_line

    unrecognized_pairs = []
    for line in line_it:
        next_line = line.split()
        if cur_line and next_line and trailing_hyphen.search(cur_line[-1]):
            concat_word = cur_line[-1] + next_line[0]
            if concat_word in user_rules:
                cur_line[-1] = user_rules[concat_word]
                next_line = next_line[1:]
            else:
                # Drop the trailing hyphen character and glue the halves together.
                reformed_word = cur_line[-1][:-1] + next_line[0]
                if any(variant in english_words for variant in generate_word_variants(reformed_word)):
                    cur_line[-1] = reformed_word
                    next_line = next_line[1:]
                else:
                    unrecognized_pairs.append((concat_word, reformed_word))
                    # The reported number is that of the line holding the second half of the word.
                    print(f'Warning: line {line_num + 1}: the word "{concat_word}" does not seem a valid English word.', file=sys.stderr)
        yield ' '.join(cur_line)
        cur_line = next_line
        line_num += 1
    yield ' '.join(cur_line)  # yield the last line

    if unrecognized_pairs_sink is not None:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        unrecognized_pairs_sink.extend(dict.fromkeys(unrecognized_pairs))
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, fix hyphenation in the
    # input text, write the result and, if requested, the unrecognized pairs.

    # Local import: ExitStack is needed only by this entry point.
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(description='HyFi: Hyphen Fixer for English text.')
    parser.add_argument('input', help='The input text file to process or "-" for stdin.')
    parser.add_argument('-o', '--output', help='The output text file to write. If not given, write to stdout.')
    parser.add_argument('-u', '--unrecognized-pairs', help='Each of the unrecognized words and its possible replacement.')
    parser.add_argument('-r', '--user-rules', help='The file containing user rules for replacing unrecognized words.')
    args = parser.parse_args()

    english_words = load_words('/usr/share/dict/words')
    # Validate the user rules before opening the output file, so that a
    # malformed rules file cannot leave behind a truncated output file.
    user_rules = load_user_rules(args.user_rules) if args.user_rules else {}

    urps = []
    # ExitStack closes only the files opened here (never stdin/stdout), and
    # also closes the input file if opening the output file fails.
    with ExitStack() as stack:
        input_stream = sys.stdin if args.input == "-" else stack.enter_context(open(args.input, 'r'))
        output_stream = sys.stdout if args.output is None else stack.enter_context(open(args.output, 'w'))
        for line in process_lines(input_stream, english_words, user_rules=user_rules, unrecognized_pairs_sink=urps):
            output_stream.write(line + '\n')

    if args.unrecognized_pairs is not None:
        with open(args.unrecognized_pairs, 'w') as outp:
            for concat_word, reformed_word in urps:
                print(f"{concat_word} {reformed_word}", file=outp)
@tos-kamiya
Copy link
Author

A screenshot.
Screenshot from 2023-05-29 16-28-46

@tos-kamiya
Copy link
Author

The tool now has a public repository: https://github.com/tos-kamiya/hyfi .

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment