0/misspell.py

## misspell.py
#!/usr/bin/env python3

"""
A wrapper around difflib.get_close_matches to make it easy to find misspelled
words in text that contains many non-words.

For regular text, where every letter sequence must be a dictionary word,
checking spelling is straightforward. Technical text, such as documentation or
code comments, may contain many valid non-words (e.g. function names) and
deliberate nonsense example strings. As a consequence, there tend to be many
false positives. To reduce the amount of noise, we ignore any unrecognized
words that aren't close to legitimate words, since those are unlikely to be
misspellings.
"""

from difflib import get_close_matches
from math import ceil
from re import split
from sys import argv, exit, stderr


## Parse arguments.
# Each `-d` consumes the argument that follows it as a dictionary path; every
# other argument is a text path. A bare `--` ends option parsing, which allows
# text paths that begin with a dash.
dicts = []
texts = []
dict_next = False
done_options = False
for arg in argv[1:]:
    if dict_next:
        # The argument after `-d` is always taken as a path, even if it
        # starts with a dash.
        dicts.append(arg)
        dict_next = False
    elif not done_options and arg.startswith('-'):
        if arg == '--':
            done_options = True
        elif arg == '-d':
            dict_next = True
        else:
            # Diagnostics go to stderr; stdout carries only the results.
            print('Unrecognized option:', arg, file=stderr)
            exit(1)
    else:
        texts.append(arg)

# A trailing `-d` has no path left to consume; report it rather than silently
# dropping the option.
if dict_next:
    print('Missing argument for option: -d', file=stderr)
    exit(1)

# At least one dictionary and one text file are required.
if not dicts or not texts:
    print(f'usage: {argv[0]} -d <dict> [-d <dict> ...] [--] <text> [<text> ...]', file=stderr)
    print(file=stderr)
    print('Check the words in the text files against the words in the dict files.', file=stderr)
    exit(1)

print('dict:', ', '.join(dicts), file=stderr)
print('text:', ', '.join(texts), file=stderr)


## Extract words.
# Both sets hold lowercased words, so every later comparison is
# case-insensitive.
dict_words = set()
for path in dicts:
    with open(path) as f:
        # Each word is on its own line. Blank lines are skipped so that the
        # empty string never becomes a dictionary entry.
        for entry in f:
            word = entry.rstrip().lower()
            if word:
                dict_words.add(word)

text_words = set()
for path in texts:
    with open(path) as f:
        for line in f:
            # Every run of ASCII letters is a word. Splitting on single
            # non-letter characters yields an empty string between adjacent
            # separators (and after the trailing newline), so drop those.
            text_words.update(w for w in split('[^A-Za-z]', line.lower()) if w)


## Scan words.
# Results are written to stdout as `<word> => <closest dictionary word>`;
# progress is written to stderr so it stays out of redirected results.
total = len(text_words)

# Iteration order of a set of strings changes from run to run (string hashing
# is randomized), so sort to make the report reproducible and diffable.
for i, text_word in enumerate(sorted(text_words)):
    if i % 100 == 0:
        print(f'{i}/{total} ({ceil(100*i/total)}%)', file=stderr)

    # Ignore words we recognize. An empty string is not a word and can never
    # be close to a dictionary entry, so skip it without scanning.
    if not text_word or text_word in dict_words:
        continue

    # A higher cutoff leads to faster matches and fewer results.
    match = get_close_matches(text_word, dict_words, n=1, cutoff=0.8)

    # Ignore words that aren't close to anything.
    if not match:
        continue

    print(f'{text_word} => {match[0]}')

print(f'{total}/{total} (100%)', file=stderr)
	#!/usr/bin/env python3

	"""
	A wrapper around difflib.get_close_matches to make it easy to find misspelled
	words in text that contains many non-words.

	For regular text, where every letter sequence must be a dictionary word,
	checking spelling is straightforward. Technical text, such as documentation or
	code comments, may contain many valid non-words (e.g. function names) and
	deliberate nonsense example strings. As a consequence, there tend to be many
	false positives. To reduce the amount of noise, we ignore any unrecognized
	words that aren't close to legitimate words, since those are unlikely to be
	misspellings.
	"""

	from difflib import get_close_matches
	from math import ceil
	from re import split
	from sys import argv, exit, stderr


	## Parse arguments.
	# Each `-d` consumes the argument that follows it as a dictionary path; every
	# other argument is a text path. A bare `--` ends option parsing, which allows
	# text paths that begin with a dash.
	dicts = []
	texts = []
	dict_next = False
	done_options = False
	for arg in argv[1:]:
	    if dict_next:
	        # The argument after `-d` is always taken as a path, even if it
	        # starts with a dash.
	        dicts.append(arg)
	        dict_next = False
	    elif not done_options and arg.startswith('-'):
	        if arg == '--':
	            done_options = True
	        elif arg == '-d':
	            dict_next = True
	        else:
	            # Diagnostics go to stderr; stdout carries only the results.
	            print('Unrecognized option:', arg, file=stderr)
	            exit(1)
	    else:
	        texts.append(arg)

	# A trailing `-d` has no path left to consume; report it rather than silently
	# dropping the option.
	if dict_next:
	    print('Missing argument for option: -d', file=stderr)
	    exit(1)

	# At least one dictionary and one text file are required.
	if not dicts or not texts:
	    print(f'usage: {argv[0]} -d <dict> [-d <dict> ...] [--] <text> [<text> ...]', file=stderr)
	    print(file=stderr)
	    print('Check the words in the text files against the words in the dict files.', file=stderr)
	    exit(1)

	print('dict:', ', '.join(dicts), file=stderr)
	print('text:', ', '.join(texts), file=stderr)


	## Extract words.
	# Both sets hold lowercased words, so every later comparison is
	# case-insensitive.
	dict_words = set()
	for path in dicts:
	    with open(path) as f:
	        # Each word is on its own line. Blank lines are skipped so that the
	        # empty string never becomes a dictionary entry.
	        for entry in f:
	            word = entry.rstrip().lower()
	            if word:
	                dict_words.add(word)

	text_words = set()
	for path in texts:
	    with open(path) as f:
	        for line in f:
	            # Every run of ASCII letters is a word. Splitting on single
	            # non-letter characters yields an empty string between adjacent
	            # separators (and after the trailing newline), so drop those.
	            text_words.update(w for w in split('[^A-Za-z]', line.lower()) if w)


	## Scan words.
	# Results are written to stdout as `<word> => <closest dictionary word>`;
	# progress is written to stderr so it stays out of redirected results.
	total = len(text_words)

	# Iteration order of a set of strings changes from run to run (string hashing
	# is randomized), so sort to make the report reproducible and diffable.
	for i, text_word in enumerate(sorted(text_words)):
	    if i % 100 == 0:
	        print(f'{i}/{total} ({ceil(100*i/total)}%)', file=stderr)

	    # Ignore words we recognize. An empty string is not a word and can never
	    # be close to a dictionary entry, so skip it without scanning.
	    if not text_word or text_word in dict_words:
	        continue

	    # A higher cutoff leads to faster matches and fewer results.
	    match = get_close_matches(text_word, dict_words, n=1, cutoff=0.8)

	    # Ignore words that aren't close to anything.
	    if not match:
	        continue

	    print(f'{text_word} => {match[0]}')

	print(f'{total}/{total} (100%)', file=stderr)