Skip to content

Instantly share code, notes, and snippets.

@0
Created October 8, 2017 08:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 0/de44ef0f44b30b5b0bde747e3fd30c72 to your computer and use it in GitHub Desktop.
Save 0/de44ef0f44b30b5b0bde747e3fd30c72 to your computer and use it in GitHub Desktop.
Find misspelled words in text that contains many non-words.
#!/usr/bin/env python3
"""
A wrapper around difflib.get_close_matches to make it easy to find misspelled
words in text that contains many non-words.
For regular text, where every letter sequence must be a dictionary word,
checking spelling is straightforward. Technical text, such as documentation or
code comments, may contain many valid non-words (e.g. function names) and
deliberate nonsense example strings. As a consequence, there tend to be many
false positives. To reduce the amount of noise, we ignore any unrecognized
words that aren't close to legitimate words, since those are unlikely to be
misspellings.
"""
from difflib import get_close_matches
from math import ceil
from re import split
from sys import argv, exit, stderr
## Tunables.
# A higher cutoff leads to faster matches and fewer results.
CUTOFF = 0.8
# Emit a progress line to stderr once per this many scanned words.
PROGRESS_INTERVAL = 100


## Parse arguments.
def parse_args(args):
    """Split command-line arguments into dictionary paths and text paths.

    `args` is the argument list without the program name. Each `-d` consumes
    the following argument as a dictionary path, `--` ends option parsing,
    and everything else is a text path.

    Returns a `(dicts, texts)` tuple of lists.
    Raises ValueError for an unknown option or a `-d` with no value.
    """
    dicts = []
    texts = []
    dict_next = False
    done_options = False
    for arg in args:
        if dict_next:
            dicts.append(arg)
            dict_next = False
        elif not done_options and arg.startswith('-'):
            if arg == '--':
                done_options = True
            elif arg == '-d':
                dict_next = True
            else:
                raise ValueError(f'Unrecognized option: {arg}')
        else:
            texts.append(arg)
    # A dangling -d used to be dropped silently; report it instead.
    if dict_next:
        raise ValueError('Option -d requires an argument')
    return dicts, texts


## Extract words.
def load_dict_words(paths):
    """Return the set of lowercased words found in the dictionary files.

    Each word is expected on its own line. Blank lines are ignored.
    """
    words = set()
    for path in paths:
        with open(path) as f:
            words.update(line.rstrip().lower() for line in f)
    # Blank lines would otherwise register the empty string as a word.
    words.discard('')
    return words


def extract_text_words(paths):
    """Return the set of lowercased words found in the text files.

    Every run of ASCII letters is a word.
    """
    words = set()
    for path in paths:
        with open(path) as f:
            for line in f:
                words.update(split('[^A-Za-z]+', line.lower()))
    # split() yields '' when a line starts or ends with a non-letter.
    words.discard('')
    return words


## Scan words.
def find_misspellings(text_words, dict_words, cutoff=CUTOFF):
    """Yield `(word, suggestion)` for each likely misspelling in text_words.

    A word is reported only when it is absent from dict_words yet has a close
    match there. Words that aren't close to anything are ignored. Progress is
    written to stderr.
    """
    # Sets of strings iterate in a different order on each run; sort so that
    # the output is reproducible.
    candidates = sorted(text_words)
    total = len(candidates)
    for i, text_word in enumerate(candidates):
        if i % PROGRESS_INTERVAL == 0:
            print(f'{i}/{total} ({ceil(100*i/total)}%)', file=stderr)
        # Ignore words we recognize.
        if text_word in dict_words:
            continue
        match = get_close_matches(text_word, dict_words, n=1, cutoff=cutoff)
        # Ignore words that aren't close to anything.
        if not match:
            continue
        yield text_word, match[0]
    print(f'{total}/{total} (100%)', file=stderr)


def main(args):
    """Run the spell check and return the process exit status.

    `args` has the same layout as `sys.argv`: the program name followed by
    its arguments. Results go to stdout as `word => suggestion` lines;
    diagnostics, usage and progress go to stderr. Returns 0 on success and 1
    on a usage error or when a file cannot be read.
    """
    prog, *rest = args
    try:
        dicts, texts = parse_args(rest)
    except ValueError as err:
        print(err, file=stderr)
        return 1
    if not dicts or not texts:
        # Usage goes to stderr so that it never mixes with results.
        print(f'usage: {prog} -d <dict> [-d <dict> ...] [--] <text> [<text> ...]', file=stderr)
        print(file=stderr)
        print('Check the words in the text files against the words in the dict files.', file=stderr)
        return 1
    print('dict:', ', '.join(dicts), file=stderr)
    print('text:', ', '.join(texts), file=stderr)
    try:
        dict_words = load_dict_words(dicts)
        text_words = extract_text_words(texts)
    except OSError as err:
        # Missing or unreadable file: report it without a traceback.
        print(err, file=stderr)
        return 1
    for text_word, suggestion in find_misspellings(text_words, dict_words):
        print(f'{text_word} => {suggestion}')
    return 0


if __name__ == '__main__':
    exit(main(argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment