Skip to content

Instantly share code, notes, and snippets.

@reachsumit
Created July 19, 2020 03:54
Show Gist options
  • Save reachsumit/a04e3a651852e631030ab56038520ac3 to your computer and use it in GitHub Desktop.
Save reachsumit/a04e3a651852e631030ab56038520ac3 to your computer and use it in GitHub Desktop.
spell-check Norvig's test sets using pretrained FastText embeddings
import io
import fasttext
def load_vectors(fname):
    """Load word vectors from a text file into a dict keyed by word.

    The first line is parsed as exactly two integers and then discarded
    (presumably the "<word count> <dimension>" header of a fastText ``.vec``
    file -- confirm against the data file). Every following line is split on
    single spaces: the first token is the word, the remaining tokens are
    parsed as floats.

    Args:
        fname: Path of the vectors file. It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A dict mapping each word to a list of floats.

    Raises:
        ValueError: If the first line is not exactly two integers, or a
            vector component cannot be parsed as a float.
    """
    data = {}
    # The context manager guarantees the handle is closed, even when a
    # malformed line raises part-way through the file.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed afterwards; unpacking still
        # validates that the line holds exactly two integers.
        _count, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the vector: a bare map() is a one-shot iterator in
            # Python 3 and would appear empty on a second read.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data
def spelltest(tests, model, vocab):
    """Run the correction on all (right, wrong) pairs and print a report.

    For each pair, a ``wrong`` word that is already in ``vocab`` is kept
    unchanged; otherwise it is replaced by the word of the first result of
    ``model.get_nearest_neighbors(wrong, k=1)``. A pair counts as correct when
    the resulting word equals ``right``.

    Args:
        tests: Sized collection of ``(right, wrong)`` string pairs.
        model: Object exposing ``get_nearest_neighbors(word, k=...)``; each
            result is indexed with ``[1]`` to obtain the word (presumably
            fastText's ``(score, word)`` tuples -- confirm against the model).
        vocab: Container supporting ``in`` for known words.

    Returns:
        None. A line per pair and a final summary are printed to stdout.
    """
    import time

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for measuring elapsed intervals.
    start = time.perf_counter()
    good = 0
    n = len(tests)
    for right, wrong in tests:
        if wrong in vocab:
            w = wrong
            print('word: {} exists in the vocabulary. No correction required'.format(w))
        else:
            w = model.get_nearest_neighbors(wrong, k=1)[0][1]
            print("found replacement: {} for word: {}".format(w, wrong))
        good += (w == right)
    dt = time.perf_counter() - start

    # Guard both divisions: an empty test set or a zero elapsed time would
    # otherwise raise ZeroDivisionError before anything is reported.
    accuracy = good / n if n else 0.0
    if dt > 0:
        rate = n / dt
    else:
        rate = float('inf') if n else 0.0
    print('{:.0%} of {} correct at {:.0f} words per second '
          .format(accuracy, n, rate))
def Testset(lines):
    """Parse 'right: wrong1 wrong2' lines into (right, wrong) pairs.

    Each non-blank line is split on ':'; the part before the colon is the
    correct word and the whitespace-separated words after it are its
    misspellings. For example ``'access: acess acces'`` yields
    ``[('access', 'acess'), ('access', 'acces')]``.

    Args:
        lines: Iterable of text lines (e.g. an open file).

    Returns:
        A list of ``(right, wrong)`` tuples in input order.

    Raises:
        ValueError: If a non-blank line does not contain exactly one ':'.
    """
    pairs = []
    for line in lines:
        # Blank or whitespace-only lines (e.g. trailing newlines in the test
        # file) carry no data; skip them instead of failing on the unpack.
        if not line.strip():
            continue
        right, wrongs = line.split(':')
        pairs.extend((right, wrong) for wrong in wrongs.split())
    return pairs
if __name__ == "__main__":
    # Load the pretrained model (used for nearest-neighbour lookups) and the
    # text vectors file (used here as the known-word vocabulary).
    model = fasttext.load_model("crawl-300d-2M-subword.bin")
    vocab = load_vectors("crawl-300d-2M-subword.vec")
    for testset_path in ('spell-testset1.txt', 'spell-testset2.txt'):
        # Testset() builds its full list before returning, so the file can be
        # closed before the evaluation runs.
        with open(testset_path) as testset_file:
            pairs = Testset(testset_file)
        spelltest(pairs, model, vocab)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment