Skip to content

Instantly share code, notes, and snippets.

@reachsumit
Last active July 19, 2020 06:03
Show Gist options
  • Save reachsumit/a357c171b87086bceea193c0cfa53337 to your computer and use it in GitHub Desktop.
Save reachsumit/a357c171b87086bceea193c0cfa53337 to your computer and use it in GitHub Desktop.
fastText-based spell checker trained on Peter Norvig's "big.txt" training data
import io
import fasttext
def load_vectors(fname):
    """Load word vectors from a text file into a dict.

    The file is expected to start with a header line holding two integers
    (presumably the vocabulary size and the vector dimension, as in fastText
    ``.vec`` files -- confirm against the producer of the file). Every
    following line holds a token followed by its space-separated components.

    Args:
        fname: Path of the vectors file.

    Returns:
        A dict mapping each token to a ``list`` of floats.

    Raises:
        ValueError: If the header line does not consist of exactly two
            integers, or a vector component is not a valid float.
    """
    # The context manager guarantees the handle is closed even when parsing
    # fails part-way through the file.
    with io.open(fname, 'r', encoding='utf-8', newline='\n',
                 errors='ignore') as fin:
        # The header values are not needed afterwards; parsing them still
        # validates that the file starts with the expected two-integer header.
        _n_words, _dim = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the vector: on Python 3 a bare map() is a lazy,
            # single-use iterator, which would make each vector readable
            # only once.
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    return data
def spelltest(tests, model):
    """Run the model's correction on all (right, wrong) pairs; report results.

    A misspelling that is already in the model vocabulary is kept as is;
    otherwise it is replaced by the model's single nearest neighbour. Each
    wrong edit is printed, followed by a summary line with the accuracy and
    the throughput.

    Args:
        tests: Sized collection of ``(right, wrong)`` word pairs.
        model: Object exposing ``words`` (iterable of known words) and
            ``get_nearest_neighbors(word, k)`` returning ``(score, word)``
            pairs, e.g. a trained fastText model.

    Returns:
        None. Results are written to standard output.
    """
    import time

    # Build the vocabulary set once so each membership test is O(1) instead
    # of a linear scan over the word list.
    vocabulary = set(model.words)
    n = len(tests)
    good = 0
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement intended for measuring elapsed intervals.
    start = time.perf_counter()
    for right, wrong in tests:
        if wrong in vocabulary:
            guess = wrong
        else:
            # Neighbours come back as (score, word); keep the word.
            guess = model.get_nearest_neighbors(wrong, k=1)[0][1]
        if guess == right:
            good += 1
        elif guess != wrong:
            # Only report cases where the model actually changed the word.
            print("Edited {} to {}, but the correct word is: {}".format(wrong, guess, right))
    dt = time.perf_counter() - start
    # Guard both ratios: an empty test set or a zero-length interval must not
    # raise ZeroDivisionError.
    accuracy = good / n if n else 0.0
    rate = n / dt if dt > 0 else float('inf')
    print('{:.0%} of {} correct at {:.0f} words per second '
          .format(accuracy, n, rate))
def Testset(lines):
    """Parse 'right: wrong1 wrong2' lines into (right, wrong) pairs.

    Args:
        lines: Iterable of strings such as ``'right: wrong1 wrong2'``.

    Returns:
        A list like ``[('right', 'wrong1'), ('right', 'wrong2')]``, in input
        order.

    Raises:
        ValueError: If a non-blank line does not contain exactly one ':'.
    """
    pairs = []
    for line in lines:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no test case; skip them rather than failing on the unpack below.
        if not line.strip():
            continue
        right, wrongs = line.split(':')
        pairs.extend((right, wrong) for wrong in wrongs.split())
    return pairs
if __name__ == "__main__":
    # Train an unsupervised fastText model on 'big.txt'. minn/maxn set the
    # character n-gram lengths (1 to 2 here) per the fastText documentation.
    model = fasttext.train_unsupervised(
        'big.txt', wordNgrams=1, minn=1, maxn=2, dim=300, ws=8, neg=8,
        epoch=4, minCount=1, bucket=900000)
    # Evaluate on both test sets; `with` closes each file once it is parsed.
    for testset_path in ('spell-testset1.txt', 'spell-testset2.txt'):
        with open(testset_path) as testset_file:
            spelltest(Testset(testset_file), model)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment