reachsumit/fasttext_pretrained.py

## fasttext_pretrained.py
import io
import fasttext

def load_vectors(fname):
    """Load word vectors from a text-format vector file.

    The first line is a header that must hold exactly two integers
    (presumably the word count and the vector dimension -- confirm against
    the file format); it is validated and then discarded. Every following
    line is split on single spaces: the first field is the token, the
    remaining fields are parsed as floats.

    Args:
        fname: Path of the vector file to read (decoded as UTF-8, with
            undecodable bytes ignored).

    Returns:
        A dict mapping each token to a list of floats.

    Raises:
        ValueError: If the header is not exactly two integers, or a vector
            component cannot be parsed as a float.
    """
    data = {}
    # The context manager closes the file even if parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n',
                 errors='ignore') as fin:
        # Consume and validate the header; its values are not used below.
        _count, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the vector: a bare map() is a one-shot iterator in
            # Python 3 and would be empty after the first read.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

def spelltest(tests, model, vocab):
    """Run the correction on all (right, wrong) pairs; report results.

    A misspelling that is already in ``vocab`` is kept unchanged. Otherwise
    it is replaced by element ``[0][1]`` of
    ``model.get_nearest_neighbors(word, k=1)``. A pair counts as correct when
    the resulting word equals ``right``. Per-word messages and a final summary
    line are printed to stdout; nothing is returned.

    Args:
        tests: Sized iterable of ``(right, wrong)`` string pairs.
        model: Object exposing ``get_nearest_neighbors(word, k=...)``.
        vocab: Container supporting ``in`` for known words.
    """
    import time

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed intervals.
    start = time.perf_counter()
    good = 0
    n = len(tests)
    for right, wrong in tests:
        w = wrong
        if w in vocab:
            print('word: {} exists in the vocabulary. No correction required'.format(w))
        else:
            w_old = w
            w = model.get_nearest_neighbors(w, k=1)[0][1]
            print("found replacement: {} for word: {}".format(w, w_old))
        good += (w == right)
    dt = time.perf_counter() - start
    # Guard both ratios so an empty test set (or a zero-length interval)
    # produces a summary instead of a ZeroDivisionError.
    accuracy = good / n if n else 0.0
    rate = n / dt if dt > 0 else float('inf')
    print('{:.0%} of {} correct at {:.0f} words per second '
          .format(accuracy, n, rate))

def Testset(lines):
    """Parse 'right: wrong1 wrong2' lines into (right, wrong) pairs.

    Blank or whitespace-only lines are skipped, so a trailing empty line in
    the input does not abort parsing.

    Args:
        lines: Iterable of strings, each of the form
            ``'right: wrong1 wrong2 ...'``.

    Returns:
        A list such as ``[('right', 'wrong1'), ('right', 'wrong2')]``, in
        input order.

    Raises:
        ValueError: If a non-blank line does not contain exactly one ':'.
    """
    pairs = []
    for line in lines:
        if not line.strip():
            continue
        right, wrongs = line.split(':')
        # Strip so stray spaces around the correct word cannot break the
        # equality check against a corrected word later on.
        right = right.strip()
        pairs.extend((right, wrong) for wrong in wrongs.split())
    return pairs

if __name__ == "__main__":
    # Load the binary model (used for nearest-neighbour queries) and the text
    # vectors (used only as a known-word vocabulary by spelltest).
    model = fasttext.load_model("crawl-300d-2M-subword.bin")
    vocab = load_vectors("crawl-300d-2M-subword.vec")

    # Run both test sets in order; each file is closed as soon as it has
    # been parsed instead of being left open for the rest of the run.
    for testset_path in ('spell-testset1.txt', 'spell-testset2.txt'):
        with open(testset_path) as testset_file:
            tests = Testset(testset_file)
        spelltest(tests, model, vocab)
	import io
	import fasttext

	def load_vectors(fname):
	    """Load word vectors from a text-format vector file.

	    The first line is a header that must hold exactly two integers
	    (presumably the word count and the vector dimension -- confirm against
	    the file format); it is validated and then discarded. Every following
	    line is split on single spaces: the first field is the token, the
	    remaining fields are parsed as floats.

	    Args:
	        fname: Path of the vector file to read (decoded as UTF-8, with
	            undecodable bytes ignored).

	    Returns:
	        A dict mapping each token to a list of floats.

	    Raises:
	        ValueError: If the header is not exactly two integers, or a vector
	            component cannot be parsed as a float.
	    """
	    data = {}
	    # The context manager closes the file even if parsing fails part-way.
	    with io.open(fname, 'r', encoding='utf-8', newline='\n',
	                 errors='ignore') as fin:
	        # Consume and validate the header; its values are not used below.
	        _count, _dim = map(int, fin.readline().split())
	        for line in fin:
	            tokens = line.rstrip().split(' ')
	            # Materialise the vector: a bare map() is a one-shot iterator
	            # in Python 3 and would be empty after the first read.
	            data[tokens[0]] = list(map(float, tokens[1:]))
	    return data

	def spelltest(tests, model, vocab):
	    """Run the correction on all (right, wrong) pairs; report results.

	    A misspelling that is already in ``vocab`` is kept unchanged. Otherwise
	    it is replaced by element ``[0][1]`` of
	    ``model.get_nearest_neighbors(word, k=1)``. A pair counts as correct
	    when the resulting word equals ``right``. Per-word messages and a final
	    summary line are printed to stdout; nothing is returned.

	    Args:
	        tests: Sized iterable of ``(right, wrong)`` string pairs.
	        model: Object exposing ``get_nearest_neighbors(word, k=...)``.
	        vocab: Container supporting ``in`` for known words.
	    """
	    import time

	    # time.clock() was removed in Python 3.8; perf_counter() is the
	    # replacement for measuring elapsed intervals.
	    start = time.perf_counter()
	    good = 0
	    n = len(tests)
	    for right, wrong in tests:
	        w = wrong
	        if w in vocab:
	            print('word: {} exists in the vocabulary. No correction required'.format(w))
	        else:
	            w_old = w
	            w = model.get_nearest_neighbors(w, k=1)[0][1]
	            print("found replacement: {} for word: {}".format(w, w_old))
	        good += (w == right)
	    dt = time.perf_counter() - start
	    # Guard both ratios so an empty test set (or a zero-length interval)
	    # produces a summary instead of a ZeroDivisionError.
	    accuracy = good / n if n else 0.0
	    rate = n / dt if dt > 0 else float('inf')
	    print('{:.0%} of {} correct at {:.0f} words per second '
	          .format(accuracy, n, rate))

	def Testset(lines):
	    """Parse 'right: wrong1 wrong2' lines into (right, wrong) pairs.

	    Blank or whitespace-only lines are skipped, so a trailing empty line in
	    the input does not abort parsing.

	    Args:
	        lines: Iterable of strings, each of the form
	            ``'right: wrong1 wrong2 ...'``.

	    Returns:
	        A list such as ``[('right', 'wrong1'), ('right', 'wrong2')]``, in
	        input order.

	    Raises:
	        ValueError: If a non-blank line does not contain exactly one ':'.
	    """
	    pairs = []
	    for line in lines:
	        if not line.strip():
	            continue
	        right, wrongs = line.split(':')
	        # Strip so stray spaces around the correct word cannot break the
	        # equality check against a corrected word later on.
	        right = right.strip()
	        pairs.extend((right, wrong) for wrong in wrongs.split())
	    return pairs

	if __name__ == "__main__":
	    # Load the binary model (used for nearest-neighbour queries) and the
	    # text vectors (used only as a known-word vocabulary by spelltest).
	    model = fasttext.load_model("crawl-300d-2M-subword.bin")
	    vocab = load_vectors("crawl-300d-2M-subword.vec")

	    # Run both test sets in order; each file is closed as soon as it has
	    # been parsed instead of being left open for the rest of the run.
	    for testset_path in ('spell-testset1.txt', 'spell-testset2.txt'):
	        with open(testset_path) as testset_file:
	            tests = Testset(testset_file)
	        spelltest(tests, model, vocab)