# reachsumit/fasttext_trained.py

## fasttext_trained.py
import io
import fasttext

def load_vectors(fname):
    """Load word vectors from a text file into a dict.

    The first line of the file is a header holding two integers; every
    following line is a word followed by its space-separated vector
    components.

    Args:
        fname: Path of the vector file to read (decoded as UTF-8, with
            undecodable bytes ignored).

    Returns:
        A dict mapping each word to a list of floats.

    Raises:
        ValueError: If the header is not exactly two integers or a vector
            component is not a valid float.
    """
    # The context manager guarantees the file is closed, even on a parse error.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed afterwards, but parsing them still
        # rejects files that do not start with two integers.
        _count, _dim = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            if not tokens[0]:
                # Blank line: there is no word to store.
                continue
            # Materialize the vector: a bare map() object is a one-shot
            # iterator in Python 3 and would be empty after its first use.
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    return data

def spelltest(tests, model):
    """Run the model's correction on all (right, wrong) pairs; report results.

    A misspelling that is already in the model vocabulary is kept as is;
    otherwise it is replaced by the model's single nearest neighbour. Every
    replacement that does not match the expected word is printed, followed
    by a summary line with the accuracy and throughput.

    Args:
        tests: Sized iterable of (right, wrong) word pairs.
        model: Object exposing ``words`` (iterable of known words) and
            ``get_nearest_neighbors(word, k)`` returning a sequence whose
            items hold the neighbour word at index 1.
    """
    import time

    # Build the vocabulary set once so each membership test is O(1) instead
    # of a scan over the whole word list.
    vocabulary = set(model.words)

    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()
    good = 0
    n = len(tests)
    for right, wrong in tests:
        if wrong in vocabulary:
            guess = wrong
        else:
            guess = model.get_nearest_neighbors(wrong, k=1)[0][1]
        if guess == right:
            good += 1
        elif guess != wrong:
            # Only report cases where the model actually changed the word.
            print("Edited {} to {}, but the correct word is: {}".format(wrong, guess, right))
    dt = time.perf_counter() - start

    # Guard both divisions: an empty test set or a zero elapsed time must not
    # raise ZeroDivisionError.
    accuracy = good / n if n else 0.0
    rate = n / dt if dt > 0 else float('inf')
    print('{:.0%} of {} correct at {:.0f} words per second '
          .format(accuracy, n, rate))

def Testset(lines):
    """Parse 'right: wrong1 wrong2' lines into (right, wrong) pairs.

    Example: ``'access: acess acces'`` yields
    ``[('access', 'acess'), ('access', 'acces')]``.

    Args:
        lines: Iterable of strings (e.g. an open text file), one entry per
            line.

    Returns:
        A list of (right, wrong) tuples, in input order.

    Raises:
        ValueError: If a non-blank line does not contain exactly one ':'.
    """
    pairs = []
    for lineno, line in enumerate(lines, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on tuple unpacking.
            continue
        fields = line.split(':')
        if len(fields) != 2:
            raise ValueError(
                "line {}: expected 'right: wrong1 wrong2', got {!r}"
                .format(lineno, line))
        right, wrongs = fields
        # Strip stray whitespace around the correct word so it compares equal
        # to model output.
        right = right.strip()
        pairs.extend((right, wrong) for wrong in wrongs.split())
    return pairs

if __name__ == "__main__":
    # Train an unsupervised fastText model on the local corpus 'big.txt'.
    model = fasttext.train_unsupervised(
        'big.txt', wordNgrams=1, minn=1, maxn=2, dim=300, ws=8, neg=8,
        epoch=4, minCount=1, bucket=900000)

    # Evaluate on both test sets. The context manager closes each file once
    # it has been parsed instead of leaking the handle.
    for testset_path in ('spell-testset1.txt', 'spell-testset2.txt'):
        with open(testset_path) as testset_file:
            spelltest(Testset(testset_file), model)
	import io
	import fasttext

	# NOTE(review): this looks like a whitespace-mangled duplicate of the
	# load_vectors defined earlier in this file; indentation is reconstructed.
	def load_vectors(fname):
	    """Load word vectors from a text file into a dict.

	    The first line of the file is a header holding two integers; every
	    following line is a word followed by its space-separated vector
	    components.

	    Args:
	        fname: Path of the vector file to read (decoded as UTF-8, with
	            undecodable bytes ignored).

	    Returns:
	        A dict mapping each word to a list of floats.

	    Raises:
	        ValueError: If the header is not exactly two integers or a vector
	            component is not a valid float.
	    """
	    # The context manager guarantees the file is closed, even on a parse error.
	    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
	        # The header values are not needed afterwards, but parsing them
	        # still rejects files that do not start with two integers.
	        _count, _dim = map(int, fin.readline().split())
	        data = {}
	        for line in fin:
	            tokens = line.rstrip().split(' ')
	            if not tokens[0]:
	                # Blank line: there is no word to store.
	                continue
	            # Materialize the vector: a bare map() object is a one-shot
	            # iterator in Python 3 and would be empty after its first use.
	            data[tokens[0]] = [float(value) for value in tokens[1:]]
	    return data

	# NOTE(review): this looks like a whitespace-mangled duplicate of the
	# spelltest defined earlier in this file; indentation is reconstructed.
	def spelltest(tests, model):
	    """Run the model's correction on all (right, wrong) pairs; report results.

	    A misspelling that is already in the model vocabulary is kept as is;
	    otherwise it is replaced by the model's single nearest neighbour. Every
	    replacement that does not match the expected word is printed, followed
	    by a summary line with the accuracy and throughput.

	    Args:
	        tests: Sized iterable of (right, wrong) word pairs.
	        model: Object exposing ``words`` (iterable of known words) and
	            ``get_nearest_neighbors(word, k)`` returning a sequence whose
	            items hold the neighbour word at index 1.
	    """
	    import time

	    # Build the vocabulary set once so each membership test is O(1)
	    # instead of a scan over the whole word list.
	    vocabulary = set(model.words)

	    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
	    start = time.perf_counter()
	    good = 0
	    n = len(tests)
	    for right, wrong in tests:
	        if wrong in vocabulary:
	            guess = wrong
	        else:
	            guess = model.get_nearest_neighbors(wrong, k=1)[0][1]
	        if guess == right:
	            good += 1
	        elif guess != wrong:
	            # Only report cases where the model actually changed the word.
	            print("Edited {} to {}, but the correct word is: {}".format(wrong, guess, right))
	    dt = time.perf_counter() - start

	    # Guard both divisions: an empty test set or a zero elapsed time must
	    # not raise ZeroDivisionError.
	    accuracy = good / n if n else 0.0
	    rate = n / dt if dt > 0 else float('inf')
	    print('{:.0%} of {} correct at {:.0f} words per second '
	          .format(accuracy, n, rate))

	# NOTE(review): this looks like a whitespace-mangled duplicate of the
	# Testset defined earlier in this file; indentation is reconstructed.
	def Testset(lines):
	    """Parse 'right: wrong1 wrong2' lines into (right, wrong) pairs.

	    Example: ``'access: acess acces'`` yields
	    ``[('access', 'acess'), ('access', 'acces')]``.

	    Args:
	        lines: Iterable of strings (e.g. an open text file), one entry per
	            line.

	    Returns:
	        A list of (right, wrong) tuples, in input order.

	    Raises:
	        ValueError: If a non-blank line does not contain exactly one ':'.
	    """
	    pairs = []
	    for lineno, line in enumerate(lines, start=1):
	        if not line.strip():
	            # Tolerate blank lines (e.g. a trailing newline at end of
	            # file) instead of failing on tuple unpacking.
	            continue
	        fields = line.split(':')
	        if len(fields) != 2:
	            raise ValueError(
	                "line {}: expected 'right: wrong1 wrong2', got {!r}"
	                .format(lineno, line))
	        right, wrongs = fields
	        # Strip stray whitespace around the correct word so it compares
	        # equal to model output.
	        right = right.strip()
	        pairs.extend((right, wrong) for wrong in wrongs.split())
	    return pairs

	# NOTE(review): this looks like a whitespace-mangled duplicate of the
	# script entry point earlier in this file; indentation is reconstructed.
	if __name__ == "__main__":
	    # Train an unsupervised fastText model on the local corpus 'big.txt'.
	    model = fasttext.train_unsupervised(
	        'big.txt', wordNgrams=1, minn=1, maxn=2, dim=300, ws=8, neg=8,
	        epoch=4, minCount=1, bucket=900000)

	    # Evaluate on both test sets. The context manager closes each file
	    # once it has been parsed instead of leaking the handle.
	    for testset_path in ('spell-testset1.txt', 'spell-testset2.txt'):
	        with open(testset_path) as testset_file:
	            spelltest(Testset(testset_file), model)