Created
January 1, 2013 14:10
-
-
Save peczenyj/4427793 to your computer and use it in GitHub Desktop.
Usage: gawk -f spelling.awk words.txt big.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: gawk -f spelling.awk file_with_words_one_per_line.txt [ big.txt [ big2.txt ... ]]
# Gawk version with 15 lines -- 04/13/2008
# Author: tiago (dot) peczenyj (at) gmail (dot) com
# about.me/peczenyj
# Based on: http://norvig.com/spell-correct.html
# edits(w, max, candidates, list)
#   Generates every string one edit away from w (deletion, adjacent
#   transposition, single-letter replacement, single-letter insertion).
#
#   w          - the word to mutate
#   max        - number of characters of w to consider (callers pass length(w))
#   candidates - array, updated in place: candidates[s] = NWORDS[s] for every
#                generated string s that is a key of the global NWORDS
#   list       - array, updated in place: list[s] is incremented once for each
#                way s was generated; may be omitted by the caller
#   i, j       - locals (awk idiom: extra parameters act as local variables)
#
#   Reads the globals alpha (the letters to substitute/insert) and NWORDS.
function edits(w, max, candidates, list,    i, j) {
    # Prefixes are taken with substr(w, 1, i): awk strings are 1-based, and a
    # start of 0 is not portable (some awks, gawk included, can return one
    # character fewer, which silently corrupts every edit below).

    # Deletion: drop the character at position i+1.
    for (i = 0; i < max; ++i)
        ++list[substr(w, 1, i) substr(w, i + 2)]

    # Transposition: swap the characters at positions i+1 and i+2.
    for (i = 0; i < max - 1; ++i)
        ++list[substr(w, 1, i) substr(w, i + 2, 1) substr(w, i + 1, 1) substr(w, i + 3)]

    # Replacement: overwrite the character at position i+1 with each letter.
    for (i = 0; i < max; ++i)
        for (j in alpha)
            ++list[substr(w, 1, i) alpha[j] substr(w, i + 2)]

    # Insertion: put each letter after the first i characters (i == max appends).
    for (i = 0; i <= max; ++i)
        for (j in alpha)
            ++list[substr(w, 1, i) alpha[j] substr(w, i + 1)]

    # Keep only the generated strings that occur in the corpus.
    for (i in list)
        if (i in NWORDS)
            candidates[i] = NWORDS[i]
}
# correct(word)
#   Returns the most frequent known word within one edit of `word`, or, when
#   none exists, within two edits. Returns `word` unchanged if no candidate
#   is found at either distance.
#
#   word - the (lower-cased) word to correct
#   candidates, i, list, best, bestcount - locals (awk idiom)
#
#   Frequency is the count stored by edits() in candidates[], which it copies
#   from the global NWORDS. Ties are broken by taking the alphabetically
#   first candidate so the result does not depend on array traversal order.
function correct(word,    candidates, i, list, best, bestcount) {
    # Distance 1.
    edits(word, length(word), candidates, list)

    # Distance 2, only when distance 1 produced nothing: re-edit every
    # distance-1 string. The 4th argument is omitted on purpose so that each
    # call gets a fresh scratch list and `list` is not modified while iterated.
    if (length(candidates) == 0)
        for (i in list)
            edits(i, length(i), candidates)

    # Choose by corpus frequency. (Sorting the indices with asorti() would
    # pick the alphabetically last word instead of the most common one.)
    best = word
    bestcount = 0
    for (i in candidates) {
        if (candidates[i] > bestcount ||
            (candidates[i] == bestcount && (i "") < (best ""))) {
            best = i
            bestcount = candidates[i]
        }
    }
    return best
}
# Set-up: choose the default corpus, build the alphabet table used by edits(),
# and make every maximal run of letters one input record.
BEGIN {
    # ARGV[0] is the program name and ARGV[1] the word list, so ARGC == 2
    # means no corpus file was supplied: fall back to big.txt.
    if (ARGC == 2)
        ARGV[ARGC++] = "big.txt"    # http://norvig.com/big.txt

    # alpha[1..26] = "a".."z"
    x = "abcdefghijklmnopqrstuvwxyz"
    for (i = 1; i <= length(x); ++i)
        alpha[i] = substr(x, i, 1)

    # Records are separated by any run of non-letters. IGNORECASE makes the
    # separator regex case-insensitive, so upper-case letters stay in words.
    RS = "[^" x "]+"
    IGNORECASE = 1
}
# Per-record rule: tally each lower-cased word. Records from the first file
# (NR == FNR) are the words to check; records from every later file are the
# corpus frequencies in NWORDS.
{
    # With a regex RS, input that starts with a separator yields an empty
    # first record; skip it so "" is never counted as a word.
    if ($1 == "")
        next

    if (NR == FNR)
        ++words[tolower($1)]
    else
        ++NWORDS[tolower($1)]
}
# Report: a word that occurs in the corpus is printed as-is; any other word
# is printed together with the correction returned by correct().
END {
    for (word in words) {
        if (word in NWORDS)
            print word
        else
            print "correct(" word ")=> " correct(tolower(word))
    }
}
$ gawk -f spelling.awk words.txt big.txt
correct(guidlines)=> guideline
correct(reciet)=> recite
correct(colate)=> violate
correct(embaras)=> embers
correct(orentated)=> orentated
correct(generataed)=> generate
correct(unequivocaly)=> unequivocal
correct(economtric)=> economic
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
$ cat words.txt
reciet
economtric
embaras
colate
orentated
unequivocaly
generataed
guidlines