Last active
December 10, 2015 07:58
-
-
Save peczenyj/4404742 to your computer and use it in GitHub Desktop.
Usage: gawk -v word=some_word_to_verify -f spelling.awk [ big.txt [ big2.txt ... ]]
Gawk version with 15 lines -- 04/13/2008
Author: tiago (dot) peczenyj (at) gmail (dot) com
Based on : http://norvig.com/spell-correct.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: gawk -v word=some_word_to_verify -f spelling.awk [ big.txt [ big2.txt ... ]]
# Gawk version with 15 lines -- 04/13/2008
# Author: tiago (dot) peczenyj (at) gmail (dot) com
# about.me/peczenyj
# Based on : http://norvig.com/spell-correct.html
# edits(w, max, candidates, list)
#   Generate every string that is one edit (delete, transpose, replace,
#   insert) away from w and record the ones that are known words.
#
#   w          - word to mutate
#   max        - number of characters of w to work on (callers pass length(w))
#   candidates - array, filled in place: candidates[s] = NWORDS[s] for every
#                generated string s that is a key of the global NWORDS
#   list       - array, filled in place: list[s] counts how many times string
#                s was generated; it is a plain local when the caller omits it
#   i, j       - locals
#
#   Reads the global arrays alpha (letters used for replaces/inserts) and
#   NWORDS (word => count).
function edits(w, max, candidates, list,    i, j) {
    # awk strings are 1-based: substr(w, 1, i) is exactly the first i
    # characters of w (the empty string when i == 0). A start index of 0 is
    # outside the defined range and awk implementations disagree on how many
    # characters it returns, so it is avoided here.

    # deletes: drop character i+1
    for (i = 0; i < max; ++i)
        ++list[substr(w, 1, i) substr(w, i + 2)]

    # transposes: swap characters i+1 and i+2
    for (i = 0; i < max - 1; ++i)
        ++list[substr(w, 1, i) substr(w, i + 2, 1) substr(w, i + 1, 1) substr(w, i + 3)]

    # replaces: overwrite character i+1 with each letter
    for (i = 0; i < max; ++i)
        for (j in alpha)
            ++list[substr(w, 1, i) alpha[j] substr(w, i + 2)]

    # inserts: put each letter after the first i characters (i == max appends)
    for (i = 0; i <= max; ++i)
        for (j in alpha)
            ++list[substr(w, 1, i) alpha[j] substr(w, i + 1)]

    # keep only the generated strings that are known words, with their counts
    for (i in list)
        if (i in NWORDS)
            candidates[i] = NWORDS[i]
}
# correct(word)
#   Return the most frequent known word within one edit of word; if there is
#   none, the most frequent known word within two edits; if there is still
#   none, word itself. A word that is already a key of NWORDS is returned
#   unchanged.
#
#   word - the (lowercased) word to correct
#   candidates, i, list, max, best, found - locals
#
#   candidates[s] holds the NWORDS count of candidate s (filled by edits()).
#   Ties on the count are broken by taking the alphabetically smallest
#   candidate, so the result does not depend on array traversal order.
function correct(word,    candidates, i, list, max, best, found) {
    if (word in NWORDS)
        return word

    # distance-1 candidates; list receives every generated string
    edits(word, length(word), candidates, list)

    found = 0
    for (i in candidates) {
        found = 1
        break
    }

    # nothing known at distance 1: expand every distance-1 string once more
    if (!found)
        for (i in list)
            edits(i, length(i), candidates)

    # choose by frequency (the stored value), not by the spelling of the key
    max = 0
    best = word
    for (i in candidates)
        if (candidates[i] > max || (candidates[i] == max && i < best)) {
            max = candidates[i]
            best = i
        }
    return best
}
# Start-up: choose the default corpus, build the alphabet table and configure
# record splitting so that every record is one run of letters.
BEGIN {
    # no file operand on the command line: read big.txt (http://norvig.com/big.txt)
    if (ARGC == 1)
        ARGV[ARGC++] = "big.txt"

    # alpha[1..26] = "a" .. "z"
    x = "abcdefghijklmnopqrstuvwxyz"
    while (++i <= length(x))
        alpha[i] = substr(x, i, 1)

    # any run of characters outside the alphabet separates records;
    # IGNORECASE receives the same non-empty string, which turns on
    # case-insensitive matching
    RS = "[^" x "]+"
    IGNORECASE = RS
}
# Count each word in lowercase: NWORDS[word] = number of occurrences.
# Input that starts with a separator produces an empty first record; skip it
# so that the empty string never becomes a dictionary word.
$1 != "" { ++NWORDS[tolower($1)] }
# Report the result for the word given with -v word=...
# NWORDS keys are lowercased when they are stored, so the membership test has
# to use the lowercased word too; otherwise a capitalised known word would be
# treated as misspelled. A known word is echoed back exactly as it was typed.
END {
    if (tolower(word) in NWORDS)
        print word
    else
        print "correct(" word ")=> " correct(tolower(word))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
$ time gawk -v word=reciet -f spelling.awk
correct(reciet)=> recite
real 0m4.450s
user 0m4.351s
sys 0m0.027s