Created
October 4, 2012 18:23
-
-
Save haiderfaraz/3835436 to your computer and use it in GitHub Desktop.
Ruby based Spell Check Bot
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Spell-check bot: crawls a site starting at `domain` and, for every HTML page
# it reaches, prints each word Aspell reports as misspelled together with the
# first suggested replacement (when one exists).
require 'rubygems'
require 'spider'
require 'raspell'
require 'hpricot'

# The URL below is used by the crawler. The bot spell checks the page it leads
# to, and also the sub-pages whose URL begins with this prefix.
domain = 'http://192.168.56.90/index.php/en/admissions'

# Selector of the element whose text is spell checked on each page. The target
# site keeps its body copy in a div with the id "middle-column".
content_selector = '#middle-column'

# Choose the dictionary to run the spell check against.
# speller = Aspell.new('en_GB')
speller = Aspell.new('en_US')
# speller = Aspell.new('en')

Spider.start_at(domain) do |s|
  # Only follow URLs under the starting prefix. A literal prefix comparison is
  # used instead of a regex built from the URL, so characters such as "." and
  # "?" in the URL are not interpreted as regex metacharacters.
  s.add_url_check do |a_url|
    a_url.to_s.start_with?(domain)
  end

  s.on :success do |a_url, resp, _prior_url|
    # The Content-Type header may be missing; to_s turns nil into "" so such
    # responses are skipped rather than raising NoMethodError.
    unless resp['content-type'].to_s.include?('text/html')
      puts "Skipping #{a_url}"
      next
    end

    puts "On page #{a_url}"
    document = Hpricot(resp.body)

    # Drop elements whose content is not visible page text.
    %w[head script link meta style].each do |tag|
      document.search(tag).remove
    end

    # Hpricot is an HTML parser. It is used here to restrict the spell check
    # to the section of the document selected by `content_selector`.
    # To check the whole page instead, use `document.inner_text`.
    words = (document / content_selector).inner_text.strip.split(/\s+/)

    speller.list_misspelled(words).each do |mistake|
      suggestion = speller.suggest(mistake).first
      # Only offer a correction when Aspell actually has one.
      hint = suggestion ? " perhaps you meant \"#{suggestion}\"" : ''
      puts " * Found mistake \"#{mistake}\"#{hint}"
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment