Skip to content

Instantly share code, notes, and snippets.

@danlamanna
Created June 5, 2012 18:51
Show Gist options
  • Save danlamanna/2876920 to your computer and use it in GitHub Desktop.
Save danlamanna/2876920 to your computer and use it in GitHub Desktop.
Scrapy crawling, pyEnchant spell checking.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from spellchecker.items import SpellcheckItem
from BeautifulSoup import BeautifulSoup
import re,urllib,enchant
from string import strip
class SpellcheckSpider(CrawlSpider):
    """Crawl intellisites.com and report the misspelled words found on each page."""

    name = "intellisites"
    allowed_domains = ["intellisites.com"]
    start_urls = [
        "http://intellisites.com/",
    ]

    # Words that are accepted without consulting the dictionary.
    allowed_words = ["Facebook", "LinkedIn", "BoldChat", "EXEControl"]

    # Follow every link matching the pattern and hand each page to parse_item.
    rules = (
        Rule(SgmlLinkExtractor(allow=('http://intellisites.com/')), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Spell-check the visible text of one crawled page.

        Registered as the callback of the crawl rule above. Returns a
        SpellcheckItem whose "url" is the page URL and whose
        "invalid_words" lists, in order of first appearance and without
        duplicates, every legit word (see isLegit) that is neither in
        allowed_words nor accepted by the en_US dictionary.
        """
        # Setup the spellcheck item
        item = SpellcheckItem()
        item["url"] = response.url
        item["invalid_words"] = []

        # Scrapy has already downloaded this page; parse that body rather
        # than fetching the same URL a second time.
        soup = BeautifulSoup(response.body)
        visible_texts = [node for node in soup.findAll(text=True) if self.visible(node)]

        checker = enchant.Dict("en_US")
        seen = set()  # every word already examined on this page
        for text in visible_texts:
            # split() without an argument breaks on any run of whitespace, so
            # words separated by newlines or tabs are checked too and never
            # carry stray whitespace.
            for word in text.split():
                if word in seen or not self.isLegit(word):
                    continue
                seen.add(word)
                if word in self.allowed_words or checker.check(word):
                    continue
                item["invalid_words"].append(word)
        return item

    def isLegit(self, possible_word):
        """Return True if possible_word is worth spell-checking.

        A word qualifies when it is longer than one character and is made
        up only of regex word characters (letters, digits, underscore).
        """
        if len(possible_word) <= 1:
            return False
        return re.match(r"^\w+$", possible_word) is not None

    def visible(self, element):
        """Return True if the text node element is shown to the reader.

        Text whose parent is a style, script, head or title tag (or the
        document itself) is rejected, as is anything that renders as an
        HTML comment.
        """
        if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
            return False
        # DOTALL lets "." cross newlines so multi-line comments are rejected too.
        if re.match('<!--.*-->', str(element), re.DOTALL):
            return False
        return True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment