# danlamanna/imperial_spider.py

## imperial_spider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from spellchecker.items import SpellcheckItem

from BeautifulSoup import BeautifulSoup

import re,urllib,enchant
from string import strip


class SpellcheckSpider(CrawlSpider):
    """Crawl intellisites.com and collect words rejected by an en_US dictionary.

    Every crawled page produces one ``SpellcheckItem`` holding the page URL
    and the distinct words on that page that failed the spellcheck.
    """

    name = "intellisites"
    allowed_domains = ["intellisites.com"]
    start_urls = [
        "http://intellisites.com/",
    ]

    # Words accepted as-is even though the dictionary may not know them.
    allowed_words = ["Facebook", "LinkedIn", "BoldChat", "EXEControl"]

    # Parent element names whose text nodes are excluded from spellchecking.
    _HIDDEN_PARENTS = frozenset(['style', 'script', '[document]', 'head', 'title'])
    # A candidate word consists solely of \w characters.
    _WORD_RE = re.compile(r"^\w+$")
    # DOTALL so that comments spanning several lines are recognised as well.
    _COMMENT_RE = re.compile(r'<!--.*-->', re.DOTALL)

    rules = (
        # The trailing comma makes ``allow`` a real one-element tuple.
        Rule(SgmlLinkExtractor(allow=('http://intellisites.com/',)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Spellcheck the visible text of one crawled page.

        This is the callback of the crawl rule, so it runs for every URL
        crawled.

        response -- the downloaded page; its ``url`` and ``body`` are read.

        Returns a ``SpellcheckItem`` whose ``url`` is the page URL and whose
        ``invalid_words`` lists each rejected word once, in the order the
        words were first encountered.
        """
        item = SpellcheckItem()
        item["url"] = response.url

        # Parse the body that was already downloaded for this response rather
        # than issuing a second, blocking HTTP request for the same URL.
        soup = BeautifulSoup(response.body)
        visible_texts = [node for node in soup.findAll(text=True) if self.visible(node)]

        d = enchant.Dict("en_US")
        invalid_words = []
        examined = set()

        for text in visible_texts:
            # split() without an argument breaks on any run of whitespace, so
            # words separated by newlines or tabs are checked individually.
            for word in text.split():
                if word in examined or not self.isLegit(word):
                    continue
                # Remember every examined word so repeats are neither
                # re-checked nor reported twice.
                examined.add(word)

                if word in self.allowed_words or d.check(word):
                    continue
                invalid_words.append(word)

        item["invalid_words"] = invalid_words
        return item

    def isLegit(self, possible_word):
        """Return True when ``possible_word`` is worth spellchecking.

        The word must be longer than one character and consist only of word
        characters (letters, digits and underscore).
        """
        if len(possible_word) <= 1:
            return False
        return self._WORD_RE.match(possible_word) is not None

    def visible(self, element):
        """Return True when the text node ``element`` should be spellchecked.

        Text whose parent is one of the excluded elements (style, script,
        head, title or the document itself) is rejected, and so is any node
        that renders as an HTML comment.
        """
        if element.parent.name in self._HIDDEN_PARENTS:
            return False
        return self._COMMENT_RE.match(str(element)) is None
	from scrapy.contrib.spiders import CrawlSpider, Rule
	from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

	from spellchecker.items import SpellcheckItem

	from BeautifulSoup import BeautifulSoup

	import re,urllib,enchant
	from string import strip


	class SpellcheckSpider(CrawlSpider):
	    """Crawl intellisites.com and collect words rejected by an en_US dictionary.

	    Every crawled page produces one ``SpellcheckItem`` holding the page URL
	    and the distinct words on that page that failed the spellcheck.
	    """

	    # NOTE(review): this class appears to be a pasted duplicate of the
	    # SpellcheckSpider defined earlier in this file -- confirm and keep
	    # only one copy.

	    name = "intellisites"
	    allowed_domains = ["intellisites.com"]
	    start_urls = [
	        "http://intellisites.com/",
	    ]

	    # Words accepted as-is even though the dictionary may not know them.
	    allowed_words = ["Facebook", "LinkedIn", "BoldChat", "EXEControl"]

	    # Parent element names whose text nodes are excluded from spellchecking.
	    _HIDDEN_PARENTS = frozenset(['style', 'script', '[document]', 'head', 'title'])
	    # A candidate word consists solely of \w characters.
	    _WORD_RE = re.compile(r"^\w+$")
	    # DOTALL so that comments spanning several lines are recognised as well.
	    _COMMENT_RE = re.compile(r'<!--.*-->', re.DOTALL)

	    rules = (
	        # The trailing comma makes ``allow`` a real one-element tuple.
	        Rule(SgmlLinkExtractor(allow=('http://intellisites.com/',)), callback='parse_item', follow=True),
	    )

	    def parse_item(self, response):
	        """Spellcheck the visible text of one crawled page.

	        This is the callback of the crawl rule, so it runs for every URL
	        crawled.

	        response -- the downloaded page; its ``url`` and ``body`` are read.

	        Returns a ``SpellcheckItem`` whose ``url`` is the page URL and whose
	        ``invalid_words`` lists each rejected word once, in the order the
	        words were first encountered.
	        """
	        item = SpellcheckItem()
	        item["url"] = response.url

	        # Parse the body that was already downloaded for this response rather
	        # than issuing a second, blocking HTTP request for the same URL.
	        soup = BeautifulSoup(response.body)
	        visible_texts = [node for node in soup.findAll(text=True) if self.visible(node)]

	        d = enchant.Dict("en_US")
	        invalid_words = []
	        examined = set()

	        for text in visible_texts:
	            # split() without an argument breaks on any run of whitespace, so
	            # words separated by newlines or tabs are checked individually.
	            for word in text.split():
	                if word in examined or not self.isLegit(word):
	                    continue
	                # Remember every examined word so repeats are neither
	                # re-checked nor reported twice.
	                examined.add(word)

	                if word in self.allowed_words or d.check(word):
	                    continue
	                invalid_words.append(word)

	        item["invalid_words"] = invalid_words
	        return item

	    def isLegit(self, possible_word):
	        """Return True when ``possible_word`` is worth spellchecking.

	        The word must be longer than one character and consist only of word
	        characters (letters, digits and underscore).
	        """
	        if len(possible_word) <= 1:
	            return False
	        return self._WORD_RE.match(possible_word) is not None

	    def visible(self, element):
	        """Return True when the text node ``element`` should be spellchecked.

	        Text whose parent is one of the excluded elements (style, script,
	        head, title or the document itself) is rejected, and so is any node
	        that renders as an HTML comment.
	        """
	        if element.parent.name in self._HIDDEN_PARENTS:
	            return False
	        return self._COMMENT_RE.match(str(element)) is None