# Blizzardo1/crawler.py

## crawler.py
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import traceback

class Crawler(HTMLParser):
    """HTML parser that collects the resolved targets of ``<a href>`` links.

    Attributes:
        links: URLs gathered since the last call to ``getLinks`` (or since
            construction when ``feed`` is used directly).
        baseUrl: URL that relative ``href`` values are resolved against.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialise here so feed() is safe even before getLinks() has run.
        self.links = []
        self.baseUrl = ""

    def handle_starttag(self, tag, attribs):
        """Record the resolved ``href`` of every anchor start tag."""
        if tag != 'a':
            return
        for key, val in attribs:
            # A valueless or empty href resolves to the current page itself,
            # so there is nothing new to record.
            if key == 'href' and val:
                self.links.append(parse.urljoin(self.baseUrl, val))

    def getLinks(self, url):
        """Fetch ``url`` and return its text together with the links it contains.

        Args:
            url: Address of the page to download.

        Returns:
            ``(html, links)`` when the response media type is ``text/html``,
            otherwise ``("", [])``.

        Raises:
            Whatever ``urlopen`` raises when the page cannot be fetched.
        """
        self.links = []
        self.baseUrl = url
        # The context manager closes the connection on every exit path.
        with urlopen(url) as response:
            headers = response.headers
            # Compare the media type only: servers usually append parameters,
            # e.g. "text/html; charset=utf-8".
            if headers.get_content_type() != 'text/html':
                return "", []
            charset = headers.get_content_charset() or "utf-8"
            htmlBytes = response.read()
        try:
            htmlString = htmlBytes.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset Python does not know; use UTF-8.
            htmlString = htmlBytes.decode("utf-8", errors="replace")
        self.feed(htmlString)
        return htmlString, self.links

def crawl(url, word, maxPages):
    """Crawl breadth-first from ``url`` looking for a page containing ``word``.

    Pages are fetched one at a time in discovery order. Every link found on
    a page is queued unless it has been queued before, so duplicate links and
    cycles do not use up the page budget. Progress and the final outcome are
    printed to stdout; a page that fails is reported with a traceback and the
    crawl moves on to the next one.

    Args:
        url: Starting page.
        word: Text to look for in each page's HTML source.
        maxPages: Maximum number of pages to attempt.

    Returns:
        The URL of the first page whose source contains ``word``, or ``None``
        when it was not found within the page budget.
    """
    from collections import deque  # local import keeps the block self-contained

    pagesToVisit = deque([url])
    seen = {url}
    numberVisited = 0
    foundAt = None
    while numberVisited < maxPages and pagesToVisit and foundAt is None:
        numberVisited += 1
        url = pagesToVisit.popleft()
        print(numberVisited, "Visiting:", url)
        try:
            data, links = Crawler().getLinks(url)
            hit = word in data
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still stops the crawl.
            print(" **Failure!** ")
            traceback.print_exc()
            continue
        if hit:
            foundAt = url
        for link in links:
            if link not in seen:
                seen.add(link)
                pagesToVisit.append(link)
        print(" **Success!** ")
    if foundAt is not None:
        print("The word \"", word, "\" was found at \"", foundAt, "\"")
    else:
        print("Could not find the word \"", word, "\" anywhere! D:")
    return foundAt
	from html.parser import HTMLParser
	from urllib.request import urlopen
	from urllib import parse
	import traceback

	class Crawler(HTMLParser):
		"""HTML parser that collects the resolved targets of ``<a href>`` links.

		Attributes:
			links: URLs gathered since the last call to ``getLinks`` (or since
				construction when ``feed`` is used directly).
			baseUrl: URL that relative ``href`` values are resolved against.
		"""

		def __init__(self, *args, **kwargs):
			super().__init__(*args, **kwargs)
			# Initialise here so feed() is safe even before getLinks() has run.
			self.links = []
			self.baseUrl = ""

		def handle_starttag(self, tag, attribs):
			"""Record the resolved ``href`` of every anchor start tag."""
			if tag != 'a':
				return
			for key, val in attribs:
				# A valueless or empty href resolves to the current page itself,
				# so there is nothing new to record.
				if key == 'href' and val:
					self.links.append(parse.urljoin(self.baseUrl, val))

		def getLinks(self, url):
			"""Fetch ``url`` and return its text together with the links it contains.

			Args:
				url: Address of the page to download.

			Returns:
				``(html, links)`` when the response media type is ``text/html``,
				otherwise ``("", [])``.

			Raises:
				Whatever ``urlopen`` raises when the page cannot be fetched.
			"""
			self.links = []
			self.baseUrl = url
			# The context manager closes the connection on every exit path.
			with urlopen(url) as response:
				headers = response.headers
				# Compare the media type only: servers usually append parameters,
				# e.g. "text/html; charset=utf-8".
				if headers.get_content_type() != 'text/html':
					return "", []
				charset = headers.get_content_charset() or "utf-8"
				htmlBytes = response.read()
			try:
				htmlString = htmlBytes.decode(charset, errors="replace")
			except LookupError:
				# The server declared a charset Python does not know; use UTF-8.
				htmlString = htmlBytes.decode("utf-8", errors="replace")
			self.feed(htmlString)
			return htmlString, self.links

	def crawl(url, word, maxPages):
		"""Crawl breadth-first from ``url`` looking for a page containing ``word``.

		Pages are fetched one at a time in discovery order. Every link found on
		a page is queued unless it has been queued before, so duplicate links and
		cycles do not use up the page budget. Progress and the final outcome are
		printed to stdout; a page that fails is reported with a traceback and the
		crawl moves on to the next one.

		Args:
			url: Starting page.
			word: Text to look for in each page's HTML source.
			maxPages: Maximum number of pages to attempt.

		Returns:
			The URL of the first page whose source contains ``word``, or ``None``
			when it was not found within the page budget.
		"""
		from collections import deque  # local import keeps the block self-contained

		pagesToVisit = deque([url])
		seen = {url}
		numberVisited = 0
		foundAt = None
		while numberVisited < maxPages and pagesToVisit and foundAt is None:
			numberVisited += 1
			url = pagesToVisit.popleft()
			print(numberVisited, "Visiting:", url)
			try:
				data, links = Crawler().getLinks(url)
				hit = word in data
			except Exception:
				# Exception rather than a bare except, so Ctrl-C still stops the crawl.
				print(" Failure! ")
				traceback.print_exc()
				continue
			if hit:
				foundAt = url
			for link in links:
				if link not in seen:
					seen.add(link)
					pagesToVisit.append(link)
			print(" Success! ")
		if foundAt is not None:
			print("The word \"", word, "\" was found at \"", foundAt, "\"")
		else:
			print("Could not find the word \"", word, "\" anywhere! D:")
		return foundAt