Skip to content

Instantly share code, notes, and snippets.

@Blizzardo1
Created January 11, 2017 20:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Blizzardo1/57cf16eadf5467f66c53b7a00735c90c to your computer and use it in GitHub Desktop.
Save Blizzardo1/57cf16eadf5467f66c53b7a00735c90c to your computer and use it in GitHub Desktop.
Made with a little help because I don't know Python for shit...
import traceback
from collections import deque
from html.parser import HTMLParser
from urllib import parse
from urllib.request import urlopen
class Crawler(HTMLParser):
    """HTML parser that collects the absolute URL of every ``<a href>``.

    Attributes:
        links: URLs gathered from the document parsed most recently.
        baseUrl: URL against which relative ``href`` values are resolved.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Defined up front so feed() is usable even before getLinks() runs.
        self.links = []
        self.baseUrl = ""

    def handle_starttag(self, tag, attribs):
        """Record the resolved target of each ``href`` on an ``<a>`` tag."""
        if tag != 'a':
            return
        for key, val in attribs:
            if key == 'href':
                self.links.append(parse.urljoin(self.baseUrl, val))

    def getLinks(self, url):
        """Fetch *url* and return its text together with the links it contains.

        Args:
            url: Address of the page to download.

        Returns:
            A ``(html, links)`` tuple. ``html`` is the decoded document and
            ``links`` the absolute URLs of its anchors. When the response is
            not ``text/html`` the result is ``("", [])``.

        Raises:
            urllib.error.URLError: If the page cannot be retrieved.
        """
        # Drop any half-parsed input left over from an earlier call.
        self.reset()
        self.links = []
        self.baseUrl = url
        with urlopen(url) as response:
            headers = response.headers
            # Compare the media type only: real servers usually send
            # "text/html; charset=...", which an exact string match rejects.
            if headers.get_content_type() != 'text/html':
                return "", []
            charset = headers.get_content_charset() or "utf-8"
            htmlBytes = response.read()
        try:
            # Undecodable bytes are replaced so one bad character does not
            # cost us the whole page.
            htmlString = htmlBytes.decode(charset, errors="replace")
        except LookupError:
            # The server advertised an encoding Python does not know.
            htmlString = htmlBytes.decode("utf-8", errors="replace")
        self.feed(htmlString)
        return htmlString, self.links
def crawl(url, word, maxPages):
    """Breadth-first crawl from *url* until *word* is found on a page.

    Progress and the final outcome are printed to standard output.

    Args:
        url: Page at which the crawl starts.
        word: Text to look for in each downloaded page.
        maxPages: Upper bound on the number of pages to attempt.
    """
    pagesToVisit = deque([url])
    # Every URL ever queued, so that link cycles and repeated links do not
    # consume the page budget.
    queued = {url}
    numberVisited = 0
    foundWord = False
    while numberVisited < maxPages and pagesToVisit and not foundWord:
        numberVisited += 1
        url = pagesToVisit.popleft()
        try:
            print(numberVisited, "Visiting:", url)
            data, links = Crawler().getLinks(url)
        except Exception:
            # Best effort: report the broken page and carry on. Catching
            # Exception rather than everything keeps Ctrl-C working.
            print(" **Failure!** ")
            traceback.print_exc()
            continue
        if word in data:
            foundWord = True
        for link in links:
            if link not in queued:
                queued.add(link)
                pagesToVisit.append(link)
        print(" **Success!** ")
    if foundWord:
        print("The word \"", word, "\" was found at \"", url, "\"")
    else:
        print("Could not find the word \"", word, "\" anywhere! D:")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment