Skip to content

Instantly share code, notes, and snippets.

@puneetraipuria
Last active August 29, 2015 14:07
Show Gist options
  • Save puneetraipuria/58ca4c5b3c83423bff34 to your computer and use it in GitHub Desktop.
Save puneetraipuria/58ca4c5b3c83423bff34 to your computer and use it in GitHub Desktop.
spider algorithm
# Page spider algorithm
import urlparse
import urllib
from bs4 import BeautifulSoup
# Crawl outward from `url`, following only links that stay under it.
# Pages are taken from the front of `urls` and newly found links are added
# to the back, so pages are fetched in the order they were discovered.
url = "http://github.com"
urls = [url]         # queue of pages still to fetch
visited = [url]      # every URL ever queued, in discovery order
seen = set(visited)  # same contents as `visited`; gives O(1) membership tests

while urls:
    current = urls.pop(0)
    try:
        htmltext = urllib.urlopen(current).read()
    except Exception:
        # Best-effort crawl: report the page that could not be fetched and
        # move on. Without the `continue` the code below would parse the
        # previous page's HTML again (or hit a NameError on the first page).
        # `Exception` rather than a bare `except` so Ctrl-C still stops it.
        print(current)
        continue

    # Progress indicator: number of pages still waiting in the queue.
    print(len(urls))

    soup = BeautifulSoup(htmltext)
    for tag in soup.findAll('a', href=True):
        # Resolve relative links against the page they were found on, not
        # against the crawl root, so nested relative hrefs resolve correctly.
        link = urlparse.urljoin(current, tag['href'])
        # Prefix test (not substring) keeps the crawl on the starting site
        # even when an off-site URL embeds the root URL somewhere inside it.
        if link.startswith(url) and link not in seen:
            seen.add(link)
            urls.append(link)
            visited.append(link)

print(visited)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment