@jcxia43 · Created April 26, 2012 11:34
Simple web crawler
import urllib.request

# This is just a very simple web crawler; it cannot do everything
# a real web crawler does :)

# Get the next link on the page. Here `page` is the HTML source of the
# page, as a string. Returns (url, end_pos), or (None, 0) if no link is left.
def get_next_link(page):
    start_pos = page.find("<a href=")
    if start_pos == -1:
        return None, 0
    start_pos = page.find('"', start_pos)
    end_pos = page.find('"', start_pos + 1)
    url = page[start_pos + 1:end_pos]
    return url, end_pos

# Get all links on one page.
def get_all_link(page):
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if url is not None:
            links.append(url)
            page = page[end_pos:]  # the NEW page starts where the last link ended
        else:
            break
    return links

# Union two lists: append to a every element of b that is not already in a.
def union(a, b):
    for element in b:
        if element not in a:
            a.append(element)

# Crawl a seed URL and collect every link that is directly or indirectly
# connected to the seed page.
def crawl(seed):
    tocrawl = [seed]  # pages yet to be crawled
    crawled = []      # pages already crawled
    last_url = ''
    while len(tocrawl) > 0:
        page = tocrawl.pop()
        if page not in crawled:
            url = str(page)
            if url.startswith('/'):
                url = last_url + url  # turn a relative path into an absolute one
            if 'http' in url:
                last_url = page
            try:
                with urllib.request.urlopen(url) as response:
                    page = response.read().decode('utf-8', errors='replace')
            except Exception:
                page = ''
            union(tocrawl, get_all_link(page))
            crawled.append(last_url)
            print(url)

crawl('http://www.google.com')
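
The string scanning in get_next_link only finds double-quoted hrefs that appear immediately after "<a href=". As a rough sketch of a more robust alternative (not part of the original gist), the standard-library html.parser module can pull the href out of every real <a> tag; the LinkParser class and the sample markup below are illustrative only:

from html.parser import HTMLParser

class LinkParser(HTMLParser):
    # Collect the href attribute of every <a> tag fed to the parser.
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)

parser = LinkParser()
parser.feed('<p><a class="nav" href="/about">About</a> <a href="http://example.com">Ext</a></p>')
print(parser.links)  # ['/about', 'http://example.com']

Swapping something like this in for get_all_link would leave the rest of the crawl loop unchanged.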