Simple web crawler
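The gist below is a toy crawler in Python 2. Starting from a seed URL, it fetches each page with urllib, scans the raw HTML for <a href="..."> links by plain string searching (no HTML parser), resolves relative paths against the last URL fetched, and keeps following links it has not seen before, printing each URL as it goes.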
import urllib

# This is just a very simple web crawler; it cannot actually do
# everything a real web crawler does :)

# Get the next link on the page. Here `page` is the HTML source of
# the page, passed in as a string. Returns the URL of the first
# <a href="..."> link and the position of its closing quote, or
# (None, 0) when there are no more links.
def get_next_link(page):
    start_pos = page.find("<a href=")
    if start_pos == -1:
        return None, 0
    start_pos = page.find('"', start_pos)
    end_pos = page.find('"', start_pos + 1)
    url = page[start_pos + 1:end_pos]
    return url, end_pos
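# For example (an illustration, not part of the original gist):
#   get_next_link('<a href="http://x.example">x</a>')
# returns ('http://x.example', 25), where 25 is the index of the
# closing quote.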
# Get all links on one page.
def get_all_link(page):
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if url is not None:
            links.append(url)
            page = page[end_pos:]   # keep scanning from just past the last link found
        else:
            break
    return links
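# For example (an illustration, not part of the original gist):
#   get_all_link('<a href="http://a.example">a</a> <a href="/b">b</a>')
# returns ['http://a.example', '/b'].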
# Union two lists: append to `a` every element of `b` not already in it.
def union(a, b):
    for element in b:
        if element not in a:
            a.append(element)
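# For example: union(a, [2, 3]) with a = [1, 2] leaves a as [1, 2, 3].
# Note that union modifies `a` in place rather than returning a new list.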
# Crawl from a seed URL, visiting every page that is directly or
# indirectly linked from the seed page.
def crawl(seed):
    tocrawl = [seed]    # pages still waiting to be crawled
    crawled = []        # pages already crawled
    last_url = ''       # last absolute URL fetched, used to resolve relative links
    while len(tocrawl) > 0:
        page = tocrawl.pop()
        if page not in crawled:
            url = str(page)
            if url.startswith('/'):
                url = last_url + url    # turn a relative path into an absolute one
            if url.find('http') != -1:
                last_url = url          # remember the base for later relative links
                try:
                    f = urllib.urlopen(url)
                    page = f.read()
                    f.close()           # close inside the try: f is unbound if urlopen fails
                except IOError:
                    page = ''
                union(tocrawl, get_all_link(page))
                crawled.append(url)
                print url

crawl('http://www.google.com')
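For readers on Python 3, here is a minimal sketch of the same crawl loop (an adaptation, not part of the original gist): urllib.urlopen became urllib.request.urlopen, responses must be decoded from bytes to str, and print is a function. It reuses get_all_link and union from above, which run unchanged on Python 3; the name crawl_py3 and the limit parameter (a safety cap so a run against the live web terminates) are additions for this sketch.

# A minimal Python 3 sketch of the same crawl loop (adaptation, not from
# the original gist). Reuses get_all_link and union defined above.
import urllib.request

def crawl_py3(seed, limit=20):          # `limit` is an added safety cap
    tocrawl = [seed]
    crawled = []
    last_url = ''
    while tocrawl and len(crawled) < limit:
        page = tocrawl.pop()
        if page in crawled:
            continue
        url = page
        if url.startswith('/'):
            url = last_url + url        # same naive relative-link handling
        if 'http' not in url:
            continue
        last_url = url
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('utf-8', errors='replace')
        except Exception:               # network errors, malformed URLs, etc.
            html = ''
        union(tocrawl, get_all_link(html))
        crawled.append(url)
        print(url)

crawl_py3('http://www.google.com')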