#http://www.diveintopython.net/html_processing/extracting_data.html
#https://docs.python.org/2/library/robotparser.html
import robotparser
import urllib
import csv
from urlparse import urlparse


def get_page(url):
    """Fetch a URL and return its HTML source."""
    sock = urllib.urlopen(url)
    htmlSource = sock.read()
    sock.close()
    return htmlSource
#https://www.udacity.com/course/viewer#!/c-cs101/l-48727569/e-48718374/m-48719196
def get_next_target(page):
    """Find the next href attribute in page; return (url, position after its closing quote)."""
    start_link = page.find('href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
def get_all_links(page):
    """Collect every outgoing link on the page and the dictionary terms it contains."""
    links = []
    # Scan for terms first, before the loop below consumes the page string.
    terms = extract_dictionary_terms(page)
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links, terms
def get_title(page):
    start_title = page.find('<title>')
    if start_title == -1:
        return 'None'
    end_title = page.find('</title>')
    title = page[start_title + 7:end_title]
    return title
def crawl_web(seed, max_depth, max_pages):
    """Breadth-first crawl from seed, honoring robots.txt, bounded by max_depth and max_pages."""
    tocrawl = [seed]
    crawled = []
    next_depth = []
    title = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page_url = tocrawl.pop()
        url_split = urlparse(page_url)
        rp = robotparser.RobotFileParser()
        # robots.txt must be fetched with a full URL, scheme included
        rp.set_url(url_split.scheme + "://" + url_split.netloc + "/robots.txt")
        rp.read()
        if (page_url not in crawled) and rp.can_fetch("*", page_url):
            links, terms = get_all_links(get_page(page_url))
            union(next_depth, links)
            # keep the frontier within the page budget
            while len(tocrawl) > max_pages:
                tocrawl.pop()
            #union(title, get_title(get_page(page_url)))
            write_to_file(page_url, terms)
            crawled.append(page_url)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
def extract_dictionary_terms(page):
    """Return the dictionary terms that actually appear in the page."""
    found = []
    for term in import_dictionary():
        if page.find(term) >= 0:
            found.append(term)
    return found
def import_dictionary():
    """Load the search terms, one per line, with trailing newlines stripped."""
    with open("dictionary.txt", "r") as dictionaryfile:
        dictionary = [line.strip() for line in dictionaryfile]
    return dictionary
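# dictionary.txt is assumed to hold one search term per line; the sample terms
# below are illustrative only:
#   python
#   crawler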
def write_to_file(url, terms):
    """Append one quoted CSV row per crawled page: the URL followed by its matched terms."""
    with open("crawled_file.csv", "ab") as crawled_file:
        wr = csv.writer(crawled_file, quoting=csv.QUOTE_ALL)
        wr.writerow([url] + terms)
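# Example of a resulting crawled_file.csv row (URL and terms are illustrative):
# "http://example.com/","python","crawler"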
#print "Hello, welcome to my web crawler."
seed = raw_input("Please enter the seed website URL: ")
#tocrawl, seed =
crawl_web(seed,500,500)
#while tocrawl > 0:
# crawl_web(seed,4,10)