@davelester
Created June 16, 2012 03:55
Scraper hack. Code I am writing to (hopefully) scrape course syllabi. A mix of new code and a previous project. Currently broken.
import urllib.robotparser
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup, Comment
import time
import re
import string
from collections import deque
import os
import fileinput
# hardcoded variable..
year = '2001'
# define a page object and the data we care about
class Page:
    def __init__(self, id, title, url, snippet, visibletext, unfilteredscrape):
        self.id = id
        self.title = title
        self.url = url
        self.snippet = snippet
        self.visibletext = visibletext
        self.unfilteredscrape = unfilteredscrape
def RequestResponse(url):
    try:
        response = urllib.request.urlopen(url)
        return response
    except urllib.error.URLError:
        return False
# read the robots.txt file for this host
def SetRobotsChecker(robot_url):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robot_url)
    rp.read()
    return rp
# this should actually pull the robots.txt file for each url's host, but it doesn't yet (see the per-host sketch after OKToCrawl below)
robotsUrl = 'http://www.davelester.org/robots.txt'
rp = SetRobotsChecker(robotsUrl)
# returns True if it is ok to fetch this url, else False
def OKToCrawl(rp, url):
    return rp.can_fetch("*", url)
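# A minimal sketch (not in the original gist) of what per-host robots.txt handling
# could look like: build each robots.txt url from the crawl url and cache one
# RobotFileParser per host. The names robots_cache and OKToCrawlPerHost are
# assumptions, and nothing below calls this yet.
robots_cache = {}
def OKToCrawlPerHost(url):
    parts = urllib.parse.urlsplit(url)
    host = parts.scheme + '://' + parts.netloc
    if host not in robots_cache:
        robots_cache[host] = SetRobotsChecker(host + '/robots.txt')
    return robots_cache[host].can_fetch("*", url)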
# we'll use this as a filter to retrieve visible text from an HTML page
def visible(element):
    # skip text nodes inside non-visible elements and html comment nodes
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif isinstance(element, Comment):
        return False
    return True
def crawl(url, link_id):
    # pause for 1 second between each web page access
    time.sleep(1)
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.body.findAll(text=True)
    unfilteredscrape = soup
    # filter visible texts (materialize as a list so it can be reused after the join below)
    visibletext = list(filter(visible, texts))
    snippet_content = visibletext
    # extract the title of the web page
    title = soup.title.text
    # short snippet of text from the page (140 characters, skipping the first 140)
    content = ''.join(snippet_content)
    content = content.replace('\n', '')
    snippet = content[140:280] + ' ...'
    # create object for the page
    page = Page(link_id, title, url, snippet, visibletext, unfilteredscrape)
    return page
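# Example usage of crawl() (hypothetical URL, kept commented out so it doesn't run here):
# page = crawl('http://www.example.edu/history101/syllabus.html', 0)
# print(page.title, page.snippet)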
"""
This is where the magic happens
"""
# create list to store pages successfully crawled
crawledUrls = []
# create queue.. eventually, we may want to add urls to the queue
# that are included on pages, and not in our initial list
crawlQueue = deque()
# look for file extensions in urls, parse only html, php, htm, shtml (sorry, not clean urls!)
# then add each of these fancy urls of ours to the queue
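# assumption: sampleurls.txt holds one absolute url per line, e.g.
# http://www.example.edu/history101/syllabus.html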
for url in fileinput.input('sampleurls.txt'):
    url = url.strip()
    if not url:
        continue
    # lowercase copy, used only for the extension check so the original casing is kept
    regular_url = url.lower()
    # only queue urls ending in .html, .shtml, .php, or .htm that actually respond
    if regular_url.endswith(('.html', '.shtml', '.php', '.htm')) and RequestResponse(url):
        crawlQueue.append(url)
print('A list of URLs to crawl has been read from sampleurls.txt. URLs that are successfully crawled will be printed below.')
# set starting link ID
link_id = 0
# open pageobjects file, which we'll write to for each page object
fpages = open('crawledpages', 'w')
curpath = os.path.abspath(os.curdir)
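# added so the open() calls below don't fail if the output directory is missing
os.makedirs(curpath + '/pagetext', exist_ok=True)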
for link in set(crawlQueue):
    # build a filesystem-safe filename for this url's pagetext file
    link_path = link.replace(".", "")
    link_path = link_path.replace("/", "_")
    link_path = link_path[7:]
    # open pagetext file
    pagetext = open(curpath + '/pagetext/' + link_path + '.txt', 'w')
    # open visiblepagetext file
    # visibletext = open(curpath + '/visibletext/' + link_path + '.txt', 'w')
    # crawl that url! if anything goes wrong fetching or parsing it, skip the page
    try:
        page = crawl(link, link_id)
    except (urllib.error.URLError, AttributeError, ValueError):
        page = False
    if page:
        crawledUrls.append(link)
        # print the url
        print(link)
        # add to link_id counter
        link_id += 1
        # write page object as new line to file
        s = str(page.id) + '\t' + str(page.title) + '\t' + str(page.url) + '\t' + str(page.snippet) + '\n'
        fpages.write(s)
        # write pagetext to new file
        pagetext.write(str(page.unfilteredscrape))
    # close the pagetext handle whether or not the crawl succeeded
    pagetext.close()
    # write visiblepagetext to new file
    # visibletext.write(str(page.visibletext))
    # visibletext.close()
print('Congratulations! The crawl is completed.')
# close the file handler for page
fpages.close()