Created June 16, 2012 03:55
Scraper hack. Code I am writing to (hopefully) scrape course syllabi. A mix of new code, and a previous project. Currently broken.
import urllib.robotparser
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup, Comment
import time
import re
import string
from collections import deque
import os
import fileinput
# hardcoded variable..
year = '2001'

# define a page object and the data we care about
class Page:
    def __init__(self, id, title, url, snippet, visibletext, unfilteredscrape):
        self.id = id
        self.title = title
        self.url = url
        self.snippet = snippet
        self.visibletext = visibletext
        self.unfilteredscrape = unfilteredscrape
def RequestResponse(url):
    try:
        response = urllib.request.urlopen(url)
        return response
    except urllib.error.URLError as err:
        return False
# read the robots.txt file for this host
def SetRobotsChecker(robot_url):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robot_url)
    rp.read()
    return rp

# this should actually pull the robots.txt file for each url.. it doesn't
# (see the per-host sketch after OKToCrawl below)
robotsUrl = 'http://www.davelester.org/robots.txt'
rp = SetRobotsChecker(robotsUrl)
# returns True if it is ok to fetch this url, else False
def OKToCrawl(rp, url):
    return rp.can_fetch("*", url)
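# Sketch, not part of the original gist: one possible way to pull robots.txt per
# host, as the comment above notes is still missing. Assumes absolute http(s)
# urls; 'robots_cache' and 'OKToCrawlForHost' are names introduced here, and
# network errors while reading robots.txt are left unhandled, like the rest of
# the script. Usage might look like: if OKToCrawlForHost(link): page = crawl(link, link_id)
robots_cache = {}
def OKToCrawlForHost(url):
    parts = urllib.parse.urlparse(url)
    host = parts.scheme + '://' + parts.netloc
    if host not in robots_cache:
        robots_cache[host] = SetRobotsChecker(host + '/robots.txt')
    return OKToCrawl(robots_cache[host], url)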
# we'll use this as a filter to retrieve visible text from an HTML page
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # bs4 comment nodes stringify without the <!-- --> markers, so test the node type
    elif isinstance(element, Comment):
        return False
    return True
def crawl(url, link_id):
    # pausing for 1 second between each web page access
    time.sleep(1)
    html = RequestResponse(url)
    if not html:
        return False
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.body.findAll(text=True)
    unfilteredscrape = soup
    # filter visible texts (materialized to a list so it can be joined and stored)
    visibletext = list(filter(visible, texts))
    snippet_content = visibletext
    # extract the title of the web page
    title = soup.title.text
    # short snippet of text from the page (characters 140-280)
    content = ''.join(snippet_content)
    content = content.replace('\n', '')
    snippet = content[140:280] + ' ...'
    # create object for the page
    page = Page(link_id, title, url, snippet, visibletext, unfilteredscrape)
    return page
""" | |
This is where the magic happens | |
""" | |
# create list to store pages successfully crawled | |
crawledUrls = [] | |
# create queue.. eventually, we may want to add urls to the queue
# that are included on pages, and not in our initial list
# (see the enqueue_links sketch below)
crawlQueue = deque()
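# Sketch, not part of the original gist: one way the queue could eventually grow
# with links found on crawled pages, as the comment above anticipates.
# 'enqueue_links' is a name introduced here; it resolves relative hrefs against
# the page url before queueing. It isn't called anywhere yet, but could be
# invoked after a successful crawl as: enqueue_links(page.unfilteredscrape, link, crawlQueue)
def enqueue_links(soup, base_url, queue):
    for anchor in soup.find_all('a', href=True):
        absolute = urllib.parse.urljoin(base_url, anchor['href'])
        if absolute not in queue:
            queue.append(absolute)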
# look for file extensions in urls, parse only html, php, htm, shtml (sorry, not clean urls!)
# then add each of these fancy urls of ours to the queue
for url in fileinput.input('sampleurls.txt'):
    # strip the trailing newline and lowercase the url for the extension check
    url = url.strip()
    regular_url = url.lower()
    if RequestResponse(regular_url):
        if regular_url.endswith(('.html', '.shtml', '.php', '.htm')):
            crawlQueue.append(url)

print('A list of URLs to crawl has been read from sampleurls.txt. URLs that are successfully crawled will be printed below.')
# set starting link ID
link_id = 0
# open pageobjects file, which we'll write to for each page object
fpages = open('crawledpages', 'w')
curpath = os.path.abspath(os.curdir)
# make sure the pagetext output directory exists before we write into it
os.makedirs(curpath + '/pagetext', exist_ok=True)
for link in set(crawlQueue):
    # turn the url into a filesystem-safe name for the pagetext file
    link_path = link.replace(".", "").replace("/", "_")
    link_path = link_path[7:]
    # open visiblepagetext file
    # visibletext = open(curpath+'/visibletext/'+link_path+'.txt', 'w')
    # crawl that url!
    page = crawl(link, link_id)
    if page:
        crawledUrls.append(link)
        # print the url
        print(link)
        # add to link_id counter
        link_id += 1
        # write page object as new line to file
        s = str(page.id) + '\t' + str(page.title) + '\t' + str(page.url) + '\t' + str(page.snippet) + '\n'
        fpages.write(s)
        # write pagetext to new file
        pagetext = open(curpath + '/pagetext/' + link_path + '.txt', 'w')
        pagetext.write(str(page.unfilteredscrape))
        pagetext.close()
        # write visiblepagetext to new file
        # visibletext.write(str(page.visibletext))
        # visibletext.close()

print('Congratulations! The crawl is completed.')
# close the file handler for page
fpages.close()