Created June 16, 2012 03:55
Scraper hack. Code I am writing to (hopefully) scrape course syllabi. A mix of new code, and a previous project. Currently broken.
import urllib.robotparser
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup, Comment
import time
import re
import string
from collections import deque
import os
import fileinput
# hardcoded variable..
year = '2001'

# define a page object and the data we care about
class Page:
    def __init__(self, id, title, url, snippet, visibletext, unfilteredscrape):
        self.id = id
        self.title = title
        self.url = url
        self.snippet = snippet
        self.visibletext = visibletext
        self.unfilteredscrape = unfilteredscrape
def RequestResponse(url):
    try:
        response = urllib.request.urlopen(url)
        return response
    except urllib.error.URLError as err:
        return False
# read the robots.txt file for this host
def SetRobotsChecker(robot_url):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robot_url)
    rp.read()
    return rp

# this should actually pull the robots.txt file for each url.. it doesn't
# (see the per-host sketch after OKToCrawl below)
robotsUrl = 'http://www.davelester.org/robots.txt'
rp = SetRobotsChecker(robotsUrl)
# returns True if it is ok to fetch this url, else False
def OKToCrawl(rp, url):
    return rp.can_fetch("*", url)
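# Sketch, not part of the original gist: one possible way to pull robots.txt per
# host, as the comment above notes is still missing. Assumes absolute http(s)
# urls; 'robots_cache' and 'OKToCrawlForHost' are names introduced here, and
# network errors while reading robots.txt are left unhandled, like the rest of
# the script. Usage might look like: if OKToCrawlForHost(link): page = crawl(link, link_id)
robots_cache = {}
def OKToCrawlForHost(url):
    parts = urllib.parse.urlparse(url)
    host = parts.scheme + '://' + parts.netloc
    if host not in robots_cache:
        robots_cache[host] = SetRobotsChecker(host + '/robots.txt')
    return OKToCrawl(robots_cache[host], url)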
# we'll use this as a filter to retrieve visible text from an HTML page
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # bs4 comment nodes stringify without the <!-- --> markers, so test the node type
    elif isinstance(element, Comment):
        return False
    return True
def crawl(url, link_id):
    # pausing for 1 second between each web page access
    time.sleep(1)
    html = RequestResponse(url)
    if not html:
        return False
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.body.findAll(text=True)
    unfilteredscrape = soup
    # filter visible texts (materialized to a list so it can be joined and stored)
    visibletext = list(filter(visible, texts))
    snippet_content = visibletext
    # extract the title of the web page
    title = soup.title.text
    # short snippet of text from the page (characters 140-280)
    content = ''.join(snippet_content)
    content = content.replace('\n', '')
    snippet = content[140:280] + ' ...'
    # create object for the page
    page = Page(link_id, title, url, snippet, visibletext, unfilteredscrape)
    return page
""" | |
This is where the magic happens | |
""" | |
# create list to store pages successfully crawled | |
crawledUrls = [] | |
# create queue.. eventually, we may want to add urls to the queue
# that are included on pages, and not in our initial list
# (see the enqueue_links sketch below)
crawlQueue = deque()
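# Sketch, not part of the original gist: one way the queue could eventually grow
# with links found on crawled pages, as the comment above anticipates.
# 'enqueue_links' is a name introduced here; it resolves relative hrefs against
# the page url before queueing. It isn't called anywhere yet, but could be
# invoked after a successful crawl as: enqueue_links(page.unfilteredscrape, link, crawlQueue)
def enqueue_links(soup, base_url, queue):
    for anchor in soup.find_all('a', href=True):
        absolute = urllib.parse.urljoin(base_url, anchor['href'])
        if absolute not in queue:
            queue.append(absolute)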
# look for file extensions in urls, parse only html, php, htm, shtml (sorry, not clean urls!)
# then add each of these fancy urls of ours to the queue
for url in fileinput.input('sampleurls.txt'):
    # strip the trailing newline and lowercase the url for the extension check
    url = url.strip()
    regular_url = url.lower()
    if RequestResponse(regular_url):
        if regular_url.endswith(('.html', '.shtml', '.php', '.htm')):
            crawlQueue.append(url)

print('A list of URLs to crawl has been read from sampleurls.txt. URLs that are successfully crawled will be printed below.')
# set starting link ID
link_id = 0
# open pageobjects file, which we'll write to for each page object
fpages = open('crawledpages', 'w')
curpath = os.path.abspath(os.curdir)
# make sure the pagetext output directory exists before we write into it
os.makedirs(curpath + '/pagetext', exist_ok=True)
for link in set(crawlQueue):
    # turn the url into a filesystem-safe name for the pagetext file
    link_path = link.replace(".", "").replace("/", "_")
    link_path = link_path[7:]
    # open visiblepagetext file
    # visibletext = open(curpath+'/visibletext/'+link_path+'.txt', 'w')
    # crawl that url!
    page = crawl(link, link_id)
    if page:
        crawledUrls.append(link)
        # print the url
        print(link)
        # add to link_id counter
        link_id += 1
        # write page object as new line to file
        s = str(page.id) + '\t' + str(page.title) + '\t' + str(page.url) + '\t' + str(page.snippet) + '\n'
        fpages.write(s)
        # write pagetext to new file
        pagetext = open(curpath + '/pagetext/' + link_path + '.txt', 'w')
        pagetext.write(str(page.unfilteredscrape))
        pagetext.close()
        # write visiblepagetext to new file
        # visibletext.write(str(page.visibletext))
        # visibletext.close()

print('Congratulations! The crawl is completed.')
# close the file handler for page
fpages.close()