Scraper for schiefferschool.tcu.edu: pulls each story linked from the More Headlines page and writes its URL, headline, body, date, dateline and image filename to output.csv.
from bs4 import BeautifulSoup
import requests, csv

# Settings
base_url = 'http://www.schiefferschool.tcu.edu/'
story_list_url = 'more_headlines.asp'
blacklist = ['931.htm', '1021.asp']

# Get a list of articles to scrape from the More Headlines page
news_page = requests.get(base_url + story_list_url)
if news_page.status_code == 200:
    encoding = news_page.encoding
    story_list = BeautifulSoup(news_page.text, from_encoding=encoding)
    story_list = story_list.find("div", id="GeneralContent")
    print "Downloaded story list from " + base_url + story_list_url
else:
    error_message = str(news_page.status_code) + ' error downloading story list'
    raise Exception(error_message)

# Loop over the list of URLs from the story page and save them in a CSV
with open('output.csv', 'wb') as outputfile:
    csvwriter = csv.writer(outputfile)
    counter = 0
    for link in story_list.find_all('a'):
        if link.get('href') in blacklist:
            continue
        url = 'http://www.schiefferschool.tcu.edu/' + link.get('href')
        print "Extracting text from " + url
        story = requests.get(url)
        encoding = story.encoding
        if story.status_code == 200:
            # Parse the story page
            story = BeautifulSoup(story.text, from_encoding=encoding)
            headline = story.find("h3", class_="hugePurpleHeadline").string.encode('ascii', 'xmlcharrefreplace')
            body = story.find("span", class_="news_body_content").encode('ascii')
            body = body[33:len(body)-48].strip()  # Strip out wrapper tag
            # Extract date and dateline from story meta
            for span in story.find_all("span", class_="news_bodybold"):
                if span.parent.name == 'br':
                    span = span.get_text().strip().split("\n")
                    dateline = span[0]
                    date = span[len(span) - 1]
            # Get the image and caption
            img = story.select('#GeneralContent table[align="right"] img')
            if img:
                img = img[0]['src']
                # Download the image
                img_url = base_url + img
                print "Downloading image " + img_url
                r = requests.get(img_url, stream=True)
                img_file = img[7:]
                with open(img_file, 'wb') as fd:
                    for chunk in r.iter_content(65536):
                        fd.write(chunk)
            else:
                img_file = ''
            # Write the text to a CSV
            csvwriter.writerow([url, headline, body, date, dateline, img_file])
            # Iterate the counter
            counter = counter + 1
        else:
            error_message = str(story.status_code) + ' error downloading story'
            raise Exception(error_message)

print "Extracted " + str(counter) + " stories from " + story_list_url