Skip to content

Instantly share code, notes, and snippets.

@achavez
Last active August 29, 2015 14:00
Show Gist options
  • Save achavez/11300485 to your computer and use it in GitHub Desktop.
Save achavez/11300485 to your computer and use it in GitHub Desktop.
Scraper for schiefferschool.tcu.edu
from bs4 import BeautifulSoup
import requests, csv
# Settings
# NOTE: this script targets Python 2 (binary-mode CSV file, byte strings
# produced by .encode()). print is always called with one parenthesized
# argument, which behaves the same under Python 2 and parses under Python 3.
base_url = 'http://www.schiefferschool.tcu.edu/'
story_list_url = 'more_headlines.asp'
# hrefs (relative to base_url) that must not be scraped
blacklist = ['931.htm', '1021.asp']

# Get a list of articles to scrape from the More Headlines page
news_page = requests.get(base_url + story_list_url)
if news_page.status_code != 200:
    error_message = str(news_page.status_code) + ' error downloading story list'
    raise Exception(error_message)
encoding = news_page.encoding
story_list = BeautifulSoup(news_page.text, from_encoding=encoding)
# Only links inside the main content container are treated as stories
story_list = story_list.find("div", id="GeneralContent")
print("Downloaded story list from " + base_url + story_list_url)

# Loop over the list of URLs from the story page and save them in a CSV.
# Each row is: url, headline, body, date, dateline, img_file
with open('output.csv', 'wb') as outputfile:
    csvwriter = csv.writer(outputfile)
    counter = 0
    for link in story_list.find_all('a'):
        href = link.get('href')
        # Anchors without an href cannot be fetched; skip them together
        # with the explicitly blacklisted pages.
        if href is None or href in blacklist:
            continue

        url = base_url + href
        print("Extracting text from " + url)
        response = requests.get(url)
        encoding = response.encoding
        if response.status_code != 200:
            # A failed story download aborts the whole run
            error_message = str(response.status_code) + ' error downloading story'
            raise Exception(error_message)

        # Parse the story page
        story = BeautifulSoup(response.text, from_encoding=encoding)
        headline = story.find("h3", class_="hugePurpleHeadline").string.encode('ascii', 'xmlcharrefreplace')
        body = story.find("span", class_="news_body_content").encode('ascii')
        # Strip out wrapper tag: drops the first 33 and last 48 characters of
        # the serialized span (offsets presumably tuned to this site's
        # markup -- confirm before reusing elsewhere).
        body = body[33:len(body) - 48].strip()

        # Extract date and dateline from story meta.
        # Reset both for every story so a page without the meta span yields
        # empty cells instead of a NameError or the previous story's values.
        date = ''
        dateline = ''
        for span in story.find_all("span", class_="news_bodybold"):
            # NOTE(review): relies on the parser nesting the meta span
            # inside a <br> element -- confirm with the parser in use.
            if span.parent.name == 'br':
                meta_lines = span.get_text().strip().split("\n")
                dateline = meta_lines[0]
                date = meta_lines[-1]

        # Get the image and caption
        img_tags = story.select('#GeneralContent table[align="right"] img')
        if img_tags:
            img_src = img_tags[0]['src']
            # Download the image
            img_url = base_url + img_src
            print("Downloading image " + img_url)
            r = requests.get(img_url, stream=True)
            # Local file name is the src minus its first 7 characters
            # (presumably a directory prefix such as "images/" -- confirm).
            img_file = img_src[7:]
            with open(img_file, 'wb') as fd:
                for chunk in r.iter_content(65536):
                    fd.write(chunk)
        else:
            img_file = ''

        # Write the text to a CSV
        csvwriter.writerow([url, headline, body, date, dateline, img_file])
        # Iterate the counter
        counter += 1

print("Extracted " + str(counter) + " stories from " + story_list_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment