# shaneshifflett/medill_scraper.py

## medill_scraper.py
from BeautifulSoup import BeautifulSoup
from datetime import datetime
import itertools
import requests
import os


# Everything this run downloads is stored beneath a folder named after
# today's date (MM-DD-YYYY).
_date_stamp = datetime.now().strftime("%m-%d-%Y")
directory = "medill-scraper/data/scrapes/%s/" % _date_stamp
# Create the folder, including any missing parents, if it is not there yet.
if not os.path.exists(directory):
    os.makedirs(directory)


def get_external_files(its, tag):
    """Download the file referenced by each element into the scrape directory.

    For every element in *its*, the value of attribute *tag* is read as a
    link. The folder part of the link is recreated beneath the module-level
    ``directory`` and the linked file is fetched and written there, unless a
    file of that name already exists.

    Args:
        its: Iterable of elements that support ``element[tag]`` lookup (this
            script passes the results of BeautifulSoup ``findAll``).
        tag: Name of the attribute holding the link, e.g. ``'href'``/``'src'``.

    Returns:
        None. A failure on one element (missing attribute, network error,
        HTTP error status, file error) is printed and the loop continues.
    """
    site_root = "http://news.medill.northwestern.edu"
    for it in its:
        try:
            href = it[tag]
            # Split once; everything before the last "/" is the folder part.
            parts = href.split("/")
            filename = parts[-1]
            if not filename:
                # Empty link, or one ending in "/": no file name to save under.
                continue
            path = directory + "/".join(parts[:-1]) + "/"
            if not os.path.exists(path):
                os.makedirs(path)
            if not os.path.isfile(path + filename):
                if href.startswith(("http://", "https://")):
                    # Already a full URL; prefixing the site root would corrupt it.
                    url = href
                elif href.startswith("//"):
                    # Protocol-relative link: keep its host, supply the scheme.
                    url = "http:" + href
                elif href[0] == '/':
                    url = site_root + href
                else:
                    # Other links are resolved against the /chicago/ section.
                    url = site_root + "/chicago/" + href
                print("%s %s %s" % (href, url, href[0] == '/'))
                response = requests.get(url)
                # Do not store an HTTP error page as if it were the asset.
                response.raise_for_status()
                with open(path + filename, "wb") as f:
                    f.write(response.content)
        except Exception as e:
            # Best-effort mirroring: report the problem and go on to the
            # next element instead of aborting the whole scrape.
            print(e)


# Article to archive.
url = 'http://news.medill.northwestern.edu/chicago/news.aspx?id=75169&terms=medill'
# Keep only the value of the ``id`` parameter: cut at the next "&" so the
# page is saved as "75169.html" rather than "75169&terms=medill.html".
article_id = url.split("?")[1].split("id=")[1].split("&")[0]
page = requests.get(url).content
soup = BeautifulSoup(page)
# Fetch the stylesheets, scripts and images the page refers to.
get_external_files(soup.findAll("link"), 'href')
get_external_files(soup.findAll("script"), 'src')
get_external_files(soup.findAll("img"), 'src')


# Drop the leading slash from "/uploadedImages" so those references resolve
# relative to the saved page, next to which the files were downloaded.
# Done before opening the output so a failure here leaves no empty file.
page = page.replace("/uploadedImages", "uploadedImages")
with open(directory + article_id + ".html", "wb") as f:
    f.write(page)
	from BeautifulSoup import BeautifulSoup
	from datetime import datetime
	import itertools
	import requests
	import os


	# NOTE(review): this repeats the directory setup found earlier in this file.
	# Everything this run downloads is stored beneath a folder named after
	# today's date (MM-DD-YYYY).
	directory = "medill-scraper/data/scrapes/%s/" % (datetime.now().strftime("%m-%d-%Y"))
	# Create the folder, including any missing parents, if it is not there yet.
	if not os.path.exists(directory):
	    os.makedirs(directory)


	def get_external_files(its, tag):
	    """Download the file referenced by each element into the scrape directory.

	    NOTE(review): this repeats the ``get_external_files`` definition that
	    appears earlier in this file.

	    For every element in *its*, the value of attribute *tag* is read as a
	    link. The folder part of the link is recreated beneath the module-level
	    ``directory`` and the linked file is fetched and written there, unless a
	    file of that name already exists.

	    Args:
	        its: Iterable of elements that support ``element[tag]`` lookup (this
	            script passes the results of BeautifulSoup ``findAll``).
	        tag: Name of the attribute holding the link, e.g. ``'href'``/``'src'``.

	    Returns:
	        None. A failure on one element (missing attribute, network error,
	        HTTP error status, file error) is printed and the loop continues.
	    """
	    site_root = "http://news.medill.northwestern.edu"
	    for it in its:
	        try:
	            href = it[tag]
	            # Split once; everything before the last "/" is the folder part.
	            parts = href.split("/")
	            filename = parts[-1]
	            if not filename:
	                # Empty link, or one ending in "/": no file name to save under.
	                continue
	            path = directory + "/".join(parts[:-1]) + "/"
	            if not os.path.exists(path):
	                os.makedirs(path)
	            if not os.path.isfile(path + filename):
	                if href.startswith(("http://", "https://")):
	                    # Already a full URL; prefixing the site root would corrupt it.
	                    url = href
	                elif href.startswith("//"):
	                    # Protocol-relative link: keep its host, supply the scheme.
	                    url = "http:" + href
	                elif href[0] == '/':
	                    url = site_root + href
	                else:
	                    # Other links are resolved against the /chicago/ section.
	                    url = site_root + "/chicago/" + href
	                print("%s %s %s" % (href, url, href[0] == '/'))
	                response = requests.get(url)
	                # Do not store an HTTP error page as if it were the asset.
	                response.raise_for_status()
	                with open(path + filename, "wb") as f:
	                    f.write(response.content)
	        except Exception as e:
	            # Best-effort mirroring: report the problem and go on to the
	            # next element instead of aborting the whole scrape.
	            print(e)


	# NOTE(review): this repeats the scrape steps that already appear earlier
	# in this file, so the page is fetched and saved a second time.
	# Article to archive.
	url = 'http://news.medill.northwestern.edu/chicago/news.aspx?id=75169&terms=medill'
	# Keep only the value of the ``id`` parameter: cut at the next "&" so the
	# page is saved as "75169.html" rather than "75169&terms=medill.html".
	article_id = url.split("?")[1].split("id=")[1].split("&")[0]
	page = requests.get(url).content
	soup = BeautifulSoup(page)
	# Fetch the stylesheets, scripts and images the page refers to.
	get_external_files(soup.findAll("link"), 'href')
	get_external_files(soup.findAll("script"), 'src')
	get_external_files(soup.findAll("img"), 'src')


	# Drop the leading slash from "/uploadedImages" so those references resolve
	# relative to the saved page, next to which the files were downloaded.
	# Done before opening the output so a failure here leaves no empty file.
	page = page.replace("/uploadedImages", "uploadedImages")
	with open(directory + article_id + ".html", "wb") as f:
	    f.write(page)