@shaneshifflett
Last active December 31, 2015 07:08
medill scraper: fetches a Medill news article page and saves it locally along with the stylesheets, scripts, and images it references.
from BeautifulSoup import BeautifulSoup
from datetime import datetime
import requests
import os

# One output directory per scrape date, e.g. medill-scraper/data/scrapes/12-12-2013/
directory = "medill-scraper/data/scrapes/%s/" % (datetime.now().strftime("%m-%d-%Y"))
if not os.path.exists(directory):
    os.makedirs(directory)

def get_external_files(its, tag):
    """Download the file each tag references (via its href or src attribute),
    mirroring the URL's directory structure under the scrape directory.
    Files already on disk are not re-fetched."""
    for it in its:
        try:
            href = it[tag]
            path = directory + "/".join(href.split("/")[:-1]) + "/"
            filename = href.split("/")[-1]
            if not os.path.exists(path):
                os.makedirs(path)
            if not os.path.isfile(path + filename):
                # Root-relative paths hang off the site root; everything else
                # is treated as relative to the /chicago/ section.
                url = "http://news.medill.northwestern.edu" + href
                if href[0] != '/':
                    url = "http://news.medill.northwestern.edu/chicago/" + href
                print "%s %s %s" % (href, url, href[0] == '/')
                content = requests.get(url).content
                with open(path + filename, "wb") as f:
                    f.write(content)
        except Exception as e:
            # Tags missing the attribute (e.g. inline <script> blocks) land here.
            print e

url = 'http://news.medill.northwestern.edu/chicago/news.aspx?id=75169&terms=medill'
# Pull the numeric article id out of the query string to name the saved page
# (the trailing "&terms=..." parameter is dropped).
article_id = url.split("?")[1].split("id=")[1].split("&")[0]
page = requests.get(url).content
soup = BeautifulSoup(page)

# Fetch every external asset the page references.
get_external_files(soup.findAll("link"), 'href')
get_external_files(soup.findAll("script"), 'src')
get_external_files(soup.findAll("img"), 'src')

# Save the article HTML itself, rewriting image paths to be relative so the
# archived copy renders from the locally mirrored asset tree.
with open(directory + article_id + ".html", "wb") as f:
    page = page.replace("/uploadedImages", "uploadedImages")
    f.write(page)
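
The script targets Python 2 and the old BeautifulSoup 3 package. On a current Python 3 setup the same fetch-and-parse step would use the bs4 package instead; a minimal sketch, assuming beautifulsoup4 and requests are installed:

# Python 3 / bs4 equivalent of the fetch-and-parse step (a sketch, not part
# of the original gist).
from bs4 import BeautifulSoup
import requests

url = 'http://news.medill.northwestern.edu/chicago/news.aspx?id=75169&terms=medill'
page = requests.get(url).content
soup = BeautifulSoup(page, "html.parser")  # bs4 requires an explicit parser
for tag in soup.find_all("link"):          # findAll becomes find_all in bs4
    href = tag.get("href")                 # .get() returns None instead of raising KeyError

The rest ports over mechanically: the print statements become print() calls, and since requests returns bytes under Python 3, the final replace on page would need bytes literals (b"/uploadedImages").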