Skip to content

Instantly share code, notes, and snippets.

@arne-cl
Created March 31, 2015 12:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arne-cl/9a674a1faf260bd19e56 to your computer and use it in GitHub Desktop.
Save arne-cl/9a674a1faf260bd19e56 to your computer and use it in GitHub Desktop.
simple scraper for blog.fefe.de
import os
import datetime
from datetime import timedelta
import requests
def create_dir(path):
    """
    Make sure a directory exists at ``path``, creating missing parents.

    An already existing directory is accepted silently. If the path is
    occupied by something that is not a directory, or the directory cannot
    be created for any other reason, the underlying ``OSError`` is
    propagated. A permission problem additionally prints a hint to stderr
    before the error is re-raised.

    adapted from http://stackoverflow.com/a/600612

    Parameters
    ----------
    path : str
        location of the directory to create

    Raises
    ------
    OSError
        if the directory does not exist afterwards
    """
    import errno
    import sys

    try:
        os.makedirs(path)
    except OSError as error:
        # EEXIST is harmless only when the existing entry really is a directory
        if error.errno == errno.EEXIST and os.path.isdir(path):
            return
        if error.errno == errno.EACCES:
            sys.stderr.write("Cannot create [%s]! Check Permissions" % path)
        # every remaining case (including EEXIST on a non-directory) is fatal
        raise
def get_previous_month(date):
    """
    Return ``(year, month)`` of the calendar month preceding ``date``.

    cf. http://stackoverflow.com/a/9725093/564514

    Parameters
    ----------
    date : datetime.date
        any object exposing ``year`` and ``month`` attributes

    Returns
    -------
    (int, int)
        year and month of the previous month
    """
    month_start = datetime.date(date.year, date.month, 1)
    # one day before the 1st is always the last day of the preceding month,
    # which also takes care of the January -> December year rollover
    previous = month_start - timedelta(days=1)
    return (previous.year, previous.month)
def get_fefe_daterange(start_date=START_DATE, end_date=None):
    """
    Yield ``(year, month)`` tuples from the month of ``end_date`` back to
    the month of ``start_date`` (both inclusive), newest month first.

    Parameters
    ----------
    start_date : datetime.date
        earliest date of interest; its month is the last one yielded.
        Defaults to the module-level ``START_DATE`` (expected to be
        defined elsewhere in this module).
    end_date : datetime.date or None
        latest date of interest; its month is the first one yielded.
        ``None`` (the default) means "today", resolved on each call.

    Yields
    ------
    (int, int)
        year and month, walking backwards one month at a time. Nothing is
        yielded if ``end_date`` lies before ``start_date``.
    """
    if end_date is None:
        # resolved per call; a ``datetime.date.today()`` default would be
        # evaluated only once, when the function is defined
        end_date = datetime.date.today()
    if end_date < start_date:
        return

    # Compare at month granularity: the cursor is reset to the 1st of each
    # month below, so comparing full dates would drop the month of
    # ``start_date`` whenever ``start_date`` is not itself the 1st.
    first_month = (start_date.year, start_date.month)
    current_date = end_date
    while (current_date.year, current_date.month) >= first_month:
        yield (current_date.year, current_date.month)
        year_of_prev_month, prev_month = get_previous_month(current_date)
        current_date = datetime.date(year=year_of_prev_month, month=prev_month, day=1)
def scrape_fefes_blog(start_date=START_DATE, end_date=None, output_dir=os.curdir):
    """
    Download the monthly archive pages of blog.fefe.de and store each one
    as ``<YYYYMM>.html`` inside ``output_dir``.

    Parameters
    ----------
    start_date : datetime.date
        earliest date to fetch. Defaults to the module-level
        ``START_DATE`` (expected to be defined elsewhere in this module).
    end_date : datetime.date or None
        latest date to fetch. ``None`` (the default) means "today",
        resolved on each call.
    output_dir : str
        directory the HTML files are written to; created if missing.
        Defaults to the current directory.

    Raises
    ------
    OSError
        if ``output_dir`` cannot be created or a file cannot be written
    """
    if end_date is None:
        # resolved per call; a ``datetime.date.today()`` default would be
        # evaluated only once, when the function is defined
        end_date = datetime.date.today()

    create_dir(output_dir)
    for (year, month) in get_fefe_daterange(start_date, end_date):
        month_id = '{}{month:02d}'.format(year, month=month)
        url = 'http://blog.fefe.de/?mon=' + month_id
        month_page = requests.get(url)
        # ``content`` is the raw response body as bytes, so the file must be
        # opened in binary mode; text mode raises TypeError on Python 3.
        with open(os.path.join(output_dir, month_id + '.html'), 'wb') as output_file:
            output_file.write(month_page.content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment