# arne-cl/fefe_scraper.py

## fefe_scraper.py
import os
import datetime
from datetime import timedelta

import requests


def create_dir(path):
    """
    Make sure a directory exists at the given path.

    The directory (including missing parents) is created via
    ``os.makedirs``. If a directory is already present at ``path``, the
    call is a no-op. Every other failure is propagated to the caller; a
    permission failure additionally writes a hint to stderr first.

    adapted from http://stackoverflow.com/a/600612

    Parameters
    ----------
    path : str
        path to the directory to be created

    Raises
    ------
    OSError
        if something that is not a directory occupies ``path``, if access
        is denied, or if creation fails for any other reason
    """
    import errno
    import sys

    try:
        os.makedirs(path)
    except OSError as error:
        # An existing directory is exactly what we wanted; an existing
        # non-directory falls through to the re-raise below.
        if error.errno == errno.EEXIST and os.path.isdir(path):
            return
        if error.errno == errno.EACCES:
            sys.stderr.write("Cannot create [%s]! Check Permissions" % path)
        raise


def get_previous_month(date):
    """
    Return the (year, month) pair of the month preceding the given date.

    The result is found by stepping one day back from the first day of the
    month that ``date`` falls in, so year boundaries are handled by the
    ``datetime`` arithmetic itself.

    cf. http://stackoverflow.com/a/9725093/564514

    Parameters
    ----------
    date : datetime.date
        any object exposing ``year`` and ``month`` attributes

    Returns
    -------
    tuple of (int, int)
        year and month of the previous month
    """
    month_start = datetime.date(date.year, date.month, 1)
    day_before = month_start - timedelta(days=1)
    return (day_before.year, day_before.month)


def get_fefe_daterange(start_date=START_DATE, end_date=datetime.date.today()):
    """
    Yield (year, month) pairs from the month of ``end_date`` back to the
    month of ``start_date``, newest first and both ends included.

    NOTE: the default for ``end_date`` is evaluated once, when the function
    is defined, not on every call. Pass ``end_date`` explicitly in
    long-running processes.

    Parameters
    ----------
    start_date : datetime.date
        date within the earliest month to be produced
    end_date : datetime.date
        date within the latest month to be produced

    Yields
    ------
    tuple of (int, int)
        year and month, in descending chronological order
    """
    # Compare whole months instead of full dates: otherwise a start_date
    # that is not the first of its month would drop that month, because
    # the loop variable is always reset to day 1.
    first_month = (start_date.year, start_date.month)
    year, month = end_date.year, end_date.month
    while (year, month) >= first_month:
        yield (year, month)
        year, month = get_previous_month(datetime.date(year, month, 1))


def scrape_fefes_blog(start_date=START_DATE, end_date=datetime.date.today(), output_dir=os.curdir):
    """
    Download the monthly pages of Fefe's blog into a directory.

    For every (year, month) pair produced by ``get_fefe_daterange``, the
    page ``http://blog.fefe.de/?mon=<year><two-digit month>`` is fetched
    and its body is stored as ``<year><two-digit month>.html`` inside
    ``output_dir``.

    NOTE: the default for ``end_date`` is evaluated once, when the function
    is defined, not on every call.

    Parameters
    ----------
    start_date : datetime.date
        date within the earliest month to download
    end_date : datetime.date
        date within the latest month to download
    output_dir : str
        directory the HTML files are written to; created if it is missing
    """
    create_dir(output_dir)
    for (year, month) in get_fefe_daterange(start_date, end_date):
        month_id = '{}{month:02d}'.format(year, month=month)
        url = 'http://blog.fefe.de/?mon=' + month_id
        month_page = requests.get(url)
        output_path = os.path.join(output_dir, month_id + '.html')
        # ``Response.content`` is the raw body as bytes, so the file must be
        # opened in binary mode; text mode raises a TypeError on Python 3.
        with open(output_path, 'wb') as output_file:
            output_file.write(month_page.content)
	import os
	import datetime
	from datetime import timedelta

	import requests


	def create_dir(path):
		"""
		Make sure a directory exists at the given path.

		The directory (including missing parents) is created via
		``os.makedirs``. If a directory is already present at ``path``, the
		call is a no-op. Every other failure is propagated to the caller; a
		permission failure additionally writes a hint to stderr first.

		modified from http://stackoverflow.com/a/600612

		Parameters
		----------
		path : str
			path to the directory to be created

		Raises
		------
		OSError
			if something that is not a directory occupies ``path``, if
			access is denied, or if creation fails for any other reason
		"""
		import sys
		import errno

		try:
			os.makedirs(path)
		except OSError as exc:
			# An existing directory is the desired outcome; an existing
			# non-directory falls through to the re-raise below.
			if exc.errno == errno.EEXIST and os.path.isdir(path):
				return
			if exc.errno == errno.EACCES:
				sys.stderr.write("Cannot create [%s]! Check Permissions" % path)
			raise


	def get_previous_month(date):
		"""
		Return the (year, month) pair of the month preceding the given date.

		The result is found by stepping one day back from the first day of
		the month that ``date`` falls in.

		cf. http://stackoverflow.com/a/9725093/564514

		Parameters
		----------
		date : datetime.date
			any object exposing ``year`` and ``month`` attributes

		Returns
		-------
		tuple of (int, int)
			year and month of the previous month
		"""
		first_day_of_month = datetime.date(year=date.year, month=date.month, day=1)
		last_day_of_prev_month = first_day_of_month - timedelta(days=1)
		return last_day_of_prev_month.year, last_day_of_prev_month.month


	def get_fefe_daterange(start_date=START_DATE, end_date=datetime.date.today()):
		"""
		Yield (year, month) pairs from the month of ``end_date`` back to the
		month of ``start_date``, newest first and both ends included.

		NOTE: the default for ``end_date`` is evaluated once, when the
		function is defined, not on every call.

		Parameters
		----------
		start_date : datetime.date
			date within the earliest month to be produced
		end_date : datetime.date
			date within the latest month to be produced

		Yields
		------
		tuple of (int, int)
			year and month, in descending chronological order
		"""
		# Compare whole months instead of full dates: otherwise a start_date
		# that is not the first of its month would drop that month, because
		# the loop variable is always reset to day 1.
		first_month = (start_date.year, start_date.month)
		year, month = end_date.year, end_date.month
		while (year, month) >= first_month:
			yield (year, month)
			year, month = get_previous_month(datetime.date(year, month, 1))


	def scrape_fefes_blog(start_date=START_DATE, end_date=datetime.date.today(), output_dir=os.curdir):
		"""
		Download the monthly pages of Fefe's blog into a directory.

		For every (year, month) pair produced by ``get_fefe_daterange``, the
		page ``http://blog.fefe.de/?mon=<year><two-digit month>`` is fetched
		and its body is stored as ``<year><two-digit month>.html`` inside
		``output_dir``.

		NOTE: the default for ``end_date`` is evaluated once, when the
		function is defined, not on every call.

		Parameters
		----------
		start_date : datetime.date
			date within the earliest month to download
		end_date : datetime.date
			date within the latest month to download
		output_dir : str
			directory the HTML files are written to; created if missing
		"""
		create_dir(output_dir)
		for (year, month) in get_fefe_daterange(start_date, end_date):
			month_id = '{}{month:02d}'.format(year, month=month)
			url = 'http://blog.fefe.de/?mon=' + month_id
			month_page = requests.get(url)
			output_path = os.path.join(output_dir, month_id + '.html')
			# ``Response.content`` is the raw body as bytes, so the file must
			# be opened in binary mode; text mode raises a TypeError on
			# Python 3.
			with open(output_path, 'wb') as output_file:
				output_file.write(month_page.content)