joshbode/dn.html

## dn.html
{# Page template for a collated Democracy Now transcript.

   Variables read below:
     date    - printed in the <title> and in the top-level heading
     stories - iterated in order; each item supplies `heading` (printed in an
               <h2>) and `content` (a sequence whose items are concatenated by
               the `join` filter and then passed through `indent(6)`)

   The "-" before the closing delimiter strips the whitespace that follows
   this comment, so the rendered page still begins with the doctype. -#}
<!DOCTYPE html>

<html>
  <head>
    <title>Democracy Now Transcript: {{ date }}</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <style type="text/css">
      @import url(https://fonts.googleapis.com/css?family=Open+Sans);
      body {
        margin: auto;
        max-width: 90%;
        font-family: 'Open Sans', sans-serif;
        font-size: 24pt;
      }
      h1 > span {
        display: block;
      }
    </style>
  </head>
  <body>
    <h1>Democracy Now Transcript: <span>{{ date }}</span></h1>{% for story in stories %}
    <h2>{{ story.heading }}</h2>
    <div>
      {{ story.content | join | indent(6) }}
    </div>{% endfor %}
  </body>
</html>

## dn.py
#! /usr/bin/env python3

"""
Collate Democracy Now Transcripts.
"""

import io
import sys
import os.path
import time
import datetime

import requests
import bs4
import jinja2


# CSS class names of the elements that get_story() leaves out of a story's
# content; a tag is dropped when it carries at least one of them.
EXCLUDE_CLASS = {
    'donate_container',
    'fine_print',
    'left_panel',
    'transcript',
}


class Query:
    """URL templates for the pages this script downloads."""

    # Scheme and host shared by every request.
    SITE = 'http://www.democracynow.org'
    # Show index for one day; the placeholder is filled with a date object
    # through str.format().
    SHOW = '{}{}'.format(SITE, '/shows/{:%Y/%m/%d}')
    # A single story; the placeholder is filled with a site-relative path.
    STORY = '{}{}'.format(SITE, '{}')


def get_stories(date):
    """Return the story links listed on the show page for one day.

    Args:
        date: The show date; ``str(date)`` must have the form ``YYYY-MM-DD``.

    Returns:
        A list holding the ``href`` of every "Read Story" anchor found inside
        the ``show_content`` div, in document order.  The list is empty when
        the page has no such div.

    Raises:
        ValueError: If ``date`` is not in ``YYYY-MM-DD`` form.
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """

    show_date = datetime.datetime.strptime(str(date), '%Y-%m-%d').date()

    # Without a timeout requests waits indefinitely on a stalled connection.
    req = requests.get(Query.SHOW.format(show_date), timeout=30)
    # Stop here instead of scraping the body of an error response.
    req.raise_for_status()
    soup = bs4.BeautifulSoup(req.text, 'lxml')

    content = soup.find('div', class_='show_content')
    if content is None:
        return []

    # href=True skips anchors without a target, which would otherwise raise
    # KeyError on a['href'].
    return [
        a['href']
        for a in content.find_all('a', href=True, string='Read Story')
    ]


def get_story(story):
    """Download one story page and pull out its heading and content.

    Args:
        story: Site-relative path of the story; it is substituted into
            ``Query.STORY``.

    Returns:
        A dict with two keys:

        * ``'heading'`` -- the inner markup of the first ``<h1>`` as a str,
          or ``''`` when the page has no ``<h1>``.
        * ``'content'`` -- a list of the direct children of the
          ``story_with_left_panel`` div, without the tags that carry a class
          listed in ``EXCLUDE_CLASS``; empty when the div is missing.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """

    # Without a timeout requests waits indefinitely on a stalled connection.
    req = requests.get(Query.STORY.format(story), timeout=30)
    # Stop here instead of scraping the body of an error response.
    req.raise_for_status()
    soup = bs4.BeautifulSoup(req.text, 'lxml')

    heading = soup.find('h1')
    body = soup.find('div', class_='story_with_left_panel')

    content = [
        # Iterating a tag yields its direct children; text nodes are always
        # kept because only Tag objects can carry a class attribute.
        node for node in (body if body is not None else ())
        if not (
            isinstance(node, bs4.element.Tag) and
            set(node.get('class', [])) & EXCLUDE_CLASS
        )
    ]

    return {
        'heading': heading.decode_contents() if heading is not None else '',
        'content': content
    }

def main(argv=None):
    """Collate the transcript for one day and write it out as an HTML file.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  The first item is the show date
            (``YYYY-MM-DD``), the second the output path.  Further items are
            ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        # sys.exit() with a string prints it to stderr and exits with status 1.
        sys.exit('usage: {} YYYY-MM-DD OUTPUT'.format(
            os.path.basename(sys.argv[0])))
    date, output = args[:2]

    stories = [get_story(story) for story in get_stories(date)]

    # Look in the working directory first, then next to this script, so the
    # template is still found when the script is started from elsewhere.
    search_path = [
        os.path.curdir,
        os.path.dirname(os.path.abspath(__file__)),
    ]
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(search_path))
    template = env.get_template('dn.html')
    result = template.render(date=date, stories=stories)

    with io.open(output, 'w', encoding='utf-8') as f:
        f.write(result)


if __name__ == '__main__':
    main()
	{# Page template for a collated Democracy Now transcript.

	   Variables read below:
	     date    - printed in the <title> and in the top-level heading
	     stories - iterated in order; each item supplies `heading` (printed in an
	               <h2>) and `content` (a sequence whose items are concatenated by
	               the `join` filter and then passed through `indent(6)`)

	   The "-" before the closing delimiter strips the whitespace that follows
	   this comment, so the rendered page still begins with the doctype. -#}
	<!DOCTYPE html>

	<html>
	  <head>
	    <title>Democracy Now Transcript: {{ date }}</title>
	    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	    <style type="text/css">
	      @import url(https://fonts.googleapis.com/css?family=Open+Sans);
	      body {
	        margin: auto;
	        max-width: 90%;
	        font-family: 'Open Sans', sans-serif;
	        font-size: 24pt;
	      }
	      h1 > span {
	        display: block;
	      }
	    </style>
	  </head>
	  <body>
	    <h1>Democracy Now Transcript: <span>{{ date }}</span></h1>{% for story in stories %}
	    <h2>{{ story.heading }}</h2>
	    <div>
	      {{ story.content | join | indent(6) }}
	    </div>{% endfor %}
	  </body>
	</html>
	#! /usr/bin/env python3

	"""
	Collate Democracy Now Transcripts.
	"""

	import io
	import sys
	import os.path
	import time
	import datetime

	import requests
	import bs4
	import jinja2


	# CSS class names of the elements that get_story() leaves out of a story's
	# content; a tag is dropped when it carries at least one of them.
	EXCLUDE_CLASS = {
	    'donate_container',
	    'fine_print',
	    'left_panel',
	    'transcript',
	}


	class Query:
	    """URL templates for the pages this script downloads."""

	    # Scheme and host shared by every request.
	    SITE = 'http://www.democracynow.org'
	    # Show index for one day; the placeholder is filled with a date object
	    # through str.format().
	    SHOW = SITE + '/shows/{:%Y/%m/%d}'
	    # A single story; the placeholder is filled with a site-relative path.
	    STORY = SITE + '{}'


	def get_stories(date):
	    """Return the story links listed on the show page for one day.

	    Args:
	        date: The show date; ``str(date)`` must have the form ``YYYY-MM-DD``.

	    Returns:
	        A list holding the ``href`` of every "Read Story" anchor found inside
	        the ``show_content`` div, in document order.  The list is empty when
	        the page has no such div.

	    Raises:
	        ValueError: If ``date`` is not in ``YYYY-MM-DD`` form.
	        requests.RequestException: If the request fails, times out, or the
	            server answers with an error status.
	    """

	    show_date = datetime.datetime.strptime(str(date), '%Y-%m-%d').date()

	    # Without a timeout requests waits indefinitely on a stalled connection.
	    req = requests.get(Query.SHOW.format(show_date), timeout=30)
	    # Stop here instead of scraping the body of an error response.
	    req.raise_for_status()
	    soup = bs4.BeautifulSoup(req.text, 'lxml')

	    content = soup.find('div', class_='show_content')
	    if content is None:
	        return []

	    # href=True skips anchors without a target, which would otherwise raise
	    # KeyError on a['href'].
	    return [
	        a['href']
	        for a in content.find_all('a', href=True, string='Read Story')
	    ]


	def get_story(story):
	    """Download one story page and pull out its heading and content.

	    Args:
	        story: Site-relative path of the story; it is substituted into
	            ``Query.STORY``.

	    Returns:
	        A dict with two keys:

	        * ``'heading'`` -- the inner markup of the first ``<h1>`` as a str,
	          or ``''`` when the page has no ``<h1>``.
	        * ``'content'`` -- a list of the direct children of the
	          ``story_with_left_panel`` div, without the tags that carry a class
	          listed in ``EXCLUDE_CLASS``; empty when the div is missing.

	    Raises:
	        requests.RequestException: If the request fails, times out, or the
	            server answers with an error status.
	    """

	    # Without a timeout requests waits indefinitely on a stalled connection.
	    req = requests.get(Query.STORY.format(story), timeout=30)
	    # Stop here instead of scraping the body of an error response.
	    req.raise_for_status()
	    soup = bs4.BeautifulSoup(req.text, 'lxml')

	    heading = soup.find('h1')
	    body = soup.find('div', class_='story_with_left_panel')

	    content = [
	        # Iterating a tag yields its direct children; text nodes are always
	        # kept because only Tag objects can carry a class attribute.
	        node for node in (body if body is not None else ())
	        if not (
	            isinstance(node, bs4.element.Tag) and
	            set(node.get('class', [])) & EXCLUDE_CLASS
	        )
	    ]

	    return {
	        'heading': heading.decode_contents() if heading is not None else '',
	        'content': content
	    }

	def main(argv=None):
	    """Collate the transcript for one day and write it out as an HTML file.

	    Args:
	        argv: Command-line arguments without the program name; defaults to
	            ``sys.argv[1:]``.  The first item is the show date
	            (``YYYY-MM-DD``), the second the output path.  Further items are
	            ignored.
	    """
	    args = sys.argv[1:] if argv is None else list(argv)
	    if len(args) < 2:
	        # sys.exit() with a string prints it to stderr and exits with status 1.
	        sys.exit('usage: {} YYYY-MM-DD OUTPUT'.format(
	            os.path.basename(sys.argv[0])))
	    date, output = args[:2]

	    stories = [get_story(story) for story in get_stories(date)]

	    # Look in the working directory first, then next to this script, so the
	    # template is still found when the script is started from elsewhere.
	    search_path = [
	        os.path.curdir,
	        os.path.dirname(os.path.abspath(__file__)),
	    ]
	    env = jinja2.Environment(loader=jinja2.FileSystemLoader(search_path))
	    template = env.get_template('dn.html')
	    result = template.render(date=date, stories=stories)

	    with io.open(output, 'w', encoding='utf-8') as f:
	        f.write(result)


	if __name__ == '__main__':
	    main()