Skip to content

Instantly share code, notes, and snippets.

@joshbode
Last active March 18, 2016 05:02
Show Gist options
  • Save joshbode/29f559d37bfa781063df to your computer and use it in GitHub Desktop.
Save joshbode/29f559d37bfa781063df to your computer and use it in GitHub Desktop.
Collate transcripts from Democracy Now into a single document.
{#
  Democracy Now transcript page.

  Expected context:
    date     - the show date string; escaped on output
    stories  - sequence of mappings, each with
                 'heading' - HTML markup for the story title, emitted verbatim
                 'content' - sequence of HTML fragments, joined and emitted
                             verbatim
-#}
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>Democracy Now Transcript: {{ date | e }}</title>
    <style type="text/css">
      @import url(https://fonts.googleapis.com/css?family=Open+Sans);
      body {
        margin: auto;
        max-width: 90%;
        font-family: 'Open Sans', sans-serif;
        font-size: 24pt;
      }
      h1 > span {
        display: block;
      }
    </style>
  </head>
  <body>
    <h1>Democracy Now Transcript: <span>{{ date | e }}</span></h1>{% for story in stories %}
    <h2>{{ story.heading }}</h2>
    <div>
      {{ story.content | join | indent(6) }}
    </div>{% endfor %}
  </body>
</html>
#! /usr/bin/env python3
"""
Collate Democracy Now Transcripts.
"""
import io
import sys
import os.path
import time
import datetime
import requests
import bs4
import jinja2
# CSS class names; story elements carrying any of these are left out
EXCLUDE_CLASS = set((
    'donate_container',
    'fine_print',
    'left_panel',
    'transcript',
))
class Query:
    """URL templates for the Democracy Now site."""

    # site root (scheme and host, no trailing slash)
    SITE = 'http://www.democracynow.org'
    # show page; format with a date object
    SHOW = '/'.join((SITE, 'shows', '{:%Y/%m/%d}'))
    # story page; format with the story's href
    STORY = ''.join((SITE, '{}'))
def get_stories(date):
    """Get story list.

    Fetch the show page for *date* and collect the targets of its
    "Read Story" links.

    Args:
        date: Show date; its ``str()`` must have the form ``YYYY-MM-DD``.

    Returns:
        List of ``href`` values of the "Read Story" anchors found inside the
        ``show_content`` container of the show page.

    Raises:
        ValueError: If *date* does not parse as ``YYYY-MM-DD``, or the page
            has no ``show_content`` container.
        requests.RequestException: If the request fails, times out, or the
            site answers with an HTTP error status.
    """
    show_date = datetime.datetime.strptime(str(date), '%Y-%m-%d').date()
    url = Query.SHOW.format(show_date)

    # Without a timeout a stalled connection would block forever; without the
    # status check an error page would be parsed as if it were the show page.
    req = requests.get(url, timeout=30)
    req.raise_for_status()

    soup = bs4.BeautifulSoup(req.text, 'lxml')
    content = soup.find('div', class_='show_content')
    if content is None:
        raise ValueError('no show content found at {}'.format(url))

    # href=True skips anchors that carry the label but no link target
    links = content.find_all('a', string='Read Story', href=True)
    return [a['href'] for a in links]
def get_story(story):
    """Get story content.

    Fetch one story page and extract its heading and body nodes.

    Args:
        story: Story location as inserted into ``Query.STORY`` (the ``href``
            values returned by ``get_stories``).

    Returns:
        Dict with two keys: ``'heading'``, the inner markup of the page's
        first ``<h1>`` as a string, and ``'content'``, the list of direct
        children of the ``story_with_left_panel`` container, minus any tag
        whose classes intersect ``EXCLUDE_CLASS``.

    Raises:
        ValueError: If the page has no ``<h1>`` or no
            ``story_with_left_panel`` container.
        requests.RequestException: If the request fails, times out, or the
            site answers with an HTTP error status.
    """
    url = Query.STORY.format(story)

    # Bound the wait and refuse to parse an HTTP error page as a story.
    req = requests.get(url, timeout=30)
    req.raise_for_status()

    soup = bs4.BeautifulSoup(req.text, 'lxml')
    heading = soup.find('h1')
    body = soup.find('div', class_='story_with_left_panel')
    if heading is None or body is None:
        raise ValueError('unexpected story page layout at {}'.format(url))

    # Text nodes are always kept; tags are kept unless one of their classes
    # is listed in EXCLUDE_CLASS.
    content = [
        node for node in body.children
        if not (
            isinstance(node, bs4.element.Tag) and
            EXCLUDE_CLASS.intersection(node.get('class', []))
        )
    ]

    return {
        'heading': heading.encode_contents().decode('utf-8'),
        'content': content
    }
def main(argv=None):
    """Collate all stories of one show into a single HTML document.

    Args:
        argv: Argument list ``[date, output, ...]``; defaults to
            ``sys.argv[1:]``. Arguments beyond the first two are ignored.

    Exits with a usage message when fewer than two arguments are given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit('usage: {} DATE OUTPUT'.format(os.path.basename(sys.argv[0])))
    date, output = args[:2]

    stories = [get_story(story) for story in get_stories(date)]

    # The current directory is searched first; the script's own directory is
    # a fallback so the template is also found when run from elsewhere.
    search_path = [
        os.path.curdir,
        os.path.dirname(os.path.abspath(__file__)),
    ]
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(search_path))
    template = env.get_template('dn.html')
    result = template.render(date=date, stories=stories)

    with io.open(output, 'w', encoding='utf-8') as f:
        f.write(result)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment