Last active
March 18, 2016 05:02
-
-
Save joshbode/29f559d37bfa781063df to your computer and use it in GitHub Desktop.
Collate transcripts from Democracy Now into a single document.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{#
  Page template for one day's collated transcript.

  Context variables:
    date    - show date; shown in the <title> and in the main heading.
    stories - iterable of story objects, each providing
                heading - markup placed inside the story's <h2>
                content - sequence of items, joined into the story body

  The closing marker of this comment strips the whitespace that follows it,
  so the rendered document still starts with the doctype.
-#}
<!DOCTYPE html>
<html>
  <head>
    <title>Democracy Now Transcript: {{ date }}</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <style type="text/css">
      @import url(https://fonts.googleapis.com/css?family=Open+Sans);
      body {
        margin: auto;
        max-width: 90%;
        font-family: 'Open Sans', sans-serif;
        font-size: 24pt;
      }
      h1 > span {
        display: block;
      }
    </style>
  </head>
  <body>
    <h1>Democracy Now Transcript: <span>{{ date }}</span></h1>{% for story in stories %}
    <h2>{{ story.heading }}</h2>
    <div>
      {{ story.content | join | indent(6) }}
    </div>{% endfor %}
  </body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
""" | |
Collate Democracy Now Transcripts. | |
""" | |
import io | |
import sys | |
import os.path | |
import time | |
import datetime | |
import requests | |
import bs4 | |
import jinja2 | |
# CSS class names to exclude: a direct child of the story container that
# carries any of these classes is left out of the collected story content.
EXCLUDE_CLASS = {
    'donate_container',
    'transcript',
    'fine_print',
    'left_panel',
}
class Query:
    """URL templates for the Democracy Now website."""

    # Root of the site; the other templates are built on top of it.
    SITE = 'http://www.democracynow.org'

    # Show page for one day: ``SHOW.format(date)`` fills in the date as
    # year/month/day path segments.
    SHOW = ''.join([SITE, '/shows/{:%Y/%m/%d}'])

    # Single story page: ``STORY.format(path)`` appends the path to the root.
    STORY = ''.join([SITE, '{}'])
def get_stories(date, timeout=30):
    """Return the story links listed on the show page for ``date``.

    Args:
        date: Show date; ``str(date)`` must have the form ``YYYY-MM-DD``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``href`` of every "Read Story" link found inside the
        page's ``show_content`` section.

    Raises:
        ValueError: If ``date`` is not in ``YYYY-MM-DD`` form, or the page
            has no ``show_content`` section.
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    show_date = datetime.datetime.strptime(str(date), '%Y-%m-%d').date()
    url = Query.SHOW.format(show_date)

    # Without a timeout a stalled connection would block the script forever.
    req = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to scrape an error page.
    req.raise_for_status()

    soup = bs4.BeautifulSoup(req.text, 'lxml')
    content = soup.find('div', class_='show_content')
    if content is None:
        # find() returns None when nothing matches; report that clearly
        # rather than failing later with an AttributeError.
        raise ValueError('no show content found at {}'.format(url))

    return [a['href'] for a in content.find_all('a', text='Read Story')]
def get_story(story, timeout=30):
    """Fetch one story page and extract its heading and body.

    Args:
        story: Story path; it is appended to the site root to form the URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with two keys:
            ``'heading'``: inner HTML of the page's first ``<h1>``, as a str.
            ``'content'``: list of the direct children of the
                ``story_with_left_panel`` div, leaving out every tag that
                carries one of the ``EXCLUDE_CLASS`` classes.

    Raises:
        ValueError: If the page has no ``<h1>`` or no story container.
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    url = Query.STORY.format(story)

    # Without a timeout a stalled connection would block the script forever.
    req = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to scrape an error page.
    req.raise_for_status()

    soup = bs4.BeautifulSoup(req.text, 'lxml')
    heading = soup.find('h1')
    body = soup.find('div', class_='story_with_left_panel')
    if heading is None or body is None:
        # find() returns None when nothing matches; report that clearly
        # rather than failing later with an AttributeError/TypeError.
        raise ValueError('unexpected story page layout at {}'.format(url))

    content = [
        node for node in body
        # Non-tag children (plain text nodes) are always kept; tags are kept
        # only when none of their classes is in the exclusion set.
        if not isinstance(node, bs4.element.Tag)
        or EXCLUDE_CLASS.isdisjoint(node.get('class', []))
    ]

    return {
        'heading': heading.encode_contents().decode('utf-8'),
        'content': content,
    }
# Command line: <date> <output path>. Arguments after the second are ignored.
if len(sys.argv) < 3:
    # Exit with a usage hint on stderr (exit status 1) instead of an
    # "not enough values to unpack" traceback.
    sys.exit('usage: {} YYYY-MM-DD OUTPUT'.format(sys.argv[0]))
date, output = sys.argv[1:3]

# Download every story linked from the show page for that date.
stories = [get_story(story) for story in get_stories(date)]

# The 'dn.html' template is looked up in the current working directory.
env = jinja2.Environment(loader=jinja2.FileSystemLoader(os.path.curdir))
template = env.get_template('dn.html')
result = template.render(date=date, stories=stories)

# Write the collated document as UTF-8 regardless of the locale default.
with io.open(output, 'w', encoding='utf-8') as f:
    f.write(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment