Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Small Python script to scrape LDS General Conference transcripts and output HTML page listing scripture references. Example: http://bencrowder.net/files/gc-references/2013-04
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs
import re
from xml.sax.saxutils import escape

import bs4
import requests
# Run configuration -- change these before running the script.
# `year` and `month` select the conference; the report only has month names
# for April (4) and October (10).
year, month = 2013, 4
# Passed through to the talk search as its `limit`; None applies no limit.
limit = None
class ConferenceSession(object):
    """Scrape one General Conference and index its scripture references.

    Constructing an instance runs the whole pipeline: it downloads the list
    of talks, collects every ``<a class="scriptureRef">`` from each talk,
    sorts the references by book, chapter and verse, groups them by volume
    and writes the HTML report via :meth:`save`.

    Instance attributes:
        year, month: the conference requested.
        talks: list of ``{'title', 'url', 'speaker', 'references'}`` dicts.
        references: reference title -> list of talk dicts citing it.
        urls: reference title -> URL of the first occurrence seen.
        sorted_list: every reference title, in sorted order.
        sorted_references: volume key (or ``'other'``) -> sorted titles.
        handle: the file object :meth:`print_list` writes to.
    """

    # List of book names in the scriptures, used for sorting.
    # The two names containing an em dash are unicode literals so they
    # compare equal to scraped text on Python 2 as well as Python 3.
    book_names = {
        'old_testament': [
            'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
            'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings',
            '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah',
            'Esther', 'Job', 'Psalms', 'Psalm', 'Proverbs', 'Ecclesiastes',
            'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations',
            'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah',
            'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah',
            'Malachi',
        ],
        'new_testament': [
            'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans',
            '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians',
            'Philippians', 'Colossians', '1 Thessalonians',
            '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
            'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John',
            '3 John', 'Jude', 'Revelation',
        ],
        'book_of_mormon': [
            '1 Nephi', '2 Nephi', 'Jacob', 'Enos', 'Jarom', 'Omni',
            'Words of Mormon', 'Mosiah', 'Alma', 'Helaman', '3 Nephi',
            '4 Nephi', 'Mormon', 'Ether', 'Moroni',
        ],
        'doctrine_and_covenants': ['D&C'],
        'pearl_of_great_price': [
            'Moses', 'Abraham', u'Joseph Smith\u2014Matthew',
            u'Joseph Smith\u2014History', 'Articles of Faith',
        ],
    }

    # (volume key, HTML anchor id, heading) in report order. The catch-all
    # 'other' bucket is rendered after these and has no anchor.
    _VOLUMES = (
        ('old_testament', 'old-testament', 'Old Testament'),
        ('new_testament', 'new-testament', 'New Testament'),
        ('book_of_mormon', 'book-of-mormon', 'Book of Mormon'),
        ('doctrine_and_covenants', 'doctrine-and-covenants',
         'Doctrine and Covenants'),
        ('pearl_of_great_price', 'pearl-of-great-price',
         'Pearl of Great Price'),
    )

    # Alternate spellings that must sort as the same book.
    _BOOK_ALIASES = {'Psalm': 'Psalms'}

    _MONTH_NAMES = {4: 'April', 10: 'October'}

    # "<book> <chapter>[:<verse>]" -- groups 1, 2 and 4 respectively.
    _REFERENCE_RE = re.compile(r'(.*?) (\d+)(:(\d+))?')

    # Stylesheet of the report; each entry is written on its own line.
    _CSS_LINES = (
        '* { -moz-box-sizing: border-box; box-sizing: border-box; }',
        'a { color: #5591ce; text-decoration: none; }',
        'a:hover { text-decoration: underline; }',
        'body { margin: 0; padding: 0; font-family: Helvetica, Arial, sans-serif; }',
        '#page { max-width: 800px; width: 95%; margin: 50px auto; }',
        '#page h1 { font-size: 1.8em; }',
        '#page h2 { font-size: 1.6em; margin: 2em 0 .5em; }',
        '#page > ul { list-style: none; margin: 0; padding: 0; line-height: 1.5em; }',
        '#page > ul > li { border-bottom: solid 1px #ddd; padding: 5px 0; overflow: auto; clear: both; }',
        '#page > ul label { font-weight: bold; font-size: 1.2em; width: 50%; float: left; }',
        '#page > ul ul.refs { margin: 0; float: left; padding: 0; list-style: none; }',
        '#page > ul.toc > li { border: none; display: inline-block; }',
        '#page > ul.toc > li + li:before { content: " -- "; color: #ccc; }',
        '@media screen and (max-width: 750px) {',
        '\t#page { margin: 15px auto; }',
        '\t#page > ul label { float: none; }',
        '\t#page > ul ul.refs { float: none; }',
        '}',
    )

    def __init__(self, year, month, limit=None):
        """Scrape, sort and save the references of one conference.

        Args:
            year: conference year, e.g. 2013.
            month: conference month number.
            limit: forwarded to :meth:`get_talks`.
        """
        self.year = year
        self.month = month
        # Per-instance state: as class attributes these containers would be
        # shared (and keep growing) across every session created.
        self.talks = []
        self.references = {}
        self.urls = {}
        self.sorted_list = []
        self.sorted_references = self._empty_volumes()
        self.handle = None

        # Get the talks
        print('Getting the talks...')
        self.get_talks(year, month, limit)
        # Get the references
        print('Getting references for each talk...')
        self.get_references()
        # Sort by book, then chapter, then verse
        print('Sorting...')
        self.sorted_list = sorted(self.references, key=self._sort_key)
        # Sort into books, populates self.sorted_references
        self.sort_into_books()
        # Save the list to disk
        print('Saving to disk...')
        self.save()

    @classmethod
    def _empty_volumes(cls):
        """Return a fresh ``{volume: []}`` mapping including ``'other'``."""
        buckets = dict((volume, []) for volume, _anchor, _label in cls._VOLUMES)
        buckets['other'] = []
        return buckets

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and return it parsed by BeautifulSoup.

        Raises the ``requests`` HTTP error for 4xx/5xx responses so that an
        error page is never mistaken for conference content.
        """
        response = requests.get(url)
        response.raise_for_status()
        return bs4.BeautifulSoup(response.content)

    @staticmethod
    def _html(value, attribute=False):
        """Escape *value* for HTML output.

        With ``attribute=True`` double quotes are escaped as well, so the
        result is safe inside a double-quoted attribute value.
        """
        # Coerce first: scraped values are not guaranteed to be plain strings.
        text = u'%s' % (value,)
        if attribute:
            return escape(text, {'"': '&quot;'})
        return escape(text)

    @staticmethod
    def _normalize_title(title, ref_url):
        """Return the canonical display title of one scripture reference."""
        # Replace non-breaking spaces with normal spaces
        title = title.strip().replace(u'\u00A0', ' ')
        # A bare "<number>:" title under the D&C URL lacks its book name
        if re.match(r'\d{1,3}:', title) and 'scriptures/dc-testament' in ref_url:
            title = 'D&C %s' % title
        return title.replace('Doctrine and Covenants', 'D&C')

    # Download the talks for a given conference session
    def get_talks(self, year, month, limit=None):
        """Populate ``self.talks`` from the conference index page.

        Args:
            year, month: conference to download.
            limit: passed as ``limit`` to the ``<span class="talk">`` search.
        """
        url = 'http://www.lds.org/general-conference/sessions/%04d/%02d' % (year, month)
        soup = self._fetch_soup(url)
        self.talks = []
        # Get all <span class="talk">
        for talk in soup.find_all("span", "talk", limit=limit):
            link = talk.a
            # Skip entries without a usable link instead of crashing on them
            if link is None or not link.contents or link.get('href') is None:
                continue
            speaker_tag = talk.parent.find("span", "speaker")
            if speaker_tag is not None and speaker_tag.contents:
                speaker = speaker_tag.contents[0]
            else:
                speaker = u''
            self.talks.append({
                'title': link.contents[0],
                'url': link['href'],
                'speaker': speaker,
            })

    # Get references for a given talk
    def get_refs_for_talk(self, url):
        """Return ``[{'title', 'url'}, ...]`` for each reference in a talk."""
        soup = self._fetch_soup(url)
        response = []
        # Get all <a class="scriptureRef">
        for ref in soup.find_all("a", "scriptureRef"):
            ref_url = ref.get('href')
            if ref_url is None or not ref.contents:
                continue
            title = self._normalize_title(ref.contents[0], ref_url)
            response.append({'title': title, 'url': ref_url})
        return response

    # Go through the talks and get references for each
    def get_references(self):
        """Fill ``self.references`` and ``self.urls`` from ``self.talks``."""
        for talk in self.talks:
            talk['references'] = self.get_refs_for_talk(talk['url'])
            # For each reference in the talk
            for ref in talk['references']:
                title = ref['title']
                # Initialize the array for that reference
                if title not in self.references:
                    self.references[title] = []
                    self.urls[title] = ref['url']
                # Add the talk and its URL to the list
                self.references[title].append(talk)

    def _classify(self, key):
        """Return ``(volume, book)`` for a reference title.

        ``volume`` is ``'other'`` when the title has no chapter number or
        names an unknown book; ``book`` is ``None`` when nothing was parsed.
        """
        match = self._REFERENCE_RE.match(key)
        if match is None:
            return 'other', None
        book = match.group(1)
        for volume, _anchor, _label in self._VOLUMES:
            if book in self.book_names[volume]:
                return volume, book
        return 'other', book

    def _sort_key(self, key):
        """Return the ``(book, chapter, verse)`` sort key of a title."""
        return (self.sort_by_book(key), self.sort_by_chapter(key),
                self.sort_by_verse(key))

    # Sort function by verse (after the colon)
    def sort_by_verse(self, key):
        """Return the verse number of *key*, or 0 when it has none."""
        match = self._REFERENCE_RE.match(key)
        if match is None or match.group(4) is None:
            return 0
        return int(match.group(4))

    # Sort by chapter (just before the colon)
    def sort_by_chapter(self, key):
        """Return the chapter number of *key*, or 0 when it has none."""
        match = self._REFERENCE_RE.match(key)
        if match is None:
            return 0
        return int(match.group(2))

    # Sort by book name
    def sort_by_book(self, key):
        """Return the position of *key*'s book inside its volume.

        Unknown books sort as 0. Aliases such as ``Psalm``/``Psalms`` share
        one position so a single book is not split in two.
        """
        volume, book = self._classify(key)
        if volume == 'other':
            return 0
        canonical = self._BOOK_ALIASES.get(book, book)
        return self.book_names[volume].index(canonical)

    # Sort self.sorted_list out by book (populates self.sorted_references)
    def sort_into_books(self):
        """Group ``self.sorted_list`` by volume into ``sorted_references``.

        The buckets are rebuilt from scratch, so calling this again does not
        duplicate entries.
        """
        buckets = self._empty_volumes()
        for ref in self.sorted_list:
            volume, _book = self._classify(ref)
            buckets[volume].append(ref)
        self.sorted_references = buckets

    # Saves a single volume
    def print_list(self, book):
        """Write the ``<li>`` entries of volume *book* to ``self.handle``.

        Titles and URLs come from scraped pages, so both are HTML-escaped.
        """
        write = self.handle.write
        for ref in self.sorted_references[book]:
            write('<li>\n\t<label><a href="%s">%s</a></label>\n\t<ul class="refs">\n'
                  % (self._html(self.urls[ref], attribute=True), self._html(ref)))
            for talk in self.references[ref]:
                write('\t\t<li><a href="%s">%s</a></li>\n'
                      % (self._html(talk['url'], attribute=True),
                         self._html(talk['title'])))
            write('\t</ul>\n</li>\n')

    # Counts a single volume
    def count_list(self, book):
        """Return how many distinct references volume *book* holds."""
        return len(self.sorted_references[book])

    # Save the whole list
    def save(self, path='output.html'):
        """Write the HTML report to *path* (UTF-8).

        Args:
            path: destination file; defaults to ``output.html`` in the
                current directory.
        """
        month_name = self._MONTH_NAMES.get(self.month)
        if month_name is None:
            # No name for this month: fall back to a numeric label rather
            # than failing after all the scraping work is done.
            conference = '%04d-%02d' % (self.year, self.month)
        else:
            conference = '%s %s' % (month_name, self.year)
        heading = self._html('%s General Conference Scripture References' % conference)

        # The context manager closes the file even if a write fails.
        with codecs.open(path, 'w', 'utf-8') as f:
            self.handle = f  # print_list() writes through self.handle
            f.write('<html>\n<head>\n\t<meta charset="utf-8">\n')
            f.write('\t<title>%s</title>\n' % heading)
            f.write('\t<style type="text/css">\n')
            for line in self._CSS_LINES:
                f.write('\t\t%s\n' % line)
            f.write('\t</style>\n</head>\n<body>\n<section id="page">\n')
            f.write('\t<h1>%s</h1>\n\n' % heading)

            # Table of contents
            f.write('\t<ul class="toc">\n')
            for _volume, anchor, label in self._VOLUMES:
                f.write('\t\t<li><a href="#%s">%s</a></li>\n' % (anchor, label))
            f.write('\t</ul>\n\n')

            # One section per volume, then the catch-all bucket
            for volume, anchor, label in self._VOLUMES:
                f.write('\t<h2 id="%s">%s</h2>\n\t<ul>\n' % (anchor, label))
                self.print_list(volume)
                f.write('\t</ul>\n')
            f.write('\t<h2>Other</h2>\n\t<ul>\n')
            self.print_list('other')
            f.write('\t</ul>\n</section>\n</body>\n</html>\n')
if __name__ == '__main__':
    # Constructing the session runs the whole job: download, sort and save.
    session = ConferenceSession(year=year, month=month, limit=limit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment