Skip to content

Instantly share code, notes, and snippets.

@bencrowder
Last active December 29, 2023 19:48
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bencrowder/5360985 to your computer and use it in GitHub Desktop.
Save bencrowder/5360985 to your computer and use it in GitHub Desktop.
Small Python script that scrapes LDS General Conference transcripts and outputs an HTML page listing their scripture references. Example: http://bencrowder.net/files/gc-references/2013-04
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs
import re
from xml.sax.saxutils import escape

import bs4
import requests
# Script configuration -- change these to choose which conference to scrape.
# `year` and `month` are formatted into the session URL requested by
# ConferenceSession.get_talks(); save() labels month 4 as "April" and
# month 10 as "October".
year = 2013
month = 4
# Passed through to BeautifulSoup's find_all(limit=...) when collecting talks;
# None leaves the number of talks uncapped.
limit = None
class ConferenceSession(object):
    """Scrape one General Conference and write its scripture references to HTML.

    Constructing an instance runs the whole pipeline: it downloads the
    session index from lds.org, downloads every talk found there, collects
    the ``<a class="scriptureRef">`` links of each talk, sorts the
    references by book, chapter and verse, groups them by volume of
    scripture and writes the result to ``output.html`` in the current
    directory.

    The code is written to run unchanged on Python 2.7 and Python 3.
    """

    # Book names per volume, in canonical order; the position of a book in
    # its list is the sort key returned by sort_by_book().  Names are compared
    # as text, so the two non-ASCII names are explicit unicode literals.
    book_names = {
        'old_testament': [
            'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
            'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings',
            '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah',
            'Esther', 'Job', 'Psalms', 'Psalm', 'Proverbs', 'Ecclesiastes',
            'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations',
            'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah',
            'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah',
            'Malachi',
        ],
        'new_testament': [
            'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans',
            '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians',
            'Philippians', 'Colossians', '1 Thessalonians',
            '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
            'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John',
            '3 John', 'Jude', 'Revelation',
        ],
        'book_of_mormon': [
            '1 Nephi', '2 Nephi', 'Jacob', 'Enos', 'Jarom', 'Omni',
            'Words of Mormon', 'Mosiah', 'Alma', 'Helaman', '3 Nephi',
            '4 Nephi', 'Mormon', 'Ether', 'Moroni',
        ],
        'doctrine_and_covenants': ['D&C'],
        'pearl_of_great_price': [
            'Moses', 'Abraham', u'Joseph Smith—Matthew',
            u'Joseph Smith—History', 'Articles of Faith',
        ],
    }

    # (volume key, page heading) in the order the volumes are looked up and
    # printed.  References matching no volume go under the key 'other'.
    _VOLUMES = (
        ('old_testament', 'Old Testament'),
        ('new_testament', 'New Testament'),
        ('book_of_mormon', 'Book of Mormon'),
        ('doctrine_and_covenants', 'Doctrine and Covenants'),
        ('pearl_of_great_price', 'Pearl of Great Price'),
    )

    # English month names for the page title, independent of the locale.
    _MONTH_NAMES = {
        1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May',
        6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October',
        11: 'November', 12: 'December',
    }

    # "<book> <chapter>[:<verse>]" -- group 1 is the book, group 2 the
    # chapter and group 4 the optional verse.
    _REFERENCE_RE = re.compile(r'(.*?) (\d+)(:(\d+))?')

    # Static stylesheet of the generated page (written verbatim, never
    # %-formatted, because it contains literal percent signs).
    _PAGE_STYLE = (
        '\t<style type="text/css">\n'
        '\t\t* { -moz-box-sizing: border-box; box-sizing: border-box; }\n'
        '\t\ta { color: #5591ce; text-decoration: none; }\n'
        '\t\ta:hover { text-decoration: underline; }\n'
        '\t\tbody { margin: 0; padding: 0; font-family: Helvetica, Arial, sans-serif; }\n'
        '\t\t#page { max-width: 800px; width: 95%; margin: 50px auto; }\n'
        '\t\t#page h1 { font-size: 1.8em; }\n'
        '\t\t#page h2 { font-size: 1.6em; margin: 2em 0 .5em; }\n'
        '\t\t#page > ul { list-style: none; margin: 0; padding: 0; line-height: 1.5em; }\n'
        '\t\t#page > ul > li { border-bottom: solid 1px #ddd; padding: 5px 0; overflow: auto; clear: both; }\n'
        '\t\t#page > ul label { font-weight: bold; font-size: 1.2em; width: 50%; float: left; }\n'
        '\t\t#page > ul ul.refs { margin: 0; float: left; padding: 0; list-style: none; }\n'
        '\t\t#page > ul.toc > li { border: none; display: inline-block; }\n'
        '\t\t#page > ul.toc > li + li:before { content: " -- "; color: #ccc; }\n'
        '\t\t@media screen and (max-width: 750px) {\n'
        '\t\t\t#page { margin: 15px auto; }\n'
        '\t\t\t#page > ul label { float: none; }\n'
        '\t\t\t#page > ul ul.refs { float: none; }\n'
        '\t\t}\n'
        '\t</style>\n'
    )

    def __init__(self, year, month, limit=None):
        """Scrape, sort and save the references of one conference.

        year, month -- identify the conference; formatted into the URL.
        limit       -- optional cap on the number of talks, handed to
                       BeautifulSoup's find_all(); None means no cap.
        """
        self.year = year
        self.month = month
        # Per-instance containers: as class attributes these were shared
        # (and accumulated) across every ConferenceSession ever created.
        self._reset_state()

        print('Getting the talks...')
        self.get_talks(year, month, limit)

        print('Getting references for each talk...')
        self.get_references()

        # One sort on a (book, chapter, verse) key orders the references
        # exactly like three successive stable sorts by verse, chapter, book.
        print('Sorting...')
        self.sorted_list = sorted(self.references, key=self._sort_key)

        # Populates self.sorted_references
        self.sort_into_books()

        print('Saving to disk...')
        self.save()

    def _reset_state(self):
        """Give this instance its own empty result containers."""
        self.talks = []            # dicts with 'title', 'url', 'speaker'
        self.references = {}       # reference title -> list of talk dicts
        self.urls = {}             # reference title -> first URL seen for it
        self.sorted_list = []      # reference titles in sorted order
        self.sorted_references = self._empty_volumes()

    def _empty_volumes(self):
        """Return a dict with an empty list for every volume and 'other'."""
        volumes = {'other': []}
        for volume, _heading in self._VOLUMES:
            volumes[volume] = []
        return volumes

    def _volume_for(self, book_name):
        """Return the volume key whose book list contains book_name, else 'other'."""
        for volume, _heading in self._VOLUMES:
            if book_name in self.book_names[volume]:
                return volume
        return 'other'

    def _sort_key(self, key):
        """Return the combined (book, chapter, verse) sort key of a reference."""
        return (self.sort_by_book(key),
                self.sort_by_chapter(key),
                self.sort_by_verse(key))

    @staticmethod
    def _html(value):
        """Return value as text that is safe inside HTML text or a quoted attribute."""
        return escape(u'%s' % (value,), {'"': '&quot;'})

    # Download the talks for a given conference session
    def get_talks(self, year, month, limit=None):
        """Fetch the session index page and fill self.talks.

        Every ``<span class="talk">`` that contains a link yields a dict
        with the link's first child as 'title', its href as 'url' and the
        first child of the sibling ``<span class="speaker">`` as 'speaker'
        (an empty string when there is no such span).
        """
        url = 'http://www.lds.org/general-conference/sessions/%04d/%02d' % (year, month)

        # Slurp in the HTML
        r = requests.get(url)
        soup = bs4.BeautifulSoup(r.content)

        self.talks = []
        for talk in soup.find_all("span", "talk", limit=limit):
            link = talk.a
            if link is None:
                continue
            # A talk without a speaker span should not abort the whole run.
            speaker_tag = talk.parent.find("span", "speaker")
            if speaker_tag is not None and speaker_tag.contents:
                speaker = speaker_tag.contents[0]
            else:
                speaker = u''
            self.talks.append({
                'title': link.contents[0],
                'url': link['href'],
                'speaker': speaker,
            })

    # Get references for a given talk
    def get_refs_for_talk(self, url):
        """Download one talk and return its scripture references.

        Returns a list of ``{'title': ..., 'url': ...}`` dicts, one per
        ``<a class="scriptureRef">`` element, in document order.
        """
        r = requests.get(url)
        soup = bs4.BeautifulSoup(r.content)

        response = []
        for ref in soup.find_all("a", "scriptureRef"):
            title = ref.contents[0].strip()
            ref_url = ref['href']

            # A title that starts with "<number>:" and links into
            # scriptures/dc-testament lacks its book name; supply it.
            if re.search(r'^\d{1,3}:', title) and 'scriptures/dc-testament' in ref_url:
                title = 'D&C %s' % title
            title = title.replace('Doctrine and Covenants', 'D&C')

            # Replace non-breaking spaces with normal spaces
            title = title.replace(u"\u00A0", " ")

            response.append({'title': title, 'url': ref_url})
        return response

    # Go through the talks and get references for each
    def get_references(self):
        """Collect the references of every talk in self.talks.

        Stores each talk's references under talk['references'], maps every
        reference title to the talks citing it in self.references, and
        remembers the first URL seen for each title in self.urls.
        """
        for talk in self.talks:
            talk['references'] = self.get_refs_for_talk(talk['url'])
            for ref in talk['references']:
                title = ref['title']
                if title not in self.references:
                    self.references[title] = []
                    self.urls[title] = ref['url']
                # NOTE(review): a talk citing the same reference several
                # times is appended once per citation -- confirm whether
                # those duplicates are wanted in the output.
                self.references[title].append(talk)

    # Sort function by verse (after the colon)
    def sort_by_verse(self, key):
        """Return the verse number after the colon, or 0 when there is none."""
        m = self._REFERENCE_RE.match(key)
        if m and m.group(4) is not None:
            return int(m.group(4))
        return 0

    # Sort by chapter (just before the colon)
    def sort_by_chapter(self, key):
        """Return the chapter number, or 0 when key is not '<book> <number>...'."""
        m = self._REFERENCE_RE.match(key)
        if m:
            return int(m.group(2))
        return 0

    # Sort by book name
    def sort_by_book(self, key):
        """Return the index of the reference's book within its volume.

        The index restarts at 0 for every volume; references whose book is
        unknown, or that do not parse, also return 0.  Volumes are separated
        afterwards by sort_into_books().
        """
        m = self._REFERENCE_RE.match(key)
        if not m:
            return 0
        book_name = m.group(1)
        volume = self._volume_for(book_name)
        if volume == 'other':
            return 0
        return self.book_names[volume].index(book_name)

    # Sort self.sorted_list out by book (populates self.sorted_references)
    def sort_into_books(self):
        """Group self.sorted_list by volume into self.sorted_references.

        The grouping is rebuilt from scratch on every call, so calling the
        method again does not duplicate entries.  Order within a volume
        follows self.sorted_list.
        """
        grouped = self._empty_volumes()
        for ref in self.sorted_list:
            m = self._REFERENCE_RE.match(ref)
            volume = self._volume_for(m.group(1)) if m else 'other'
            grouped[volume].append(ref)
        self.sorted_references = grouped

    # Saves a single volume
    def print_list(self, book):
        """Write the <li> entries of one volume to self.handle.

        book is a key of self.sorted_references.  Titles and URLs come from
        scraped pages, so they are HTML-escaped before being written.
        """
        write = self.handle.write
        for ref in self.sorted_references[book]:
            write('<li>\n\t<label><a href="%s">%s</a></label>\n\t<ul class="refs">\n'
                  % (self._html(self.urls[ref]), self._html(ref)))
            for talk in self.references[ref]:
                write('\t\t<li><a href="%s">%s</a></li>\n'
                      % (self._html(talk['url']), self._html(talk['title'])))
            write('\t</ul>\n</li>\n')

    # Counts a single volume
    def count_list(self, book):
        """Return the number of references grouped under the volume key book."""
        return len(self.sorted_references[book])

    # Save the whole list
    def save(self):
        """Write the grouped references to ``output.html`` (UTF-8).

        The page title uses self.year and the English name of self.month;
        a month outside 1-12 is shown as its plain value instead of
        raising.  self.handle is set to the output file while writing so
        that print_list() can use it; the file is closed on return, also
        when a write fails.
        """
        month_name = self._MONTH_NAMES.get(self.month, str(self.month))
        heading = '%s %s General Conference Scripture References' % (month_name, self.year)

        with codecs.open('output.html', 'w', 'utf-8') as f:
            self.handle = f

            f.write('<html>\n')
            f.write('<head>\n')
            f.write('\t<meta charset="utf-8">\n')
            f.write('\t<title>%s</title>\n' % heading)
            f.write(self._PAGE_STYLE)
            f.write('</head>\n')
            f.write('<body>\n')
            f.write('<section id="page">\n')
            f.write('\t<h1>%s</h1>\n\n' % heading)

            # Table of contents (the "Other" section has no anchor).
            f.write('\t<ul class="toc">\n')
            for volume, title in self._VOLUMES:
                f.write('\t\t<li><a href="#%s">%s</a></li>\n'
                        % (volume.replace('_', '-'), title))
            f.write('\t</ul>\n\n')

            for volume, title in self._VOLUMES:
                f.write('\t<h2 id="%s">%s</h2>\n' % (volume.replace('_', '-'), title))
                f.write('\t<ul>\n')
                self.print_list(volume)
                f.write('\t</ul>\n')

            f.write('\t<h2>Other</h2>\n')
            f.write('\t<ul>\n')
            self.print_list('other')
            f.write('\t</ul>\n')

            f.write('</section>\n')
            f.write('</body>\n')
            f.write('</html>\n')
# Run the scraper with the configuration above when executed as a script.
if __name__ == '__main__':
    session = ConferenceSession(year, month, limit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment