bencrowder/gc_references.py

## gc_references.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import codecs
import requests
import bs4

# Conference to index: edit these values before running the script.
year, month = 2013, 4   # month is 4 (April) or 10 (October)
limit = None            # maximum number of talk <span>s to read; None reads them all

class ConferenceSession:
    """Index of the scripture references cited in one General Conference.

    Creating an instance does all of the work: it downloads the list of
    talks for the given year and month, collects the scripture references
    linked from each talk, sorts them by book, chapter and verse, and writes
    the result to ``output.html`` in the current directory.

    Only constructs that behave the same on Python 2 and Python 3 are used.

    Instance attributes:
        talks: list of dicts with 'title', 'url' and 'speaker' keys (plus
            'references' once get_references() has run).
        references: maps a reference title to the talks that cite it.
        urls: maps a reference title to the first URL seen for it.
        sorted_list: every reference title, ordered book/chapter/verse.
        sorted_references: maps a volume key to its ordered titles.
        handle: the output file used by print_list(); set by save().
    """

    # Book names per volume, in canonical order; a book's position in its
    # list is its sort key.  The non-ASCII names are text (u'') literals so
    # that they compare equal to scraped text on Python 2 as well as 3.
    book_names = {
        'old_testament': [
            'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
            'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings',
            '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah',
            'Esther', 'Job', 'Psalms', 'Psalm', 'Proverbs', 'Ecclesiastes',
            'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations',
            'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah',
            'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah',
            'Malachi',
        ],
        'new_testament': [
            'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans',
            '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians',
            'Philippians', 'Colossians', '1 Thessalonians',
            '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
            'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John',
            '3 John', 'Jude', 'Revelation',
        ],
        'book_of_mormon': [
            '1 Nephi', '2 Nephi', 'Jacob', 'Enos', 'Jarom', 'Omni',
            'Words of Mormon', 'Mosiah', 'Alma', 'Helaman', '3 Nephi',
            '4 Nephi', 'Mormon', 'Ether', 'Moroni',
        ],
        'doctrine_and_covenants': ['D&C'],
        'pearl_of_great_price': [
            'Moses', 'Abraham', u'Joseph Smith—Matthew',
            u'Joseph Smith—History', 'Articles of Faith',
        ],
    }

    # (volume key, HTML anchor, heading) in output order.  References whose
    # book is in none of these volumes are filed under the key 'other'.
    volume_headings = (
        ('old_testament', 'old-testament', 'Old Testament'),
        ('new_testament', 'new-testament', 'New Testament'),
        ('book_of_mormon', 'book-of-mormon', 'Book of Mormon'),
        ('doctrine_and_covenants', 'doctrine-and-covenants', 'Doctrine and Covenants'),
        ('pearl_of_great_price', 'pearl-of-great-price', 'Pearl of Great Price'),
    )

    # The only months save() knows a name for.
    month_names = {4: 'April', 10: 'October'}

    # Seconds to wait on each HTTP request before giving up.
    request_timeout = 30

    # "<book> <chapter>[:<verse>]"; groups 1, 2 and 4 are book, chapter, verse.
    _REF_RE = re.compile(r'(.*?) (\d+)(:(\d+))?')

    def __init__(self, year, month, limit=None):
        """Download, sort and save the references for one conference.

        Args:
            year: conference year, used in the URL and the page title.
            month: conference month; must be 4 or 10.
            limit: maximum number of talk <span>s to read from the session
                page, or None for all of them.

        Raises:
            ValueError: if ``month`` is not a key of ``month_names``.
        """
        self.year = year
        self.month = month
        self._month_name()  # fail fast, before any network traffic

        # Per-instance state.  These used to be class attributes, so every
        # instance shared (and kept appending to) the same containers.
        self.talks = []
        self.references = {}
        self.urls = {}
        self.sorted_references = self._empty_volumes()

        print('Getting the talks...')
        self.get_talks(year, month, limit)

        print('Getting references for each talk...')
        self.get_references()

        # One sort on (book, chapter, verse) gives the same order as three
        # successive stable sorts by verse, then chapter, then book.
        print('Sorting...')
        self.sorted_list = sorted(self.references, key=self._sort_key)

        # Sort into books, populates self.sorted_references
        self.sort_into_books()

        print('Saving to disk...')
        self.save()

    def _empty_volumes(self):
        """Return a fresh {volume key: []} dict, including 'other'."""
        volumes = {key: [] for key, _anchor, _heading in self.volume_headings}
        volumes['other'] = []
        return volumes

    def _month_name(self):
        """Return the display name of self.month; ValueError if unknown."""
        try:
            return self.month_names[self.month]
        except KeyError:
            raise ValueError('month must be 4 (April) or 10 (October), got %r'
                             % (self.month,))

    def _fetch_soup(self, url):
        """Download ``url`` and return the page parsed by BeautifulSoup.

        Raises whatever ``requests`` raises on a timeout, and on an HTTP
        error status instead of silently parsing an error page.
        """
        r = requests.get(url, timeout=self.request_timeout)
        r.raise_for_status()
        return bs4.BeautifulSoup(r.content)

    def get_talks(self, year, month, limit=None):
        """Download the session page and store its talks in self.talks.

        Every <span class="talk"> that contains a link becomes a dict with
        'title', 'url' and 'speaker' keys.  ``limit`` caps how many spans
        are read (None reads all).
        """
        url = 'http://www.lds.org/general-conference/sessions/%04d/%02d' % (year, month)
        soup = self._fetch_soup(url)

        self.talks = []
        for talk in soup.find_all("span", "talk", limit=limit):
            if not talk.a:
                continue
            title, url = talk.a.contents[0], talk.a['href']

            # An entry without a speaker span used to raise AttributeError.
            speaker_span = talk.parent.find("span", "speaker")
            if speaker_span is not None and speaker_span.contents:
                speaker = speaker_span.contents[0]
            else:
                speaker = ''

            self.talks.append({'title': title, 'url': url, 'speaker': speaker})

    def _clean_title(self, title, ref_url):
        """Normalise the link text of one scripture reference.

        Non-breaking spaces become plain spaces first, so that the
        'Doctrine and Covenants' abbreviation below also matches when the
        words are joined by them.  A bare "section:verse" title linking
        into the D&C is given its 'D&C ' prefix.
        """
        title = title.strip().replace(u"\u00A0", " ")
        if re.match(r'\d{1,3}:', title) and 'scriptures/dc-testament' in ref_url:
            title = 'D&C %s' % title
        return title.replace('Doctrine and Covenants', 'D&C')

    def get_refs_for_talk(self, url):
        """Return the scripture references linked from the talk at ``url``.

        Each <a class="scriptureRef"> yields a dict with the normalised
        'title' and the link's 'url'.
        """
        soup = self._fetch_soup(url)

        response = []
        for ref in soup.find_all("a", "scriptureRef"):
            ref_url = ref['href']
            title = self._clean_title(ref.contents[0], ref_url)
            response.append({'title': title, 'url': ref_url})
        return response

    def get_references(self):
        """Collect the references of every talk in self.talks.

        Stores each talk's list under talk['references'] and fills
        self.references / self.urls.  A talk is listed once per reference
        even if it cites that reference several times.
        """
        for talk in self.talks:
            talk['references'] = self.get_refs_for_talk(talk['url'])

            for ref in talk['references']:
                title = ref['title']

                # First sighting: remember its URL and start its talk list
                if title not in self.references:
                    self.references[title] = []
                    self.urls[title] = ref['url']

                # Talks are processed one at a time, so a repeat citation
                # by this talk can only be the last entry.
                cited_by = self.references[title]
                if not cited_by or cited_by[-1] is not talk:
                    cited_by.append(talk)

    def sort_by_verse(self, key):
        """Return the verse number (after the colon) of ``key``, else 0."""
        m = self._REF_RE.match(key)
        if m and m.group(4) is not None:
            return int(m.group(4))
        return 0

    def sort_by_chapter(self, key):
        """Return the chapter number (before the colon) of ``key``, else 0."""
        m = self._REF_RE.match(key)
        return int(m.group(2)) if m else 0

    def _locate_book(self, ref):
        """Return (volume key, index of the book within that volume).

        References that do not look like "<book> <chapter>", or whose book
        is not in book_names, give ('other', 0).
        """
        m = self._REF_RE.match(ref)
        if m:
            book = m.group(1)
            for volume, _anchor, _heading in self.volume_headings:
                books = self.book_names[volume]
                if book in books:
                    return volume, books.index(book)
        return 'other', 0

    def sort_by_book(self, key):
        """Return the position of ``key``'s book within its volume, else 0."""
        return self._locate_book(key)[1]

    def _sort_key(self, ref):
        """Return the (book, chapter, verse) sort key for ``ref``."""
        return (self.sort_by_book(ref), self.sort_by_chapter(ref),
                self.sort_by_verse(ref))

    def sort_into_books(self):
        """Rebuild self.sorted_references from self.sorted_list.

        The mapping is built from scratch on every call, so calling this
        twice does not duplicate entries.
        """
        volumes = self._empty_volumes()
        for ref in self.sorted_list:
            volumes[self._locate_book(ref)[0]].append(ref)
        self.sorted_references = volumes

    @staticmethod
    def _escape(text):
        """Return ``text`` escaped for use in HTML text and attributes."""
        try:
            from html import escape   # Python 3
        except ImportError:
            from cgi import escape    # Python 2
        return escape('%s' % (text,), True)

    def print_list(self, book):
        """Write the <li> entries of one volume to self.handle.

        Titles and URLs come from scraped pages, so they are HTML-escaped
        (this also turns the '&' of 'D&C' into a valid '&amp;').
        """
        write = self.handle.write
        escape = self._escape
        for ref in self.sorted_references[book]:
            write('<li>\n\t<label><a href="%s">%s</a></label>\n\t<ul class="refs">\n'
                  % (escape(self.urls[ref]), escape(ref)))

            for talk in self.references[ref]:
                write('\t\t<li><a href="%s">%s</a></li>\n'
                      % (escape(talk['url']), escape(talk['title'])))

            write('\t</ul>\n</li>\n')

    def count_list(self, book):
        """Return the number of references filed under volume ``book``."""
        return len(self.sorted_references[book])

    def save(self):
        """Write the whole index to ``output.html`` (UTF-8).

        Raises:
            ValueError: if self.month is not a key of ``month_names``.
        """
        month_name = self._month_name()

        # ``with`` closes the file even if one of the writes fails
        with codecs.open('output.html', 'w', 'utf-8') as f:
            self.handle = f

            f.write('<html>\n')
            f.write('<head>\n')
            f.write('\t<meta charset="utf-8">\n')
            # self.year, not the module-level ``year``
            f.write('\t<title>%s %s General Conference Scripture References</title>\n' % (month_name, self.year))
            f.write('\t<style type="text/css">\n')
            f.write('\t\t*  					{ -moz-box-sizing: border-box; box-sizing: border-box; }\n')
            f.write('\t\ta						{ color: #5591ce; text-decoration: none; }\n')
            f.write('\t\ta:hover				{ text-decoration: underline; }\n')
            f.write('\t\tbody					{ margin: 0; padding: 0; font-family: Helvetica, Arial, sans-serif; }\n')
            f.write('\t\t#page					{ max-width: 800px; width: 95%; margin: 50px auto; }\n')
            f.write('\t\t#page h1				{ font-size: 1.8em; }\n')
            f.write('\t\t#page h2				{ font-size: 1.6em; margin: 2em 0 .5em; }\n')
            f.write('\t\t#page > ul				{ list-style: none; margin: 0; padding: 0; line-height: 1.5em; }\n')
            f.write('\t\t#page > ul > li		{ border-bottom: solid 1px #ddd; padding: 5px 0; overflow: auto; clear: both; }\n')
            f.write('\t\t#page > ul label		{ font-weight: bold; font-size: 1.2em; width: 50%; float: left; }\n')
            f.write('\t\t#page > ul ul.refs		{ margin: 0; float: left; padding: 0; list-style: none; }\n')
            f.write('\t\t#page > ul.toc > li	{ border: none; display: inline-block; }\n')
            f.write('\t\t#page > ul.toc > li + li:before	{ content: " -- "; color: #ccc; }\n')
            f.write('\t\t@media screen and (max-width: 750px) {\n')
            f.write('\t\t\t#page                { margin: 15px auto; }\n')
            f.write('\t\t\t#page > ul label     { float: none; }\n')
            f.write('\t\t\t#page > ul ul.refs   { float: none; }\n')
            f.write('\t\t}\n')
            f.write('\t</style>\n')
            f.write('</head>\n')
            f.write('<body>\n')
            f.write('<section id="page">\n')
            f.write('\t<h1>%s %s General Conference Scripture References</h1>\n\n' % (month_name, self.year))

            # Table of contents ('other' has no anchor and is not listed)
            f.write('\t<ul class="toc">\n')
            for _key, anchor, heading in self.volume_headings:
                f.write('\t\t<li><a href="#%s">%s</a></li>\n' % (anchor, heading))
            f.write('\t</ul>\n\n')

            # One section per volume
            for key, anchor, heading in self.volume_headings:
                f.write('\t<h2 id="%s">%s</h2>\n' % (anchor, heading))
                f.write('\t<ul>\n')
                self.print_list(key)
                f.write('\t</ul>\n')

            f.write('\t<h2>Other</h2>\n')
            f.write('\t<ul>\n')
            self.print_list('other')
            f.write('\t</ul>\n')

            f.write('</section>\n')
            f.write('</body>\n')
            f.write('</html>\n')

if __name__ == '__main__':
    # Constructing the session runs the whole job: fetch, sort and save.
    session = ConferenceSession(year=year, month=month, limit=limit)
	#!/usr/bin/python
	# -- coding: utf-8 --

	import re
	import codecs
	import requests
	import bs4

	# Conference to index: edit these values before running the script.
	year, month = 2013, 4   # month is 4 (April) or 10 (October)
	limit = None            # maximum number of talk <span>s to read; None reads them all

	class ConferenceSession:
		"""Index of the scripture references cited in one General Conference.

		Creating an instance does all of the work: it downloads the list of
		talks for the given year and month, collects the scripture references
		linked from each talk, sorts them by book, chapter and verse, and
		writes the result to ``output.html`` in the current directory.

		Only constructs that behave the same on Python 2 and Python 3 are used.

		Instance attributes:
			talks: list of dicts with 'title', 'url' and 'speaker' keys (plus
				'references' once get_references() has run).
			references: maps a reference title to the talks that cite it.
			urls: maps a reference title to the first URL seen for it.
			sorted_list: every reference title, ordered book/chapter/verse.
			sorted_references: maps a volume key to its ordered titles.
			handle: the output file used by print_list(); set by save().
		"""

		# Book names per volume, in canonical order; a book's position in its
		# list is its sort key.  The non-ASCII names are text (u'') literals so
		# that they compare equal to scraped text on Python 2 as well as 3.
		book_names = {
			'old_testament': [
				'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
				'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings',
				'2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah',
				'Esther', 'Job', 'Psalms', 'Psalm', 'Proverbs', 'Ecclesiastes',
				'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations',
				'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah',
				'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah',
				'Malachi',
			],
			'new_testament': [
				'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans',
				'1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians',
				'Philippians', 'Colossians', '1 Thessalonians',
				'2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
				'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John',
				'3 John', 'Jude', 'Revelation',
			],
			'book_of_mormon': [
				'1 Nephi', '2 Nephi', 'Jacob', 'Enos', 'Jarom', 'Omni',
				'Words of Mormon', 'Mosiah', 'Alma', 'Helaman', '3 Nephi',
				'4 Nephi', 'Mormon', 'Ether', 'Moroni',
			],
			'doctrine_and_covenants': ['D&C'],
			'pearl_of_great_price': [
				'Moses', 'Abraham', u'Joseph Smith—Matthew',
				u'Joseph Smith—History', 'Articles of Faith',
			],
		}

		# (volume key, HTML anchor, heading) in output order.  References whose
		# book is in none of these volumes are filed under the key 'other'.
		volume_headings = (
			('old_testament', 'old-testament', 'Old Testament'),
			('new_testament', 'new-testament', 'New Testament'),
			('book_of_mormon', 'book-of-mormon', 'Book of Mormon'),
			('doctrine_and_covenants', 'doctrine-and-covenants', 'Doctrine and Covenants'),
			('pearl_of_great_price', 'pearl-of-great-price', 'Pearl of Great Price'),
		)

		# The only months save() knows a name for.
		month_names = {4: 'April', 10: 'October'}

		# Seconds to wait on each HTTP request before giving up.
		request_timeout = 30

		# "<book> <chapter>[:<verse>]"; groups 1, 2 and 4 are book, chapter, verse.
		_REF_RE = re.compile(r'(.*?) (\d+)(:(\d+))?')

		def __init__(self, year, month, limit=None):
			"""Download, sort and save the references for one conference.

			Args:
				year: conference year, used in the URL and the page title.
				month: conference month; must be 4 or 10.
				limit: maximum number of talk <span>s to read from the
					session page, or None for all of them.

			Raises:
				ValueError: if ``month`` is not a key of ``month_names``.
			"""
			self.year = year
			self.month = month
			self._month_name()  # fail fast, before any network traffic

			# Per-instance state.  These used to be class attributes, so every
			# instance shared (and kept appending to) the same containers.
			self.talks = []
			self.references = {}
			self.urls = {}
			self.sorted_references = self._empty_volumes()

			print('Getting the talks...')
			self.get_talks(year, month, limit)

			print('Getting references for each talk...')
			self.get_references()

			# One sort on (book, chapter, verse) gives the same order as three
			# successive stable sorts by verse, then chapter, then book.
			print('Sorting...')
			self.sorted_list = sorted(self.references, key=self._sort_key)

			# Sort into books, populates self.sorted_references
			self.sort_into_books()

			print('Saving to disk...')
			self.save()

		def _empty_volumes(self):
			"""Return a fresh {volume key: []} dict, including 'other'."""
			volumes = {key: [] for key, _anchor, _heading in self.volume_headings}
			volumes['other'] = []
			return volumes

		def _month_name(self):
			"""Return the display name of self.month; ValueError if unknown."""
			try:
				return self.month_names[self.month]
			except KeyError:
				raise ValueError('month must be 4 (April) or 10 (October), got %r'
								 % (self.month,))

		def _fetch_soup(self, url):
			"""Download ``url`` and return the page parsed by BeautifulSoup.

			Raises whatever ``requests`` raises on a timeout, and on an HTTP
			error status instead of silently parsing an error page.
			"""
			r = requests.get(url, timeout=self.request_timeout)
			r.raise_for_status()
			return bs4.BeautifulSoup(r.content)

		def get_talks(self, year, month, limit=None):
			"""Download the session page and store its talks in self.talks.

			Every <span class="talk"> that contains a link becomes a dict with
			'title', 'url' and 'speaker' keys.  ``limit`` caps how many spans
			are read (None reads all).
			"""
			url = 'http://www.lds.org/general-conference/sessions/%04d/%02d' % (year, month)
			soup = self._fetch_soup(url)

			self.talks = []
			for talk in soup.find_all("span", "talk", limit=limit):
				if not talk.a:
					continue
				title, url = talk.a.contents[0], talk.a['href']

				# An entry without a speaker span used to raise AttributeError.
				speaker_span = talk.parent.find("span", "speaker")
				if speaker_span is not None and speaker_span.contents:
					speaker = speaker_span.contents[0]
				else:
					speaker = ''

				self.talks.append({'title': title, 'url': url, 'speaker': speaker})

		def _clean_title(self, title, ref_url):
			"""Normalise the link text of one scripture reference.

			Non-breaking spaces become plain spaces first, so that the
			'Doctrine and Covenants' abbreviation below also matches when the
			words are joined by them.  A bare "section:verse" title linking
			into the D&C is given its 'D&C ' prefix.
			"""
			title = title.strip().replace(u"\u00A0", " ")
			if re.match(r'\d{1,3}:', title) and 'scriptures/dc-testament' in ref_url:
				title = 'D&C %s' % title
			return title.replace('Doctrine and Covenants', 'D&C')

		def get_refs_for_talk(self, url):
			"""Return the scripture references linked from the talk at ``url``.

			Each <a class="scriptureRef"> yields a dict with the normalised
			'title' and the link's 'url'.
			"""
			soup = self._fetch_soup(url)

			response = []
			for ref in soup.find_all("a", "scriptureRef"):
				ref_url = ref['href']
				title = self._clean_title(ref.contents[0], ref_url)
				response.append({'title': title, 'url': ref_url})
			return response

		def get_references(self):
			"""Collect the references of every talk in self.talks.

			Stores each talk's list under talk['references'] and fills
			self.references / self.urls.  A talk is listed once per reference
			even if it cites that reference several times.
			"""
			for talk in self.talks:
				talk['references'] = self.get_refs_for_talk(talk['url'])

				for ref in talk['references']:
					title = ref['title']

					# First sighting: remember its URL and start its talk list
					if title not in self.references:
						self.references[title] = []
						self.urls[title] = ref['url']

					# Talks are processed one at a time, so a repeat citation
					# by this talk can only be the last entry.
					cited_by = self.references[title]
					if not cited_by or cited_by[-1] is not talk:
						cited_by.append(talk)

		def sort_by_verse(self, key):
			"""Return the verse number (after the colon) of ``key``, else 0."""
			m = self._REF_RE.match(key)
			if m and m.group(4) is not None:
				return int(m.group(4))
			return 0

		def sort_by_chapter(self, key):
			"""Return the chapter number (before the colon) of ``key``, else 0."""
			m = self._REF_RE.match(key)
			return int(m.group(2)) if m else 0

		def _locate_book(self, ref):
			"""Return (volume key, index of the book within that volume).

			References that do not look like "<book> <chapter>", or whose book
			is not in book_names, give ('other', 0).
			"""
			m = self._REF_RE.match(ref)
			if m:
				book = m.group(1)
				for volume, _anchor, _heading in self.volume_headings:
					books = self.book_names[volume]
					if book in books:
						return volume, books.index(book)
			return 'other', 0

		def sort_by_book(self, key):
			"""Return the position of ``key``'s book within its volume, else 0."""
			return self._locate_book(key)[1]

		def _sort_key(self, ref):
			"""Return the (book, chapter, verse) sort key for ``ref``."""
			return (self.sort_by_book(ref), self.sort_by_chapter(ref),
					self.sort_by_verse(ref))

		def sort_into_books(self):
			"""Rebuild self.sorted_references from self.sorted_list.

			The mapping is built from scratch on every call, so calling this
			twice does not duplicate entries.
			"""
			volumes = self._empty_volumes()
			for ref in self.sorted_list:
				volumes[self._locate_book(ref)[0]].append(ref)
			self.sorted_references = volumes

		@staticmethod
		def _escape(text):
			"""Return ``text`` escaped for use in HTML text and attributes."""
			try:
				from html import escape   # Python 3
			except ImportError:
				from cgi import escape    # Python 2
			return escape('%s' % (text,), True)

		def print_list(self, book):
			"""Write the <li> entries of one volume to self.handle.

			Titles and URLs come from scraped pages, so they are HTML-escaped
			(this also turns the '&' of 'D&C' into a valid '&amp;').
			"""
			write = self.handle.write
			escape = self._escape
			for ref in self.sorted_references[book]:
				write('<li>\n\t<label><a href="%s">%s</a></label>\n\t<ul class="refs">\n'
					  % (escape(self.urls[ref]), escape(ref)))

				for talk in self.references[ref]:
					write('\t\t<li><a href="%s">%s</a></li>\n'
						  % (escape(talk['url']), escape(talk['title'])))

				write('\t</ul>\n</li>\n')

		def count_list(self, book):
			"""Return the number of references filed under volume ``book``."""
			return len(self.sorted_references[book])

		def save(self):
			"""Write the whole index to ``output.html`` (UTF-8).

			Raises:
				ValueError: if self.month is not a key of ``month_names``.
			"""
			month_name = self._month_name()

			# ``with`` closes the file even if one of the writes fails
			with codecs.open('output.html', 'w', 'utf-8') as f:
				self.handle = f

				f.write('<html>\n')
				f.write('<head>\n')
				f.write('\t<meta charset="utf-8">\n')
				# self.year, not the module-level ``year``
				f.write('\t<title>%s %s General Conference Scripture References</title>\n' % (month_name, self.year))
				f.write('\t<style type="text/css">\n')
				f.write('\t\t* { -moz-box-sizing: border-box; box-sizing: border-box; }\n')
				f.write('\t\ta { color: #5591ce; text-decoration: none; }\n')
				f.write('\t\ta:hover { text-decoration: underline; }\n')
				f.write('\t\tbody { margin: 0; padding: 0; font-family: Helvetica, Arial, sans-serif; }\n')
				f.write('\t\t#page { max-width: 800px; width: 95%; margin: 50px auto; }\n')
				f.write('\t\t#page h1 { font-size: 1.8em; }\n')
				f.write('\t\t#page h2 { font-size: 1.6em; margin: 2em 0 .5em; }\n')
				f.write('\t\t#page > ul { list-style: none; margin: 0; padding: 0; line-height: 1.5em; }\n')
				f.write('\t\t#page > ul > li { border-bottom: solid 1px #ddd; padding: 5px 0; overflow: auto; clear: both; }\n')
				f.write('\t\t#page > ul label { font-weight: bold; font-size: 1.2em; width: 50%; float: left; }\n')
				f.write('\t\t#page > ul ul.refs { margin: 0; float: left; padding: 0; list-style: none; }\n')
				f.write('\t\t#page > ul.toc > li { border: none; display: inline-block; }\n')
				f.write('\t\t#page > ul.toc > li + li:before { content: " -- "; color: #ccc; }\n')
				f.write('\t\t@media screen and (max-width: 750px) {\n')
				f.write('\t\t\t#page { margin: 15px auto; }\n')
				f.write('\t\t\t#page > ul label { float: none; }\n')
				f.write('\t\t\t#page > ul ul.refs { float: none; }\n')
				f.write('\t\t}\n')
				f.write('\t</style>\n')
				f.write('</head>\n')
				f.write('<body>\n')
				f.write('<section id="page">\n')
				f.write('\t<h1>%s %s General Conference Scripture References</h1>\n\n' % (month_name, self.year))

				# Table of contents ('other' has no anchor and is not listed)
				f.write('\t<ul class="toc">\n')
				for _key, anchor, heading in self.volume_headings:
					f.write('\t\t<li><a href="#%s">%s</a></li>\n' % (anchor, heading))
				f.write('\t</ul>\n\n')

				# One section per volume
				for key, anchor, heading in self.volume_headings:
					f.write('\t<h2 id="%s">%s</h2>\n' % (anchor, heading))
					f.write('\t<ul>\n')
					self.print_list(key)
					f.write('\t</ul>\n')

				f.write('\t<h2>Other</h2>\n')
				f.write('\t<ul>\n')
				self.print_list('other')
				f.write('\t</ul>\n')

				f.write('</section>\n')
				f.write('</body>\n')
				f.write('</html>\n')

	if __name__ == '__main__':
		# Constructing the session runs the whole job: fetch, sort and save.
		# (The call had lost its indentation and was no longer the body of
		# the ``if``.)
		session = ConferenceSession(year, month, limit)