Skip to content

Instantly share code, notes, and snippets.

@billyeh
Last active July 11, 2017 20:15
Show Gist options
  • Save billyeh/4eb74118cc4f671e54d55b553f333f2d to your computer and use it in GitHub Desktop.
Save billyeh/4eb74118cc4f671e54d55b553f333f2d to your computer and use it in GitHub Desktop.
Parse the Recovery Version to create a flat Bible
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"import itertools\n",
"\n",
"from bs4 import BeautifulSoup, NavigableString\n",
"from urllib import parse\n",
"\n",
"URL = 'http://online.recoveryversion.bible/BibleChapters.asp?'\n",
"OUTLINE = 'outline'\n",
"VERSE = 'verse'\n",
"\n",
"def extract_outline_tag(e):\n",
"    \"\"\"Label ``e`` as outline content and report whether it is a heading tag.\n",
"\n",
"    The label is written unconditionally, i.e. also when the result is False.\n",
"    \"\"\"\n",
"    e.content_type = OUTLINE\n",
"    is_heading = e.name == 'heading'\n",
"    return is_heading\n",
"\n",
"def extract_verse_tag(e):\n",
"    \"\"\"Label ``e`` as verse content and report whether its class includes 'verses'.\n",
"\n",
"    The label is written unconditionally, i.e. also when the result is False.\n",
"    \"\"\"\n",
"    e.content_type = VERSE\n",
"    # A missing class attribute is treated as an empty collection of classes.\n",
"    css_classes = e.attrs.get('class', ())\n",
"    return 'verses' in css_classes\n",
"\n",
"class ChapterParser:\n",
"    \"\"\"Download one chapter page and expose its outline headings and verses.\"\"\"\n",
"\n",
"    def __init__(self, chapter):\n",
"        \"\"\"Fetch the page for ``chapter`` and strip markup that is not wanted.\n",
"\n",
"        ``chapter`` is sent as both the ``fcid`` and ``lcid`` query parameters.\n",
"        \"\"\"\n",
"        params = {\n",
"            'fcid': chapter,\n",
"            'lcid': chapter,\n",
"        }\n",
"        response = requests.get(URL + parse.urlencode(params))\n",
"        self.soup = BeautifulSoup(response.text, 'html5lib')\n",
"        self.clear_extra_text()\n",
"        self.replace_breaks()\n",
"\n",
"    def replace_breaks(self):\n",
"        \"\"\"Swap every br tag for a newline so line breaks survive text extraction.\"\"\"\n",
"        for br in self.soup.find_all('br'):\n",
"            br.replace_with('\\n')\n",
"\n",
"    def clear_extra_text(self):\n",
"        \"\"\"Remove all sup, b and a tags (together with their contents) from the tree.\"\"\"\n",
"        for tag_name in ('sup', 'b', 'a'):\n",
"            for tag in self.soup.find_all(tag_name):\n",
"                tag.extract()\n",
"\n",
"    @property\n",
"    def book(self):\n",
"        \"\"\"Page title text up to (not including) its first non-breaking space.\"\"\"\n",
"        return self.soup.title.get_text().split('\\xa0')[0]\n",
"\n",
"    @property\n",
"    def content(self):\n",
"        \"\"\"Return the heading and verse tags of the page, in document order.\n",
"\n",
"        Every tag visited during the search gets a ``book`` attribute; the\n",
"        returned tags also carry ``content_type`` (OUTLINE or VERSE).\n",
"        \"\"\"\n",
"        # Read the title once instead of once per element visited by find_all.\n",
"        book = self.book\n",
"\n",
"        def extract_content(e):\n",
"            e.book = book\n",
"            # Order matters: the outline check runs first and, on a match, the\n",
"            # short-circuit keeps the verse check from overwriting content_type.\n",
"            return extract_outline_tag(e) or extract_verse_tag(e)\n",
"\n",
"        return self.soup.find_all(extract_content)\n",
"\n",
"class FlatBible:\n",
"    \"\"\"Flatten a run of consecutive chapters into per-book outline and verse text.\"\"\"\n",
"\n",
"    def __init__(self, first_chapter, second_chapter):\n",
"        \"\"\"Parse every chapter id from ``first_chapter`` to ``second_chapter`` inclusive.\"\"\"\n",
"        self.chapters = [ChapterParser(c) for c in range(first_chapter, second_chapter + 1)]\n",
"        # Keep the tags in a list: a bare chain iterator is exhausted by the first\n",
"        # to_json() call, which made every later call return an empty list.\n",
"        self.tags = list(itertools.chain.from_iterable(c.content for c in self.chapters))\n",
"\n",
"    def to_json(self):\n",
"        \"\"\"Return a list of JSON-serialisable dicts, one per consecutive run of a book.\n",
"\n",
"        Each dict has a ``name`` and a ``contents`` list. ``contents`` holds one\n",
"        ``{'type', 'text'}`` entry per consecutive run of tags sharing the same\n",
"        content type, with the text of the run concatenated.\n",
"        \"\"\"\n",
"        books = []\n",
"        # groupby only merges adjacent items, which is what is wanted here:\n",
"        # the tags are already in chapter and document order.\n",
"        for name, book_tags in itertools.groupby(self.tags, lambda t: t.book):\n",
"            contents = [\n",
"                {'type': kind, 'text': ''.join(tag.text for tag in run)}\n",
"                for kind, run in itertools.groupby(book_tags, lambda t: t.content_type)\n",
"            ]\n",
"            books.append({'name': name, 'contents': contents})\n",
"        return books"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('matt', 1, 28),\n",
" ('mark', 29, 44),\n",
" ('luke', 45, 68),\n",
" ('john', 69, 89),\n",
" ('acts', 90, 117),\n",
" ('rom', 118, 133),\n",
" ('1cor', 134, 149),\n",
" ('2cor', 150, 162),\n",
" ('gal', 163, 168),\n",
" ('eph', 169, 174),\n",
" ('phil', 175, 178),\n",
" ('col', 179, 182),\n",
" ('1thess', 183, 187),\n",
" ('2thess', 188, 190),\n",
" ('1tim', 191, 196),\n",
" ('2tim', 197, 200),\n",
" ('titus', 201, 203),\n",
" ('phlm', 204, 204),\n",
" ('heb', 205, 217),\n",
" ('jas', 218, 222),\n",
" ('1pet', 223, 227),\n",
" ('2pet', 228, 230),\n",
" ('1john', 231, 235),\n",
" ('2john', 236, 236),\n",
" ('3john', 237, 237),\n",
" ('jude', 238, 238),\n",
" ('rev', 239, 260)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NT books taken from https://github.com/davisd/python-scriptures/blob/master/scriptures/texts/protestant.py\n",
"# Copyright (c) 2010-2015, David Davis <davisd@davisd.com>\n",
"# All rights reserved.\n",
"books = (\n",
" 'matt', ('Matthew', 'Matt', 'Matt(?:hew)?', [25, 23, 17, 25, 48, 34, 29, 34, 38, 42, 30, 50, 58, 36, 39, 28, 27, 35, 30, 34, 46, 46, 39, 51, 46, 75, 66, 20]),\n",
" 'mark', ('Mark', 'Mark', 'Mark', [45, 28, 35, 41, 43, 56, 37, 38, 50, 52, 33, 44, 37, 72, 47, 20]),\n",
" 'luke', ('Luke', 'Luke', 'Luke', [80, 52, 38, 44, 39, 49, 50, 56, 62, 42, 54, 59, 35, 35, 32, 31, 37, 43, 48, 47, 38, 71, 56, 53]),\n",
" 'john', ('John', 'John', '(?<!(?:1|2|3|I)\\s)(?<!(?:1|2|3|I))John', [51, 25, 36, 54, 47, 71, 53, 59, 41, 42, 57, 50, 38, 31, 27, 33, 26, 40, 42, 31, 25]),\n",
" 'acts', ('Acts', 'Acts', 'Acts', [26, 47, 26, 37, 42, 15, 60, 40, 43, 48, 30, 25, 52, 28, 41, 40, 34, 28, 41, 38, 40, 30, 35, 27, 27, 32, 44, 31]),\n",
" 'rom', ('Romans', 'Rom', 'Rom(?:ans)?', [32, 29, 31, 25, 21, 23, 25, 39, 33, 21, 36, 21, 14, 23, 33, 27]),\n",
" '1cor', ('I Corinthians', '1Cor', '(?:1|I)(?:\\s)?Cor(?:inthians)?', [31, 16, 23, 21, 13, 20, 40, 13, 27, 33, 34, 31, 13, 40, 58, 24]),\n",
" '2cor', ('II Corinthians', '2Cor', '(?:2|II)(?:\\s)?Cor(?:inthians)?', [24, 17, 18, 18, 21, 18, 16, 24, 15, 18, 33, 21, 14]),\n",
" 'gal', ('Galatians', 'Gal', 'Gal(?:atians)?', [24, 21, 29, 31, 26, 18]),\n",
" 'eph', ('Ephesians', 'Eph', 'Eph(?:esians)?', [23, 22, 21, 32, 33, 24]),\n",
" 'phil', ('Philippians', 'Phil', 'Phil(?:ippians)?', [30, 30, 21, 23]),\n",
" 'col', ('Colossians', 'Col', 'Col(?:ossians)?', [29, 23, 25, 18]),\n",
" '1thess', ('I Thessalonians', '1Thess', '(?:1|I)(?:\\s)?Thess(?:alonians)?', [10, 20, 13, 18, 28]),\n",
" '2thess', ('II Thessalonians', '2Thess', '(?:2|II)(?:\\s)?Thess(?:alonians)?', [12, 17, 18]),\n",
" '1tim', ('I Timothy', '1Tim', '(?:1|I)(?:\\s)?Tim(?:othy)?', [20, 15, 16, 16, 25, 21]),\n",
" '2tim', ('II Timothy', '2Tim', '(?:2|II)(?:\\s)?Tim(?:othy)?', [18, 26, 17, 22]),\n",
" 'titus', ('Titus', 'Titus', 'Tit(?:us)?', [16, 15, 15]),\n",
" 'phlm', ('Philemon', 'Phlm', 'Phlm|Phile(?:m(?:on)?)?', [25]),\n",
" 'heb', ('Hebrews', 'Heb', 'Heb(?:rews)?', [14, 18, 19, 16, 14, 20, 28, 13, 28, 39, 40, 29, 25]),\n",
" 'jas', ('James', 'Jas', 'Ja(?:me)?s', [27, 26, 18, 17, 20]),\n",
" '1pet', ('I Peter', '1Pet', '(?:1|I)(?:\\s)?Pet(?:er)?', [25, 25, 22, 19, 14]),\n",
" '2pet', ('II Peter', '2Pet', '(?:2|II)(?:\\s)?Pet(?:er)?', [21, 22, 18]),\n",
" '1john', ('I John', '1John', '(?:(?:1|I)(?:\\s)?)John', [10, 29, 24, 21, 21]),\n",
" '2john', ('II John', '2John', '(?:(?:2|II)(?:\\s)?)John', [13]),\n",
" '3john', ('III John', '3John', '(?:(?:3|III)(?:\\s)?)John', [14]),\n",
" 'jude', ('Jude', 'Jude', 'Jude', [25]),\n",
" 'rev', ('Revelation of Jesus Christ', 'Rev', 'Rev(?:elation)?(?:\\sof Jesus Christ)?', [20, 29, 22, 11, 14, 17, 17, 13, 21, 11, 19, 17, 18, 20, 8, 21, 18, 24, 21, 15, 27, 21]),\n",
")\n",
"\n",
"# Pair the flat tuple up as (key, info) entries: even slots are keys, odd slots are info.\n",
"books = zip(books[0::2], books[1::2])\n",
"book_ranges = []\n",
"curr_chapter = 1\n",
"for k, v in books:\n",
"    # The trailing list of each info tuple has one entry per chapter.\n",
"    chapter_count = len(v[-1])\n",
"    last_chapter_of_book = curr_chapter + chapter_count - 1\n",
"    book_ranges.append((k, curr_chapter, last_chapter_of_book))\n",
"    # The next book starts right after this one in the running chapter numbering.\n",
"    curr_chapter = last_chapter_of_book + 1\n",
"book_ranges\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. The King's Antecedents and Status\\nA. His Genealogy an\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. The Beginning of the Gospel and\\nthe Initiation of the\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Inasmuch\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. The Eternal Word Incarnated Coming to Bring God into M\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" The form\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction --\\nThe Gospel of God\\nA. Promised in the\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- The Initial Gifts\\nand the Participati\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\nA. Greeting\\n\"}, {\"type\": \"verse\", \"text\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- \\nThe Will of God to Rescue Us\\nout of\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, an\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul and\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\nA. The Apostle's Greeting\\n\"}, {\"type\": \n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul and\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul and\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, an\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, an\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, a \n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, a \n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction --\\nGod Speaking in the Son\\n\"}, {\"type\":\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- To the\\nTwelve Tribes in Dispersion\\n\"\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- To the\\nSojourning Believers under the\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- To the Believers,\\nWho Have Been Allot\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. The Fellowship of the Divine Life\\nA. The Manifestatio\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\nA. Loving in Truthfulness for the Truth\\\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\nA. Loving in Truthfulness\\n\"}, {\"type\": \n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction --\\nTo Those Called, Beloved, and Kept\\n\"\n",
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction --\\nThe Revelation of Christ\\nand the Tes\n"
]
}
],
"source": [
"import json\n",
"# Write one <key>.json file per entry of book_ranges and echo a short preview.\n",
"for name, first_chapter, last_chapter in book_ranges:\n",
"    # Only the first book group is kept; each range is built from one book's chapters.\n",
"    book = json.dumps(FlatBible(first_chapter, last_chapter).to_json()[0])\n",
"    print(book[:100])\n",
"    # Plain 'w' is enough: the file is only written, never read back.\n",
"    with open('{0}.json'.format(name), 'w') as f:\n",
"        f.write(book)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment