Last active
July 11, 2017 20:15
-
-
Save billyeh/4eb74118cc4f671e54d55b553f333f2d to your computer and use it in GitHub Desktop.
Parse the Recovery Version to create a flat Bible
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import itertools\n", | |
"\n", | |
"from bs4 import BeautifulSoup, NavigableString\n", | |
"from urllib import parse\n", | |
"\n", | |
"URL = 'http://online.recoveryversion.bible/BibleChapters.asp?'\n", | |
"OUTLINE = 'outline'\n", | |
"VERSE = 'verse'\n", | |
"\n", | |
"def extract_outline_tag(e):\n", | |
" e.content_type = OUTLINE\n", | |
" return e.name == 'heading'\n", | |
"\n", | |
"def extract_verse_tag(e):\n", | |
" e.content_type = VERSE\n", | |
" return 'class' in e.attrs and 'verses' in e.attrs['class']\n", | |
"\n", | |
"class ChapterParser:\n", | |
" def __init__(self, chapter):\n", | |
" params = {\n", | |
" 'fcid': chapter,\n", | |
" 'lcid': chapter,\n", | |
" }\n", | |
" self.soup = BeautifulSoup(requests.get(URL + parse.urlencode(params)).text, 'html5lib')\n", | |
" self.clear_extra_text()\n", | |
" self.replace_breaks()\n", | |
" \n", | |
" def replace_breaks(self):\n", | |
" for br in self.soup.find_all(\"br\"):\n", | |
" br.replace_with(\"\\n\")\n", | |
" \n", | |
" def clear_extra_text(self):\n", | |
" for tag in self.soup.find_all('sup'):\n", | |
" tag.extract()\n", | |
" for tag in self.soup.find_all('b'):\n", | |
" tag.extract()\n", | |
" for tag in self.soup.find_all('a'):\n", | |
" tag.extract()\n", | |
" \n", | |
" @property\n", | |
" def book(self):\n", | |
" return self.soup.title.get_text().split('\\xa0')[0]\n", | |
" \n", | |
" @property\n", | |
" def content(self):\n", | |
" def extract_content(e):\n", | |
" e.book = self.book\n", | |
" return extract_outline_tag(e) or extract_verse_tag(e)\n", | |
" return self.soup.find_all(extract_content)\n", | |
"\n", | |
"class FlatBible:\n", | |
" \n", | |
" def __init__(self, first_chapter, second_chapter):\n", | |
" self.chapters = [ChapterParser(c) for c in range(first_chapter, second_chapter + 1)]\n", | |
" self.tags = itertools.chain.from_iterable(c.content for c in self.chapters)\n", | |
" \n", | |
" def to_json(self):\n", | |
" book_list = []\n", | |
" books = [{'name': b[0], 'tags': list(b[1])} for b in itertools.groupby(self.tags, lambda t: t.book)]\n", | |
" for b in books:\n", | |
" tags = itertools.groupby(b['tags'], lambda t: t.content_type)\n", | |
" b['contents'] = [{'type': t[0], 'text': ''.join([c.text for c in t[1]])} for t in tags]\n", | |
" del b['tags']\n", | |
" return books" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('matt', 1, 28),\n", | |
" ('mark', 29, 44),\n", | |
" ('luke', 45, 68),\n", | |
" ('john', 69, 89),\n", | |
" ('acts', 90, 117),\n", | |
" ('rom', 118, 133),\n", | |
" ('1cor', 134, 149),\n", | |
" ('2cor', 150, 162),\n", | |
" ('gal', 163, 168),\n", | |
" ('eph', 169, 174),\n", | |
" ('phil', 175, 178),\n", | |
" ('col', 179, 182),\n", | |
" ('1thess', 183, 187),\n", | |
" ('2thess', 188, 190),\n", | |
" ('1tim', 191, 196),\n", | |
" ('2tim', 197, 200),\n", | |
" ('titus', 201, 203),\n", | |
" ('phlm', 204, 204),\n", | |
" ('heb', 205, 217),\n", | |
" ('jas', 218, 222),\n", | |
" ('1pet', 223, 227),\n", | |
" ('2pet', 228, 230),\n", | |
" ('1john', 231, 235),\n", | |
" ('2john', 236, 236),\n", | |
" ('3john', 237, 237),\n", | |
" ('jude', 238, 238),\n", | |
" ('rev', 239, 260)]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# NT books taken from https://github.com/davisd/python-scriptures/blob/master/scriptures/texts/protestant.py\n", | |
"# Copyright (c) 2010-2015, David Davis <davisd@davisd.com>\n", | |
"# All rights reserved.\n", | |
"books = (\n", | |
" 'matt', ('Matthew', 'Matt', 'Matt(?:hew)?', [25, 23, 17, 25, 48, 34, 29, 34, 38, 42, 30, 50, 58, 36, 39, 28, 27, 35, 30, 34, 46, 46, 39, 51, 46, 75, 66, 20]),\n", | |
" 'mark', ('Mark', 'Mark', 'Mark', [45, 28, 35, 41, 43, 56, 37, 38, 50, 52, 33, 44, 37, 72, 47, 20]),\n", | |
" 'luke', ('Luke', 'Luke', 'Luke', [80, 52, 38, 44, 39, 49, 50, 56, 62, 42, 54, 59, 35, 35, 32, 31, 37, 43, 48, 47, 38, 71, 56, 53]),\n", | |
" 'john', ('John', 'John', '(?<!(?:1|2|3|I)\\s)(?<!(?:1|2|3|I))John', [51, 25, 36, 54, 47, 71, 53, 59, 41, 42, 57, 50, 38, 31, 27, 33, 26, 40, 42, 31, 25]),\n", | |
" 'acts', ('Acts', 'Acts', 'Acts', [26, 47, 26, 37, 42, 15, 60, 40, 43, 48, 30, 25, 52, 28, 41, 40, 34, 28, 41, 38, 40, 30, 35, 27, 27, 32, 44, 31]),\n", | |
" 'rom', ('Romans', 'Rom', 'Rom(?:ans)?', [32, 29, 31, 25, 21, 23, 25, 39, 33, 21, 36, 21, 14, 23, 33, 27]),\n", | |
" '1cor', ('I Corinthians', '1Cor', '(?:1|I)(?:\\s)?Cor(?:inthians)?', [31, 16, 23, 21, 13, 20, 40, 13, 27, 33, 34, 31, 13, 40, 58, 24]),\n", | |
" '2cor', ('II Corinthians', '2Cor', '(?:2|II)(?:\\s)?Cor(?:inthians)?', [24, 17, 18, 18, 21, 18, 16, 24, 15, 18, 33, 21, 14]),\n", | |
" 'gal', ('Galatians', 'Gal', 'Gal(?:atians)?', [24, 21, 29, 31, 26, 18]),\n", | |
" 'eph', ('Ephesians', 'Eph', 'Eph(?:esians)?', [23, 22, 21, 32, 33, 24]),\n", | |
" 'phil', ('Philippians', 'Phil', 'Phil(?:ippians)?', [30, 30, 21, 23]),\n", | |
" 'col', ('Colossians', 'Col', 'Col(?:ossians)?', [29, 23, 25, 18]),\n", | |
" '1thess', ('I Thessalonians', '1Thess', '(?:1|I)(?:\\s)?Thess(?:alonians)?', [10, 20, 13, 18, 28]),\n", | |
" '2thess', ('II Thessalonians', '2Thess', '(?:2|II)(?:\\s)?Thess(?:alonians)?', [12, 17, 18]),\n", | |
" '1tim', ('I Timothy', '1Tim', '(?:1|I)(?:\\s)?Tim(?:othy)?', [20, 15, 16, 16, 25, 21]),\n", | |
" '2tim', ('II Timothy', '2Tim', '(?:2|II)(?:\\s)?Tim(?:othy)?', [18, 26, 17, 22]),\n", | |
" 'titus', ('Titus', 'Titus', 'Tit(?:us)?', [16, 15, 15]),\n", | |
" 'phlm', ('Philemon', 'Phlm', 'Phlm|Phile(?:m(?:on)?)?', [25]),\n", | |
" 'heb', ('Hebrews', 'Heb', 'Heb(?:rews)?', [14, 18, 19, 16, 14, 20, 28, 13, 28, 39, 40, 29, 25]),\n", | |
" 'jas', ('James', 'Jas', 'Ja(?:me)?s', [27, 26, 18, 17, 20]),\n", | |
" '1pet', ('I Peter', '1Pet', '(?:1|I)(?:\\s)?Pet(?:er)?', [25, 25, 22, 19, 14]),\n", | |
" '2pet', ('II Peter', '2Pet', '(?:2|II)(?:\\s)?Pet(?:er)?', [21, 22, 18]),\n", | |
" '1john', ('I John', '1John', '(?:(?:1|I)(?:\\s)?)John', [10, 29, 24, 21, 21]),\n", | |
" '2john', ('II John', '2John', '(?:(?:2|II)(?:\\s)?)John', [13]),\n", | |
" '3john', ('III John', '3John', '(?:(?:3|III)(?:\\s)?)John', [14]),\n", | |
" 'jude', ('Jude', 'Jude', 'Jude', [25]),\n", | |
" 'rev', ('Revelation of Jesus Christ', 'Rev', 'Rev(?:elation)?(?:\\sof Jesus Christ)?', [20, 29, 22, 11, 14, 17, 17, 13, 21, 11, 19, 17, 18, 20, 8, 21, 18, 24, 21, 15, 27, 21]),\n", | |
")\n", | |
"\n", | |
"books = zip(*(iter(books),) * 2)\n", | |
"curr_chapter = 1\n", | |
"book_ranges = []\n", | |
"for k, v in books:\n", | |
" last_chapter_of_book = curr_chapter + len(v[-1]) - 1\n", | |
" book_ranges.append((k, curr_chapter, last_chapter_of_book))\n", | |
" curr_chapter = last_chapter_of_book + 1\n", | |
"book_ranges\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. The King's Antecedents and Status\\nA. His Genealogy an\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. The Beginning of the Gospel and\\nthe Initiation of the\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Inasmuch\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. The Eternal Word Incarnated Coming to Bring God into M\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" The form\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction --\\nThe Gospel of God\\nA. Promised in the\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- The Initial Gifts\\nand the Participati\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\nA. Greeting\\n\"}, {\"type\": \"verse\", \"text\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- \\nThe Will of God to Rescue Us\\nout of\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, an\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul and\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\nA. The Apostle's Greeting\\n\"}, {\"type\": \n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul and\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul and\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, an\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, an\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, a \n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\n\"}, {\"type\": \"verse\", \"text\": \" Paul, a \n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction --\\nGod Speaking in the Son\\n\"}, {\"type\":\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- To the\\nTwelve Tribes in Dispersion\\n\"\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- To the\\nSojourning Believers under the\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction -- To the Believers,\\nWho Have Been Allot\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. The Fellowship of the Divine Life\\nA. The Manifestatio\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\nA. Loving in Truthfulness for the Truth\\\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction\\nA. Loving in Truthfulness\\n\"}, {\"type\": \n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction --\\nTo Those Called, Beloved, and Kept\\n\"\n", | |
"{\"contents\": [{\"type\": \"outline\", \"text\": \"I. Introduction --\\nThe Revelation of Christ\\nand the Tes\n" | |
] | |
} | |
], | |
"source": [ | |
"import json\n", | |
"for rng in book_ranges:\n", | |
" book = json.dumps(FlatBible(*rng[1:]).to_json()[0])\n", | |
" print(book[:100])\n", | |
" with open('{0}.json'.format(rng[0]), 'w+') as f:\n", | |
" f.write(book)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment