Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save josifoski/9e45bfbfe73e58bc97562ad368f055c4 to your computer and use it in GitHub Desktop.
Save josifoski/9e45bfbfe73e58bc97562ad368f055c4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# script for scraping Bible texts from biblegateway.com
# creator: Aleksandar Josifoski for Troy Lyndon troylyndon@gmail.com property of RDGames http://rdgames.us
# 2016-04-01
from bs4 import BeautifulSoup
import urllib.request
import re
import sys
import os
import time
import datetime
import html
import random
import codecs
import zipfile
## INPUT ######################################################################################################################################
# First, look at biblegateway.com/versions or at bgtwkeys.txt (bgtwkeys.txt needs to be updated from time to time)
# and pick the keys of the translations to be scraped.
# Second, list those translations in pool, using the Bible abbreviation prefixed with the language abbreviation,
# like pool = [ "EN-KJV", "EN-NKJV"]
# To rescrape a Bible, delete the generated <abbreviation>_statusdone file in the directory where that Bible is saved.
pool = [
'SR-ERV-SR','SV-SVL','SV-SV1917','SV-SFB','SV-SFB2014','SW-SNT','TA-ERV-TA','TH-TNCV','TH-ERV-TH','TL-ADB1905',
'TL-SND','TWI-NA-TWI','UK-UKR','UK-ERV-UK','UR-ERV-UR','USP-USP','VI-VIET','VI-BD2011','VI-NVB','VI-BPT',
'ZH-CCB','ZH-ERV-ZH','ZH-CNVS','ZH-CNVT','ZH-CSBS','ZH-CSBT','ZH-CUVS','ZH-CUV','ZH-CUVMPS','ZH-CUVMPT'
]
# To scrape only one book, or only some of them, reduce bookslist.
bookslist = [ "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth", "1 Samuel", "2 Samuel",
"1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles", "Ezra", "Nehemiah", "Esther", "Job", "Psalms", "Proverbs",
"Ecclesiastes", "Song of Solomon", "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel", "Hosea",
"Joel", "Amos", "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah", "Malachi",
"Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians", "2 Corinthians", "Galatians", "Ephesians",
"Philippians", "Colossians", "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", "Philemon", "Hebrews",
"James", "1 Peter", "2 Peter", "1 John", "2 John", "3 John", "Jude", "Revelation" ]
#rootdir = '/data/rdgames/testing/bg/'
#xmlrootdir = '/data/rdgames/testing/bgxml/'
rootdir = '/home/josifoski/bibles/bg/'
xmlrootdir = '/home/josifoski/bibles/bgxml/'
# Note for rootdir: the last character must be /; put / as the first character for an absolute path.
# If you omit the leading /, the directories will be created in
# the current directory where the python scripts are.
###############################################################################################################################################
dprefixes = {
"Genesis":"01", "Exodus":"02", "Leviticus":"03", "Numbers":"04", "Deuteronomy":"05", "Joshua":"06", "Judges":"07", "Ruth":"08", "1 Samuel":"09",
"2 Samuel":"10", "1 Kings":"11", "2 Kings":"12", "1 Chronicles":"13", "2 Chronicles":"14", "Ezra":"15", "Nehemiah":"16", "Esther":"17", "Job":"18",
"Psalms":"19", "Proverbs":"20", "Ecclesiastes":"21", "Song of Solomon":"22", "Isaiah":"23", "Jeremiah":"24", "Lamentations":"25", "Ezekiel":"26",
"Daniel":"27", "Hosea":"28", "Joel":"29", "Amos":"30", "Obadiah":"31", "Jonah":"32", "Micah":"33", "Nahum":"34", "Habakkuk":"35", "Zephaniah":"36",
"Haggai":"37", "Zechariah":"38", "Malachi":"39", "Matthew":"40", "Mark":"41", "Luke":"42", "John":"43", "Acts":"44", "Romans":"45",
"1 Corinthians":"46", "2 Corinthians":"47", "Galatians":"48", "Ephesians":"49", "Philippians":"50", "Colossians":"51", "1 Thessalonians":"52",
"2 Thessalonians":"53", "1 Timothy":"54", "2 Timothy":"55", "Titus":"56", "Philemon":"57", "Hebrews":"58", "James":"59", "1 Peter":"60",
"2 Peter":"61", "1 John":"62", "2 John":"63", "3 John":"64", "Jude":"65", "Revelation":"66" }
# If you need to scrape only particular books, keep only their names in bookslist above. becarefull2 is a reserve copy of the full list for copy/paste.
becarefull2 = [ "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth", "1 Samuel", "2 Samuel",
"1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles", "Ezra", "Nehemiah", "Esther", "Job", "Psalms", "Proverbs",
"Ecclesiastes", "Song of Solomon", "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel", "Hosea",
"Joel", "Amos", "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah", "Malachi",
"Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians", "2 Corinthians", "Galatians", "Ephesians",
"Philippians", "Colossians", "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", "Philemon", "Hebrews",
"James", "1 Peter", "2 Peter", "1 John", "2 John", "3 John", "Jude", "Revelation" ]
# Two-character, zero-padded ordinals '01' .. '66', one per entry of the
# 66-book lists above, in the same order.
prefixes = tuple('%02d' % ordinal for ordinal in range(1, 67))
chaptersdict = { "Genesis" : 50, "Exodus" : 40, "Leviticus" : 27, "Numbers" : 36, "Deuteronomy" : 34, "Joshua" : 24, "Judges" : 21, "Ruth" : 4,
"1 Samuel" : 31, "2 Samuel" : 24, "1 Kings" : 22, "2 Kings" : 25, "1 Chronicles" : 29, "2 Chronicles" : 36, "Ezra" : 10, "Nehemiah" : 13, "Esther" : 10,
"Job" : 42, "Psalms" : 150, "Proverbs" : 31, "Ecclesiastes" : 12, "Song of Solomon" : 8, "Isaiah" : 66, "Jeremiah" : 52, "Lamentations" : 5,
"Ezekiel" : 48, "Daniel" : 12, "Hosea" : 14, "Joel" : 3, "Amos" : 9, "Obadiah" : 1, "Jonah" : 4, "Micah" : 7, "Nahum" : 3, "Habakkuk" : 3,
"Zephaniah" : 3, "Haggai" : 2, "Zechariah" : 14, "Malachi" : 4, "Matthew" : 28, "Mark" : 16, "Luke" : 24, "John" : 21, "Acts" : 28, "Romans" : 16,
"1 Corinthians" : 16, "2 Corinthians" : 13, "Galatians" : 6, "Ephesians" : 6, "Philippians" : 4, "Colossians" : 4, "1 Thessalonians" : 5,
"2 Thessalonians" : 3, "1 Timothy" : 6, "2 Timothy" : 4, "Titus" : 3, "Philemon" : 1, "Hebrews" : 13, "James" : 5, "1 Peter" : 5, "2 Peter" : 3,
"1 John" : 5, "2 John" : 1, "3 John" : 1, "Jude" : 1, "Revelation" : 22 }
Bibliaa = {
'Genesis' : ['1:31','2:25','3:24','4:26','5:32','6:22','7:24','8:22','9:29','10:32','11:32','12:20','13:18','14:24','15:21','16:16','17:27','18:33','19:38','20:18','21:34','22:24','23:20','24:67','25:34','26:35','27:46','28:22','29:35','30:43','31:55','32:32','33:20','34:31','35:29','36:43','37:36','38:30','39:23','40:23','41:57','42:38','43:34','44:34','45:28','46:34','47:31','48:22','49:33','50:26'],
'Exodus' : ['1:22','2:25','3:22','4:31','5:23','6:30','7:25','8:32','9:35','10:29','11:10','12:51','13:22','14:31','15:27','16:36','17:16','18:27','19:25','20:26','21:36','22:31','23:33','24:18','25:40','26:37','27:21','28:43','29:46','30:38','31:18','32:35','33:23','34:35','35:35','36:38','37:29','38:31','39:43','40:38'],
'Leviticus' : ['1:17','2:16','3:17','4:35','5:19','6:30','7:38','8:36','9:24','10:20','11:47','12:8','13:59','14:57','15:33','16:34','17:16','18:30','19:37','20:27','21:24','22:33','23:44','24:23','25:55','26:46','27:34'],
'Numbers' : ['1:54','2:34','3:51','4:49','5:31','6:27','7:89','8:26','9:23','10:36','11:35','12:16','13:33','14:45','15:41','16:50','17:13','18:32','19:22','20:29','21:35','22:41','23:30','24:25','25:18','26:65','27:23','28:31','29:40','30:16','31:54','32:42','33:56','34:29','35:34','36:13'],
'Deuteronomy' : ['1:46','2:37','3:29','4:49','5:33','6:25','7:26','8:20','9:29','10:22','11:32','12:32','13:18','14:29','15:23','16:22','17:20','18:22','19:21','20:20','21:23','22:30','23:25','24:22','25:19','26:19','27:26','28:68','29:29','30:20','31:30','32:52','33:29','34:12'],
'Joshua' : ['1:18','2:24','3:17','4:24','5:15','6:27','7:26','8:35','9:27','10:43','11:23','12:24','13:33','14:15','15:63','16:10','17:18','18:28','19:51','20:9','21:45','22:34','23:16','24:33'],
'Judges' : ['1:36','2:23','3:31','4:24','5:31','6:40','7:25','8:35','9:57','10:18','11:40','12:15','13:25','14:20','15:20','16:31','17:13','18:31','19:30','20:48','21:25'],
'Ruth' : ['1:22','2:23','3:18','4:22'],
'1Samuel' : ['1:28','2:36','3:21','4:22','5:12','6:21','7:17','8:22','9:27','10:27','11:15','12:25','13:23','14:52','15:35','16:23','17:58','18:30','19:24','20:42','21:15','22:23','23:29','24:22','25:44','26:25','27:12','28:25','29:11','30:31','31:13'],
'2Samuel' : ['1:27','2:32','3:39','4:12','5:25','6:23','7:29','8:18','9:13','10:19','11:27','12:31','13:39','14:33','15:37','16:23','17:29','18:33','19:43','20:26','21:22','22:51','23:39','24:25'],
'1Kings' : ['1:53','2:46','3:28','4:34','5:18','6:38','7:51','8:66','9:28','10:29','11:43','12:33','13:34','14:31','15:34','16:34','17:24','18:46','19:21','20:43','21:29','22:53'],
'2Kings' : ['1:18','2:25','3:27','4:44','5:27','6:33','7:20','8:29','9:37','10:36','11:21','12:21','13:25','14:29','15:38','16:20','17:41','18:37','19:37','20:21','21:26','22:20','23:37','24:20','25:30'],
'1Chronicles' : ['1:54','2:55','3:24','4:43','5:26','6:81','7:40','8:40','9:44','10:14','11:47','12:40','13:14','14:17','15:29','16:43','17:27','18:17','19:19','20:8','21:30','22:19','23:32','24:31','25:31','26:32','27:34','28:21','29:30'],
'2Chronicles' : ['1:17','2:18','3:17','4:22','5:14','6:42','7:22','8:18','9:31','10:19','11:23','12:16','13:22','14:15','15:19','16:14','17:19','18:34','19:11','20:37','21:20','22:12','23:21','24:27','25:28','26:23','27:9','28:27','29:36','30:27','31:21','32:33','33:25','34:33','35:27','36:23'],
'Ezra' : ['1:11','2:70','3:13','4:24','5:17','6:22','7:28','8:36','9:15','10:44'],
'Nehemiah' : ['1:11','2:20','3:32','4:23','5:19','6:19','7:73','8:18','9:38','10:39','11:36','12:47','13:31'],
'Esther' : ['1:22','2:23','3:15','4:17','5:14','6:14','7:10','8:17','9:32','10:3'],
'Job' : ['1:22','2:13','3:26','4:21','5:27','6:30','7:21','8:22','9:35','10:22','11:20','12:25','13:28','14:22','15:35','16:22','17:16','18:21','19:29','20:29','21:34','22:30','23:17','24:25','25:6','26:14','27:23','28:28','29:25','30:31','31:40','32:22','33:33','34:37','35:16','36:33','37:24','38:41','39:30','40:24','41:34','42:17'],
'Psalms' : ['1:6','2:12','3:8','4:8','5:12','6:10','7:17','8:9','9:20','10:18','11:7','12:8','13:6','14:7','15:5','16:11','17:15','18:50','19:14','20:9','21:13','22:31','23:6','24:10','25:22','26:12','27:14','28:9','29:11','30:12','31:24','32:11','33:22','34:22','35:28','36:12','37:40','38:22','39:13','40:17','41:13','42:11','43:5','44:26','45:17','46:11','47:9','48:14','49:20','50:23','51:19','52:9','53:6','54:7','55:23','56:13','57:11','58:11','59:17','60:12','61:8','62:12','63:11','64:10','65:13','66:20','67:7','68:35','69:36','70:5','71:24','72:20','73:28','74:23','75:10','76:12','77:20','78:72','79:13','80:19','81:16','82:8','83:18','84:12','85:13','86:17','87:7','88:18','89:52','90:17','91:16','92:15','93:5','94:23','95:11','96:13','97:12','98:9','99:9','100:5','101:8','102:28','103:22','104:35','105:45','106:48','107:43','108:13','109:31','110:7','111:10','112:10','113:9','114:8','115:18','116:19','117:2','118:29','119:176','120:7','121:8','122:9','123:4','124:8','125:5','126:6','127:5','128:6','129:8','130:8','131:3','132:18','133:3','134:3','135:21','136:26','137:9','138:8','139:24','140:13','141:10','142:7','143:12','144:15','145:21','146:10','147:20','148:14','149:9','150:6'],
'Proverbs' : ['1:33','2:22','3:35','4:27','5:23','6:35','7:27','8:36','9:18','10:32','11:31','12:28','13:25','14:35','15:33','16:33','17:28','18:24','19:29','20:30','21:31','22:29','23:35','24:34','25:28','26:28','27:27','28:28','29:27','30:33','31:31'],
'Ecclesiastes' : ['1:18','2:26','3:22','4:16','5:20','6:12','7:29','8:17','9:18','10:20','11:10','12:14'],
'SongofSolomon' : ['1:17','2:17','3:11','4:16','5:16','6:13','7:13','8:14'],
'Isaiah' : ['1:31','2:22','3:26','4:6','5:30','6:13','7:25','8:22','9:21','10:34','11:16','12:6','13:22','14:32','15:9','16:14','17:14','18:7','19:25','20:6','21:17','22:25','23:18','24:23','25:12','26:21','27:13','28:29','29:24','30:33','31:9','32:20','33:24','34:17','35:10','36:22','37:38','38:22','39:8','40:31','41:29','42:25','43:28','44:28','45:25','46:13','47:15','48:22','49:26','50:11','51:23','52:15','53:12','54:17','55:13','56:12','57:21','58:14','59:21','60:22','61:11','62:12','63:19','64:12','65:25','66:24'],
'Jeremiah' : ['1:19','2:37','3:25','4:31','5:31','6:30','7:34','8:22','9:26','10:25','11:23','12:17','13:27','14:22','15:21','16:21','17:27','18:23','19:15','20:18','21:14','22:30','23:40','24:10','25:38','26:24','27:22','28:17','29:32','30:24','31:40','32:44','33:26','34:22','35:19','36:32','37:21','38:28','39:18','40:16','41:18','42:22','43:13','44:30','45:5','46:28','47:7','48:47','49:39','50:46','51:64','52:34'],
'Lamentations' : ['1:22','2:22','3:66','4:22','5:22'],
'Ezekiel' : ['1:28','2:10','3:27','4:17','5:17','6:14','7:27','8:18','9:11','10:22','11:25','12:28','13:23','14:23','15:8','16:63','17:24','18:32','19:14','20:49','21:32','22:31','23:49','24:27','25:17','26:21','27:36','28:26','29:21','30:26','31:18','32:32','33:33','34:31','35:15','36:38','37:28','38:23','39:29','40:49','41:26','42:20','43:27','44:31','45:25','46:24','47:23','48:35'],
'Daniel' : ['1:21','2:49','3:30','4:37','5:31','6:28','7:28','8:27','9:27','10:21','11:45','12:13'],
'Hosea' : ['1:11','2:23','3:5','4:19','5:15','6:11','7:16','8:14','9:17','10:15','11:12','12:14','13:16','14:9'],
'Joel' : ['1:20','2:32','3:21'],
'Amos' : ['1:15','2:16','3:15','4:13','5:27','6:14','7:17','8:14','9:15'],
'Obadiah' : ['1:21'],
'Jonah' : ['1:17','2:10','3:10','4:11'],
'Micah' : ['1:16','2:13','3:12','4:13','5:15','6:16','7:20'],
'Nahum' : ['1:15','2:13','3:19'],
'Habakkuk' : ['1:17','2:20','3:19'],
'Zephaniah' : ['1:18','2:15','3:20'],
'Haggai' : ['1:15','2:23'],
'Zechariah' : ['1:21','2:13','3:10','4:14','5:11','6:15','7:14','8:23','9:17','10:12','11:17','12:14','13:9','14:21'],
'Malachi' : ['1:14','2:17','3:18','4:6'],
'Matthew' : ['1:25','2:23','3:17','4:25','5:48','6:34','7:29','8:34','9:38','10:42','11:30','12:50','13:58','14:36','15:39','16:28','17:27','18:35','19:30','20:34','21:46','22:46','23:39','24:51','25:46','26:75','27:66','28:20'],
'Mark' : ['1:45','2:28','3:35','4:41','5:43','6:56','7:37','8:38','9:50','10:52','11:33','12:44','13:37','14:72','15:47','16:20'],
'Luke' : ['1:80','2:52','3:38','4:44','5:39','6:49','7:50','8:56','9:62','10:42','11:54','12:59','13:35','14:35','15:32','16:31','17:37','18:43','19:48','20:47','21:38','22:71','23:56','24:53'],
'John' : ['1:51','2:25','3:36','4:54','5:47','6:71','7:53','8:59','9:41','10:42','11:57','12:50','13:38','14:31','15:27','16:33','17:26','18:40','19:42','20:31','21:25'],
'Acts' : ['1:26','2:47','3:26','4:37','5:42','6:15','7:60','8:40','9:43','10:48','11:30','12:25','13:52','14:28','15:41','16:40','17:34','18:28','19:41','20:38','21:40','22:30','23:35','24:27','25:27','26:32','27:44','28:31'],
'Romans' : ['1:32','2:29','3:31','4:25','5:21','6:23','7:25','8:39','9:33','10:21','11:36','12:21','13:14','14:23','15:33','16:27'],
'1Corinthians' : ['1:31','2:16','3:23','4:21','5:13','6:20','7:40','8:13','9:27','10:33','11:34','12:31','13:13','14:40','15:58','16:24'],
'2Corinthians' : ['1:24','2:17','3:18','4:18','5:21','6:18','7:16','8:24','9:15','10:18','11:33','12:21','13:14'],
'Galatians' : ['1:24','2:21','3:29','4:31','5:26','6:18'],
'Ephesians' : ['1:23','2:22','3:21','4:32','5:33','6:24'],
'Philippians' : ['1:30','2:30','3:21','4:23'],
'Colossians' : ['1:29','2:23','3:25','4:18'],
'1Thessalonians' : ['1:10','2:20','3:13','4:18','5:28'],
'2Thessalonians' : ['1:12','2:17','3:18'],
'1Timothy' : ['1:20','2:15','3:16','4:16','5:25','6:21'],
'2Timothy' : ['1:18','2:26','3:17','4:22'],
'Titus' : ['1:16','2:15','3:15'],
'Philemon' : ['1:25'],
'Hebrews' : ['1:14','2:18','3:19','4:16','5:14','6:20','7:28','8:13','9:28','10:39','11:40','12:29','13:25'],
'James' : ['1:27','2:26','3:18','4:17','5:20'],
'1Peter' : ['1:25','2:25','3:22','4:19','5:14'],
'2Peter' : ['1:21','2:22','3:18'],
'1John' : ['1:10','2:29','3:24','4:21','5:21'],
'2John' : ['1:13'],
'3John' : ['1:14'],
'Jude' : ['1:25'],
'Revelation' : ['1:20','2:29','3:22','4:11','5:14','6:17','7:17','8:13','9:21','10:11','11:19','12:17','13:18','14:20','15:8','16:21','17:18','18:24','19:21','20:15','21:27','22:21']
}
def splitversetext(snum, vtext):
    """Split the text of a grouped verse range into one line per verse.

    Some translations publish several verses as one block numbered like
    "22-24".  The words of ``vtext`` are divided into equally sized runs,
    one per verse number in the range; the last verse receives whatever
    words remain.  The split is purely by word count, not by meaning.

    Parameters:
        snum:  verse range as text, "<first>-<last>".  Trailing/leading
               letters a-e on either number (e.g. "22a-23b") are ignored.
        vtext: the combined text of the whole range.

    Side effects (module-level names):
        * appends "<book prefix>;<chapter>;<snum>" to ``fzgroupingsreff``
          (reads ``dprefixes``, ``c`` and ``glava``);
        * stores the generated "{chapter:verse} text" lines in ``sline``;
        * adds the number of generated lines to ``numoflines``.

    Raises:
        IndexError: ``snum`` contains no '-'.
        ValueError: a bound is not a number after stripping a-e.
    """
    global sline
    global numoflines

    # Keep a record of every grouped range that had to be split.
    fzgroupingsreff.write(dprefixes[c] + ';' + glava + ';' + snum + os.linesep)

    words = vtext.split()
    bounds = snum.split('-')
    # Stripping the letters first gives the same result as "try int(),
    # then strip on failure" without needing a catch-all except.
    first = int(bounds[0].strip('abcde'))
    last = int(bounds[1].strip('abcde'))

    if last < first:
        # Malformed (reversed) range: dividing the words is impossible
        # (it used to raise ZeroDivisionError or silently drop the text),
        # so keep the whole text under the first verse number.
        sline = '{' + glava + ':' + str(first) + '} ' + ' '.join(words) + os.linesep
        numoflines += 1
        return

    verse_count = last - first + 1
    words_per_verse = len(words) // verse_count

    lines = []
    start = 0
    for verse in range(first, last):
        chunk = words[start:start + words_per_verse]
        lines.append('{' + glava + ':' + str(verse) + '} ' + ' '.join(chunk) + os.linesep)
        start += words_per_verse
    # The last verse of the range takes all remaining words.
    lines.append('{' + glava + ':' + str(last) + '} ' + ' '.join(words[start:]) + os.linesep)

    sline = ''.join(lines)
    numoflines += verse_count
# Run-wide state used by the scraping loop.
ibiblecount = 0            # running count of translations taken from pool
time1 = time.time()        # start of the whole run, for the "Total" timings
now = datetime.datetime.now()
currentdate = now.strftime('%Y%m%d')   # date stamp used in the xml zip file name
writeorappend = 'w'        # open mode for every generated output file
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17",
}
# Main loop: for every translation in pool, fetch every chapter of every book
# in bookslist from biblegateway.com and write
#   * one text file per book with "{chapter:verse} text" lines
#     ("{i<chapter>:<verse>} heading" lines carry section headings),
#   * a zip with the raw passage markup of every chapter,
#   * an integrity log listing chapters that failed or whose line count
#     differs from the count recorded in Bibliaa,
#   * a log of grouped verse ranges that had to be split,
#   * an empty <abbrev>_statusdone marker once the translation is complete.
# NOTE: splitversetext() reads/writes the module-level names c, glava, sline,
# numoflines and fzgroupingsreff, so those names must not be renamed here.
for bib in pool:
    ibiblecount += 1
    bib = str(bib).strip()
    # 'SR-ERV-SR' -> 'SR_ERVSR': the first dash becomes an underscore,
    # any further dashes are dropped.
    if bib.count('-') > 1:
        newabbrev = bib.replace('-', '_', 1)
        newabbrev = newabbrev.replace('-', '')
    else:
        newabbrev = bib.replace('-', '_')
    directoryf = rootdir + newabbrev.split('_')[0] + '/' + newabbrev + '/'
    os.makedirs(directoryf, exist_ok=True)
    # A statusdone marker means this translation was completed by an earlier run.
    if os.path.exists(directoryf + newabbrev + '_statusdone'):
        print('#' + str(ibiblecount) + '/' + str(len(pool)) + ' ' + newabbrev + ' already previously scrapped')
        continue
    directoryfxmls = xmlrootdir + bib.split('-')[0] + '/'
    os.makedirs(directoryfxmls, exist_ok=True)
    filenamexmlzip = directoryfxmls + newabbrev + '_xml_files_' + currentdate + '.zip'
    # In this file translation/chapter integrity failures are recorded.
    sfileintegrityname = directoryf + newabbrev + '_integrityinfos.txt'
    szgroupingsreff = directoryf + newabbrev + '_groupingsreff.txt'
    time2 = time.time()
    errors = 0
    groupings = 0
    # Context managers make sure the zip and both logs are closed even when
    # a chapter raises an unexpected error.
    with zipfile.ZipFile(filenamexmlzip, "w") as zxml, \
            open(sfileintegrityname, writeorappend) as fintegrity, \
            codecs.open(szgroupingsreff, writeorappend, 'utf-8') as fzgroupingsreff:
        fintegrity.write('source: biblegateway.com/versions' + os.linesep)
        for c in bookslist:
            if c.strip() == '':
                continue
            time3 = time.time()
            c = c.strip()
            sfilename = directoryf + dprefixes[c] + '-' + c.replace(' ', '') + '.' + newabbrev + '.txt'
            print('#' + str(ibiblecount) + '/' + str(len(pool)) + ' ' + sfilename.split('/')[-1])
            with codecs.open(sfilename, writeorappend, 'utf-8') as g:
                # To scrape only particular chapters you can intervene in the next
                # line, e.g. "for i in [2, 3, 5]:" scrapes chapters 3, 4 and 6
                # (i is zero-based, range(50) loops 0..49).
                # Be careful: this script is intended for batch processing of whole
                # Bibles; point rootdir at a testing folder before doing partial
                # scrapes so full texts are not overwritten with partial ones.
                for i in range(chaptersdict[c]):
                    url = "http://www.biblegateway.com/passage/?search=" + c.replace(' ', '+') + "+" + str(i + 1) + "&version=" + '-'.join(bib.split('-')[1:])
                    glava = str(i + 1)  # chapter number as text
                    # Guard the fetch so one failing chapter does not stop the run.
                    # Exception (not a bare except) keeps Ctrl-C working.
                    try:
                        req = urllib.request.Request(url, headers=headers)
                        with urllib.request.urlopen(req) as resp:
                            so = str(resp.read().decode('utf-8'))
                        soup = BeautifulSoup(so, 'html.parser')
                        breakchapter = False
                    except Exception:
                        # Treated as "chapter is not present on site": it is
                        # logged and skipped so the scraper can continue.
                        breakchapter = True
                    if breakchapter:
                        fintegrity.write('! ' + sfilename.split('/')[-1] + ' chapter: ' + glava + ', do not exists on site' + os.linesep)
                        errors += 1
                    else:
                        mydivs = soup.findAll("div", {"class": "passage-text"})
                        time.sleep(0.1)
                        if not mydivs:
                            # Page came back without a passage block.
                            fintegrity.write('! ' + sfilename.split('/')[-1] + ' chapter: ' + glava + ', do not exists on site' + os.linesep)
                            errors += 1
                            continue
                        sobs = html.unescape(str(mydivs[0]))
                        # Keep the passage markup as fetched, for the future:
                        # write it to a temporary file, add it to the zip, remove it.
                        sfilenamexml = directoryfxmls + dprefixes[c] + '-' + '%03d' % int(i + 1) + '-' + c.replace(' ', '') + '.' + newabbrev + '.xml'
                        with codecs.open(sfilenamexml, 'w', 'utf-8') as fxml:
                            fxml.write(html.unescape(sobs))
                        zxml.write(sfilenamexml, arcname=sfilenamexml.split('/')[-1])
                        os.remove(sfilenamexml)
                        # Move every verse-number tag out of its text span, then cut
                        # the markup into one piece per verse-number tag.
                        sobs = re.sub('(<sup class="versenum">)(.*?)(</sup>)', r'</span>\1\2\3<span class="text">', sobs, flags=re.UNICODE)
                        sobs = sobs.replace('<sup class="versenum">', 'SplittingForGod<sup class="versenum">')
                        lso = sobs.split('SplittingForGod')
                        # Headings in front of the first verse become an intro line.
                        # (The original test "'</h3>' or '<h3>' in ..." was always true.)
                        if '</h3>' in lso[0] or '<h3>' in lso[0]:
                            isoup = BeautifulSoup(lso[0], 'html.parser')
                            tagsitext = isoup.findAll("h3")
                            siv1 = ''
                            for tag in tagsitext:
                                siv1 += tag.text.strip() + ' '
                            siv1 = siv1.strip()
                            if siv1 != '':
                                g.write('{i' + glava + ':1} ' + siv1 + os.linesep)
                            lso[0] = re.sub('<h3>.*?</h3>', '', lso[0], flags=re.UNICODE)
                        numoflines = 0
                        # The first piece has no verse-number tag of its own: number it 1.
                        isoup = BeautifulSoup(lso[0], 'html.parser')
                        if isoup.find("sup", {"class": "versenum"}) is None:
                            lso[0] = '<sup class="versenum"> 1 </sup>' + lso[0]
                        # Fix broken parts without versenum tag: glue them to the
                        # preceding piece (walking backwards).
                        ind = len(lso) - 1
                        while ind > 0:
                            isoup = BeautifulSoup(lso[ind], 'html.parser')
                            if isoup.find("sup", {"class": "versenum"}) is None:
                                lso[ind - 1] = lso[ind - 1] + lso[ind]
                                del lso[ind]
                            ind -= 1
                        # Fix groups like 22-23; 23-24 in Genesis 27 EN-MSG for result 22-24.
                        lvn = []
                        for item in lso:
                            isoup = BeautifulSoup(item, 'html.parser')
                            lvn.append(isoup.find("sup", {"class": "versenum"}).text.strip())
                        ind = 0
                        while ind < (len(lso) - 1):
                            if '-' in lvn[ind]:
                                gn1 = lvn[ind].split('-')[1]
                                gnleft = lvn[ind].split('-')[0]
                            else:
                                gn1 = lvn[ind]
                                gnleft = lvn[ind]
                            if '-' in lvn[ind + 1]:
                                gn2 = lvn[ind + 1].split('-')[0]
                                gnright = lvn[ind + 1].split('-')[1]
                            else:
                                gn2 = lvn[ind + 1]
                                gnright = lvn[ind + 1]
                            if gn1 == gn2:
                                # This piece ends on the verse the next one starts
                                # with: merge both under one combined range.
                                lso[ind + 1] = lso[ind] + ' ' + lso[ind + 1]
                                del lso[ind]
                                lvn[ind + 1] = gnleft + '-' + gnright
                                del lvn[ind]
                                isoup = BeautifulSoup(lso[ind], 'html.parser')
                                # The merged piece carries two verse-number tags;
                                # drop both and prepend the combined one.
                                isoup.find("sup", {"class": "versenum"}).extract()
                                isoup.find("sup", {"class": "versenum"}).extract()
                                lso[ind] = '<sup class="versenum"> ' + gnleft + '-' + gnright + ' </sup>' + str(isoup)
                                # Stay on this index so the merged piece is compared
                                # with its new neighbour too.
                                ind = ind - 1
                            ind += 1
                        for ind in range(len(lso)):
                            isoup = BeautifulSoup(lso[ind], 'html.parser')
                            if isoup.find("h3") is not None:
                                htags = isoup.findAll("h3")
                                siv1 = ''
                                for tag in htags:
                                    siv1 += tag.text.strip() + ' '
                                siv1 = siv1.strip()
                                FilteredIntro = siv1
                                bheading = True
                                isoup.find("h3").extract()
                            else:
                                bheading = False
                            vtag = isoup.find("sup", {"class": "versenum"})
                            # '200' is the placeholder verse number when no tag is found.
                            FilteredVerseNum = vtag.text if vtag is not None else '200'
                            FilteredVerseNum = FilteredVerseNum.strip()
                            FilteredVerseNum = FilteredVerseNum.replace(' ', '')
                            chaptertag = isoup.find("span", {"class": "chapternum"})
                            if chaptertag is not None:
                                chaptertag.extract()
                            versetag = isoup.find("sup", {"class": "versenum"})
                            if versetag is not None:
                                versetag.extract()
                            lt = []
                            content_tags = isoup.findAll("span", {"class": "text"})
                            for tag in content_tags:
                                lt.append(tag.text)
                            # Whitespace and punctuation clean-up of the verse text.
                            # Patterns containing backslashes are raw strings (same
                            # pattern text, no invalid-escape warnings).
                            FilteredVerseText = ' '.join(lt)
                            FilteredVerseText = FilteredVerseText.strip()
                            FilteredVerseText = re.sub(' +', ' ', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = re.sub(' ([?!.:;,])', r'\1', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = re.sub(r'([?!.:;,])(\w)', r'\1 \2', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = re.sub('([^ ])—', r'\1 —', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = re.sub('—([^ ])', r'— \1', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = re.sub(r'\[ ?\w+ ?\]', '', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = re.sub(r'( \d+,) (\d+[ ,;:.])', r'\1\2', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = re.sub(r'( \d+,) (\d+)$', r'\1\2', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = re.sub('([“ ‘])L ord', r'\1Lord', FilteredVerseText, flags=re.UNICODE)
                            FilteredVerseText = FilteredVerseText.replace('“ ', '“')
                            FilteredVerseText = FilteredVerseText.replace(' ”', '”')
                            FilteredVerseText = FilteredVerseText.replace('‘ ', '‘')
                            FilteredVerseText = FilteredVerseText.replace(' ’', '’')
                            FilteredVerseText = FilteredVerseText.replace(" 's", "'s")
                            if '-' in FilteredVerseNum:
                                # Grouped range: splitversetext() fills sline and
                                # updates numoflines.
                                groupings += 1
                                splitversetext(FilteredVerseNum, FilteredVerseText)
                            else:
                                sline = '{' + glava + ':' + FilteredVerseNum + '} ' + FilteredVerseText + os.linesep
                                numoflines += 1
                            if bheading:
                                if FilteredIntro != '':
                                    # A heading found inside this piece is written as an
                                    # intro line numbered one past this verse (or past
                                    # the end of the range); 200 if neither parses.
                                    try:
                                        sline += '{i' + glava + ':' + str(int(FilteredVerseNum) + 1) + '} ' + FilteredIntro + os.linesep
                                    except ValueError:
                                        try:
                                            sline += '{i' + glava + ':' + str(int(FilteredVerseNum.split('-')[1]) + 1) + '} ' + FilteredIntro + os.linesep
                                        except (ValueError, IndexError):
                                            sline += '{i' + glava + ':' + str(200) + '} ' + FilteredIntro + os.linesep
                            g.write(sline)
                        # Integrity check: compare the produced line count with the
                        # number after the colon in this chapter's Bibliaa entry.
                        trebalinii = int(Bibliaa[c.replace(' ', '')][int(glava) - 1].split(':')[1])
                        if numoflines != trebalinii:
                            fintegrity.write('! ' + sfilename.split('/')[-1] + ' ' + glava + ' ' + str(numoflines) + ' ' + str(trebalinii) + '\n')
                            errors += 1
                    # Short randomized pause between chapter requests.
                    time.sleep(0.1 + random.uniform(0.1, 0.2))
                # Elapsed time: this book / this translation / whole run.
                time4 = time.time()
                h1hours = int((time4 - time3) / 3600)
                h1min = int((time4 - time3 - h1hours * 3600) / 60)
                h1sec = time4 - time3 - h1hours * 3600 - h1min * 60
                h2hours = int((time4 - time2) / 3600)
                h2min = int((time4 - time2 - h2hours * 3600) / 60)
                h2sec = time4 - time2 - h2hours * 3600 - h2min * 60
                h3hours = int((time4 - time1) / 3600)
                h3min = int((time4 - time1 - h3hours * 3600) / 60)
                h3sec = time4 - time1 - h3hours * 3600 - h3min * 60
                print("#%s/%s %s %s %dh:%dm:%ds / %dh:%dm:%ds / %dh:%dm:%ds" % (str(ibiblecount), str(len(pool)), newabbrev, c.replace(' ', ''), h1hours, h1min, h1sec, h2hours, h2min, h2sec, h3hours, h3min, h3sec))
        print("#%s/%s %s done. %dh:%dm:%ds / Total: %dh:%dm:%ds" % (str(ibiblecount), str(len(pool)), newabbrev, h2hours, h2min, h2sec, h3hours, h3min, h3sec))
        fintegrity.write('Total number of errors for ' + directoryf + newabbrev + ' : ' + str(errors) + '\n')
    # Empty marker file: this translation is complete and is skipped next run.
    with open(directoryf + newabbrev + '_statusdone', 'w'):
        pass
print('Done!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment