Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save josifoski/d2d3e787d758168a162c25957e7ba678 to your computer and use it in GitHub Desktop.
Save josifoski/d2d3e787d758168a162c25957e7ba678 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# script for scraping Bible texts from bible.com
# creator: Aleksandar Josifoski for Troy Lyndon troylyndon@gmail.com property of http://rdgames.us
# 2016-03-23
# http://bible.com/versions
# for server-based scraping, recommended with heart: ♡ pythonanywhere ♡
## INPUT #####################################################################################################################
# First, take a look at bc_keys.py and pick the keys of the translations to be scraped.
# Second, list in pool the Bible translations to be scraped; put only the number, in quotes, like pool = [ "86", "110"]
# for example ENG_KJV has key "1"
pool = [
]
# if you want to scrape only one book, or only some of them, reduce bookslist
bookslist = [ "gen","exo","lev","num","deu","jos","jdg","rut","1sa","2sa","1ki","2ki","1ch","2ch","ezr","neh","est","job","psa","pro","ecc","sng","isa","jer","lam","ezk","dan","hos","jol","amo","oba","jon","mic","nam","hab","zep","hag","zec","mal","mat","mrk","luk","jhn","act","rom","1co","2co","gal","eph","php","col","1th","2th","1ti","2ti","tit","phm","heb","jas","1pe","2pe","1jn","2jn","3jn","jud","rev"]
#bookslist = [ "1pe","2pe","1jn","2jn","3jn","jud" ]
# Note: the last character of each directory must be /; start with / for an absolute path.
# If you omit the leading /, the path is relative and the directories are created under the current working directory.
# Important!!! check rootdir and xmlrootdir in the next lines
#rootdir = '/data/rdgames/testing/' # again, last character must be / same as above
#xmlrootdir = '/data/rdgames/testing/xml/'
rootdir = '/home/josifoski/bibles/bc/' # server location; comment out either this pair or the pair above
xmlrootdir = '/home/josifoski/bibles/bcxml/' # for server-based scraping put the correct username here
# To re-scrape a Bible, delete the generated *_statusdone file in the directory where that Bible is saved.
##############################################################################################################################
from bc_keys import numtobibleabbrev
from bs4 import BeautifulSoup
import re
import urllib.request
import sys
import os
import time
import datetime
import random
import html
import codecs
import zipfile
# Maps the three-letter book code (as listed in bookslist and used in the chapter URL)
# to the book name that serves as key in dprefixes, booksdict and Bibliaa and that
# appears in the output file names.
dabbrevbook = { "gen":"Genesis", "exo":"Exodus","lev":"Leviticus","num":"Numbers","deu":"Deuteronomy","jos":"Joshua","jdg":"Judges","rut":"Ruth",
"1sa":"1Samuel","2sa":"2Samuel","1ki":"1Kings","2ki":"2Kings","1ch":"1Chronicles","2ch":"2Chronicles","ezr":"Ezra","neh":"Nehemiah",
"est":"Esther","job":"Job","psa":"Psalms","pro":"Proverbs","ecc":"Ecclesiastes","sng":"SongofSolomon","isa":"Isaiah","jer":"Jeremiah",
"lam":"Lamentations","ezk":"Ezekiel","dan":"Daniel","hos":"Hosea","jol":"Joel","amo":"Amos","oba":"Obadiah","jon":"Jonah","mic":"Micah",
"nam":"Nahum","hab":"Habakkuk","zep":"Zephaniah","hag":"Haggai","zec":"Zechariah","mal":"Malachi","mat":"Matthew","mrk":"Mark","luk":"Luke",
"jhn":"John","act":"Acts","rom":"Romans","1co":"1Corinthians","2co":"2Corinthians","gal":"Galatians","eph":"Ephesians","php":"Philippians",
"col":"Colossians","1th":"1Thessalonians","2th":"2Thessalonians","1ti":"1Timothy","2ti":"2Timothy","tit":"Titus","phm":"Philemon",
"heb":"Hebrews","jas":"James","1pe":"1Peter","2pe":"2Peter","1jn":"1John","2jn":"2John","3jn":"3John","jud":"Jude","rev":"Revelation" }
# Maps a book name to its two-digit ordinal ("01".."66"). The prefix is put at the
# start of every per-book output file name and of every line in the groupings log.
dprefixes = {
"Genesis":"01", "Exodus":"02", "Leviticus":"03", "Numbers":"04", "Deuteronomy":"05", "Joshua":"06", "Judges":"07", "Ruth":"08", "1Samuel":"09",
"2Samuel":"10", "1Kings":"11", "2Kings":"12", "1Chronicles":"13", "2Chronicles":"14", "Ezra":"15", "Nehemiah":"16", "Esther":"17", "Job":"18",
"Psalms":"19", "Proverbs":"20", "Ecclesiastes":"21", "SongofSolomon":"22", "Isaiah":"23", "Jeremiah":"24", "Lamentations":"25", "Ezekiel":"26",
"Daniel":"27", "Hosea":"28", "Joel":"29", "Amos":"30", "Obadiah":"31", "Jonah":"32", "Micah":"33", "Nahum":"34", "Habakkuk":"35", "Zephaniah":"36",
"Haggai":"37", "Zechariah":"38", "Malachi":"39", "Matthew":"40", "Mark":"41", "Luke":"42", "John":"43", "Acts":"44", "Romans":"45",
"1Corinthians":"46", "2Corinthians":"47", "Galatians":"48", "Ephesians":"49", "Philippians":"50", "Colossians":"51", "1Thessalonians":"52",
"2Thessalonians":"53", "1Timothy":"54", "2Timothy":"55", "Titus":"56", "Philemon":"57", "Hebrews":"58", "James":"59", "1Peter":"60",
"2Peter":"61", "1John":"62", "2John":"63", "3John":"64", "Jude":"65", "Revelation":"66" }
# Maps a book name to its number of chapters; the scraper requests chapters
# 1..N of each book, where N is the value stored here.
booksdict = { "Genesis" : 50, "Exodus" : 40, "Leviticus" : 27, "Numbers" : 36, "Deuteronomy" : 34, "Joshua" : 24, "Judges" : 21, "Ruth" : 4,
"1Samuel" : 31, "2Samuel" : 24, "1Kings" : 22, "2Kings" : 25, "1Chronicles" : 29, "2Chronicles" : 36, "Ezra" : 10, "Nehemiah" : 13, "Esther" : 10,
"Job" : 42, "Psalms" : 150, "Proverbs" : 31, "Ecclesiastes" : 12, "SongofSolomon" : 8, "Isaiah" : 66, "Jeremiah" : 52, "Lamentations" : 5,
"Ezekiel" : 48, "Daniel" : 12, "Hosea" : 14, "Joel" : 3, "Amos" : 9, "Obadiah" : 1, "Jonah" : 4, "Micah" : 7, "Nahum" : 3, "Habakkuk" : 3,
"Zephaniah" : 3, "Haggai" : 2, "Zechariah" : 14, "Malachi" : 4, "Matthew" : 28, "Mark" : 16, "Luke" : 24, "John" : 21, "Acts" : 28, "Romans" : 16,
"1Corinthians" : 16, "2Corinthians" : 13, "Galatians" : 6, "Ephesians" : 6, "Philippians" : 4, "Colossians" : 4, "1Thessalonians" : 5,
"2Thessalonians" : 3, "1Timothy" : 6, "2Timothy" : 4, "Titus" : 3, "Philemon" : 1, "Hebrews" : 13, "James" : 5, "1Peter" : 5, "2Peter" : 3,
"1John" : 5, "2John" : 1, "3John" : 1, "Jude" : 1, "Revelation" : 22 }
# Reference table for the integrity check: for every book, one 'chapter:count' string
# per chapter (list index = chapter - 1). The number after ':' is the count of verse
# lines the scraper expects to have written for that chapter; a mismatch is logged
# to the *_integrityinfos.txt file.
Bibliaa = {
'Genesis' : ['1:31','2:25','3:24','4:26','5:32','6:22','7:24','8:22','9:29','10:32','11:32','12:20','13:18','14:24','15:21','16:16','17:27','18:33','19:38','20:18','21:34','22:24','23:20','24:67','25:34','26:35','27:46','28:22','29:35','30:43','31:55','32:32','33:20','34:31','35:29','36:43','37:36','38:30','39:23','40:23','41:57','42:38','43:34','44:34','45:28','46:34','47:31','48:22','49:33','50:26'],
'Exodus' : ['1:22','2:25','3:22','4:31','5:23','6:30','7:25','8:32','9:35','10:29','11:10','12:51','13:22','14:31','15:27','16:36','17:16','18:27','19:25','20:26','21:36','22:31','23:33','24:18','25:40','26:37','27:21','28:43','29:46','30:38','31:18','32:35','33:23','34:35','35:35','36:38','37:29','38:31','39:43','40:38'],
'Leviticus' : ['1:17','2:16','3:17','4:35','5:19','6:30','7:38','8:36','9:24','10:20','11:47','12:8','13:59','14:57','15:33','16:34','17:16','18:30','19:37','20:27','21:24','22:33','23:44','24:23','25:55','26:46','27:34'],
'Numbers' : ['1:54','2:34','3:51','4:49','5:31','6:27','7:89','8:26','9:23','10:36','11:35','12:16','13:33','14:45','15:41','16:50','17:13','18:32','19:22','20:29','21:35','22:41','23:30','24:25','25:18','26:65','27:23','28:31','29:40','30:16','31:54','32:42','33:56','34:29','35:34','36:13'],
'Deuteronomy' : ['1:46','2:37','3:29','4:49','5:33','6:25','7:26','8:20','9:29','10:22','11:32','12:32','13:18','14:29','15:23','16:22','17:20','18:22','19:21','20:20','21:23','22:30','23:25','24:22','25:19','26:19','27:26','28:68','29:29','30:20','31:30','32:52','33:29','34:12'],
'Joshua' : ['1:18','2:24','3:17','4:24','5:15','6:27','7:26','8:35','9:27','10:43','11:23','12:24','13:33','14:15','15:63','16:10','17:18','18:28','19:51','20:9','21:45','22:34','23:16','24:33'],
'Judges' : ['1:36','2:23','3:31','4:24','5:31','6:40','7:25','8:35','9:57','10:18','11:40','12:15','13:25','14:20','15:20','16:31','17:13','18:31','19:30','20:48','21:25'],
'Ruth' : ['1:22','2:23','3:18','4:22'],
'1Samuel' : ['1:28','2:36','3:21','4:22','5:12','6:21','7:17','8:22','9:27','10:27','11:15','12:25','13:23','14:52','15:35','16:23','17:58','18:30','19:24','20:42','21:15','22:23','23:29','24:22','25:44','26:25','27:12','28:25','29:11','30:31','31:13'],
'2Samuel' : ['1:27','2:32','3:39','4:12','5:25','6:23','7:29','8:18','9:13','10:19','11:27','12:31','13:39','14:33','15:37','16:23','17:29','18:33','19:43','20:26','21:22','22:51','23:39','24:25'],
'1Kings' : ['1:53','2:46','3:28','4:34','5:18','6:38','7:51','8:66','9:28','10:29','11:43','12:33','13:34','14:31','15:34','16:34','17:24','18:46','19:21','20:43','21:29','22:53'],
'2Kings' : ['1:18','2:25','3:27','4:44','5:27','6:33','7:20','8:29','9:37','10:36','11:21','12:21','13:25','14:29','15:38','16:20','17:41','18:37','19:37','20:21','21:26','22:20','23:37','24:20','25:30'],
'1Chronicles' : ['1:54','2:55','3:24','4:43','5:26','6:81','7:40','8:40','9:44','10:14','11:47','12:40','13:14','14:17','15:29','16:43','17:27','18:17','19:19','20:8','21:30','22:19','23:32','24:31','25:31','26:32','27:34','28:21','29:30'],
'2Chronicles' : ['1:17','2:18','3:17','4:22','5:14','6:42','7:22','8:18','9:31','10:19','11:23','12:16','13:22','14:15','15:19','16:14','17:19','18:34','19:11','20:37','21:20','22:12','23:21','24:27','25:28','26:23','27:9','28:27','29:36','30:27','31:21','32:33','33:25','34:33','35:27','36:23'],
'Ezra' : ['1:11','2:70','3:13','4:24','5:17','6:22','7:28','8:36','9:15','10:44'],
'Nehemiah' : ['1:11','2:20','3:32','4:23','5:19','6:19','7:73','8:18','9:38','10:39','11:36','12:47','13:31'],
'Esther' : ['1:22','2:23','3:15','4:17','5:14','6:14','7:10','8:17','9:32','10:3'],
'Job' : ['1:22','2:13','3:26','4:21','5:27','6:30','7:21','8:22','9:35','10:22','11:20','12:25','13:28','14:22','15:35','16:22','17:16','18:21','19:29','20:29','21:34','22:30','23:17','24:25','25:6','26:14','27:23','28:28','29:25','30:31','31:40','32:22','33:33','34:37','35:16','36:33','37:24','38:41','39:30','40:24','41:34','42:17'],
'Psalms' : ['1:6','2:12','3:8','4:8','5:12','6:10','7:17','8:9','9:20','10:18','11:7','12:8','13:6','14:7','15:5','16:11','17:15','18:50','19:14','20:9','21:13','22:31','23:6','24:10','25:22','26:12','27:14','28:9','29:11','30:12','31:24','32:11','33:22','34:22','35:28','36:12','37:40','38:22','39:13','40:17','41:13','42:11','43:5','44:26','45:17','46:11','47:9','48:14','49:20','50:23','51:19','52:9','53:6','54:7','55:23','56:13','57:11','58:11','59:17','60:12','61:8','62:12','63:11','64:10','65:13','66:20','67:7','68:35','69:36','70:5','71:24','72:20','73:28','74:23','75:10','76:12','77:20','78:72','79:13','80:19','81:16','82:8','83:18','84:12','85:13','86:17','87:7','88:18','89:52','90:17','91:16','92:15','93:5','94:23','95:11','96:13','97:12','98:9','99:9','100:5','101:8','102:28','103:22','104:35','105:45','106:48','107:43','108:13','109:31','110:7','111:10','112:10','113:9','114:8','115:18','116:19','117:2','118:29','119:176','120:7','121:8','122:9','123:4','124:8','125:5','126:6','127:5','128:6','129:8','130:8','131:3','132:18','133:3','134:3','135:21','136:26','137:9','138:8','139:24','140:13','141:10','142:7','143:12','144:15','145:21','146:10','147:20','148:14','149:9','150:6'],
'Proverbs' : ['1:33','2:22','3:35','4:27','5:23','6:35','7:27','8:36','9:18','10:32','11:31','12:28','13:25','14:35','15:33','16:33','17:28','18:24','19:29','20:30','21:31','22:29','23:35','24:34','25:28','26:28','27:27','28:28','29:27','30:33','31:31'],
'Ecclesiastes' : ['1:18','2:26','3:22','4:16','5:20','6:12','7:29','8:17','9:18','10:20','11:10','12:14'],
'SongofSolomon' : ['1:17','2:17','3:11','4:16','5:16','6:13','7:13','8:14'],
'Isaiah' : ['1:31','2:22','3:26','4:6','5:30','6:13','7:25','8:22','9:21','10:34','11:16','12:6','13:22','14:32','15:9','16:14','17:14','18:7','19:25','20:6','21:17','22:25','23:18','24:23','25:12','26:21','27:13','28:29','29:24','30:33','31:9','32:20','33:24','34:17','35:10','36:22','37:38','38:22','39:8','40:31','41:29','42:25','43:28','44:28','45:25','46:13','47:15','48:22','49:26','50:11','51:23','52:15','53:12','54:17','55:13','56:12','57:21','58:14','59:21','60:22','61:11','62:12','63:19','64:12','65:25','66:24'],
'Jeremiah' : ['1:19','2:37','3:25','4:31','5:31','6:30','7:34','8:22','9:26','10:25','11:23','12:17','13:27','14:22','15:21','16:21','17:27','18:23','19:15','20:18','21:14','22:30','23:40','24:10','25:38','26:24','27:22','28:17','29:32','30:24','31:40','32:44','33:26','34:22','35:19','36:32','37:21','38:28','39:18','40:16','41:18','42:22','43:13','44:30','45:5','46:28','47:7','48:47','49:39','50:46','51:64','52:34'],
'Lamentations' : ['1:22','2:22','3:66','4:22','5:22'],
'Ezekiel' : ['1:28','2:10','3:27','4:17','5:17','6:14','7:27','8:18','9:11','10:22','11:25','12:28','13:23','14:23','15:8','16:63','17:24','18:32','19:14','20:49','21:32','22:31','23:49','24:27','25:17','26:21','27:36','28:26','29:21','30:26','31:18','32:32','33:33','34:31','35:15','36:38','37:28','38:23','39:29','40:49','41:26','42:20','43:27','44:31','45:25','46:24','47:23','48:35'],
'Daniel' : ['1:21','2:49','3:30','4:37','5:31','6:28','7:28','8:27','9:27','10:21','11:45','12:13'],
'Hosea' : ['1:11','2:23','3:5','4:19','5:15','6:11','7:16','8:14','9:17','10:15','11:12','12:14','13:16','14:9'],
'Joel' : ['1:20','2:32','3:21'],
'Amos' : ['1:15','2:16','3:15','4:13','5:27','6:14','7:17','8:14','9:15'],
'Obadiah' : ['1:21'],
'Jonah' : ['1:17','2:10','3:10','4:11'],
'Micah' : ['1:16','2:13','3:12','4:13','5:15','6:16','7:20'],
'Nahum' : ['1:15','2:13','3:19'],
'Habakkuk' : ['1:17','2:20','3:19'],
'Zephaniah' : ['1:18','2:15','3:20'],
'Haggai' : ['1:15','2:23'],
'Zechariah' : ['1:21','2:13','3:10','4:14','5:11','6:15','7:14','8:23','9:17','10:12','11:17','12:14','13:9','14:21'],
'Malachi' : ['1:14','2:17','3:18','4:6'],
'Matthew' : ['1:25','2:23','3:17','4:25','5:48','6:34','7:29','8:34','9:38','10:42','11:30','12:50','13:58','14:36','15:39','16:28','17:27','18:35','19:30','20:34','21:46','22:46','23:39','24:51','25:46','26:75','27:66','28:20'],
'Mark' : ['1:45','2:28','3:35','4:41','5:43','6:56','7:37','8:38','9:50','10:52','11:33','12:44','13:37','14:72','15:47','16:20'],
'Luke' : ['1:80','2:52','3:38','4:44','5:39','6:49','7:50','8:56','9:62','10:42','11:54','12:59','13:35','14:35','15:32','16:31','17:37','18:43','19:48','20:47','21:38','22:71','23:56','24:53'],
'John' : ['1:51','2:25','3:36','4:54','5:47','6:71','7:53','8:59','9:41','10:42','11:57','12:50','13:38','14:31','15:27','16:33','17:26','18:40','19:42','20:31','21:25'],
'Acts' : ['1:26','2:47','3:26','4:37','5:42','6:15','7:60','8:40','9:43','10:48','11:30','12:25','13:52','14:28','15:41','16:40','17:34','18:28','19:41','20:38','21:40','22:30','23:35','24:27','25:27','26:32','27:44','28:31'],
'Romans' : ['1:32','2:29','3:31','4:25','5:21','6:23','7:25','8:39','9:33','10:21','11:36','12:21','13:14','14:23','15:33','16:27'],
'1Corinthians' : ['1:31','2:16','3:23','4:21','5:13','6:20','7:40','8:13','9:27','10:33','11:34','12:31','13:13','14:40','15:58','16:24'],
'2Corinthians' : ['1:24','2:17','3:18','4:18','5:21','6:18','7:16','8:24','9:15','10:18','11:33','12:21','13:14'],
'Galatians' : ['1:24','2:21','3:29','4:31','5:26','6:18'],
'Ephesians' : ['1:23','2:22','3:21','4:32','5:33','6:24'],
'Philippians' : ['1:30','2:30','3:21','4:23'],
'Colossians' : ['1:29','2:23','3:25','4:18'],
'1Thessalonians' : ['1:10','2:20','3:13','4:18','5:28'],
'2Thessalonians' : ['1:12','2:17','3:18'],
'1Timothy' : ['1:20','2:15','3:16','4:16','5:25','6:21'],
'2Timothy' : ['1:18','2:26','3:17','4:22'],
'Titus' : ['1:16','2:15','3:15'],
'Philemon' : ['1:25'],
'Hebrews' : ['1:14','2:18','3:19','4:16','5:14','6:20','7:28','8:13','9:28','10:39','11:40','12:29','13:25'],
'James' : ['1:27','2:26','3:18','4:17','5:20'],
'1Peter' : ['1:25','2:25','3:22','4:19','5:14'],
'2Peter' : ['1:21','2:22','3:18'],
'1John' : ['1:10','2:29','3:24','4:21','5:21'],
'2John' : ['1:13'],
'3John' : ['1:14'],
'Jude' : ['1:25'],
'Revelation' : ['1:20','2:29','3:22','4:11','5:14','6:17','7:17','8:13','9:21','10:11','11:19','12:17','13:18','14:20','15:8','16:21','17:18','18:24','19:21','20:15','21:27','22:21']
}
def splitversetext(snum, vtext):
    """Split the text of a grouped verse (e.g. "22-24") across its verse numbers.

    The words of ``vtext`` are dealt out in equal-sized runs, one run per verse
    in the range; the last verse additionally receives any leftover words.

    The function communicates through module globals rather than a return value:

    * reads ``c`` (current book code), ``glava`` (current chapter, as a string),
      ``dabbrevbook`` and ``dprefixes``;
    * appends one ``<book prefix>;<chapter>;<range>`` record to the open file
      ``fzgroupingsreff``;
    * replaces ``sline`` with the generated ``{chapter:verse} text`` lines;
    * increases ``numoflines`` by the number of lines generated.

    Args:
        snum: Verse range such as ``"22-24"``.
        vtext: Combined text of the grouped verses.

    If ``snum`` cannot be parsed as ``<int>-<int>``, or the range is reversed,
    the whole text is emitted as a single line under the sentinel verse number
    200 so that nothing is lost and the problem is visible in the output.
    """
    global numoflines
    global sline

    # Record which range was split so the result can be reviewed later.
    fzgroupingsreff.write(dprefixes[dabbrevbook[c]] + ';' + glava + ';' + snum + os.linesep)

    words = vtext.split()
    bounds = snum.split('-')
    try:
        first = int(bounds[0])
        last = int(bounds[1])
    except (ValueError, IndexError):
        # Not a "<int>-<int>" range: fall back to the sentinel verse number.
        first = last = 200
    if last < first:
        # A reversed range used to raise ZeroDivisionError ("5-4") or silently
        # drop the text ("5-3"); treat it like any other malformed range.
        first = last = 200

    span = last - first + 1
    per_verse = len(words) // span  # words given to every verse but the last

    pieces = []
    start = 0
    for verse in range(first, last):
        chunk = ' '.join(words[start:start + per_verse])
        pieces.append('{' + glava + ':' + str(verse) + '} ' + chunk + os.linesep)
        start += per_verse
    # The last verse of the range takes all remaining words.
    pieces.append('{' + glava + ':' + str(last) + '} ' + ' '.join(words[start:]) + os.linesep)

    sline = ''.join(pieces)
    numoflines += span
# ---------------------------------------------------------------------------
# Scraping driver.
#
# For every translation key in pool and every book in bookslist, each chapter
# page is downloaded, the "chapter" div is archived into a per-translation zip,
# and the verses are written as "{chapter:verse} text" lines into one text file
# per book. Problems are logged to <abbrev>_integrityinfos.txt; an empty
# <abbrev>_statusdone file marks a finished translation.
#
# NOTE: this code has to stay at module level. splitversetext() reads and
# writes the module globals c, glava, sline, numoflines and fzgroupingsreff.
# NOTE(review): the nesting below was reconstructed from the statement order;
# confirm in particular that the decrement in the else-branch of the
# "merge unlabeled fragments" loop belongs inside that branch.
# ---------------------------------------------------------------------------

_LABEL = {"class": "label"}      # span holding the verse number
_HEADING = {"class": "heading"}  # span holding a section heading
_CONTENT = {"class": "content"}  # span holding verse text


def _hms(elapsed):
    """Split a duration in seconds into (whole hours, whole minutes, seconds)."""
    hours = int(elapsed / 3600)
    minutes = int((elapsed - hours * 3600) / 60)
    seconds = elapsed - hours * 3600 - minutes * 60
    return hours, minutes, seconds


def _fetch_chapter_soup(url, request_headers):
    """Download url and return it parsed, or None when that fails for any reason."""
    try:
        req = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(req) as resp:
            page = resp.read().decode('utf-8')
        return BeautifulSoup(page, 'html.parser')
    except Exception:
        # Deliberately best-effort: a chapter that cannot be fetched or decoded
        # is reported as missing by the caller and the run continues.
        # KeyboardInterrupt / SystemExit are not swallowed.
        return None


def _headings_text(fragment):
    """Return the text of all heading spans in fragment, joined by single spaces."""
    return ' '.join(tag.text.strip() for tag in fragment.find_all("span", _HEADING)).strip()


def _clean_verse_text(text):
    """Normalize spacing around punctuation, dashes and quotes in verse text."""
    # re.UNICODE is the default for str patterns in Python 3, so no flags are passed.
    text = text.strip()
    text = re.sub(r' +', ' ', text)
    text = re.sub(r' ([?!.:;,])', r'\1', text)
    text = re.sub(r'([?!.:;,])(\w)', r'\1 \2', text)
    text = re.sub(r'([^ ])—', r'\1 —', text)
    text = re.sub(r'—([^ ])', r'— \1', text)
    text = re.sub(r'\[ ?\w+ ?\]', '', text)
    text = re.sub(r'( \d+,) (\d+[ ,;:.])', r'\1\2', text)
    text = re.sub(r'( \d+,) (\d+)$', r'\1\2', text)
    text = re.sub(r'([“ ‘])L ord', r'\1Lord', text)
    text = text.replace('“ ', '“')
    text = text.replace(' ”', '”')
    text = text.replace('‘ ', '‘')
    text = text.replace(' ’', '’')
    text = text.replace(" 's", "'s")
    return text


def _intro_verse_number(versenum):
    """Return, as a string, the verse number a heading following versenum is filed under."""
    try:
        return str(int(versenum) + 1)
    except ValueError:
        pass
    try:
        # Grouped verse such as "22-24": the heading goes after the last one.
        return str(int(versenum.split('-')[1]) + 1)
    except (ValueError, IndexError):
        return str(200)  # sentinel for an unusable verse number


now = datetime.datetime.now()
currentdate = str(now).split()[0].replace('-', '')  # date stamp used in the zip file name
writeorappend = 'w'
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
time1 = time.time()  # start of the whole run
ibiblecount = 0

# parsing Bible(s)
for bib in pool:
    ibiblecount += 1
    bib = str(bib).strip()
    abbrev = numtobibleabbrev[bib]

    directoryf = rootdir + abbrev.split('_')[0] + '/' + abbrev + '/'
    os.makedirs(directoryf, exist_ok=True)
    if os.path.exists(directoryf + abbrev + '_statusdone'):
        print(directoryf + abbrev + ' already previously scrapped')
        continue

    directoryfxmls = xmlrootdir + abbrev.split('_')[0] + '/'
    os.makedirs(directoryfxmls, exist_ok=True)
    filenamexmlzip = directoryfxmls + abbrev + '_xml_files_' + currentdate + '.zip'

    time2 = time.time()  # start of this translation
    errors = 0
    groupings = 0
    # Log of chapters whose text failed the integrity check.
    sfileintegrityname = directoryf + abbrev + '_integrityinfos.txt'
    # Log of references of grouped verses that were split.
    szgroupingsreff = directoryf + abbrev + '_groupingsreff.txt'

    with zipfile.ZipFile(filenamexmlzip, "w") as zxml, \
            open(sfileintegrityname, writeorappend) as fintegrity, \
            open(szgroupingsreff, writeorappend) as fzgroupingsreff:
        fintegrity.write('source: bible.com/versions' + os.linesep)

        for c in bookslist:
            time3 = time.time()  # start of this book
            c = c.strip()
            bookname = dabbrevbook[c]
            sfilename = directoryf + dprefixes[bookname] + '-' + bookname + '.' + abbrev + '.txt'
            print('#' + str(ibiblecount) + '/' + str(len(pool)) + ' ' + bib + ' ' + sfilename.split('/')[-1])

            with codecs.open(sfilename, writeorappend, 'utf-8') as g:
                # now we are going parsing chapters
                for i in range(booksdict[bookname]):
                    sys.stdout.flush()
                    glava = str(i + 1)
                    url = "http://bible.com/bible/" + bib.strip() + '/' + c.strip() + '.' + str(i + 1)
                    soup = _fetch_chapter_soup(url, headers)

                    if soup is None:
                        # Chapter is not available on the site; skip it so the
                        # scraper can continue with the next one.
                        fintegrity.write('! ' + sfilename.split('/')[-1] + ' chapter: ' + glava + ', do not exists on site' + os.linesep)
                        errors += 1
                    else:
                        # narrow information to where the Bible text is
                        mydivs = soup.find_all("div", {"class": "chapter"})
                        time.sleep(0.1)
                        if not mydivs:
                            fintegrity.write('! ' + sfilename.split('/')[-1] + ' chapter: ' + glava + ', do not exists on site' + os.linesep)
                            errors += 1
                            continue
                        so = html.unescape(str(mydivs[0]))

                        # Keep the original markup of the text part for the
                        # future: write it to a temporary file, add it to the
                        # zip, then remove the temporary file.
                        sfilenamexml = directoryfxmls + dprefixes[bookname] + '-' + '%03d' % int(i + 1) + '-' + bookname + '.' + abbrev + '.xml'
                        with codecs.open(sfilenamexml, 'w', 'utf-8') as fxml:
                            fxml.write(html.unescape(so))
                        zxml.write(sfilenamexml, arcname=sfilenamexml.split('/')[-1])
                        os.remove(sfilenamexml)

                        # Drop "#" footnote labels, then cut the chapter in
                        # front of every remaining verse label.
                        so = re.sub(r'<span class="label">\s*#\s*</span>', '', so)
                        so = so.replace('<span class="label">', 'SplittingForGod<span class="label">')
                        lso = so.split('SplittingForGod')

                        # in case there is a heading before verse 1
                        if '<span class="heading">' in lso[0]:
                            siv1 = _headings_text(BeautifulSoup(lso[0], 'html.parser'))
                            g.write('{i' + glava + ':1} ' + siv1 + os.linesep)
                        lso = lso[1:]
                        numoflines = 0

                        # merge fragments that carry no label span into the
                        # fragment before them
                        ind = len(lso) - 1
                        while ind > 0:
                            isoup = BeautifulSoup(lso[ind], 'html.parser')
                            if isoup.find("span", _LABEL) is not None:
                                ind -= 1
                            else:
                                lso[ind - 1] = lso[ind - 1] + lso[ind]
                                del lso[ind]
                                ind -= 1

                        # fix groups like 22-23; 23-24 in Genesis 27 EN_MSG for result 22-24
                        lvn = [BeautifulSoup(item, 'html.parser').find("span", _LABEL).text.strip() for item in lso]
                        ind = 0
                        if len(lvn) > 1:
                            while ind < (len(lso) - 1):
                                if '-' in lvn[ind]:
                                    gnleft, gn1 = lvn[ind].split('-')[0], lvn[ind].split('-')[1]
                                else:
                                    gnleft = gn1 = lvn[ind]
                                if '-' in lvn[ind + 1]:
                                    gn2, gnright = lvn[ind + 1].split('-')[0], lvn[ind + 1].split('-')[1]
                                else:
                                    gn2 = gnright = lvn[ind + 1]
                                if gn1 == gn2:
                                    # Overlapping neighbours: fuse both into one
                                    # fragment labelled with the combined range.
                                    lso[ind + 1] = lso[ind] + ' ' + lso[ind + 1]
                                    del lso[ind]
                                    lvn[ind + 1] = gnleft + '-' + gnright
                                    del lvn[ind]
                                    isoup = BeautifulSoup(lso[ind], 'html.parser')
                                    # the fused fragment holds both old labels
                                    for _ in range(2):
                                        isoup.find("span", _LABEL).extract()
                                    lso[ind] = '<span class="label">' + gnleft + '-' + gnright + '</span>' + str(isoup)
                                    # re-examine the fused fragment against its new neighbour
                                    ind = ind - 1
                                ind += 1

                        # main: one output line per verse fragment
                        for ind, fragment in enumerate(lso):
                            isoup = BeautifulSoup(fragment, 'html.parser')
                            vtag = isoup.find("span", _LABEL)
                            FilteredVerseNum = vtag.text.strip(' \n[]-')
                            # keep only digits and the range dash
                            FilteredVerseNum = ''.join(ch for ch in FilteredVerseNum if ch.isdigit() or ch == '-')

                            content_tags = isoup.find_all("span", _CONTENT)
                            FilteredVerseText = _clean_verse_text(' '.join(tag.text for tag in content_tags))
                            if ('Footnotes for' in FilteredVerseText) and ind == (len(lso) - 1):
                                FilteredVerseText = re.sub(r'Footnotes for.*', '', FilteredVerseText)

                            if '-' in FilteredVerseNum:
                                groupings += 1
                                # sets the globals sline and numoflines
                                splitversetext(FilteredVerseNum, FilteredVerseText)
                            else:
                                sline = '{' + glava + ':' + FilteredVerseNum + '} ' + FilteredVerseText + os.linesep
                                numoflines += 1

                            # a heading inside this fragment introduces the next verse
                            if isoup.find("span", _HEADING) is not None:
                                FilteredIntro = _headings_text(isoup)
                                sline += '{i' + glava + ':' + _intro_verse_number(FilteredVerseNum) + '} ' + FilteredIntro + os.linesep
                            g.write(sline)

                        # integrity check: expected number of verse lines for this chapter
                        trebalinii = int(Bibliaa[bookname][int(glava) - 1].split(':')[1])
                        if numoflines != trebalinii:
                            fintegrity.write('! ' + sfilename.split('/')[-1] + ' ' + glava + ' ' + str(numoflines) + ' ' + str(trebalinii) + os.linesep)
                            errors += 1

                    # small randomized pause between chapter requests
                    time.sleep(0.1 + random.uniform(0.01, 0.15))

            # elapsed time: this book / this translation / whole run
            time4 = time.time()
            print("#%s/%s (%dh:%dm:%ds / %dh:%dm:%ds / #%s %dh:%dm:%ds)" % (
                (str(ibiblecount), str(len(pool)))
                + _hms(time4 - time3)
                + _hms(time4 - time2)
                + (str(ibiblecount),)
                + _hms(time4 - time1)))

        # Computed here rather than reused from the book loop so that an empty
        # bookslist does not end in a NameError.
        timedone = time.time()
        print("#%s/%s Done %s %s %dh:%dm:%ds / %dh:%dm:%ds)" % (
            (str(ibiblecount), str(len(pool)), bib, abbrev)
            + _hms(timedone - time2)
            + _hms(timedone - time1)))
        print('********')
        fintegrity.write('Total number of errors per chapter for ' + directoryf + abbrev + ' : ' + str(errors) + os.linesep)

    # Written only after everything above succeeded; delete it to re-scrape.
    with open(directoryf + abbrev + '_statusdone', 'w'):
        pass

print('Done!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment