Skip to content

Instantly share code, notes, and snippets.

@bbriggs
Created December 21, 2015 18:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bbriggs/984886e654073651347b to your computer and use it in GitHub Desktop.
Save bbriggs/984886e654073651347b to your computer and use it in GitHub Desktop.
Converts Unbound Bible format files into plain text with no line breaks. Useful for preprocessing Bible text for NLP tasks.
#!/usr/bin/env python
#Convert Unbound Bible Format into plain text, all one line.
import os
import os.path
import sys
import re
from tempfile import mkstemp
from shutil import move
def unbound2plain(infile):
    """Rewrite an Unbound Bible file in place as plain text on one line.

    Lines that start with ``#`` are dropped. For every other line, the
    leading run of digits, whitespace and ``O`` characters that ends in a
    tab is removed, and the trailing line break (CRLF or bare LF) is
    replaced by a single space. A final line without a line break is
    written unchanged.

    Args:
        infile: Path of the file to convert. It is overwritten with the
            converted text.

    Returns:
        None.
    """
    # NOTE(review): the character class only lists the letter ``O`` next to
    # digits and whitespace; a prefix containing any other letter is left
    # in place. Confirm against the input format before widening it.
    prefix = re.compile(r'^[\d\sO]+\t')
    # Text mode may already have translated CRLF to a bare newline, so the
    # carriage return has to be optional or nothing would ever match.
    line_break = re.compile(r'\r?\n$')

    # Build the replacement next to the target so the final move stays on
    # one filesystem and the original is never deleted before its
    # replacement is complete.
    target_dir = os.path.dirname(os.path.abspath(infile))
    fd, abs_path = mkstemp(dir=target_dir)
    try:
        # Wrap the descriptor mkstemp returned instead of reopening by path.
        with os.fdopen(fd, 'w') as tmp_file:
            with open(infile) as source:
                for line in source:
                    if line.startswith('#'):
                        continue
                    line = prefix.sub('', line)
                    tmp_file.write(line_break.sub(' ', line))
        # Replaces the original file with the new one.
        move(abs_path, infile)
    finally:
        # After a successful move the temp path is gone; on failure this
        # removes the partial output and leaves the original untouched.
        if os.path.exists(abs_path):
            os.remove(abs_path)
def main(infile):
    """Entry point: convert the file at ``infile`` via ``unbound2plain``."""
    unbound2plain(infile)
if __name__ == '__main__':
    # Fail with a usage message instead of a bare IndexError when the
    # input path is missing; extra arguments are still ignored.
    if len(sys.argv) < 2:
        sys.exit('usage: %s INFILE' % sys.argv[0])
    infile = sys.argv[1]
    main(infile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment