Skip to content

Instantly share code, notes, and snippets.

@rasoolims
Last active March 22, 2017 23:23
Show Gist options
  • Save rasoolims/992523c7aa191fe69b5a1a4a47fa9248 to your computer and use it in GitHub Desktop.
Save rasoolims/992523c7aa191fe69b5a1a4a47fa9248 to your computer and use it in GitHub Desktop.
Convert Bible xml files from https://github.com/christos-c/bible-corpus/ to aligned files for all pairs of languages (src.target)
import re,os,sys,codecs,traceback
from xml.dom import minidom
'''
The input folder is the folder with Bible xml files from https://github.com/christos-c/bible-corpus/
The output folder contains aligned files for all pairs of languages (src.target)
It merges English and English-Web files.
'''
# Command line: <input_folder> <output_folder>.
# A missing argument is a usage error, so report it and exit with a non-zero
# status so that calling scripts can detect the failure.
if len(sys.argv) < 3:
    print('input_folder output_folder')
    sys.exit(1)

# Both folders are kept as absolute paths with a trailing '/' because the rest
# of the script builds file paths by plain string concatenation.
inp_folder = os.path.abspath(sys.argv[1]) + '/'
output_folder = os.path.abspath(sys.argv[2]) + '/'

# Create the output folder up front: otherwise a missing folder is only
# discovered after every input file has been parsed, when the first output
# file is opened.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)
def _segment_text(segment):
    """Return the cleaned text of a <seg> element, or None if it has none.

    Only the first child node is read. Newlines and tabs are replaced by
    spaces so that each segment fits on one output line, and surrounding
    whitespace is stripped.
    """
    node = segment.firstChild
    # An empty <seg/> has no child; a non-text first child has nodeValue None.
    if node is None or node.nodeValue is None:
        return None
    return node.nodeValue.replace('\n', ' ').replace('\t', ' ').strip()


# Maps input file name -> {segment id -> segment text}.
bible_dict = dict()
langs = set()
for f in os.listdir(inp_folder):
    print(f)
    try:
        xmldoc = minidom.parse(inp_folder + f)
        sentences = xmldoc.getElementsByTagName('seg')
        # Registered only after a successful parse, so files that are not
        # well-formed XML never appear in bible_dict.
        bible_dict[f] = dict()
        for sentence in sentences:
            sen_id = sentence.attributes['id'].value
            text = _segment_text(sentence)
            if text is None:
                continue  # segment without text: nothing to align
            bible_dict[f][sen_id] = text
        print(len(bible_dict[f]))
    except Exception:
        # Best effort: report the offending file and carry on with the rest.
        # `Exception` (not a bare except) so Ctrl-C and sys.exit still work.
        print(f)
        traceback.print_exc(file=sys.stdout)
def _write_aligned(first, second, first_out, second_out):
    """Write the segments present in both mappings, one per line.

    `first` and `second` map segment id -> text. For every id found in both,
    the text from `first` goes to `first_out` and the text from `second` goes
    to `second_out`, so line N of one file corresponds to line N of the other.
    Ids are visited in sorted order to make the output reproducible.
    """
    for seg_id in sorted(set(first) & set(second)):
        first_out.write(first[seg_id] + '\n')
        second_out.write(second[seg_id] + '\n')


print('saving bibles')
# English-WEB is never paired on its own; it is only appended to the pairs
# that involve English. It may be missing (not in the input folder, or it
# failed to parse), in which case the merge step is skipped.
web_bible = bible_dict.get('English-WEB.xml')
for f1 in bible_dict:
    if f1 == 'English-WEB.xml':
        continue
    for f2 in bible_dict:
        # f1 >= f2 skips self-pairs and the mirrored duplicate of each pair.
        if f1 >= f2 or f2 == 'English-WEB.xml':
            continue
        # Language name = file name up to the first '.'; partition() keeps
        # the whole name when there is no dot at all.
        l1 = f1.partition('.')[0]
        l2 = f2.partition('.')[0]
        print('%s %s' % (l1, l2))
        # File "<l2>.<l1>" receives the l1-side text and "<l1>.<l2>" the
        # l2-side text. `with` closes both even if a write fails.
        with codecs.open(output_folder + l2 + '.' + l1, 'w', encoding='utf-8') as w1, \
                codecs.open(output_folder + l1 + '.' + l2, 'w', encoding='utf-8') as w2:
            _write_aligned(bible_dict[f1], bible_dict[f2], w1, w2)
            # Merging with English-Web
            if web_bible is not None:
                if l1 == 'English':
                    _write_aligned(web_bible, bible_dict[f2], w1, w2)
                if l2 == 'English':
                    _write_aligned(bible_dict[f1], web_bible, w1, w2)
print('done!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment