# JGVerdugo/extract-tei.py

## extract-tei.py
"""Extract a TEI bilingual vocabulary to a term-tab-term plain-text file.

Usage: extract-tei.py filename

Every ``entry`` element of the input is searched for one ``orth`` element
(the source term) and any number of ``quote`` elements (the target terms).
Each distinct (source, target) pair is written, sorted, as one
``source<TAB>target`` line to a UTF-8 file named ``dump`` in the current
working directory.

Note: this script uses the BeautifulSoup library for TEI parsing.
See http://www.crummy.com/software/BeautifulSoup/bs4 for details.
"""

import codecs
import sys

from bs4 import BeautifulSoup

# Name of the output file, created in the current working directory.
OUTPUT_FILENAME = "dump"


def read_document(filename):
    """Return the raw content of *filename* as bytes.

    The file is opened in binary mode so that BeautifulSoup performs the
    encoding detection itself, and inside a ``with`` block so the handle is
    always closed.

    Raises IOError if the file cannot be opened or read.
    """
    with open(filename, "rb") as tei_file:
        return tei_file.read()


def collect_term_pairs(entries):
    """Return a list of (source text, target text) tuples for *entries*.

    *entries* is an iterable of parsed ``entry`` elements. An entry with
    more than one ``orth`` element aborts the program with a message, as a
    source term is expected to be unique per entry. An entry without any
    ``orth`` element is skipped (with a warning on stderr if it carries
    ``quote`` elements), because there is no source term to pair with.
    """
    pairs = []

    for entry in entries:
        sources = entry.find_all('orth')
        targets = entry.find_all('quote')

        # Check if more than 1 source (there are none, but just in case)
        if len(sources) > 1:
            print(u"%s <<<<<<< This entry has more than 1 source!!"
                  % sources[0].text)
            sys.exit(1)

        # Without a source term, sources[0] would raise IndexError.
        if not sources:
            if targets:
                sys.stderr.write(
                    "Warning: skipping an entry that has targets "
                    "but no source term\n")
            continue

        # Match each target with its parent source
        source_text = sources[0].text
        pairs.extend((source_text, target.text) for target in targets)

    return pairs


def write_pairs(pairs, path):
    """Write the distinct *pairs*, sorted, to *path* as UTF-8 text.

    Each pair becomes one line of the form ``source<TAB>target``.
    """
    with codecs.open(path, "w", "UTF-8") as dump_file:
        # Do not process duplicates (there are many!)
        for source_text, target_text in sorted(set(pairs)):
            dump_file.write(u"%s\t%s\n" % (source_text, target_text))


def main(argv):
    """Run the extraction for the command line *argv*.

    Returns the process exit status: 0 on success, 1 when the file name
    argument is missing or the input file cannot be read.
    """
    if len(argv) < 2:
        print("\n    Usage: extract-tei.py filename\n")
        return 1

    # Basic error control
    try:
        document = read_document(argv[1])
    except IOError:
        print("\nSorry, I can't find that file. Exiting...")
        return 1

    # Parse the file, get a list of entries. The parser is named explicitly
    # so the result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(document, "html.parser")
    pairs_of_terms = collect_term_pairs(soup.find_all('entry'))

    # Dump into a text file
    write_pairs(pairs_of_terms, OUTPUT_FILENAME)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))