# stevecassidy/xml2text.py

## xml2text.py
"""
Author: Steve Cassidy (Steve.Cassidy@mq.edu.au)

Script to convert XML export from Trove into single files.

The XML export from Trove consists of a single XML file with many
<article> elements, one per article.   Since an export file can be very
large this makes processing the data hard.  This script breaks the
large file into many small files that could then be fed to
later processes.  Each file is named for the article id number,
given the extension '.html', and written to the directory 'output'.

Usage:

python3 xml2text.py <trove xml file>

"""


import sys
from xml.sax.handler import ContentHandler
from xml.sax import parse
import os


class ArticleHandler(ContentHandler):
    """SAX handler that saves the text of each <articleText> element to a file.

    The 'id' attribute of the most recently opened <article> element names
    the output file: the collected text is written to 'output/<id>.html',
    relative to the current working directory.
    """

    def __init__(self):
        """Create a handler with an empty text buffer and no current article."""
        # Let the base class initialise its own state.
        super().__init__()

        self.text = ""
        # Set when an <article> start tag is seen; None until then.
        self.current_id = None

    def startElement(self, name, attrs):
        """Remember the id of a new article, or begin collecting article text.

        Raises:
            KeyError: if an <article> element has no 'id' attribute.
        """
        if name == 'article':
            self.current_id = attrs['id']

        if name == "articleText":
            # Drop any character data gathered since the previous article
            # text, so the buffer holds only this element's content.
            self.text = ""

    def characters(self, content):
        """Append a chunk of character data to the text buffer."""
        self.text += content

    def endElement(self, name):
        """Write the buffered text to 'output/<id>.html' when </articleText> closes.

        Raises:
            ValueError: if no enclosing <article> element supplied an id.
        """
        if name != 'articleText':
            return

        if self.current_id is None:
            raise ValueError("<articleText> found outside of an <article> element")

        # Create the target directory on demand so a fresh checkout works
        # without a manual mkdir; a no-op when it already exists.
        os.makedirs('output', exist_ok=True)

        # Write UTF-8 explicitly: the platform default encoding may be unable
        # to represent every character that occurs in the article text.
        path = os.path.join('output', self.current_id + ".html")
        with open(path, 'w', encoding='utf-8') as out:
            out.write(self.text)


if __name__ == '__main__':

    # Exit with the usage line instead of an IndexError when the input
    # file argument is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: python3 xml2text.py <trove xml file>")

    # Open in binary mode so the XML parser decodes the document itself
    # rather than relying on the platform's default text encoding.
    with open(sys.argv[1], 'rb') as fd:
        handler = ArticleHandler()
        parse(fd, handler)
	"""
	Author: Steve Cassidy (Steve.Cassidy@mq.edu.au)

	Script to convert XML export from Trove into single files.

	The XML export from Trove consists of a single XML file with many
	<article> elements, one per article. Since an export file can be very
	large this makes processing the data hard. This script breaks the
	large file into many small files that could then be fed to
	later processes. Each file is named for the article id number
	and written to the directory 'output'.

	Usage:

	python3 xml2text.py <trove xml file>

	"""


	import sys
	from xml.sax.handler import ContentHandler
	from xml.sax import parse
	import os


	class ArticleHandler(ContentHandler):
		"""SAX handler that saves the text of each <articleText> element to a file.

		The 'id' attribute of the most recently opened <article> element names
		the output file: the collected text is written to 'output/<id>.html',
		relative to the current working directory.
		"""

		def __init__(self):
			"""Create a handler with an empty text buffer and no current article."""
			# Let the base class initialise its own state.
			super().__init__()

			self.text = ""
			# Set when an <article> start tag is seen; None until then.
			self.current_id = None

		def startElement(self, name, attrs):
			"""Remember the id of a new article, or begin collecting article text.

			Raises:
				KeyError: if an <article> element has no 'id' attribute.
			"""
			if name == 'article':
				self.current_id = attrs['id']

			if name == "articleText":
				# Drop any character data gathered since the previous article
				# text, so the buffer holds only this element's content.
				self.text = ""

		def characters(self, content):
			"""Append a chunk of character data to the text buffer."""
			self.text += content

		def endElement(self, name):
			"""Write the buffered text to 'output/<id>.html' when </articleText> closes.

			Raises:
				ValueError: if no enclosing <article> element supplied an id.
			"""
			if name != 'articleText':
				return

			if self.current_id is None:
				raise ValueError("<articleText> found outside of an <article> element")

			# Create the target directory on demand so a fresh checkout works
			# without a manual mkdir; a no-op when it already exists.
			os.makedirs('output', exist_ok=True)

			# Write UTF-8 explicitly: the platform default encoding may be unable
			# to represent every character that occurs in the article text.
			path = os.path.join('output', self.current_id + ".html")
			with open(path, 'w', encoding='utf-8') as out:
				out.write(self.text)


	if __name__ == '__main__':

		# Exit with the usage line instead of an IndexError when the input
		# file argument is missing.
		if len(sys.argv) < 2:
			sys.exit("Usage: python3 xml2text.py <trove xml file>")

		# Open in binary mode so the XML parser decodes the document itself
		# rather than relying on the platform's default text encoding.
		with open(sys.argv[1], 'rb') as fd:
			handler = ArticleHandler()
			parse(fd, handler)