<cfcomponent
	output="false"
	hint="I parse an XML file a little bit at a time using a file input stream so that the whole file doesn't need to be read in and parsed at one time.">


	<!--- Set up instance variables. --->
	<!---
		Private instance data for this component. Every key starts out
		as an empty string:

		Listener - the event listener that parse events are announced to.
		Buffer   - data that has been pulled in from the input stream
		           but has not yet been processed.
		Pattern  - the pattern used to parse the data that comes from
		           the input stream.
	--->
	<cfset VARIABLES.Instance = StructNew() />
	<cfset VARIABLES.Instance.Listener = "" />
	<cfset VARIABLES.Instance.Buffer = "" />
	<cfset VARIABLES.Instance.Pattern = "" />


	<!---
		Create a regular expression pattern to handle the different
		node types. We build it with CFSaveContent so that we can write
		an easier to read, verbose (commented) pattern.

		The pattern exposes exactly six capturing groups, in this order:

			1 - ignored markup (processing instruction / XML
			    declaration, comment, DOCTYPE)
			2 - CDATA section
			3 - text node
			4 - self-closing tag
			5 - open tag
			6 - close tag

		The whole alternation is wrapped in a non-capturing group that
		is anchored to the start of the buffer. Without that wrapper the
		anchor would only apply to the first alternative and the other
		alternatives could match in the middle of the buffer (for
		example, text inside a CDATA section that has only been
		partially read).
	--->
	<cfsavecontent variable="VARIABLES.Instance.Pattern"
		>(?xi)

		## Make sure that we start off with a regular expression
		## that is both verbose as well as case-insensitive.

		## Start at the beginning of the string (this will be the
		## beginning of our data buffer). The non-capturing group
		## makes the anchor apply to every alternative below.

			^
			(?:

		## Ignored by our event parser.
		## Processing instruction / encoding declaration, XML
		## comment, or DOCTYPE (with optional internal subset).

			(
				\s*
				(?:
					<\?[\w\W]+?\?>
					|
					<!\-\-[\w\W]*?\-\->
					|
					<!DOCTYPE
						[^>\[]*
						(?:\[[\w\W]*?\]\s*)?
					>
				)
				\s*
			)|

		## CDATA character match.

			(
				\s*
				<!\[CDATA\[
					[\w\W]*?
				\]\]>
				\s*
			)|

		## Text node match. The look-ahead makes sure that the text
		## is complete (the next tag has already been read in).

			([^<]+(?=<))|

		## Self-closing tag match. Attribute values may use either
		## double or single quotes.

			(
				<[\w:.\-]+
					(?:\s+[\w:.\-]+\s*=\s*(?:"[^"]*"|'[^']*'))*
					\s*
				/>
			)|

		## Open tag match.

			(
				<[\w:.\-]+
					(?:\s+[\w:.\-]+\s*=\s*(?:"[^"]*"|'[^']*'))*
					\s*
				>
			)|

		## Close tag match (white space is allowed before the
		## closing bracket).

			(</[\w:.\-]+\s*>)

			)

	</cfsavecontent>


	<!---
		Now that we have the pattern text, compile it. From here on,
		VARIABLES.Instance.Pattern holds a java.util.regex.Pattern
		object rather than a string.
	--->
	<cfset VARIABLES.Instance.Pattern = CreateObject(
		"java",
		"java.util.regex.Pattern"
		).Compile(
			JavaCast(
				"string",
				Trim( VARIABLES.Instance.Pattern )
				)
			)
		/>



	<!---
		Constructor. Stores the given listener in the instance data and
		hands back this component so the call can be chained onto
		CreateObject().
	--->
	<cffunction name="Init" access="public" returntype="any" output="false"
		hint="I initialize and return this object.">

		<!--- The object that will receive the parse events. --->
		<cfargument name="Listener" type="any" required="true"
			hint="I am the XML parse listener."
			/>

		<!--- Remember the listener for later event announcements. --->
		<cfset VARIABLES.Instance[ "Listener" ] = ARGUMENTS.Listener />

		<!--- Hand back this component. --->
		<cfreturn THIS />
	</cffunction>


	<!---
		Announces a parse event by invoking the method named EventName
		on the stored listener, passing EventData as the first
		(positional) argument.

		If no listener object has been stored yet (the instance value is
		still its initial empty string), the event is silently dropped
		rather than letting CFInvoke fail on a non-component value.
	--->
	<cffunction
		name="AnnounceEvent"
		access="private"
		returntype="void"
		output="true"
		hint="I take an event and announce it to the listener if it is set.">

		<!--- Define arguments. --->
		<cfargument
			name="EventName"
			type="string"
			required="true"
			hint="I am the name of the event to announce."
			/>

		<cfargument
			name="EventData"
			type="any"
			required="false"
			default="#StructNew()#"
			hint="I am the event data that we are passing to the event listener."
			/>

		<!---
			Only announce the event if a listener object has actually
			been set; otherwise there is nothing to invoke.
		--->
		<cfif IsObject( VARIABLES.Instance.Listener )>

			<!--- Invoke event listener in the listener object. --->
			<cfinvoke
				component="#VARIABLES.Instance.Listener#"
				method="#ARGUMENTS.EventName#">

				<!---
					The argument is named "1" so that it is passed by
					position; the listener is free to name it anything.
				--->
				<cfinvokeargument
					name="1"
					value="#ARGUMENTS.EventData#"
					/>

			</cfinvoke>

		</cfif>

		<!--- Return out. --->
		<cfreturn />
	</cffunction>


	<!---
		Returns the next complete node found at the FRONT of the
		internal buffer, reading more data from the input stream
		whenever the buffer does not yet start with a complete node.

		Returns a struct with two keys:
			Type - "Ignored", "Text", "SelfClose", "Open" or "Close"
			       (CDATA sections are reported as "Text").
			Node - the raw matched string.

		Returns VOID when the input stream is exhausted and no further
		node can be matched.

		Notes:
		- The match is made with LookingAt(), so it always begins at
		  the first character of the buffer. That is what makes it
		  safe to trim the matched text off the front of the buffer.
		- Reading is done in a loop rather than by recursion, so a very
		  large node (many reads) cannot exhaust the call stack.
		- Bytes are decoded one chunk at a time. A multi-byte character
		  that straddles two chunks will not decode correctly; this
		  limitation is unchanged from the previous implementation.
	--->
	<cffunction
		name="GetNextNode"
		access="private"
		returntype="any"
		output="false"
		hint="I get the next whole element from the input stream. That might be a open tag, close tag, or node text.">

		<!--- Define arguments. --->
		<cfargument
			name="InputStream"
			type="any"
			required="true"
			hint="The buffered input stream for our XML file."
			/>

		<!--- Define the local scope. --->
		<cfset var LOCAL = {} />

		<!---
			The node type reported for each capturing group of the
			pattern, in group order (1 through 6).
		--->
		<cfset LOCAL.GroupTypes = ListToArray(
			"Ignored,Text,Text,SelfClose,Open,Close"
			) />

		<!---
			Keep going until we either match a node (return it) or run
			out of input (return void).
		--->
		<cfloop condition="true">

			<!--- Create a pattern matcher on the internal buffer. --->
			<cfset LOCAL.Matcher = VARIABLES.Instance.Pattern.Matcher(
				JavaCast( "string", VARIABLES.Instance.Buffer )
				) />

			<!---
				Check to see if the buffer STARTS with a complete node.
				LookingAt() anchors the match at the first character.
			--->
			<cfif LOCAL.Matcher.LookingAt()>

				<!---
					We found a node pattern. Only one of the capturing
					groups should be NOT NULL; find it. A NULL coming
					back from Java leaves the key undefined, which is
					what StructKeyExists() is checking for.
				--->
				<cfloop
					index="LOCAL.GroupIndex"
					from="1"
					to="#ArrayLen( LOCAL.GroupTypes )#"
					step="1">

					<!--- Clear any value left from the previous group. --->
					<cfset StructDelete( LOCAL, "GroupValue" ) />

					<cfset LOCAL.GroupValue = LOCAL.Matcher.Group(
						JavaCast( "int", LOCAL.GroupIndex )
						) />

					<cfif StructKeyExists( LOCAL, "GroupValue" )>

						<cfset LOCAL.Node = StructNew() />
						<cfset LOCAL.Node.Type = LOCAL.GroupTypes[ LOCAL.GroupIndex ] />
						<cfset LOCAL.Node.Node = LOCAL.GroupValue />

						<cfbreak />

					</cfif>

				</cfloop>

				<!---
					Before we return the node, trim it off the front of
					the internal buffer so we don't keep matching on the
					same node pattern.
				--->
				<cfset LOCAL.Remaining = (
					Len( VARIABLES.Instance.Buffer ) -
					Len( LOCAL.Node.Node )
					) />

				<cfif (LOCAL.Remaining GT 0)>

					<cfset VARIABLES.Instance.Buffer = Right(
						VARIABLES.Instance.Buffer,
						LOCAL.Remaining
						) />

				<cfelse>

					<!--- Right() cannot take zero; just reset. --->
					<cfset VARIABLES.Instance.Buffer = "" />

				</cfif>

				<!--- Return the node. --->
				<cfreturn LOCAL.Node />

			</cfif>


			<!---
				We did not find a node pattern. It is possible that we
				need to get more data out of the input stream first.

				Create a byte array to read into. We build it from a
				5-character string repeated 1,024 times, which gives a
				chunk of about five KILObytes.
			--->
			<cfset LOCAL.Chunk = RepeatString( "12345", 1024 )
				.GetBytes()
				/>

			<!--- Read input stream into the chunk. --->
			<cfset LOCAL.BytesRead = ARGUMENTS.InputStream.Read(
				LOCAL.Chunk,
				JavaCast( "int", 0 ),
				JavaCast( "int", ArrayLen( LOCAL.Chunk ) )
				) />

			<!---
				If we have run out of data to read then we cannot
				possibly match any more node patterns; return VOID to
				signal that we are done.
			--->
			<cfif (LOCAL.BytesRead EQ -1)>

				<cfreturn />

			</cfif>

			<!---
				Append ONLY the bytes that were actually read. We copy
				them into their own byte array before converting to a
				string so that the filler bytes left in the rest of the
				chunk can never leak into the buffer (a byte count is
				not a character count once multi-byte characters are
				involved).
			--->
			<cfif (LOCAL.BytesRead GT 0)>

				<cfset LOCAL.ReadBytes = CreateObject(
					"java",
					"java.io.ByteArrayOutputStream"
					).Init() />

				<cfset LOCAL.ReadBytes.Write(
					LOCAL.Chunk,
					JavaCast( "int", 0 ),
					JavaCast( "int", LOCAL.BytesRead )
					) />

				<cfset VARIABLES.Instance.Buffer &= ToString(
					LOCAL.ReadBytes.ToByteArray()
					) />

			</cfif>

			<!--- Loop around and try to match on the larger buffer. --->

		</cfloop>
	</cffunction>


	<!---
		Parses the XML file at FilePath a node at a time and announces
		these events to the listener:

			StartDocument - once, before any node is read.
			StartNode     - for every open tag and self-closing tag.
			Text          - for every text node and CDATA section.
			EndNode       - for every close tag and self-closing tag.
			EndDocument   - once, after the last node.

		Nodes of type "Ignored" produce no event.

		The internal buffer is cleared before parsing starts so that
		data left over from a previous (possibly failed) parse cannot
		leak into this one. If anything goes wrong while parsing - for
		example the listener throws - the input stream is closed before
		the error is re-thrown, so the file is not left locked.
	--->
	<cffunction
		name="Parse"
		access="public"
		returntype="void"
		output="true"
		hint="I parse the given XML file and announce events as I parse it.">

		<!--- Define arguments. --->
		<cfargument
			name="FilePath"
			type="string"
			required="true"
			hint="I am the expanded path to the given xml file."
			/>

		<!--- Define the local scope. --->
		<cfset var LOCAL = {} />

		<!--- Start with a clean buffer for this file. --->
		<cfset VARIABLES.Instance.Buffer = "" />

		<!---
			Create a file input stream to read in the XML file
			a bit at a time. We are going to create a buffered
			input stream for efficiency.
		--->
		<cfset LOCAL.InputStream = CreateObject(
			"java",
			"java.io.BufferedInputStream"
			).Init(
				CreateObject(
					"java",
					"java.io.FileInputStream"
					).Init(
						JavaCast( "string", ARGUMENTS.FilePath )
						)
				)
			/>

		<cftry>

			<!--- Announce the start document event. --->
			<cfset AnnounceEvent( "StartDocument" ) />

			<!---
				Keep looping over the next reads until we get to a
				point where we no longer have data (GetNextNode()
				returns void).
			--->
			<cfloop condition="true">

				<!---
					Read the next node. Clear the previous one first so
					that a void return is always seen as "no node".
				--->
				<cfset StructDelete( LOCAL, "Node" ) />
				<cfset LOCAL.Node = GetNextNode( LOCAL.InputStream ) />

				<!---
					GetNextNode() returned void which means that it has
					run out of data to read. Break out of our
					conditional loop.
				--->
				<cfif NOT StructKeyExists( LOCAL, "Node" )>

					<cfbreak />

				</cfif>

				<!--- Check to see if we are ignoring this node. --->
				<cfif (LOCAL.Node.Type NEQ "Ignored")>

					<!--- Parse the node into an actual structure. --->
					<cfset LOCAL.ParsedNode = ParseNode(
						LOCAL.Node.Type,
						LOCAL.Node.Node
						) />

					<!---
						Check to see which type of node was returned.
						We will use this node type (which is a pseudo
						node type) to announce an event.
					--->
					<cfswitch expression="#LOCAL.Node.Type#">
						<cfcase value="SelfClose">

							<!--- A self-closing tag both starts and ends. --->
							<cfset AnnounceEvent( "StartNode", LOCAL.ParsedNode ) />
							<cfset AnnounceEvent( "EndNode", LOCAL.ParsedNode ) />

						</cfcase>
						<cfcase value="Open">

							<cfset AnnounceEvent( "StartNode", LOCAL.ParsedNode ) />

						</cfcase>
						<cfcase value="Text">

							<cfset AnnounceEvent( "Text", LOCAL.ParsedNode ) />

						</cfcase>
						<cfcase value="Close">

							<cfset AnnounceEvent( "EndNode", LOCAL.ParsedNode ) />

						</cfcase>
					</cfswitch>

				</cfif>

			</cfloop>

			<!--- Announce the end document event. --->
			<cfset AnnounceEvent( "EndDocument" ) />

			<!---
				Something failed part way through. Release the file and
				drop any partially processed data, then let the original
				error continue on to the caller.
			--->
			<cfcatch type="any">

				<cfset LOCAL.InputStream.Close() />
				<cfset VARIABLES.Instance.Buffer = "" />

				<cfrethrow />

			</cfcatch>
		</cftry>

		<!---
			Close the input stream. This will release the
			file from the buffered reader and will free it
			from locking.
		--->
		<cfset LOCAL.InputStream.Close() />

		<!--- Return out. --->
		<cfreturn />
	</cffunction>


	<!---
		Turns the raw string for a single node into a ColdFusion XML
		node by wrapping it in a tiny throw-away document and parsing
		that with CFXML.

		NodeType   - "SelfClose", "Open", "Text" or "Close".
		NodeString - the raw markup / text for the node.

		Returns the parsed element node (for tag types) or a text node
		whose value is the node's text (for "Text").

		Open and close tags are rewritten as self-closing tags first,
		because on their own they are not well-formed XML. Any namespace
		prefixes used by the node are bound to a dummy URI on the
		wrapper root so that the parser does not reject them as unbound.
	--->
	<cffunction
		name="ParseNode"
		access="private"
		returntype="any"
		output="false"
		hint="I take an XML node string and parse it into an XML node.">

		<!--- Define arguments. --->
		<cfargument
			name="NodeType"
			type="string"
			required="true"
			hint="I am the type of node that was found."
			/>

		<cfargument
			name="NodeString"
			type="string"
			required="true"
			hint="I am a string representation of part of an XML node."
			/>

		<!--- Define the local scope. --->
		<cfset var LOCAL = {} />

		<!---
			When we parse this node, we might have to kludge
			the node text a bit in order to get it parsed by
			CFXML. This includes turning open and close node types
			into self-closing tags. SelfClose and Text nodes are
			left as they are.
		--->
		<cfswitch expression="#ARGUMENTS.NodeType#">
			<cfcase value="Open">

				<!--- Turn this node into a self-closing node. --->
				<cfset ARGUMENTS.NodeString = REReplace(
					ARGUMENTS.NodeString,
					">$",
					"/>",
					"one"
					) />

			</cfcase>
			<cfcase value="Close">

				<!--- Turn into self-closing node. --->
				<cfset ARGUMENTS.NodeString = REReplace(
					ARGUMENTS.NodeString,
					"^</([^>]+)>$",
					"<\1/>",
					"one"
					) />

			</cfcase>
		</cfswitch>


		<!---
			We have to be careful for nodes that have namespaces.
			If we try to parse a node that uses a namespace and
			that namespace is not bound, we will get an error.
			Therefore, we have to check to see if our node uses
			any namespaces - either on the tag name itself or on
			any of its attributes.
		--->
		<cfset LOCAL.NameSpaces = REMatch(
			"^<[\w\d\-]+:[\w\d\-]+|[\w\d\-]+:[\w\d\-]+\s*=",
			ARGUMENTS.NodeString
			) />

		<!---
			Set a default namespace string. This is the string
			that will be added to the root node of our future
			CFXML document.
		--->
		<cfset LOCAL.NameSpaceString = "" />

		<!---
			Keep track of the prefixes that we have already bound. The
			same prefix is often used more than once in a node (on the
			tag and on its attributes) and declaring it twice on the
			root would make the document invalid. ListFind() is case
			sensitive, which matches how XML treats prefixes.
		--->
		<cfset LOCAL.BoundPrefixes = "" />

		<!--- Loop over any namespace matches. --->
		<cfloop
			index="LOCAL.NameSpace"
			array="#LOCAL.NameSpaces#">

			<!--- Clean the match down to just the prefix. --->
			<cfset LOCAL.Prefix = ListFirst(
				Replace( LOCAL.NameSpace, "<", "", "one" ),
				":"
				) />

			<!---
				Only bind the prefix if:
				- it is not the xmlns prefix itself (that is a
				  declaration, not a use);
				- it is not the reserved "xml" prefix, which may not
				  be re-bound to another URI;
				- it does not start with a digit or a hyphen (such a
				  match came from inside an attribute value and is
				  not a legal prefix);
				- we have not bound it already.
			--->
			<cfif (
				(LOCAL.Prefix NEQ "xmlns") AND
				(Compare( LOCAL.Prefix, "xml" ) NEQ 0) AND
				(NOT REFind( "^[0-9\-]", LOCAL.Prefix )) AND
				(NOT ListFind( LOCAL.BoundPrefixes, LOCAL.Prefix ))
				)>

				<!--- Add this namespace to our string. --->
				<cfset LOCAL.NameSpaceString &= (
					" xmlns:" &
					LOCAL.Prefix &
					"=""http://www.domain.ext"""
					) />

				<cfset LOCAL.BoundPrefixes = ListAppend(
					LOCAL.BoundPrefixes,
					LOCAL.Prefix
					) />

			</cfif>

		</cfloop>


		<!---
			At this point, we have converted our node string
			into an XML node that could be properly parsed as
			part of a new document. We have also extracted any
			namespaces that need to be bound in this document.
			Create a simple XML document with the above nested
			element. We are using a place holder data2 to help
			work around some CDATA limitations in CF XML Parsing.
		--->
		<cfxml variable="LOCAL.Xml">
			<cfoutput>
				<root #LOCAL.NameSpaceString#>
					<data>#ARGUMENTS.NodeString#</data>
					<data2>place-holder</data2>
				</root>
			</cfoutput>
		</cfxml>


		<!---
			Get the child node. When we do this, we need to
			treat element nodes differently than text nodes.
		--->
		<cfif (ARGUMENTS.NodeType EQ "Text")>

			<!--- Get node text from dummy node. --->
			<cfset LOCAL.ChildNodes = XmlSearch(
				LOCAL.Xml,
				"/root/data2/text()"
				) />

			<!---
				Store actual value of text node into our
				dummy node value. This will allow us to get
				the actual text even though CDATA text is not
				handled so well in ColdFusion.
			--->
			<cfset LOCAL.ChildNodes[ 1 ].XmlValue = LOCAL.Xml.root.data.XmlText />

		<cfelse>

			<!---
				Get the first element inside the data wrapper. (No
				trailing slash - a path that ends in "/" is not a
				valid XPath expression.)
			--->
			<cfset LOCAL.ChildNodes = XmlSearch(
				LOCAL.Xml,
				"/root/data/*[ 1 ]"
				) />

		</cfif>


		<!--- Return the parsed node. --->
		<cfreturn LOCAL.ChildNodes[ 1 ] />
	</cffunction>

</cfcomponent>