<cfcomponent
	output="false"
	hint="I parse an XML file a little bit at a time using a file input stream so that the whole file doesn't need to be read in and parsed at one time.">

	<!---
		Set up instance variables.

		Listener - the event listener that receives the parse events.
		Buffer   - the characters that have been pulled in from the
		           input but have not yet been processed.
		Pattern  - the compiled pattern that we use to recognize the
		           next node at the front of the buffer.
	--->
	<cfset VARIABLES.Instance = {
		Listener = "",
		Buffer = "",
		Pattern = ""
		} />


	<!---
		Create a regular expression pattern to handle the different
		node types. We are doing it here so we can create an easier
		to read, verbose pattern.

		NOTE: The six capturing groups are numbered in the order in
		which they appear and GetNextNode() depends on that order.
		The whole alternation is wrapped in a NON-capturing group so
		that the leading anchor applies to every alternative and not
		just to the first one.
	--->
	<cfsavecontent variable="VARIABLES.Instance.Pattern">(?xi)
		## Make sure that we start off with a regular expression
		## that is both verbose as well as case-insensitive.

		## Start at the beginning of the string (this will be the
		## beginning of our data buffer).
		^
		(?:

			## Group 1: ignored by our event parser. Processing
			## instructions (including the XML declaration) and
			## XML comments.
			(
				\s*
				(?:
					<\?[\w\W]+?\?>
					|
					<!--[\w\W]*?-->
				)
				\s*
			)|

			## Group 2: CDATA character match.
			(
				\s*
				<!\[CDATA\[
				[\w\W]*?
				\]\]>
				\s*
			)|

			## Group 3: text node match.
			([^<]+(?=<))|

			## Group 4: self-closing tag match.
			(
				<[\w:\-]+
				(?:\s+[\w:\-]+\s*=\s*"[^"]*")*
				\s*
				/>
			)|

			## Group 5: open tag match.
			(
				<[\w:\-]+
				(?:\s+[\w:\-]+\s*=\s*"[^"]*")*
				\s*
				>
			)|

			## Group 6: close tag match.
			(</[\w:\-]+>)

		)
	</cfsavecontent>

	<!--- Now that we have the pattern, compile it. --->
	<cfset VARIABLES.Instance.Pattern = CreateObject( "java", "java.util.regex.Pattern" ).Compile(
		JavaCast( "string", Trim( VARIABLES.Instance.Pattern ) )
		) />


	<cffunction
		name="Init"
		access="public"
		returntype="any"
		output="false"
		hint="I initialize and return this object.">

		<!--- Define arguments. --->
		<cfargument
			name="Listener"
			type="any"
			required="true"
			hint="I am the XML parse listener. Parse() invokes StartDocument, StartNode, Text, EndNode and EndDocument on me."
			/>

		<!--- Set listener. --->
		<cfset VARIABLES.Instance.Listener = ARGUMENTS.Listener />

		<!--- Return This reference. --->
		<cfreturn THIS />
	</cffunction>


	<cffunction
		name="AnnounceEvent"
		access="private"
		returntype="void"
		output="true"
		hint="I take an event and announce it to the listener by invoking the listener method that has the same name as the event.">

		<!--- Define arguments. --->
		<cfargument
			name="EventName"
			type="string"
			required="true"
			hint="I am the name of the event to announce."
			/>

		<cfargument
			name="EventData"
			type="any"
			required="false"
			default="#StructNew()#"
			hint="I am the event data that we are passing to the event listener."
			/>

		<!---
			Invoke event listener in the listener object. The event
			data is handed over as the first (positional) argument.
		--->
		<cfinvoke
			component="#VARIABLES.Instance.Listener#"
			method="#ARGUMENTS.EventName#">

			<cfinvokeargument name="1" value="#ARGUMENTS.EventData#" />
		</cfinvoke>

		<!--- Return out. --->
		<cfreturn />
	</cffunction>


	<cffunction
		name="GetNextNode"
		access="private"
		returntype="any"
		output="false"
		hint="I get the next whole element from the input. That might be an open tag, close tag, or node text. I return a struct with the keys Type and Node, or void when the input has been exhausted.">

		<!--- Define arguments. --->
		<cfargument
			name="InputStream"
			type="any"
			required="true"
			hint="The buffered character reader (java.io.Reader) that Parse() opened on our XML file."
			/>

		<!--- Define the local scope. --->
		<cfset var LOCAL = {} />

		<!---
			Create the character buffer into which we will read the
			input; about five thousand characters at a time. We are
			going to use a large string and then get the underlying
			characters to get a Java char array.
		--->
		<cfset LOCAL.Chunk = RepeatString( " ", 5120 ).ToCharArray() />

		<!---
			Keep pulling data into the internal buffer until a whole
			node sits at the front of it or until we run out of data.
			This is a loop rather than a recursive call so that a very
			large text or CDATA node cannot exhaust the call stack.
		--->
		<cfloop condition="true">

			<!--- Create a pattern matcher on the internal buffer. --->
			<cfset LOCAL.Matcher = VARIABLES.Instance.Pattern.Matcher(
				JavaCast( "string", VARIABLES.Instance.Buffer )
				) />

			<!---
				Check to see if a node starts at the very beginning of
				the buffer. We must NOT accept a match further into the
				buffer: the front of the buffer might be a node that
				has only been partially read (ex. a CDATA section that
				contains markup) and anything matched inside of it
				would be bogus.
			--->
			<cfif LOCAL.Matcher.LookingAt()>

				<!---
					We found a node pattern. Let's check to see which
					one we found. To do this, we are going to get each
					group. Only one of them should be NOT NULL (a NULL
					group leaves its key undefined).
				--->
				<cfset LOCAL.IgnoredNode = LOCAL.Matcher.Group( JavaCast( "int", 1 ) ) />
				<cfset LOCAL.CDATANode = LOCAL.Matcher.Group( JavaCast( "int", 2 ) ) />
				<cfset LOCAL.TextNode = LOCAL.Matcher.Group( JavaCast( "int", 3 ) ) />
				<cfset LOCAL.OpenCloseNode = LOCAL.Matcher.Group( JavaCast( "int", 4 ) ) />
				<cfset LOCAL.OpenNode = LOCAL.Matcher.Group( JavaCast( "int", 5 ) ) />
				<cfset LOCAL.CloseNode = LOCAL.Matcher.Group( JavaCast( "int", 6 ) ) />

				<!---
					Based on the group that was captured, build the
					node that we are going to return.
				--->
				<cfif StructKeyExists( LOCAL, "IgnoredNode" )>

					<cfset LOCAL.Node = {
						Type = "Ignored",
						Node = LOCAL.IgnoredNode
						} />

				<cfelseif StructKeyExists( LOCAL, "CDATANode" )>

					<cfset LOCAL.Node = {
						Type = "Text",
						Node = LOCAL.CDATANode
						} />

				<cfelseif StructKeyExists( LOCAL, "TextNode" )>

					<cfset LOCAL.Node = {
						Type = "Text",
						Node = LOCAL.TextNode
						} />

				<cfelseif StructKeyExists( LOCAL, "OpenCloseNode" )>

					<cfset LOCAL.Node = {
						Type = "SelfClose",
						Node = LOCAL.OpenCloseNode
						} />

				<cfelseif StructKeyExists( LOCAL, "OpenNode" )>

					<cfset LOCAL.Node = {
						Type = "Open",
						Node = LOCAL.OpenNode
						} />

				<cfelseif StructKeyExists( LOCAL, "CloseNode" )>

					<cfset LOCAL.Node = {
						Type = "Close",
						Node = LOCAL.CloseNode
						} />

				</cfif>

				<!---
					Before we return the node, remove it from the front
					of the internal buffer so we don't keep matching on
					the same node pattern. Right() cannot be asked for
					zero characters, so an emptied buffer is handled on
					its own.
				--->
				<cfset LOCAL.Remaining = (Len( VARIABLES.Instance.Buffer ) - Len( LOCAL.Node.Node )) />

				<cfif (LOCAL.Remaining GT 0)>
					<cfset VARIABLES.Instance.Buffer = Right( VARIABLES.Instance.Buffer, LOCAL.Remaining ) />
				<cfelse>
					<cfset VARIABLES.Instance.Buffer = "" />
				</cfif>

				<!--- Return the node. --->
				<cfreturn LOCAL.Node />

			</cfif>

			<!---
				We did not find a node pattern. It is possible that we
				need to get more data out of the input before a whole
				node is available. Read more characters into our chunk.
			--->
			<cfset LOCAL.CharsRead = ARGUMENTS.InputStream.Read(
				LOCAL.Chunk,
				JavaCast( "int", 0 ),
				JavaCast( "int", ArrayLen( LOCAL.Chunk ) )
				) />

			<!---
				Check to see if we read anything. If we didn't then we
				have run out of data and cannot possibly match any more
				node patterns; return VOID to signal that.
			--->
			<cfif (LOCAL.CharsRead EQ -1)>
				<cfreturn />
			</cfif>

			<!---
				Append what was read to our internal buffer. Be sure to
				only move over the characters that were actually read -
				the rest of the chunk holds filler or data from a
				previous read.
			--->
			<cfif (LOCAL.CharsRead GT 0)>
				<cfset VARIABLES.Instance.Buffer &= CreateObject( "java", "java.lang.String" ).CopyValueOf(
					LOCAL.Chunk,
					JavaCast( "int", 0 ),
					JavaCast( "int", LOCAL.CharsRead )
					) />
			</cfif>

		</cfloop>
	</cffunction>


	<cffunction
		name="Parse"
		access="public"
		returntype="void"
		output="true"
		hint="I parse the given XML file and announce events as I parse it: StartDocument, then StartNode / Text / EndNode for each node found, then EndDocument.">

		<!--- Define arguments. --->
		<cfargument
			name="FilePath"
			type="string"
			required="true"
			hint="I am the expanded path to the given xml file."
			/>

		<cfargument
			name="Charset"
			type="string"
			required="false"
			default="UTF-8"
			hint="I am the character encoding used to decode the file."
			/>

		<!--- Define the local scope. --->
		<cfset var LOCAL = {} />

		<!---
			Start with a clean buffer so that nothing left over from
			a previous (possibly failed) parse leaks into this one.
		--->
		<cfset VARIABLES.Instance.Buffer = "" />

		<!---
			Create a file input stream to read in the XML file a bit
			at a time.
		--->
		<cfset LOCAL.InputStream = CreateObject( "java", "java.io.FileInputStream" ).Init(
			JavaCast( "string", ARGUMENTS.FilePath )
			) />

		<cftry>

			<!---
				Wrap the stream in a buffered reader so that bytes are
				decoded into characters for us. Decoding at this level
				keeps multi-byte characters intact even when they
				straddle two reads.
			--->
			<cfset LOCAL.Reader = CreateObject( "java", "java.io.BufferedReader" ).Init(
				CreateObject( "java", "java.io.InputStreamReader" ).Init(
					LOCAL.InputStream,
					JavaCast( "string", ARGUMENTS.Charset )
					)
				) />

			<!--- Announce the start document event. --->
			<cfset AnnounceEvent( "StartDocument" ) />

			<!---
				Keep looping over the next reads until we get to a
				point where we no longer have data (GetNextNode()
				returns void).
			--->
			<cfloop condition="true">

				<!--- Read the next node. --->
				<cfset LOCAL.Node = GetNextNode( LOCAL.Reader ) />

				<!---
					Check to see if we have a node. A void return
					leaves the key undefined, which means that we have
					run out of data; break out of our conditional loop.
				--->
				<cfif NOT StructKeyExists( LOCAL, "Node" )>
					<cfbreak />
				</cfif>

				<!--- Check to see if we are ignoring this node. --->
				<cfif (LOCAL.Node.Type NEQ "Ignored")>

					<!--- Parse the node into an actual structure. --->
					<cfset LOCAL.ParsedNode = ParseNode(
						LOCAL.Node.Type,
						LOCAL.Node.Node
						) />

					<!---
						Check to see which type of node was returned.
						We will use this node type (which is a pseudo
						node type) to announce an event.
					--->
					<cfswitch expression="#LOCAL.Node.Type#">
						<cfcase value="SelfClose">
							<cfset AnnounceEvent( "StartNode", LOCAL.ParsedNode ) />
							<cfset AnnounceEvent( "EndNode", LOCAL.ParsedNode ) />
						</cfcase>
						<cfcase value="Open">
							<cfset AnnounceEvent( "StartNode", LOCAL.ParsedNode ) />
						</cfcase>
						<cfcase value="Text">
							<cfset AnnounceEvent( "Text", LOCAL.ParsedNode ) />
						</cfcase>
						<cfcase value="Close">
							<cfset AnnounceEvent( "EndNode", LOCAL.ParsedNode ) />
						</cfcase>
					</cfswitch>

				</cfif>

			</cfloop>

			<!--- Announce the end document event. --->
			<cfset AnnounceEvent( "EndDocument" ) />

			<cfcatch type="any">

				<!---
					Something went wrong (in the parser or in the
					listener). Release the file before we let the error
					bubble up; a failure to close must not hide the
					original error.
				--->
				<cftry>
					<cfset LOCAL.InputStream.Close() />
					<cfcatch type="any">
						<!--- Nothing more we can do here. --->
					</cfcatch>
				</cftry>

				<cfrethrow />

			</cfcatch>
		</cftry>

		<!---
			Close the reader. This also closes the underlying file
			stream and will free the file from locking.
		--->
		<cfset LOCAL.Reader.Close() />

		<!--- Return out. --->
		<cfreturn />
	</cffunction>


	<cffunction
		name="ParseNode"
		access="private"
		returntype="any"
		output="false"
		hint="I take an XML node string and parse it into an XML node.">

		<!--- Define arguments. --->
		<cfargument
			name="NodeType"
			type="string"
			required="true"
			hint="I am the type of node that was found (SelfClose, Open, Text or Close)."
			/>

		<cfargument
			name="NodeString"
			type="string"
			required="true"
			hint="I am a string representation of part of an XML node."
			/>

		<!--- Define the local scope. --->
		<cfset var LOCAL = {} />

		<!---
			When we parse this node, we might have to kludge the node
			text a bit in order to get it parsed by CFXML. This
			includes turning open and close node types into self
			closing tags. SelfClose and Text nodes are left as is.
		--->
		<cfswitch expression="#ARGUMENTS.NodeType#">
			<cfcase value="Open">

				<!--- Turn this node into a self-closing node. --->
				<cfset ARGUMENTS.NodeString = REReplace(
					ARGUMENTS.NodeString,
					">$",
					"/>",
					"one"
					) />

			</cfcase>
			<cfcase value="Close">

				<!--- Turn into self-closing node. --->
				<cfset ARGUMENTS.NodeString = REReplace(
					ARGUMENTS.NodeString,
					"^</([^>]+)>$",
					"<\1/>",
					"one"
					) />

			</cfcase>
		</cfswitch>

		<!---
			We have to be careful for nodes that have namespaces. If
			we try to parse a node that uses a name space and that
			namespace is not bound, we will get an error. Therefore,
			we have to check to see if our node uses any namespaces.
		--->
		<cfset LOCAL.NameSpaces = REMatch(
			"^<[\w\d\-]+:[\w\d\-]+|[\w\d\-]+:[\w\d\-]+\s*=",
			ARGUMENTS.NodeString
			) />

		<!---
			Set a default name space string. This is the string that
			will be added to the root node of our future CFXML
			document.
		--->
		<cfset LOCAL.NameSpaceString = "" />

		<!---
			Keep track of the prefixes that we have already bound. The
			same prefix can show up several times in one node (on the
			tag and on its attributes) but an attribute may only be
			declared once per element. ListFind() is case-sensitive,
			as are XML prefixes.
		--->
		<cfset LOCAL.BoundPrefixes = "" />

		<!--- Loop over any name space matches. --->
		<cfloop
			index="LOCAL.NameSpace"
			array="#LOCAL.NameSpaces#">

			<!--- Clean the namespace down to its prefix. --->
			<cfset LOCAL.NameSpace = ListFirst(
				Replace( LOCAL.NameSpace, "<", "", "one" ),
				":"
				) />

			<!---
				Make sure we are not trying to bind the xmlns prefix
				itself, nor the reserved xml prefix (ex. xml:lang),
				which is always bound and cannot be bound to any other
				name space. Also skip prefixes we have already handled.
			--->
			<cfif (
				(LOCAL.NameSpace NEQ "xmlns") AND
				(LOCAL.NameSpace NEQ "xml") AND
				(NOT ListFind( LOCAL.BoundPrefixes, LOCAL.NameSpace ))
				)>

				<!--- Add this name space to our string. --->
				<cfset LOCAL.NameSpaceString &= (
					" xmlns:" &
					LOCAL.NameSpace &
					"=""http://www.domain.ext"""
					) />

				<cfset LOCAL.BoundPrefixes = ListAppend( LOCAL.BoundPrefixes, LOCAL.NameSpace ) />

			</cfif>

		</cfloop>

		<!---
			At this point, we have converted our node string into an
			XML node that could be properly parsed as part of a new
			document. We have also extracted any name spaces that need
			to be bound in this document. Create a simple XML document
			with the above nested element. We are using a place holder
			data2 to help work around some CDATA limitations in CF XML
			Parsing.
		--->
		<cfxml variable="LOCAL.Xml">
			<cfoutput>
				<root #LOCAL.NameSpaceString#>
					<data>#ARGUMENTS.NodeString#</data>
					<data2>place-holder</data2>
				</root>
			</cfoutput>
		</cfxml>

		<!---
			Get the child node. When we do this, we need to treat
			element nodes differently than text nodes.
		--->
		<cfif (ARGUMENTS.NodeType EQ "Text")>

			<!--- Get node text from dummy node. --->
			<cfset LOCAL.ChildNodes = XmlSearch(
				LOCAL.Xml,
				"/root/data2/text()"
				) />

			<!---
				Store actual value of text node into our dummy node
				value. This will allow us to get the actual text even
				though CDATA text is not handled so well in ColdFusion.
			--->
			<cfset LOCAL.ChildNodes[ 1 ].XmlValue = LOCAL.Xml.root.data.XmlText />

		<cfelse>

			<!--- Get the first element nested in our data node. --->
			<cfset LOCAL.ChildNodes = XmlSearch(
				LOCAL.Xml,
				"/root/data/*[ 1 ]"
				) />

		</cfif>

		<!--- Return the parsed node. --->
		<cfreturn LOCAL.ChildNodes[ 1 ] />
	</cffunction>

</cfcomponent>