Skip to content

Instantly share code, notes, and snippets.

@howellcc
Forked from bennadel/code-1.cfm
Last active November 9, 2016 22:00
Show Gist options
  • Save howellcc/9e4dadaf5e71121db192cf2a4a74ad32 to your computer and use it in GitHub Desktop.
Save howellcc/9e4dadaf5e71121db192cf2a4a74ad32 to your computer and use it in GitHub Desktop.
Ask Ben: Parsing Very Large XML Documents In ColdFusion
component
	output="false"
	hint="I help to parse large XML files by matching patterns and then only parsing sub-nodes of the document." {

	/*
	I initialize the parser and open the XML file for chunked reading.

	Nodes       - List of node names to extract. Commas and spaces are both
	              treated as delimiters. The names are placed into a regular
	              expression as-is (they are NOT escaped).
	XmlFilePath - Path of the XML file to read.
	BufferSize  - Maximum number of characters pulled from the file per read.
	              Must be at least 1.
	Charset     - Optional. Character encoding used to decode the file. When
	              empty, the JVM default charset is used.

	Returns the initialized component (THIS). Throws "IllegalArgument" when
	the node list is empty or the buffer size is less than 1; any Java
	exception raised while opening the file is passed through.
	*/
	public any function Init(
		required string Nodes,
		required string XmlFilePath,
		numeric BufferSize=(1024*1024*5),
		string Charset=""
	) {
		/*
		A read of zero characters can never make progress, which would
		leave GetNextNode() spinning forever; reject it up front.
		*/
		if (Int( BufferSize ) LT 1) {
			throw(
				type = "IllegalArgument",
				message = "BufferSize must be at least 1."
			);
		}

		/*
		Clean up the node list so that it can be used as a regular
		expression alternation (ex. "a, b" becomes "a|b").
		*/
		var NodeAlternation = ListChangeDelims( Nodes, "|", ", " );

		if (NOT Len( NodeAlternation )) {
			throw(
				type = "IllegalArgument",
				message = "Nodes must contain at least one node name."
			);
		}

		/* Fall back to the JVM default charset when none was given. */
		if (NOT Len( Trim( Charset ) )) {
			Charset = CreateObject( "java", "java.nio.charset.Charset" )
				.defaultCharset()
				.name();
		}

		/*
		Define the pattern. We have to match both self-closing nodes
		(first alternative) and standard open/close nodes (second
		alternative; \2 refers back to the node name it opened with).
		*/
		var PatternText = (
			"(?i)" &
			"<(#NodeAlternation#)\b[^>]*(?<=/)>|" &
			"<(#NodeAlternation#)\b[^>]*>[\w\W]*?</\2>"
		);

		/* Set up the instance variables. */
		VARIABLES.Instance = {
			/*
			The compiled version of our regular expression. Compiling
			it gives us access to the Matcher functionality later on.
			*/
			Pattern = CreateObject( "java", "java.util.regex.Pattern" ).Compile(
				JavaCast( "string", PatternText )
			),
			/* Holds the decoded, not-yet-consumed portion of the file. */
			DataBuffer = "",
			/*
			Reusable character buffer that each read is decoded into.
			Its capacity determines the size of each file read.
			*/
			TransferBuffer = CreateObject( "java", "java.nio.CharBuffer" ).allocate(
				JavaCast( "int", Int( BufferSize ) )
			),
			/* The character reader over the XML file (set below). */
			Reader = "",
			/* True whenever there is no open reader to pull data from. */
			IsClosed = true
		};

		/*
		Open the file. The reader decodes the byte stream statefully, so a
		multi-byte character that straddles two reads is still decoded
		correctly.
		*/
		var FileStream = CreateObject( "java", "java.io.FileInputStream" ).Init(
			JavaCast( "string", XmlFilePath )
		);

		try {
			VARIABLES.Instance.Reader = CreateObject( "java", "java.io.InputStreamReader" ).Init(
				CreateObject( "java", "java.io.BufferedInputStream" ).Init( FileStream ),
				JavaCast( "string", Charset )
			);
			VARIABLES.Instance.IsClosed = false;
		} catch (any OpenError) {
			/* Do not leak the file handle if the reader cannot be built. */
			FileStream.Close();
			rethrow;
		}

		/* Return an initialized object. */
		return THIS;
	}

	/*
	I close the underlying file reader. Calling me more than once is safe;
	only the first call touches the reader.
	*/
	public void function Close() {
		if (NOT VARIABLES.Instance.IsClosed) {
			/* Flag first so a failing close is not retried later. */
			VARIABLES.Instance.IsClosed = true;
			VARIABLES.Instance.Reader.close();
		}
		return;
	}

	/*
	I return the root element of the next matching node in the XML
	document. If no further node can be found, I close the file and
	return VOID. Once the file has been closed, every call returns VOID.
	*/
	public any function GetNextNode() {
		var Matcher = "";
		var XMLData = "";
		var CharsToLeave = 0;
		var CharsRead = 0;

		/*
		Alternate between searching the buffer and reading more data.
		This is a loop rather than recursion so that a file requiring
		many reads between matches cannot exhaust the call stack.
		*/
		while (true) {
			/* Create a matcher for our current buffer. */
			Matcher = VARIABLES.Instance.Pattern.Matcher(
				JavaCast( "string", VARIABLES.Instance.DataBuffer )
			);

			if (Matcher.Find()) {
				/* Pull out the matching XML. */
				XMLData = Matcher.Group();

				/*
				Everything up to the end of the match is consumed;
				only what follows it stays in the buffer.
				*/
				CharsToLeave = Len( VARIABLES.Instance.DataBuffer ) - Matcher.End();

				if (CharsToLeave GT 0) {
					VARIABLES.Instance.DataBuffer = Right(
						VARIABLES.Instance.DataBuffer,
						CharsToLeave
					);
				} else {
					/* Right() rejects a zero count, so reset directly. */
					VARIABLES.Instance.DataBuffer = "";
				}

				/* Parse the XML fragment and return its root element. */
				return XmlParse( Trim( XMLData ) ).XmlRoot;
			}

			/* No match and nothing left to read: we are done. */
			if (VARIABLES.Instance.IsClosed) {
				return;
			}

			/*
			The buffer may simply not contain enough data yet. Read the
			next chunk of the file into the (reset) transfer buffer.
			*/
			VARIABLES.Instance.TransferBuffer.clear();
			CharsRead = VARIABLES.Instance.Reader.read(
				VARIABLES.Instance.TransferBuffer
			);

			if (CharsRead EQ -1) {
				/*
				End of file. The leftover buffer content failed to match
				and can never grow, so release it along with the file.
				*/
				VARIABLES.Instance.DataBuffer = "";
				THIS.Close();
				return;
			}

			/*
			Append only the characters that were just read: flip() limits
			the buffer to the freshly written region, and toString()
			returns exactly that region.
			*/
			VARIABLES.Instance.TransferBuffer.flip();
			VARIABLES.Instance.DataBuffer &= VARIABLES.Instance.TransferBuffer.toString();
		}
	}
}
@howellcc
Copy link
Author

howellcc commented Nov 9, 2016

I took the original file from Ben Nadel and translated it to cfscript.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment