Skip to content

Instantly share code, notes, and snippets.

@rsimoes
Created February 18, 2012 02:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rsimoes/1856963 to your computer and use it in GitHub Desktop.
Save rsimoes/1856963 to your computer and use it in GitHub Desktop.
XML parsing benchmarks
use v5.14;
use strict;
use warnings;
use utf8::all;
use Data::Dump;
use LWP::Simple qw(get);
use Benchmark qw(cmpthese timethese);
use XML::LibXML;
use XML::Parser;
use XML::SAX::PurePerl;
use XML::SAX::ExpatXS;
use XML::TreePP;
use XML::Twig;
# Regex pattern copyright 1998 Robert D. Cameron of Simon Fraser University
# Full article at http://www.cs.sfu.ca/~cameron/REX.html
#
# The strings below are regex fragments that are interpolated into one another
# and finally into $markup_spe; shallow_parse() and shallow_parse_re2() compile
# "$text_se|$markup_spe" into the pattern they tokenize with. They are plain
# double-quoted strings, so "\\" denotes a single backslash in the final regex.
# Every group is non-capturing, so a list-context //g match yields whole tokens.
# NOTE(review): the _se / _ce / _spe suffixes appear to follow the naming used
# in the REX article -- consult it for the exact definitions.
#
# Character data: a run of one or more characters other than "<".
my $text_se = "[^<]+";
# Everything up to and including the next "-".
my $until_hyphen = "[^-]*-";
# Everything up to and including the first "--".
my $until_2_hyphens = "$until_hyphen(?:[^-]$until_hyphen)*-";
# Remainder of a comment once "<!--" has been consumed; the final ">" is optional.
my $comment_ce = "$until_2_hyphens>?";
# Everything up to and including a run of two or more "]".
my $until_rsbs = "[^\\]]*](?:[^\\]]+])*]+";
# Remainder of a CDATA section once "<![CDATA[" has been consumed, through "]]>".
my $cdata_ce = "$until_rsbs(?:[^\\]>]$until_rsbs)*>";
# One or more whitespace characters.
my $s = "[[:space:]]+";
# A name: one start character (letter, "_" or ":") followed by any number of
# name characters (word character, ":", "." or "-").
my $name_start = "[[:alpha:]_:]";
my $name_char = "[[:word:]:.-]";
my $name = "(?:$name_start)(?:$name_char)*";
# A complete double- or single-quoted string.
my $quote_se = "\"[^\"]*\"|'[^']*'";
# Whitespace, a name, then any number of whitespace-separated names or quoted
# strings; used as the identifier part that follows "DOCTYPE".
my $dt_ident_se = "$s$name(?:$s(?:$name|$quote_se))*";
# Body of a markup declaration: unquoted runs and quoted strings, ended by ">".
my $markup_decl_ce = "(?:[^\\]\"'><]+|$quote_se)*>";
# Exactly one whitespace character.
my $s1 = "[[:space:]]";
# Everything up to and including a run of one or more "?".
my $until_qms = "[^?]*\\?+";
# What follows a processing-instruction target: either an immediate "?>", or
# one whitespace character and then everything through the closing "?>".
my $pi_tail = "\\?>|$s1$until_qms(?:[^>?]$until_qms)*>";
# One item inside the "[...]" part of a DOCTYPE: a complete comment, another
# "<!...>" declaration, a processing instruction, a "%name;" reference, or
# whitespace.
my $dt_item_se = "<(?:!(?:--$until_2_hyphens>|[^-]$markup_decl_ce)|\\?$name(?:$pi_tail))|%$name;|$s";
# Remainder of a document type declaration once "<!DOCTYPE" has been consumed:
# identifier, optional bracketed item list, optional closing ">".
my $doc_type_ce = "$dt_ident_se(?:$s)?(?:\\[(?:$dt_item_se)*](?:$s)?)?>?";
# What may follow "<!": a comment, a CDATA section, or a DOCTYPE declaration.
# Each remainder is optional, so the bare opener still matches on its own.
my $decl_ce = "--(?:$comment_ce)?|\\[CDATA\\[(?:$cdata_ce)?|DOCTYPE(?:$doc_type_ce)?";
# What may follow "<?": a target name and an optional tail.
my $pi_ce = "$name(?:$pi_tail)?";
# What may follow "</": a name, optional whitespace, optional ">".
my $end_tag_ce = "$name(?:$s)?>?";
# A quoted attribute value that contains no "<".
my $att_val_se = "\"[^<\"]*\"|'[^<']*'";
# What may follow "<" in a start or empty-element tag: a name, any number of
# name=value attributes, optional whitespace, optional "/", optional ">".
my $elem_tag_ce = "$name(?:$s$name(?:$s)?=(?:$s)?(?:$att_val_se))*(?:$s)?/?>?";
# Any markup token. Each alternative after "<" is optional, so a lone "<" is
# still matched as a token rather than causing the match to fail there.
my $markup_spe = "<(?:!(?:$decl_ce)?|\\?(?:$pi_ce)?|/(?:$end_tag_ce)?|(?:$elem_tag_ce)?)";
# shallow_parse($doc)
#
# Tokenizes $doc with Perl's built-in regex engine, using the text and markup
# fragments defined at file level. The pattern has no capture groups, so the
# list-context //g match yields every successive token.
#
# Returns a reference to an array holding the matched tokens in document order.
sub shallow_parse {
    my ($xml_text) = @_;

    # Compiled on the first call only; later calls reuse the same regex object.
    state $token_re = qr/$text_se|$markup_spe/;

    # The anonymous-array constructor supplies list context to the match.
    return [ $xml_text =~ /$token_re/g ];
}
# shallow_parse_re2($doc)
#
# Same tokenization as shallow_parse(), but with re::engine::RE2 enabled for
# the regex operations in this sub.
#
# Returns the matched tokens as a plain list (note: shallow_parse() returns an
# array reference instead). In scalar context the caller gets the token count.
sub shallow_parse_re2 {
    my ($xml_text) = @_;

    # The pragma is lexically scoped: it takes effect from here to the end of
    # this sub and leaves the rest of the file on the default engine.
    use re::engine::RE2;

    # Compiled on the first call only; later calls reuse the same regex object.
    state $token_re = qr/$text_se|$markup_spe/;

    # Keep the intermediate array: the assignment forces a list-context match
    # even when the sub itself is called in void or scalar context.
    my @tokens = $xml_text =~ /$token_re/g;
    return @tokens;
}
# Benchmark driver: download one XML document, then compare how quickly the
# two regex-based shallow parsers and six XML modules get through it.

# LWP::Simple's get() returns undef when the request fails. Stop right away in
# that case instead of benchmarking every parser against an undefined value.
my $xml_url = "http://www.w3.org/TR/REC-xml/REC-xml-20081126.xml";
my $xml_doc = get($xml_url) // die "Could not download $xml_url\n";

# Parser objects are created once, outside the timed closures, so only the
# parse calls themselves are measured.
my $parser  = XML::Parser->new;
my $sax_pp  = XML::SAX::PurePerl->new;
my $sax_xs  = XML::SAX::ExpatXS->new;
my $tree_pp = XML::TreePP->new;
my $twig    = XML::Twig->new;

say "Starting benchmark...";

# The count is left undefined, as in the original call.
# NOTE(review): Benchmark documents a zero count as "run each entry for the
# default minimum CPU time"; undef is presumably treated the same -- confirm.
cmpthese( undef, {
    regex   => sub { shallow_parse($xml_doc) },
    re2gex  => sub { shallow_parse_re2($xml_doc) },
    parser  => sub { $parser->parse($xml_doc) },
    sax_pp  => sub { $sax_pp->parse_string($xml_doc) },
    sax_xs  => sub { $sax_xs->parse_string($xml_doc) },
    twig    => sub { $twig->parse($xml_doc) },
    tree_pp => sub { $tree_pp->parse($xml_doc) },
    # XML::LibXML::Document->new takes (version, encoding) and only creates an
    # empty document, so it never parsed $xml_doc. load_xml actually parses
    # the string, which is what this entry is meant to time.
    libxml  => sub { XML::LibXML->load_xml( string => $xml_doc ) },
} );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment