Skip to content

Instantly share code, notes, and snippets.

@keymon
Created November 8, 2011 10:41
Show Gist options
  • Save keymon/1347472 to your computer and use it in GitHub Desktop.
Save keymon/1347472 to your computer and use it in GitHub Desktop.
This Perl script just greps an XML file, printing its structure as plain text.
#!/bin/env perl
#
# This is a simple script that "greps" an XML based on
# the names of the xml elements. It prints the contents
# of the Text data in that element
#
# Author: Hector Rivas
#
use XML::Parser;
use Getopt::Std;
use strict;
use warnings;

# Forward declaration: print_help() is declared with an empty prototype and is
# defined further down the file, so announce it before the calls below.
sub print_help();

my %Options;          # Command-line switches, filled in by getopts()
my $print_content;    # Print or not this line (handle_end declares its own lexical of this name)
my $first = 1;        # If this is the first line or not (not referenced in the code shown)

# Stacks shared with the parser handlers; index 0 is the innermost open element.
my @parent             = ();      # Stores the names of the parent groups
my @element_content    = ();      # Stores the text content of each open element
my @element_subcontent = ("");    # Stores the output accumulated from the children;
                                  # the initial "" collects the output of the whole document

# Initialize the parser
my $parser = XML::Parser->new(
    Handlers => {
        Start => \&handle_start,
        End   => \&handle_end,
        Char  => \&handle_char,
    },
);

# Parse the options
if (not getopts('q1etp', \%Options)) {
    print_help();
    exit 1;
}

# Get the file name. Test for definedness/length rather than truth so that
# a file literally named "0" is still accepted.
my $filename = shift @ARGV;
if (not defined $filename or not length $filename) {
    print_help();
    exit 0;
}

# Elements to query. The remaining arguments become the keys of a hash
# (values are left undefined; only key existence is checked).
my %element_filter;
@element_filter{@ARGV} = ();

$parser->parsefile($filename);

# Print the accumulated content
print $element_subcontent[0];
# Print the usage summary to the currently selected output handle.
# Every switch accepted by the getopts('q1etp') call is listed, including -t.
sub print_help() {
    print <<HelpText;
Usage: grepxml [options] input.xml [Element1 ...]
This program greps an XML printing the node names and the Text data.
Options:
-1 Print the first node searched and exit.
-q Print the data, not the node names.
-e Print also elements with empty content
-t Indent each line with one space per enclosing element
-p Print all the parents for each element (root.sub1.sub2.element val)
HelpText
}
# Return a copy of the argument with leading and trailing whitespace removed.
# The caller's value is not modified.
sub trim($)
{
    my ($text) = @_;
    # One pass: strip the run of whitespace at the start and the run at the end.
    $text =~ s/^\s+|\s+$//g;
    return $text;
}
# Start-tag handler: open a new level on each of the three stacks.
# The element gets an empty text buffer and an empty child-output buffer,
# and its name becomes the innermost entry of @parent.
sub handle_start {
    my ($parser, $tag) = @_;
    # Fresh "" placeholders for this element's own text and its children's output
    for my $stack (\@element_content, \@element_subcontent) {
        unshift @{$stack}, "";
    }
    unshift @parent, $tag;
}
# Character-data handler: text may arrive in several pieces, so each piece
# is appended to the buffer of the innermost open element.
sub handle_char {
    my ($parser, $text) = @_;
    $element_content[0] .= $text;
}
# End-tag handler: close the innermost element and fold its output into the
# buffer of its parent.
#
# The line for this element (optional indent, optional name or dotted path,
# then the trimmed text) is placed before the output already collected from
# its children, and the result is appended to the parent's child-output buffer.
# With -1, the first element that matches the filter is printed immediately
# and the program exits.
sub handle_end {
    my ($expat, $element) = @_;

    # Dotted path from the outermost element down to this one; then leave it.
    my $element_path = join(".", reverse(@parent));
    shift @parent;

    # Does this element match one of the names/paths given on the command line?
    my $is_filtered = (exists $element_filter{$element}
        or exists $element_filter{$element_path}) ? 1 : 0;

    # Print everything when no filter was given, otherwise only matches.
    # (Plain declaration: "my ... if COND" has undefined behaviour in Perl.)
    my $print_content = (keys(%element_filter) == 0 or $is_filtered) ? 1 : 0;

    # Get the accumulated text of this element and the output of its children
    my $content    = trim(shift @element_content);
    my $subcontent = shift @element_subcontent;

    # The line produced for this element (stays empty when nothing is printed)
    my $new_content = "";

    # Test the length, not the truth, of the text so that a value of "0"
    # is not mistaken for empty content.
    if ($print_content and (length($content) or $Options{'e'})) {
        # -t: one space per element that is still open, i.e. per ancestor
        if ($Options{'t'}) {
            $new_content .= " " x scalar(@element_content);
        }
        # Unless -q, prefix the element name, or the full path with -p
        if (not $Options{'q'}) {
            $new_content .= ($Options{'p'} ? $element_path : $element) . " ";
        }
        $new_content .= "$content\n";

        # If option -1 is set and we are filtering for this element, exit.
        if ($Options{'1'} and $is_filtered) {
            print $new_content;
            exit 0;
        }
    }

    $element_subcontent[0] .= $new_content . $subcontent;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment