Created
November 8, 2011 10:41
-
-
Save keymon/1347472 to your computer and use it in GitHub Desktop.
This Perl script just greps an XML file, printing its structure in plain text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env perl | |
# | |
# This is a simple script that "greps" an XML based on | |
# the names of the xml elements. It prints the contents | |
# of the Text data in that element | |
# | |
# Author: Hector Rivas | |
# | |
use XML::Parser; | |
use Getopt::Std; | |
use strict;
use warnings;

# Command-line switches filled in by getopts (possible keys: q, 1, e, t, p).
my %Options;

# Parser state shared with the handler subs defined further down.
# Each array is used as a stack whose top is element 0.
my @parent             = ();     # Names of the elements currently open
my @element_content    = ();     # Text collected so far for each open element
my @element_subcontent = ("");   # Output lines collected from the children of
                                 # each open element; the extra bottom entry
                                 # ends up holding the output of the document

# Initialize the parser
my $parser = XML::Parser->new( Handlers => {
    Start => \&handle_start,
    End   => \&handle_end,
    Char  => \&handle_char,
});

# Parse the options
if (not getopts('q1etp', \%Options)) {
    print_help();
    exit 1;
}

# Get the file name. Test for definedness/emptiness rather than truth so
# that a file literally named "0" is still accepted.
my $filename = shift @ARGV;
if (not defined $filename or $filename eq '') {
    print_help();
    exit 0;
}

# Elements to query. Only the keys matter, so every remaining argument
# becomes a key with an undefined value.
my %element_filter;
@element_filter{@ARGV} = ();

$parser->parsefile($filename);

# Print the accumulated content
print $element_subcontent[0];
# Print the usage message to standard output. Takes no arguments.
# The option list covers every switch the script passes to getopts,
# including -t, which indents each printed line by the element's depth.
sub print_help {
    print <<'HelpText';
Usage: grepxml [options] input.xml [Element1 ...]
This program greps an XML printing the node names and the Text data.
Options:
  -1  Print the first node searched and exit.
  -q  Print the data, not the node names.
  -e  Print also elements with empty content
  -t  Indent each line according to the nesting depth of its element
  -p  Print all the parents for each element (root.sub1.sub2.element val)
HelpText
    return;
}
# Return a copy of the given string with the whitespace at its beginning
# and at its end stripped. The caller's value is not modified.
sub trim($)
{
    my ($text) = @_;
    # Alias $_ to the private copy so both substitutions act on it.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}
# Start handler: runs when an element opens.
# Puts an empty text buffer and an empty children-output buffer on top of
# their stacks, then records the element name as the innermost open element.
# Arguments: the parser object and the element name; any attribute
# name/value pairs that follow in @_ are not used.
sub handle_start {
    my ($expat, $element) = @_;

    for my $stack (\@element_content, \@element_subcontent) {
        unshift @{$stack}, "";
    }
    unshift @parent, $element;
}
# Char handler: receives a piece of character data.
# The text is appended to the buffer on top of the content stack, i.e. the
# buffer of the innermost open element.
# Arguments: the parser object (unused) and the text.
sub handle_char {
    my ($expat, $text) = @_;
    $element_content[0] .= $text;
}
# End handler: runs when an element closes.
# Builds the output line for the element (if it is selected and has text, or
# -e is set) and prepends it to the output already collected from its
# children; the result is appended to the buffer of the enclosing element.
# With -1, the first line produced for a filtered element is printed
# immediately and the program exits.
# Arguments: the parser object (unused) and the element name.
sub handle_end {
    my ($expat, $element) = @_;

    # Dotted path from the outermost element down to this one. It has to be
    # built before the element is removed from the stack of open elements.
    my $element_path = join(".", reverse @parent);
    shift @parent;

    # An element matches the filter by plain name or by full dotted path.
    my $is_filtered = (exists $element_filter{$element}
                    || exists $element_filter{$element_path}) ? 1 : 0;

    # With an empty filter every element is printed. This is a plain
    # assignment: "my ... if ..." has undefined behaviour in Perl.
    my $print_content = (!%element_filter || $is_filtered);

    # Text collected for this element, and the output of its children
    my $content    = trim(shift @element_content);
    my $subcontent = shift @element_subcontent;

    # The line for this element; stays empty when nothing is to be printed
    my $new_content = "";

    # Test the length so that a text of "0" still counts as content.
    if ($print_content and (length($content) or $Options{'e'})) {
        # -t: one space per element that is still open around this one
        if ($Options{'t'}) {
            $new_content .= " " x scalar(@element_content);
        }
        # Unless -q, prefix the text with the name (or, with -p, the path)
        if (not $Options{'q'}) {
            $new_content .= ($Options{'p'} ? $element_path : $element) . " ";
        }
        $new_content .= "$content\n";

        # If option -1 is set and we are filtering for this element, exit.
        if ($Options{'1'} and $is_filtered) {
            print $new_content;
            exit 0;
        }
    }

    # Own line first, then the children's lines, handed up to the buffer
    # that is now on top of the stack.
    $element_subcontent[0] .= $new_content . $subcontent;
    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment