Skip to content

Instantly share code, notes, and snippets.

@jlblcc
Last active November 19, 2016 06:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jlblcc/8700993 to your computer and use it in GitHub Desktop.
Save jlblcc/8700993 to your computer and use it in GitHub Desktop.
A simple perl script that fetches and parses GCMD Keywords using LWP::UserAgent.
#!/usr/bin/env perl
select(STDERR);
$| = 1;
select(STDOUT); # default
$| = 1;
#
# Usage: perl fetchGCMD.pl [-s scheme:http://gcmdservices.gsfc.nasa.gov/kms/concept_schemes? ] [-d DELIMETER]
#
use strict;
use utf8;
use JSON;
use XML::XPath;
use XML::XPath::XMLParser;
use LWP::UserAgent;
use Getopt::Long qw< :config auto_version bundling no_ignore_case >;
use Pod::Usage;
use HTML::Entities;
binmode( STDOUT, ":utf8" );
#my @required = qw(l);
my $scheme = 'sciencekeywords';
my $delim = '|';
my $json = '';
my $pretty = '';
my $with_defs = '';
my $options = GetOptions(
's|scheme=s' => \$scheme,
'd|delimiter=s' => \$delim,
'j|json' => \$json,
'p|pretty' => \$pretty,
'w|with-definitions' => \$with_defs,
# Standard options
'usage' => sub { pod2usage(2) },
'help' => sub { pod2usage(1) },
'man' => sub { pod2usage( -exitstatus => 0, -verbose => 2 ) },
) or die pod2usage(); # dies if an unlisted option is given
my $site =
"http://gcmdservices.gsfc.nasa.gov/kms/concepts/concept_scheme/$scheme/?format=rdf";
# You can also specify a filename, you will probably want to comment out the LWP stuff
# my $xp = XML::XPath->new(filename => '/var/rdf/GCMD_keywords.rdf');
my $ua = LWP::UserAgent->new;
my $req = HTTP::Request->new( GET => $site );
my $resp = $ua->request($req);
my $xml = $resp->decoded_content;
if ( $resp->is_success ) {
my $xp = XML::XPath->new( xml => $xml );
$xp->set_namespace( 'rdf', "http://www.w3.org/1999/02/22-rdf-syntax-ns#" );
$xp->set_namespace( 'skos', "http://www.w3.org/2004/02/skos/core#" );
$xp->set_namespace( 'gcmd', "http://gcmd.gsfc.nasa.gov/" );
my $version = $xp->find('rdf:RDF/gcmd:keywordVersion')->string_value;
if ($json) {
# find all root parents
my $nodeset = $xp->find('/*/skos:Concept[not(skos:broader)]');
my @root;
foreach my $node ( $nodeset->get_nodelist ) {
# print "FOUND\n\n", XML::XPath::XMLParser::as_string($node), "\n\n";
my %obj = json_node( $node, $xp, $with_defs );
push @root, \%obj;
}
print STDOUT to_json(
\@root,
{
'utf8' => 1,
'pretty' => $pretty,
}
);
}
else {
print "GCMD ", $scheme, "Keywords Version:", $version, "\n";
print "uuid", $delim, "label", $delim, "definition", $delim, "broader",
"\n";
my $nodeset = $xp->find('//skos:Concept'); # find all concepts
foreach my $node ( $nodeset->get_nodelist ) {
my $uuid = $node->getAttribute('rdf:about');
# print "uuid:", $uuid, "\n" if $uuid;
print $uuid , $delim;
my $label = $xp->find( 'skos:prefLabel', $node )->get_node(1);
# print "label:", $label->string_value , "\n" if $label;
print $label->string_value, $delim;
if ($with_defs) {
my $def = $xp->find( 'skos:definition', $node )->get_node(1);
$def = $def ? decode_entities( $def->string_value ) : undef;
$def =~ s/\r|\n/ /g; #This line will remove carriage returns
# print "def:", $def , "\n" if $def;
print $def , $delim;
}
else {
print $delim;
}
my $broad = $xp->find( 'skos:broader', $node )->get_node(1);
$broad = $broad ? $broad->getAttribute('rdf:resource') : undef;
# print "broader:", $broad , "\n" if $broad;
print $broad, "\n";
}
}
}
else {
die 'Error: Could not download Keywords. Please check options.';
}
sub json_node {
my $node = @_[0];
my $xp = @_[1];
my $with_defs = @_[2];
my $uuid = $node->getAttribute('rdf:about');
my $label = $xp->find( 'skos:prefLabel', $node )->get_node(1);
my $broad = $xp->find( 'skos:broader', $node )->get_node(1);
$broad = $broad ? $broad->getAttribute('rdf:resource') : undef;
my %jhash = (
"uuid" => $uuid,
"label" => $label->string_value,
"broader" => $broad
);
my $def = $xp->find( 'skos:definition', $node )->get_node(1);
if ($def) {
if ($with_defs) {
$def = $def ? decode_entities( $def->string_value ) : undef;
$def =~ s/\r|\n/ /g; #This line will remove carriage returns
# print "def:", $def , "\n" if $def;
$jhash{"definition"} = $def;
}
else {
$jhash{"hasDefinition"} = 1;
}
}
my $children = $xp->find( 'skos:narrower', $node );
if ( $children->size() ) {
$jhash{children} = [ () ];
foreach my $child ( $children->get_nodelist ) {
my $cid = $child->getAttribute('rdf:resource');
my $childnode = $xp->find("/*/skos:Concept[\@rdf:about='$cid']");
my %val = json_node( $childnode->get_node(1), $xp, $with_defs );
push $jhash{children}, \%val;
my $parent = $child->getParentNode;
$parent->removeChild($child);
}
}
return %jhash;
}
__END__
#############################################################################
#USAGE of script
#############################################################################
=head1 NAME
fetchGCMD
=head1 SYNOPSIS
perl fetchGCMD.pl [-s --scheme=] [-d --delimeter=] [-j --json]
Options:
-s --scheme=scheme name, default=sciencekeywords,
see http://gcmdservices.gsfc.nasa.gov/kms/concept_schemes
-d --delimeter=delimiter, default=|
-j --json json output
-p --pretty pretty json output
-w --with-definitions
-help brief help message
-man full documentation
=head1 DESCRIPTION
This script will download and format the GCMD keywords for the indicated scheme.
The output is dumped to stdout, you can pipe to a file.
=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment