Last active
November 19, 2016 06:05
-
-
Save jlblcc/8700993 to your computer and use it in GitHub Desktop.
A simple perl script that fetches and parses GCMD Keywords using LWP::UserAgent.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
select(STDERR); | |
$| = 1; | |
select(STDOUT); # default | |
$| = 1; | |
# | |
# Usage: perl fetchGCMD.pl [-s scheme:http://gcmdservices.gsfc.nasa.gov/kms/concept_schemes? ] [-d DELIMETER] | |
# | |
use strict; | |
use utf8; | |
use JSON; | |
use XML::XPath; | |
use XML::XPath::XMLParser; | |
use LWP::UserAgent; | |
use Getopt::Long qw< :config auto_version bundling no_ignore_case >; | |
use Pod::Usage; | |
use HTML::Entities; | |
binmode( STDOUT, ":utf8" ); | |
#my @required = qw(l); | |
my $scheme = 'sciencekeywords'; | |
my $delim = '|'; | |
my $json = ''; | |
my $pretty = ''; | |
my $with_defs = ''; | |
my $options = GetOptions( | |
's|scheme=s' => \$scheme, | |
'd|delimiter=s' => \$delim, | |
'j|json' => \$json, | |
'p|pretty' => \$pretty, | |
'w|with-definitions' => \$with_defs, | |
# Standard options | |
'usage' => sub { pod2usage(2) }, | |
'help' => sub { pod2usage(1) }, | |
'man' => sub { pod2usage( -exitstatus => 0, -verbose => 2 ) }, | |
) or die pod2usage(); # dies if an unlisted option is given | |
my $site = | |
"http://gcmdservices.gsfc.nasa.gov/kms/concepts/concept_scheme/$scheme/?format=rdf"; | |
# You can also specify a filename, you will probably want to comment out the LWP stuff | |
# my $xp = XML::XPath->new(filename => '/var/rdf/GCMD_keywords.rdf'); | |
my $ua = LWP::UserAgent->new; | |
my $req = HTTP::Request->new( GET => $site ); | |
my $resp = $ua->request($req); | |
my $xml = $resp->decoded_content; | |
if ( $resp->is_success ) { | |
my $xp = XML::XPath->new( xml => $xml ); | |
$xp->set_namespace( 'rdf', "http://www.w3.org/1999/02/22-rdf-syntax-ns#" ); | |
$xp->set_namespace( 'skos', "http://www.w3.org/2004/02/skos/core#" ); | |
$xp->set_namespace( 'gcmd', "http://gcmd.gsfc.nasa.gov/" ); | |
my $version = $xp->find('rdf:RDF/gcmd:keywordVersion')->string_value; | |
if ($json) { | |
# find all root parents | |
my $nodeset = $xp->find('/*/skos:Concept[not(skos:broader)]'); | |
my @root; | |
foreach my $node ( $nodeset->get_nodelist ) { | |
# print "FOUND\n\n", XML::XPath::XMLParser::as_string($node), "\n\n"; | |
my %obj = json_node( $node, $xp, $with_defs ); | |
push @root, \%obj; | |
} | |
print STDOUT to_json( | |
\@root, | |
{ | |
'utf8' => 1, | |
'pretty' => $pretty, | |
} | |
); | |
} | |
else { | |
print "GCMD ", $scheme, "Keywords Version:", $version, "\n"; | |
print "uuid", $delim, "label", $delim, "definition", $delim, "broader", | |
"\n"; | |
my $nodeset = $xp->find('//skos:Concept'); # find all concepts | |
foreach my $node ( $nodeset->get_nodelist ) { | |
my $uuid = $node->getAttribute('rdf:about'); | |
# print "uuid:", $uuid, "\n" if $uuid; | |
print $uuid , $delim; | |
my $label = $xp->find( 'skos:prefLabel', $node )->get_node(1); | |
# print "label:", $label->string_value , "\n" if $label; | |
print $label->string_value, $delim; | |
if ($with_defs) { | |
my $def = $xp->find( 'skos:definition', $node )->get_node(1); | |
$def = $def ? decode_entities( $def->string_value ) : undef; | |
$def =~ s/\r|\n/ /g; #This line will remove carriage returns | |
# print "def:", $def , "\n" if $def; | |
print $def , $delim; | |
} | |
else { | |
print $delim; | |
} | |
my $broad = $xp->find( 'skos:broader', $node )->get_node(1); | |
$broad = $broad ? $broad->getAttribute('rdf:resource') : undef; | |
# print "broader:", $broad , "\n" if $broad; | |
print $broad, "\n"; | |
} | |
} | |
} | |
else { | |
die 'Error: Could not download Keywords. Please check options.'; | |
} | |
sub json_node { | |
my $node = @_[0]; | |
my $xp = @_[1]; | |
my $with_defs = @_[2]; | |
my $uuid = $node->getAttribute('rdf:about'); | |
my $label = $xp->find( 'skos:prefLabel', $node )->get_node(1); | |
my $broad = $xp->find( 'skos:broader', $node )->get_node(1); | |
$broad = $broad ? $broad->getAttribute('rdf:resource') : undef; | |
my %jhash = ( | |
"uuid" => $uuid, | |
"label" => $label->string_value, | |
"broader" => $broad | |
); | |
my $def = $xp->find( 'skos:definition', $node )->get_node(1); | |
if ($def) { | |
if ($with_defs) { | |
$def = $def ? decode_entities( $def->string_value ) : undef; | |
$def =~ s/\r|\n/ /g; #This line will remove carriage returns | |
# print "def:", $def , "\n" if $def; | |
$jhash{"definition"} = $def; | |
} | |
else { | |
$jhash{"hasDefinition"} = 1; | |
} | |
} | |
my $children = $xp->find( 'skos:narrower', $node ); | |
if ( $children->size() ) { | |
$jhash{children} = [ () ]; | |
foreach my $child ( $children->get_nodelist ) { | |
my $cid = $child->getAttribute('rdf:resource'); | |
my $childnode = $xp->find("/*/skos:Concept[\@rdf:about='$cid']"); | |
my %val = json_node( $childnode->get_node(1), $xp, $with_defs ); | |
push $jhash{children}, \%val; | |
my $parent = $child->getParentNode; | |
$parent->removeChild($child); | |
} | |
} | |
return %jhash; | |
} | |
__END__ | |
############################################################################# | |
#USAGE of script | |
############################################################################# | |
=head1 NAME | |
fetchGCMD | |
=head1 SYNOPSIS | |
perl fetchGCMD.pl [-s --scheme=] [-d --delimeter=] [-j --json] | |
Options: | |
-s --scheme=scheme name, default=sciencekeywords, | |
see http://gcmdservices.gsfc.nasa.gov/kms/concept_schemes | |
-d --delimeter=delimiter, default=| | |
-j --json json output | |
-p --pretty pretty json output | |
-w --with-definitions | |
-help brief help message | |
-man full documentation | |
=head1 DESCRIPTION | |
This script will download and format the GCMD keywords for the indicated scheme. | |
The output is dumped to stdout, you can pipe to a file. | |
=cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment