Created
January 7, 2011 14:54
-
-
Save nichtich/769542 to your computer and use it in GitHub Desktop.
Create CSV overview of a SKOS classification
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use utf8; | |
=head1 NAME | |
skosc2csv.pl - Create CSV overview of a SKOS classification | |
=head1 SYNOPSIS | |
skosc2csv.pl [ NTriples file ] | |
=cut | |
use Pod::Usage; | |
use Text::CSV; #::Encoded; | |
use Encode qw(decode); | |
our $VERSION = '0.12'; | |
my $arg = shift @ARGV; | |
my $input = \*STDIN; | |
my $output = \*STDOUT; | |
if ( $arg ) { | |
pod2usage(1) if $arg =~ /^(-\?|-h|--help)$/; | |
pod2usage(-verbose => 2) if $arg =~ /^-?-man$/; | |
open($input, '<', $arg) || die "Failed to open file $arg"; | |
} | |
binmode $output, ':utf8'; | |
=head1 DESCRIPTION | |
This script reads RDF data in NTriples format and creates an overview | |
of the classification that is encoded in this RDF data in SKOS format. | |
=head2 Input format | |
The input (read from STDIN or from a file that is specified as first | |
parameter) must be RDF in NTriples format. All but the following RDF | |
predicates are ignored: | |
=over 4 | |
=item rdf:type | |
=item skos:notation | |
=item skos:prefLabel | |
=item dcterms:title | |
Used if no prefLabel was found for some concept. | |
=item skos:broader | |
=item dcterms:subject | |
=item dcterms:extent | |
=item foaf:page | |
=back | |
=head2 Output format | |
The overview is given as CSV file with one class per line and the | |
following columns: | |
=over 4 | |
=item level | |
The class' hierarchy level, starting with 1. | |
=item path | |
The class' broader classes as path of notations, separated by C<E<lt>>. | |
For instane the class C<MT 87> (Community music) of the Library of Congress | |
classification is located at C<M E<lt> MT>. | |
=item | |
=back | |
=head1 EXAMPLE | |
The following examples uses blank RDF nodes C<_:lccM>, C<_:c1>, etc. for resources. | |
In practise at least your classes should have stable URIs. | |
=head2 input in NTriples format | |
<_:lccM> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2008/05/skos#Concept> . | |
<_:lccM> <http://www.w3.org/2008/05/skos#notation> "M" . | |
<_:lccM> <http://www.w3.org/2008/05/skos#prefLabel> "Music"@en . | |
<_:lccMT> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2008/05/skos#Concept> . | |
<_:lccMT> <http://www.w3.org/2008/05/skos#notation> "MT" . | |
<_:lccMT> <http://www.w3.org/2008/05/skos#broader> <_:lccM> . | |
<_:lccMT> <http://www.w3.org/2008/05/skos#prefLabel> "Musical instruction and study"@en . | |
<_:lccMT87> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2008/05/skos#Concept> . | |
<_:lccMT87> <http://www.w3.org/2008/05/skos#notation> "MT 87" . | |
<_:lccMT87> <http://www.w3.org/2008/05/skos#broader> <_:lccM> . | |
<_:lccMT87> <http://www.w3.org/2008/05/skos#prefLabel> "Community music"@en . | |
<_:c1> <http://purl.org/dc/terms/extent> "50"^^<http://www.w3.org/2001/XMLSchema#integer> . | |
<_:c1> <http://purl.org/dc/terms/subject> <_:lccMT> . | |
<_:c2> <http://purl.org/dc/terms/subject> <_:lccMT87> . | |
<_:c2> <http://purl.org/dc/terms/extent> "30"^^<http://www.w3.org/2001/XMLSchema#integer> . | |
=head2 input in Turtle format | |
Turtle format is much easier to read. You can also provide your classification | |
data in Turtle format and convert it, for instance with the 'rapper' command | |
line tool. | |
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . | |
@prefix skos: <http://www.w3.org/2008/05/skos#> . | |
@prefix dct: <http://purl.org/dc/terms/> . | |
# classes of the classification | |
<_:lccM> a skos:Concept ; | |
skos:notation "M" ; | |
skos:prefLabel "Music"@en . | |
<_:lccMT> a skos:Concept ; | |
skos:notation "MT" ; | |
skos:prefLabel "Musical instruction and study"@en ; | |
skos:broader <_:lccM> . | |
<_:lccMT87> a skos:Concept ; | |
skos:notation "MT 87" ; | |
skos:prefLabel "Community music"@en ; | |
skos:broader <_:lccM> . | |
# collections for some of the classes | |
<_:c1> dct:subject <_:lccMT> ; | |
dct:extent 50 . | |
<_:c2> dct:subject <_:lccMT87> ; | |
dct:extent 30 . | |
=head2 output as CSV | |
"level","path","notation","label" | |
"1","","M","Music",,"80" | |
"2","M","MT 87","Community music","30","30" | |
"2","M","MT","Musical instruction and study","50","50" | |
=head1 TODO | |
The current version does not use skos:prefLabel but skos:prefLabel. | |
Some classifications do not use unique notations (for instance | |
L<dewey.info>. | |
=cut | |
my %predicates = ( | |
'http://www.w3.org/2008/05/skos#notation' => 'notation', | |
'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' => 'a', | |
'http://purl.org/dc/terms/subject' => 'subject', | |
'http://purl.org/dc/terms/title' => 'title', | |
'http://purl.org/dc/elements/1.1/subject' => 'subject', | |
'http://purl.org/dc/elements/1.1/title' => 'title', | |
'http://www.w3.org/2008/05/skos#broader' => 'broader', | |
'http://www.w3.org/2008/05/skos#prefLabel' => 'label', | |
'http://xmlns.com/foaf/0.1/page' => 'page', | |
'http://purl.org/dc/terms/extent' => 'extent' | |
); | |
my $triples = {}; | |
### | |
### Parse NTriples format into a simple hash. This code is based on code | |
### from L<RDF::Trine::Parser::NTriples> to avoid including RDF::Trine. | |
### Using RDF::Trine would simplify this script, but add many dependencies. | |
### | |
my $lineno = 0; | |
while (defined(my $line = <$input>)) { | |
LINE: | |
($line, my @extra) = split(/\r\n|\r|\n/, $line, 2); | |
$lineno++; | |
next unless (defined($line) and length($line)); | |
next unless ($line =~ /\S/); | |
chomp($line); | |
$line =~ s/^\s*//; | |
$line =~ s/\s*$//; | |
next if ($line =~ /^#/); | |
my @nodes = (); | |
for my $which (qw(subject predicate object)) { | |
my $node; | |
# eat one node | |
$line =~ s/^\s*//; | |
my $char = length($line) ? substr($line, 0, 1) : '.'; | |
if ($char eq '.') { | |
# ignore | |
} elsif ($char eq '<') { | |
my ($uri) = $line =~ m/^<([^>]*)>/; | |
substr($line, 0, length($uri)+2) = ''; | |
$node = $uri; # TODO: _unescape($uri, $lineno) ); | |
} elsif ($char eq '_') { | |
my ($name) = $line =~ m/^_:([A-Za-z][A-Za-z0-9]*)/; | |
substr($line, 0, length($name)+2) = ''; | |
$node = "_:$name"; | |
} elsif ($char eq '"') { | |
substr($line, 0, 1) = ''; | |
my $value = decode('utf8', ''); | |
while (length($line) and substr($line, 0, 1) ne '"') { | |
while ($line =~ m/^([^"\\]+)/) { | |
$value .= $1; | |
substr($line,0,length($1)) = ''; | |
} | |
if (substr($line,0,1) eq '\\') { | |
while ($line =~ m/^\\(.)/) { | |
if ($1 eq 't') { | |
$value .= "\t"; | |
substr($line,0,2) = ''; | |
} elsif ($1 eq 'r') { | |
$value .= "\r"; | |
substr($line,0,2) = ''; | |
} elsif ($1 eq 'n') { | |
$value .= "\n"; | |
substr($line,0,2) = ''; | |
} elsif ($1 eq '"') { | |
$value .= '"'; | |
substr($line,0,2) = ''; | |
} elsif ($1 eq '\\') { | |
$value .= "\\"; | |
substr($line,0,2) = ''; | |
} elsif ($1 eq 'u') { | |
$line =~ m/^\\u([0-9A-F]{4})/ or die qq[Bad N-Triples \\u escape at line $lineno, near "$line"]; | |
$value .= chr(oct('0x' . $1)); | |
substr($line,0,6) = ''; | |
} elsif ($1 eq 'U') { | |
$line =~ m/^\\U([0-9A-F]{8})/ or die qq[Bad N-Triples \\U escape at line $lineno, near "$line"]; | |
$value .= chr(oct('0x' . $1)); | |
substr($line,0,10) = ''; | |
} else { | |
die qq[Not valid N-Triples escape character '\\$1' at line $lineno, near "$line"]; | |
} | |
} | |
} | |
} | |
if (substr($line,0,1) eq '"') { | |
substr($line,0,1) = ''; | |
} else { | |
die qq[Ending double quote not found at line $lineno]; | |
} | |
if ($line =~ m/^@([a-z]+(-[a-zA-Z0-9]+)*)/) { | |
my $lang = $1; | |
substr($line,0,1+length($lang)) = ''; | |
# ignore the language | |
$node = $value; | |
} elsif (substr($line,0,3) eq '^^<') { | |
substr($line,0,3) = ''; | |
my ($uri) = $line =~ m/^([^>]*)>/; | |
substr($line, 0, length($uri)+1) = ''; | |
# ignore the datatype | |
$node = $value; | |
} else { | |
$node = $value; | |
} | |
} else { | |
die qq[Not valid N-Triples node start character '$char' at line $lineno, near "$line"]; | |
} | |
die "expected $which at line $lineno" unless $node; | |
push(@nodes, $node); | |
$line =~ s/^\s*//; | |
} | |
$line =~ s/^\s//g; | |
unless ($line eq '.') { | |
die "Missing expected '.' at line $lineno"; | |
} | |
my $subj = $nodes[0]; | |
my $pred = $predicates{ $nodes[1] }; | |
my $obj = $nodes[2]; | |
if ( $pred ) { | |
if ( not exists $triples->{$subj} ) { | |
$triples->{$subj} = { $pred => [ $obj ] }; | |
#} elsif ( not exists $triples->{$subj}->{$pred} ) { | |
} else { | |
push @{ $triples->{$subj}->{$pred} }, $obj; | |
} | |
} | |
if (@extra) { | |
$line = shift(@extra); | |
goto LINE; | |
} | |
} | |
### | |
### Collect skos:Concepts and associated collections | |
### | |
my $concepts = {}; | |
my @top_concepts; # notations of top concepts | |
my $collections = {}; | |
# check whether a resource is a skos:Concept | |
sub isa_concept { | |
return unless $_[0] and $_[0]->{a}; | |
return grep { $_ eq 'http://www.w3.org/2008/05/skos#Concept' } | |
@{$_[0]->{a}}; | |
} | |
# chek whether an object is unique for its subject and predicate | |
sub get_uniqueobject { | |
return $_[0]->[0] if $_[0] and @{$_[0]} == 1; | |
} | |
sub get_concept_notation { | |
my $uri = shift; | |
my $concept = $triples->{$uri}; | |
return unless isa_concept( $concept ); | |
return get_uniqueobject( $concept->{notation} ); | |
} | |
while ( my ($uri,$c) = each %$triples ) { | |
my $message = ""; | |
if ( isa_concept($c) ) { | |
my $notation = get_uniqueobject( $c->{notation} ); | |
my $label = get_uniqueobject( $c->{label} ); | |
unless ( defined $label ) { | |
$label = get_uniqueobject( $c->{title} ); | |
} | |
if ( defined $notation and defined $label ) { | |
die "Notation $notation is not unique" | |
if $concepts->{ $notation }; | |
$concepts->{ $notation } = { | |
label => $label, | |
uri => $uri | |
}; | |
if ( $c->{broader} ) { | |
my $b_uri = $c->{broader}->[0]; | |
my $broader = get_concept_notation( $b_uri ); | |
if ( $broader ) { | |
$concepts->{ $notation }->{broader} = $broader; | |
} else { | |
$message = "Concept or notation not found: " | |
. $b_uri; | |
} | |
} else { | |
push @top_concepts, $notation; | |
} | |
} else { | |
$message = "$uri must have exactely one notation and one label"; | |
} | |
} elsif ( $c->{subject} ) { | |
# We may change this limitation in a future version: | |
my $c_uri = get_uniqueobject( $c->{subject} ); | |
if ($c_uri) { | |
my $notation = get_concept_notation( $c_uri ); | |
if ( $notation ) { | |
push @{ $collections->{$notation} }, | |
{ map { $_ => $c->{$_}->[0] } keys %$c }; | |
} else { | |
$message = "$c_uri (dct:subject of $uri) must be " | |
. "a skos:Concept with one skos:notation."; | |
} | |
} else { | |
$message = "$uri must have only one subject"; | |
} | |
} else { | |
next; # ignore all other resources | |
} | |
if ( $message ) { | |
print STDERR $message . "\n"; | |
} | |
} | |
### | |
### For now we only want one collection per concept | |
### | |
while ( my ($notation, $c) = each %$collections ) { | |
if (@$c != 1) { | |
print STDERR "$notation should only have one collection.\n"; | |
next; | |
} | |
# We will add 'foaf:page' in a later version | |
$concepts->{$notation}->{extent} = $c->[0]->{extent}; | |
} | |
### | |
### Now we have a nice hashref $concepts to validate | |
### | |
while ( my ($notation, $c) = each %$concepts ) { | |
next unless $c->{broader}; | |
push @{ $concepts->{ $c->{broader} }->{narrower} }, $notation; | |
} | |
sub traverse { | |
my $c = $concepts->{$_[0]}; | |
if ( $c->{visited} ) { | |
die "there is a loop or diamond in your 'tree'"; | |
exit; | |
} | |
$c->{visited} = 1; | |
$c->{size} = $c->{extent} unless $c->{size}; | |
return unless $c->{narrower}; | |
foreach my $n (@{$c->{narrower}}) { | |
traverse( $n ); | |
$c->{size} += $concepts->{$n}->{size} if ($concepts->{$n}->{size}); | |
} | |
} | |
traverse( $_ ) for @top_concepts; | |
delete $_->{visited} for values %$concepts; | |
my $csv = Text::CSV->new({binary => 1,always_quote=>1}); | |
# notation, path, level | |
sub print_stat { | |
my $n = shift; | |
my $path = shift || ""; | |
my $level = shift || 1; | |
my $c = $concepts->{$n}; | |
my @row = ($level, $path, $n, $c->{label}, $c->{extent}, $c->{size} ); | |
$csv->print( $output, \@row ); | |
print $output "\n"; | |
$path = "$path > " if $path; | |
foreach my $narrower (@{$c->{narrower}}) { | |
print_stat ( $narrower, "$path$n", $level+1 ); | |
} | |
} | |
$csv->print( $output, [qw(level path notation label)] ); | |
print $output "\n"; | |
foreach my $n (sort @top_concepts) { | |
print_stat( $n ); | |
} | |
=head1 COPYRIGHT | |
Copyright (c) 2010 Jakob Voss. All rights reserved. This program is | |
free software; you can redistribute it and/or modify it under the | |
same terms as Perl itself. | |
=cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment