Skip to content

Instantly share code, notes, and snippets.

@nichtich
Created January 7, 2011 14:54
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nichtich/769542 to your computer and use it in GitHub Desktop.
Save nichtich/769542 to your computer and use it in GitHub Desktop.
Create CSV overview of a SKOS classification
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
=head1 NAME
skosc2csv.pl - Create CSV overview of a SKOS classification
=head1 SYNOPSIS
skosc2csv.pl [ NTriples file ]
=cut
use Pod::Usage;
use Text::CSV; #::Encoded;
use Encode qw(decode);
our $VERSION = '0.12';
my $arg = shift @ARGV;
my $input = \*STDIN;
my $output = \*STDOUT;
if ( $arg ) {
pod2usage(1) if $arg =~ /^(-\?|-h|--help)$/;
pod2usage(-verbose => 2) if $arg =~ /^-?-man$/;
open($input, '<', $arg) || die "Failed to open file $arg";
}
binmode $output, ':utf8';
=head1 DESCRIPTION
This script reads RDF data in NTriples format and creates an overview
of the classification that is encoded in this RDF data in SKOS format.
=head2 Input format
The input (read from STDIN or from a file that is specified as first
parameter) must be RDF in NTriples format. All but the following RDF
predicates are ignored:
=over 4
=item rdf:type
=item skos:notation
=item skos:prefLabel
=item dcterms:title
Used if no prefLabel was found for some concept.
=item skos:broader
=item dcterms:subject
=item dcterms:extent
=item foaf:page
=back
=head2 Output format
The overview is given as CSV file with one class per line and the
following columns:
=over 4
=item level
The class' hierarchy level, starting with 1.
=item path
The class' broader classes as path of notations, separated by C<E<lt>>.
For instane the class C<MT 87> (Community music) of the Library of Congress
classification is located at C<M E<lt> MT>.
=item
=back
=head1 EXAMPLE
The following examples uses blank RDF nodes C<_:lccM>, C<_:c1>, etc. for resources.
In practise at least your classes should have stable URIs.
=head2 input in NTriples format
<_:lccM> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2008/05/skos#Concept> .
<_:lccM> <http://www.w3.org/2008/05/skos#notation> "M" .
<_:lccM> <http://www.w3.org/2008/05/skos#prefLabel> "Music"@en .
<_:lccMT> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2008/05/skos#Concept> .
<_:lccMT> <http://www.w3.org/2008/05/skos#notation> "MT" .
<_:lccMT> <http://www.w3.org/2008/05/skos#broader> <_:lccM> .
<_:lccMT> <http://www.w3.org/2008/05/skos#prefLabel> "Musical instruction and study"@en .
<_:lccMT87> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2008/05/skos#Concept> .
<_:lccMT87> <http://www.w3.org/2008/05/skos#notation> "MT 87" .
<_:lccMT87> <http://www.w3.org/2008/05/skos#broader> <_:lccM> .
<_:lccMT87> <http://www.w3.org/2008/05/skos#prefLabel> "Community music"@en .
<_:c1> <http://purl.org/dc/terms/extent> "50"^^<http://www.w3.org/2001/XMLSchema#integer> .
<_:c1> <http://purl.org/dc/terms/subject> <_:lccMT> .
<_:c2> <http://purl.org/dc/terms/subject> <_:lccMT87> .
<_:c2> <http://purl.org/dc/terms/extent> "30"^^<http://www.w3.org/2001/XMLSchema#integer> .
=head2 input in Turtle format
Turtle format is much easier to read. You can also provide your classification
data in Turtle format and convert it, for instance with the 'rapper' command
line tool.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix skos: <http://www.w3.org/2008/05/skos#> .
@prefix dct: <http://purl.org/dc/terms/> .
# classes of the classification
<_:lccM> a skos:Concept ;
skos:notation "M" ;
skos:prefLabel "Music"@en .
<_:lccMT> a skos:Concept ;
skos:notation "MT" ;
skos:prefLabel "Musical instruction and study"@en ;
skos:broader <_:lccM> .
<_:lccMT87> a skos:Concept ;
skos:notation "MT 87" ;
skos:prefLabel "Community music"@en ;
skos:broader <_:lccM> .
# collections for some of the classes
<_:c1> dct:subject <_:lccMT> ;
dct:extent 50 .
<_:c2> dct:subject <_:lccMT87> ;
dct:extent 30 .
=head2 output as CSV
"level","path","notation","label"
"1","","M","Music",,"80"
"2","M","MT 87","Community music","30","30"
"2","M","MT","Musical instruction and study","50","50"
=head1 TODO
The current version does not use skos:prefLabel but skos:prefLabel.
Some classifications do not use unique notations (for instance
L<dewey.info>.
=cut
my %predicates = (
'http://www.w3.org/2008/05/skos#notation' => 'notation',
'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' => 'a',
'http://purl.org/dc/terms/subject' => 'subject',
'http://purl.org/dc/terms/title' => 'title',
'http://purl.org/dc/elements/1.1/subject' => 'subject',
'http://purl.org/dc/elements/1.1/title' => 'title',
'http://www.w3.org/2008/05/skos#broader' => 'broader',
'http://www.w3.org/2008/05/skos#prefLabel' => 'label',
'http://xmlns.com/foaf/0.1/page' => 'page',
'http://purl.org/dc/terms/extent' => 'extent'
);
my $triples = {};
###
### Parse NTriples format into a simple hash. This code is based on code
### from L<RDF::Trine::Parser::NTriples> to avoid including RDF::Trine.
### Using RDF::Trine would simplify this script, but add many dependencies.
###
my $lineno = 0;
while (defined(my $line = <$input>)) {
LINE:
($line, my @extra) = split(/\r\n|\r|\n/, $line, 2);
$lineno++;
next unless (defined($line) and length($line));
next unless ($line =~ /\S/);
chomp($line);
$line =~ s/^\s*//;
$line =~ s/\s*$//;
next if ($line =~ /^#/);
my @nodes = ();
for my $which (qw(subject predicate object)) {
my $node;
# eat one node
$line =~ s/^\s*//;
my $char = length($line) ? substr($line, 0, 1) : '.';
if ($char eq '.') {
# ignore
} elsif ($char eq '<') {
my ($uri) = $line =~ m/^<([^>]*)>/;
substr($line, 0, length($uri)+2) = '';
$node = $uri; # TODO: _unescape($uri, $lineno) );
} elsif ($char eq '_') {
my ($name) = $line =~ m/^_:([A-Za-z][A-Za-z0-9]*)/;
substr($line, 0, length($name)+2) = '';
$node = "_:$name";
} elsif ($char eq '"') {
substr($line, 0, 1) = '';
my $value = decode('utf8', '');
while (length($line) and substr($line, 0, 1) ne '"') {
while ($line =~ m/^([^"\\]+)/) {
$value .= $1;
substr($line,0,length($1)) = '';
}
if (substr($line,0,1) eq '\\') {
while ($line =~ m/^\\(.)/) {
if ($1 eq 't') {
$value .= "\t";
substr($line,0,2) = '';
} elsif ($1 eq 'r') {
$value .= "\r";
substr($line,0,2) = '';
} elsif ($1 eq 'n') {
$value .= "\n";
substr($line,0,2) = '';
} elsif ($1 eq '"') {
$value .= '"';
substr($line,0,2) = '';
} elsif ($1 eq '\\') {
$value .= "\\";
substr($line,0,2) = '';
} elsif ($1 eq 'u') {
$line =~ m/^\\u([0-9A-F]{4})/ or die qq[Bad N-Triples \\u escape at line $lineno, near "$line"];
$value .= chr(oct('0x' . $1));
substr($line,0,6) = '';
} elsif ($1 eq 'U') {
$line =~ m/^\\U([0-9A-F]{8})/ or die qq[Bad N-Triples \\U escape at line $lineno, near "$line"];
$value .= chr(oct('0x' . $1));
substr($line,0,10) = '';
} else {
die qq[Not valid N-Triples escape character '\\$1' at line $lineno, near "$line"];
}
}
}
}
if (substr($line,0,1) eq '"') {
substr($line,0,1) = '';
} else {
die qq[Ending double quote not found at line $lineno];
}
if ($line =~ m/^@([a-z]+(-[a-zA-Z0-9]+)*)/) {
my $lang = $1;
substr($line,0,1+length($lang)) = '';
# ignore the language
$node = $value;
} elsif (substr($line,0,3) eq '^^<') {
substr($line,0,3) = '';
my ($uri) = $line =~ m/^([^>]*)>/;
substr($line, 0, length($uri)+1) = '';
# ignore the datatype
$node = $value;
} else {
$node = $value;
}
} else {
die qq[Not valid N-Triples node start character '$char' at line $lineno, near "$line"];
}
die "expected $which at line $lineno" unless $node;
push(@nodes, $node);
$line =~ s/^\s*//;
}
$line =~ s/^\s//g;
unless ($line eq '.') {
die "Missing expected '.' at line $lineno";
}
my $subj = $nodes[0];
my $pred = $predicates{ $nodes[1] };
my $obj = $nodes[2];
if ( $pred ) {
if ( not exists $triples->{$subj} ) {
$triples->{$subj} = { $pred => [ $obj ] };
#} elsif ( not exists $triples->{$subj}->{$pred} ) {
} else {
push @{ $triples->{$subj}->{$pred} }, $obj;
}
}
if (@extra) {
$line = shift(@extra);
goto LINE;
}
}
###
### Collect skos:Concepts and associated collections
###
my $concepts = {};
my @top_concepts; # notations of top concepts
my $collections = {};
# check whether a resource is a skos:Concept
sub isa_concept {
return unless $_[0] and $_[0]->{a};
return grep { $_ eq 'http://www.w3.org/2008/05/skos#Concept' }
@{$_[0]->{a}};
}
# chek whether an object is unique for its subject and predicate
sub get_uniqueobject {
return $_[0]->[0] if $_[0] and @{$_[0]} == 1;
}
sub get_concept_notation {
my $uri = shift;
my $concept = $triples->{$uri};
return unless isa_concept( $concept );
return get_uniqueobject( $concept->{notation} );
}
while ( my ($uri,$c) = each %$triples ) {
my $message = "";
if ( isa_concept($c) ) {
my $notation = get_uniqueobject( $c->{notation} );
my $label = get_uniqueobject( $c->{label} );
unless ( defined $label ) {
$label = get_uniqueobject( $c->{title} );
}
if ( defined $notation and defined $label ) {
die "Notation $notation is not unique"
if $concepts->{ $notation };
$concepts->{ $notation } = {
label => $label,
uri => $uri
};
if ( $c->{broader} ) {
my $b_uri = $c->{broader}->[0];
my $broader = get_concept_notation( $b_uri );
if ( $broader ) {
$concepts->{ $notation }->{broader} = $broader;
} else {
$message = "Concept or notation not found: "
. $b_uri;
}
} else {
push @top_concepts, $notation;
}
} else {
$message = "$uri must have exactely one notation and one label";
}
} elsif ( $c->{subject} ) {
# We may change this limitation in a future version:
my $c_uri = get_uniqueobject( $c->{subject} );
if ($c_uri) {
my $notation = get_concept_notation( $c_uri );
if ( $notation ) {
push @{ $collections->{$notation} },
{ map { $_ => $c->{$_}->[0] } keys %$c };
} else {
$message = "$c_uri (dct:subject of $uri) must be "
. "a skos:Concept with one skos:notation.";
}
} else {
$message = "$uri must have only one subject";
}
} else {
next; # ignore all other resources
}
if ( $message ) {
print STDERR $message . "\n";
}
}
###
### For now we only want one collection per concept
###
while ( my ($notation, $c) = each %$collections ) {
if (@$c != 1) {
print STDERR "$notation should only have one collection.\n";
next;
}
# We will add 'foaf:page' in a later version
$concepts->{$notation}->{extent} = $c->[0]->{extent};
}
###
### Now we have a nice hashref $concepts to validate
###
while ( my ($notation, $c) = each %$concepts ) {
next unless $c->{broader};
push @{ $concepts->{ $c->{broader} }->{narrower} }, $notation;
}
sub traverse {
my $c = $concepts->{$_[0]};
if ( $c->{visited} ) {
die "there is a loop or diamond in your 'tree'";
exit;
}
$c->{visited} = 1;
$c->{size} = $c->{extent} unless $c->{size};
return unless $c->{narrower};
foreach my $n (@{$c->{narrower}}) {
traverse( $n );
$c->{size} += $concepts->{$n}->{size} if ($concepts->{$n}->{size});
}
}
traverse( $_ ) for @top_concepts;
delete $_->{visited} for values %$concepts;
my $csv = Text::CSV->new({binary => 1,always_quote=>1});
# notation, path, level
sub print_stat {
my $n = shift;
my $path = shift || "";
my $level = shift || 1;
my $c = $concepts->{$n};
my @row = ($level, $path, $n, $c->{label}, $c->{extent}, $c->{size} );
$csv->print( $output, \@row );
print $output "\n";
$path = "$path > " if $path;
foreach my $narrower (@{$c->{narrower}}) {
print_stat ( $narrower, "$path$n", $level+1 );
}
}
$csv->print( $output, [qw(level path notation label)] );
print $output "\n";
foreach my $n (sort @top_concepts) {
print_stat( $n );
}
=head1 COPYRIGHT
Copyright (c) 2010 Jakob Voss. All rights reserved. This program is
free software; you can redistribute it and/or modify it under the
same terms as Perl itself.
=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment