Created
October 2, 2012 08:03
-
-
Save gaurav/3817213 to your computer and use it in GitHub Desktop.
A file to split a DwC-A file into triples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
use warnings; | |
use v5.10.0; | |
=head1 NAME | |
delimit2rdf.pl -- Delimited file to RDF | |
=head1 SYNOPSIS | |
delimit2rdf.pl < input.txt > output.rdf | |
=head1 DESCRIPTION | |
This script is basically the core of a DwC-A to
RDF converter. It uses the information in the core/extension
nodes of meta.xml (see http://rs.tdwg.org/dwc/terms/guides/text/index.htm#metafile
for more details) to convert a tab-delimited file
into an RDF file.
=cut | |
use RDF::Helper; | |
# Put all the conversion information here.
# The setting names follow the attributes of a core/extension node in a
# DwC-A meta.xml file.
my $encoding = undef;             # Not implemented yet.
my $fieldsTerminatedBy = "\t";    # Column separator within a row.
my $linesTerminatedBy = "\n";     # Row separator (used as $/ while reading).
my $fieldsEnclosedBy = undef;     # Not implemented yet.
my $ignoreHeaderLines = 0;        # Header lines to skip before the data.
my $rowType = "http://rs.tdwg.org/dwc/terms/Taxon";

# Term IRI for each input column, by position. Column 0 carries no term:
# it is the primary id from which the resource name is built.
my @fields = (
    undef,    # Use this field as a primary id.
    'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
    'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
    'http://rs.tdwg.org/dwc/terms/scientificName',
    'http://rs.gbif.org/terms/1.0/canonicalName',
    'http://rs.tdwg.org/dwc/terms/taxonRank',
    'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
    'http://rs.tdwg.org/dwc/terms/nomenclaturalStatus',
    'http://rs.tdwg.org/dwc/terms/genus',
    'http://rs.tdwg.org/dwc/terms/specificEpithet',
    'http://rs.tdwg.org/dwc/terms/infraspecificEpithet',
    'http://rs.tdwg.org/dwc/terms/namePublishedIn',
    'http://rs.tdwg.org/dwc/terms/nameAccordingTo',
    'http://rs.tdwg.org/dwc/terms/kingdom',
    'http://rs.tdwg.org/dwc/terms/phylum',
    'http://rs.tdwg.org/dwc/terms/class',
    'http://rs.tdwg.org/dwc/terms/order',
    'http://rs.tdwg.org/dwc/terms/family'
);

# Check the input.
my %primary_key_check;    # Ids already seen, used to reject duplicates.

# Refuse settings this script cannot honour rather than silently
# producing wrong output.
die "encoding not yet supported! Use utf8 please." if defined $encoding;
die "fieldsEnclosedBy is not yet supported!" if defined $fieldsEnclosedBy;
die "The first field must be a primary key index!" if defined $fields[0];
# Conversion time!
#
# Reads delimited rows from the files named on the command line (or STDIN),
# and prints one Turtle resource per row to STDOUT. Progress and the final
# summary go to STDERR.

=comment

my $rdf = RDF::Helper->new(
    BaseInterface => 'RDF::Trine',
    namespaces => {
        'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'dwc' => 'http://rs.tdwg.org/dwc/terms/',
        'gbifterms' => 'http://rs.gbif.org/terms/1.0/',
        'gbifnub' => 'http://gaurav.github.com/gbif-nub-dwca/'
    },
    ExpandQNames => 1
);

=cut

say <<'HEADER';
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix dwc: <http://rs.tdwg.org/dwc/terms/> .
@prefix gbifterms: <http://rs.gbif.org/terms/1.0/> .
@prefix gbifnub_id: <http://gaurav.github.com/gbif-nub-dwca/id_> .
@prefix gbifnub_name: <http://gaurav.github.com/gbif-nub-dwca/name_> .
HEADER

say STDERR "Started conversion.";

my $count_resources = 0;
my $count_triples = 0;

{
    local $/ = $linesTerminatedBy;    # Line terminator

    # ignoreHeaderLines is a count, so skip that many lines (stopping
    # early if the input is shorter than the header).
    for my $header_number (1 .. ($ignoreHeaderLines || 0)) {
        my $header = <>;
        last unless defined $header;
    }

    # Quote the delimiter so characters such as '|' or '.' are matched
    # literally instead of being interpreted as a regular expression.
    my $field_separator = qr/\Q$fieldsTerminatedBy\E/;

    while (my $line = <>) {
        chomp $line;
        my @records = split $field_separator, $line;

        # Validate the primary id before it is used as a hash key or
        # printed; a blank line or an empty first column has no usable id.
        my $index = $records[0];
        die "Invalid first field" unless defined $index and length $index;
        die "Duplicate id: $index" if exists $primary_key_check{$index};
        $primary_key_check{$index} = 1;

        # NOTE(review): the subject is a prefixed name wrapped in <>, which
        # Turtle reads as an IRI reference rather than expanding the
        # gbifnub_id: prefix. Kept unchanged so subjects keep matching the
        # parentNameUsageID objects built by convert_term_value(); confirm
        # and change both places together.
        my $resource = 'gbifnub_id:' . $index;
        # $rdf->assert_resource($resource, 'rdf:type', $rowType);
        say "<$resource> rdf:type <$rowType> ;\n";

        # Column 0 is the id handled above; every other column becomes one
        # predicate/object pair on the current subject.
        for my $column (1 .. $#records) {
            my $term_name = $fields[$column];

            # Columns without a declared term (more columns in the row
            # than entries in @fields) have no predicate to use; skip them.
            next unless defined $term_name;

            my $term_value = convert_term_value($term_name, $records[$column]);
            next unless defined $term_value;

            say "\t<$term_name> $term_value;";
            $count_triples++;
        }
        say ".";

        # Column 3 holds the scientific name in the @fields layout; short
        # rows may not have it, so fall back to an empty label.
        my $label = $records[3] // '';
        say STDERR "Processed '$label' (#$index).";
        $count_resources++;
    }
}

# print $rdf->serialize(format => 'rdfxml');
say STDERR "Conversion completed, $count_resources resources processed, $count_triples created.";
# Convert one raw field value into the Turtle object for the given term.
#
# Arguments:
#   $term_name  - full IRI of the term that the column is mapped to
#   $term_value - raw text of the field (undef is treated as empty)
#
# Returns the object as a string ready to be printed after the predicate:
#   * parentNameUsageID      -> "<gbifnub_id:VALUE>", or undef when empty
#   * acceptedNameUsageID,
#     scientificName         -> quoted literal, or undef when empty
#   * any other term         -> quoted literal (an empty value gives '""')
# undef tells the caller to emit no triple for this field.
sub convert_term_value {
    my ($term_name, $term_value) = @_;

    # Normalise missing arguments so the comparisons below cannot warn.
    $term_name  //= '';
    $term_value //= '';

    # Terms for which an empty field means "no triple at all".
    my %skip_when_empty = map { $_ => 1 } (
        'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
        'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
        'http://rs.tdwg.org/dwc/terms/scientificName',
    );

    # Explicit undef (not a bare return) so callers always receive exactly
    # one value, whatever context they call in.
    return undef if $skip_when_empty{$term_name} and $term_value eq '';

    if ($term_name eq 'http://rs.tdwg.org/dwc/terms/parentNameUsageID') {
        # NOTE(review): this is a prefixed name wrapped in <>, which Turtle
        # reads as an IRI reference rather than expanding the gbifnub_id:
        # prefix. Kept unchanged so it keeps matching the row subjects the
        # main loop prints; confirm and change both places together.
        return "<gbifnub_id:$term_value>";
    }

    # Everything else is a plain string literal. A Turtle "..." literal may
    # not contain a raw backslash, double quote, line feed or carriage
    # return, so escape those; all other text passes through untouched.
    my %escape_for = (
        "\\" => "\\\\",
        "\"" => "\\\"",
        "\n" => "\\n",
        "\r" => "\\r",
    );
    (my $escaped = $term_value) =~ s/([\\"\n\r])/$escape_for{$1}/g;

    return "\"$escaped\"";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment