Skip to content

Instantly share code, notes, and snippets.

@gaurav
Created October 2, 2012 08:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gaurav/3817213 to your computer and use it in GitHub Desktop.
Save gaurav/3817213 to your computer and use it in GitHub Desktop.
A file to split a DwC-A file into triples
#!/usr/bin/perl -w
use strict;
use warnings;
use v5.10.0;
=head1 NAME
delimit2rdf.pl -- Delimited file to RDF
=head1 SYNOPSIS
delimit2rdf.pl < input.txt > output.rdf
=head1 DESCRIPTION
This script is basically the core of a DwC-A to
RDF converter. It uses the information from the core/extension
nodes in meta.xml (see http://rs.tdwg.org/dwc/terms/guides/text/index.htm#metafile
for more details) to convert a tab-delimited file
into an RDF file.
=cut
use RDF::Helper;
# Put all the conversion information here.
# The variable names follow the attributes of a core/extension node in
# meta.xml (see the DESCRIPTION above).
my $encoding = undef; # Not implemented yet.
my $fieldsTerminatedBy = "\t";
my $linesTerminatedBy = "\n";
my $fieldsEnclosedBy = undef; # Not implemented yet.
my $ignoreHeaderLines = 0;
my $rowType = "http://rs.tdwg.org/dwc/terms/Taxon";

# One entry per input column, in column order. Each entry is the term URI
# used as the predicate for that column; the first entry must stay undef
# because column 0 holds the row's primary id rather than a term value.
my @fields = (
    undef, # Use this field as a primary id.
    'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
    'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
    'http://rs.tdwg.org/dwc/terms/scientificName',
    'http://rs.gbif.org/terms/1.0/canonicalName',
    'http://rs.tdwg.org/dwc/terms/taxonRank',
    'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
    'http://rs.tdwg.org/dwc/terms/nomenclaturalStatus',
    'http://rs.tdwg.org/dwc/terms/genus',
    'http://rs.tdwg.org/dwc/terms/specificEpithet',
    'http://rs.tdwg.org/dwc/terms/infraspecificEpithet',
    'http://rs.tdwg.org/dwc/terms/namePublishedIn',
    'http://rs.tdwg.org/dwc/terms/nameAccordingTo',
    'http://rs.tdwg.org/dwc/terms/kingdom',
    'http://rs.tdwg.org/dwc/terms/phylum',
    'http://rs.tdwg.org/dwc/terms/class',
    'http://rs.tdwg.org/dwc/terms/order',
    'http://rs.tdwg.org/dwc/terms/family'
);

# Check the input.
# Ids seen so far; used while converting to detect duplicate primary keys.
my %primary_key_check;

# Refuse to run with settings this script cannot honour yet.
die "encoding not yet supported! Use utf8 please." if defined $encoding;
die "fieldsEnclosedBy is not yet supported!" if defined $fieldsEnclosedBy;
die "The first field must be a primary key index!" if defined $fields[0];
# Conversion time!
=comment
my $rdf = RDF::Helper->new(
BaseInterface => 'RDF::Trine',
namespaces => {
'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'dwc' => 'http://rs.tdwg.org/dwc/terms/',
'gbifterms' => 'http://rs.gbif.org/terms/1.0/',
'gbifnub' => 'http://gaurav.github.com/gbif-nub-dwca/'
},
ExpandQNames => 1
);
=cut
# Write the Turtle @prefix declarations to standard output. The trailing
# empty element, together with the newline added by say, leaves one blank
# line between the declarations and the first resource.
say join(
    "\n",
    '@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .',
    '@prefix dwc: <http://rs.tdwg.org/dwc/terms/> .',
    '@prefix gbifterms: <http://rs.gbif.org/terms/1.0/> .',
    '@prefix gbifnub_id: <http://gaurav.github.com/gbif-nub-dwca/id_> .',
    '@prefix gbifnub_name: <http://gaurav.github.com/gbif-nub-dwca/name_> .',
    '',
);
# Main conversion: read delimited rows from the files named on the command
# line (or standard input), write one Turtle resource description per row to
# standard output, and report progress on standard error.
say STDERR "Started conversion.";
my $count_resources = 0;
my $count_triples = 0;
{
    local $/ = $linesTerminatedBy; # Line terminator

    # ignoreHeaderLines is a number of lines, not a flag: skip that many.
    for my $header_number (1 .. $ignoreHeaderLines) {
        my $header = <>;
        last unless defined $header;
    }

    while (defined(my $line = <>)) {
        chomp $line;

        # Quote the delimiter so it is always matched literally, even if it
        # is later changed to a regex metacharacter such as '|'.
        my @records = split /\Q$fieldsTerminatedBy\E/, $line;

        # Validate the id before it is used as a hash key or written out;
        # a blank line yields no fields at all.
        my $index = $records[0];
        die "Invalid first field" unless defined $index and length $index;

        die "Duplicate id: $index" if exists $primary_key_check{$index};
        $primary_key_check{$index} = 1;

        # NOTE(review): wrapping this prefixed name in <...> makes Turtle
        # read it as a literal IRI, so the gbifnub_id: prefix is not
        # expanded. Kept as-is so subjects stay identical to the
        # parentNameUsageID references built by convert_term_value.
        my $resource = 'gbifnub_id:' . $index;
        # $rdf->assert_resource($resource, 'rdf:type', $rowType);
        say "<$resource> rdf:type <$rowType> ;\n";

        # Column 0 is the id; every later column maps to $fields[$column].
        for my $column (1 .. $#records) {
            my $term_name = $fields[$column];

            # A column without a mapped term cannot become a predicate.
            next unless defined $term_name;

            my $term_value = convert_term_value($term_name, $records[$column]);
            next unless defined $term_value;

            say "\t<$term_name> $term_value;";
            $count_triples++;
        }
        say ".";

        # Column 3 is the scientificName column in @fields; short rows may
        # not have it.
        my $label = $records[3] // '';
        say STDERR "Processed '$label' (#$index).";
        $count_resources++;
    }
}
# print $rdf->serialize(format => 'rdfxml');
say STDERR "Conversion completed, $count_resources resources processed, $count_triples created.";
# convert_term_value($term_name, $term_value)
#
# Turn one field value into the Turtle object to write for its term.
#
# Arguments:
#   $term_name  - URI of the term the column is mapped to.
#   $term_value - raw text of the field; undef is treated as ''.
#
# Returns the Turtle object as a string, or nothing (undef in scalar
# context) when no triple should be written for the field.
#
# Empty values are suppressed only for parentNameUsageID,
# acceptedNameUsageID and scientificName; every other term yields a quoted
# literal, even when the value is empty.
sub convert_term_value {
    my ($term_name, $term_value) = @_;

    $term_name  //= '';
    $term_value //= '';

    my %suppress_when_empty = map { $_ => 1 } (
        'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
        'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
        'http://rs.tdwg.org/dwc/terms/scientificName',
    );
    return if $term_value eq '' && $suppress_when_empty{$term_name};

    if ($term_name eq 'http://rs.tdwg.org/dwc/terms/parentNameUsageID') {
        # The parent is a reference to another row, not a literal.
        # NOTE(review): the <...> around a prefixed name stops Turtle from
        # expanding the gbifnub_id: prefix. Kept unchanged so references
        # match the subject form written by the main loop.
        return "<gbifnub_id:$term_value>";
    }

    # Terms without special handling yet:
    #   canonicalName, taxonRank, taxonomicStatus, nomenclaturalStatus,
    #   genus, specificEpithet, infraspecificEpithet, namePublishedIn,
    #   nameAccordingTo, kingdom, phylum, class, order, family
    return _quote_turtle_literal($term_value);
}

# Wrap $text in double quotes, backslash-escaping the characters that may
# not appear raw inside a short Turtle string literal.
sub _quote_turtle_literal {
    my ($text) = @_;

    my %escape_for = (
        "\\" => "\\\\",
        "\"" => "\\\"",
        "\n" => "\\n",
        "\r" => "\\r",
        "\t" => "\\t",
    );
    (my $escaped = $text) =~ s/([\\"\n\r\t])/$escape_for{$1}/g;

    return "\"$escaped\"";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment