Skip to content

Instantly share code, notes, and snippets.

@icklecows
Last active January 27, 2017 11:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save icklecows/1ba84a5cd5f0ecb5f30ac480cc9de8db to your computer and use it in GitHub Desktop.
Save icklecows/1ba84a5cd5f0ecb5f30ac480cc9de8db to your computer and use it in GitHub Desktop.
Preliminary work on Frictionless Data

This perl module is not finished. It was started as a library to implement the Data Package specification set out here:

http://frictionlessdata.io/guides/data-package/

It only contains code intended to validate packages so far, and carries out some basic validation.

#!/usr/bin/perl -w
use strict;
use JSON;
package FrictionlessData;
our $VERSION = "0.1";
my @errors;
sub new {
### What properties do we need to capture here?
my $class = shift;
my $self = {};
bless $self, $class;
return $self;
}
sub validate {
my ( $self, $datapackage, $schema ) = @_;
if ( defined $schema ) {
############################################################################################
# The schema conversion should be in two functions:
# - required_fields
#
# return an array of field names which must be present or the document isn't valid
#
# - field_properties
#
# return a hash of properties and their values
#
############################################################################################
my $ds = convert_json_to_perl($self, $schema);
my @required; # An array of field names for required properties - simple check, so if it's invalid, it can reject quickly
my %properties; # A hash of field names and validation criteria - fieldname_[anyOf/oneOf_]_property : value / array / hash
if ($ds) { # else returns an error message explaining that the schema is missing
@required = $self->required_fields($ds);
%properties = $self->field_properties($ds);
} else {
push @errors, "The Schema is not valid JSON";
}
if ( defined $datapackage ) {
my $dp = $self->convert_json_to_perl($datapackage);
if ($dp) {
my @datapackage_keys = keys %$dp;
foreach my $rq ( @required ) {
next if exists $$dp{$rq};
push @errors, "Missing required field \"$rq\"";
}
#################################################################################################
#
# This will be where we match the data package against the schema
#
#################################################################################################
my @property_keys = keys %properties;
foreach my $datapackage_key (@datapackage_keys) {
foreach my $property_key ( @property_keys ) {
if ( $property_key =~ /^$datapackage_key/ ){
my $validate_result = $self->validate_field( $datapackage_key, $$dp{$datapackage_key}, $property_key, $properties{$property_key} );
}
}
}
} else {
push @errors, "The Data Package is not valid JSON";
}
} else {
push @errors, "Please enter a file to validate.";
}
} else {
push @errors, "Please enter a schema.";
}
if (scalar @errors > 0) {
foreach my $err (@errors) {
print "$err\n";
}
return 0;
} else {
return 1;
# print "The data package is valid\n";
}
}
sub convert_json_to_perl {
my ($self, $json) = @_;
my $json_parser = JSON->new; # $json_parser is the JSON parser object that we use to convert the JSON file to a perl data structure
#################################################################
# #
# needs more work here on extracting a URI as well as from file #
# #
#################################################################
open(JSONFILE, "$json") || die $!;
##### Convert the json to a perl data structure
my $perl_data = join( "", <JSONFILE>); #perl_data is the data package schema converted from a file to a string
close(JSONFILE);
$perl_data = eval{ $json_parser->decode( $perl_data ) }; # $perl_data is the json string converted into a perl data structure
return $perl_data;
}
sub required_fields {
#####################################################################################
# #
# returns an array of field names which must be present or the document isn't valid #
# #
#####################################################################################
my ($self, $schema) = @_;
my @required;
foreach my $schemakey ( keys %$schema ) {
next unless $schemakey eq "required";
my $required_fields = $$schema{$schemakey};
foreach my $field (@$required_fields) {
push @required, $field;
}
}
return @required;
}
sub field_properties {
#################################################
# #
# returns a hash of properties and their values #
# #
#################################################
my ($self, $schema) = @_;
my %properties;
foreach my $schemakey (keys %$schema) {
if ( $schemakey eq "properties" ) {
my $properties = $$schema{$schemakey};
# The definitions of the properties are held in a separate file - this just checks that they are all referencing the same file
my %reffiles;
foreach my $property ( keys %$properties ) {
my $property_definitions = $$properties{$property};
foreach my $definition ( keys %$property_definitions ) {
if ( $definition eq '$ref' ) {
my $ref = $$property_definitions{$definition};
my @refs = split( '#', $ref );
push @{$reffiles{$refs[0]}}, $refs[1];
} else {
next if $definition eq "title";
next if $definition eq "description";
next if $definition eq "propertyOrder";
next if $definition eq "format";
my $deftype = $$property_definitions{$definition};
$properties{$property."_".$definition} = $deftype;
}
}
}
foreach my $reffile ( keys %reffiles ) {
my $rf = $self->convert_json_to_perl( $reffile);
if ($rf) {
my @defs = @{$reffiles{$reffile}};
my %definitions_container; # Here, we're just getting to the actual list of definitions. There's probably only one item in this hash - "define" with the value being the hash of definitions.
foreach my $def (@defs) {
my @defpath = split('/', $def);
$definitions_container{$defpath[1]} = $$rf{$defpath[1]};
}
foreach my $definition ( keys %definitions_container) {
my $field_definitions = $definitions_container{$definition}; #Here we get the field definitions from the container.
foreach my $field_name (keys %$field_definitions) {
my $field_properties = $$field_definitions{$field_name};
foreach my $field_property (keys %$field_properties) {
next if $field_property eq "title";
next if $field_property eq "description";
my $property_value = $$field_properties{$field_property};
$properties{$field_name."_".$field_property} = $property_value;
}
}
}
}
}
}
}
return %properties;
}
sub validate_field {
my ( $self, $field_name, $field_value, $property_name, $properties ) = @_;
if ( ref($properties) eq "ARRAY" ) {
my @properties = split("_", $property_name);
my $property = $properties[1];
if ( $property eq "oneOf" ) {
return one_of( $field_name, $field_value, $property_name, $properties );
} elsif ( $property eq "anyOf" ) {
return any_of( $field_name, $field_value, $property_name, $properties );
} else {
return 0;
}
} elsif ( ref($properties) eq "HASH" ) {
push @errors, "HASH types don't have a validation script yet.";
return 0;
} else {
my @properties = split("_", $property_name);
my $property = $properties[1];
if ( $property eq "type" ) {
return $self->is_type($properties, $field_value);
} elsif ( $property eq "minItems" ) {
return $self->has_min_items( $properties, $field_value );
} elsif ( $property eq "pattern" ) {
return $self->matches_pattern( $properties, $field_value );
} else {
push @errors, "This property ($property) doesn't have a validation script yet.";
return 0;
}
}
}
sub sub_properties {
my ( $self, $property_name, @subarray ) = @_;
my $properties;
foreach my $definitions ( @subarray) {
if ( ref( $definitions ) eq "HASH" ) {
foreach my $field_name (keys %$definitions) {
next if $field_name eq "title";
next if $field_name eq "description";
push @{$$properties{$property_name . "_". $field_name}}, $$definitions{$field_name};
}
}
}
foreach my $k ( keys %$properties ) {
foreach my $i ( @{$$properties{$k}} ) {
next unless ref($i) eq "ARRAY";
if ( scalar @$i == 1 ) {
$$properties{$k} = @$i[0];
}
}
}
return $properties;
}
sub is_type {
my ( $self, $type, $value ) = @_;
if ( $type eq "array" ) {
if ( ref($value) eq "ARRAY" ) {
return 1;
} else {
push @errors, "Type should be ARRAY";
return 0;
}
} elsif ( $type eq "object" ) {
if ( ref($value) eq "HASH" ) {
return 1;
} else {
push @errors, "Type should be OBJECT";
return 0;
}
} elsif ( $type eq "integer" ) {
if ( $value =~/^([0-9])+$/ ) {
return 1;
} else {
push @errors, "Type should be INTEGER";
return 0;
}
} elsif ( $type eq "string" ) {
if ( !ref($value) ) {
return 1;
} else {
push @errors, "Type should be STRING";
return 0;
}
} else {
push @errors, "Type not tested";
return 0;
}
}
sub has_min_items {
my ( $self, $number, $value ) = @_;
if ( scalar @$value >= $number ) {
return 1;
} else {
push @errors, "Does not have the minimum number of items";
return 0;
}
}
sub matches_pattern {
my ( $self, $pattern, $value ) = @_;
if ( $value =~ $pattern ) {
return 1;
} else {
push @errors, "Does not match the given pattern";
return 0;
}
}
sub one_of {
## Check if one and only one of the conditions is met
my ( $self, $field_name, $field_value, $property_name, $properties, ) = @_;
my $subproperties = sub_properties( $property_name, @{$properties} );
if ( $subproperties ) {
foreach my $subp ( keys %$subproperties ) {
my @subprops = split("_", $subp);
my $subprop = $subprops[-1];
my $count_true = 0;
foreach my $prop (@$properties) {
foreach my $sp ( keys %$prop ) {
if ( $subprop eq $sp ) {
if ( $sp eq "type" ) {
$count_true += is_type( $$prop{$sp}, $field_value );
} elsif ( $sp eq "properties" ) {
$count_true += 0;
} elsif ( $sp eq "anyOf" ) {
$count_true += 0;
}
}
}
}
if ( $count_true == 1 ) {
return 1;
} else {
push @errors, "$field_name does not have oneOf the required items.";
return 0;
}
}
} else {
return 0;
}
}
sub any_of {
## Check if at least one of the conditions is met
my ( $self, $field_name, $field_value, $property_name, $properties, ) = @_;
my $subproperties = sub_properties( $property_name, @{$properties} );
if ( $subproperties ) {
foreach my $subp ( keys %$subproperties ) {
my @subprops = split("_", $subp);
my $subprop = $subprops[-1];
my $count_true = 0;
foreach my $prop (@$properties) {
foreach my $sp ( keys %$prop ) {
if ( $subprop eq $sp ) {
if ( $sp eq "type" ) {
$count_true += is_type( $$prop{$sp}, $field_value );
} elsif ( $sp eq "properties" ) {
$count_true += 0;
} elsif ( $sp eq "anyOf" ) {
$count_true += 0;
}
}
}
}
if ( $count_true >= 1 ) {
return 1;
} else {
push @errors, "$field_name does not have anyOf the required items.";
return 0;
}
}
} else {
push @errors, "$field_name does not have any sub-properties to test";
return 0;
}
}
sub all_of {
my ( $self, $pattern, $value ) = @_;
## Check if all of the conditions are met
}
sub none_of {
my ( $self, $pattern, $value ) = @_;
## Check that none of the conditions is met
}
1;
__END__
=head1 NAME
FrictionlessData
=head1 SYNOPSIS
(example of code)
=head1 DESCRIPTION
A module for interacting with data packages made according to the OKF standard L<http://frictionlessdata.io/>
=head1 LICENSE
The MIT License (MIT)
Copyright (c) 2016 University of Bath
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
=head2 Methods
=over 12
=item C<new>
Returns a new FrictionlessData object.
=item C<validated>
Returns a boolean value indicating whether the data package is valid.
=back
=cut
#!/usr/bin/perl
use strict;
use FrictionlessData;
my ($file, $schema) = @ARGV;
main($file, $schema);
sub main {
my ( $file, $schema ) = @_;
my $d = FrictionlessData->new;
$d->validate( $file, $schema );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment