Skip to content

Instantly share code, notes, and snippets.

@forrcaho
Last active September 7, 2021 19:40
Show Gist options
  • Save forrcaho/ddb1bd108ef35ef0e66899d1d1502927 to your computer and use it in GitHub Desktop.
Save forrcaho/ddb1bd108ef35ef0e66899d1d1502927 to your computer and use it in GitHub Desktop.
Convert a JSON file with structured data into an SQLite DB file
{
"artist" : [
{
"name": "The Beatles",
"album" : [
{
"title": "Sgt. Pepper's Lonely Hearts Club Band",
"track": [
{
"title": "Within You Without You",
"duration": "5:05"
},
{
"title": "Fixing a Hole",
"duration": "2:36"
}
]
},
{
"title": "Abbey Road",
"track": [
{
"title": "Come Together",
"duration": "4:19"
},
{
"title": "Here Comes the Sun",
"duration": "3:05"
}
]
}
]
},
{
"name": "The Rolling Stones",
"album": [
{
"title": "December's Children (and Everybody's)",
"track": [
{
"title": "Get Off My Cloud",
"duration": "2:52"
},
{
"title": "As Tears Go By",
"duration": "2:45"
}
]
},
{
"title": "Their Satanic Majesties Request",
"track": [
{
"title": "Sing This All Together",
"duration": "3:46"
},
{
"title": "She's a Rainbow",
"duration": "4:35"
}
]
}
]
}
]
}
=pod
=encoding UTF-8
=head1 NAME
json_to_sql.pl
=head1 SYNOPSIS
perl json_to_sql.pl --json=data.json --out=data.db
=head1 DESCRIPTION
This program takes structured JSON data from a file and creates an SQLite db file from it.
It was written for some specific data I have in JSON format and wanted to be able to query using SQL.
It makes some assumptions about the data, but can be crafted into a more general purpose tool as
the need arises.
The current code assumes:
=over 4
=item * The top level of the JSON input file is an object, containing keys whose values are arrays.
(The keys become table names, and the values contain the records that go into those tables.)
=item * There are no keys named "id" in any of the records. (This code auto-generates ids for all
table rows and would be confused if an id is already present.)
=item * The attributes of any JSON object representing a record are either scalar values or arrays.
(Scalar values are placed directly into the corresponding SQL record; arrays generate cross
tables.)
=back
The output table will contain a column for each attribute which appears in any of its JSON records,
with NULL values for any column whose JSON record is missing the corresponding attribute. It is
expected that the records for a given table share many if not all common attributes, though, for the
output to be useful.
=head1 REQUIREMENTS
This script relies on the following CPAN modules, which need to be installed using 'cpanm'
or your favorite installer:
=over 4
=item * Data::Compare
=item * DBD::SQLite
=item * DBI
=item * Getopt::Long
=item * JSON
=item * Readonly
=back
=head1 AUTHOR
=over 4
=item *
Forrest Cahoon <forrest.cahoon@gmail.com>
=back
=cut
use strict;
use warnings;
use Data::Compare;
use DBD::SQLite;
use DBI;
use Getopt::Long;
use JSON;
use Readonly;
Readonly my $PREFIX => "JSONtoSQL";

# Defaults, overridable via --json / --out (see POD SYNOPSIS).
my $json_fname      = "input.json";
my $sqlite_db_fname = "output.sql";

GetOptions(
    'input|in|json=s' => \$json_fname,
    'output|out|db=s' => \$sqlite_db_fname,
) or die "Usage: perl $0 --json=data.json --out=data.db\n";

die "Can't find input file $json_fname\n" unless -e $json_fname;
# Refuse to clobber an existing file: SQLite would happily add tables to it.
die "$sqlite_db_fname already exists, will not overwrite\n" if -e $sqlite_db_fname;

# Slurp the entire JSON file (local $/ disables line-by-line reading).
my $json = "";
{
    local $/;
    # Lexical filehandle + error detail; the original used a bareword handle
    # and died without reporting $!.
    open my $in_fh, "<", $json_fname
        or die "Can't open $json_fname for reading: $!\n";
    $json = <$in_fh>;
    close $in_fh;
}
my $in_data = decode_json $json;

my $tables = {};       # Data structure to easily generate SQL tables from.
my $cross_tables = {}; # Keep track of cross tables to get rid of unneeded ones when all data is processed.

# Build the in-memory table structures: each top-level key becomes a table,
# each element of its array one record.
for my $table_name (keys %$in_data) {
    create_table_if_new($table_name);
    for my $record (@{$in_data->{$table_name}}) {
        process_record($table_name, $record);
    }
}
fix_unneeded_cross_tables();
# Create SQLite DB file from the in-memory table structures.
# RaiseError makes any failed CREATE/INSERT die instead of being silently
# ignored (the original never checked connect() or execute() results).
my $dbh = DBI->connect("dbi:SQLite:dbname=$sqlite_db_fname", "", "",
    { RaiseError => 1, PrintError => 0 })
    or die "Can't create $sqlite_db_fname: $DBI::errstr\n";
$dbh->do("PRAGMA foreign_keys = OFF"); # Don't enforce FKs until all data is loaded.
for my $table_name (sort keys %$tables) {
    my $table = $tables->{$table_name};

    # Build and run the CREATE TABLE statement.
    my $sql = "CREATE TABLE $table_name (\n id INTEGER PRIMARY KEY";
    for my $col_name (sort keys %{$table->{"columns"}}) {
        my $col_type = $table->{"columns"}->{$col_name};
        # SQLite has no native BOOLEAN; store those columns as INTEGER 0/1.
        $col_type =~ s/^BOOLEAN$/INTEGER/;
        $sql .= ",\n $col_name $col_type";
    }
    for my $fkey (@{$table->{"fkeys"}}) {
        $sql .= ",\n FOREIGN KEY $fkey";
    }
    $sql .= "\n)";
    print "creating table $table_name\n";
    $dbh->do($sql);

    # Insert each record. Values are interpolated into the SQL text, so TEXT
    # values are quoted/escaped and boolean sentinels are mapped to 1/0.
    for my $record (@{$table->{"data"}}) {
        my $cols   = "id";
        my $values = delete $record->{"_${PREFIX}_id"};
        for my $col (sort keys %$record) {
            my $value = $record->{$col};
            $cols .= ", $col";
            if ($table->{"columns"}->{$col} eq 'TEXT') {
                $values .= ", '" . sql_escape($value) . "'";
            } elsif ($value eq "_${PREFIX}_BOOL_TRUE") {
                $values .= ", 1";
            } elsif ($value eq "_${PREFIX}_BOOL_FALSE") {
                $values .= ", 0";
            } else {
                $values .= ", $value";
            }
        }
        $dbh->do("INSERT INTO $table_name ($cols) VALUES ($values)");
    }
    print "data inserted\n";
}
# Now, turn on FKs:
$dbh->do("PRAGMA foreign_keys = ON");
#####################################################################################################
# Ensures a table entry exists in $tables for the given name, creating an
# empty skeleton on first sight. Safe to call repeatedly with the same name.
sub create_table_if_new {
    my ($name) = @_;
    return if $tables->{$name};  # already initialized
    $tables->{$name} = {
        "columns" => {},  # column name => provisional SQL column type
        "data"    => [],  # hashrefs of records keyed by column name
        "fkeys"   => [],  # FOREIGN KEY clause fragments for table creation
        "id"      => 1,   # next auto-generated unique row id
    }
}
# Called for each record of the top-level JSON data arrays.
# Inserts records into the corresponding table structure: scalar attributes
# become columns, arrays are split off into cross tables.
sub process_record {
    my ($table_name, $record) = @_;
    my $table     = $tables->{$table_name};
    my $db_record = {};

    for my $attr (keys %$record) {
        my $raw = $record->{$attr};
        if (JSON::is_bool($raw)) {
            # Booleans become sentinel strings, translated to 1/0 at insert time.
            $db_record->{$attr} = $raw ? "_${PREFIX}_BOOL_TRUE" : "_${PREFIX}_BOOL_FALSE";
            $table->{"columns"}->{$attr} = determine_column_type($raw, $table->{"columns"}->{$attr});
        } elsif (ref $raw) {
            # Only arrays are supported as nested values; they become cross tables.
            die "Cannot parse $attr attribute of $table_name in JSON\n" if ref $raw ne 'ARRAY';
            make_cross_table_records($table_name, $table->{"id"}, $attr, $raw);
        } else {
            $db_record->{$attr} = $raw;
            $table->{"columns"}->{$attr} = determine_column_type($raw, $table->{"columns"}->{$attr});
        }
    }

    # Only store the record (and consume an id) if it had at least one scalar column.
    if (%$db_record) {
        $db_record->{"_${PREFIX}_id"} = $table->{"id"};
        push @{$table->{"data"}}, $db_record;
        $table->{"id"}++;
    }
}
# Determines SQLite column type based on column data for the current record and
# the previously determined (provisional) column type from previous records.
# Types only ever widen: BOOLEAN -> INTEGER -> REAL -> TEXT; once a column has
# been classified TEXT it stays TEXT.
sub determine_column_type {
    my ($value, $old_column_type) = @_;

    # If we've seen a non-numeric value already, we need to keep this column TEXT.
    return 'TEXT' if (defined $old_column_type && $old_column_type eq 'TEXT');

    # Check for our boolean sentinel values and return a BOOLEAN type (unless we've
    # already seen another type for this column). Note SQLite uses INTEGER for its
    # BOOLEAN type, but this distinction may be useful for other DBs in the future.
    if ($value =~ /^_${PREFIX}_BOOL_/
        && (!defined $old_column_type || $old_column_type eq 'BOOLEAN')) {
        return 'BOOLEAN';
    }

    # Integer-looking values.
    if ($value =~ /^\d+$/) {
        # FIX: an integer seen after a column was classified REAL used to fall
        # through to TEXT (the REAL test below requires exactly one '.'); an
        # integer is a valid REAL, so the column stays REAL.
        return 'REAL' if defined $old_column_type && $old_column_type eq 'REAL';
        if (!defined $old_column_type || $old_column_type eq 'INTEGER' || $old_column_type eq 'BOOLEAN') {
            return 'INTEGER';
        }
    }

    # Check for REAL; look for digits and exactly one '.'.
    my @dots = ($value =~ /(\.)/g);
    my $dot_count = scalar @dots;
    if ($value =~ /^[\d\.]+$/ && $dot_count == 1
        && (!defined $old_column_type || $old_column_type ne 'TEXT')) {
        return 'REAL';
    }

    # If none of these tests have matched, this is 'TEXT'.
    return 'TEXT';
}
# When a record attribute is an array, that data goes in a separate child table
# and is joined to its parent table with a cross table named "<parent>_<child>".
sub make_cross_table_records {
    my ($parent_name, $parent_id, $child_name, $child_values) = @_;
    return unless @$child_values;

    create_table_if_new($child_name);
    my $cross_name = "${parent_name}_${child_name}";
    create_table_if_new($cross_name);
    # Remember the pairing so fix_unneeded_cross_tables() can inspect it later.
    $cross_tables->{$cross_name} = [$parent_name, $child_name];

    my $cross = $tables->{$cross_name};
    # A cross table only ever holds the two foreign-key columns.
    $cross->{"columns"} = {
        "${parent_name}_id" => 'INTEGER',
        "${child_name}_id"  => 'INTEGER',
    };
    $cross->{"fkeys"} = [
        "(${parent_name}_id) REFERENCES ${parent_name} (id)",
        "(${child_name}_id) REFERENCES ${child_name} (id)",
    ];

    # The attribute now lives in the child table, not as a parent column.
    delete $tables->{$parent_name}->{"columns"}->{$child_name};

    for my $child_value (@$child_values) {
        my $child_id = find_or_create_id($child_name, $child_value);
        push @{$cross->{"data"}}, {
            "${parent_name}_id" => $parent_id,
            "${child_name}_id"  => $child_id,
            "_${PREFIX}_id"     => $cross->{"id"}++,
        };
    }
}
# For child table records, we need to look through the records already created (from the same
# attribute on previously processed parent records) to find the id of the matching record (if any),
# or add the new record to the child table and return its id.
sub find_or_create_id {
    my ($table_name, $value) = @_;
    my $table = $tables->{$table_name};

    # A bare scalar child (e.g. an array of strings) becomes a one-column record.
    if (!ref $value) {
        $value = { "value" => $value };
    }

    for my $col (keys %$value) {
        if (JSON::is_bool($value->{$col})) {
            # Replace JSON booleans with sentinel strings, resolved to 1/0 at insert time.
            $value->{$col} = $value->{$col} ? "_${PREFIX}_BOOL_TRUE" : "_${PREFIX}_BOOL_FALSE";
        } elsif (ref $value->{$col}) {
            die "Cannot parse $col attribute of $table_name in JSON\n" if ref $value->{$col} ne 'ARRAY';
            # NOTE(review): this uses the *prospective* child id; if $value later
            # matches an existing record the nested cross records may point at the
            # wrong parent — TODO confirm with nested-array input.
            make_cross_table_records($table_name, $table->{"id"}, $col, $value->{$col});
            delete $value->{$col};
        }
        # FIX: was `if ($value->{$col})`, which skipped type detection for
        # defined-but-false values such as 0 or "" and could leave the column
        # untyped (producing malformed INSERT SQL later).
        if (defined $value->{$col}) {
            $table->{"columns"}->{$col} = determine_column_type($value->{$col}, $table->{"columns"}->{$col});
        }
    }

    # This is a prime candidate for speeding up if better performance is needed. We are examining
    # records in an array one at a time, looking for a match so we can return the corresponding id.
    # We could build a hash-based structure of some sort to do this lookup much faster.
    for my $record (@{$table->{"data"}}) {
        if (Compare($value, $record, { ignore_hash_keys => [ "_${PREFIX}_id" ] })) {
            return $record->{"_${PREFIX}_id"};
        }
    }

    # No match found: assign the next id and store the new record.
    $value->{"_${PREFIX}_id"} = $table->{"id"};
    $table->{"id"}++;
    push @{$table->{"data"}}, $value;
    return $value->{"_${PREFIX}_id"};
}
# We must assume a many-to-many relationship between parent and child tables while we're building
# the table structure, but once all the data is loaded it's possible that each child record only
# points to one parent. In that case, we dispense with the cross table and add a foreign key directly
# on the child table to point to the (one) parent for each child record.
sub fix_unneeded_cross_tables {
    for my $cross_name (keys %$cross_tables) {
        my ($parent_name, $child_name) = @{$cross_tables->{$cross_name}};
        my $cross_table = $tables->{$cross_name};

        # Determine whether any child row is referenced by more than one parent.
        my $child_to_parent = {};
        my $child_is_multi  = 0;
        for my $record (@{$cross_table->{"data"}}) {
            my $parent_id = $record->{"${parent_name}_id"};
            my $child_id  = $record->{"${child_name}_id"};
            if ($child_to_parent->{$child_id} && $child_to_parent->{$child_id} != $parent_id) {
                $child_is_multi = 1;
                last;
            }
            $child_to_parent->{$child_id} = $parent_id;
        }
        next if $child_is_multi;

        # One parent per child: fold the relationship into the child table
        # and drop the cross table. ("unneded" typo in the message fixed.)
        print "fixing unneeded cross table $child_name -> $parent_name\n";
        my $child_table = $tables->{$child_name};
        $child_table->{"columns"}->{"${parent_name}_id"} = 'INTEGER';
        $child_table->{"fkeys"} = ["(${parent_name}_id) REFERENCES ${parent_name} (id)"];
        for my $record (@{$child_table->{"data"}}) {
            my $child_id = $record->{"_${PREFIX}_id"};
            # NOTE(review): a child row absent from the cross table gets an undef
            # parent_id here — presumably impossible by construction; verify.
            my $parent_id = $child_to_parent->{$child_id};
            $record->{"${parent_name}_id"} = $parent_id;
        }
        delete $tables->{$cross_name};
    }
}
# Doubles embedded single quotes so the value can be placed inside an
# SQL '...' string literal. Returns the escaped copy; the input is untouched.
sub sql_escape {
    my ($text) = @_;
    (my $escaped = $text) =~ s/'/''/g;
    return $escaped;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment