Skip to content

Instantly share code, notes, and snippets.

@cjcolvar
Created November 2, 2017 19:35
Show Gist options
  • Save cjcolvar/a7c26de736e5dede6381ea6b31264a16 to your computer and use it in GitHub Desktop.
Save cjcolvar/a7c26de736e5dede6381ea6b31264a16 to your computer and use it in GitHub Desktop.
Variations migration script via batch ingest from Paul Hoffman
#!/usr/bin/perl
use strict;
use warnings;
use XML::LibXSLT;
use XML::LibXML;
use POSIX qw(strftime);
use File::Basename qw(dirname basename);
use Getopt::Long
qw(:config posix_default gnu_compat require_order bundling no_ignore_case);
use constant TAB => ',';
sub usage;
sub fatal;
sub blather;
(my $prog = $0) =~ s{.+/}{};
# XXX Media files must be named LABEL.BITRATE.mp4 or LABEL.BITRATE.mov
my %bitrate2quality = qw(
192k high
28k low
);
my $verbose;
my @time = localtime;
my $batch_name = strftime('Export from Variations %Y-%m-%d %H:%M:%S', @time);
my $submitter;
# XXX Hard-coded Variations content root
my $src = '/v3/dmlserv/content';
my $dst;
my $all;
GetOptions(
'b=s' => \$batch_name,
'u=s' => \$submitter,
's=s' => \$src,
'd=s' => \$dst,
'v' => \$verbose,
'a' => \$all,
) or usage;
usage 'Option -u SUBMITTER is required'
if !defined $submitter;
fatal "invalid batch name: $batch_name"
if $batch_name =~ /[^-:@. 0-9A-Za-z]/;
fatal "invalid submitter (must be an e-mail address): $submitter"
if $submitter !~ /^[0-9A-Za-z][-.0-9A-Za-z]*[@][0-9A-Za-z][-.0-9A-Za-z]*$/;
my $xslt = XML::LibXSLT->new;
my $stylesheet = $xslt->parse_stylesheet(XML::LibXML->load_xml('string' => xsl()));
my %object;
xchdir($src);
# XXX Hard-coded path
$dst ||= strftime('/v3/avalon/tmp/%Y%m%dT%H%M%S', @time);
if (!@ARGV) {
usage if !$all;
# XXX Hard-coded path to access files
@ARGV = sort glob('access/audio/*.xml');
}
elsif ($all) {
usage "Option -a doesn't make sense when XML files are specified";
}
my @metadata;
fatal "not an XML file: $_" for grep !/\.xml$/, @ARGV;
xmkdir(dirname($dst), $dst, "$dst/content");
blather sprintf "exporting %d object(s)...", scalar(@ARGV);
foreach my $f (@ARGV) {
my $obj = object($f);
$object{$obj->{'bibid'}} = $obj;
copy_media_files($obj);
}
save_batch_metadata();
my $rdy = $dst;
if ($rdy =~ s{/tmp/}{/ready/}) {
xrename($dst, $rdy);
}
print $rdy, "\n";
blather 'avout: done';
# --- Functions
sub object {
my ($f) = @_;
my $obj = parse_access_file($f);
find_media_files($obj);
extract_structure($obj);
return $obj;
}
sub parse_access_file {
my ($f) = @_;
# XXX XML access files must be named BIBID.xml
$f =~ m{.*/(\d+)\.xml$} or fatal "not an XML access file: $f";
my $bibid = $1;
my $doc = XML::LibXML->load_xml('location' => $f);
my @media_labels = field($doc, '//OrderedMediaObjects/MO/Label');
my %obj = (
'doc' => $doc,
'bibid' => $bibid,
'access_file' => $f,
'media' => {
map { $_ => {} } @media_labels,
}
);
return \%obj;
}
sub find_media_files {
my ($obj) = @_;
my $doc = $obj->{'doc'};
while (my ($label, $media) = each %{ $obj->{'media'} }) {
my ($mediadoc) = $doc->findnodes(sprintf '//RecordSet/MediaObject[Label/text()="%s"]', $label);
my $media_id = field($mediadoc, 'Id');
(my $media_id_number = $media_id) =~ s{.+/}{};
$media->{'media_id'} = $media_id;
$media->{'label'} = $label;
$media->{'files'} = {};
my ($infodoc) = $mediadoc->findnodes('FileInfos/FileInfo');
my $size = field($infodoc, 'Size');
my $name = field($infodoc, 'FileName');
my $md5 = field($infodoc, 'Checksum');
my @files = glob("audio-objects/$media_id_number/*.*");
# XXX Only *.mp4 or *.mov media files are recognized
my %ext = map { /\.(mp4|mov)$/ ? ($1 => 1) : () } @files;
next if scalar(keys %ext) != 1;
$media->{'name'} = sprintf '%s.%s', lc $label, keys %ext;
foreach my $file (@files) {
# XXX Media files must be named LABEL.BITRATE.* where BITRATE ends in "k"
$file =~ /\.(\d+k)\.\w+$/ or next;
my $bitrate = $1;
# XXX You may need to modify %bitrate2quality above
my $quality = $bitrate2quality{$bitrate} or next;
my $file_size = -s $file;
my $destination = $file;
for ($destination) {
s/\.$bitrate\./.$quality./;
s{.+/}{content/};
}
my %file = (
'bit_rate' => $bitrate,
'quality' => $quality,
'size' => $file_size,
'source' => $file,
'destination' => $destination,
);
$file{'checksum'} = $md5 if $file_size == $size;
warning('huh?'), next if $media->{'files'}{$quality};
$media->{'files'}{$quality} = \%file;
}
}
}
sub extract_structure {
my ($obj) = @_;
my $doc = $obj->{'doc'};
my %structure;
ITEM:
foreach my $itemdoc ($doc->findnodes("//Container/Structure/Item")) {
#########################################################################
# BEGIN a very nasty workaround for an apparent bug in #
# XML::LibXML::Element::findnodes that causes all ContentInterval #
# elements to be returned, not just those within this Item. Or maybe I #
# just don't understand how to use XML::LibXML. :-( #
#########################################################################
my $item = xmldoc($itemdoc->toString(1));
my $structure_xml = $item->toString(1);
my @spandocs = $item->findnodes('//ContentInterval');
#########################################################################
# END workaround #
#########################################################################
foreach my $span (@spandocs) {
my $media_id = $span->getAttribute('mediaRef');
$structure{$media_id} ||= $structure_xml;
blather " media item {",
" media id = $media_id",
" tracks = " . scalar(@spandocs),
" }";
next ITEM;
}
}
while (my ($label, $media) = each %{ $obj->{'media'} }) {
my $media_id = $media->{'media_id'};
$media->{'structure'} = delete $structure{$media_id} or die;
}
die if scalar keys %structure;
}
sub copy_media_files {
my ($obj) = @_;
while (my ($label, $media) = each %{ $obj->{'media'} }) {
my $files = $media->{'files'};
foreach my $quality (qw(low medium high)) {
my $file = $files->{$quality} or next;
my $file_src = $file->{'source'};
my $file_dst = sprintf '%s/%s', $dst, $file->{'destination'};
xlink($file_src, $file_dst);
}
}
}
sub warning {
print STDERR 'avout: warning: ', $_, "\n" for @_;
}
sub save_batch_metadata {
my $fh = xopen('>', "$dst/batch_manifest.csv");
my $max_num_media = 0;
while (my ($bibid, $object) = each %object) {
my $media = $object->{'media'};
my $num_media = scalar keys %$media;
$max_num_media = $num_media if $num_media > $max_num_media;
}
print $fh $batch_name, TAB, $submitter, "\n";
# XXX Always skip transcoding
print $fh join(TAB, 'Bibliographic ID', 'Bibliographic ID Label', ('File', 'Skip Transcoding') x $max_num_media), "\n";
foreach my $object (sort { $a->{'bibid'} <=> $b->{'bibid'} } values %object) {
my $bibid = $object->{'bibid'};
my $media = $object->{'media'};
print STDERR $bibid;
# XXX Bib ID is a local identifier
print $fh $bibid, TAB, 'local';
foreach my $label (sort by_media_id keys %{ $object->{'media'} }) {
my $media = $object->{'media'}{$label};
(my $letter = lc $label) =~ s/.+-//;
my $name = $media->{'name'};
my $struc_xml = $stylesheet->transform(XML::LibXML->load_xml('string' => $media->{'structure'}))->toString(1);
my $struc_dst = sprintf '%s/content/%s.structure.xml', $dst, $name;
print { xopen('>', $struc_dst) } $struc_xml;
print $fh TAB, "content/$name", TAB, 'Yes';
print STDERR ' ', $letter, '(';
foreach my $file (values %{ $media->{'files'} }) {
print STDERR uc substr($file->{'quality'}, 0, 1);
}
print STDERR ')';
}
print $fh "\n";
print STDERR "\n";
}
}
# --- Functions
sub xrename {
my ($src, $dst) = @_;
rename($src, $dst) or fatal "rename $src $dst: $!";
}
sub xlink {
my ($src, $dst) = @_;
link($src, $dst) or fatal "link $src $dst: $!";
}
sub xopen {
my ($mode, $path) = @_;
open my $fh, $mode, $path or fatal "open $path: $!";
return $fh;
}
sub xmkdir {
foreach my $dir (@_) {
-d $dir or mkdir $dir or fatal "mkdir $dir: $!";
}
}
sub xchdir {
foreach my $dir (@_) {
chdir $dir or fatal "chdir $dir: $!";
}
}
sub field {
my ($elt, $gi) = @_;
my @vals = map { $_->textContent } $elt->findnodes($gi);
return if !@vals;
return @vals if wantarray;
return $vals[0];
}
sub xmldoc {
return XML::LibXML->load_xml('string' => qq{<?xml version="1.0" encoding="UTF-8"?>\n} . shift);
}
sub xsl {
<<'EOS';
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
<!--
<xsl:template match="/">
<xsl:apply-templates/>
</xsl:template>
-->
<xsl:template match="/Item">
<Item label="{@label}">
<xsl:apply-templates/>
</Item>
</xsl:template>
<xsl:template match="Div">
<Div label="{@label}">
<xsl:apply-templates/>
</Div>
</xsl:template>
<xsl:template match="Chunk">
<Span begin="{floor(number(ContentInterval/@begin div 3600000)mod 24)}:{floor(number(ContentInterval/@begin div 60000)mod 60)}:{floor(number(ContentInterval/@begin div 1000)mod 60)}"
end="{floor(number(ContentInterval/@end div 3600000)mod 24)}:{floor(number(ContentInterval/@end div 60000)mod 60)}:{floor(number(ContentInterval/@end div 1000)mod 60)}"
label="{@label}"/>
</xsl:template>
<xsl:template match="text()"/>
</xsl:stylesheet>
EOS
}
sub by_media_id {
length($a) <=> length($b)
or
$a cmp $b
}
sub blather {
return if !$verbose;
print STDERR $_, "\n" for @_;
}
sub usage {
print STDERR "usage: avout -u SUBMITTER [-va] [-b NAME] [-s SRCDIR] [-d DSTDIR] XMLFILE...\n";
print STDERR $_, "\n" for @_;
exit 1;
}
sub fatal {
print STDERR "avout: $_\n" for @_;
exit 2;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment