Last active
August 29, 2015 13:57
-
-
Save peaeater/9650933 to your computer and use it in GitHub Desktop.
Processes Internet Archive packages, producing one .txt, one DjVu XML, and one .jpg file per page of a digitized publication, plus an XML manifest. The output is intended for ingest by Solr through Andi's DIH handler. Jobs are broken into subscript dependencies. Requires ImageMagick and DjVuLibre.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# processes Internet Archive packages, producing per page: 1 txt, 1 ocrxml, 1 jpg
# requires djvulibre, imagemagick
#
# Parameters:
#   -indir    directory that is searched for *.djvu files (default: current directory)
#   -outbase  base directory under which one jpg directory per publication is created
#             (default: same as -indir)
param(
    [string]$indir = ".",
    [string]$outbase = $indir
)

# Load the assembly that provides [System.IO.Compression.ZipFile].
# Add-Type replaces the obsolete Assembly.LoadWithPartialName call and, unlike it,
# does not write the loaded Assembly object to the script's output stream.
Add-Type -AssemblyName System.IO.Compression.FileSystem
# Turns a name into a web-friendly token: lower-cases it (invariant culture)
# and substitutes a hyphen for every space character.
function WebSafe([string]$s) {
    $lowered = $s.ToLowerInvariant()
    $hyphenated = $lowered.Replace(" ", "-")
    $hyphenated
}
# Main loop: for every .djvu file in $indir, extract per-page text and OCR xml,
# unpack and convert the matching <basename>_jp2.zip page images to jpg, and
# turn the <basename>_meta.xml file into manifest.xml.
# The helper scripts are invoked relative to the current location (.\).
$files = ls "$indir\*.*" -include *.djvu
foreach ($file in $files) {
    $djvu = $file.FullName

    # extract .txt per djvu page
    & .\djvu2txt.ps1 -in "$djvu"

    # extract hidden text .xml per djvu page
    & .\djvu2xml.ps1 -in "$djvu"

    # create dir for jpgs based on djvu filename
    $jpgdir = ('{0}\{1}' -f $outbase, (WebSafe($file.BaseName)))
    if (!(test-path $jpgdir)) {
        mkdir $jpgdir
    }

    # extract .jp2 a la .NET 4
    # The zip path is passed as a plain string so that a missing archive fails
    # inside ExtractToDirectory instead of silently reusing a FileInfo left over
    # from a previous iteration.
    $zipPath = ('{0}\{1}_jp2.zip' -f $file.DirectoryName, $file.BaseName)
    $jp2dir = ('{0}\{1}_jp2' -f $file.DirectoryName, $file.BaseName)
    if (!(test-path $jp2dir)) {
        [System.IO.Compression.ZipFile]::ExtractToDirectory($zipPath, $file.DirectoryName)
    }

    # rename .jp2 to just page #s
    # The base name is regex-escaped so characters such as '.', '(' or '+' in the
    # publication name are matched literally; the pattern is loop-invariant.
    $pattern = ('{0}_0*' -f [System.Text.RegularExpressions.Regex]::Escape($file.BaseName))
    $lastName = $null
    $jp2s = ls "$jp2dir"
    foreach ($jp2 in $jp2s) {
        $newname = [System.Text.RegularExpressions.Regex]::Replace($jp2.Name, $pattern, "")
        # Rename-Item fails when source and target are identical, so skip no-op renames.
        if ($newname -ne $jp2.Name) {
            ren -path $jp2.FullName -newname $newname
        }
        $lastName = $newname
    }

    # delete scan job marker images
    # Page 0 collapses to an empty stem (".jp2") after the leading zeros are stripped.
    $firstMarker = ('{0}\.jp2' -f $jp2dir)
    if (test-path $firstMarker) {
        del $firstMarker
    }
    # The other marker is the last file enumerated above. Only delete it when the
    # inner loop actually ran; otherwise the path would be the directory itself
    # or a stale name from a previous publication.
    if ($lastName) {
        $lastMarker = ('{0}\{1}' -f $jp2dir, $lastName)
        if (test-path $lastMarker) {
            del $lastMarker
        }
    }

    # convert .jp2 to .jpg
    & .\jp22jpg.ps1 -size 1000 -in "$jp2dir" -outdir $jpgdir

    # clean up
    del -Recurse $jp2dir

    # convert IA metadata to manifest
    $metadata = ('{0}\{1}_meta.xml' -f $file.DirectoryName, $file.BaseName)
    $manifest = ('{0}\manifest.xml' -f $file.DirectoryName)
    # source = name of the directory one level above the one holding the .djvu
    $source = [System.IO.Directory]::GetParent($file.DirectoryName).BaseName
    & .\meta2manifest.ps1 -in $metadata -out $manifest -source $source
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment