Last active
June 22, 2022 18:45
-
-
Save peaeater/acd5ff27706e23ec12106a58517a6215 to your computer and use it in GitHub Desktop.
Create a text file mirror from PDFs, requires poppler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
  Creates a mirror of plain-text files extracted from PDFs. Two modes:

  1. Leaf
  Given a text file of PDF filenames, extract the content of each listed PDF
  and create a mirror directory structure for the text file outputs.
  * Handles lines that hold several filenames joined by the '|' entry separator.
  * -pathsnip names a fragment that is removed (case-insensitively) from each
    listed filename before -pathprefix is prepended to it.
  * Ignores a PDF that is older than its text file mirror unless the -force param is used.
  * Requires poppler pdftotext.exe

  .\text-mirror.ps1 -in C:\dev\abc\extract\extracted\pdfs\abc-pdfs-1.txt `
                    -pathprefix C:\dev\abc\src\web\media\documents `
                    -outdir C:\dev\abc\raw\text-mirror\documents `
                    -poppler C:\utils\poppler-0.68\bin\pdftotext.exe

  2. Container
  Given a directory path, extract content from PDFs recursively
  and create a mirror directory structure for the text file outputs.
  * Ignores a PDF that is older than its text file mirror unless the -force param is used.
  * Requires poppler pdftotext.exe

  .\text-mirror.ps1 -indir C:\dev\abc\src\web\media\documents -outdir C:\dev\abc\raw\text-mirror\documents

  Peter Tyrrell, Andornot
#>
param (
    # Leaf mode: text file listing PDF filenames; a line may hold several entries separated by '|'.
    [Parameter(Mandatory=$true, ParameterSetName="Leaf")]
    [string]$in,

    # Leaf mode: fragment removed from each listed filename before $pathprefix is prepended.
    # Optional - the path validation only applies it when it is non-empty, and the
    # documented Leaf usage example does not pass it.
    [Parameter(Mandatory=$false, ParameterSetName="Leaf")]
    [string]$pathsnip = "",

    # Leaf mode: directory prepended to each listed filename; also the part of a PDF's
    # directory that is stripped when building the mirror path under $outdir.
    [Parameter(Mandatory=$true, ParameterSetName="Leaf")]
    [string]$pathprefix,

    # Container mode: directory searched recursively for *.pdf files.
    [Parameter(Mandatory=$true, ParameterSetName="Container")]
    [string]$indir,

    # Base directory for the text mirror. Defaults to $indir in Container mode.
    [Parameter(Mandatory=$true, ParameterSetName="Leaf")]
    [Parameter(Mandatory=$false, ParameterSetName="Container")]
    [string]$outdir = $indir,

    # Re-extract even when the existing text file is newer than its PDF.
    [Parameter(Mandatory=$false, ParameterSetName="Leaf")]
    [Parameter(Mandatory=$false, ParameterSetName="Container")]
    [Switch]
    $force,

    # Source name passed as the first argument to the logError / logInfo helpers.
    [Parameter(Mandatory=$false, ParameterSetName="Leaf")]
    [Parameter(Mandatory=$false, ParameterSetName="Container")]
    [string]$logsrc = "",

    # Full path to poppler's pdftotext executable.
    [Parameter(Mandatory=$false, ParameterSetName="Leaf")]
    [Parameter(Mandatory=$false, ParameterSetName="Container")]
    [string]$poppler = "c:\utils\poppler-0.68\bin\pdftotext.exe"
)
<# | |
FUNCTIONS | |
#> | |
# Dot-source the logging helper (presumably the source of the logError / logInfo
# functions called in this script - they are not defined in this file).
# Prefer the copy sitting next to this script so the script also works when it is
# launched from another working directory; otherwise fall back to the current directory.
$loggingHelper = '.\helper.logging.ps1'
if ($PSScriptRoot) {
    $loggingHelperCandidate = Join-Path $PSScriptRoot 'helper.logging.ps1'
    if (Test-Path -LiteralPath $loggingHelperCandidate -PathType Leaf) {
        $loggingHelper = $loggingHelperCandidate
    }
}
. $loggingHelper
<#
    Reads a list file of PDF filenames and returns the ones that exist as files.

    $in          Text file to read. A line may hold several entries separated by '|'.
    $pathprefix  When non-empty, joined in front of every entry.
    $pathsnip    When non-empty, every occurrence of this fragment is removed from
                 each entry (case-insensitive) before $pathprefix is applied.

    Writes a "Found" / "Not Found" line to the host for every entry, reports
    progress, and logs (via logError, using the script-level $logsrc) any entry
    whose validation throws. Returns the paths that exist as files.
#>
function getValidPaths([string]$in, [string]$pathprefix, [string]$pathsnip) {
    # Split multi-entry lines, trim each entry BEFORE it is joined to the prefix
    # (whitespace around '|' would otherwise end up inside the path), drop blanks.
    $entries = @(Get-Content $in |
        ForEach-Object { $_.Split('|') } |
        ForEach-Object { $_.Trim() } |
        Where-Object { $_ -ne '' })

    $total = $entries.Count
    $i = 0
    $validPaths = New-Object 'System.Collections.Generic.List[string]'

    # Regex-based replace instead of String.Replace(string, string, StringComparison):
    # that overload is missing on .NET Framework (Windows PowerShell 5.1).
    $snipPattern = $null
    if ($pathsnip) {
        $snipPattern = [regex]::Escape($pathsnip)
    }
    $snipOptions = [System.Text.RegularExpressions.RegexOptions]'IgnoreCase, CultureInvariant'

    foreach ($entry in $entries) {
        $i++
        $path = $entry
        try {
            if ($snipPattern) {
                $path = [regex]::Replace($path, $snipPattern, '', $snipOptions).Trim()
            }
            if ($pathprefix) {
                $path = Join-Path $pathprefix $path
            }

            if (Test-Path $path -PathType Leaf -ErrorAction Stop) {
                Write-Host "$path Found"
                $validPaths.Add($path)
            }
            else {
                Write-Host "$path Not Found"
            }
        }
        catch [Exception] {
            $msg = "Error occurred while validating $path. $($_.Exception)"
            logError $logsrc $msg
        }
        finally {
            # update progress display, also for entries that failed
            Write-Progress -Activity "Validating file paths..." -Status "Processing $i of $total" -PercentComplete (($i / $total) * 100)
        }
    }

    return $validPaths.ToArray()
}
<# | |
MAIN | |
#> | |
# Normalizes a mirror root: resolves it to a full path when it exists and drops
# trailing separators, so it can be compared against FileInfo.DirectoryName.
function getMirrorRoot([string]$path) {
    if (-not $path) { return $path }
    $resolved = Resolve-Path -LiteralPath $path -ErrorAction SilentlyContinue
    if ($resolved) { $path = $resolved.ProviderPath }
    $trimmed = $path.TrimEnd('\', '/')
    if ($trimmed) { return $trimmed }
    return $path
}

# Returns the part of $dir below $root; when $dir is not under $root, returns
# $dir without its drive qualifier so the result can still be joined under $outdir.
function getMirrorSubdir([string]$dir, [string]$root) {
    $ordinalIgnoreCase = [System.StringComparison]::OrdinalIgnoreCase
    if ($root -and $dir.StartsWith($root, $ordinalIgnoreCase)) {
        $rest = $dir.Substring($root.Length)
        # only treat it as a prefix match on a directory boundary (C:\docs vs C:\docs2)
        if (($rest -eq '') -or ($rest[0] -eq '\') -or ($rest[0] -eq '/')) {
            return $rest.TrimStart('\', '/')
        }
    }
    return (Split-Path -Path $dir -NoQualifier).TrimStart('\', '/')
}

# Collect the PDFs to process and the root their mirror paths are relative to.
$files = @()
$mirrorRoot = ""
if ($indir) {
    # Container mode: every PDF below $indir
    $mirrorRoot = getMirrorRoot $indir
    $files = @(Get-ChildItem "$mirrorRoot\*" -Include *.pdf -Recurse)
    $pathprefix = $indir
}
if ($in) {
    # Leaf mode: only the listed PDFs that exist
    $mirrorRoot = getMirrorRoot $pathprefix
    $listedPaths = @(getValidPaths $in $pathprefix $pathsnip)
    if ($listedPaths.Count -gt 0) {
        $files = @(Get-ChildItem $listedPaths -Include *.pdf)
    }
}

$total = $files.Count
$i = 0
$e = 0
$skip = 0
foreach ($file in $files) {
    $i++
    $status = "Processing $i of $total."
    Write-Progress -Activity "Extracting PDF text..." -Status $status -PercentComplete (($i / $total) * 100)
    try {
        # txt file path => [base out dir] + [pdf parent dir relative to the mirror root] + .txt
        $subdir = getMirrorSubdir $file.DirectoryName $mirrorRoot
        $txtdir = $outdir
        if ($subdir) {
            $txtdir = Join-Path $outdir $subdir
        }
        $txtpath = [System.IO.Path]::Combine($txtdir, $("{0}.txt" -f $file.BaseName))

        # if txt already exists and timestamp is newer than pdf, skip (unless -force is active)
        $update = $true
        if ((-not $force) -and (Test-Path -LiteralPath $txtpath)) {
            $txtfile = Get-Item -LiteralPath $txtpath
            if ($txtfile.LastWriteTime -gt $file.LastWriteTime) {
                $update = $false
            }
        }

        if (-not $update) {
            $skip++
            $status = $status + " (Skipping - $($file.FullName) doesn't need update.)"
            Write-Progress -Activity "Extracting PDF text..." -Status $status -PercentComplete (($i / $total) * 100)
            continue
        }

        # extract text to txt
        if (-not (Test-Path -LiteralPath $txtdir)) {
            mkdir $txtdir | Out-Null
        }
        & $poppler -eol unix -raw -enc UTF-8 $file.FullName $txtpath
        # a native command does not throw on failure; surface it so it is counted and logged
        if ($LASTEXITCODE -ne 0) {
            throw "pdftotext exited with code $LASTEXITCODE."
        }

        # fix filenames mangled by poppler (writes filename diacritics with Windows-1252 instead of UTF-8)
        $txtpath_1252 = [System.Text.Encoding]::GetEncoding(1252).GetString([System.Text.Encoding]::UTF8.GetBytes($txtpath))
        if (($txtpath -ne $txtpath_1252) -and (Test-Path -LiteralPath $txtpath_1252)) {
            # -Force replaces a stale mirror file left by an earlier run
            Move-Item -LiteralPath $txtpath_1252 -Destination $txtpath -Force
        }
    }
    catch [Exception] {
        $e++
        $msg = "Error occurred while processing $($file.FullName). $($_.Exception)"
        logError $logsrc $msg
    }
}

# final report ("file" is pluralized on the processed count, "error" on the error count)
$fileWord = if ($i -eq 1) {"file"} else {"files"}
$errorWord = if ($e -eq 1) {"error"} else {"errors"}
$msg = "PDF text extraction finished processing $i {0} to $outdir. $skip skipped, $e {1}." -f $fileWord, $errorWord
logInfo $logsrc $msg
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment