For more context, see also my answer to the "Extract first page from multiple pdfs" question.
Last active
January 22, 2024 20:41
-
-
Save dogfuntom/ce06b4d60fe97d79e01d9564c1f5857b to your computer and use it in GitHub Desktop.
PowerShell script that uses Ghostscript to extract 1st page pictures from each PDF in a directory and its subdirectories
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Recursively for each PDF file make a 1st page JPEG nearby.
# Execute `$WhatIfPreference = 1` before running this script to dry run.
# (tip: a dry run doesn't require Ghostscript at all, so it's possible to simulate this script even before actually installing gs)
# Execute `$VerbosePreference = 'Continue'` before running this script to enable verbose output.
# Execute `$VerbosePreference = 'Inquire'` before running this script to be asked to confirm before each operation.
# (NOTE: the `$VerbosePreference = 'Inquire'` treats `[A] Yes to All` as simply `[Y] Yes` for some reason.)
# (Also any of the above can be copy-pasted right into this script itself but it's rather anti-idiomatic.)
# ---------------------------------------------------------------------------
# Settings. Edit these values before running the script.
# ---------------------------------------------------------------------------

# Where the PDFs are (reminder: this script is recursive but this can be changed easily, see below).
$path = 'Y:\our\P\ath'

# How to call Ghostscript (tip: it's fine to use the full path to the executable if the short name doesn't work).
$gs = 'gswin64c'

# Filter: only PDFs whose last write time is newer than this are processed.
$time = '2001-01-01'

# Filter: when $true, a PDF whose output picture already exists is skipped.
$skipExisting = $true

# Ghostscript output device and the file extension given to the generated picture.
# The easiest are both 'jpeg' or both 'png'. To learn more advanced options:
# https://ghostscript.readthedocs.io/en/latest/Devices.html#image-file-formats
$sDevice = 'jpeg'
$extension = 'jpeg'
# Find every PDF under $path and render its first page to a picture placed
# next to the PDF, named "1p <pdf base name>.<extension>".
#
# (Tip: remove -Recurse if lack of subdirectory processing is desired.
# -Filter is used instead of -Include because -Include yields no results
# without -Recurse unless the path ends in a wildcard.)
# -File keeps directories that happen to be named "*.pdf" out of the pipeline.
Get-ChildItem -Path $path -Recurse -File -Filter *.pdf |
    Where-Object -FilterScript {
        # -Filter is evaluated by the file system and can match more loosely
        # than the literal pattern (e.g. ".pdfx"), so pin the exact extension.
        # Then filter by last write time. Change to other kind of time as needed.
        ($_.Extension -eq '.pdf') -and ($_.LastWriteTime -gt $time)
    } |
    ForEach-Object {
        $pdf = $_
        $out = "$($pdf.DirectoryName)\1p $($pdf.BaseName).$extension"

        # -LiteralPath: file names may contain [ ] and other wildcard
        # characters, which -Path would interpret as a pattern.
        if ($skipExisting -and (Test-Path -LiteralPath $out -PathType Leaf)) {
            Write-Verbose "skip (already exists): $_ => $out"
            # Note that return in ForEach-Object acts like continue in foreach would
            # (i.e. it skips to the next iteration and does not exit fully).
            return
        }

        if ($WhatIfPreference) {
            # Dry run: report what would be done without calling Ghostscript.
            Write-Host "(dry run) $_ => $out"
            return
        }

        Write-Host "$_ => $out"

        # Ghostscript treats % in the output file name as a page-number format
        # specifier; a literal % has to be doubled.
        $gsOut = $out -replace '%', '%%'

        # Pass the full path explicitly instead of relying on how the FileInfo
        # object is converted to a string for a native command.
        & $gs -sDEVICE="$sDevice" -o $gsOut -dFirstPage=1 -dLastPage=1 $pdf.FullName

        # Surface conversion failures instead of silently moving on.
        if ($LASTEXITCODE -ne 0) {
            Write-Warning "Ghostscript exited with code $LASTEXITCODE for: $($pdf.FullName)"
        }
    }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment