Skip to content

Instantly share code, notes, and snippets.

@mixio
Last active July 4, 2021 20:08
Show Gist options
  • Save mixio/b9c4d97ae556fe6d9d16a6c3fb30f43a to your computer and use it in GitHub Desktop.
Save mixio/b9c4d97ae556fe6d9d16a6c3fb30f43a to your computer and use it in GitHub Desktop.
This is a sample script showing how to process a directory of PDF files: 1) Extract each PDF's text. 2) Create a BBEdit document from that text.
--
use AppleScript version "2.7"
use scripting additions
use framework "Foundation"
use framework "Quartz"
--
-- Objective-C class references used by the handlers in this script.
property NSString : class "NSString"
property NSURL : class "NSURL"
property PDFDocument : class "PDFDocument"
property NSCharacterSet : class "NSCharacterSet"
--
-- Configuration: absolute POSIX paths. Replace the placeholders before running.
-- Folder scanned for PDF files. Keep the trailing "/": file names are appended directly to this path.
property pPdfsFolder : "/absolute/path/to/pdfs_folder/"
-- Folder each PDF is moved into once its text document has been created (created with mkdir -p if missing).
property pReadyFolder : "/absolute/path/to/ready_folder/"
-- Folder where the generated ".txt" documents are saved (created with mkdir -p if missing).
property pProcessedFolder : "/absolute/Path/to/processed_folder/"
-- Text factory handed to BBEdit's "apply text factory" command for every generated document.
property pPathToTextFactory : "/absolute/path/to/text_factory"
--
on extractTextFromPdf(aFile)
	-- Extract the text of a PDF file, with leading and trailing whitespace/newlines removed.
	--
	-- aFile: alias (or anything that coerces to an HFS path string) of the PDF to read.
	-- Returns: an AppleScript string; "" when the file cannot be opened as a PDF
	--          or when the document exposes no text.
	set vPdfFilePath to POSIX path of (aFile as string)
	set vPdfURL to NSURL's fileURLWithPath:vPdfFilePath
	set vPdfDocument to PDFDocument's alloc()'s initWithURL:vPdfURL
	-- initWithURL: yields nil (missing value) for unreadable or corrupt files;
	-- sending a message to it would raise, so report "no text" instead.
	if vPdfDocument is missing value then return ""
	set vRawString to vPdfDocument's |string|()
	-- The string property is nullable (e.g. no accessible text in the document).
	if vRawString is missing value then return ""
	set vPdfString to vRawString's stringByTrimmingCharactersInSet:(NSCharacterSet's whitespaceAndNewlineCharacterSet)
	return vPdfString as string
end extractTextFromPdf
--
on sanitizeFileName(aFilename)
	-- Turn arbitrary text into a string usable as a file name.
	--
	-- aFilename: any value that coerces to a string.
	-- Returns: an AppleScript string with surrounding whitespace/newlines trimmed
	--          and every character of the forbidden set removed.
	set vForbiddenSet to NSCharacterSet's characterSetWithCharactersInString:"\"/\\?%*:|<>$"
	set vBlankSet to NSCharacterSet's whitespaceAndNewlineCharacterSet()
	set vTrimmed to (NSString's stringWithString:(aFilename as string))'s stringByTrimmingCharactersInSet:vBlankSet
	-- Splitting on the forbidden characters and re-joining with "" deletes them.
	set vPieces to vTrimmed's componentsSeparatedByCharactersInSet:vForbiddenSet
	return (vPieces's componentsJoinedByString:"") as string
end sanitizeFileName
--
-- Main script: for every PDF in pPdfsFolder, extract its text, save it as a BBEdit
-- text document named after the first paragraph (in pProcessedFolder), run the text
-- factory on it, then move the PDF into pReadyFolder.
-- The script result is the list of file names that could not be processed.
do shell script ("mkdir -p" & space & the quoted form of pReadyFolder)
do shell script ("mkdir -p" & space & the quoted form of pProcessedFolder)
-- Guarantee a trailing "/" so that folder path & file name always forms a valid path.
set vPdfsFolderPath to pPdfsFolder as string
if vPdfsFolderPath does not end with "/" then set vPdfsFolderPath to vPdfsFolderPath & "/"
set vPdfsFolderAlias to (POSIX file (vPdfsFolderPath)) as alias
set vReadyFolderAlias to (POSIX file (pReadyFolder as string)) as alias
set vProcessedFolderAlias to (POSIX file (pProcessedFolder as string)) as alias
-- Skip invisible entries (.DS_Store, "._name.pdf" AppleDouble files, ...): they are never real PDFs.
set vFileNames to list folder vPdfsFolderAlias without invisibles
set vFailedFileNames to {}
repeat with vFileName in vFileNames
	-- The loop variable is a reference to a list item; resolve it to the plain name.
	set vCurrentName to contents of vFileName
	try
		set vPdfFileAlias to (POSIX file (vPdfsFolderPath & vCurrentName)) as alias
		tell application "System Events"
			set vExtension to name extension of vPdfFileAlias
			set vTypeIdentifier to type identifier of vPdfFileAlias
		end tell
		if vExtension is "pdf" or vTypeIdentifier ends with "pdf" then
			set vDocumentContents to my extractTextFromPdf(vPdfFileAlias)
			if vDocumentContents ≠ "" then
				-- The first paragraph of the extracted text becomes the document name.
				set vDocumentName to first paragraph of vDocumentContents
				set vDocumentName to my sanitizeFileName(vDocumentName)
				if vDocumentName ≠ "" then
					set vProcessedFile to (vProcessedFolderAlias as string) & vDocumentName & ".txt"
					log vDocumentName
					log vProcessedFile
					tell application "BBEdit"
						set vDocument to make new text document at beginning with properties {text:vDocumentContents} initial save location vProcessedFile
						save vDocument
						apply text factory pPathToTextFactory to vDocument saving yes
						close vDocument
					end tell
					-- Only move the PDF once its text document has been written.
					tell application "Finder"
						move vPdfFileAlias to vReadyFolderAlias
					end tell
				end if
			end if
		end if
	on error vErrorMessage number vErrorNumber
		-- A user cancel must still stop the whole run.
		if vErrorNumber is -128 then error number -128
		-- Any other failure affects this file only: record it and continue with the next one.
		log "Failed to process " & vCurrentName & ": " & vErrorMessage & " (" & vErrorNumber & ")"
		set end of vFailedFileNames to vCurrentName
	end try
end repeat
return vFailedFileNames
--
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment