Last active
July 4, 2021 20:08
-
-
Save mixio/b9c4d97ae556fe6d9d16a6c3fb30f43a to your computer and use it in GitHub Desktop.
This sample script shows how to process a directory of PDF files: 1) extract each PDF's text; 2) create a BBEdit document from that text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- | |
use AppleScript version "2.7"
use framework "Foundation"
use framework "Quartz" -- PDFKit (PDFDocument) lives inside the Quartz umbrella framework
use scripting additions
-- | |
property NSString : class "NSString" | |
property NSURL : class "NSURL" | |
property PDFDocument : class "PDFDocument" | |
property NSCharacterSet : class "NSCharacterSet" | |
-- | |
property pPdfsFolder : "/absolute/path/to/pdfs_folder/" | |
property pReadyFolder : "/absolute/path/to/ready_folder/" | |
property pProcessedFolder : "/absolute/Path/to/processed_folder/" | |
property pPathToTextFactory : "/absolute/path/to/text_factory" | |
-- | |
-- Extracts the plain text of a PDF file.
--   aFile: an alias (or any value whose string form `POSIX path of` accepts)
--          pointing at the PDF to read.
--   Returns: the document's text as AppleScript text, with leading and trailing
--            whitespace/newlines removed; "" when the file cannot be opened as a
--            PDF or when PDFKit reports no text for it.
-- PDFDocument is a PDFKit class, so the script must load PDFKit (e.g. through
-- the Quartz umbrella framework) in addition to Foundation.
on extractTextFromPdf(aFile)
	set vPdfFilePath to POSIX path of (aFile as string)
	set vPdfURL to NSURL's fileURLWithPath:vPdfFilePath
	set vPdfDocument to PDFDocument's alloc()'s initWithURL:vPdfURL
	-- initWithURL: gives back missing value for unreadable or non-PDF files;
	-- sending it further messages would raise an error, so bail out early.
	if vPdfDocument is missing value then return ""
	set vRawString to vPdfDocument's |string|()
	-- The string property is nullable; treat "no text" like an empty document.
	if vRawString is missing value then return ""
	set vWhitespace to NSCharacterSet's whitespaceAndNewlineCharacterSet()
	set vPdfString to vRawString's stringByTrimmingCharactersInSet:vWhitespace
	return vPdfString as string
end extractTextFromPdf
-- | |
-- Turns arbitrary text into a string that is safe to use as a file name.
--   aFilename: the candidate name; it is coerced to text first.
--   Returns: the name with the characters " / \ ? % * : | < > $ removed and
--            surrounding whitespace/newlines trimmed. The result may be ""
--            when nothing usable remains, so callers should check for that.
on sanitizeFileName(aFilename)
	set vFileName to aFilename as string
	-- Conservative cap so that a very long first paragraph cannot produce a name
	-- beyond the file system's name-length limit (255 units on common macOS
	-- volumes), leaving room for an extension. This is a character count, not a
	-- byte-exact guarantee for non-ASCII text.
	set vMaxLength to 200
	if (count of vFileName) > vMaxLength then
		set vFileName to text 1 thru vMaxLength of vFileName
	end if
	set vIllegalCharacters to NSCharacterSet's characterSetWithCharactersInString:"\"/\\?%*:|<>$"
	set vWhitespace to NSCharacterSet's whitespaceAndNewlineCharacterSet()
	set vNSString to NSString's stringWithString:vFileName
	-- Splitting on the illegal set and re-joining with "" deletes those characters.
	set vPieces to vNSString's componentsSeparatedByCharactersInSet:vIllegalCharacters
	set vStripped to vPieces's componentsJoinedByString:""
	-- Trim last: removing a character can expose whitespace at either end
	-- (e.g. "title :" becomes "title "), which must not end up in the name.
	set vSanitized to vStripped's stringByTrimmingCharactersInSet:vWhitespace
	return vSanitized as string
end sanitizeFileName
-- | |
-- Batch driver. For every PDF found directly inside pPdfsFolder:
--   1. extract its text with extractTextFromPdf;
--   2. derive a file name from the first paragraph of that text;
--   3. create a BBEdit text document with the text, saved as
--      "<name>.txt" in pProcessedFolder, and run the text factory at
--      pPathToTextFactory on it;
--   4. move the source PDF into pReadyFolder.
-- PDFs with no extractable text, or whose first paragraph sanitizes to an
-- empty name, are left where they are.

-- Make sure both destination folders exist before aliases are taken of them.
do shell script ("mkdir -p" & space & the quoted form of pReadyFolder)
do shell script ("mkdir -p" & space & the quoted form of pProcessedFolder)
set vPdfsFolderAlias to (POSIX file (pPdfsFolder as string)) as alias
set vReadyFolderAlias to (POSIX file (pReadyFolder as string)) as alias
set vProcessedFolderAlias to (POSIX file (pProcessedFolder as string)) as alias

-- The POSIX path of a folder alias ends with "/", so joining item names onto it
-- does not depend on whether pPdfsFolder was written with a trailing slash.
set vPdfsFolderPath to POSIX path of vPdfsFolderAlias

-- `list folder` is deprecated in Standard Additions; ask System Events for the
-- item names instead.
tell application "System Events"
	set vFileNames to name of every disk item of folder vPdfsFolderPath
end tell

repeat with vFileNameRef in vFileNames
	-- The loop variable is a reference into the list; take its value before
	-- using it in string concatenation.
	set vFileName to contents of vFileNameRef
	set vPdfFileAlias to (POSIX file (vPdfsFolderPath & vFileName)) as alias
	tell application "System Events"
		set vExtension to name extension of vPdfFileAlias
		set vTypeIdentifier to type identifier of vPdfFileAlias
	end tell
	-- Accept the item when either its extension or its type identifier marks it as a PDF.
	if vExtension is "pdf" or vTypeIdentifier ends with "pdf" then
		set vDocumentContents to my extractTextFromPdf(vPdfFileAlias)
		if vDocumentContents ≠ "" then
			-- The first paragraph of the text doubles as the output file's name.
			set vDocumentName to first paragraph of vDocumentContents
			set vDocumentName to my sanitizeFileName(vDocumentName)
			if vDocumentName ≠ "" then
				set vProcessedFile to (vProcessedFolderAlias as string) & vDocumentName & ".txt"
				log vDocumentName
				log vProcessedFile
				tell application "BBEdit"
					set vDocument to make new text document at beginning with properties {text:vDocumentContents} initial save location vProcessedFile
					save vDocument
					apply text factory pPathToTextFactory to vDocument saving yes
					close vDocument
				end tell
				-- Only PDFs that produced a text document are moved out of the inbox.
				tell application "Finder"
					move vPdfFileAlias to vReadyFolderAlias
				end tell
			end if
		end if
	end if
end repeat
-- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment