Created
May 14, 2023 08:00
-
-
Save SKaplanOfficial/c40ab8e3b86338dd6dd941d06cac2565 to your computer and use it in GitHub Desktop.
JXA script to extract text from a video file by analyzing individual frames with the Vision framework.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(() => {
  // Extract on-screen text from a video file by sampling up to `frameCount`
  // frames and running the Vision framework's text recognizer (OCR) on each.
  // Returns an array of unique recognized strings, or "" when the file has
  // no video track. Requires the JXA ObjC bridge (`ObjC`, `$`).
  ObjC.import("objc");
  ObjC.import("CoreMedia");
  ObjC.import("Foundation");
  ObjC.import("AVFoundation");
  ObjC.import("Vision");
  ObjC.import("AppKit");

  // Load the video file
  const assetURL = $.NSURL.fileURLWithPath(
    "/Users/exampleUser/Downloads/example.mov"
  );
  const asset = $.objc_getClass("AVAsset").assetWithURL(assetURL);

  // Ensure the video has a video track
  if (asset.tracksWithMediaType($.AVMediaTypeVideo).count == 0) {
    return "";
  }

  const frameCount = 15; // The number of frames to analyze

  // Set up the AVAssetReader for reading the video frames into pixel buffers
  const reader = $.objc_getClass("AVAssetReader").alloc.initWithAssetError(
    asset,
    null
  );
  const track = asset.tracksWithMediaType($.AVMediaTypeVideo).objectAtIndex(0);
  const settings = $.NSDictionary.dictionaryWithObjectForKey(
    "420v",
    "PixelFormatType"
  );
  // FIX: was an implicit global (`readerOutput = ...`); declared with const.
  const readerOutput = $.objc_getClass(
    "AVAssetReaderTrackOutput"
  ).alloc.initWithTrackOutputSettings(track, settings);
  reader.addOutput(readerOutput);
  // Zero-argument ObjC methods are invoked via property access in JXA.
  reader.startReading;

  // Read up to `frameCount` sample buffers from the track.
  // FIX: was an implicit global (`samples = ...`) with a dead `Ref()` init,
  // and it pushed the buffer unconditionally — copyNextSampleBuffer returns
  // nil at end-of-stream (the reader then leaves the Reading state), so the
  // original could feed a nil buffer to CMSampleBufferGetImageBuffer below.
  const samples = [];
  while (
    samples.length < frameCount &&
    reader.status != $.AVAssetReaderStatusCompleted &&
    reader.status != $.AVAssetReaderStatusFailed
  ) {
    const buf = readerOutput.copyNextSampleBuffer;
    // Only keep the buffer while the reader is still actively reading;
    // after a nil return the status moves to Completed/Failed.
    if (reader.status == $.AVAssetReaderStatusReading) {
      samples.push(buf);
    }
  }

  const texts = []; // The unique texts found in the video

  // Analyze each sample (iterated newest-first, matching the original order)
  for (let i = 0; i < samples.length; i++) {
    // Convert the sample buffer's pixel data to a Core Video pixel buffer
    const imageBufferRef = ObjC.castRefToObject(
      $.CMSampleBufferGetImageBuffer(samples[samples.length - i - 1])
    );

    // Initialize the request handler and text request
    const requestHandler =
      $.VNImageRequestHandler.alloc.initWithCVPixelBufferOptions(
        imageBufferRef,
        $.NSDictionary.alloc.init
      );
    const textRequest = $.VNRecognizeTextRequest.alloc.init;

    // Perform the text request and get the results
    requestHandler.performRequestsError(ObjC.wrap([textRequest]), null);
    const textResults = textRequest.results;

    // Get the top candidate string from each observation.
    // FIX: inner index renamed from `i` to `j` — it shadowed the outer loop.
    const sampleTexts = [];
    for (let j = 0; j < textResults.count; j++) {
      const observation = textResults.objectAtIndex(j);
      const observationText = observation.topCandidates(1).objectAtIndex(0)
        .string.js;
      sampleTexts.push(observationText);
    }

    // Add this sample's texts to the overall list, de-duplicated
    sampleTexts.forEach((text) => {
      if (!texts.includes(text)) {
        texts.push(text);
      }
    });
  }

  // Return the unique texts found in the video
  return texts;
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment