@SKaplanOfficial
Created May 14, 2023 08:00
JXA script to extract text from a video file by analyzing individual frames with the Vision framework.
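To try it, save the script to a file and run it with osascript -l JavaScript path/to/script.js (the file name is up to you); the recognized lines of text are returned as the script's result. The video path near the top of the script is a placeholder, so substitute the path to your own file.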
(() => {
ObjC.import("objc");
ObjC.import("CoreMedia");
ObjC.import("Foundation");
ObjC.import("AVFoundation");
ObjC.import("Vision");
ObjC.import("AppKit");
  // Load the video file
  const assetURL = $.NSURL.fileURLWithPath(
    "/Users/exampleUser/Downloads/example.mov"
  );
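  // objc_getClass (made available by the "objc" import) is used below because
  // some AVFoundation classes do not reliably bridge through $.ClassName in JXA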
  const asset = $.objc_getClass("AVAsset").assetWithURL(assetURL);

  // Bail out (with an empty result) if the file has no video track
  if (asset.tracksWithMediaType($.AVMediaTypeVideo).count == 0) {
    return [];
  }

  const frameCount = 15; // The number of frames to analyze
  // Set up the AVAssetReader for reading the video frames into pixel buffers
  const reader = $.objc_getClass("AVAssetReader").alloc.initWithAssetError(
    asset,
    null
  );
  const track = asset.tracksWithMediaType($.AVMediaTypeVideo).objectAtIndex(0);
  // Request 420v (bi-planar YCbCr) pixel buffers; the pixel format must be
  // given as its numeric FourCC code, not as the string "420v"
  const settings = $.NSDictionary.dictionaryWithObjectForKey(
    $.NSNumber.numberWithUnsignedInt(875704438), // kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange ("420v")
    "PixelFormatType" // kCVPixelBufferPixelFormatTypeKey
  );
  const readerOutput = $.objc_getClass(
    "AVAssetReaderTrackOutput"
  ).alloc.initWithTrackOutputSettings(track, settings);
  reader.addOutput(readerOutput);
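  // In JXA, zero-argument ObjC methods are invoked by plain property access,
  // so the line below actually calls -[AVAssetReader startReading]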
  reader.startReading;
  // Read up to frameCount frames into sample buffers
  const samples = [];
  while (
    samples.length < frameCount &&
    reader.status == $.AVAssetReaderStatusReading
  ) {
    const buf = readerOutput.copyNextSampleBuffer;
    // When the reader runs out of frames (or fails), copyNextSampleBuffer
    // returns NULL and the status changes, so only keep the buffer if the
    // reader is still in the reading state
    if (reader.status == $.AVAssetReaderStatusReading) {
      samples.push(buf);
    }
  }
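  // Note: this takes the first frameCount decoded frames, not frames spread
  // across the whole video, so text that appears only later may be missed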
  const texts = []; // The texts found in the video

  // Analyze each sample, starting from the most recently read frame
  for (let i = 0; i < samples.length; i++) {
    // Get the frame's pixel buffer and cast the raw CVImageBufferRef to an
    // ObjC-bridged object that the Vision request handler can accept
    const imageBufferRef = ObjC.castRefToObject(
      $.CMSampleBufferGetImageBuffer(samples[samples.length - i - 1])
    );
    // Initialize the request handler and text recognition request
    const requestHandler =
      $.VNImageRequestHandler.alloc.initWithCVPixelBufferOptions(
        imageBufferRef,
        $.NSDictionary.alloc.init
      );
    const textRequest = $.VNRecognizeTextRequest.alloc.init;

    // Perform the text request (this call is synchronous) and get the results
    requestHandler.performRequestsError(ObjC.wrap([textRequest]), null);
    const textResults = textRequest.results;
    // Get the text from each observation
    const sampleTexts = [];
    for (let j = 0; j < textResults.count; j++) {
      const observation = textResults.objectAtIndex(j);
      const observationText = observation
        .topCandidates(1)
        .objectAtIndex(0).string.js;
      sampleTexts.push(observationText);
    }
    // Add this sample's texts to the overall list of texts
    sampleTexts.forEach((text) => {
      if (!texts.includes(text)) {
        texts.push(text);
      }
    });
  }

  // Return the texts found in the video
  return texts;
})();
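If accuracy matters more than speed, the request can be tuned before it is performed. A minimal sketch using standard VNRecognizeTextRequest properties (it would go right after the request is created above):

  const textRequest = $.VNRecognizeTextRequest.alloc.init;
  textRequest.recognitionLevel = $.VNRequestTextRecognitionLevelAccurate; // trade speed for accuracy
  textRequest.usesLanguageCorrection = true; // apply language-aware correction to candidates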