Created
May 14, 2023 08:00
-
-
Save SKaplanOfficial/c40ab8e3b86338dd6dd941d06cac2565 to your computer and use it in GitHub Desktop.
JXA script to extract text from a video file by analyzing individual frames with the Vision framework.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(() => {
  // Extract on-screen text from a video file by sampling up to `frameCount`
  // frames and running the Vision framework's text recognizer (OCR) on each.
  // Returns an array of unique recognized strings, or "" when the file has
  // no video track. Requires the JXA ObjC bridge (`ObjC`, `$`).
  ObjC.import("objc");
  ObjC.import("CoreMedia");
  ObjC.import("Foundation");
  ObjC.import("AVFoundation");
  ObjC.import("Vision");
  ObjC.import("AppKit");

  // Load the video file
  const assetURL = $.NSURL.fileURLWithPath(
    "/Users/exampleUser/Downloads/example.mov"
  );
  const asset = $.objc_getClass("AVAsset").assetWithURL(assetURL);

  // Ensure the video has a video track
  if (asset.tracksWithMediaType($.AVMediaTypeVideo).count == 0) {
    return "";
  }

  const frameCount = 15; // The number of frames to analyze

  // Set up the AVAssetReader for reading the video frames into pixel buffers
  const reader = $.objc_getClass("AVAssetReader").alloc.initWithAssetError(
    asset,
    null
  );
  const track = asset.tracksWithMediaType($.AVMediaTypeVideo).objectAtIndex(0);
  const settings = $.NSDictionary.dictionaryWithObjectForKey(
    "420v",
    "PixelFormatType"
  );
  // FIX: was an implicit global (`readerOutput = ...`); declared with const.
  const readerOutput = $.objc_getClass(
    "AVAssetReaderTrackOutput"
  ).alloc.initWithTrackOutputSettings(track, settings);
  reader.addOutput(readerOutput);
  // Zero-argument ObjC methods are invoked via property access in JXA.
  reader.startReading;

  // Read up to `frameCount` sample buffers from the track.
  // FIX: was an implicit global (`samples = ...`) with a dead `Ref()` init,
  // and it pushed the buffer unconditionally — copyNextSampleBuffer returns
  // nil at end-of-stream (the reader then leaves the Reading state), so the
  // original could feed a nil buffer to CMSampleBufferGetImageBuffer below.
  const samples = [];
  while (
    samples.length < frameCount &&
    reader.status != $.AVAssetReaderStatusCompleted &&
    reader.status != $.AVAssetReaderStatusFailed
  ) {
    const buf = readerOutput.copyNextSampleBuffer;
    // Only keep the buffer while the reader is still actively reading;
    // after a nil return the status moves to Completed/Failed.
    if (reader.status == $.AVAssetReaderStatusReading) {
      samples.push(buf);
    }
  }

  const texts = []; // The unique texts found in the video

  // Analyze each sample (iterated newest-first, matching the original order)
  for (let i = 0; i < samples.length; i++) {
    // Convert the sample buffer's pixel data to a Core Video pixel buffer
    const imageBufferRef = ObjC.castRefToObject(
      $.CMSampleBufferGetImageBuffer(samples[samples.length - i - 1])
    );

    // Initialize the request handler and text request
    const requestHandler =
      $.VNImageRequestHandler.alloc.initWithCVPixelBufferOptions(
        imageBufferRef,
        $.NSDictionary.alloc.init
      );
    const textRequest = $.VNRecognizeTextRequest.alloc.init;

    // Perform the text request and get the results
    requestHandler.performRequestsError(ObjC.wrap([textRequest]), null);
    const textResults = textRequest.results;

    // Get the top candidate string from each observation.
    // FIX: inner index renamed from `i` to `j` — it shadowed the outer loop.
    const sampleTexts = [];
    for (let j = 0; j < textResults.count; j++) {
      const observation = textResults.objectAtIndex(j);
      const observationText = observation.topCandidates(1).objectAtIndex(0)
        .string.js;
      sampleTexts.push(observationText);
    }

    // Add this sample's texts to the overall list, de-duplicated
    sampleTexts.forEach((text) => {
      if (!texts.includes(text)) {
        texts.push(text);
      }
    });
  }

  // Return the unique texts found in the video
  return texts;
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment