Created
March 11, 2025 23:49
Process photos for a digital frame using face aware cropping and image embeddings to pair similar vertical photos.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using SixLabors.ImageSharp; | |
using SixLabors.ImageSharp.Processing; | |
using SixLabors.ImageSharp.PixelFormats; | |
using SixLabors.ImageSharp.Formats.Jpeg; | |
using FaceAiSharp; | |
using System.Numerics.Tensors; | |
using Microsoft.ML.OnnxRuntime; | |
using Microsoft.ML.OnnxRuntime.Tensors; | |
// Target display geometry for the digital photo frame.
const string InputFolder = @"D:\photo-frame\input";
const string OutputFolder = @"D:\photo-frame\output";
const int OutputWidth = 1366;
const int OutputHeight = 768;
const float OutputAspectRatio = (float) OutputHeight / OutputWidth;

// Monotonic counter backing GetNextFileName().
int fileNumber = 0;

Console.WriteLine("photo-frame-process");

// Heavyweight helpers created once up front: CLIP embedder and face detector.
ImageEmbedder imageEmbedder = new ImageEmbedder();
var faceDetector = FaceAiSharpBundleFactory.CreateFaceDetectorWithLandmarks();
Random rng = new Random();

// Vertical photos are deferred here (path + embedding) and paired up later.
List<MetaImage> metaImages = new List<MetaImage>();

// Collect every supported image under the input folder, recursively.
DirectoryInfo di = new DirectoryInfo(InputFolder);
string[] extensions = { "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp" };
List<FileInfo> files = new();
foreach (string pattern in extensions)
{
    files.AddRange(di.GetFiles(pattern, SearchOption.AllDirectories));
}
// First pass: classify each photo and emit everything that doesn't need pairing.
foreach (FileInfo file in files)
{
    // debug single image
    //if (file.Name != "IMG_1023.JPG") { continue; }
    Console.Write(file.Name);
    using (Image<Rgb24> image = Image.Load<Rgb24>(file.FullName))
    {
        // Respect EXIF orientation before measuring width/height.
        image.Mutate(i => i.AutoOrient());
        float aspect = (float) image.Height / image.Width;
        // Happiest path - (near) matching aspect ratio. Exact float equality almost
        // never holds after division, so compare within a small tolerance instead.
        if (Math.Abs(aspect - OutputAspectRatio) < 0.001f)
        {
            Console.WriteLine(" - same aspect ratio");
            image.Mutate(x => x.Resize(OutputWidth, OutputHeight));
            image.SaveAsJpeg(GetNextFileName(), new JpegEncoder { Quality = 90 });
            continue;
        }
        // Vertical image: record its embedding now, pair it with a partner later.
        if (image.Width < image.Height)
        {
            Console.WriteLine(" - vertical image");
            metaImages.Add(new MetaImage(file.FullName, imageEmbedder.Embed(file.FullName)));
            continue;
        }
        // Horizontal image: crop around detected faces, then scale to the frame.
        Console.WriteLine(" - horizontal image");
        try
        {
            FaceAwareResize(image, OutputWidth, OutputHeight);
            image.SaveAsJpeg(GetNextFileName(), new JpegEncoder { Quality = 90 });
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error processing {file.FullName}: {ex.Message}");
        }
    }
}
// Second pass: combine the deferred vertical photos two at a time, pairing each
// randomly-chosen image with its most visually similar partner (CLIP cosine similarity).
while (metaImages.Count > 0)
{
    if (metaImages.Count == 1)
    {
        // One left-over image with no partner; just resize it as best as possible.
        MetaImage metaImage = metaImages[0];
        metaImages.Remove(metaImage);
        Console.WriteLine($"{Path.GetFileName(metaImage.ImagePath)} - single vertical image");
        using (Image<Rgb24> image = Image.Load<Rgb24>(metaImage.ImagePath))
        {
            try
            {
                image.Mutate(i => i.AutoOrient());
                FaceAwareResize(image, OutputWidth, OutputHeight);
                image.SaveAsJpeg(GetNextFileName(), new JpegEncoder { Quality = 90 });
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error processing {metaImage.ImagePath}: {ex.Message}");
            }
        }
        break;
    }
    // Find two vertical images to combine: pick one at random, then scan for the
    // remaining image with the highest embedding cosine similarity.
    MetaImage meta1 = metaImages[rng.Next(metaImages.Count)];
    MetaImage? meta2 = null;
    float bestSimilarity = float.MinValue;
    foreach (MetaImage candidate in metaImages)
    {
        if (candidate == meta1)
        {
            continue;
        }
        float similarity = TensorPrimitives.CosineSimilarity(meta1.Embedding, candidate.Embedding);
        if (similarity > bestSimilarity)
        {
            bestSimilarity = similarity;
            meta2 = candidate;
        }
    }
    if (meta2 == null)
    {
        // Unreachable while Count >= 2, but keeps the flow analysis honest below.
        throw new InvalidOperationException("No second image found");
    }
    metaImages.Remove(meta1);
    metaImages.Remove(meta2);
    Console.WriteLine($"{Path.GetFileName(meta1.ImagePath)} - vertical image paired with {Path.GetFileName(meta2.ImagePath)}");
    try
    {
        using (Image<Rgb24> image1 = Image.Load<Rgb24>(meta1.ImagePath))
        using (Image<Rgb24> image2 = Image.Load<Rgb24>(meta2.ImagePath))
        {
            image1.Mutate(i => i.AutoOrient());
            image2.Mutate(i => i.AutoOrient());
            // Each image fills one half of the frame, side by side.
            FaceAwareResize(image1, OutputWidth / 2, OutputHeight);
            FaceAwareResize(image2, OutputWidth / 2, OutputHeight);
            using (Image<Rgb24> combinedImage = new Image<Rgb24>(OutputWidth, OutputHeight))
            {
                combinedImage.Mutate(x => x.DrawImage(image1, new Point(0, 0), 1f));
                combinedImage.Mutate(x => x.DrawImage(image2, new Point(OutputWidth / 2, 0), 1f));
                combinedImage.SaveAsJpeg(GetNextFileName(), new JpegEncoder { Quality = 90 });
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error processing {meta1.ImagePath} and {meta2.ImagePath}: {ex.Message}");
    }
}

Console.WriteLine("photo-frame-process done!");
// utility below | |
// Crops the image to the target aspect ratio, keeping detected faces inside the
// crop window, then resizes to exactly width x height. Mutates `image` in place.
// With no faces detected, the crop is centered on the middle of the image.
void FaceAwareResize(Image<Rgb24> image, int width, int height)
{
    // Union of all detected face boxes; null when no faces are found.
    RectangleF? detectRect = null;
    var faces = faceDetector.DetectFaces(image);
    foreach (var face in faces)
    {
        detectRect = detectRect.HasValue
            ? RectangleF.Union(detectRect.Value, face.Box)
            : face.Box;
    }
    // Fallback: a tiny rectangle anchored at the image center.
    RectangleF coreRect = detectRect ?? new RectangleF(image.Width / 2.0f, image.Height / 2.0f, 0.1f, 0.1f);
    PointF center = new PointF(coreRect.X + coreRect.Width / 2.0f, coreRect.Y + coreRect.Height / 2.0f);

    float targetAspectRatio = (float) width / height;
    float imageAspectRatio = (float) image.Width / image.Height;

    // Positions a window of targetLen centered on centerPos, clamped to [0, imageLen],
    // then truncates to ints and trims any rounding overshoot past the image edge.
    static (int Offset, int Length) ClampSpan(float centerPos, float targetLen, int imageLen)
    {
        float pos = centerPos - targetLen / 2.0f;
        if (pos < 0)
        {
            pos = 0;
        }
        else if (pos + targetLen > imageLen)
        {
            pos = imageLen - targetLen;
        }
        int intPos = (int) pos;
        int intLen = (int) targetLen;
        int extra = imageLen - (intPos + intLen);
        if (extra < 0)
        {
            intLen += extra;
        }
        return (intPos, intLen);
    }

    if (targetAspectRatio >= imageAspectRatio)
    {
        // Image is taller than the target: keep full width, pick the best Y band.
        float targetHeight = image.Width * ((float) height / width);
        var (y, cropHeight) = ClampSpan(center.Y, targetHeight, image.Height);
        image.Mutate(x => x.Crop(new Rectangle(0, y, image.Width, cropHeight)));
    }
    else
    {
        // Image is wider than the target: keep full height, pick the best X band.
        float targetWidth = image.Height * ((float) width / height);
        var (cropX, cropWidth) = ClampSpan(center.X, targetWidth, image.Width);
        image.Mutate(x => x.Crop(new Rectangle(cropX, 0, cropWidth, image.Height)));
    }

    // Scale the (now correctly-proportioned) crop to the exact output dimensions.
    image.Mutate(x => x.Resize(new ResizeOptions
    {
        Size = new Size(width, height)
    }));
}
// Returns the next sequential output path, e.g. "...\00000001.jpg".
string GetNextFileName()
{
    string name = (++fileNumber).ToString("D8") + ".jpg";
    return Path.Combine(OutputFolder, name);
}
/// <summary>
/// Associates a vertical photo's file path with its CLIP embedding so the
/// pairing pass can match it against similar photos. Intentionally a class
/// (reference equality) because instances are compared with ==.
/// </summary>
internal class MetaImage
{
    public MetaImage(string imagePath, float[] embedding)
    {
        ImagePath = imagePath;
        Embedding = embedding;
    }

    /// <summary>Full path to the source image file.</summary>
    public string ImagePath { get; set; }

    /// <summary>CLIP image embedding used for cosine-similarity pairing.</summary>
    public float[] Embedding { get; set; }
}
/// <summary>
/// Embeds an image using the OpenAI CLIP model (ViT-B/32 image tower, ONNX).
/// Downloads the model on first use. See https://github.com/bartbroere/clip.dll/blob/master/Program.cs
/// </summary>
internal class ImageEmbedder : IDisposable
{
    private const string ModelFileName = "clip-image-vit-32-float32.onnx";
    private const string ModelUrl = "https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-float32.onnx";

    private readonly InferenceSession _model;

    public ImageEmbedder()
    {
        if (!File.Exists(ModelFileName))
        {
            // Synchronous download is acceptable here: one-time startup cost in a console app.
            using HttpClient httpClient = new HttpClient();
            using var response = httpClient.GetAsync(ModelUrl).GetAwaiter().GetResult();
            // Without this check an HTML error page could be saved as the .onnx file.
            response.EnsureSuccessStatusCode();
            using var fs = new FileStream(ModelFileName, FileMode.Create);
            response.Content.ReadAsStream().CopyTo(fs);
        }
        _model = new InferenceSession(ModelFileName);
    }

    /// <summary>Loads the image at <paramref name="imagePath"/> and returns its CLIP embedding.</summary>
    public float[] Embed(string imagePath)
    {
        // Dispose the temporary image; the original code leaked it.
        using var image = Image.Load<Rgb24>(File.ReadAllBytes(imagePath));
        return Embed(image);
    }

    /// <summary>
    /// Returns the CLIP embedding for <paramref name="image"/>.
    /// NOTE: mutates the supplied image in place (center-crop + resize to 224x224).
    /// </summary>
    public float[] Embed(Image<Rgb24> image)
    {
        // Center-crop to a square, then resize to CLIP's expected 224x224 input.
        var smallestSide = Math.Min(image.Width, image.Height);
        image.Mutate(x => x.Crop(
            new Rectangle(
                (image.Width - smallestSide) / 2,
                (image.Height - smallestSide) / 2,
                smallestSide,
                smallestSide
            )));
        image.Mutate(x => x.Resize(224, 224));

        var inputTensor = new DenseTensor<float>(new[] { 1, 3, 224, 224 });
        for (var x = 0; x < 224; x++)
        {
            for (var y = 0; y < 224; y++)
            {
                // Normalize from bytes (0-255) to floats (constants borrowed from CLIP repository)
                inputTensor[0, 0, y, x] = Convert.ToSingle((((float) image[x, y].R / 255) - 0.48145466) / 0.26862954);
                inputTensor[0, 1, y, x] = Convert.ToSingle((((float) image[x, y].G / 255) - 0.4578275 ) / 0.26130258);
                inputTensor[0, 2, y, x] = Convert.ToSingle((((float) image[x, y].B / 255) - 0.40821073) / 0.27577711);
            }
        }

        var inputs = new List<NamedOnnxValue> { NamedOnnxValue.CreateFromTensor("input", inputTensor) };
        // Run() returns a disposable collection holding native buffers; dispose it
        // after copying the output to a managed array (the original leaked it).
        using var results = _model.Run(inputs);
        return results.Last().AsTensor<float>().ToArray();
    }

    /// <summary>Releases the native ONNX Runtime session.</summary>
    public void Dispose() => _model.Dispose();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment