Created
March 11, 2025 23:49
Process photos for a digital frame using face aware cropping and image embeddings to pair similar vertical photos.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using SixLabors.ImageSharp; | |
using SixLabors.ImageSharp.Processing; | |
using SixLabors.ImageSharp.PixelFormats; | |
using SixLabors.ImageSharp.Formats.Jpeg; | |
using FaceAiSharp; | |
using System.Numerics.Tensors; | |
using Microsoft.ML.OnnxRuntime; | |
using Microsoft.ML.OnnxRuntime.Tensors; | |
// Target display geometry for the digital photo frame.
const string InputFolder = @"D:\photo-frame\input";
const string OutputFolder = @"D:\photo-frame\output";
const int OutputWidth = 1366;
const int OutputHeight = 768;
const float OutputAspectRatio = (float) OutputHeight / OutputWidth;

// Monotonic counter backing GetNextFileName().
int fileNumber = 0;

Console.WriteLine("photo-frame-process");

// Heavyweight helpers created once up front: CLIP embedder and face detector.
ImageEmbedder imageEmbedder = new ImageEmbedder();
var faceDetector = FaceAiSharpBundleFactory.CreateFaceDetectorWithLandmarks();
Random rng = new Random();

// Vertical photos are deferred here (path + embedding) and paired up later.
List<MetaImage> metaImages = new List<MetaImage>();

// Collect every supported image under the input folder, recursively.
DirectoryInfo di = new DirectoryInfo(InputFolder);
string[] extensions = { "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp" };
List<FileInfo> files = new();
foreach (string pattern in extensions)
{
    files.AddRange(di.GetFiles(pattern, SearchOption.AllDirectories));
}
// First pass: classify each photo and emit everything that doesn't need pairing.
foreach (FileInfo file in files)
{
    // debug single image
    //if (file.Name != "IMG_1023.JPG") { continue; }
    Console.Write(file.Name);
    using (Image<Rgb24> image = Image.Load<Rgb24>(file.FullName))
    {
        // Respect EXIF orientation before measuring width/height.
        image.Mutate(i => i.AutoOrient());
        float aspect = (float) image.Height / image.Width;
        // Happiest path - (near) matching aspect ratio. Exact float equality almost
        // never holds after division, so compare within a small tolerance instead.
        if (Math.Abs(aspect - OutputAspectRatio) < 0.001f)
        {
            Console.WriteLine(" - same aspect ratio");
            image.Mutate(x => x.Resize(OutputWidth, OutputHeight));
            image.SaveAsJpeg(GetNextFileName(), new JpegEncoder { Quality = 90 });
            continue;
        }
        // Vertical image: record its embedding now, pair it with a partner later.
        if (image.Width < image.Height)
        {
            Console.WriteLine(" - vertical image");
            metaImages.Add(new MetaImage(file.FullName, imageEmbedder.Embed(file.FullName)));
            continue;
        }
        // Horizontal image: crop around detected faces, then scale to the frame.
        Console.WriteLine(" - horizontal image");
        try
        {
            FaceAwareResize(image, OutputWidth, OutputHeight);
            image.SaveAsJpeg(GetNextFileName(), new JpegEncoder { Quality = 90 });
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error processing {file.FullName}: {ex.Message}");
        }
    }
}
// Second pass: combine the deferred vertical photos two at a time, pairing each
// randomly-chosen image with its most visually similar partner (CLIP cosine similarity).
while (metaImages.Count > 0)
{
    if (metaImages.Count == 1)
    {
        // One left-over image with no partner; just resize it as best as possible.
        MetaImage metaImage = metaImages[0];
        metaImages.Remove(metaImage);
        Console.WriteLine($"{Path.GetFileName(metaImage.ImagePath)} - single vertical image");
        using (Image<Rgb24> image = Image.Load<Rgb24>(metaImage.ImagePath))
        {
            try
            {
                image.Mutate(i => i.AutoOrient());
                FaceAwareResize(image, OutputWidth, OutputHeight);
                image.SaveAsJpeg(GetNextFileName(), new JpegEncoder { Quality = 90 });
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error processing {metaImage.ImagePath}: {ex.Message}");
            }
        }
        break;
    }
    // Find two vertical images to combine: pick one at random, then scan for the
    // remaining image with the highest embedding cosine similarity.
    MetaImage meta1 = metaImages[rng.Next(metaImages.Count)];
    MetaImage? meta2 = null;
    float bestSimilarity = float.MinValue;
    foreach (MetaImage candidate in metaImages)
    {
        if (candidate == meta1)
        {
            continue;
        }
        float similarity = TensorPrimitives.CosineSimilarity(meta1.Embedding, candidate.Embedding);
        if (similarity > bestSimilarity)
        {
            bestSimilarity = similarity;
            meta2 = candidate;
        }
    }
    if (meta2 == null)
    {
        // Unreachable while Count >= 2, but keeps the flow analysis honest below.
        throw new InvalidOperationException("No second image found");
    }
    metaImages.Remove(meta1);
    metaImages.Remove(meta2);
    Console.WriteLine($"{Path.GetFileName(meta1.ImagePath)} - vertical image paired with {Path.GetFileName(meta2.ImagePath)}");
    try
    {
        using (Image<Rgb24> image1 = Image.Load<Rgb24>(meta1.ImagePath))
        using (Image<Rgb24> image2 = Image.Load<Rgb24>(meta2.ImagePath))
        {
            image1.Mutate(i => i.AutoOrient());
            image2.Mutate(i => i.AutoOrient());
            // Each image fills one half of the frame, side by side.
            FaceAwareResize(image1, OutputWidth / 2, OutputHeight);
            FaceAwareResize(image2, OutputWidth / 2, OutputHeight);
            using (Image<Rgb24> combinedImage = new Image<Rgb24>(OutputWidth, OutputHeight))
            {
                combinedImage.Mutate(x => x.DrawImage(image1, new Point(0, 0), 1f));
                combinedImage.Mutate(x => x.DrawImage(image2, new Point(OutputWidth / 2, 0), 1f));
                combinedImage.SaveAsJpeg(GetNextFileName(), new JpegEncoder { Quality = 90 });
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error processing {meta1.ImagePath} and {meta2.ImagePath}: {ex.Message}");
    }
}

Console.WriteLine("photo-frame-process done!");
// utility below | |
// Crops the image to the target aspect ratio, keeping detected faces inside the
// crop window, then resizes to exactly width x height. Mutates `image` in place.
// With no faces detected, the crop is centered on the middle of the image.
void FaceAwareResize(Image<Rgb24> image, int width, int height)
{
    // Union of all detected face boxes; null when no faces are found.
    RectangleF? detectRect = null;
    var faces = faceDetector.DetectFaces(image);
    foreach (var face in faces)
    {
        detectRect = detectRect.HasValue
            ? RectangleF.Union(detectRect.Value, face.Box)
            : face.Box;
    }
    // Fallback: a tiny rectangle anchored at the image center.
    RectangleF coreRect = detectRect ?? new RectangleF(image.Width / 2.0f, image.Height / 2.0f, 0.1f, 0.1f);
    PointF center = new PointF(coreRect.X + coreRect.Width / 2.0f, coreRect.Y + coreRect.Height / 2.0f);

    float targetAspectRatio = (float) width / height;
    float imageAspectRatio = (float) image.Width / image.Height;

    // Positions a window of targetLen centered on centerPos, clamped to [0, imageLen],
    // then truncates to ints and trims any rounding overshoot past the image edge.
    static (int Offset, int Length) ClampSpan(float centerPos, float targetLen, int imageLen)
    {
        float pos = centerPos - targetLen / 2.0f;
        if (pos < 0)
        {
            pos = 0;
        }
        else if (pos + targetLen > imageLen)
        {
            pos = imageLen - targetLen;
        }
        int intPos = (int) pos;
        int intLen = (int) targetLen;
        int extra = imageLen - (intPos + intLen);
        if (extra < 0)
        {
            intLen += extra;
        }
        return (intPos, intLen);
    }

    if (targetAspectRatio >= imageAspectRatio)
    {
        // Image is taller than the target: keep full width, pick the best Y band.
        float targetHeight = image.Width * ((float) height / width);
        var (y, cropHeight) = ClampSpan(center.Y, targetHeight, image.Height);
        image.Mutate(x => x.Crop(new Rectangle(0, y, image.Width, cropHeight)));
    }
    else
    {
        // Image is wider than the target: keep full height, pick the best X band.
        float targetWidth = image.Height * ((float) width / height);
        var (cropX, cropWidth) = ClampSpan(center.X, targetWidth, image.Width);
        image.Mutate(x => x.Crop(new Rectangle(cropX, 0, cropWidth, image.Height)));
    }

    // Scale the (now correctly-proportioned) crop to the exact output dimensions.
    image.Mutate(x => x.Resize(new ResizeOptions
    {
        Size = new Size(width, height)
    }));
}
// Returns the next sequential output path, e.g. "...\00000001.jpg".
string GetNextFileName()
{
    string name = (++fileNumber).ToString("D8") + ".jpg";
    return Path.Combine(OutputFolder, name);
}
/// <summary>
/// Associates a vertical photo's file path with its CLIP embedding so the
/// pairing pass can match it against similar photos. Intentionally a class
/// (reference equality) because instances are compared with ==.
/// </summary>
internal class MetaImage
{
    public MetaImage(string imagePath, float[] embedding)
    {
        ImagePath = imagePath;
        Embedding = embedding;
    }

    /// <summary>Full path to the source image file.</summary>
    public string ImagePath { get; set; }

    /// <summary>CLIP image embedding used for cosine-similarity pairing.</summary>
    public float[] Embedding { get; set; }
}
/// <summary>
/// Embeds an image using the OpenAI CLIP model (ViT-B/32 image tower, ONNX).
/// Downloads the model on first use. See https://github.com/bartbroere/clip.dll/blob/master/Program.cs
/// </summary>
internal class ImageEmbedder : IDisposable
{
    private const string ModelFileName = "clip-image-vit-32-float32.onnx";
    private const string ModelUrl = "https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-float32.onnx";

    private readonly InferenceSession _model;

    public ImageEmbedder()
    {
        if (!File.Exists(ModelFileName))
        {
            // Synchronous download is acceptable here: one-time startup cost in a console app.
            using HttpClient httpClient = new HttpClient();
            using var response = httpClient.GetAsync(ModelUrl).GetAwaiter().GetResult();
            // Without this check an HTML error page could be saved as the .onnx file.
            response.EnsureSuccessStatusCode();
            using var fs = new FileStream(ModelFileName, FileMode.Create);
            response.Content.ReadAsStream().CopyTo(fs);
        }
        _model = new InferenceSession(ModelFileName);
    }

    /// <summary>Loads the image at <paramref name="imagePath"/> and returns its CLIP embedding.</summary>
    public float[] Embed(string imagePath)
    {
        // Dispose the temporary image; the original code leaked it.
        using var image = Image.Load<Rgb24>(File.ReadAllBytes(imagePath));
        return Embed(image);
    }

    /// <summary>
    /// Returns the CLIP embedding for <paramref name="image"/>.
    /// NOTE: mutates the supplied image in place (center-crop + resize to 224x224).
    /// </summary>
    public float[] Embed(Image<Rgb24> image)
    {
        // Center-crop to a square, then resize to CLIP's expected 224x224 input.
        var smallestSide = Math.Min(image.Width, image.Height);
        image.Mutate(x => x.Crop(
            new Rectangle(
                (image.Width - smallestSide) / 2,
                (image.Height - smallestSide) / 2,
                smallestSide,
                smallestSide
            )));
        image.Mutate(x => x.Resize(224, 224));

        var inputTensor = new DenseTensor<float>(new[] { 1, 3, 224, 224 });
        for (var x = 0; x < 224; x++)
        {
            for (var y = 0; y < 224; y++)
            {
                // Normalize from bytes (0-255) to floats (constants borrowed from CLIP repository)
                inputTensor[0, 0, y, x] = Convert.ToSingle((((float) image[x, y].R / 255) - 0.48145466) / 0.26862954);
                inputTensor[0, 1, y, x] = Convert.ToSingle((((float) image[x, y].G / 255) - 0.4578275 ) / 0.26130258);
                inputTensor[0, 2, y, x] = Convert.ToSingle((((float) image[x, y].B / 255) - 0.40821073) / 0.27577711);
            }
        }

        var inputs = new List<NamedOnnxValue> { NamedOnnxValue.CreateFromTensor("input", inputTensor) };
        // Run() returns a disposable collection holding native buffers; dispose it
        // after copying the output to a managed array (the original leaked it).
        using var results = _model.Run(inputs);
        return results.Last().AsTensor<float>().ToArray();
    }

    /// <summary>Releases the native ONNX Runtime session.</summary>
    public void Dispose() => _model.Dispose();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment