Add image to multimodal runner test. (#14194)

shoumikhin · facebook-github-bot · commit 2de800f523a1 · 2025-09-11T14:13:06.000-07:00
Summary: Pull Request resolved: #14194. Reviewed By: mergennachin. Differential Revision: D82183713.
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -192,7 +192,9 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
   }
   auto status = _runner->generate(
     std::move(nativeInputs),
-    llm::GenerationConfig{.seq_len = static_cast<int32_t>(seq_len)},
+    llm::GenerationConfig{
+      .max_new_tokens = static_cast<int32_t>(seq_len),
+    },
     [callback](const std::string& token) {
       if (callback) {
         callback(@(token.c_str()));
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
@@ -9,25 +9,67 @@
 import ExecuTorchLLM
 import XCTest
 
+extension UIImage { // Test-only helper for turning a UIImage into the Image type fed to MultimodalInput.
+  func asImage() -> Image { // Resizes to a fixed width, then repacks the pixels as planar RGB bytes.
+    let targetWidth = 336 // fixed output width; height is derived from it below
+    let h = Double(targetWidth) * Double(size.height) / Double(size.width) // keeps the source aspect ratio; NOTE(review): size.width == 0 is not guarded
+    let targetHeight = Int(h.rounded())
+    let format = UIGraphicsImageRendererFormat.default()
+    format.scale = 1 // presumably so the bitmap is targetWidth pixels wide instead of being multiplied by the screen scale
+    let resized = UIGraphicsImageRenderer(size: CGSize(width: targetWidth, height: targetHeight), format: format).image { _ in
+      draw(in: CGRect(origin: .zero, size: CGSize(width: targetWidth, height: targetHeight)))
+    }
+    let cgImage = resized.cgImage! // NOTE(review): force-unwrap; traps if the rendered image has no CGImage
+    let width = cgImage.width // actual bitmap dimensions are read back from the rendered image
+    let height = cgImage.height
+    let pixelCount = width * height
+    let bytesPerPixel = 4 // R, G, B, A at one byte each (bitsPerComponent: 8 below)
+    let bytesPerRow = bytesPerPixel * width
+    var pixelBytes = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel) // backing store the bitmap context draws into
+    let context = CGContext(
+      data: &pixelBytes,
+      width: width,
+      height: height,
+      bitsPerComponent: 8,
+      bytesPerRow: bytesPerRow,
+      space: CGColorSpaceCreateDeviceRGB(),
+      bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue // alpha stored last and premultiplied, big-endian 32-bit order
+    )! // NOTE(review): force-unwrap; traps if the context cannot be created
+    context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
+    var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3) // output: three consecutive planes of pixelCount bytes
+    for i in 0..<pixelCount { // de-interleave: 4-byte pixels -> one plane per colour channel, 4th byte dropped
+      let p = i * bytesPerPixel // byte offset of pixel i in the interleaved buffer
+      rgbBytes[i] = pixelBytes[p] // first plane (red, given the RGBA layout requested above)
+      rgbBytes[i + pixelCount] = pixelBytes[p + 1] // second plane (green)
+      rgbBytes[i + pixelCount * 2] = pixelBytes[p + 2] // third plane (blue)
+    }
+    return Image(data: Data(rgbBytes), width: width, height: height, channels: 3) // data is channel-major (plane after plane), not interleaved
+  }
+}
+
 class MultimodalRunnerTest: XCTestCase {
   func test() {
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin") else {
+          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"), // tokenizer lookup now continues into the image lookups
+          let imagePath = bundle.path(forResource: "IMG_0005", ofType: "JPG"), // sample photo added to the test resources by this patch
+          let image = UIImage(contentsOfFile: imagePath) else { // NOTE(review): the failure message below still names only model/tokenizer
       XCTFail("Couldn't find model or tokenizer files")
       return
     }
-    return
     let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
     var text = ""
 
     do {
-      try runner.generate([MultimodalInput("hello")], sequenceLength: 2) { token in
+      try runner.generate([ // inputs are passed in order: text prompt first, then the image
+        MultimodalInput("What's on the picture?"), // text part of the prompt
+        MultimodalInput(image.asImage()), // image part, converted by the UIImage.asImage() helper in this file
+      ], sequenceLength: 256) { token in // callback appends each generated token to `text`
         text += token
       }
     } catch {
       XCTFail("Failed to generate text with error \(error)")
     }
-    XCTAssertEqual("hello,", text.lowercased())
+    XCTAssertTrue(text.lowercased().contains("water")) // loose, case-insensitive check that the generated text mentions "water"
   }
 }
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/resources/IMG_0005.JPG b/extension/llm/apple/ExecuTorchLLM/__tests__/resources/IMG_0005.JPG