pytorch · facebook-github-bot · Sep 18, 2025 · Sep 18, 2025
@@ -9,25 +9,65 @@
 import ExecuTorchLLM
 import XCTest
 
+extension UIImage {
+  /// Converts this image into a planar 3-channel `Image`.
+  ///
+  /// The image is redrawn at a fixed width of 336 with the height scaled to keep the
+  /// aspect ratio, read back as 8-bit RGBA, and repacked so that all red samples come
+  /// first, then all green samples, then all blue samples. Alpha is dropped.
+  ///
+  /// - Returns: An `Image` holding `width * height * 3` bytes in planar R, G, B order.
+  /// - Precondition: The image has a non-zero size and can be rendered to a bitmap.
+  func asImage() -> Image {
+    let targetWidth = 336
+    // A zero-sized source would make the aspect-ratio division produce a non-finite value.
+    precondition(size.width > 0 && size.height > 0, "Cannot convert a zero-sized image")
+    let scaledHeight = Int((Double(targetWidth) * Double(size.height) / Double(size.width)).rounded())
+    let targetSize = CGSize(width: targetWidth, height: scaledHeight)
+    let format = UIGraphicsImageRendererFormat.default()
+    // Render at scale 1 rather than the default format's scale.
+    format.scale = 1
+    let resizedImage = UIGraphicsImageRenderer(size: targetSize, format: format).image { _ in
+      draw(in: CGRect(origin: .zero, size: targetSize))
+    }
+    guard let resizedCGImage = resizedImage.cgImage else {
+      preconditionFailure("Resized image has no CGImage backing")
+    }
+    let imageWidth = resizedCGImage.width
+    let imageHeight = resizedCGImage.height
+    let pixelCount = imageWidth * imageHeight
+    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * 4)
+    // The bitmap context keeps the data pointer and writes through it while drawing, so
+    // the pointer must stay valid for the whole draw. Passing `&rgbaBuffer` directly would
+    // only guarantee validity for the duration of the initializer call.
+    rgbaBuffer.withUnsafeMutableBytes { rawBuffer in
+      guard let context = CGContext(
+        data: rawBuffer.baseAddress,
+        width: imageWidth,
+        height: imageHeight,
+        bitsPerComponent: 8,
+        bytesPerRow: imageWidth * 4,
+        space: CGColorSpaceCreateDeviceRGB(),
+        bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
+      ) else {
+        preconditionFailure("Could not create an RGBA bitmap context")
+      }
+      context.draw(resizedCGImage, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
+    }
+    // Repack interleaved RGBA into three consecutive planes (R, then G, then B).
+    var planarRGB = [UInt8](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<pixelCount {
+      let sourceOffset = pixelIndex * 4
+      planarRGB[pixelIndex] = rgbaBuffer[sourceOffset]
+      planarRGB[pixelIndex + pixelCount] = rgbaBuffer[sourceOffset + 1]
+      planarRGB[pixelIndex + pixelCount * 2] = rgbaBuffer[sourceOffset + 2]
+    }
+    // Report the dimensions of the bitmap that was actually read so they always match the data length.
+    return Image(data: Data(planarRGB), width: imageWidth, height: imageHeight, channels: 3)
+  }
+}
+
 class MultimodalRunnerTest: XCTestCase {
   func test() {
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin") else {
+          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
+          let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
+          let image = UIImage(contentsOfFile: imagePath) else {
       XCTFail("Couldn't find model or tokenizer files")
       return
     }
-    return
     let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
     var text = ""
 
     do {
-      try runner.generate([MultimodalInput("hello")], sequenceLength: 2) { token in
+      try runner.generate([
+        MultimodalInput("A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "),
+        MultimodalInput(image.asImage()),
+        MultimodalInput("What's on the picture? ASSISTANT: "),
+      ], sequenceLength: 768) { token in
         text += token
       }
     } catch {
       XCTFail("Failed to generate text with error \(error)")
     }
-    XCTAssertEqual("hello,", text.lowercased())
+    XCTAssertTrue(text.lowercased().contains("waterfall"))
   }
 }