Skip to content

Commit 6f7b990

Browse files
shoumikhin authored and facebook-github-bot committed
Add image to multimodal runner test. (#14194)
Summary: Pull Request resolved: #14194 . Reviewed By: mergennachin Differential Revision: D82183713
1 parent 654e722 commit 6f7b990

File tree

2 files changed

+44
-4
lines changed

2 files changed

+44
-4
lines changed

extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,25 +9,65 @@
99
import ExecuTorchLLM
1010
import XCTest
1111

12+
extension UIImage {
  /// Converts this image into an ExecuTorch `Image`: resizes it to
  /// `targetWidth` (height scaled to preserve aspect ratio) and repacks the
  /// pixels from interleaved RGBA into planar RGB (all R bytes, then all G,
  /// then all B), dropping the alpha channel.
  ///
  /// - Parameter targetWidth: Width in pixels to resize to. Defaults to 336,
  ///   presumably the input resolution the LLaVA vision encoder expects —
  ///   TODO confirm against the model's preprocessing.
  /// - Returns: A 3-channel planar `Image` of size `targetWidth` × scaled height.
  func asImage(targetWidth: Int = 336) -> Image {
    // Scale height proportionally so the aspect ratio is preserved.
    let scaledHeight = Int((Double(targetWidth) * Double(size.height) / Double(size.width)).rounded())
    let format = UIGraphicsImageRendererFormat.default()
    // 1 point == 1 pixel; prevents @2x/@3x screen scale from inflating the bitmap.
    format.scale = 1
    let targetSize = CGSize(width: targetWidth, height: scaledHeight)
    let resizedImage = UIGraphicsImageRenderer(size: targetSize, format: format).image { _ in
      draw(in: CGRect(origin: .zero, size: targetSize))
    }
    // Renderer-produced images always have a CGImage backing; crash here is a logic bug.
    let resizedCGImage = resizedImage.cgImage!
    let imageWidth = resizedCGImage.width
    let imageHeight = resizedCGImage.height
    let pixelCount = imageWidth * imageHeight
    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * 4)
    // Create and use the CGContext entirely inside withUnsafeMutableBytes:
    // the original code passed `&rgbaBuffer` to CGContext.init, but an
    // inout-to-pointer conversion is only valid for the duration of that one
    // call, so drawing into the context afterwards was undefined behavior.
    rgbaBuffer.withUnsafeMutableBytes { rawBuffer in
      let context = CGContext(
        data: rawBuffer.baseAddress,
        width: imageWidth,
        height: imageHeight,
        bitsPerComponent: 8,
        bytesPerRow: imageWidth * 4,
        space: CGColorSpaceCreateDeviceRGB(),
        bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
      )!
      context.draw(resizedCGImage, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
    }
    // Repack interleaved RGBA -> planar RGB (channel-major), discarding alpha.
    var planarRGB = [UInt8](repeating: 0, count: pixelCount * 3)
    for pixelIndex in 0..<pixelCount {
      let sourceOffset = pixelIndex * 4
      planarRGB[pixelIndex] = rgbaBuffer[sourceOffset]                      // R plane
      planarRGB[pixelIndex + pixelCount] = rgbaBuffer[sourceOffset + 1]     // G plane
      planarRGB[pixelIndex + pixelCount * 2] = rgbaBuffer[sourceOffset + 2] // B plane
    }
    return Image(data: Data(planarRGB), width: targetWidth, height: scaledHeight, channels: 3)
  }
}
46+
1247
class MultimodalRunnerTest: XCTestCase {
  /// End-to-end smoke test: loads the bundled LLaVA model, tokenizer, and a
  /// sample photo, runs multimodal generation on a prompt + image, and checks
  /// that the generated answer mentions the picture's content.
  func test() {
    let bundle = Bundle(for: type(of: self))
    guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
          let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
          let image = UIImage(contentsOfFile: imagePath) else {
      // Mention every resource the guard binds, so a missing image is not
      // misreported as a missing model/tokenizer.
      XCTFail("Couldn't find model, tokenizer, or image files")
      return
    }
    let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
    var text = ""

    do {
      // LLaVA-style conversation: system preamble, the image, then the question.
      try runner.generate([
        MultimodalInput("A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "),
        MultimodalInput(image.asImage()),
        MultimodalInput("What's on the picture? ASSISTANT: "),
      ], sequenceLength: 768) { token in
        text += token
      }
    } catch {
      XCTFail("Failed to generate text with error \(error)")
    }
    // NOTE(review): presumably IMG_0005.jpg depicts a waterfall — the model's
    // description should mention it; confirm against the bundled fixture.
    XCTAssertTrue(text.lowercased().contains("waterfall"))
  }
}
75.9 KB
Loading

0 commit comments

Comments (0)