Skip to content

Commit 2de800f

Browse files
shoumikhin authored and facebook-github-bot committed
Add image to multimodal runner test. (#14194)
Summary: Pull Request resolved: #14194 . Reviewed By: mergennachin Differential Revision: D82183713
1 parent 6d8583d commit 2de800f

File tree

3 files changed

+49
-5
lines changed

3 files changed

+49
-5
lines changed

extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,9 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
192192
}
193193
auto status = _runner->generate(
194194
std::move(nativeInputs),
195-
llm::GenerationConfig{.seq_len = static_cast<int32_t>(seq_len)},
195+
llm::GenerationConfig{
196+
.max_new_tokens = static_cast<int32_t>(seq_len),
197+
},
196198
[callback](const std::string& token) {
197199
if (callback) {
198200
callback(@(token.c_str()));

extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,25 +9,67 @@
99
import ExecuTorchLLM
1010
import XCTest
1111

12+
extension UIImage {
  /// Converts the receiver into an `Image` holding planar (channel-major) RGB bytes.
  ///
  /// The image is first redrawn at a fixed width of 336 pixels (scale 1) with the
  /// height chosen to preserve the aspect ratio, then rasterized into an RGBA8
  /// buffer. The alpha channel is dropped and the remaining bytes are repacked so
  /// that all red values come first, followed by all green, then all blue.
  ///
  /// - Returns: An `Image` with `channels == 3` whose `width`/`height` are those of
  ///   the resized bitmap.
  /// - Precondition: The receiver must have a non-zero size.
  func asImage() -> Image {
    // NOTE(review): 336 presumably matches the vision encoder's expected input
    // width — confirm against the model used by the test.
    let targetWidth = 336

    // Without this the aspect-ratio math below produces NaN/inf and traps with
    // an unhelpful message when converted to Int.
    precondition(size.width > 0 && size.height > 0, "Cannot convert an empty image")

    let h = Double(targetWidth) * Double(size.height) / Double(size.width)
    let targetHeight = Int(h.rounded())
    let targetSize = CGSize(width: targetWidth, height: targetHeight)

    // Force scale 1 so the rendered bitmap is exactly targetWidth pixels wide
    // regardless of the device's screen scale.
    let format = UIGraphicsImageRendererFormat.default()
    format.scale = 1
    let resized = UIGraphicsImageRenderer(size: targetSize, format: format).image { _ in
      draw(in: CGRect(origin: .zero, size: targetSize))
    }

    let cgImage = resized.cgImage!
    let width = cgImage.width
    let height = cgImage.height
    let pixelCount = width * height
    let bytesPerPixel = 4
    let bytesPerRow = bytesPerPixel * width

    var pixelBytes = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
    // The bitmap context stores the data pointer and writes through it during
    // `draw`, so the pointer must stay valid for the whole drawing operation.
    // Passing `&pixelBytes` directly only guarantees validity for the duration
    // of the initializer call; `withUnsafeMutableBytes` scopes it correctly.
    pixelBytes.withUnsafeMutableBytes { buffer in
      let context = CGContext(
        data: buffer.baseAddress,
        width: width,
        height: height,
        bitsPerComponent: 8,
        bytesPerRow: bytesPerRow,
        space: CGColorSpaceCreateDeviceRGB(),
        bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
      )!
      context.draw(cgImage, in: CGRect(x: 0, y: 0, width: width, height: height))
    }

    // Repack interleaved RGBA into planar RGB: [R plane][G plane][B plane].
    var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
    for i in 0..<pixelCount {
      let p = i * bytesPerPixel
      rgbBytes[i] = pixelBytes[p]
      rgbBytes[i + pixelCount] = pixelBytes[p + 1]
      rgbBytes[i + pixelCount * 2] = pixelBytes[p + 2]
    }
    return Image(data: Data(rgbBytes), width: width, height: height, channels: 3)
  }
}
49+
1250
class MultimodalRunnerTest: XCTestCase {
  /// Runs the multimodal runner on a text prompt plus a bundled image and checks
  /// that the generated text mentions "water".
  ///
  /// The test requires the `llava.pte` model, `tokenizer.bin` tokenizer and
  /// `IMG_0005.JPG` image to be present in the test bundle.
  func test() {
    let bundle = Bundle(for: type(of: self))
    guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
          let imagePath = bundle.path(forResource: "IMG_0005", ofType: "JPG"),
          let image = UIImage(contentsOfFile: imagePath) else {
      // The guard also covers the image resource and its decoding, so the
      // message must mention it to avoid sending the reader down the wrong path.
      XCTFail("Couldn't find model, tokenizer, or image files")
      return
    }
    let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
    var text = ""

    do {
      try runner.generate([
        MultimodalInput("What's on the picture?"),
        MultimodalInput(image.asImage()),
      ], sequenceLength: 256) { token in
        text += token
      }
    } catch {
      XCTFail("Failed to generate text with error \(error)")
      // Stop here: asserting on the (likely empty) output would only add a
      // second, misleading failure on top of the real one.
      return
    }
    XCTAssertTrue(
      text.lowercased().contains("water"),
      "Generated text does not mention \"water\": \(text)"
    )
  }
}
1.77 MB
Loading

0 commit comments

Comments
 (0)