Skip to content

Commit e31cef6

Browse files
authored
Rename image_encoder to vision_encoder to match HF naming convention (#14392)
Summary: As titled. We want to align with the `optimum-executorch` naming convention (which comes from HF `transformers`): https://github.com/huggingface/optimum-executorch/blob/main/optimum/exporters/executorch/tasks/multimodal_text_to_text.py#L238 Differential Revision: D82677835
1 parent 4872141 commit e31cef6

File tree

5 files changed

+13
-13
lines changed

5 files changed

+13
-13
lines changed

examples/models/llava/export_llava.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -224,12 +224,12 @@ def export_all(llava_model: LlavaModel):
224224

225225
lowered_and_edge = to_edge_transform_and_lower(
226226
{
227-
"image_encoder": image_encoder_ep,
227+
"vision_encoder": image_encoder_ep,
228228
"token_embedding": token_embedding_ep,
229229
"text_decoder": text_model_ep,
230230
},
231231
partitioner={
232-
"image_encoder": [XnnpackPartitioner()],
232+
"vision_encoder": [XnnpackPartitioner()],
233233
"text_decoder": [
234234
# First partition the DQLinear nodes, then partition the rest of the nodes,
235235
# to avoid multiple DQLinear nodes in the same partition,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
254254
],
255255
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
256256
sym_shape_eval_pass={
257-
"image_encoder": ConstraintBasedSymShapeEvalPass(),
257+
"vision_encoder": ConstraintBasedSymShapeEvalPass(),
258258
"text_decoder": ConstraintBasedSymShapeEvalPass(),
259259
"token_embedding": HintBasedSymShapeEvalPass(),
260260
},

examples/models/llava/test/test_llava.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def test_llava_export(self):
105105
start_pos += pte_embeds_before_img.shape[1]
106106

107107
# pte prefill image
108-
pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
108+
pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]
109109
llava_module.run_method(
110110
"text_decoder",
111111
(

examples/models/llava/test/test_pte.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def main():
5656

5757
# pte prefill image
5858
logging.warning("Image encoder started")
59-
pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
59+
pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]
6060
logging.warning("Image encoder finished")
6161
logging.warning("Image token prefill started")
6262
pte_prefill_img = llava_module.run_method(

extension/llm/runner/constants.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ inline constexpr auto kUseKVCache = "use_kv_cache";
2020
inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
2121

2222
// Multimodal method name conventions
23-
inline constexpr auto kImageEncoderMethod = "image_encoder";
23+
inline constexpr auto kVisionEncoderMethod = "vision_encoder";
2424
inline constexpr auto kAudioEncoderMethod = "audio_encoder";
2525
inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
2626
inline constexpr auto kTextModelMethod = "text_decoder";

extension/llm/runner/multimodal_prefiller.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ Result<uint64_t> MultimodalPrefiller::prefill(
4343
Image image = input.get_image();
4444

4545
auto method_meta = ET_UNWRAP(
46-
module_->method_meta(kImageEncoderMethod),
46+
module_->method_meta(kVisionEncoderMethod),
4747
"Failed to get method_meta for %s",
48-
kImageEncoderMethod);
48+
kVisionEncoderMethod);
4949

5050
ET_CHECK_MSG(
5151
method_meta.num_inputs() > 0,
@@ -80,7 +80,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
8080

8181
// Run image encoder
8282
auto image_encoder_outputs =
83-
ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
83+
ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
8484

8585
encoder_output = image_encoder_outputs[0];
8686
} else if (input.is_audio()) {
@@ -175,8 +175,8 @@ ::executorch::runtime::Error MultimodalPrefiller::load() {
175175
ET_UNWRAP(module_->method_names(), "Failed to get method names");
176176

177177
// Load image_encoder method if exists.
178-
if (methods.find(kImageEncoderMethod) != methods.end()) {
179-
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
178+
if (methods.find(kVisionEncoderMethod) != methods.end()) {
179+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
180180
}
181181

182182
if (methods.find(kAudioEncoderMethod) != methods.end()) {
@@ -203,8 +203,8 @@ bool MultimodalPrefiller::is_method_loaded() {
203203
ET_CHECK_MSG(false, "Failed to get method names");
204204
}
205205
std::unordered_set<std::string> methods = methods_res.get();
206-
if (methods.find(kImageEncoderMethod) != methods.end()) {
207-
return module_->is_method_loaded(kImageEncoderMethod);
206+
if (methods.find(kVisionEncoderMethod) != methods.end()) {
207+
return module_->is_method_loaded(kVisionEncoderMethod);
208208
}
209209
return true;
210210
}

0 commit comments

Comments
 (0)