Rename image_encoder to vision_encoder to match HF naming convention (#14392)

larryliu0820 · web-flow · commit e31cef61ccab · 2025-09-17T15:59:22.000-07:00
Summary: As titled. We want to align with the `optimum-executorch` naming convention (which comes from HF `transformers`): https://github.com/huggingface/optimum-executorch/blob/main/optimum/exporters/executorch/tasks/multimodal_text_to_text.py#L238 Differential Revision: D82677835
diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
@@ -224,12 +224,12 @@ def export_all(llava_model: LlavaModel):
 
     lowered_and_edge = to_edge_transform_and_lower(
         {
-            "image_encoder": image_encoder_ep,
+            "vision_encoder": image_encoder_ep,
             "token_embedding": token_embedding_ep,
             "text_decoder": text_model_ep,
         },
         partitioner={
-            "image_encoder": [XnnpackPartitioner()],
+            "vision_encoder": [XnnpackPartitioner()],
             "text_decoder": [
                 # First partition the DQLinear nodes, then partition the rest of the nodes,
                 # to avoid multiple DQLinear nodes in the same partition,
@@ -254,7 +254,7 @@ def export_all(llava_model: LlavaModel):
             ],
             memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
             sym_shape_eval_pass={
-                "image_encoder": ConstraintBasedSymShapeEvalPass(),
+                "vision_encoder": ConstraintBasedSymShapeEvalPass(),
                 "text_decoder": ConstraintBasedSymShapeEvalPass(),
                 "token_embedding": HintBasedSymShapeEvalPass(),
             },
diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py
@@ -105,7 +105,7 @@ def test_llava_export(self):
         start_pos += pte_embeds_before_img.shape[1]
 
         # pte prefill image
-        pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
+        pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]
         llava_module.run_method(
             "text_decoder",
             (
diff --git a/examples/models/llava/test/test_pte.py b/examples/models/llava/test/test_pte.py
@@ -56,7 +56,7 @@ def main():
 
     # pte prefill image
     logging.warning("Image encoder started")
-    pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]
+    pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]
     logging.warning("Image encoder finished")
     logging.warning("Image token prefill started")
     pte_prefill_img = llava_module.run_method(
diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h
@@ -20,7 +20,7 @@ inline constexpr auto kUseKVCache = "use_kv_cache";
 inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 
 // Multimodal method name conventions
-inline constexpr auto kImageEncoderMethod = "image_encoder";
+inline constexpr auto kVisionEncoderMethod = "vision_encoder";
 inline constexpr auto kAudioEncoderMethod = "audio_encoder";
 inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
 inline constexpr auto kTextModelMethod = "text_decoder";
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
@@ -43,9 +43,9 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     Image image = input.get_image();
 
     auto method_meta = ET_UNWRAP(
-        module_->method_meta(kImageEncoderMethod),
+        module_->method_meta(kVisionEncoderMethod),
         "Failed to get method_meta for %s",
-        kImageEncoderMethod);
+        kVisionEncoderMethod);
 
     ET_CHECK_MSG(
         method_meta.num_inputs() > 0,
@@ -80,7 +80,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
 
     // Run image encoder
     auto image_encoder_outputs =
-        ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
+        ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
 
     encoder_output = image_encoder_outputs[0];
   } else if (input.is_audio()) {
@@ -175,8 +175,8 @@ ::executorch::runtime::Error MultimodalPrefiller::load() {
       ET_UNWRAP(module_->method_names(), "Failed to get method names");
 
   // Load image_encoder method if exists.
-  if (methods.find(kImageEncoderMethod) != methods.end()) {
-    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
+  if (methods.find(kVisionEncoderMethod) != methods.end()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
   }
 
   if (methods.find(kAudioEncoderMethod) != methods.end()) {
@@ -203,8 +203,8 @@ bool MultimodalPrefiller::is_method_loaded() {
     ET_CHECK_MSG(false, "Failed to get method names");
   }
   std::unordered_set<std::string> methods = methods_res.get();
-  if (methods.find(kImageEncoderMethod) != methods.end()) {
-    return module_->is_method_loaded(kImageEncoderMethod);
+  if (methods.find(kVisionEncoderMethod) != methods.end()) {
+    return module_->is_method_loaded(kVisionEncoderMethod);
   }
   return true;
 }

Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ def test_llava_export(self):`
`105`	`105`	`start_pos += pte_embeds_before_img.shape[1]`
`106`	`106`
`107`	`107`	`# pte prefill image`
`108`		`- pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0]`
	`108`	`+ pte_embeds_img = llava_module.run_method("vision_encoder", (resized,))[0]`
`109`	`109`	`llava_module.run_method(`
`110`	`110`	`"text_decoder",`
`111`	`111`	`(`