Commit 4f922ce

vulkan: fuse mul_mat_id + mul
This comes up in qwen3 moe.
1 parent 8c0d6bb commit 4f922ce
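
For orientation, here is a minimal C++ sketch (illustrative only, not part of the commit) of the op pair being fused, mirroring the usual ggml MoE FFN construction and the new test case below: a MUL_MAT_ID routes each token through its selected experts, and the MUL that immediately follows scales each expert's output row by its routing weight (the ffn_moe_weighted step in qwen3 moe). All names and shapes here are assumptions.

#include "ggml.h"

// Sketch of the fused pattern; tensor names and shapes are illustrative.
static ggml_tensor * moe_weighted(ggml_context * ctx,
                                  ggml_tensor * experts,   // [k, m, n_expert]      stacked expert matrices
                                  ggml_tensor * x,         // [k, 1, n_tokens]      activations
                                  ggml_tensor * ids,       // [n_used, n_tokens]    selected expert ids
                                  ggml_tensor * weights) { // [1, n_used, n_tokens] routing weights
    // one mat-vec per (token, selected expert): out is [m, n_used, n_tokens]
    ggml_tensor * out = ggml_mul_mat_id(ctx, experts, x, ids);
    // broadcast multiply over dim 0: one scalar weight per expert slot.
    // This MUL is the op the Vulkan backend now folds into the mat-vec shader.
    return ggml_mul(ctx, out, weights);
}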

3 files changed (+93, -9 lines)
ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 53 additions & 5 deletions
@@ -802,6 +802,7 @@ struct vk_mat_vec_push_constants {
     uint32_t batch_stride_b;
     uint32_t batch_stride_d;
     uint32_t enable_bias;
+    uint32_t enable_scale;
     uint32_t ne02;
     uint32_t ne12;
     uint32_t broadcast2;
@@ -824,6 +825,7 @@ struct vk_mat_vec_id_push_constants {
     uint32_t batch_stride_b;
     uint32_t batch_stride_d;
     uint32_t enable_bias;
+    uint32_t enable_scale;
     uint32_t nei0;
     uint32_t ne11;
 };
@@ -6796,7 +6798,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     // compute
     const vk_mat_vec_push_constants pc = {
         (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
-        stride_batch_x, stride_batch_y, stride_batch_d, enable_bias,
+        stride_batch_x, stride_batch_y, stride_batch_d, enable_bias, 0,
         (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
     };
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
@@ -7617,13 +7619,22 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         groups_x = CEIL_DIV(groups_x, groups_z);
     }
 
-    uint32_t enable_bias = ctx->num_additional_fused_ops > 0;
+    uint32_t enable_bias = 0;
+    uint32_t enable_scale = 0;
+    if (ctx->num_additional_fused_ops > 0) {
+        if (cgraph->nodes[node_idx + 1]->op == GGML_OP_MUL) {
+            enable_scale = 1;
+        } else {
+            GGML_ASSERT(cgraph->nodes[node_idx + 1]->op == GGML_OP_ADD_ID);
+            enable_bias = 1;
+        }
+    }
 
     vk_buffer d_B = d_D;
     size_t b_buf_offset = 0;
     uint64_t b_sz = 0;
 
-    if (enable_bias) {
+    if (enable_bias || enable_scale) {
         const ggml_tensor * bias = cgraph->nodes[node_idx + 1]->src[1];
 
         bool b_uma = false;
@@ -7645,7 +7656,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
         (uint32_t)x_ne, stride_batch_y, (uint32_t)(ne20*ne21),
 
-        enable_bias,
+        enable_bias, enable_scale,
 
         (uint32_t)nei0, (uint32_t)ne11,
     };
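
Note that the fused MUL's src[1] travels through the same plumbing as the fused ADD_ID bias: the host code above fetches it as cgraph->nodes[node_idx + 1]->src[1] and binds it as d_B, and the shader reads it through the existing data_bias buffer, with enable_bias / enable_scale in the push constants selecting which epilogue runs.
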
@@ -12671,6 +12682,40 @@ static bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct g
         }
     }
 
+    if (ops.size() == 2 && ops.begin()[0] == GGML_OP_MUL_MAT_ID && ops.begin()[1] == GGML_OP_MUL) {
+        // additional constraints specific to this fusion
+        const ggml_tensor *mmid = cgraph->nodes[node_idx];
+        const ggml_tensor *mul = cgraph->nodes[node_idx + 1];
+        const ggml_tensor *scale = mul->src[1];
+
+        if (mmid != mul->src[0]) {
+            return false;
+        }
+        // mat-vec only
+        if (!ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) {
+            return false;
+        }
+        // shaders assume the types match
+        if (mmid->type != scale->type) {
+            return false;
+        }
+        // shaders assume the bias is contiguous
+        if (!ggml_is_contiguous(scale)) {
+            return false;
+        }
+        // unaligned bias isn't handled
+        if (get_misalign_bytes(ctx, scale) != 0) {
+            return false;
+        }
+        // shader only indexes by expert index
+        if (scale->ne[0] != 1 ||
+            scale->ne[1] != mul->ne[1] ||
+            scale->ne[2] != 1 ||
+            scale->ne[3] != 1) {
+            return false;
+        }
+    }
+
     return true;
 }
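
The shape check at the end mirrors what the shader epilogue below can handle: reduce_result indexes the scale buffer solely by gl_GlobalInvocationID.y (the expert slot), so the multiplier must be a single scalar per output row, i.e. shape [1, ne1, 1, 1].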

@@ -12917,6 +12962,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             ctx->num_additional_fused_ops = 1;
         } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
             ctx->num_additional_fused_ops = 1;
+        } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
+            ctx->num_additional_fused_ops = 1;
         } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
                    ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
                    ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
@@ -13142,7 +13189,8 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
                 is_src_of(graph->nodes[j], graph->nodes[c]) &&
                 !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL) &&
                 !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT && graph->nodes[j]->op == GGML_OP_ADD) &&
-                !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_ADD_ID)) {
+                !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_ADD_ID) &&
+                !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_MUL)) {
                 ok = false;
                 break;
             }
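
This last change keeps graph reordering from splitting the pair: the fusion requires the MUL to be the immediate next node (node_idx + 1), so ggml_vk_graph_optimize must not move another node between a MUL_MAT_ID and its following MUL, just as it already avoided doing for the MUL_MAT_ID + ADD_ID fusion.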

ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl

Lines changed: 19 additions & 0 deletions
@@ -49,6 +49,7 @@ layout (push_constant) uniform parameter
     uint batch_stride_d;
 
     uint enable_bias;
+    uint enable_scale;
 
 #ifdef MUL_MAT_ID
     uint nei0;
@@ -129,6 +130,12 @@ void reduce_result(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t
                 temp[j][n] += FLOAT_TYPE(data_bias[j*p.batch_stride_d + d_offset + first_row + n]);
 #endif
             }
+#ifdef MUL_MAT_ID
+            if (p.enable_scale != 0) {
+                const uint expert_idx = gl_GlobalInvocationID.y;
+                temp[j][n] *= FLOAT_TYPE(data_bias[expert_idx]);
+            }
+#endif
             data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]);
         }
     }
@@ -171,6 +178,12 @@ void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offs
                 temp[j][n] += FLOAT_TYPE(data_bias[j*p.batch_stride_d + d_offset + first_row + n]);
 #endif
             }
+#ifdef MUL_MAT_ID
+            if (p.enable_scale != 0) {
+                const uint expert_idx = gl_GlobalInvocationID.y;
+                temp[j][n] *= FLOAT_TYPE(data_bias[expert_idx]);
+            }
+#endif
             data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]);
         }
     }
@@ -203,6 +216,12 @@ void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offs
                 tmpsh[j][n][0] += FLOAT_TYPE(data_bias[j*p.batch_stride_d + d_offset + first_row + n]);
 #endif
             }
+#ifdef MUL_MAT_ID
+            if (p.enable_scale != 0) {
+                const uint expert_idx = gl_GlobalInvocationID.y;
+                tmpsh[j][n][0] *= FLOAT_TYPE(data_bias[expert_idx]);
+            }
+#endif
             data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
         }
     }
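
The same six-line epilogue is added to all three reduce_result variants (the inout-accumulator, register-accumulator, and shared-memory tmpsh reductions), so every mat-vec path applies the per-expert scale just before the final store, reusing the data_bias binding to carry the scale values.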

tests/test-backend-ops.cpp

Lines changed: 21 additions & 4 deletions
@@ -3495,9 +3495,10 @@ struct test_mul_mat_id : public test_case {
     const int64_t n;
     const int64_t k;
     const uint32_t o; // number of outputs
+    const bool mul;
 
     std::string vars() override {
-        return VARS_TO_STR9(type_a, type_b, n_mats, n_used, b, m, n, k, o);
+        return VARS_TO_STR10(type_a, type_b, n_mats, n_used, b, m, n, k, o, mul);
     }
 
     double max_nmse_err() override {
@@ -3511,9 +3512,9 @@
 
     test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
             int n_mats = 8, int n_used = 2, bool b = false,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1)
+            int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1, bool mul = false)
         : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
-          m(m), n(n), k(k), o(o) {
+          m(m), n(n), k(k), o(o), mul(mul) {
         GGML_ASSERT(n_used <= n_mats);
     }
 
@@ -3542,6 +3543,13 @@
             out = ggml_add(ctx, out, out2);
         }
 
+        if (mul) {
+            std::array<int64_t, 4> ne { 1, out->ne[1], out->ne[2], out->ne[3] };
+            ne[0] = 1;
+            ggml_tensor * m = ggml_new_tensor(ctx, out->type, 4, ne.data());
+            out = ggml_mul(ctx, out, m);
+        }
+
         return out;
     }
 
@@ -3566,7 +3574,7 @@
         }
     }
 
-    bool run_whole_graph() override { return o > 1; }
+    bool run_whole_graph() override { return o > 1 || mul; }
 
     std::string op_desc(ggml_tensor * t) override {
         GGML_UNUSED(t);
@@ -6978,6 +6986,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }
 
+    for (int bs : {1, 4, 512}) {
+        for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_K}) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                // test with mul after (ffn_moe_weighted)
+                test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048, 1, true));
+            }
+        }
+    }
+
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
             for (int n : {1, 16}) {
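
These new cases enable run_whole_graph() so the backend sees the MUL_MAT_ID -> MUL pair intact and can take the fused path. Presumably they can also be exercised in isolation with the harness's usual op filter, something like `test-backend-ops test -o MUL_MAT_ID` (check the binary's help output for the exact flags).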
