use ggml_div_inplace for normalization

danbev · danbev · commit d04847d26de3 · 2025-11-05T08:39:05.000+01:00
This commit updates the llama_sampler_gpu_top_p_apply_ggml function
to use ggml_div_inplace instead of ggml_div, as ggml_div generated the
following error on the WebGPU backend:
```console
/home/danbev/work/ai/llama.cpp-debug/ggml/src/ggml-webgpu/ggml-webgpu.cpp:2146: ggml_webgpu: Device error! Reason: 2, Message:
  Writable storage buffer binding aliasing found between [BindGroup "div_f32"] set at bind group index 0, binding index 1, and
  [BindGroup "div_f32"] set at bind group index 0, binding index 2, with overlapping ranges (offset: 0, size: 32) and (offset: 0,
  size: 32) in [Buffer "allocated_buffer"].
   - While encoding [ComputePassEncoder (unlabeled)].DispatchWorkgroups(1, 1, 1).
   - While finishing [CommandEncoder (unlabeled)].
```

It also sets ggml_data->filtered_ids as an output tensor as it might
otherwise be reused before being read.
diff --git a/src/llama-gpu-sampling.cpp b/src/llama-gpu-sampling.cpp
@@ -245,14 +245,15 @@ static void llama_sampler_gpu_top_p_apply_ggml(
     struct ggml_tensor * top_k_ids = ggml_cont(ctx, ggml_top_k(ctx, softmax, ctx_data->k));
     ggml_set_name(top_k_ids, "top_k_ids");
     ggml_data->filtered_ids = top_k_ids;
+    ggml_set_output(ggml_data->filtered_ids);
 
     struct ggml_tensor * prob_rows = ggml_reshape_2d(ctx, softmax, 1, ggml_data->logits->ne[0]);
     struct ggml_tensor * top_k_rows = ggml_get_rows(ctx, prob_rows, top_k_ids);
     ggml_set_name(top_k_rows, "top_k_rows");
 
     struct ggml_tensor * top_k = ggml_reshape_1d(ctx, top_k_rows, ctx_data->k);
     struct ggml_tensor * total = ggml_sum(ctx, top_k);
-    struct ggml_tensor * norm = ggml_div(ctx, top_k, ggml_repeat(ctx, total, top_k));
+    struct ggml_tensor * norm = ggml_div_inplace(ctx, top_k, ggml_repeat(ctx, total, top_k));
     ggml_data->probs = norm;
     ggml_build_forward_expand(gf, ggml_data->probs);
 }