 #include "llama-gpu-sampling.h"
 #include "ggml.h"
 #include <cstdio>
+#include <chrono>
+#include <random>

 static void llama_sampler_gpu_greedy_apply_ggml(
         struct llama_sampler * smpl,
@@ -27,14 +29,15 @@ static struct llama_sampler * llama_sampler_gpu_greedy_clone(const struct llama_

 struct llama_sampler * llama_sampler_gpu_init_greedy() {
     static const llama_sampler_i iface = {
-        /* .name        =*/ llama_sampler_gpu_greedy_sampler_name,
-        /* .accept      =*/ nullptr,
-        /* .apply       =*/ nullptr,
-        /* .reset       =*/ nullptr,
-        /* .clone       =*/ llama_sampler_gpu_greedy_clone,
-        /* .free        =*/ nullptr,
-        /* .apply_ggml  =*/ llama_sampler_gpu_greedy_apply_ggml,
-        /* .accept_ggml =*/ nullptr,
+        /* .name           =*/ llama_sampler_gpu_greedy_sampler_name,
+        /* .accept         =*/ nullptr,
+        /* .apply          =*/ nullptr,
+        /* .reset          =*/ nullptr,
+        /* .clone          =*/ llama_sampler_gpu_greedy_clone,
+        /* .free           =*/ nullptr,
+        /* .apply_ggml     =*/ llama_sampler_gpu_greedy_apply_ggml,
+        /* .accept_ggml    =*/ nullptr,
+        /* .set_input_ggml =*/ nullptr,
     };

     auto * sampler = new llama_sampler {
@@ -85,14 +88,15 @@ static struct llama_sampler * llama_sampler_gpu_temp_clone(const struct llama_sa

 struct llama_sampler * llama_sampler_gpu_init_temp(float temp) {
     static const llama_sampler_i iface = {
-        /* .name        =*/ llama_sampler_gpu_temp_name,
-        /* .accept      =*/ nullptr,
-        /* .apply       =*/ nullptr,
-        /* .reset       =*/ nullptr,
-        /* .clone       =*/ llama_sampler_gpu_temp_clone,
-        /* .free        =*/ llama_sampler_gpu_temp_free,
-        /* .apply_ggml  =*/ llama_sampler_gpu_temp_apply_ggml,
-        /* .accept_ggml =*/ nullptr,
+        /* .name           =*/ llama_sampler_gpu_temp_name,
+        /* .accept         =*/ nullptr,
+        /* .apply          =*/ nullptr,
+        /* .reset          =*/ nullptr,
+        /* .clone          =*/ llama_sampler_gpu_temp_clone,
+        /* .free           =*/ llama_sampler_gpu_temp_free,
+        /* .apply_ggml     =*/ llama_sampler_gpu_temp_apply_ggml,
+        /* .accept_ggml    =*/ nullptr,
+        /* .set_input_ggml =*/ nullptr,
     };

     auto * ctx_data = new llama_sampler_gpu_temp_ctx {
@@ -141,14 +145,15 @@ static struct llama_sampler * llama_sampler_gpu_softmax_clone(const struct llama

 struct llama_sampler * llama_sampler_gpu_init_softmax() {
     static const llama_sampler_i iface = {
-        /* .name        =*/ llama_sampler_gpu_softmax_name,
-        /* .accept      =*/ nullptr,
-        /* .apply       =*/ nullptr,
-        /* .reset       =*/ nullptr,
-        /* .clone       =*/ llama_sampler_gpu_softmax_clone,
-        /* .free        =*/ llama_sampler_gpu_softmax_free,
-        /* .apply_ggml  =*/ llama_sampler_gpu_softmax_apply_ggml,
-        /* .accept_ggml =*/ nullptr,
+        /* .name           =*/ llama_sampler_gpu_softmax_name,
+        /* .accept         =*/ nullptr,
+        /* .apply          =*/ nullptr,
+        /* .reset          =*/ nullptr,
+        /* .clone          =*/ llama_sampler_gpu_softmax_clone,
+        /* .free           =*/ llama_sampler_gpu_softmax_free,
+        /* .apply_ggml     =*/ llama_sampler_gpu_softmax_apply_ggml,
+        /* .accept_ggml    =*/ nullptr,
+        /* .set_input_ggml =*/ nullptr,
     };

     auto * ctx_data = new llama_sampler_gpu_softmax_ctx {
@@ -204,14 +209,15 @@ static struct llama_sampler * llama_sampler_gpu_top_k_clone(const struct llama_s

 struct llama_sampler * llama_sampler_gpu_init_top_k(int32_t k) {
     static const llama_sampler_i iface = {
-        /* .name        =*/ llama_sampler_gpu_top_k_name,
-        /* .accept      =*/ nullptr,
-        /* .apply       =*/ nullptr,
-        /* .reset       =*/ nullptr,
-        /* .clone       =*/ llama_sampler_gpu_top_k_clone,
-        /* .free        =*/ llama_sampler_gpu_top_k_free,
-        /* .apply_ggml  =*/ llama_sampler_gpu_top_k_apply_ggml,
-        /* .accept_ggml =*/ nullptr,
+        /* .name           =*/ llama_sampler_gpu_top_k_name,
+        /* .accept         =*/ nullptr,
+        /* .apply          =*/ nullptr,
+        /* .reset          =*/ nullptr,
+        /* .clone          =*/ llama_sampler_gpu_top_k_clone,
+        /* .free           =*/ llama_sampler_gpu_top_k_free,
+        /* .apply_ggml     =*/ llama_sampler_gpu_top_k_apply_ggml,
+        /* .accept_ggml    =*/ nullptr,
+        /* .set_input_ggml =*/ nullptr,
     };

     auto * ctx_data = new llama_sampler_gpu_top_k_ctx {
@@ -274,14 +280,15 @@ static struct llama_sampler * llama_sampler_gpu_top_p_clone(const struct llama_s

 struct llama_sampler * llama_sampler_gpu_init_top_p(int32_t k) {
     static const llama_sampler_i iface = {
-        /* .name        =*/ llama_sampler_gpu_top_p_name,
-        /* .accept      =*/ nullptr,
-        /* .apply       =*/ nullptr,
-        /* .reset       =*/ nullptr,
-        /* .clone       =*/ llama_sampler_gpu_top_p_clone,
-        /* .free        =*/ llama_sampler_gpu_top_p_free,
-        /* .apply_ggml  =*/ llama_sampler_gpu_top_p_apply_ggml,
-        /* .accept_ggml =*/ nullptr,
+        /* .name           =*/ llama_sampler_gpu_top_p_name,
+        /* .accept         =*/ nullptr,
+        /* .apply          =*/ nullptr,
+        /* .reset          =*/ nullptr,
+        /* .clone          =*/ llama_sampler_gpu_top_p_clone,
+        /* .free           =*/ llama_sampler_gpu_top_p_free,
+        /* .apply_ggml     =*/ llama_sampler_gpu_top_p_apply_ggml,
+        /* .accept_ggml    =*/ nullptr,
+        /* .set_input_ggml =*/ nullptr,
     };

     auto * ctx_data = new llama_sampler_gpu_top_p_ctx {
@@ -295,3 +302,131 @@ struct llama_sampler * llama_sampler_gpu_init_top_p(int32_t k) {

     return sampler;
 }
+
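+// Resolve LLAMA_DEFAULT_SEED to a fresh seed (from std::random_device, or the
+// system clock when random_device is only a PRNG); any explicit seed is
+// returned unchanged so runs stay reproducible.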
+static uint32_t get_rng_seed(uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        // use system clock if std::random_device is not a true RNG
+        static bool is_rd_prng = std::random_device().entropy() == 0;
+        if (is_rd_prng) {
+            return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
+        }
+        std::random_device rd;
+        return rd();
+    }
+    return seed;
+}
+
+struct llama_sampler_gpu_dist_ctx {
+    const uint32_t seed;
+    uint32_t       seed_cur;
+    std::mt19937   rng;
+
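+    // Non-owning pointer to the scalar "uniform" input tensor; it is
+    // (re)created in the graph context on every apply_ggml call and filled
+    // in by set_input_ggml before the graph runs.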
+    struct ggml_tensor * uniform;
+};
+
+static void llama_sampler_gpu_dist_apply_ggml(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_ggml_data * ggml_data) {
+    GGML_UNUSED(gf);
+    auto * sctx = (llama_sampler_gpu_dist_ctx *) smpl->ctx;
+    printf("gpu dist: Building sampler with seed=%u\n", sctx->seed);
+
+    // Create the uniform random scalar input tensor. This will be set by
+    // llama_sampler_gpu_dist_set_input_ggml after this graph is built, but
+    // before it is executed.
+    struct ggml_tensor * uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    sctx->uniform = uniform;
+    ggml_set_name(uniform, "uniform");
+    ggml_set_input(uniform);
+    ggml_set_output(uniform);
+
+    struct ggml_tensor * softmax = ggml_soft_max(ctx, ggml_data->logits);
+    ggml_set_name(softmax, "softmax");
+
+    struct ggml_tensor * cumsum = ggml_cumsum(ctx, softmax);
+    ggml_set_name(cumsum, "cumsum");
+
+    // Broadcast the random uniform value to match cumsum's shape.
+    struct ggml_tensor * rnd_rep = ggml_repeat(ctx, sctx->uniform, cumsum);
+    ggml_set_name(rnd_rep, "dist_rand_rep");
+
+    // Each entry in rnd_rep holds the random value, so we subtract the cumsum
+    // tensor from it. Recall that each entry in cumsum is the cumulative
+    // probability up to that index. While that entry is smaller than the
+    // random value the difference is positive, but once it reaches the random
+    // value the difference becomes zero or negative.
+    struct ggml_tensor * diff = ggml_sub(ctx, rnd_rep, cumsum);
+    ggml_set_name(diff, "dist_rnd_minus_cumsum");
+
+    // ggml_step produces a tensor whose entries are 1 where the corresponding
+    // entry in diff is > 0, and 0 otherwise. So all values up to the index
+    // where the cumulative probability exceeds the random value are 1, and
+    // all entries after that are 0.
+    struct ggml_tensor * mask = ggml_step(ctx, diff);
+    ggml_set_name(mask, "dist_mask");
+
+    // Taking the sum of the mask gives us the index at which the cumulative
+    // probability first reaches the random value, which is our sampled token
+    // index as a float.
+    struct ggml_tensor * idxf = ggml_sum(ctx, mask);
+    ggml_set_name(idxf, "dist_index_f32");
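+    // Worked example (illustrative values): softmax = [0.1, 0.5, 0.3, 0.1]
+    // gives cumsum = [0.1, 0.6, 0.9, 1.0]. For a uniform draw of 0.7 the
+    // diff is [0.6, 0.1, -0.2, -0.3], the mask is [1, 1, 0, 0], and the sum
+    // is 2, i.e. the first index whose cumulative probability reaches 0.7.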
+
+    // Cast the float index to an integer.
+    struct ggml_tensor * idx = ggml_cast(ctx, idxf, GGML_TYPE_I32);
+    ggml_set_name(idx, "dist_index_i32");
+    ggml_set_output(idx);
+    ggml_data->sampled_token = idx;
+}
+
+static const char * llama_sampler_gpu_dist_name(const struct llama_sampler *) {
+    return "gpu-dist";
+}
+
+static void llama_sampler_gpu_dist_free(struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_gpu_dist_ctx *) smpl->ctx;
+    delete sctx;
+}
+
+static struct llama_sampler * llama_sampler_gpu_dist_clone(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_gpu_dist_ctx *) smpl->ctx;
+    return llama_sampler_gpu_init_dist(sctx->seed);
+}
+
+static void llama_sampler_gpu_dist_set_input_ggml(struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_gpu_dist_ctx *) smpl->ctx;
+    GGML_ASSERT(sctx->uniform != nullptr);
+
+    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+    const float rnd = dist(sctx->rng);
+    ggml_backend_tensor_set(sctx->uniform, &rnd, 0, sizeof(rnd));
+}
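+// Expected call order for the dist sampler (a sketch inferred from the
+// comments above; the code that drives the graph lives outside this file):
+//   1. apply_ggml      - adds the sampling ops and the "uniform" input tensor
+//                        while the batch graph is being built
+//   2. graph allocation by the backend
+//   3. set_input_ggml  - draws a value in [0, 1) from the RNG and uploads it
+//                        into the "uniform" tensor
+//   4. graph compute   - leaves the sampled token id in ggml_data->sampled_token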
+
+struct llama_sampler * llama_sampler_gpu_init_dist(uint32_t seed) {
+    static const llama_sampler_i iface = {
+        /* .name           =*/ llama_sampler_gpu_dist_name,
+        /* .accept         =*/ nullptr,
+        /* .apply          =*/ nullptr,
+        /* .reset          =*/ nullptr,
+        /* .clone          =*/ llama_sampler_gpu_dist_clone,
+        /* .free           =*/ llama_sampler_gpu_dist_free,
+        /* .apply_ggml     =*/ llama_sampler_gpu_dist_apply_ggml,
+        /* .accept_ggml    =*/ nullptr,
+        /* .set_input_ggml =*/ llama_sampler_gpu_dist_set_input_ggml,
+    };
+
+    auto seed_cur = get_rng_seed(seed);
+    auto * ctx_data = new llama_sampler_gpu_dist_ctx {
+        /* .seed     =*/ seed,
+        /* .seed_cur =*/ seed_cur,
+        /* .rng      =*/ std::mt19937(seed_cur),
+        /* .uniform  =*/ nullptr,
+    };
+
+    auto * sampler = new llama_sampler {
+        /* .iface =*/ &iface,
+        /* .ctx   =*/ ctx_data,
+    };
+
+    return sampler;
+}