103 changes: 103 additions & 0 deletions common/arg.cpp
@@ -2477,6 +2477,109 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
}
).set_sparam());
add_opt(common_arg(
{"--gpu-sampling"},
"enable GPU sampling (default: disabled)",
[](common_params & params) {
params.sampling.gpu_sampling = true;
}
).set_sparam());
add_opt(common_arg(
{"--gpu-top-k"}, "N",
string_format("GPU top-k sampling (default: %d, <= 0 = disabled)", params.sampling.gpu_top_k),
[](common_params & params, int value) {
params.sampling.gpu_top_k = value;
}
).set_sparam());
add_opt(common_arg(
{"--gpu-top-p-approx-k"}, "N",
string_format("GPU top-p approximation using top-k (default: %d, 0 = disabled)", params.sampling.gpu_top_p_approx_k),
[](common_params & params, int value) {
params.sampling.gpu_top_p_approx_k = value;
}
).set_sparam());
add_opt(common_arg(
{"--gpu-temp"}, "N",
string_format("GPU temperature (default: %.2f, 0.0 = disabled, greedy sampling)", (double)params.sampling.gpu_temp),
[](common_params & params, const std::string & value) {
params.sampling.gpu_temp = std::stof(value);
params.sampling.gpu_temp = std::max(params.sampling.gpu_temp, 0.0f);
}
).set_sparam());
add_opt(common_arg(
{"--gpu-softmax"},
"add GPU softmax to sampling chain (default: disabled)",
[](common_params & params) {
params.sampling.gpu_softmax = true;
}
).set_sparam());
add_opt(common_arg(
{"--gpu-dist"},
"add GPU dist (final sampling) to sampling chain (default: disabled)",
[](common_params & params) {
params.sampling.gpu_dist = true;
}
).set_sparam());
add_opt(common_arg(
// TODO: need to get feedback on how to best configure per slot GPU samplers
{"--gpu-slot"}, "SLOT_ID:CONFIG",
"configure GPU sampling for a specific slot\n"
"format: SLOT_ID:top-k=N,temp=F,dist=BOOL or SLOT_ID:none to disable GPU sampling\n"
"example: --gpu-slot 0:top-k=20,temp=0.8,dist=true --gpu-slot 1:none",
[](common_params & params, const std::string & value) {
auto colon_pos = value.find(':');
if (colon_pos == std::string::npos) {
throw std::invalid_argument("--gpu-slot format must be SLOT_ID:CONFIG");
}

int32_t slot_id = std::stoi(value.substr(0, colon_pos));
std::string config_str = value.substr(colon_pos + 1);

if (config_str == "none") {
common_params_sampling slot_config = params.sampling;
slot_config.gpu_sampling = false;
params.sampling.gpu_slot_configs[slot_id] = slot_config;
return;
}

common_params_sampling slot_config;
auto it = params.sampling.gpu_slot_configs.find(slot_id);
if (it != params.sampling.gpu_slot_configs.end()) {
slot_config = it->second;
} else {
slot_config = params.sampling;
}

size_t pos = 0;
while (pos < config_str.size()) {
size_t eq_pos = config_str.find('=', pos);
if (eq_pos == std::string::npos) break;

size_t comma_pos = config_str.find(',', eq_pos);
if (comma_pos == std::string::npos) {
comma_pos = config_str.size();
}
std::string key = config_str.substr(pos, eq_pos - pos);
std::string val = config_str.substr(eq_pos + 1, comma_pos - eq_pos - 1);

if (key == "top-k") {
slot_config.gpu_top_k = std::stoi(val);
} else if (key == "top-p-approx-k") {
slot_config.gpu_top_p_approx_k = std::stoi(val);
} else if (key == "temp") {
slot_config.gpu_temp = std::stof(val);
} else if (key == "softmax") {
slot_config.gpu_softmax = (val == "true" || val == "1");
} else if (key == "dist") {
slot_config.gpu_dist = (val == "true" || val == "1");
} else {
throw std::invalid_argument("Unknown GPU sampling parameter: " + key);
}
pos = comma_pos + 1;
}
params.sampling.gpu_slot_configs[slot_id] = slot_config;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_sparam());
add_opt(common_arg(
{"--pooling"}, "{none,mean,cls,last,rank}",
"pooling type for embeddings, use model default if unspecified",
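Taken together, the new flags configure a GPU sampling chain, with --gpu-slot registered only for llama-server. As an illustration only (the model path and the concrete values here are placeholders, not part of this change), a server could combine them like this:

llama-server -m model.gguf --gpu-sampling --gpu-top-k 40 --gpu-temp 0.8 --gpu-dist \
    --gpu-slot 0:top-k=20,temp=0.8,dist=true --gpu-slot 1:none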
3 changes: 3 additions & 0 deletions common/common.cpp
@@ -8,6 +8,7 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "sampling.h"

#include <algorithm>
#include <cinttypes>
@@ -927,6 +928,8 @@ struct common_init_result common_init_from_params(common_params & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);

auto cparams = common_context_params_to_llama(params);
cparams.samplers = params.gpu_samplers;
cparams.n_samplers = params.n_gpu_samplers;

llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
14 changes: 14 additions & 0 deletions common/common.h
@@ -188,6 +188,17 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

// GPU sampling parameters
bool gpu_sampling = false; // enable GPU sampling
int32_t gpu_top_k = 40; // GPU top-k (<= 0 to disable)
int32_t gpu_top_p_approx_k = 0; // GPU top-p approximation using top-k (0 = disabled)
float gpu_temp = 0.80f; // GPU temperature (0.0 = disabled, greedy sampling)
bool gpu_softmax = false; // add GPU softmax to chain
bool gpu_dist = false; // add GPU dist (final sampling) to chain

// Per-slot GPU sampling configuration (llama-server)
std::map<int32_t, common_params_sampling> gpu_slot_configs;

// print the parameters into a string
std::string print() const;
};
@@ -511,6 +522,9 @@ struct common_params {
bool has_speculative() const {
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
}

struct llama_sampler_seq_config * gpu_samplers;
size_t n_gpu_samplers;
};

// call once at the start of a program if it uses libcommon
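The per-slot map falls back to the global sampling parameters when a slot has no entry, mirroring how --gpu-slot merges values in common/arg.cpp above. A minimal sketch of the lookup a server-side caller might perform; resolve_slot_sampling is an illustrative helper, not part of this change:

// Illustrative helper (not in this PR): pick the effective sampling parameters
// for a slot, preferring a per-slot override when one was configured.
static common_params_sampling resolve_slot_sampling(const common_params_sampling & base, int32_t slot_id) {
    const auto it = base.gpu_slot_configs.find(slot_id);
    return it != base.gpu_slot_configs.end() ? it->second : base;
}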
16 changes: 10 additions & 6 deletions common/llguidance.cpp
@@ -106,12 +106,16 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
}

static llama_sampler_i llama_sampler_llg_i = {
/* .name = */ llama_sampler_llg_name,
/* .accept = */ llama_sampler_llg_accept_impl,
/* .apply = */ llama_sampler_llg_apply,
/* .reset = */ llama_sampler_llg_reset,
/* .clone = */ llama_sampler_llg_clone,
/* .free = */ llama_sampler_llg_free,
/* .name = */ llama_sampler_llg_name,
/* .accept = */ llama_sampler_llg_accept_impl,
/* .apply = */ llama_sampler_llg_apply,
/* .reset = */ llama_sampler_llg_reset,
/* .clone = */ llama_sampler_llg_clone,
/* .free = */ llama_sampler_llg_free,
/* .apply_ggml = */ NULL,
/* .accept_ggml = */ NULL,
/* .set_input_ggml = */ NULL,
/* .set_backend_context = */ NULL,
};

static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
85 changes: 81 additions & 4 deletions common/sampling.cpp
@@ -113,17 +113,51 @@ struct common_sampler {
llama_token_data_array cur_p;

void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
const float * sampled_probs = llama_get_sampled_probs_ith(ctx, idx);
const float * sampled_logits = llama_get_sampled_logits_ith(ctx, idx);
const llama_token * sampled_ids = llama_get_sampled_token_ids_ith(ctx, idx);

const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);

const int n_vocab = llama_vocab_n_tokens(vocab);

cur.resize(n_vocab);
// Use the member variable instead of allocating locally
cur.clear();

for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
if (sampled_probs) {
const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
cur.reserve(sampled_probs_count);
// The GPU sampler has filtered the probabilities so we need to use the sampled ids.
if (sampled_ids != nullptr) {
for (uint32_t i = 0; i < sampled_probs_count; ++i) {
cur.emplace_back(llama_token_data{sampled_ids[i], 0.0f, sampled_probs[i]});
}
} else {
for (llama_token token_id = 0; token_id < (int) sampled_probs_count; token_id++) {
cur.emplace_back(llama_token_data{token_id, 0.0f, sampled_probs[token_id]});
}
}
} else if (sampled_logits) {
const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
cur.reserve(sampled_logits_count);
// The GPU sampler has filtered the logits so we need to use the sampled ids.
if (sampled_ids != nullptr) {
for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
}
} else {
for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
}
}
} else {
const auto * logits = llama_get_logits_ith(ctx, idx);
GGML_ASSERT(logits != nullptr);
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
}

cur_p = { cur.data(), cur.size(), -1, false };
@@ -287,6 +321,42 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
return result;
}

struct llama_sampler * common_sampler_gpu_init(const struct llama_model * model, const struct common_params_sampling & params) {
GGML_UNUSED(model);

llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
chain_params.no_perf = params.no_perf;

struct llama_sampler * chain = llama_sampler_chain_init(chain_params);

if (!params.gpu_sampling) {
return chain; // return empty chain
}

if (params.gpu_temp > 0.0f) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_temp(params.gpu_temp));
}

if (params.gpu_top_k > 0) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_k(params.gpu_top_k));
}

// TODO: GPU top_p is an approximation using top_k at the moment
if (params.gpu_top_p_approx_k > 0) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_p(params.gpu_top_p_approx_k));
}

if (params.gpu_softmax) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_softmax());
}

if (params.gpu_dist) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_dist(params.seed));
}

return chain;
}

void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
@@ -337,6 +407,13 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}

llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
// Check if a GPU sampler has already sampled a token in which case we
// return that token id directly.
const llama_token gpu_sampled_token = llama_get_sampled_token_ith(ctx, idx);
if (gpu_sampled_token != LLAMA_TOKEN_NULL) {
return gpu_sampled_token;
}

gsmpl->set_logits(ctx, idx);

auto & grmr = gsmpl->grmr;
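Call sites do not need to change for the GPU path: common_sampler_sample returns the GPU-sampled token directly when one is available and only builds the CPU candidate array otherwise. A sketch of the usual decode loop (batch and context setup elided), unchanged by this PR:

// The GPU fast path is handled inside common_sampler_sample; callers keep the usual pattern.
llama_decode(ctx, batch);
const llama_token id = common_sampler_sample(gsmpl, ctx, /* idx */ -1);
common_sampler_accept(gsmpl, id, /* accept_grammar */ true);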
7 changes: 7 additions & 0 deletions common/sampling.h
@@ -38,6 +38,13 @@ struct common_sampler;

struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);

// Create a GPU sampler chain from common sampling parameters
// Returns a llama_sampler chain configured with GPU samplers based on the parameters
// This chain can be used per-sequence for GPU-based sampling
// Note: Only samplers that have GPU equivalents will be added to the chain
// The returned sampler should be freed with llama_sampler_free()
struct llama_sampler * common_sampler_gpu_init(const struct llama_model * model, const struct common_params_sampling & params);

void common_sampler_free(struct common_sampler * gsmpl);

// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
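A minimal usage sketch for the new helper (illustrative only; the actual per-sequence wiring goes through llama_context_params in common_init_from_params above):

// Build a GPU sampler chain from the common sampling parameters.
// If params.sampling.gpu_sampling is false, the returned chain is simply empty.
llama_sampler * gpu_chain = common_sampler_gpu_init(model, params.sampling);

// ... attach the chain to a sequence via the context parameters ...

// The chain is an ordinary llama_sampler and is owned by the caller.
llama_sampler_free(gpu_chain);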
5 changes: 5 additions & 0 deletions ggml/include/ggml.h
@@ -557,6 +557,7 @@ extern "C" {
GGML_OP_OPT_STEP_SGD,

GGML_OP_GLU,
GGML_OP_CUMSUM,

GGML_OP_COUNT,
};
@@ -978,6 +979,10 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_cumsum(
struct ggml_context * ctx,
struct ggml_tensor * a);

// sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
GGML_API struct ggml_tensor * ggml_sum_rows(
struct ggml_context * ctx,
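The new operator is declared without a doc comment; assuming it follows the neighbouring reductions and computes an inclusive prefix sum along rows while preserving the input shape (an assumption, not stated in this hunk), a typical use in a sampling graph would be building a CDF:

// Sketch only: assumes ggml_cumsum is an inclusive per-row prefix sum, e.g. to
// turn softmax probabilities into a CDF that a dist sampler can draw from.
struct ggml_tensor * probs = ggml_soft_max(ctx, logits);
struct ggml_tensor * cdf   = ggml_cumsum(ctx, probs); // cdf[i] = probs[0] + ... + probs[i]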
5 changes: 5 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -1727,6 +1727,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_sum(params, tensor);
} break;
case GGML_OP_CUMSUM:
{
ggml_compute_forward_cumsum(params, tensor);
} break;
case GGML_OP_SUM_ROWS:
{
ggml_compute_forward_sum_rows(params, tensor);
@@ -2150,6 +2154,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_SUM:
case GGML_OP_CUMSUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGMAX: