103 changes: 103 additions & 0 deletions common/arg.cpp
@@ -2477,6 +2477,109 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
}
).set_sparam());
add_opt(common_arg(
{"--gpu-sampling"},
"enable GPU sampling (default: disabled)",
[](common_params & params) {
params.sampling.gpu_sampling = true;
}
).set_sparam());
add_opt(common_arg(
{"--gpu-top-k"}, "N",
string_format("GPU top-k sampling (default: %d, <= 0 = disabled)", params.sampling.gpu_top_k),
[](common_params & params, int value) {
params.sampling.gpu_top_k = value;
}
).set_sparam());
add_opt(common_arg(
{"--gpu-top-p-approx-k"}, "N",
string_format("GPU top-p approximation using top-k (default: %d, 0 = disabled)", params.sampling.gpu_top_p_approx_k),
[](common_params & params, int value) {
params.sampling.gpu_top_p_approx_k = value;
}
).set_sparam());
add_opt(common_arg(
{"--gpu-temp"}, "N",
string_format("GPU temperature (default: %.2f, 0.0 = disabled, greedy sampling)", (double)params.sampling.gpu_temp),
[](common_params & params, const std::string & value) {
params.sampling.gpu_temp = std::stof(value);
params.sampling.gpu_temp = std::max(params.sampling.gpu_temp, 0.0f);
}
).set_sparam());
add_opt(common_arg(
{"--gpu-softmax"},
"add GPU softmax to sampling chain (default: disabled)",
[](common_params & params) {
params.sampling.gpu_softmax = true;
}
).set_sparam());
add_opt(common_arg(
{"--gpu-dist"},
"add GPU dist (final sampling) to sampling chain (default: disabled)",
[](common_params & params) {
params.sampling.gpu_dist = true;
}
).set_sparam());
add_opt(common_arg(
// TODO: need to get feedback on how to best configure per slot GPU samplers
{"--gpu-slot"}, "SLOT_ID:CONFIG",
"configure GPU sampling for a specific slot\n"
"format: SLOT_ID:top-k=N,temp=F,dist=BOOL or SLOT_ID:none to disable GPU sampling\n"
"example: --gpu-slot 0:top-k=20,temp=0.8,dist=true --gpu-slot 1:none",
[](common_params & params, const std::string & value) {
auto colon_pos = value.find(':');
if (colon_pos == std::string::npos) {
throw std::invalid_argument("--gpu-slot format must be SLOT_ID:CONFIG");
}

int32_t slot_id = std::stoi(value.substr(0, colon_pos));
std::string config_str = value.substr(colon_pos + 1);

if (config_str == "none") {
common_params_sampling slot_config = params.sampling;
slot_config.gpu_sampling = false;
params.sampling.gpu_slot_configs[slot_id] = slot_config;
return;
}

common_params_sampling slot_config;
auto it = params.sampling.gpu_slot_configs.find(slot_id);
if (it != params.sampling.gpu_slot_configs.end()) {
slot_config = it->second;
} else {
slot_config = params.sampling;
}

size_t pos = 0;
while (pos < config_str.size()) {
size_t eq_pos = config_str.find('=', pos);
if (eq_pos == std::string::npos) break;

size_t comma_pos = config_str.find(',', eq_pos);
if (comma_pos == std::string::npos) {
comma_pos = config_str.size();
}
std::string key = config_str.substr(pos, eq_pos - pos);
std::string val = config_str.substr(eq_pos + 1, comma_pos - eq_pos - 1);

if (key == "top-k") {
slot_config.gpu_top_k = std::stoi(val);
} else if (key == "top-p-approx-k") {
slot_config.gpu_top_p_approx_k = std::stoi(val);
} else if (key == "temp") {
slot_config.gpu_temp = std::stof(val);
} else if (key == "softmax") {
slot_config.gpu_softmax = (val == "true" || val == "1");
} else if (key == "dist") {
slot_config.gpu_dist = (val == "true" || val == "1");
} else {
throw std::invalid_argument("Unknown GPU sampling parameter: " + key);
}
pos = comma_pos + 1;
}
params.sampling.gpu_slot_configs[slot_id] = slot_config;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_sparam());
add_opt(common_arg(
{"--pooling"}, "{none,mean,cls,last,rank}",
"pooling type for embeddings, use model default if unspecified",
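Taken together, the new flags configure a GPU sampling chain, with --gpu-slot registered only for llama-server. As an illustration only (the model path and the concrete values here are placeholders, not part of this change), a server could combine them like this:

llama-server -m model.gguf --gpu-sampling --gpu-top-k 40 --gpu-temp 0.8 --gpu-dist \
    --gpu-slot 0:top-k=20,temp=0.8,dist=true --gpu-slot 1:none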
3 changes: 3 additions & 0 deletions common/common.cpp
@@ -8,6 +8,7 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "sampling.h"

#include <algorithm>
#include <cinttypes>
@@ -927,6 +928,8 @@ struct common_init_result common_init_from_params(common_params & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);

auto cparams = common_context_params_to_llama(params);
cparams.samplers = params.gpu_samplers;
cparams.n_samplers = params.n_gpu_samplers;

llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
14 changes: 14 additions & 0 deletions common/common.h
@@ -188,6 +188,17 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

// GPU sampling parameters
bool gpu_sampling = false; // enable GPU sampling
int32_t gpu_top_k = 40; // GPU top-k (<= 0 to disable)
int32_t gpu_top_p_approx_k = 0; // GPU top-p approximation using top-k (0 = disabled)
float gpu_temp = 0.80f; // GPU temperature (0.0 = disabled, greedy sampling)
bool gpu_softmax = false; // add GPU softmax to chain
bool gpu_dist = false; // add GPU dist (final sampling) to chain

// Per-slot GPU sampling configuration (llama-server)
std::map<int32_t, common_params_sampling> gpu_slot_configs;

// print the parameters into a string
std::string print() const;
};
@@ -511,6 +522,9 @@ struct common_params {
bool has_speculative() const {
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
}

struct llama_sampler_seq_config * gpu_samplers;
size_t n_gpu_samplers;
};

// call once at the start of a program if it uses libcommon
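The per-slot map falls back to the global sampling parameters when a slot has no entry, mirroring how --gpu-slot merges values in common/arg.cpp above. A minimal sketch of the lookup a server-side caller might perform; resolve_slot_sampling is an illustrative helper, not part of this change:

// Illustrative helper (not in this PR): pick the effective sampling parameters
// for a slot, preferring a per-slot override when one was configured.
static common_params_sampling resolve_slot_sampling(const common_params_sampling & base, int32_t slot_id) {
    const auto it = base.gpu_slot_configs.find(slot_id);
    return it != base.gpu_slot_configs.end() ? it->second : base;
}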
16 changes: 10 additions & 6 deletions common/llguidance.cpp
@@ -106,12 +106,16 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
}

static llama_sampler_i llama_sampler_llg_i = {
/* .name = */ llama_sampler_llg_name,
/* .accept = */ llama_sampler_llg_accept_impl,
/* .apply = */ llama_sampler_llg_apply,
/* .reset = */ llama_sampler_llg_reset,
/* .clone = */ llama_sampler_llg_clone,
/* .free = */ llama_sampler_llg_free,
/* .name = */ llama_sampler_llg_name,
/* .accept = */ llama_sampler_llg_accept_impl,
/* .apply = */ llama_sampler_llg_apply,
/* .reset = */ llama_sampler_llg_reset,
/* .clone = */ llama_sampler_llg_clone,
/* .free = */ llama_sampler_llg_free,
/* .apply_ggml = */ NULL,
/* .accept_ggml = */ NULL,
/* .set_input_ggml = */ NULL,
/* .set_backend_context = */ NULL,
};

static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
85 changes: 81 additions & 4 deletions common/sampling.cpp
@@ -113,17 +113,51 @@ struct common_sampler {
llama_token_data_array cur_p;

void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
const float * sampled_probs = llama_get_sampled_probs_ith(ctx, idx);
const float * sampled_logits = llama_get_sampled_logits_ith(ctx, idx);
const llama_token * sampled_ids = llama_get_sampled_token_ids_ith(ctx, idx);

const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);

const int n_vocab = llama_vocab_n_tokens(vocab);

cur.resize(n_vocab);
// Use the member variable instead of allocating locally
cur.clear();

for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
if (sampled_probs) {
const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
cur.reserve(sampled_probs_count);
// The GPU sampler has filtered the probabilities so we need to use the sampled ids.
if (sampled_ids != nullptr) {
for (uint32_t i = 0; i < sampled_probs_count; ++i) {
cur.emplace_back(llama_token_data{sampled_ids[i], 0.0f, sampled_probs[i]});
}
} else {
for (llama_token token_id = 0; token_id < (int) sampled_probs_count; token_id++) {
cur.emplace_back(llama_token_data{token_id, 0.0f, sampled_probs[token_id]});
}
}
} else if (sampled_logits) {
const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
cur.reserve(sampled_logits_count);
// The GPU sampler has filtered the logits so we need to use the sampled ids.
if (sampled_ids != nullptr) {
for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
}
} else {
for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
}
}
} else {
const auto * logits = llama_get_logits_ith(ctx, idx);
GGML_ASSERT(logits != nullptr);
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
}

cur_p = { cur.data(), cur.size(), -1, false };
@@ -287,6 +321,42 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
return result;
}

struct llama_sampler * common_sampler_gpu_init(const struct llama_model * model, const struct common_params_sampling & params) {
GGML_UNUSED(model);

llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
chain_params.no_perf = params.no_perf;

struct llama_sampler * chain = llama_sampler_chain_init(chain_params);

if (!params.gpu_sampling) {
return chain; // return empty chain
}

if (params.gpu_temp > 0.0f) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_temp(params.gpu_temp));
}

if (params.gpu_top_k > 0) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_k(params.gpu_top_k));
}

// TODO: GPU top_p is an approximation using top_k at the moment
if (params.gpu_top_p_approx_k > 0) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_p(params.gpu_top_p_approx_k));
}

if (params.gpu_softmax) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_softmax());
}

if (params.gpu_dist) {
llama_sampler_chain_add(chain, llama_sampler_gpu_init_dist(params.seed));
}

return chain;
}

void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
@@ -337,6 +407,13 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}

llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
// Check if a GPU sampler has already sampled a token in which case we
// return that token id directly.
const llama_token gpu_sampled_token = llama_get_sampled_token_ith(ctx, idx);
if (gpu_sampled_token != LLAMA_TOKEN_NULL) {
return gpu_sampled_token;
}

gsmpl->set_logits(ctx, idx);

auto & grmr = gsmpl->grmr;
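Call sites do not need to change for the GPU path: common_sampler_sample returns the GPU-sampled token directly when one is available and only builds the CPU candidate array otherwise. A sketch of the usual decode loop (batch and context setup elided), unchanged by this PR:

// The GPU fast path is handled inside common_sampler_sample; callers keep the usual pattern.
llama_decode(ctx, batch);
const llama_token id = common_sampler_sample(gsmpl, ctx, /* idx */ -1);
common_sampler_accept(gsmpl, id, /* accept_grammar */ true);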
7 changes: 7 additions & 0 deletions common/sampling.h
@@ -38,6 +38,13 @@ struct common_sampler;

struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);

// Create a GPU sampler chain from common sampling parameters
// Returns a llama_sampler chain configured with GPU samplers based on the parameters
// This chain can be used per-sequence for GPU-based sampling
// Note: Only samplers that have GPU equivalents will be added to the chain
// The returned sampler should be freed with llama_sampler_free()
struct llama_sampler * common_sampler_gpu_init(const struct llama_model * model, const struct common_params_sampling & params);

void common_sampler_free(struct common_sampler * gsmpl);

// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
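A minimal usage sketch for the new helper (illustrative only; the actual per-sequence wiring goes through llama_context_params in common_init_from_params above):

// Build a GPU sampler chain from the common sampling parameters.
// If params.sampling.gpu_sampling is false, the returned chain is simply empty.
llama_sampler * gpu_chain = common_sampler_gpu_init(model, params.sampling);

// ... attach the chain to a sequence via the context parameters ...

// The chain is an ordinary llama_sampler and is owned by the caller.
llama_sampler_free(gpu_chain);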
5 changes: 5 additions & 0 deletions ggml/include/ggml.h
@@ -557,6 +557,7 @@ extern "C" {
GGML_OP_OPT_STEP_SGD,

GGML_OP_GLU,
GGML_OP_CUMSUM,

GGML_OP_COUNT,
};
@@ -978,6 +979,10 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_cumsum(
struct ggml_context * ctx,
struct ggml_tensor * a);

// sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
GGML_API struct ggml_tensor * ggml_sum_rows(
struct ggml_context * ctx,
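The new operator is declared without a doc comment; assuming it follows the neighbouring reductions and computes an inclusive prefix sum along rows while preserving the input shape (an assumption, not stated in this hunk), a typical use in a sampling graph would be building a CDF:

// Sketch only: assumes ggml_cumsum is an inclusive per-row prefix sum, e.g. to
// turn softmax probabilities into a CDF that a dist sampler can draw from.
struct ggml_tensor * probs = ggml_soft_max(ctx, logits);
struct ggml_tensor * cdf   = ggml_cumsum(ctx, probs); // cdf[i] = probs[0] + ... + probs[i]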
5 changes: 5 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -1727,6 +1727,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_sum(params, tensor);
} break;
case GGML_OP_CUMSUM:
{
ggml_compute_forward_cumsum(params, tensor);
} break;
case GGML_OP_SUM_ROWS:
{
ggml_compute_forward_sum_rows(params, tensor);
@@ -2150,6 +2154,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_SUM:
case GGML_OP_CUMSUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGMAX: