
Commit bcf81ed: Address review comments

1 parent 347b769 commit bcf81ed

4 files changed: +13 -21 lines changed


convert_hf_to_gguf.py

Lines changed: 1 addition & 6 deletions
```diff
@@ -8258,11 +8258,7 @@ def set_gguf_parameters(self):
         if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)

-        mask_token_id = 156895
-        if self.hparams.get("mask_token_id") is not None:
-            mask_token_id = self.hparams["mask_token_id"]
-
-        self.gguf_writer.add_mask_token_id(mask_token_id)
+        self.gguf_writer.add_mask_token_id(156895)
         self.gguf_writer.add_causal_attention(False)
         self.gguf_writer.add_diffusion_shift_logits(False)

@@ -8316,7 +8312,6 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")


-
 @ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
```
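Net effect of the first hunk: the converter stops consulting hparams for a mask token and always writes the fixed id 156895 for this diffusion model. A minimal runnable sketch of the before/after control flow, with the GGUF writer stubbed out (the stub class is hypothetical; only add_mask_token_id mirrors the real gguf.GGUFWriter method):

```python
# Minimal sketch of the behavior change; GGUFWriterStub is a stand-in so
# the snippet runs standalone. 156895 is the fixed mask id the converter
# writes after this commit.
class GGUFWriterStub:
    def add_mask_token_id(self, token_id: int) -> None:
        print(f"mask_token_id = {token_id}")

def set_mask_token_before(writer: GGUFWriterStub, hparams: dict) -> None:
    # old behavior: hparams could override the default
    mask_token_id = 156895
    if hparams.get("mask_token_id") is not None:
        mask_token_id = hparams["mask_token_id"]
    writer.add_mask_token_id(mask_token_id)

def set_mask_token_after(writer: GGUFWriterStub, hparams: dict) -> None:
    # new behavior: the id is hardcoded regardless of hparams
    writer.add_mask_token_id(156895)

w = GGUFWriterStub()
set_mask_token_before(w, {"mask_token_id": 12345})  # prints 12345
set_mask_token_after(w, {"mask_token_id": 12345})   # prints 156895
```

The review point is presumably that the model's mask id is architecture-fixed, so an hparams override only invites GGUF files that disagree with the inference code.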

gguf-py/gguf/constants.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -2711,7 +2711,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
-    ]
+    ],
+    # TODO
 }

 # tensors that will not be serialized
```
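For context, the entries above close one architecture's tensor list inside the MODEL_TENSORS mapping in gguf-py. A minimal sketch of the pattern (the dict key shown is hypothetical; only the three FFN_*_EXP members come from the hunk):

```python
from enum import IntEnum

class MODEL_TENSOR(IntEnum):
    FFN_GATE_EXP = 1
    FFN_UP_EXP   = 2
    FFN_DOWN_EXP = 3

# Sketch of the mapping pattern: each architecture keys the list of
# tensors it serializes. The trailing comma added by this commit means a
# future entry (the "# TODO" slot in the hunk) lands as a one-line diff.
MODEL_TENSORS = {
    "llada-moe": [  # the real file keys on MODEL_ARCH enum values
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
    ],
    # TODO
}
```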

src/llama-model.cpp

Lines changed: 7 additions & 11 deletions
```diff
@@ -938,7 +938,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 break;
             case LLM_ARCH_LLADA_MOE:
                 {
-                    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);

                    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                    // diffusion language model uses non-causal attention
@@ -2407,6 +2407,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

+                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2423,13 +2426,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

-                        if (n_expert == 0) {
-                            throw std::runtime_error("n_expert must be > 0");
-                        }
-                        if (n_expert_used == 0) {
-                            throw std::runtime_error("n_expert_used must be > 0");
-                        }
-
                         const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

                         layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
@@ -12535,15 +12531,15 @@ struct llm_build_llada_moe : public llm_graph_context {
                 cb(Vcur, "Vcur", il);

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
                 Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
                 cb(Qcur, "Qcur_normed", il);

-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);

-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
```
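Two tidy-ups in this file. First, the expert-count checks depend only on model-level hyperparameters, so the per-layer runtime_error throws are hoisted out of the loop and become a pair of GGML_ASSERTs that run once, before any tensors are created; a zero expert count is now treated as a broken invariant rather than a recoverable load error. Second, the three ggml_reshape_3d calls are grouped ahead of the Q/K norms so the reshape, normalize, and RoPE stages read as distinct blocks. A Python sketch of the hoisting pattern (function names hypothetical):

```python
# Hypothetical sketch of the validation hoist in load_tensors(): the
# checks depend only on loop-invariant values, so running them once
# before the loop is equivalent to running them on every iteration.
def load_tensors_before(n_expert: int, n_expert_used: int, n_layer: int) -> None:
    for i in range(n_layer):
        if n_expert == 0:
            raise RuntimeError("n_expert must be > 0")
        if n_expert_used == 0:
            raise RuntimeError("n_expert_used must be > 0")
        # ... create per-layer expert tensors ...

def load_tensors_after(n_expert: int, n_expert_used: int, n_layer: int) -> None:
    # mirrors the GGML_ASSERT pair the commit adds before the loop
    assert n_expert > 0, "n_expert must be > 0 for llada-moe"
    assert n_expert_used > 0, "n_expert_used must be > 0 for llada-moe"
    for i in range(n_layer):
        pass  # ... create per-layer expert tensors ...
```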

src/llama-vocab.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -1864,8 +1864,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
                 tokenizer_pre == "a.x-4.0" ||
-                tokenizer_pre == "mellum" ||
-                tokenizer_pre == "llada-moe") {
+                tokenizer_pre == "mellum") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                 tokenizer_pre == "jina-v1-en" ||
@@ -1963,7 +1962,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                 clean_spaces = false;
             } else if (
-                tokenizer_pre == "bailingmoe") {
+                tokenizer_pre == "bailingmoe" ||
+                tokenizer_pre == "llada-moe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;
             } else if (
```
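The vocab change re-routes llada-moe from the GPT2 pre-tokenizer branch to the BAILINGMOE one, which also picks up clean_spaces = false. A minimal sketch of the dispatch after this commit (Python for illustration; only the two branches touched here are modeled, while the real code is an if/else-if chain over tokenizer_pre):

```python
# Hypothetical sketch of the pre-tokenizer dispatch after this commit.
# Only the two branches this hunk touches are modeled.
PRE_TYPE_GPT2       = "LLAMA_VOCAB_PRE_TYPE_GPT2"
PRE_TYPE_BAILINGMOE = "LLAMA_VOCAB_PRE_TYPE_BAILINGMOE"

def resolve_pre_type(tokenizer_pre: str) -> str:
    if tokenizer_pre in ("jina-v2-es", "jina-v2-de", "a.x-4.0", "mellum"):
        return PRE_TYPE_GPT2
    if tokenizer_pre in ("bailingmoe", "llada-moe"):
        # this branch also sets clean_spaces = false in llama-vocab.cpp
        return PRE_TYPE_BAILINGMOE
    raise ValueError(f"unknown tokenizer_pre: {tokenizer_pre}")

assert resolve_pre_type("llada-moe") == PRE_TYPE_BAILINGMOE  # was GPT2 before
```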
