@@ -36,6 +36,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_80M: return "80M";
         case LLM_TYPE_109M: return "109M";
         case LLM_TYPE_137M: return "137M";
+        case LLM_TYPE_140M: return "140M";
         case LLM_TYPE_160M: return "160M";
         case LLM_TYPE_190M: return "190M";
         case LLM_TYPE_220M: return "220M";
@@ -44,13 +45,15 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_270M: return "270M";
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_350M: return "350M";
+        case LLM_TYPE_360M: return "360M";
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
         case LLM_TYPE_558M: return "558M";
         case LLM_TYPE_700M: return "700M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
+        case LLM_TYPE_950M: return "950M";
         case LLM_TYPE_0_3B: return "0.3B";
         case LLM_TYPE_0_5B: return "0.5B";
         case LLM_TYPE_0_6B: return "0.6B";
@@ -622,19 +625,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);

-                hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
-                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa == 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa = 8192;
+                    hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+                }

                 switch (hparams.n_expert) {
+                    case 0: {
+                        // MobileLLM (no MoE)
+                        switch (hparams.n_embd) {
+                            case 2048: type = LLM_TYPE_140M; break;
+                            case 4096: type = LLM_TYPE_360M; break;
+                            case 6144: type = LLM_TYPE_950M; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        }
+                    } break;
                     case 16: type = LLM_TYPE_17B_16E; break;
                     case 128: type = LLM_TYPE_17B_128E; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }

-                if (type == LLM_TYPE_17B_128E) {
-                    hparams.use_kq_norm = false;
-                }
+                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
             } break;
         case LLM_ARCH_ARCEE:
             {
@@ -2454,9 +2470,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }

-                    GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
                     for (int i = 0; i < n_layer; ++i) {
-                        bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+                        bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;

                         auto & layer = layers[i];

@@ -6328,6 +6343,14 @@ struct llm_build_llama : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

+                if (hparams.use_kq_norm) {
+                    // Llama4TextL2Norm
+                    Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+                    Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+                    cb(Qcur, "Qcur_normed", il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
                         Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
@@ -6435,7 +6458,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

-            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                                  (il + 1) % hparams.n_no_rope_layer_step != 0;

             // norm
             cur = build_norm(inpL,
@@ -18981,7 +19005,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_LLAMA4:
             {
-                llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_llama>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+                }
             } break;
         case LLM_ARCH_DECI:
             {
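
Taken together, the load_hparams hunk and the build_graph hunk form a single path: a sliding-window key that is present and equal to 0 disables SWA, which then routes LLM_ARCH_LLAMA4 to the dense llm_build_llama graph instead of the interleaved-SWA llm_build_llama_iswa graph. Below is a minimal, self-contained sketch of that decision flow; it is not the llama.cpp implementation, and the helper names (select_swa, builder_for) and the swa_params_t struct are invented for illustration.

// Standalone sketch of the SWA / graph-builder decision introduced by this diff.
#include <cstdint>
#include <cstdio>

enum swa_type_t { SWA_NONE, SWA_CHUNKED };

struct swa_params_t {
    swa_type_t swa_type;
    uint32_t   n_swa;
};

// Mirrors the new load_hparams() branch: found_swa says whether the GGUF key
// was present, n_swa_kv is its value. An absent key or a non-zero value keeps
// the Scout/Maverick default (8192, chunked, 3-chunked/1-full pattern).
static swa_params_t select_swa(bool found_swa, uint32_t n_swa_kv) {
    if (found_swa && n_swa_kv == 0) {
        return { SWA_NONE, 0 };
    }
    return { SWA_CHUNKED, 8192 };
}

// Mirrors the new build_graph() dispatch for LLM_ARCH_LLAMA4.
static const char * builder_for(swa_params_t p) {
    return p.swa_type == SWA_NONE ? "llm_build_llama" : "llm_build_llama_iswa";
}

int main() {
    // Dense MobileLLM-style GGUF: key present with value 0 -> plain llama graph.
    printf("n_swa = 0  -> %s\n", builder_for(select_swa(true, 0)));
    // Scout/Maverick GGUF: key absent -> interleaved-SWA graph, as before.
    printf("key absent -> %s\n", builder_for(select_swa(false, 0)));
    return 0;
}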