From bd1f3e50042253b16c86b7820260a41dc8dc8b3e Mon Sep 17 00:00:00 2001
From: Shane A
Date: Mon, 15 Sep 2025 10:30:59 -0700
Subject: [PATCH 1/5] Add HF to gguf conversion logic for Olmo3

---
 convert_hf_to_gguf.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bbc21813f81ca..6b5c8a4bb6b02 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5951,9 +5951,34 @@ class SeedOssModel(TextModel):
 
 
 @ModelBase.register("Olmo2ForCausalLM")
+@ModelBase.register("Olmo3ForCausalLM")
 class Olmo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMO2
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        if "sliding_window" in self.hparams:
+            self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+
+        sliding_window_pattern = []
+        if "layer_types" in self.hparams:
+            sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
+        else:
+            # Olmo2 does not use sliding window attention.
+            # Olmo3 defaults to using sliding window for all layers except every 4th.
+            for i in range(self.hparams["num_hidden_layers"]):
+                sliding_window_pattern.append((i + 1) % 4 != 0)
+
+        self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
 
 @ModelBase.register("OlmoeForCausalLM")
 class OlmoeModel(TextModel):

From a5f19bb86750f08b14d1f3a9634f7581ebbd280a Mon Sep 17 00:00:00 2001
From: Shane A
Date: Mon, 15 Sep 2025 10:37:21 -0700
Subject: [PATCH 2/5] Add Olmo3 implementation

---
 src/llama-model.cpp | 52 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 818b209641a5a..ca1ef9c97b0c6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1315,6 +1315,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(4);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 16: type = LLM_TYPE_1B; break;
                     case 32: type = LLM_TYPE_7B; break;
@@ -12149,6 +12157,7 @@ struct llm_build_olmo : public llm_graph_context {
     }
 };
 
+template <bool iswa>
 struct llm_build_olmo2 : public llm_graph_context {
     llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12164,7 +12173,14 @@ struct llm_build_olmo2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv();
+        }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12197,25 +12213,45 @@ struct llm_build_olmo2 : public llm_graph_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_ext(
+                const bool is_swa = hparams.is_swa(il);
+
+                if (is_swa) {
+                    // For sliding window layers, Olmo3 does not use rope scaling.
+                    // This is achieved here by setting freq_scale and attn_factor to 1.
+                    // We also set ext_factor to 0 to avoid a few unnecessary computations.
+                    Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                        0.0, 1.0, beta_fast, beta_slow
+                    );
+
+                    Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                        0.0, 1.0, beta_fast, beta_slow
+                    );
+                }
+                else {
+                    Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
-                Kcur = ggml_rope_ext(
+                    Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
+                }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
                 cur = build_attn(inp_attn,
-                        model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                            model.layers[il].wo, NULL,
+                            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -18915,7 +18951,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_OLMO2:
             {
-                llm = std::make_unique<llm_build_olmo2>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_OLMOE:
             {

From 6997fad8112f78d8ca2cb812d99d8028823857d6 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Mon, 15 Sep 2025 15:13:21 -0700
Subject: [PATCH 3/5] Update rope comment

---
 src/llama-model.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ca1ef9c97b0c6..04ae82e663459 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -12216,7 +12216,7 @@ struct llm_build_olmo2 : public llm_graph_context {
                 const bool is_swa = hparams.is_swa(il);
 
                 if (is_swa) {
-                    // For sliding window layers, Olmo3 does not use rope scaling.
+                    // For sliding window layers, Olmo3 uses regular RoPE with no YaRN rope scaling.
                     // This is achieved here by setting freq_scale and attn_factor to 1.
                     // We also set ext_factor to 0 to avoid a few unnecessary computations.
                     Qcur = ggml_rope_ext(

From aea35e8df4ccf61daf4ad4aa1c18db3fc479802e Mon Sep 17 00:00:00 2001
From: Shane A
Date: Mon, 15 Sep 2025 15:14:40 -0700
Subject: [PATCH 4/5] Fix indentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 04ae82e663459..cf2e5fb689024 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -12250,8 +12250,8 @@ struct llm_build_olmo2 : public llm_graph_context {
                 cb(Vcur, "Vcur", il);
 
                 cur = build_attn(inp_attn,
-                            model.layers[il].wo, NULL,
-                            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {

From 7cd97a2d7ea24431a8591ad3205e8ac8c322054d Mon Sep 17 00:00:00 2001
From: Shane A
Date: Tue, 16 Sep 2025 15:38:17 -0700
Subject: [PATCH 5/5] Apply suggestion from @CISC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 77182258658cf..2be807a6a9dab 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -12314,8 +12314,7 @@ struct llm_build_olmo2 : public llm_graph_context {
                         n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
                         0.0, 1.0, beta_fast, beta_slow
                     );
-                }
-                else {
+                } else {
                     Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
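
The fallback in PATCH 1/5 derives the sliding-window layout when the HF config carries no "layer_types" list: sliding-window attention everywhere except every 4th layer. A minimal standalone sketch of that pattern follows; the layer count is illustrative only and not taken from any particular checkpoint.

    # Illustration of the fallback in PATCH 1/5; num_hidden_layers is a made-up value,
    # convert_hf_to_gguf.py reads the real one from the HF config.
    num_hidden_layers = 32

    # True = sliding-window layer, False = full-attention layer.
    sliding_window_pattern = [(i + 1) % 4 != 0 for i in range(num_hidden_layers)]

    print(sliding_window_pattern[:8])
    # [True, True, True, False, True, True, True, False]
    # Layers 3, 7, 11, ... (0-based) use full attention, the same layout that
    # hparams.set_swa_pattern(4) selects on the C++ side in PATCH 2/5.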
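PATCH 2/5 (with the comment reworded in PATCH 3/5) applies YaRN-scaled RoPE only on the full-attention layers, while sliding-window layers get plain RoPE by pinning freq_scale and attn_factor to 1 and ext_factor to 0. A small Python sketch of that per-layer choice; the helper and the example values are hypothetical and merely mirror the C++ branch, they are not part of llama.cpp.

    def rope_params_for_layer(is_swa, freq_scale, ext_factor, attn_factor):
        # Returns (freq_scale, ext_factor, attn_factor) for a ggml_rope_ext-style call.
        if is_swa:
            # Sliding-window layers: plain RoPE (YaRN disabled, ramp computation skipped).
            return 1.0, 0.0, 1.0
        # Full-attention layers: keep the YaRN parameters stored in the GGUF metadata.
        return freq_scale, ext_factor, attn_factor

    # Hypothetical YaRN setup with scaling factor 8 on the full-attention layers:
    print(rope_params_for_layer(True,  1.0 / 8.0, 1.0, 1.2))   # (1.0, 0.0, 1.0)
    print(rope_params_for_layer(False, 1.0 / 8.0, 1.0, 1.2))   # (0.125, 1.0, 1.2)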