From bd1f3e50042253b16c86b7820260a41dc8dc8b3e Mon Sep 17 00:00:00 2001
From: Shane A
Date: Mon, 15 Sep 2025 10:30:59 -0700
Subject: [PATCH 1/5] Add HF to gguf conversion logic for Olmo3

---
 convert_hf_to_gguf.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bbc21813f81ca..6b5c8a4bb6b02 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5951,9 +5951,34 @@ class SeedOssModel(TextModel):
 
 
 @ModelBase.register("Olmo2ForCausalLM")
+@ModelBase.register("Olmo3ForCausalLM")
 class Olmo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMO2
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        if "sliding_window" in self.hparams:
+            self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+
+        sliding_window_pattern = []
+        if "layer_types" in self.hparams:
+            sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
+        else:
+            # Olmo2 does not use sliding window attention.
+            # Olmo3 defaults to using sliding window for all layers except every 4th.
+            for i in range(self.hparams["num_hidden_layers"]):
+                sliding_window_pattern.append((i + 1) % 4 != 0)
+
+        self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
 
 @ModelBase.register("OlmoeForCausalLM")
 class OlmoeModel(TextModel):

From a5f19bb86750f08b14d1f3a9634f7581ebbd280a Mon Sep 17 00:00:00 2001
From: Shane A
Date: Mon, 15 Sep 2025 10:37:21 -0700
Subject: [PATCH 2/5] Add Olmo3 implementation

---
 src/llama-model.cpp | 52 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 818b209641a5a..ca1ef9c97b0c6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1315,6 +1315,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(4);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 16: type = LLM_TYPE_1B; break;
                     case 32: type = LLM_TYPE_7B; break;
@@ -12149,6 +12157,7 @@ struct llm_build_olmo : public llm_graph_context {
     }
 };
 
+template <bool iswa>
 struct llm_build_olmo2 : public llm_graph_context {
     llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12164,7 +12173,14 @@ struct llm_build_olmo2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv();
+        }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12197,25 +12213,45 @@ struct llm_build_olmo2 : public llm_graph_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_ext(
+                const bool is_swa = hparams.is_swa(il);
+
+                if (is_swa) {
+                    // For sliding window layers, Olmo3 does not use rope scaling.
+                    // This is achieved here by setting freq_scale and attn_factor to 1.
+                    // We also set ext_factor to 0 to avoid a few unnecessary computations.
+                    Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                        0.0, 1.0, beta_fast, beta_slow
+                    );
+
+                    Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                        0.0, 1.0, beta_fast, beta_slow
+                    );
+                }
+                else {
+                    Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
-                Kcur = ggml_rope_ext(
+                    Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
+                }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
                 cur = build_attn(inp_attn,
-                        model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                            model.layers[il].wo, NULL,
+                            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -18915,7 +18951,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_OLMO2:
             {
-                llm = std::make_unique<llm_build_olmo2>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_OLMOE:
             {

From 6997fad8112f78d8ca2cb812d99d8028823857d6 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Mon, 15 Sep 2025 15:13:21 -0700
Subject: [PATCH 3/5] Update rope comment

---
 src/llama-model.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ca1ef9c97b0c6..04ae82e663459 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -12216,7 +12216,7 @@ struct llm_build_olmo2 : public llm_graph_context {
                 const bool is_swa = hparams.is_swa(il);
 
                 if (is_swa) {
-                    // For sliding window layers, Olmo3 does not use rope scaling.
+                    // For sliding window layers, Olmo3 uses regular RoPE with no YaRN rope scaling.
                     // This is achieved here by setting freq_scale and attn_factor to 1.
                     // We also set ext_factor to 0 to avoid a few unnecessary computations.
                     Qcur = ggml_rope_ext(

From aea35e8df4ccf61daf4ad4aa1c18db3fc479802e Mon Sep 17 00:00:00 2001
From: Shane A
Date: Mon, 15 Sep 2025 15:14:40 -0700
Subject: [PATCH 4/5] Fix indentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 04ae82e663459..cf2e5fb689024 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -12250,8 +12250,8 @@ struct llm_build_olmo2 : public llm_graph_context {
                 cb(Vcur, "Vcur", il);
 
                 cur = build_attn(inp_attn,
-                            model.layers[il].wo, NULL,
-                            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {

From 7cd97a2d7ea24431a8591ad3205e8ac8c322054d Mon Sep 17 00:00:00 2001
From: Shane A
Date: Tue, 16 Sep 2025 15:38:17 -0700
Subject: [PATCH 5/5] Apply suggestion from @CISC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 77182258658cf..2be807a6a9dab 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -12314,8 +12314,7 @@ struct llm_build_olmo2 : public llm_graph_context {
                         n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
                         0.0, 1.0, beta_fast, beta_slow
                     );
-                }
-                else {
+                } else {
                     Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
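
The fallback in PATCH 1/5 derives the sliding-window layout when the HF config carries no "layer_types" list: sliding-window attention everywhere except every 4th layer. A minimal standalone sketch of that pattern follows; the layer count is illustrative only and not taken from any particular checkpoint.

    # Illustration of the fallback in PATCH 1/5; num_hidden_layers is a made-up value,
    # convert_hf_to_gguf.py reads the real one from the HF config.
    num_hidden_layers = 32

    # True = sliding-window layer, False = full-attention layer.
    sliding_window_pattern = [(i + 1) % 4 != 0 for i in range(num_hidden_layers)]

    print(sliding_window_pattern[:8])
    # [True, True, True, False, True, True, True, False]
    # Layers 3, 7, 11, ... (0-based) use full attention, the same layout that
    # hparams.set_swa_pattern(4) selects on the C++ side in PATCH 2/5.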
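PATCH 2/5 (with the comment reworded in PATCH 3/5) applies YaRN-scaled RoPE only on the full-attention layers, while sliding-window layers get plain RoPE by pinning freq_scale and attn_factor to 1 and ext_factor to 0. A small Python sketch of that per-layer choice; the helper and the example values are hypothetical and merely mirror the C++ branch, they are not part of llama.cpp.

    def rope_params_for_layer(is_swa, freq_scale, ext_factor, attn_factor):
        # Returns (freq_scale, ext_factor, attn_factor) for a ggml_rope_ext-style call.
        if is_swa:
            # Sliding-window layers: plain RoPE (YaRN disabled, ramp computation skipped).
            return 1.0, 0.0, 1.0
        # Full-attention layers: keep the YaRN parameters stored in the GGUF metadata.
        return freq_scale, ext_factor, attn_factor

    # Hypothetical YaRN setup with scaling factor 8 on the full-attention layers:
    print(rope_params_for_layer(True,  1.0 / 8.0, 1.0, 1.2))   # (1.0, 0.0, 1.0)
    print(rope_params_for_layer(False, 1.0 / 8.0, 1.0, 1.2))   # (0.125, 1.0, 1.2)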