From 98eb6ad4fc4b0dcb839ffe2590c8482140df65c8 Mon Sep 17 00:00:00 2001 From: Curt Tigges Date: Fri, 23 May 2025 13:57:15 -0700 Subject: [PATCH 1/4] extracted encoder and decoder --- clt/models/clt.py | 95 +++++++++----------- clt/models/decoder.py | 186 ++++++++++++++++++++++++++++++++++++++++ clt/models/encoder.py | 165 +++++++++++++++++++++++++++++++++++ clt/models/encoding.py | 139 +----------------------------- clt/training/trainer.py | 2 +- 5 files changed, 397 insertions(+), 190 deletions(-) create mode 100644 clt/models/decoder.py create mode 100644 clt/models/encoder.py diff --git a/clt/models/clt.py b/clt/models/clt.py index d209741..f912299 100644 --- a/clt/models/clt.py +++ b/clt/models/clt.py @@ -6,15 +6,21 @@ from clt.config import CLTConfig from clt.models.base import BaseTranscoder -from clt.models.parallel import ColumnParallelLinear, RowParallelLinear # Import parallel layers +from clt.models.parallel import RowParallelLinear # Removed ColumnParallelLinear from clt.models.activations import BatchTopK, JumpReLU, TokenTopK # Import BatchTopK, JumpReLU and TokenTopK # Import the new encoding helper functions -from clt.models.encoding import get_preactivations as _get_preactivations_helper -from clt.models.encoding import _encode_all_layers as _encode_all_layers_helper -from clt.models.encoding import _apply_batch_topk_helper +from clt.models.encoding import ( + _apply_batch_topk_helper, +) # Removed _get_preactivations_helper, _encode_all_layers_helper from clt.models.encoding import _apply_token_topk_helper +# Import the new Encoder module +from clt.models.encoder import Encoder + +# Import the new Decoder module +from clt.models.decoder import Decoder + # Import the activation registry from clt.activations.registry import get_activation_fn @@ -81,19 +87,11 @@ def __init__( logger.info(f"CLT TP model initialized on rank {self.rank} with device {self.device} and dtype {self.dtype}") - self.encoders = nn.ModuleList( - [ - ColumnParallelLinear( - in_features=config.d_model, - out_features=config.num_features, - bias=True, - process_group=self.process_group, - device=self.device, - dtype=self.dtype, - ) - for _ in range(config.num_layers) - ] + # Instantiate the new Encoder module + self.encoder_module = Encoder( + config=config, process_group=self.process_group, device=self.device, dtype=self.dtype ) + # The old self.encoders = nn.ModuleList(...) is now removed. self.decoders = nn.ModuleDict( { @@ -113,6 +111,19 @@ def __init__( } ) + # Instantiate the new Decoder module + self.decoder_module = Decoder( + config=config, process_group=self.process_group, device=self.device, dtype=self.dtype + ) + # Remove the old self.decoders and _cached_decoder_norms registration + del self.decoders + # Note: _cached_decoder_norms was registered in __init__ before, + # now it's handled within the Decoder module itself. + # If self._cached_decoder_norms = None was present, it should be removed too. + # Checking the original code, _cached_decoder_norms was an attribute, not registered with register_buffer initially for the main class. + # It was registered with register_buffer for the theta estimation buffers. + # The Decoder class now handles its own _cached_decoder_norms registration. 
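# Illustrative usage sketch of the extracted Encoder/Decoder modules (not taken from this
# patch; it assumes single-process execution on CPU and that CLTConfig accepts these
# keyword arguments -- the concrete values below are hypothetical):
#
#     import torch
#     from clt.config import CLTConfig
#     from clt.models.clt import CrossLayerTranscoder
#
#     cfg = CLTConfig(num_layers=2, d_model=16, num_features=64, activation_fn="relu")
#     model = CrossLayerTranscoder(cfg, process_group=None, device=torch.device("cpu"))
#
#     inputs = {0: torch.randn(8, 16), 1: torch.randn(8, 16)}     # {layer_idx: [tokens, d_model]}
#     preacts, shapes_info = model.encoder_module.encode_all_layers(inputs)  # new 2-tuple return
#     acts = {i: torch.relu(p) for i, p in preacts.items()}       # feature activations
#     recon = model.decoder_module.decode(acts, layer_idx=1)      # reconstruction, [tokens, d_model]
#     norms = model.decoder_module.get_decoder_norms()            # [num_layers, num_features]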
+ if self.config.activation_fn == "jumprelu": initial_threshold_val = torch.ones( config.num_layers, config.num_features, device=self.device, dtype=self.dtype @@ -168,40 +179,17 @@ def jumprelu(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: def get_preactivations(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: """Get pre-activation values (full tensor) for features at the specified layer.""" - # Ensure input is on the correct device and dtype before passing to helper - x_processed = x.to(device=self.device, dtype=self.dtype) - - return _get_preactivations_helper( - x_processed, - layer_idx, - self.config, - self.encoders, - self.device, # Pass self.device - self.dtype, # Pass self.dtype - self.rank, - ) + # Call the new encoder module's method directly + return self.encoder_module.get_preactivations(x, layer_idx) def _encode_all_layers( self, inputs: Dict[int, torch.Tensor] - ) -> Tuple[Dict[int, torch.Tensor], List[Tuple[int, int, int]], torch.device, torch.dtype]: + ) -> Tuple[Dict[int, torch.Tensor], List[Tuple[int, int, int]]]: # Return type updated """Encodes inputs for all layers and returns pre-activations and original shape info.""" - # self.device and self.dtype are guaranteed to be set from __init__ - - # Ensure all input tensors are on the determined effective device and dtype - processed_inputs: Dict[int, torch.Tensor] = {} - for layer_idx, x_orig in inputs.items(): - processed_inputs[layer_idx] = x_orig.to(device=self.device, dtype=self.dtype) - - # Call the helper function with processed inputs and determined device/dtype - # The helper returns the device and dtype it operated on, which should match self.device and self.dtype - preactivations_dict, original_shapes_info, returned_device, returned_dtype = _encode_all_layers_helper( - processed_inputs, self.config, self.encoders, self.device, self.dtype, self.rank - ) - # The returned_device and returned_dtype from the helper reflect what was used. - # Assert they match self.device and self.dtype for sanity if needed - # assert returned_device == self.device, "Device mismatch in _encode_all_layers_helper" - # assert returned_dtype == self.dtype, "Dtype mismatch in _encode_all_layers_helper" - return preactivations_dict, original_shapes_info, self.device, self.dtype # Return self.device, self.dtype + # Call the encoder module's method + # The encoder_module handles device and dtype internally for its operations. + # Inputs are processed within the encoder_module's methods. + return self.encoder_module.encode_all_layers(inputs) @torch.no_grad() def _update_min_selected_preactivations( @@ -452,6 +440,8 @@ def decode(self, a: Dict[int, torch.Tensor], layer_idx: int) -> torch.Tensor: decoded = decoder(activation_tensor) # Removed try-except reconstruction += decoded return reconstruction + # Call the new decoder module's method directly + return self.decoder_module.decode(a, layer_idx) def forward(self, inputs: Dict[int, torch.Tensor]) -> Dict[int, torch.Tensor]: """Process inputs through the parallel transcoder model. @@ -521,11 +511,9 @@ def get_feature_activations(self, inputs: Dict[int, torch.Tensor]) -> Dict[int, processed_inputs[layer_idx] = x_orig.to(device=self.device, dtype=self.dtype) if self.config.activation_fn == "batchtopk" or self.config.activation_fn == "topk": - # Note: _encode_all_layers_helper uses the device/dtype passed to it, which are self.device, self.dtype. 
- preactivations_dict, _, processed_device, processed_dtype = _encode_all_layers_helper( - processed_inputs, self.config, self.encoders, self.device, self.dtype, self.rank - ) - # Assert processed_device == self.device and processed_dtype == self.dtype if needed + # _encode_all_layers now returns 2 values: preactivations_dict, original_shapes_info + # We only need preactivations_dict here. + preactivations_dict, _ = self._encode_all_layers(processed_inputs) if not preactivations_dict: # Indicates helper returned empty, possibly due to all-empty inputs activations = {} @@ -678,6 +666,8 @@ def get_decoder_norms(self) -> torch.Tensor: self._cached_decoder_norms = full_decoder_norms return full_decoder_norms + # Call the new decoder module's method directly + return self.decoder_module.get_decoder_norms() @torch.no_grad() def estimate_theta_posthoc( @@ -792,8 +782,7 @@ def estimate_theta_posthoc( inputs_on_device = {k: v.to(device=target_device, dtype=self.dtype) for k, v in inputs_batch.items()} # _encode_all_layers uses self.device, self.dtype internally. # Since self.to(target_device) was called, self.device is now target_device. - preactivations_dict, _, current_op_dev, current_op_dtype = self._encode_all_layers(inputs_on_device) - # Assert current_op_dev == target_device and current_op_dtype == self.dtype if needed for strict checking + preactivations_dict, _ = self._encode_all_layers(inputs_on_device) if not preactivations_dict: logger.warning(f"Rank {self.rank}: No preactivations. Skipping batch {processed_batches_total + 1}.") diff --git a/clt/models/decoder.py b/clt/models/decoder.py new file mode 100644 index 0000000..1e60fa6 --- /dev/null +++ b/clt/models/decoder.py @@ -0,0 +1,186 @@ +import torch +import torch.nn as nn +from typing import Dict, Optional +import logging +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from clt.config import CLTConfig +from clt.models.parallel import RowParallelLinear + +logger = logging.getLogger(__name__) + + +class Decoder(nn.Module): + """ + Encapsulates the decoder functionality of the CrossLayerTranscoder. + It holds the stack of decoder layers and provides methods to decode + feature activations and compute decoder norms. + """ + + _cached_decoder_norms: Optional[torch.Tensor] = None + + def __init__( + self, + config: CLTConfig, + process_group: Optional[ProcessGroup], + device: torch.device, + dtype: torch.dtype, + ): + super().__init__() + self.config = config + self.process_group = process_group + self.device = device + self.dtype = dtype + + if process_group is None or not dist.is_initialized(): + self.world_size = 1 + self.rank = 0 + else: + self.world_size = dist.get_world_size(process_group) + self.rank = dist.get_rank(process_group) + + self.decoders = nn.ModuleDict( + { + f"{src_layer}->{tgt_layer}": RowParallelLinear( + in_features=self.config.num_features, + out_features=self.config.d_model, + bias=True, + process_group=self.process_group, + input_is_parallel=False, + d_model_for_init=self.config.d_model, + num_layers_for_init=self.config.num_layers, + device=self.device, + dtype=self.dtype, + ) + for src_layer in range(self.config.num_layers) + for tgt_layer in range(src_layer, self.config.num_layers) + } + ) + self.register_buffer("_cached_decoder_norms", None, persistent=False) + + def decode(self, a: Dict[int, torch.Tensor], layer_idx: int) -> torch.Tensor: + """Decode the feature activations to reconstruct outputs at the specified layer. 
+ + Input activations `a` are expected to be the *full* tensors. + The RowParallelLinear decoder splits them internally. + + Args: + a: Dictionary mapping layer indices to *full* feature activations [..., num_features] + layer_idx: Index of the layer to reconstruct outputs for + + Returns: + Reconstructed outputs [..., d_model] + """ + available_keys = sorted(a.keys()) + if not available_keys: + logger.warning(f"Rank {self.rank}: No activation keys available in decode method for layer {layer_idx}") + return torch.zeros((0, self.config.d_model), device=self.device, dtype=self.dtype) + + first_key = available_keys[0] + example_tensor = a[first_key] + batch_dim_size = example_tensor.shape[0] if example_tensor.numel() > 0 else 0 + if batch_dim_size == 0: + for key in available_keys: + if a[key].numel() > 0: + batch_dim_size = a[key].shape[0] + example_tensor = a[key] + break + + reconstruction = torch.zeros((batch_dim_size, self.config.d_model), device=self.device, dtype=self.dtype) + + for src_layer in range(layer_idx + 1): + if src_layer in a: + activation_tensor = a[src_layer].to(device=self.device, dtype=self.dtype) + + if activation_tensor.numel() == 0: + continue + if activation_tensor.shape[-1] != self.config.num_features: + logger.warning( + f"Rank {self.rank}: Activation tensor for layer {src_layer} has incorrect feature dimension {activation_tensor.shape[-1]}, expected {self.config.num_features}. Skipping decode contribution." + ) + continue + + decoder = self.decoders[f"{src_layer}->{layer_idx}"] + decoded = decoder(activation_tensor) + reconstruction += decoded + return reconstruction + + def get_decoder_norms(self) -> torch.Tensor: + """Get L2 norms of all decoder matrices for each feature (gathered across ranks). + + The decoders are of type `RowParallelLinear`. Their weights are sharded across the + input feature dimension (CLT features). Each feature's decoder weight vector + (across all target layers) resides on a single rank. + + The computation proceeds as follows: + 1. For each source CLT layer (`src_layer`): + a. Initialize a local accumulator for squared norms (`local_norms_sq_accum`) + for all features, matching the model's device and float32 for precision. + b. For each target model layer (`tgt_layer`) that this `src_layer` decodes to: + i. Get the corresponding `RowParallelLinear` decoder module. + ii. Access its local weight shard (`decoder.weight`, shape [d_model, local_num_features]). + iii. Compute L2 norm squared for each column (feature) in this local shard. + iv. Determine the global indices for the features this rank owns. + v. Add these squared norms to the corresponding global slice in `local_norms_sq_accum`. + c. All-reduce `local_norms_sq_accum` across all ranks using SUM operation. + This sums the squared norm contributions for each feature from the rank that owns it. + d. Take the square root of the summed squared norms and cast to the model's dtype. + Store this in the `full_decoder_norms` tensor for the current `src_layer`. + 2. Cache and return `full_decoder_norms`. + + The norms are cached in `self._cached_decoder_norms` to avoid recomputation. + + Returns: + Tensor of shape [num_layers, num_features] containing L2 norms of decoder + weights for each feature, applicable for sparsity calculations. 
+ """ + if self._cached_decoder_norms is not None: + return self._cached_decoder_norms + + full_decoder_norms = torch.zeros( + self.config.num_layers, self.config.num_features, device=self.device, dtype=self.dtype + ) + + for src_layer in range(self.config.num_layers): + local_norms_sq_accum = torch.zeros(self.config.num_features, device=self.device, dtype=torch.float32) + + for tgt_layer in range(src_layer, self.config.num_layers): + decoder_key = f"{src_layer}->{tgt_layer}" + decoder = self.decoders[decoder_key] + assert isinstance(decoder, RowParallelLinear), f"Decoder {decoder_key} is not RowParallelLinear" + + current_norms_sq = torch.norm(decoder.weight, dim=0).pow(2).to(torch.float32) + + full_dim = decoder.full_in_features + features_per_rank = (full_dim + self.world_size - 1) // self.world_size + start_idx = self.rank * features_per_rank + end_idx = min(start_idx + features_per_rank, full_dim) + actual_local_dim = max(0, end_idx - start_idx) + local_dim_padded = decoder.local_in_features + + if local_dim_padded != features_per_rank and self.rank == self.world_size - 1: + pass + elif local_dim_padded != actual_local_dim and local_dim_padded != features_per_rank: + logger.warning( + f"Rank {self.rank}: Padded local dim ({local_dim_padded}) doesn't match calculated actual local dim ({actual_local_dim}) or features_per_rank ({features_per_rank}) for {decoder_key}. This might indicate an issue with RowParallelLinear partitioning." + ) + + if actual_local_dim > 0: + valid_norms_sq = current_norms_sq[:actual_local_dim] + if valid_norms_sq.shape[0] == actual_local_dim: + global_slice = slice(start_idx, end_idx) + local_norms_sq_accum[global_slice] += valid_norms_sq + else: + logger.warning( + f"Rank {self.rank}: Shape mismatch in decoder norm calculation for {decoder_key}. " + f"Valid norms shape {valid_norms_sq.shape}, expected size {actual_local_dim}." + ) + + if self.process_group is not None and dist.is_initialized(): + dist.all_reduce(local_norms_sq_accum, op=dist.ReduceOp.SUM, group=self.process_group) + + full_decoder_norms[src_layer] = torch.sqrt(local_norms_sq_accum).to(self.dtype) + + self._cached_decoder_norms = full_decoder_norms + return full_decoder_norms diff --git a/clt/models/encoder.py b/clt/models/encoder.py new file mode 100644 index 0000000..474f729 --- /dev/null +++ b/clt/models/encoder.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +from typing import Dict, List, Tuple, Optional +import logging +import torch.distributed as dist + +from clt.config import CLTConfig +from clt.models.parallel import ColumnParallelLinear +from torch.distributed import ProcessGroup + +logger = logging.getLogger(__name__) + + +class Encoder(nn.Module): + """ + Encapsulates the encoder functionality of the CrossLayerTranscoder. + It holds the stack of encoder layers and provides methods to get + pre-activations. 
+ """ + + def __init__( + self, + config: CLTConfig, + process_group: Optional[ProcessGroup], + device: torch.device, + dtype: torch.dtype, + ): + super().__init__() + self.config = config + self.process_group = process_group + self.device = device + self.dtype = dtype + + if process_group is None or not dist.is_initialized(): + self.world_size = 1 + self.rank = 0 + else: + self.world_size = dist.get_world_size(process_group) + self.rank = dist.get_rank(process_group) + + self.encoders = nn.ModuleList( + [ + ColumnParallelLinear( + in_features=config.d_model, + out_features=config.num_features, + bias=True, + process_group=self.process_group, + device=self.device, + dtype=self.dtype, + ) + for _ in range(config.num_layers) + ] + ) + + def get_preactivations(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: + """Get pre-activation values (full tensor) for features at the specified layer.""" + result: Optional[torch.Tensor] = None + fallback_shape: Optional[Tuple[int, int]] = None + input_for_linear: Optional[torch.Tensor] = None + + # Ensure input is on the correct device and dtype + x = x.to(device=self.device, dtype=self.dtype) + + try: + # 1. Check input shape and reshape if necessary + if x.dim() == 2: + input_for_linear = x + elif x.dim() == 3: + batch, seq_len, d_model = x.shape + if d_model != self.config.d_model: + logger.warning( + f"Rank {self.rank}: Input d_model {d_model} != config {self.config.d_model} layer {layer_idx}" + ) + fallback_shape = (batch * seq_len, self.config.num_features) + else: + input_for_linear = x.reshape(-1, d_model) + else: + logger.warning( + f"Rank {self.rank}: Cannot handle input shape {x.shape} for preactivations layer {layer_idx}" + ) + fallback_batch_dim = x.shape[0] if x.dim() > 0 else 0 + fallback_shape = (fallback_batch_dim, self.config.num_features) + + # 2. Check d_model match if not already done and input_for_linear was set + if fallback_shape is None and input_for_linear is not None: + if input_for_linear.shape[1] != self.config.d_model: + logger.warning( + f"Rank {self.rank}: Input d_model {input_for_linear.shape[1]} != config {self.config.d_model} layer {layer_idx}" + ) + fallback_shape = (input_for_linear.shape[0], self.config.num_features) + elif fallback_shape is None and input_for_linear is None: + logger.error( + f"Rank {self.rank}: Could not determine input for linear layer {layer_idx} and no fallback shape set. Input x.shape: {x.shape}" + ) + fallback_batch_dim = x.shape[0] if x.dim() > 0 else 0 + fallback_shape = (fallback_batch_dim, self.config.num_features) + + # 3. Proceed if no errors so far (i.e. fallback_shape is still None) + if fallback_shape is None and input_for_linear is not None: + # The input_for_linear is already on self.device and self.dtype due to the .to() call at the start of the function + # or because it's derived from x which was moved. + result = self.encoders[layer_idx](input_for_linear) + elif fallback_shape is None and input_for_linear is None: + logger.error( + f"Rank {self.rank}: Critical logic error in get_preactivations for layer {layer_idx}. input_for_linear is None and fallback_shape is None. Input x.shape: {x.shape}" + ) + fallback_batch_dim = x.shape[0] if x.dim() > 0 else 0 + fallback_shape = (fallback_batch_dim, self.config.num_features) + + except IndexError: + logger.error( + f"Rank {self.rank}: Invalid layer index {layer_idx} requested for encoder. Max index is {len(self.encoders) - 1}." 
+ ) + if x.dim() == 2: + fallback_batch_dim = x.shape[0] + elif x.dim() == 3: + fallback_batch_dim = x.shape[0] * x.shape[1] + elif x.dim() > 0: + fallback_batch_dim = x.shape[0] + else: + fallback_batch_dim = 0 + fallback_shape = (fallback_batch_dim, self.config.num_features) + + if result is not None: + return result + else: + if fallback_shape is None: + logger.error( + f"Rank {self.rank}: Fallback shape not determined for layer {layer_idx}, and no result. Input x.shape: {x.shape}. Returning empty tensor." + ) + fallback_shape = (0, self.config.num_features) + return torch.zeros(fallback_shape, device=self.device, dtype=self.dtype) + + def encode_all_layers( + self, inputs: Dict[int, torch.Tensor] + ) -> Tuple[Dict[int, torch.Tensor], List[Tuple[int, int, int]]]: + """ + Encodes inputs for all layers using the stored encoders. + Assumes input tensors in `inputs` will be moved to the correct device/dtype + by the `get_preactivations` method. + + Returns: + A tuple containing: + - preactivations_dict: Dictionary mapping layer indices to pre-activation tensors. + - original_shapes_info: List of tuples storing (layer_idx, batch_size, seq_len) + for restoring original 3D shapes if needed. + """ + preactivations_dict: Dict[int, torch.Tensor] = {} + original_shapes_info: List[Tuple[int, int, int]] = [] + + # Iterate in a deterministic layer order + for layer_idx in sorted(inputs.keys()): + x = inputs[layer_idx] # x will be moved to device/dtype in get_preactivations + + if x.dim() == 3: + batch_size, seq_len, _ = x.shape + original_shapes_info.append((layer_idx, batch_size, seq_len)) + elif x.dim() == 2: + batch_size, _ = x.shape + original_shapes_info.append((layer_idx, batch_size, 1)) # seq_len is 1 for 2D + + preact = self.get_preactivations(x, layer_idx) + preactivations_dict[layer_idx] = preact + + return preactivations_dict, original_shapes_info diff --git a/clt/models/encoding.py b/clt/models/encoding.py index 10cad90..448ab68 100644 --- a/clt/models/encoding.py +++ b/clt/models/encoding.py @@ -1,6 +1,5 @@ import torch -import torch.nn as nn -from typing import Dict, Optional, Tuple, List, cast +from typing import Dict, Optional, Tuple, List import logging import torch.distributed as dist from torch.distributed import ProcessGroup @@ -14,140 +13,8 @@ # For now, let's assume a logger instance is passed or they use their own. logger = logging.getLogger(__name__) - -def get_preactivations( - x: torch.Tensor, - layer_idx: int, - config: CLTConfig, - encoders: nn.ModuleList, - model_device: torch.device, - model_dtype: torch.dtype, - rank: int = 0, # Default rank for non-distributed scenarios if logger uses it -) -> torch.Tensor: - """Get pre-activation values (full tensor) for features at the specified layer.""" - result: Optional[torch.Tensor] = None - fallback_shape: Optional[Tuple[int, int]] = None - input_for_linear: Optional[torch.Tensor] = None # Initialize to handle cases where it might not be set before use - - try: - # 1. 
Check input shape and reshape if necessary - if x.dim() == 2: - input_for_linear = x - elif x.dim() == 3: - batch, seq_len, d_model = x.shape - if d_model != config.d_model: - logger.warning(f"Rank {rank}: Input d_model {d_model} != config {config.d_model} layer {layer_idx}") - fallback_shape = (batch * seq_len, config.num_features) - else: - input_for_linear = x.reshape(-1, d_model) - else: - logger.warning(f"Rank {rank}: Cannot handle input shape {x.shape} for preactivations layer {layer_idx}") - # Attempt to determine a batch dimension for fallback, even if it's just the first dim - fallback_batch_dim = x.shape[0] if x.dim() > 0 else 0 - fallback_shape = (fallback_batch_dim, config.num_features) - - # 2. Check d_model match if not already done and input_for_linear was set - if fallback_shape is None and input_for_linear is not None: - if input_for_linear.shape[1] != config.d_model: - logger.warning( - f"Rank {rank}: Input d_model {input_for_linear.shape[1]} != config {config.d_model} layer {layer_idx}" - ) - fallback_shape = (input_for_linear.shape[0], config.num_features) - elif fallback_shape is None and input_for_linear is None: - # This case implies x.dim() was not 2 or 3, or x.dim()==3 but d_model mismatch led to fallback_shape already - # If fallback_shape is still None here, it means the initial x.dim() check didn't set it, - # and input_for_linear is also None. This is an unexpected state if we expect a tensor out. - logger.error( - f"Rank {rank}: Could not determine input for linear layer {layer_idx} and no fallback shape set. Input x.shape: {x.shape}" - ) - # Ensure fallback_shape is set to something reasonable (e.g. using x.shape[0] if available) - fallback_batch_dim = x.shape[0] if x.dim() > 0 else 0 - fallback_shape = (fallback_batch_dim, config.num_features) - - # 3. Proceed if no errors so far (i.e. fallback_shape is still None) - if fallback_shape is None and input_for_linear is not None: - # Explicitly cast the output of the parallel linear layer - result = cast( - torch.Tensor, encoders[layer_idx](input_for_linear.to(device=model_device, dtype=model_dtype)) - ) - elif fallback_shape is None and input_for_linear is None: - # This condition should ideally be caught by the checks above. - # If we reach here, it implies an unhandled case or logic error. - logger.error( - f"Rank {rank}: Critical logic error in get_preactivations for layer {layer_idx}. input_for_linear is None and fallback_shape is None. Input x.shape: {x.shape}" - ) - # Force a fallback_shape to prevent returning None from the function, though this indicates a problem. - fallback_batch_dim = x.shape[0] if x.dim() > 0 else 0 # Default batch dim - fallback_shape = (fallback_batch_dim, config.num_features) - - except IndexError: # Specific exception for out-of-bounds access to encoders list - logger.error( - f"Rank {rank}: Invalid layer index {layer_idx} requested for encoder. Max index is {len(encoders) - 1}." - ) - # Determine fallback batch dimension more robustly - if x.dim() == 2: - fallback_batch_dim = x.shape[0] - elif x.dim() == 3: - fallback_batch_dim = x.shape[0] * x.shape[1] - elif x.dim() > 0: # Handle other dimensionalities if they occur - fallback_batch_dim = x.shape[0] - else: # 0-dim tensor or other unexpected case - fallback_batch_dim = 0 - fallback_shape = (fallback_batch_dim, config.num_features) - # Removed broad `except Exception as e` - - # 4. 
Return result or fallback tensor - if result is not None: - return result - else: - if fallback_shape is None: - # This state should ideally not be reached if logic above is correct. - # It means no result was computed, and no fallback_shape was determined. - logger.error( - f"Rank {rank}: Fallback shape not determined for layer {layer_idx}, and no result. Input x.shape: {x.shape}. Returning empty tensor." - ) - # Default to a completely empty tensor if all else fails - fallback_shape = (0, config.num_features) - return torch.zeros(fallback_shape, device=model_device, dtype=model_dtype) - - -def _encode_all_layers( - inputs: Dict[int, torch.Tensor], - config: CLTConfig, - encoders: nn.ModuleList, - model_device: torch.device, - model_dtype: torch.dtype, - rank: int = 0, -) -> Tuple[Dict[int, torch.Tensor], List[Tuple[int, int, int]], torch.device, torch.dtype]: - """Encodes inputs for all layers and returns pre-activations and original shape info. - Assumes input tensors in `inputs` are already on model_device and model_dtype. - """ - preactivations_dict = {} - original_shapes_info: List[Tuple[int, int, int]] = [] - - # Device and dtype are now asserted by the type hints and caller responsibility - # No inference or .to() calls needed here for the inputs dict items themselves. - - # Iterate in a deterministic layer order so that all TP ranks execute - # collective operations (all_gather) in the exact same sequence. - for layer_idx in sorted(inputs.keys()): - x = inputs[layer_idx] # x is x_orig, assumed to be on correct device/dtype - # Optional: Add assertions here if strict checking is desired at this stage - # assert x.device == model_device, f"Input for layer {layer_idx} not on expected device" - # assert x.dtype == model_dtype, f"Input for layer {layer_idx} not on expected dtype" - - if x.dim() == 3: - batch_size, seq_len, _ = x.shape - original_shapes_info.append((layer_idx, batch_size, seq_len)) - elif x.dim() == 2: - batch_size, _ = x.shape - original_shapes_info.append((layer_idx, batch_size, 1)) - - # Pass through model_device and model_dtype as they are confirmed. - preact = get_preactivations(x, layer_idx, config, encoders, model_device, model_dtype, rank) - preactivations_dict[layer_idx] = preact - - return preactivations_dict, original_shapes_info, model_device, model_dtype +# The get_preactivations and _encode_all_layers functions previously here +# have been moved to the clt.models.encoder.Encoder class. def _apply_batch_topk_helper( diff --git a/clt/training/trainer.py b/clt/training/trainer.py index 583a445..359e96c 100644 --- a/clt/training/trainer.py +++ b/clt/training/trainer.py @@ -794,7 +794,7 @@ def train(self, eval_every: int = 1000) -> CrossLayerTranscoder: # --- Log per-layer standard deviation of pre-activations --- # This requires getting the pre-activations first. 
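# Sketch of the updated unpacking after this patch (the helper's device/dtype return values
# are dropped, so only two values come back; `inputs` is the usual {layer_idx: tensor} dict):
#     preacts, shapes_info = self.model._encode_all_layers(inputs)
#     layer_stds = {i: p.std().item() for i, p in preacts.items()}  # per-layer pre-activation std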
# _encode_all_layers returns: preactivations_dict, original_shapes_info, device, dtype - preactivations_eval_dict, _, _, _ = self.model._encode_all_layers(inputs) + preactivations_eval_dict, _ = self.model._encode_all_layers(inputs) layerwise_preact_std_dev: Dict[str, float] = {} if preactivations_eval_dict: for layer_idx, preact_tensor in preactivations_eval_dict.items(): From e4ed5f101f9d92798e90599dfba384895514dfce Mon Sep 17 00:00:00 2001 From: Curt Tigges Date: Fri, 23 May 2025 14:11:53 -0700 Subject: [PATCH 2/4] fixed references --- clt/models/clt.py | 121 ------------------------------------------ clt/models/decoder.py | 2 - 2 files changed, 123 deletions(-) diff --git a/clt/models/clt.py b/clt/models/clt.py index f912299..0fb90aa 100644 --- a/clt/models/clt.py +++ b/clt/models/clt.py @@ -395,51 +395,6 @@ def decode(self, a: Dict[int, torch.Tensor], layer_idx: int) -> torch.Tensor: Returns: Reconstructed outputs [..., d_model] """ - available_keys = sorted(a.keys()) - if not available_keys: - logger.warning(f"Rank {self.rank}: No activation keys available in decode method for layer {layer_idx}") - # Determine a consistent device/dtype for the empty tensor, even if self.device/self.dtype is None initially - # op_dev, op_dtype = ( - # self._get_current_op_device_dtype() - # ) # Pass no sample, gets model defaults or system defaults - return torch.zeros((0, self.config.d_model), device=self.device, dtype=self.dtype) - - first_key = available_keys[0] - example_tensor = a[first_key] - # Need batch dimension size for reconstruction tensor - # Handle cases where example_tensor might be empty (though filtered earlier) - batch_dim_size = example_tensor.shape[0] if example_tensor.numel() > 0 else 0 - # If batch_dim_size is still 0, try finding a non-empty tensor - if batch_dim_size == 0: - for key in available_keys: - if a[key].numel() > 0: - batch_dim_size = a[key].shape[0] - example_tensor = a[key] # Update example_tensor to one that has data for device/dtype - break - - reconstruction = torch.zeros((batch_dim_size, self.config.d_model), device=self.device, dtype=self.dtype) - - # Sum contributions from features at all contributing layers - for src_layer in range(layer_idx + 1): - if src_layer in a: - # Decoder expects full activation tensor [..., num_features] - activation_tensor = a[src_layer].to(device=self.device, dtype=self.dtype) - - # Check activation tensor shape - if activation_tensor.numel() == 0: - continue # Skip empty activations - if activation_tensor.shape[-1] != self.config.num_features: - logger.warning( - f"Rank {self.rank}: Activation tensor for layer {src_layer} has incorrect feature dimension {activation_tensor.shape[-1]}, expected {self.config.num_features}. Skipping decode contribution." - ) - continue - - decoder = self.decoders[f"{src_layer}->{layer_idx}"] - # RowParallelLinear takes full input (input_is_parallel=False), - # splits it internally, computes local result, and all-reduces. - decoded = decoder(activation_tensor) # Removed try-except - reconstruction += decoded - return reconstruction # Call the new decoder module's method directly return self.decoder_module.decode(a, layer_idx) @@ -590,82 +545,6 @@ def get_decoder_norms(self) -> torch.Tensor: Tensor of shape [num_layers, num_features] containing L2 norms of decoder weights for each feature, applicable for sparsity calculations. 
""" - if self._cached_decoder_norms is not None: - return self._cached_decoder_norms - - full_decoder_norms = torch.zeros( - self.config.num_layers, self.config.num_features, device=self.device, dtype=self.dtype - ) - - for src_layer in range(self.config.num_layers): - local_norms_sq_accum = torch.zeros(self.config.num_features, device=self.device, dtype=torch.float32) - - for tgt_layer in range(src_layer, self.config.num_layers): - decoder_key = f"{src_layer}->{tgt_layer}" - decoder = self.decoders[decoder_key] - assert isinstance(decoder, RowParallelLinear), f"Decoder {decoder_key} is not RowParallelLinear" - - # decoder.weight shape: [d_model, local_num_features (padded)] - # Calculate norms on local weight shard - current_norms_sq = torch.norm(decoder.weight, dim=0).pow(2).to(torch.float32) - # current_norms_sq shape: [local_num_features (padded)] - - # Determine the slice of the *full* feature dimension this rank owns - full_dim = decoder.full_in_features # Original number of features - local_dim_padded = decoder.local_in_features # Padded local size - - # Calculate start and end indices in the *full* dimension - # Correct calculation using integer division based on full dimension - features_per_rank = (full_dim + self.world_size - 1) // self.world_size - start_idx = self.rank * features_per_rank - end_idx = min(start_idx + features_per_rank, full_dim) - actual_local_dim = max(0, end_idx - start_idx) - - # Check if local padded size matches expected local dimension - # This is a sanity check for RowParallelLinear's partitioning logic - if local_dim_padded != features_per_rank and self.rank == self.world_size - 1: - # The last rank might have fewer features if full_dim is not divisible by world_size - # RowParallelLinear pads its weight, so local_dim_padded might be larger than actual_local_dim - pass # Padding is expected here - elif local_dim_padded != actual_local_dim and local_dim_padded != features_per_rank: - logger.warning( - f"Rank {self.rank}: Padded local dim ({local_dim_padded}) doesn't match calculated actual local dim ({actual_local_dim}) or features_per_rank ({features_per_rank}) for {decoder_key}. This might indicate an issue with RowParallelLinear partitioning." - ) - # Proceed cautiously, but log the potential discrepancy - - # If this rank has valid features for this layer (based on correct calculation) - if actual_local_dim > 0: - # The norms correspond to the first `actual_local_dim` columns of the weight - # We slice the norms up to the *actual* number of features this rank owns, ignoring padding - valid_norms_sq = current_norms_sq[:actual_local_dim] - - # Ensure shapes match before adding - if valid_norms_sq.shape[0] == actual_local_dim: - # Accumulate into the correct global slice determined by start_idx and end_idx - global_slice = slice(start_idx, end_idx) - local_norms_sq_accum[global_slice] += valid_norms_sq - else: - # This should not happen with the slicing logic above - logger.warning( - f"Rank {self.rank}: Shape mismatch in decoder norm calculation for {decoder_key}. " - f"Valid norms shape {valid_norms_sq.shape}, expected size {actual_local_dim}." - ) - - # Reduce the accumulated squared norms across all ranks - # Each feature's decoder weight vector lives entirely on a single rank - # (row-parallel sharding over the feature dimension). To reconstruct the - # correct global ‖w‖₂ we must therefore **sum** the per-rank contributions, - # not average them – averaging would shrink every norm by `world_size` and - # drastically weaken the sparsity penalty. 
- if self.process_group is not None and dist.is_initialized(): - dist.all_reduce(local_norms_sq_accum, op=dist.ReduceOp.SUM, group=self.process_group) - - # Now take the square root and store in the final tensor (cast back to model dtype) - full_decoder_norms[src_layer] = torch.sqrt(local_norms_sq_accum).to(self.dtype) - - self._cached_decoder_norms = full_decoder_norms - - return full_decoder_norms # Call the new decoder module's method directly return self.decoder_module.get_decoder_norms() diff --git a/clt/models/decoder.py b/clt/models/decoder.py index 1e60fa6..1c0e8d3 100644 --- a/clt/models/decoder.py +++ b/clt/models/decoder.py @@ -18,8 +18,6 @@ class Decoder(nn.Module): feature activations and compute decoder norms. """ - _cached_decoder_norms: Optional[torch.Tensor] = None - def __init__( self, config: CLTConfig, From 39148c1bcfa3c3668fcc7b7dac0d83652c9131d8 Mon Sep 17 00:00:00 2001 From: Curt Tigges Date: Fri, 23 May 2025 14:37:11 -0700 Subject: [PATCH 3/4] separated out jumprelu threshold management --- clt/models/activations.py | 227 ++++++++- clt/models/clt.py | 1012 +++---------------------------------- clt/models/encoding.py | 302 ----------- clt/models/theta.py | 632 +++++++++++++++++++++++ 4 files changed, 940 insertions(+), 1233 deletions(-) delete mode 100644 clt/models/encoding.py create mode 100644 clt/models/theta.py diff --git a/clt/models/activations.py b/clt/models/activations.py index 693529e..3901782 100644 --- a/clt/models/activations.py +++ b/clt/models/activations.py @@ -1,5 +1,9 @@ import torch -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict, List +import torch.distributed as dist +import logging +from clt.config import CLTConfig +from torch.distributed import ProcessGroup class BatchTopK(torch.autograd.Function): @@ -200,3 +204,224 @@ def backward(ctx, *grad_outputs: torch.Tensor) -> Tuple[Optional[torch.Tensor], else: grad_threshold = grad_threshold_per_element.sum() return grad_input, grad_threshold, None + + +# --- Helper functions for applying BatchTopK/TokenTopK globally --- # +# These were previously in clt.models.encoding.py + +logger_helpers = logging.getLogger(__name__ + ".helpers") # Use a sub-logger + + +def _apply_batch_topk_helper( + preactivations_dict: Dict[int, torch.Tensor], + config: CLTConfig, + device: torch.device, + dtype: torch.dtype, + rank: int, + process_group: Optional[ProcessGroup], +) -> Dict[int, torch.Tensor]: + """Helper to apply BatchTopK globally across concatenated layer pre-activations.""" + + world_size = 1 + if process_group is not None and dist.is_initialized(): + world_size = dist.get_world_size(process_group) + + if not preactivations_dict: + logger_helpers.warning(f"Rank {rank}: _apply_batch_topk_helper received empty preactivations_dict.") + return {} + + ordered_preactivations_original: List[torch.Tensor] = [] + ordered_preactivations_normalized: List[torch.Tensor] = [] + layer_feature_sizes: List[Tuple[int, int]] = [] + + first_valid_preact = next((p for p in preactivations_dict.values() if p.numel() > 0), None) + if first_valid_preact is None: + logger_helpers.warning( + f"Rank {rank}: No valid preactivations found in dict for BatchTopK. Returning empty dict." 
+ ) + return { + layer_idx: torch.empty((0, config.num_features), device=device, dtype=dtype) + for layer_idx in preactivations_dict.keys() + } + batch_tokens_dim = first_valid_preact.shape[0] + + for layer_idx in range(config.num_layers): + if layer_idx in preactivations_dict: + preact_orig = preactivations_dict[layer_idx] + preact_orig = preact_orig.to(device=device, dtype=dtype) + current_num_features = preact_orig.shape[1] if preact_orig.numel() > 0 else config.num_features + + if preact_orig.numel() == 0: + if batch_tokens_dim > 0: + zeros_shape = (batch_tokens_dim, current_num_features) + ordered_preactivations_original.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) + ordered_preactivations_normalized.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) + elif preact_orig.shape[0] != batch_tokens_dim: + logger_helpers.warning( + f"Rank {rank} Layer {layer_idx}: Mismatched batch dim ({preact_orig.shape[0]} vs {batch_tokens_dim}). Using zeros." + ) + zeros_shape = (batch_tokens_dim, current_num_features) + ordered_preactivations_original.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) + ordered_preactivations_normalized.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) + else: + ordered_preactivations_original.append(preact_orig) + mean = preact_orig.mean(dim=0, keepdim=True) + std = preact_orig.std(dim=0, keepdim=True) + preact_norm = (preact_orig - mean) / (std + 1e-6) + ordered_preactivations_normalized.append(preact_norm) + layer_feature_sizes.append((layer_idx, current_num_features)) + + if not ordered_preactivations_original: + logger_helpers.warning( + f"Rank {rank}: No tensors collected after iterating layers for BatchTopK. Returning empty activations." + ) + return { + layer_idx: torch.empty((batch_tokens_dim, config.num_features), device=device, dtype=dtype) + for layer_idx in preactivations_dict.keys() + } + + concatenated_preactivations_original = torch.cat(ordered_preactivations_original, dim=1) + concatenated_preactivations_normalized = torch.cat(ordered_preactivations_normalized, dim=1) + + k_val: int + if config.batchtopk_k is not None: + k_val = int(config.batchtopk_k) + else: + k_val = concatenated_preactivations_original.size(1) + + mask_shape = concatenated_preactivations_original.shape + mask = torch.empty(mask_shape, dtype=torch.bool, device=device) + + if world_size > 1: + if rank == 0: + local_mask = BatchTopK._compute_mask( + concatenated_preactivations_original, k_val, concatenated_preactivations_normalized + ) + mask.copy_(local_mask) + dist.broadcast(mask, src=0, group=process_group) + else: + dist.broadcast(mask, src=0, group=process_group) + else: + mask = BatchTopK._compute_mask( + concatenated_preactivations_original, k_val, concatenated_preactivations_normalized + ) + + activated_concatenated = concatenated_preactivations_original * mask.to(dtype) + + activations_dict: Dict[int, torch.Tensor] = {} + current_total_feature_offset = 0 + for original_layer_idx, num_features_this_layer in layer_feature_sizes: + activated_segment = activated_concatenated[ + :, current_total_feature_offset : current_total_feature_offset + num_features_this_layer + ] + activations_dict[original_layer_idx] = activated_segment + current_total_feature_offset += num_features_this_layer + return activations_dict + + +def _apply_token_topk_helper( + preactivations_dict: Dict[int, torch.Tensor], + config: CLTConfig, + device: torch.device, + dtype: torch.dtype, + rank: int, + process_group: Optional[ProcessGroup], +) -> Dict[int, 
torch.Tensor]: + """Helper to apply TokenTopK globally across concatenated layer pre-activations.""" + world_size = 1 + if process_group is not None and dist.is_initialized(): + world_size = dist.get_world_size(process_group) + + if not preactivations_dict: + logger_helpers.warning(f"Rank {rank}: _apply_token_topk_helper received empty preactivations_dict.") + return {} + + ordered_preactivations_original: List[torch.Tensor] = [] + ordered_preactivations_normalized: List[torch.Tensor] = [] + layer_feature_sizes: List[Tuple[int, int]] = [] + + first_valid_preact = next((p for p in preactivations_dict.values() if p.numel() > 0), None) + if first_valid_preact is None: + logger_helpers.warning( + f"Rank {rank}: No valid preactivations found in dict for TokenTopK. Returning empty dict." + ) + return { + layer_idx: torch.empty((0, config.num_features), device=device, dtype=dtype) + for layer_idx in preactivations_dict.keys() + } + batch_tokens_dim = first_valid_preact.shape[0] + + for layer_idx in range(config.num_layers): + if layer_idx in preactivations_dict: + preact_orig = preactivations_dict[layer_idx] + preact_orig = preact_orig.to(device=device, dtype=dtype) + current_num_features = preact_orig.shape[1] if preact_orig.numel() > 0 else config.num_features + + if preact_orig.numel() == 0: + if batch_tokens_dim > 0: + zeros_shape = (batch_tokens_dim, current_num_features) + ordered_preactivations_original.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) + ordered_preactivations_normalized.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) + elif preact_orig.shape[0] != batch_tokens_dim: + logger_helpers.warning( + f"Rank {rank} Layer {layer_idx}: Mismatched batch dim ({preact_orig.shape[0]} vs {batch_tokens_dim}) for TokenTopK. Using zeros." + ) + zeros_shape = (batch_tokens_dim, current_num_features) + ordered_preactivations_original.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) + ordered_preactivations_normalized.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) + else: + ordered_preactivations_original.append(preact_orig) + mean = preact_orig.mean(dim=0, keepdim=True) + std = preact_orig.std(dim=0, keepdim=True) + preact_norm = (preact_orig - mean) / (std + 1e-6) + ordered_preactivations_normalized.append(preact_norm) + layer_feature_sizes.append((layer_idx, current_num_features)) + + if not ordered_preactivations_original: + logger_helpers.warning( + f"Rank {rank}: No tensors collected after iterating layers for TokenTopK. Returning empty activations." 
+ ) + return { + layer_idx: torch.empty((batch_tokens_dim, config.num_features), device=device, dtype=dtype) + for layer_idx in preactivations_dict.keys() + } + + concatenated_preactivations_original = torch.cat(ordered_preactivations_original, dim=1) + concatenated_preactivations_normalized = torch.cat(ordered_preactivations_normalized, dim=1) + + k_val_float: float + if hasattr(config, "topk_k") and config.topk_k is not None: + k_val_float = float(config.topk_k) + else: + k_val_float = float(concatenated_preactivations_original.size(1)) + + mask_shape = concatenated_preactivations_original.shape + mask = torch.empty(mask_shape, dtype=torch.bool, device=device) + + if world_size > 1: + if rank == 0: + local_mask = TokenTopK._compute_mask( + concatenated_preactivations_original, + k_val_float, + concatenated_preactivations_normalized, + ) + mask.copy_(local_mask) + dist.broadcast(mask, src=0, group=process_group) + else: + dist.broadcast(mask, src=0, group=process_group) + else: + mask = TokenTopK._compute_mask( + concatenated_preactivations_original, k_val_float, concatenated_preactivations_normalized + ) + + activated_concatenated = concatenated_preactivations_original * mask.to(dtype) + + activations_dict: Dict[int, torch.Tensor] = {} + current_total_feature_offset = 0 + for original_layer_idx, num_features_this_layer in layer_feature_sizes: + activated_segment = activated_concatenated[ + :, current_total_feature_offset : current_total_feature_offset + num_features_this_layer + ] + activations_dict[original_layer_idx] = activated_segment + current_total_feature_offset += num_features_this_layer + return activations_dict diff --git a/clt/models/clt.py b/clt/models/clt.py index 0fb90aa..92306ca 100644 --- a/clt/models/clt.py +++ b/clt/models/clt.py @@ -1,34 +1,20 @@ import torch -import torch.nn as nn -from typing import Dict, Optional, Union, Tuple, cast, List -import logging # Import logging +from typing import Dict, Optional, Union, Tuple, List +import logging import torch.distributed as dist from clt.config import CLTConfig from clt.models.base import BaseTranscoder -from clt.models.parallel import RowParallelLinear # Removed ColumnParallelLinear -from clt.models.activations import BatchTopK, JumpReLU, TokenTopK # Import BatchTopK, JumpReLU and TokenTopK -# Import the new encoding helper functions -from clt.models.encoding import ( - _apply_batch_topk_helper, -) # Removed _get_preactivations_helper, _encode_all_layers_helper -from clt.models.encoding import _apply_token_topk_helper - -# Import the new Encoder module +from clt.models.activations import _apply_batch_topk_helper, _apply_token_topk_helper from clt.models.encoder import Encoder - -# Import the new Decoder module from clt.models.decoder import Decoder +from clt.models.theta import ThetaManager -# Import the activation registry from clt.activations.registry import get_activation_fn from torch.distributed import ProcessGroup -from . 
import mark_replicated # Added import - -# Configure logging (or use existing logger if available) logger = logging.getLogger(__name__) @@ -36,39 +22,17 @@ class CrossLayerTranscoder(BaseTranscoder): """Implementation of a Cross-Layer Transcoder (CLT) with tensor parallelism.""" _cached_decoder_norms: Optional[torch.Tensor] = None - _min_selected_preact: Optional[torch.Tensor] - _sum_min_selected_preact: Optional[torch.Tensor] - _count_min_selected_preact: Optional[torch.Tensor] - _avg_layer_means: Optional[torch.Tensor] - _avg_layer_stds: Optional[torch.Tensor] - _processed_batches_for_stats: Optional[torch.Tensor] - log_threshold: Optional[nn.Parameter] - device: torch.device # Ensure device is always set - dtype: torch.dtype # Ensure dtype is always set + + device: torch.device + dtype: torch.dtype def __init__( self, config: CLTConfig, process_group: Optional["ProcessGroup"], - device: Optional[torch.device] = None, # device can be None initially + device: Optional[torch.device] = None, ): - """Initialize the Cross-Layer Transcoder. - - The encoder matrices use ColumnParallelLinear, sharding output features. - The decoder matrices use RowParallelLinear, sharding input features (i.e., the CLT features). - The `log_threshold` for JumpReLU is a per-layer, per-feature parameter, marked as replicated. - Buffers `_sum_min_selected_preact` and `_count_min_selected_preact` are used for - on-the-fly theta estimation if converting from BatchTopK/TokenTopK to JumpReLU. - - Args: - config: Configuration for the transcoder. - process_group: The process group for tensor parallelism. If None, the model operates - in a non-distributed manner. - device: Optional device to initialize the model parameters on. If None, behavior depends - on subsequent calls or available hardware. - """ super().__init__(config) - self.process_group = process_group if process_group is None or not dist.is_initialized(): self.world_size = 1 @@ -78,8 +42,7 @@ def __init__( self.world_size = dist.get_world_size(process_group) self.rank = dist.get_rank(process_group) - # Consolidate device and dtype initialization - self.dtype = self._resolve_dtype(config.clt_dtype) # Defaults to float32 if config.clt_dtype is None or invalid + self.dtype = self._resolve_dtype(config.clt_dtype) if device is not None: self.device = device else: @@ -87,61 +50,17 @@ def __init__( logger.info(f"CLT TP model initialized on rank {self.rank} with device {self.device} and dtype {self.dtype}") - # Instantiate the new Encoder module self.encoder_module = Encoder( config=config, process_group=self.process_group, device=self.device, dtype=self.dtype ) - # The old self.encoders = nn.ModuleList(...) is now removed. - - self.decoders = nn.ModuleDict( - { - f"{src_layer}->{tgt_layer}": RowParallelLinear( - in_features=config.num_features, - out_features=config.d_model, - bias=True, - process_group=self.process_group, - input_is_parallel=False, - d_model_for_init=config.d_model, - num_layers_for_init=config.num_layers, - device=self.device, - dtype=self.dtype, - ) - for src_layer in range(config.num_layers) - for tgt_layer in range(src_layer, config.num_layers) - } - ) - - # Instantiate the new Decoder module self.decoder_module = Decoder( config=config, process_group=self.process_group, device=self.device, dtype=self.dtype ) - # Remove the old self.decoders and _cached_decoder_norms registration - del self.decoders - # Note: _cached_decoder_norms was registered in __init__ before, - # now it's handled within the Decoder module itself. 
- # If self._cached_decoder_norms = None was present, it should be removed too. - # Checking the original code, _cached_decoder_norms was an attribute, not registered with register_buffer initially for the main class. - # It was registered with register_buffer for the theta estimation buffers. - # The Decoder class now handles its own _cached_decoder_norms registration. - - if self.config.activation_fn == "jumprelu": - initial_threshold_val = torch.ones( - config.num_layers, config.num_features, device=self.device, dtype=self.dtype - ) * torch.log(torch.tensor(config.jumprelu_threshold, device=self.device, dtype=self.dtype)) - # Ensure log_threshold is created on the correct device and dtype - self.log_threshold = nn.Parameter(initial_threshold_val) - mark_replicated(self.log_threshold) - else: - # Ensure log_threshold is not accidentally created or used if not jumprelu - self.log_threshold = None - - self.bandwidth = 1.0 - - self.register_buffer("_sum_min_selected_preact", None, persistent=False) - self.register_buffer("_count_min_selected_preact", None, persistent=False) + self.theta_manager = ThetaManager( + config=config, process_group=self.process_group, device=self.device, dtype=self.dtype + ) def _resolve_dtype(self, dtype_input: Optional[Union[str, torch.dtype]]) -> torch.dtype: - """Converts string dtype names to torch.dtype objects, defaulting to float32.""" if isinstance(dtype_input, torch.dtype): return dtype_input if isinstance(dtype_input, str): @@ -159,146 +78,20 @@ def _resolve_dtype(self, dtype_input: Optional[Union[str, torch.dtype]]) -> torc def jumprelu(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: """Apply JumpReLU activation function for a specific layer.""" - # Select the threshold for the given layer - if not hasattr(self, "log_threshold") or self.log_threshold is None: - logger.error( - f"Rank {self.rank}: log_threshold attribute not initialized or None for JumpReLU. Returning input." - ) - return x.to(device=self.device, dtype=self.dtype) # Ensure output is on correct device/dtype - - if layer_idx >= self.log_threshold.shape[0]: - logger.error( - f"Rank {self.rank}: Invalid layer_idx {layer_idx} for log_threshold with shape {self.log_threshold.shape}. Returning input." - ) - return x.to(device=self.device, dtype=self.dtype) # Ensure output is on correct device/dtype - - threshold = torch.exp(self.log_threshold[layer_idx]).to(device=self.device, dtype=self.dtype) - # Apply JumpReLU - This needs the *full* preactivation dimension - # Cast output to Tensor to satisfy linter - return cast(torch.Tensor, JumpReLU.apply(x, threshold, self.bandwidth)) + return self.theta_manager.jumprelu(x, layer_idx) def get_preactivations(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: - """Get pre-activation values (full tensor) for features at the specified layer.""" - # Call the new encoder module's method directly return self.encoder_module.get_preactivations(x, layer_idx) def _encode_all_layers( self, inputs: Dict[int, torch.Tensor] - ) -> Tuple[Dict[int, torch.Tensor], List[Tuple[int, int, int]]]: # Return type updated - """Encodes inputs for all layers and returns pre-activations and original shape info.""" - # Call the encoder module's method - # The encoder_module handles device and dtype internally for its operations. - # Inputs are processed within the encoder_module's methods. 
+ ) -> Tuple[Dict[int, torch.Tensor], List[Tuple[int, int, int]]]: return self.encoder_module.encode_all_layers(inputs) - @torch.no_grad() - def _update_min_selected_preactivations( - self, - concatenated_preactivations_original: torch.Tensor, - activated_concatenated: torch.Tensor, - layer_feature_sizes: List[Tuple[int, int]], - ): - """ - Updates the _min_selected_preact buffer with minimum pre-activation values - for features selected by BatchTopK during the current step. - This function operates with no_grad. - """ - if ( - not hasattr(self, "_sum_min_selected_preact") - or self._sum_min_selected_preact is None - or not hasattr(self, "_count_min_selected_preact") - or self._count_min_selected_preact is None - ): - if self.config.activation_fn == "batchtopk": - logger.warning(f"Rank {self.rank}: running BatchTopK stats buffers not found. Skipping theta update.") - return - - assert self._sum_min_selected_preact is not None and isinstance( - self._sum_min_selected_preact, torch.Tensor - ), f"Rank {self.rank}: _sum_min_selected_preact is not a Tensor or is None." - assert self._count_min_selected_preact is not None and isinstance( - self._count_min_selected_preact, torch.Tensor - ), f"Rank {self.rank}: _count_min_selected_preact is not a Tensor or is None." - - current_total_feature_offset = 0 - for i, (original_layer_idx, num_features_this_layer) in enumerate(layer_feature_sizes): - if original_layer_idx >= self._sum_min_selected_preact.shape[0]: - logger.warning( - f"Rank {self.rank}: Invalid original_layer_idx {original_layer_idx} for _min_selected_preact update. Skipping layer." - ) - current_total_feature_offset += num_features_this_layer - continue - - preact_orig_this_layer = concatenated_preactivations_original[ - :, current_total_feature_offset : current_total_feature_offset + num_features_this_layer - ] - gated_acts_segment = activated_concatenated[ - :, current_total_feature_offset : current_total_feature_offset + num_features_this_layer - ] - - if gated_acts_segment.shape == preact_orig_this_layer.shape: - # Vectorised per-feature min calculation that avoids CPU-only ops like nonzero on MPS. - mask_active = gated_acts_segment > 0 # Active features after gating - - if mask_active.any(): - # Replace inactive entries by +inf and take per-feature minimum across tokens - masked_preact = torch.where( - mask_active, - preact_orig_this_layer, - torch.full_like(preact_orig_this_layer, float("inf")), - ) - - per_feature_min_this_batch = masked_preact.amin(dim=0) - - if logger.isEnabledFor(logging.DEBUG): - # Log characteristics of the minimums being used for theta estimation - finite_mins_for_log = per_feature_min_this_batch[torch.isfinite(per_feature_min_this_batch)] - if finite_mins_for_log.numel() > 0: - logger.debug( - f"Rank {self.rank} Layer {original_layer_idx}: per_feature_min_this_batch (finite values for log) " - f"min={finite_mins_for_log.min().item():.4f}, " - f"max={finite_mins_for_log.max().item():.4f}, " - f"mean={finite_mins_for_log.mean().item():.4f}, " - f"median={torch.median(finite_mins_for_log).item():.4f}" - ) - else: - logger.debug( - f"Rank {self.rank} Layer {original_layer_idx}: No finite per_feature_min_this_batch values to log stats for." 
- ) - - # Log how many original pre-activations were negative but still contributed to a positive gated_act - original_preacts_leading_to_positive_gated = preact_orig_this_layer[mask_active] - if original_preacts_leading_to_positive_gated.numel() > 0: # Check if tensor is not empty - num_negative_contrib = (original_preacts_leading_to_positive_gated < 0).sum().item() - if num_negative_contrib > 0: - logger.debug( - f"Rank {self.rank} Layer {original_layer_idx}: {num_negative_contrib} negative original pre-activations " - f"(out of {mask_active.sum().item()} active selections) contributed to theta estimation via positive gated_acts_segment." - ) - - # Update running sum and count for expected-value calculation - valid_mask = torch.isfinite(per_feature_min_this_batch) - - self._sum_min_selected_preact[original_layer_idx, valid_mask] += per_feature_min_this_batch[ - valid_mask - ] - self._count_min_selected_preact[original_layer_idx, valid_mask] += 1 - else: - logger.warning( - f"Rank {self.rank}: Shape mismatch for theta update, layer {original_layer_idx}. " - f"Original: {preact_orig_this_layer.shape}, Gated: {gated_acts_segment.shape}" - ) - - current_total_feature_offset += num_features_this_layer - - # Function now purely updates the buffer – it no longer recurses or returns a value. - def _apply_batch_topk( self, preactivations_dict: Dict[int, torch.Tensor], - # device and dtype arguments are removed as they are taken from self ) -> Dict[int, torch.Tensor]: - """Applies BatchTopK to concatenated pre-activations from all layers by calling the helper.""" return _apply_batch_topk_helper( preactivations_dict, self.config, self.device, self.dtype, self.rank, self.process_group ) @@ -306,38 +99,22 @@ def _apply_batch_topk( def _apply_token_topk( self, preactivations_dict: Dict[int, torch.Tensor], - # device and dtype arguments are removed as they are taken from self ) -> Dict[int, torch.Tensor]: - """Applies TokenTopK to concatenated pre-activations from all layers by calling the helper.""" return _apply_token_topk_helper( preactivations_dict, self.config, self.device, self.dtype, self.rank, self.process_group ) def encode(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: """Encode the input activations at the specified layer. - - Returns the *full* feature activations after nonlinearity. - This method is used for 'relu' and 'jumprelu' activations. - For 'batchtopk', use get_feature_activations. - - Args: - x: Input activations [batch_size, seq_len, d_model] or [batch_tokens, d_model] - layer_idx: Index of the layer - - Returns: - Encoded activations after nonlinearity [..., num_features] + This method is primarily for 'relu' and 'jumprelu' activations. + BatchTopK/TokenTopK are handled in get_feature_activations. 
""" - # Ensure input is on the correct device and dtype x = x.to(device=self.device, dtype=self.dtype) - fallback_tensor: Optional[torch.Tensor] = None activated: Optional[torch.Tensor] = None - - # Get full preactivations [..., num_features] preact = self.get_preactivations(x, layer_idx) if preact.numel() == 0: - # If preactivations failed or returned empty, create fallback based on expected shape logger.warning(f"Rank {self.rank}: Received empty preactivations for encode layer {layer_idx}.") batch_dim = x.shape[0] if x.dim() == 2 else x.shape[0] * x.shape[1] if x.dim() == 3 else 0 fallback_shape = (batch_dim, self.config.num_features) @@ -346,785 +123,160 @@ def encode(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: logger.warning( f"Rank {self.rank}: Received invalid preactivations shape {preact.shape} for encode layer {layer_idx}." ) - fallback_shape = (preact.shape[0], self.config.num_features) # Try to keep batch dim + fallback_shape = (preact.shape[0], self.config.num_features) fallback_tensor = torch.zeros(fallback_shape, device=self.device, dtype=self.dtype) else: - # Apply activation function to the full preactivation tensor using the registry try: - activation_fn_callable = get_activation_fn(self.config.activation_fn) - activated = activation_fn_callable(self, preact, layer_idx) - except ValueError as e: # Catch if activation function is not in registry + if self.config.activation_fn == "jumprelu": + activated = self.theta_manager.jumprelu(preact, layer_idx) + elif self.config.activation_fn == "relu": + activation_fn_callable = get_activation_fn("relu") # Standard ReLU from registry + activated = activation_fn_callable(self, preact, layer_idx) # Corrected signature + else: + # This path should ideally not be taken if BatchTopK/TokenTopK are handled elsewhere. + # If other activation functions are added that fit this per-layer, per-token model, + # ensure get_activation_fn returns a callable with the correct signature. + logger.error( + f"Rank {self.rank}: Unsupported activation function '{self.config.activation_fn}' encountered in encode method path. Expected jumprelu or relu." + ) + # Fallback to zero tensor to avoid crashing, but this indicates a logic issue. + fallback_shape = (preact.shape[0], self.config.num_features) + fallback_tensor = torch.zeros(fallback_shape, device=self.device, dtype=self.dtype) + + except ValueError as e: # Catch if activation function name is not in registry logger.error( f"Rank {self.rank}: Error getting activation function '{self.config.activation_fn}' for layer {layer_idx}: {e}" ) - # Fallback to a zero tensor of the correct shape if activation function is invalid fallback_shape = (preact.shape[0], self.config.num_features) fallback_tensor = torch.zeros(fallback_shape, device=self.device, dtype=self.dtype) - except Exception as e: # Catch any other unexpected errors during activation fn call + except Exception as e: logger.error( f"Rank {self.rank}: Unexpected error during activation function '{self.config.activation_fn}' for layer {layer_idx}: {e}" ) - # Fallback to a zero tensor of the correct shape fallback_shape = (preact.shape[0], self.config.num_features) fallback_tensor = torch.zeros(fallback_shape, device=self.device, dtype=self.dtype) - # Return activated tensor or fallback if activated is not None: return activated elif fallback_tensor is not None: return fallback_tensor else: - # Should not happen, but return empty tensor as last resort - # This case implies preact was valid, but no activation was applied (e.g. 
 unknown self.config.activation_fn)
-            # and no fallback_tensor was created. This should be an error or handled by an explicit 'else' for activation_fn.
-            # For now, let's make it raise an error as it's an unexpected state.
-            raise ValueError(
-                f"Rank {self.rank}: Unknown activation function '{self.config.activation_fn}' or logic error in encode for layer {layer_idx}."
+            # This state indicates a logic error: neither `activated` nor `fallback_tensor` was set,
+            # e.g. preact was valid but the activation_fn dispatch above produced neither.
+            logger.critical(
+                f"Rank {self.rank}: Critical logic error in encode for layer {layer_idx}: activation function '{self.config.activation_fn}' was not handled, so no output was produced."
             )
+            # Return a zero tensor of the expected output shape as a last resort instead of raising.
+            expected_batch_dim = (
+                preact.shape[0]
+                if preact.numel() > 0
+                else (x.shape[0] if x.dim() == 2 else (x.shape[0] * x.shape[1] if x.dim() == 3 else 0))
+            )
+            return torch.zeros((expected_batch_dim, self.config.num_features), device=self.device, dtype=self.dtype)

     def decode(self, a: Dict[int, torch.Tensor], layer_idx: int) -> torch.Tensor:
-        """Decode the feature activations to reconstruct outputs at the specified layer.
-
-        Input activations `a` are expected to be the *full* tensors.
-        The RowParallelLinear decoder splits them internally.
-
-        Args:
-            a: Dictionary mapping layer indices to *full* feature activations [..., num_features]
-            layer_idx: Index of the layer to reconstruct outputs for
-
-        Returns:
-            Reconstructed outputs [..., d_model]
-        """
-        # Call the new decoder module's method directly
         return self.decoder_module.decode(a, layer_idx)

     def forward(self, inputs: Dict[int, torch.Tensor]) -> Dict[int, torch.Tensor]:
-        """Process inputs through the parallel transcoder model.
-
-        Args:
-            inputs: Dictionary mapping layer indices to input activations
-
-        Returns:
-            Dictionary mapping layer indices to reconstructed outputs
-        """
-        # Get feature activations based on the configured activation function
         activations = self.get_feature_activations(inputs)
-
-        # Decode to reconstruct outputs at each layer
         reconstructions = {}
-        # Determine a consistent device/dtype for fallback tensor
-        # Try to infer from first available input, otherwise use model/system defaults
-        # fallback_op_device, fallback_op_dtype = self._get_current_op_device_dtype(
-        #     next((t for t in inputs.values() if t.numel() > 0), None)
-        # )
-
         for layer_idx in range(self.config.num_layers):
-            # Check if any relevant *full* activations exist before decoding
             relevant_activations = {k: v for k, v in activations.items() if k <= layer_idx and v.numel() > 0}

             if layer_idx in inputs and relevant_activations:
-                # Decode takes the dictionary of *full* activations
-                reconstructions[layer_idx] = self.decode(relevant_activations, layer_idx) # Removed try-except
-            elif layer_idx in inputs: # Input exists, but no relevant activations (e.g.
all were zeroed out) - # Determine batch size from input if possible + reconstructions[layer_idx] = self.decode(relevant_activations, layer_idx) + elif layer_idx in inputs: batch_size = 0 input_tensor = inputs[layer_idx] - if input_tensor.dim() >= 1: # Should be [B, S, D] or [B*S, D] + if input_tensor.dim() >= 1: batch_size = ( input_tensor.shape[0] * input_tensor.shape[1] if input_tensor.dim() == 3 else input_tensor.shape[0] ) - else: # input_tensor has 0 dimensions or less, cannot infer batch size + else: logger.warning( f"Rank {self.rank}: Could not determine batch size for fallback tensor in forward layer {layer_idx} from input shape {input_tensor.shape}. Using 0." ) - reconstructions[layer_idx] = torch.zeros( (batch_size, self.config.d_model), - device=self.device, # Use determined fallback device - dtype=self.dtype, # Use determined fallback dtype + device=self.device, + dtype=self.dtype, ) - # If layer_idx not in inputs, it's not expected to produce output, so skip - return reconstructions def get_feature_activations(self, inputs: Dict[int, torch.Tensor]) -> Dict[int, torch.Tensor]: - """Get *full* feature activations for all layers. - - Handles different activation functions including global BatchTopK. - - Args: - inputs: Dictionary mapping layer indices to input activations - - Returns: - Dictionary mapping layer indices to *full* feature activations [..., num_features] - """ - # self.device and self.dtype are guaranteed to be set. - processed_inputs: Dict[int, torch.Tensor] = {} for layer_idx, x_orig in inputs.items(): processed_inputs[layer_idx] = x_orig.to(device=self.device, dtype=self.dtype) if self.config.activation_fn == "batchtopk" or self.config.activation_fn == "topk": - # _encode_all_layers now returns 2 values: preactivations_dict, original_shapes_info - # We only need preactivations_dict here. 
preactivations_dict, _ = self._encode_all_layers(processed_inputs) - - if not preactivations_dict: # Indicates helper returned empty, possibly due to all-empty inputs + if not preactivations_dict: activations = {} - # Use the device/dtype determined by the helper (should match self.device/self.dtype) - # dev_fallback = processed_device - # dt_fallback = processed_dtype - for layer_idx_orig_input in inputs.keys(): # Iterate original input keys to maintain structure - x_orig_input = inputs[layer_idx_orig_input] # Original input for shape reference + for layer_idx_orig_input in inputs.keys(): + x_orig_input = inputs[layer_idx_orig_input] batch_dim_fallback = 0 - if x_orig_input.dim() == 3: # B, S, D + if x_orig_input.dim() == 3: batch_dim_fallback = x_orig_input.shape[0] * x_orig_input.shape[1] - elif x_orig_input.dim() == 2: # B, D or B*S, D + elif x_orig_input.dim() == 2: batch_dim_fallback = x_orig_input.shape[0] - # else: 0-dim or 1-dim, batch_dim_fallback remains 0 - activations[layer_idx_orig_input] = torch.zeros( (batch_dim_fallback, self.config.num_features), device=self.device, dtype=self.dtype ) return activations if self.config.activation_fn == "batchtopk": - # Helpers use the device/dtype from their input preactivations_dict (self.device, self.dtype) - activations = _apply_batch_topk_helper( - preactivations_dict, self.config, self.device, self.dtype, self.rank, self.process_group - ) + activations = self._apply_batch_topk(preactivations_dict) elif self.config.activation_fn == "topk": - activations = _apply_token_topk_helper( - preactivations_dict, self.config, self.device, self.dtype, self.rank, self.process_group - ) - # If neither, this implies an issue if we reached here. However, config.activation_fn is checked. - # Add a return here to satisfy linters/type checkers if activations might not be assigned. - # Given the checks, 'activations' should always be assigned one of the above. - # To be absolutely safe and explicit: - else: # Should not be reached if config.activation_fn is 'batchtopk' or 'topk' + activations = self._apply_token_topk(preactivations_dict) + else: raise ValueError(f"Unexpected activation_fn '{self.config.activation_fn}' in BatchTopK/TokenTopK path.") return activations else: # ReLU or JumpReLU (per-layer activation) activations = {} - # Iterate layers in deterministic ascending order so all ranks - # invoke the same collective operations in the same sequence. for layer_idx in sorted(processed_inputs.keys()): x_input = processed_inputs[layer_idx] - # self.encode will use self.device and self.dtype which are now guaranteed to be set. - act = self.encode(x_input, layer_idx) # Removed try-except from here + act = self.encode(x_input, layer_idx) activations[layer_idx] = act return activations def get_decoder_norms(self) -> torch.Tensor: - """Get L2 norms of all decoder matrices for each feature (gathered across ranks). - - The decoders are of type `RowParallelLinear`. Their weights are sharded across the - input feature dimension (CLT features). Each feature's decoder weight vector - (across all target layers) resides on a single rank. - - The computation proceeds as follows: - 1. For each source CLT layer (`src_layer`): - a. Initialize a local accumulator for squared norms (`local_norms_sq_accum`) - for all features, matching the model's device and float32 for precision. - b. For each target model layer (`tgt_layer`) that this `src_layer` decodes to: - i. Get the corresponding `RowParallelLinear` decoder module. - ii. 
Access its local weight shard (`decoder.weight`, shape [d_model, local_num_features]). - iii. Compute L2 norm squared for each column (feature) in this local shard. - iv. Determine the global indices for the features this rank owns. - v. Add these squared norms to the corresponding global slice in `local_norms_sq_accum`. - c. All-reduce `local_norms_sq_accum` across all ranks using SUM operation. - This sums the squared norm contributions for each feature from the rank that owns it. - d. Take the square root of the summed squared norms and cast to the model's dtype. - Store this in the `full_decoder_norms` tensor for the current `src_layer`. - 2. Cache and return `full_decoder_norms`. - - The norms are cached in `self._cached_decoder_norms` to avoid recomputation. - - Returns: - Tensor of shape [num_layers, num_features] containing L2 norms of decoder - weights for each feature, applicable for sparsity calculations. - """ - # Call the new decoder module's method directly return self.decoder_module.get_decoder_norms() @torch.no_grad() def estimate_theta_posthoc( self, - data_iter: torch.utils.data.IterableDataset, # More generic iterable + data_iter: torch.utils.data.IterableDataset, num_batches: Optional[int] = None, default_theta_value: float = 1e6, device: Optional[torch.device] = None, ) -> torch.Tensor: - """Estimate theta post-hoc using a specified number of batches. - - Args: - data_iter: An iterable yielding (inputs, targets) batches. - num_batches: Number of batches to process for estimation. If None, iterates through all. - default_theta_value: Value for features never activated. (Note: currently not directly used in final theta calculation in convert_to_jumprelu_inplace) - device: Device to run estimation on. - - Returns: - The estimated theta tensor. 
- """ - original_device = next(self.parameters()).device - target_device = device if device is not None else self.device - if target_device is None: - target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - logger.info(f"Rank {self.rank}: Starting post-hoc theta estimation on device {target_device}.") - - self.eval() # Set model to evaluation mode - if target_device != original_device: - self.to(target_device) - - if not hasattr(self, "_sum_min_selected_preact") or self._sum_min_selected_preact is None: - self.register_buffer( - "_sum_min_selected_preact", - torch.zeros( - (self.config.num_layers, self.config.num_features), - dtype=self.dtype, - device=target_device, - ), - persistent=False, - ) - else: - assert isinstance(self._sum_min_selected_preact, torch.Tensor) - self._sum_min_selected_preact = self._sum_min_selected_preact.to(device=target_device, dtype=self.dtype) - self._sum_min_selected_preact.data.zero_() - - if not hasattr(self, "_count_min_selected_preact") or self._count_min_selected_preact is None: - self.register_buffer( - "_count_min_selected_preact", - torch.zeros( - (self.config.num_layers, self.config.num_features), - dtype=self.dtype, - device=target_device, - ), - persistent=False, - ) - else: - assert isinstance(self._count_min_selected_preact, torch.Tensor) - self._count_min_selected_preact = self._count_min_selected_preact.to(device=target_device, dtype=self.dtype) - self._count_min_selected_preact.data.zero_() - - buffer_shape = (self.config.num_layers, self.config.num_features) - if not hasattr(self, "_avg_layer_means") or self._avg_layer_means is None: - self.register_buffer( - "_avg_layer_means", torch.zeros(buffer_shape, dtype=self.dtype, device=target_device), persistent=False - ) - else: - assert isinstance(self._avg_layer_means, torch.Tensor) - self._avg_layer_means = self._avg_layer_means.to(device=target_device, dtype=self.dtype) - self._avg_layer_means.data.zero_() - - if not hasattr(self, "_avg_layer_stds") or self._avg_layer_stds is None: - self.register_buffer( - "_avg_layer_stds", torch.zeros(buffer_shape, dtype=self.dtype, device=target_device), persistent=False - ) - else: - assert isinstance(self._avg_layer_stds, torch.Tensor) - self._avg_layer_stds = self._avg_layer_stds.to(device=target_device, dtype=self.dtype) - self._avg_layer_stds.data.zero_() - - if not hasattr(self, "_processed_batches_for_stats") or self._processed_batches_for_stats is None: - self.register_buffer( - "_processed_batches_for_stats", - torch.zeros(self.config.num_layers, dtype=torch.long, device=target_device), - persistent=False, - ) - else: - assert isinstance(self._processed_batches_for_stats, torch.Tensor) - self._processed_batches_for_stats = self._processed_batches_for_stats.to( - device=target_device, dtype=torch.long - ) - self._processed_batches_for_stats.data.zero_() - - processed_batches_total = 0 - - try: - from tqdm.auto import tqdm # type: ignore - - iterable_data_iter = ( - tqdm(data_iter, total=num_batches, desc=f"Estimating Theta & Stats (Rank {self.rank})") - if num_batches - else tqdm(data_iter, desc=f"Estimating Theta & Stats (Rank {self.rank})") - ) - except ImportError: - logger.info("tqdm not found, proceeding without progress bar for theta estimation.") - iterable_data_iter = data_iter - - for inputs_batch, _ in iterable_data_iter: - if num_batches is not None and processed_batches_total >= num_batches: - break - - inputs_on_device = {k: v.to(device=target_device, dtype=self.dtype) for k, v in inputs_batch.items()} - # 
_encode_all_layers uses self.device, self.dtype internally. - # Since self.to(target_device) was called, self.device is now target_device. - preactivations_dict, _ = self._encode_all_layers(inputs_on_device) - - if not preactivations_dict: - logger.warning(f"Rank {self.rank}: No preactivations. Skipping batch {processed_batches_total + 1}.") - processed_batches_total += 1 - continue - - first_valid_preact = next((p for p in preactivations_dict.values() if p.numel() > 0), None) - if first_valid_preact is None: - logger.warning( - f"Rank {self.rank}: All preactivations empty. Skipping batch {processed_batches_total + 1}." - ) - processed_batches_total += 1 - continue - - ordered_preactivations_original_posthoc: List[torch.Tensor] = [] - ordered_preactivations_normalized_posthoc: List[torch.Tensor] = [] - layer_feature_sizes_posthoc: List[Tuple[int, int]] = [] - batch_tokens_dim_posthoc = first_valid_preact.shape[0] - - for layer_idx_loop in range(self.config.num_layers): - num_feat_for_layer: int - mean_loop: Optional[torch.Tensor] = None - std_loop: Optional[torch.Tensor] = None - preact_norm_loop: Optional[torch.Tensor] = None - - if layer_idx_loop in preactivations_dict: - preact_orig_loop = preactivations_dict[layer_idx_loop] - num_feat_for_layer = ( - preact_orig_loop.shape[1] if preact_orig_loop.numel() > 0 else self.config.num_features - ) - - if preact_orig_loop.shape[0] != batch_tokens_dim_posthoc and preact_orig_loop.numel() > 0: - logger.warning( - f"Rank {self.rank} Layer {layer_idx_loop}: Mismatched token dim (expected {batch_tokens_dim_posthoc}, got {preact_orig_loop.shape[0]}). Using zeros." - ) - mean_loop = torch.zeros((1, num_feat_for_layer), device=target_device, dtype=self.dtype) - std_loop = torch.ones((1, num_feat_for_layer), device=target_device, dtype=self.dtype) - preact_norm_loop = torch.zeros( - (batch_tokens_dim_posthoc, num_feat_for_layer), device=target_device, dtype=self.dtype - ) - ordered_preactivations_original_posthoc.append( - torch.zeros( - (batch_tokens_dim_posthoc, num_feat_for_layer), device=target_device, dtype=self.dtype - ) - ) - ordered_preactivations_normalized_posthoc.append(preact_norm_loop) - elif preact_orig_loop.numel() == 0 and batch_tokens_dim_posthoc > 0: - mean_loop = torch.zeros((1, num_feat_for_layer), device=target_device, dtype=self.dtype) - std_loop = torch.ones((1, num_feat_for_layer), device=target_device, dtype=self.dtype) - preact_norm_loop = torch.zeros( - (batch_tokens_dim_posthoc, num_feat_for_layer), device=target_device, dtype=self.dtype - ) - ordered_preactivations_original_posthoc.append( - torch.zeros( - (batch_tokens_dim_posthoc, num_feat_for_layer), device=target_device, dtype=self.dtype - ) - ) - ordered_preactivations_normalized_posthoc.append(preact_norm_loop) - elif preact_orig_loop.numel() > 0: - mean_loop = preact_orig_loop.mean(dim=0, keepdim=True) - std_loop = preact_orig_loop.std(dim=0, keepdim=True) - preact_norm_loop = (preact_orig_loop - mean_loop) / (std_loop + 1e-6) - ordered_preactivations_original_posthoc.append(preact_orig_loop) - ordered_preactivations_normalized_posthoc.append(preact_norm_loop) - - # Accumulate means and stds for this layer - # These buffers were already ensured to be on target_device - assert ( - self._avg_layer_means is not None - ), "Rank {self.rank}: _avg_layer_means buffer not initialized before use." - assert ( - self._avg_layer_stds is not None - ), "Rank {self.rank}: _avg_layer_stds buffer not initialized before use." 
- assert ( - self._processed_batches_for_stats is not None - ), "Rank {self.rank}: _processed_batches_for_stats buffer not initialized before use." - self._avg_layer_means.data[layer_idx_loop] += mean_loop.squeeze().clone() - self._avg_layer_stds.data[layer_idx_loop] += std_loop.squeeze().clone() - self._processed_batches_for_stats.data[layer_idx_loop] += 1 - else: # Layer in dict, but preact_orig_loop is empty and batch_tokens_dim_posthoc is 0 - num_feat_for_layer is from config - num_feat_for_layer = self.config.num_features # Fallback - # No data to append or normalize, but need to track for layer_feature_sizes_posthoc - else: # layer_idx_loop not in preactivations_dict - num_feat_for_layer = self.config.num_features # Fallback - if batch_tokens_dim_posthoc > 0: - ordered_preactivations_original_posthoc.append( - torch.zeros( - (batch_tokens_dim_posthoc, num_feat_for_layer), device=target_device, dtype=self.dtype - ) - ) - ordered_preactivations_normalized_posthoc.append( - torch.zeros( - (batch_tokens_dim_posthoc, num_feat_for_layer), device=target_device, dtype=self.dtype - ) - ) - - layer_feature_sizes_posthoc.append((layer_idx_loop, num_feat_for_layer)) - - if not ordered_preactivations_normalized_posthoc or not any( - t.numel() > 0 for t in ordered_preactivations_normalized_posthoc - ): - logger.warning( - f"Rank {self.rank}: No normalized preactivations. Skipping batch {processed_batches_total + 1}." - ) - processed_batches_total += 1 - continue - - # Use normalized preactivations for ranking, but original for BatchTopK/TokenTopK values if available - # If original list is empty/all-empty, use normalized for values too (as a fallback) - if not ordered_preactivations_original_posthoc or not any( - t.numel() > 0 for t in ordered_preactivations_original_posthoc - ): - concatenated_preactivations_for_gating = torch.cat(ordered_preactivations_normalized_posthoc, dim=1) - logger.debug( - f"Rank {self.rank} Batch {processed_batches_total + 1}: Using normalized preactivations for gating due to empty/all-empty original list." - ) - else: - concatenated_preactivations_for_gating = torch.cat(ordered_preactivations_original_posthoc, dim=1) - - concatenated_preactivations_for_ranking = torch.cat(ordered_preactivations_normalized_posthoc, dim=1) - - activated_concatenated_posthoc: Optional[torch.Tensor] = None - if self.config.activation_fn == "batchtopk": - k_val_int = ( - int(self.config.batchtopk_k) - if self.config.batchtopk_k is not None - else concatenated_preactivations_for_gating.size(1) - ) - # batchtopk_straight_through is expected to be in config (defaults to True in CLTConfig) - straight_through_btk = self.config.batchtopk_straight_through - activated_concatenated_posthoc = BatchTopK.apply( - concatenated_preactivations_for_gating, - float(k_val_int), - straight_through_btk, - concatenated_preactivations_for_ranking, - ) - elif self.config.activation_fn == "topk": - if not hasattr(self.config, "topk_k") or self.config.topk_k is None: - logger.error( - f"Rank {self.rank}: 'topk_k' not found in config for 'topk' activation during theta estimation. Defaulting to all features for this batch." 
- ) - k_val_float = float(concatenated_preactivations_for_gating.size(1)) # Keep all - else: - k_val_float = float(self.config.topk_k) - - straight_through_tk = getattr(self.config, "topk_straight_through", True) - activated_concatenated_posthoc = TokenTopK.apply( - concatenated_preactivations_for_gating, - k_val_float, - straight_through_tk, - concatenated_preactivations_for_ranking, - ) - else: - logger.error( - f"Rank {self.rank}: Unsupported activation_fn '{self.config.activation_fn}' for theta estimation. Cannot determine gating mechanism. Using zeros for activated_concatenated_posthoc." - ) - activated_concatenated_posthoc = torch.zeros_like(concatenated_preactivations_for_gating) - - # Update sum/count stats using NORMALIZED preactivations for selected features - if activated_concatenated_posthoc is not None: # Ensure it was set - self._update_min_selected_preactivations( - concatenated_preactivations_for_ranking, # Sum of *normalized* values - activated_concatenated_posthoc, - layer_feature_sizes_posthoc, - ) - processed_batches_total += 1 - - logger.info( - f"Rank {self.rank}: Processed {processed_batches_total} batches for theta estimation and stats accumulation." + """Estimate theta post-hoc using a specified number of batches.""" + original_tm_device = self.theta_manager.device + target_device_tm = device if device is not None else self.device + + if target_device_tm != original_tm_device: + logger.info(f"Rank {self.rank}: Moving ThetaManager to {target_device_tm} for theta estimation.") + self.theta_manager.to(target_device_tm) + + estimated_thetas_result = self.theta_manager.estimate_theta_posthoc( + encode_all_layers_fn=self.encoder_module.encode_all_layers, + data_iter=data_iter, + num_batches=num_batches, + default_theta_value=default_theta_value, ) - - # Finalize average mu and sigma if the buffers exist (they should if estimation ran) - if ( - hasattr(self, "_processed_batches_for_stats") - and self._processed_batches_for_stats is not None - and hasattr(self, "_avg_layer_means") - and self._avg_layer_means is not None - and hasattr(self, "_avg_layer_stds") - and self._avg_layer_stds is not None - ): - assert isinstance(self._processed_batches_for_stats, torch.Tensor) - active_stat_batches = self._processed_batches_for_stats.data.unsqueeze(-1).clamp_min(1.0) - assert isinstance(self._avg_layer_means, torch.Tensor) - assert isinstance(self._avg_layer_stds, torch.Tensor) - self._avg_layer_means.data /= active_stat_batches - self._avg_layer_stds.data /= active_stat_batches - logger.info(f"Rank {self.rank}: Averaged layer-wise normalization stats computed.") - else: - logger.warning(f"Rank {self.rank}: Could not finalize normalization stats, buffers missing.") - - self.convert_to_jumprelu_inplace(default_theta_value=default_theta_value) - - # Clean up non-persistent buffers - for buf_name in [ - "_sum_min_selected_preact", - "_count_min_selected_preact", - "_avg_layer_means", - "_avg_layer_stds", - ]: - if hasattr(self, buf_name): - delattr(self, buf_name) - - if target_device != original_device: - self.to(original_device) - - logger.info(f"Rank {self.rank}: Post-hoc theta estimation and conversion to JumpReLU complete.") - if self.log_threshold is not None and hasattr(self.log_threshold, "data"): - return torch.exp(self.log_threshold.data) - else: - # Fallback if log_threshold is None or not a Parameter (should not happen after conversion) - logger.warning( - f"Rank {self.rank}: log_threshold not available for returning estimated theta. Returning empty tensor." 
- ) - return torch.empty(0, device=original_device, dtype=self.dtype) + if target_device_tm != original_tm_device: + logger.info(f"Rank {self.rank}: Moving ThetaManager back to {original_tm_device}.") + self.theta_manager.to(original_tm_device) + return estimated_thetas_result @torch.no_grad() def convert_to_jumprelu_inplace(self, default_theta_value: float = 1e6) -> None: """ Converts the model to use JumpReLU activation based on learned BatchTopK thresholds. - This method should be called after training with BatchTopK. - It finalizes the _min_selected_preact buffer (expected to contain sums of *normalized* preacts) - and uses _avg_layer_means, _avg_layer_stds for unnormalization. - Updates the model config and sets the log_threshold parameter. - - Args: - default_theta_value: Value for features never activated (used in initial per-feature calculation in normalized space). - (Note: This parameter is not directly used in the current implementation's final theta calculation, - as behavior for non-activating features is handled by fallback_norm_theta_value and clamping.) + This method delegates to ThetaManager, which updates the shared config object. """ - if self.config.activation_fn not in ["batchtopk", "topk"]: - logger.warning( - f"Rank {self.rank}: Model original activation_fn was {self.config.activation_fn}, not batchtopk or topk. " - "Skipping conversion to JumpReLU based on learned thetas." - ) - if self.config.activation_fn == "relu": # Keep this specific error for ReLU - logger.error(f"Rank {self.rank}: Model is ReLU, cannot convert to JumpReLU via learned thetas.") - return - - required_buffers = [ - "_sum_min_selected_preact", - "_count_min_selected_preact", - "_avg_layer_means", - "_avg_layer_stds", - ] - for buf_name in required_buffers: - if not hasattr(self, buf_name) or getattr(self, buf_name) is None: - raise RuntimeError( - f"Rank {self.rank}: Required buffer {buf_name} for JumpReLU conversion not found or not populated. " - "Run estimate_theta_posthoc() with appropriate settings before converting." - ) - - assert isinstance(self._sum_min_selected_preact, torch.Tensor) - assert isinstance(self._count_min_selected_preact, torch.Tensor) - assert isinstance(self._avg_layer_means, torch.Tensor) - assert isinstance(self._avg_layer_stds, torch.Tensor) - - logger.info( - f"Rank {self.rank}: Starting conversion of BatchTopK model to JumpReLU (per-layer avg norm. theta, then unnormalize)." - ) - - # These sums/counts are of NORMALIZED preactivation values - theta_sum_norm = self._sum_min_selected_preact.clone() - theta_cnt_norm = self._count_min_selected_preact.clone() - - avg_mus = self._avg_layer_means.clone() - avg_sigmas = self._avg_layer_stds.clone() - - if self.process_group is not None and dist.is_initialized() and self.world_size > 1: - dist.all_reduce(theta_sum_norm, op=dist.ReduceOp.SUM, group=self.process_group) - dist.all_reduce(theta_cnt_norm, op=dist.ReduceOp.SUM, group=self.process_group) - # Mus and Sigmas should have been averaged per rank over their batches, - # then all-reduced if they were supposed to be global pre-defined stats. - # For now, assuming estimate_theta_posthoc gives each rank the same avg_mus and avg_sigmas - # (e.g. from rank 0, or each rank computes them identically on its data shard then averages). - # If they were calculated independently per rank on sharded data without final sync, - # they might differ. The current setup in estimate_theta_posthoc has each rank calculate its own. 
- # For consistent unnormalization, all ranks should use the same mu/sigma for a given layer. - # Let's assume for now estimate_theta_posthoc has made them consistent or this is handled by estimate_theta_posthoc - # For a truly robust solution, mus and sigmas would also need an all_reduce sum and divide by world_size * num_batches_per_rank. - # The current `active_stat_batches` division in estimate_theta_posthoc is per-rank, then averaged here. - # Let's assume the per-rank averaged mus/sigmas are what we want to use for unnormalizing that rank's part. - # But the final log_threshold must be identical. So the unnormalization must use globally agreed mu/sigma. - # Simplest: AllReduce sum for avg_mus * counts and avg_sigmas * counts, and sum counts, then divide. - # OR, more simply, after local averaging in estimate_theta_posthoc, all_reduce sum them and divide by world_size. - dist.all_reduce(avg_mus, op=dist.ReduceOp.SUM, group=self.process_group) - avg_mus /= self.world_size - dist.all_reduce(avg_sigmas, op=dist.ReduceOp.SUM, group=self.process_group) - avg_sigmas /= self.world_size - logger.info(f"Rank {self.rank}: AllReduced and averaged mu/sigma for unnormalization across ranks.") - - # Initialize the final RAW theta tensor (will store per-feature raw thresholds) - theta_raw = torch.zeros_like(theta_sum_norm) - fallback_norm_theta_value = 1e-5 # Fallback for a layer's normalized theta - - for l_idx in range(self.config.num_layers): - layer_theta_sum_norm = theta_sum_norm[l_idx] # Sums of min selected *normalized* preacts - layer_theta_cnt_norm = theta_cnt_norm[l_idx] # Counts for these - - active_mask_layer = layer_theta_cnt_norm > 0 - # Per-feature expected values in NORMALIZED space - per_feature_thetas_norm_layer = torch.full_like(layer_theta_sum_norm, float("inf")) - - if active_mask_layer.any(): - per_feature_thetas_norm_layer[active_mask_layer] = layer_theta_sum_norm[ - active_mask_layer - ] / layer_theta_cnt_norm[active_mask_layer].clamp_min(1.0) - - finite_positive_thetas_norm_layer = per_feature_thetas_norm_layer[ - torch.isfinite(per_feature_thetas_norm_layer) & (per_feature_thetas_norm_layer > 0) - ] - - # SCALAR threshold in NORMALIZED space for this layer - theta_norm_scalar_for_this_layer: float - if finite_positive_thetas_norm_layer.numel() > 0: - theta_norm_scalar_for_this_layer = finite_positive_thetas_norm_layer.mean().item() - logger.info( - f"Rank {self.rank} Layer {l_idx}: Derived normalized theta (scalar, mean of positive active features) = {theta_norm_scalar_for_this_layer:.4e}" - ) - else: - theta_norm_scalar_for_this_layer = fallback_norm_theta_value - logger.warning( - f"Rank {self.rank} Layer {l_idx}: No positive, finite per-feature normalized thetas. Using fallback normalized theta = {theta_norm_scalar_for_this_layer:.4e}" - ) - - # Un-normalize to get RAW thresholds PER FEATURE for this layer - mu_vec_layer = avg_mus[l_idx] # Shape: [num_features] - sigma_vec_layer = avg_sigmas[l_idx].clamp_min(1e-6) # Shape: [num_features], clamp std - - # theta_norm_scalar_for_this_layer will be broadcast - theta_raw_vec_for_layer = theta_norm_scalar_for_this_layer * sigma_vec_layer + mu_vec_layer - theta_raw[l_idx] = theta_raw_vec_for_layer - - if self.rank == 0 and l_idx < 5: # Log first few layers for detail - logger.info( - f"Rank 0 Layer {l_idx}: Normalized Theta_scalar={theta_norm_scalar_for_this_layer:.3e}. Mu (sample): {mu_vec_layer[:3].tolist()}. Sigma (sample): {sigma_vec_layer[:3].tolist()}. 
Raw Theta (sample): {theta_raw_vec_for_layer[:3].tolist()}" - ) - - logger.info(f"Rank {self.rank}: Per-feature raw thresholds computed via unnormalization.") - - # This count is based on NORMALIZED stats. It tells how many features never had normalized stats. - num_norm_feat_no_stats = (theta_cnt_norm == 0).sum().item() + self.theta_manager.convert_to_jumprelu_inplace(default_theta_value=default_theta_value) logger.info( - f"Rank {self.rank}: Number of features that had no BatchTopK stats (norm counts==0) across all layers: {num_norm_feat_no_stats}" + f"Rank {self.rank}: CLT model config updated by ThetaManager. New activation_fn='{self.config.activation_fn}'." ) - - if self.rank == 0: - logger.info(f"Rank {self.rank}: Final RAW Theta stats (per-feature, shape {theta_raw.shape}):") - for l_idx in range(self.config.num_layers): - layer_raw_thetas = theta_raw[l_idx] - logger.info( - f" Layer {l_idx}: min={layer_raw_thetas.min().item():.4e}, mean={layer_raw_thetas.mean().item():.4e}, max={layer_raw_thetas.max().item():.4e}" - ) - try: - import wandb - - if wandb.run: - for l_idx in range(self.config.num_layers): - layer_raw_thetas_for_hist = theta_raw[l_idx].cpu().float() - finite_layer_raw_thetas = layer_raw_thetas_for_hist[ - torch.isfinite(layer_raw_thetas_for_hist) & (layer_raw_thetas_for_hist > 0) - ] # Ensure positive for log10 - if finite_layer_raw_thetas.numel() > 0: - wandb.log( - { - f"debug/theta_layer_{l_idx}_raw_dist_log10": wandb.Histogram( - torch.log10(finite_layer_raw_thetas).tolist() - ) - }, - commit=False, - ) - else: - logger.debug( - f"Rank {self.rank}: Layer {l_idx} had no finite positive raw thetas for histogram." - ) - - # Log overall min/max/mean of all raw thetas - all_raw_thetas_flat = theta_raw.flatten().cpu().float() - finite_all_raw_thetas = all_raw_thetas_flat[ - torch.isfinite(all_raw_thetas_flat) & (all_raw_thetas_flat > 0) - ] - if finite_all_raw_thetas.numel() > 0: - wandb.log( - { - "debug/theta_raw_overall_min_log10": torch.log10(finite_all_raw_thetas.min()).item(), - "debug/theta_raw_overall_max_log10": torch.log10(finite_all_raw_thetas.max()).item(), - "debug/theta_raw_overall_mean_log10": torch.log10(finite_all_raw_thetas.mean()).item(), - }, - commit=False, - ) - - except ImportError: - logger.info("WandB not installed, skipping raw theta distribution logging.") - except (RuntimeError, ValueError) as e: # More specific exceptions for tensor/logging issues - logger.error(f"Rank {self.rank}: Error logging raw theta distributions to WandB: {e}") - - # Clamp final raw thetas before log to ensure they are positive for torch.log - # This primarily handles cases where mu was very negative and sigma very small, pulling a small positive norm_theta negative. - min_final_raw_theta = 1e-7 # Very small positive value - num_clamped_final = (theta_raw < min_final_raw_theta).sum().item() - if num_clamped_final > 0: - logger.warning( - f"Rank {self.rank}: Clamping {num_clamped_final} final raw theta values below {min_final_raw_theta} to {min_final_raw_theta} before taking log." - ) - theta_raw.clamp_min_(min_final_raw_theta) - - log_theta = torch.log(theta_raw) - - # Update config - original_activation_fn = self.config.activation_fn # Store before changing - self.config.activation_fn = "jumprelu" - # The original jumprelu_threshold in config is a scalar, now we have per-feature, per-layer. - # The JumpReLU function itself uses self.log_threshold if available. - # We mark the original config field to signify it's superseded. 
- self.config.jumprelu_threshold = 0.0 # Mark as effectively superseded - - if original_activation_fn == "batchtopk": - self.config.batchtopk_k = None - # batchtopk_straight_through is bool. Set to False as it's no longer actively used. - self.config.batchtopk_straight_through = False - elif original_activation_fn == "topk": - if hasattr(self.config, "topk_k"): # Not in CLTConfig, so check hasattr - del self.config.topk_k # Dynamically added, so can be deleted - if hasattr(self.config, "topk_straight_through"): # Not in CLTConfig - del self.config.topk_straight_through # Dynamically added - - # Create or update self.log_threshold as an nn.Parameter - if not hasattr(self, "log_threshold") or self.log_threshold is None: - self.log_threshold = nn.Parameter(log_theta.to(device=self.device, dtype=self.dtype)) - else: - if not isinstance(self.log_threshold, nn.Parameter): - # If it exists but is not a Parameter, re-assign it as one - self.log_threshold = nn.Parameter( - log_theta.to(device=self.log_threshold.device, dtype=self.log_threshold.dtype) - ) - else: - # Update data in-place, ensuring it's on the correct device and dtype - self.log_threshold.data = log_theta.to(device=self.log_threshold.device, dtype=self.log_threshold.dtype) - - mark_replicated(self.log_threshold) # Mark as replicated after creation or update - - logger.info(f"Rank {self.rank}: Model converted to JumpReLU. activation_fn='{self.config.activation_fn}'.") - if self.rank == 0: - min_log_thresh = ( - self.log_threshold.data.min().item() - if self.log_threshold is not None - and hasattr(self.log_threshold, "data") - and self.log_threshold.data.numel() > 0 - else float("nan") - ) - max_log_thresh = ( - self.log_threshold.data.max().item() - if self.log_threshold is not None - and hasattr(self.log_threshold, "data") - and self.log_threshold.data.numel() > 0 - else float("nan") - ) - mean_log_thresh = ( - self.log_threshold.data.mean().item() - if self.log_threshold is not None - and hasattr(self.log_threshold, "data") - and self.log_threshold.data.numel() > 0 - else float("nan") - ) - logger.info( - f"Rank {self.rank}: Final log_threshold stats: min={min_log_thresh:.4f}, max={max_log_thresh:.4f}, mean={mean_log_thresh:.4f}" - ) diff --git a/clt/models/encoding.py b/clt/models/encoding.py deleted file mode 100644 index 448ab68..0000000 --- a/clt/models/encoding.py +++ /dev/null @@ -1,302 +0,0 @@ -import torch -from typing import Dict, Optional, Tuple, List -import logging -import torch.distributed as dist -from torch.distributed import ProcessGroup - -from clt.config import CLTConfig -from clt.models.activations import BatchTopK, TokenTopK - -# Configure logging (or use existing logger if available) -# It's generally better for the calling module (clt.py) to pass its logger -# or for these functions to use getLogger(__name__) if they are truly standalone. -# For now, let's assume a logger instance is passed or they use their own. -logger = logging.getLogger(__name__) - -# The get_preactivations and _encode_all_layers functions previously here -# have been moved to the clt.models.encoder.Encoder class. 
- - -def _apply_batch_topk_helper( - preactivations_dict: Dict[int, torch.Tensor], - config: CLTConfig, - device: torch.device, - dtype: torch.dtype, - rank: int, # Add rank - process_group: Optional[ProcessGroup], # Add process_group -) -> Dict[int, torch.Tensor]: - """Helper to apply BatchTopK globally across concatenated layer pre-activations.""" - - world_size = 1 - if process_group is not None and dist.is_initialized(): - world_size = dist.get_world_size(process_group) - - if not preactivations_dict: - logger.warning(f"Rank {rank}: _apply_batch_topk_helper received empty preactivations_dict.") - return {} - - # --- 1. Concatenate Preactivations (Original and Normalized) --- - # (Existing concatenation logic remains the same) - # ... (concatenation logic) ... - ordered_preactivations_original: List[torch.Tensor] = [] - ordered_preactivations_normalized: List[torch.Tensor] = [] - layer_feature_sizes: List[Tuple[int, int]] = [] # Store (original_layer_idx, num_features) - - # Determine batch dimension (number of tokens) from the first valid tensor - first_valid_preact = next((p for p in preactivations_dict.values() if p.numel() > 0), None) - if first_valid_preact is None: - logger.warning(f"Rank {rank}: No valid preactivations found in dict for BatchTopK. Returning empty dict.") - # Return structure matching input keys but with empty tensors - return { - layer_idx: torch.empty((0, config.num_features), device=device, dtype=dtype) - for layer_idx in preactivations_dict.keys() - } - batch_tokens_dim = first_valid_preact.shape[0] - - # Ensure consistent ordering and handle missing layers - for layer_idx in range(config.num_layers): - if layer_idx in preactivations_dict: - preact_orig = preactivations_dict[layer_idx] - # Ensure preact_orig is on the correct device/dtype already - preact_orig = preact_orig.to(device=device, dtype=dtype) - - current_num_features = preact_orig.shape[1] if preact_orig.numel() > 0 else config.num_features - - # Handle potentially empty tensors or mismatched batch dims gracefully - if preact_orig.numel() == 0: - if batch_tokens_dim > 0: # Only create zeros if batch dim exists - # Use zeros if empty but expected - zeros_shape = (batch_tokens_dim, current_num_features) - ordered_preactivations_original.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) - ordered_preactivations_normalized.append( - torch.zeros(zeros_shape, device=device, dtype=dtype) - ) # Use zeros for norm too - logger.debug(f"Rank {rank} Layer {layer_idx}: Using zeros shape {zeros_shape} for empty input.") - # else: if batch_tokens_dim is 0, we append nothing, loop continues - elif preact_orig.shape[0] != batch_tokens_dim: - # This case indicates inconsistency, log warning and use zeros - logger.warning( - f"Rank {rank} Layer {layer_idx}: Mismatched batch dim ({preact_orig.shape[0]} vs {batch_tokens_dim}). Using zeros." 
- ) - zeros_shape = (batch_tokens_dim, current_num_features) - ordered_preactivations_original.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) - ordered_preactivations_normalized.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) - else: - # Valid tensor - ordered_preactivations_original.append(preact_orig) - # Normalize for ranking (handle potential division by zero) - mean = preact_orig.mean(dim=0, keepdim=True) - std = preact_orig.std(dim=0, keepdim=True) - preact_norm = (preact_orig - mean) / (std + 1e-6) # Add epsilon for stability - ordered_preactivations_normalized.append(preact_norm) - - layer_feature_sizes.append((layer_idx, current_num_features)) # Track original layer index and its features - # else: Layer not in dict, skip - - if not ordered_preactivations_original: - logger.warning( - f"Rank {rank}: No tensors collected after iterating layers for BatchTopK. Returning empty activations." - ) - # Return structure matching input keys but with empty tensors - return { - layer_idx: torch.empty((batch_tokens_dim, config.num_features), device=device, dtype=dtype) - for layer_idx in preactivations_dict.keys() # Use original keys for structure - } - - concatenated_preactivations_original = torch.cat(ordered_preactivations_original, dim=1) - concatenated_preactivations_normalized = torch.cat(ordered_preactivations_normalized, dim=1) - - # --- 2. Apply BatchTopK using Normalized values for ranking --- - k_val: int - if config.batchtopk_k is not None: - k_val = int(config.batchtopk_k) - else: - # Default to keeping all features if k is not specified - k_val = concatenated_preactivations_original.size(1) - - # --- MODIFIED SECTION: Mask Computation and Broadcast --- - # B = concatenated_preactivations_original.shape[0] # Tokens dim - # F_total_concat = concatenated_preactivations_original.shape[1] - # k_total_batch = min(k_val * B, concatenated_preactivations_original.numel()) # Clamp k - - # Compute mask on rank 0 and broadcast - mask_shape = concatenated_preactivations_original.shape - mask = torch.empty(mask_shape, dtype=torch.bool, device=device) - - if world_size > 1: - if rank == 0: - # Rank 0 computes the mask - local_mask = BatchTopK._compute_mask( - concatenated_preactivations_original, k_val, concatenated_preactivations_normalized - ) - mask.copy_(local_mask) # Copy computed mask to the buffer - # Broadcast the mask tensor from rank 0 - dist.broadcast(mask, src=0, group=process_group) - else: - # Other ranks receive the broadcasted mask - dist.broadcast(mask, src=0, group=process_group) - else: - # Single GPU case: compute mask directly - mask = BatchTopK._compute_mask( - concatenated_preactivations_original, k_val, concatenated_preactivations_normalized - ) - - # Apply the identical mask on all ranks - activated_concatenated = concatenated_preactivations_original * mask.to(dtype) - # --- END MODIFIED SECTION --- - - # --- 3. Split Concatenated Activations back into Dictionary --- - # (Existing splitting logic remains the same) - # ... (splitting logic) ... 
- activations_dict: Dict[int, torch.Tensor] = {} - current_total_feature_offset = 0 - # Use layer_feature_sizes to ensure correct splitting based on original layers/features included - for original_layer_idx, num_features_this_layer in layer_feature_sizes: - # Extract the segment corresponding to this original layer - activated_segment = activated_concatenated[ - :, current_total_feature_offset : current_total_feature_offset + num_features_this_layer - ] - activations_dict[original_layer_idx] = activated_segment - current_total_feature_offset += num_features_this_layer - - # --- Optional: Theta Estimation Update --- - # (Update logic remains the same, uses concatenated tensors before splitting) - # ... (theta update call) ... - - return activations_dict - - -def _apply_token_topk_helper( - preactivations_dict: Dict[int, torch.Tensor], - config: CLTConfig, - device: torch.device, - dtype: torch.dtype, - rank: int, # Add rank - process_group: Optional[ProcessGroup], # Add process_group -) -> Dict[int, torch.Tensor]: - """Helper to apply TokenTopK globally across concatenated layer pre-activations.""" - - world_size = 1 - if process_group is not None and dist.is_initialized(): - world_size = dist.get_world_size(process_group) - - if not preactivations_dict: - logger.warning(f"Rank {rank}: _apply_token_topk_helper received empty preactivations_dict.") - return {} - - # --- 1. Concatenate Preactivations (Original and Normalized) --- - # (Existing concatenation logic, same as BatchTopK helper) - # ... (concatenation logic) ... - ordered_preactivations_original: List[torch.Tensor] = [] - ordered_preactivations_normalized: List[torch.Tensor] = [] - layer_feature_sizes: List[Tuple[int, int]] = [] # Store (original_layer_idx, num_features) - - # Determine batch dimension (number of tokens) from the first valid tensor - first_valid_preact = next((p for p in preactivations_dict.values() if p.numel() > 0), None) - if first_valid_preact is None: - logger.warning(f"Rank {rank}: No valid preactivations found in dict for TokenTopK. Returning empty dict.") - # Return structure matching input keys but with empty tensors - return { - layer_idx: torch.empty((0, config.num_features), device=device, dtype=dtype) - for layer_idx in preactivations_dict.keys() - } - batch_tokens_dim = first_valid_preact.shape[0] - - # Ensure consistent ordering and handle missing layers - for layer_idx in range(config.num_layers): - if layer_idx in preactivations_dict: - preact_orig = preactivations_dict[layer_idx] - # Ensure preact_orig is on the correct device/dtype already - preact_orig = preact_orig.to(device=device, dtype=dtype) - - current_num_features = preact_orig.shape[1] if preact_orig.numel() > 0 else config.num_features - - # Handle potentially empty tensors or mismatched batch dims gracefully - if preact_orig.numel() == 0: - if batch_tokens_dim > 0: - zeros_shape = (batch_tokens_dim, current_num_features) - ordered_preactivations_original.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) - ordered_preactivations_normalized.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) - elif preact_orig.shape[0] != batch_tokens_dim: - logger.warning( - f"Rank {rank} Layer {layer_idx}: Mismatched batch dim ({preact_orig.shape[0]} vs {batch_tokens_dim}) for TokenTopK. Using zeros." 
- ) - zeros_shape = (batch_tokens_dim, current_num_features) - ordered_preactivations_original.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) - ordered_preactivations_normalized.append(torch.zeros(zeros_shape, device=device, dtype=dtype)) - else: - # Valid tensor - ordered_preactivations_original.append(preact_orig) - # Normalize for ranking - mean = preact_orig.mean(dim=0, keepdim=True) - std = preact_orig.std(dim=0, keepdim=True) - preact_norm = (preact_orig - mean) / (std + 1e-6) - ordered_preactivations_normalized.append(preact_norm) - - layer_feature_sizes.append((layer_idx, current_num_features)) # Track original layer index and its features - # else: Layer not in dict, skip - - if not ordered_preactivations_original: - logger.warning( - f"Rank {rank}: No tensors collected after iterating layers for TokenTopK. Returning empty activations." - ) - # Return structure matching input keys but with empty tensors - return { - layer_idx: torch.empty((batch_tokens_dim, config.num_features), device=device, dtype=dtype) - for layer_idx in preactivations_dict.keys() # Use original keys for structure - } - - concatenated_preactivations_original = torch.cat(ordered_preactivations_original, dim=1) - concatenated_preactivations_normalized = torch.cat(ordered_preactivations_normalized, dim=1) - - # --- 2. Apply TokenTopK using Normalized values for ranking --- - k_val_float: float - if hasattr(config, "topk_k") and config.topk_k is not None: - k_val_float = float(config.topk_k) - else: - # Default to keeping all features if k is not specified - k_val_float = float(concatenated_preactivations_original.size(1)) - - # --- MODIFIED SECTION: Mask Computation and Broadcast --- - mask_shape = concatenated_preactivations_original.shape - mask = torch.empty(mask_shape, dtype=torch.bool, device=device) - - if world_size > 1: - if rank == 0: - # Rank 0 computes the mask - local_mask = TokenTopK._compute_mask( # Use TokenTopK's method - concatenated_preactivations_original, - k_val_float, # Pass float k - concatenated_preactivations_normalized, - ) - mask.copy_(local_mask) - # Broadcast the mask tensor from rank 0 - dist.broadcast(mask, src=0, group=process_group) - else: - # Other ranks receive the broadcasted mask - dist.broadcast(mask, src=0, group=process_group) - else: - # Single GPU case: compute mask directly - mask = TokenTopK._compute_mask( # Use TokenTopK's method - concatenated_preactivations_original, k_val_float, concatenated_preactivations_normalized # Pass float k - ) - - # Apply the identical mask on all ranks - activated_concatenated = concatenated_preactivations_original * mask.to(dtype) - # --- END MODIFIED SECTION --- - - # --- 3. Split Concatenated Activations back into Dictionary --- - # (Existing splitting logic, same as BatchTopK helper) - # ... (splitting logic) ... 
- activations_dict: Dict[int, torch.Tensor] = {} - current_total_feature_offset = 0 - # Use layer_feature_sizes to ensure correct splitting based on original layers/features included - for original_layer_idx, num_features_this_layer in layer_feature_sizes: - # Extract the segment corresponding to this original layer - activated_segment = activated_concatenated[ - :, current_total_feature_offset : current_total_feature_offset + num_features_this_layer - ] - activations_dict[original_layer_idx] = activated_segment - current_total_feature_offset += num_features_this_layer - - return activations_dict diff --git a/clt/models/theta.py b/clt/models/theta.py new file mode 100644 index 0000000..a42cff1 --- /dev/null +++ b/clt/models/theta.py @@ -0,0 +1,632 @@ +import torch +import torch.nn as nn +from typing import Dict, Optional, Tuple, List, cast, Callable # Removed Union +import logging +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from clt.config import CLTConfig +from clt.models.activations import JumpReLU, BatchTopK, TokenTopK # Added BatchTopK, TokenTopK +from . import mark_replicated # For marking log_threshold + +logger = logging.getLogger(__name__) + + +class ThetaManager(nn.Module): + """ + Manages the log_threshold parameter for JumpReLU, its estimation, and conversion. + This module also handles the JumpReLU activation function itself. + """ + + log_threshold: Optional[nn.Parameter] + _sum_min_selected_preact: Optional[torch.Tensor] + _count_min_selected_preact: Optional[torch.Tensor] + _avg_layer_means: Optional[torch.Tensor] + _avg_layer_stds: Optional[torch.Tensor] + _processed_batches_for_stats: Optional[torch.Tensor] + + def __init__( + self, + config: CLTConfig, # Needs full CLTConfig for num_layers, features, jumprelu_threshold etc. + process_group: Optional[ProcessGroup], + device: torch.device, + dtype: torch.dtype, + # Add any other necessary parameters from CrossLayerTranscoder like encoder_module if needed for estimate_theta_posthoc + ): + super().__init__() + self.config = config + self.process_group = process_group + self.device = device + self.dtype = dtype + self.bandwidth = 1.0 # As it was in CrossLayerTranscoder for jumprelu + + if process_group is None or not dist.is_initialized(): + self.world_size = 1 + self.rank = 0 + else: + self.world_size = dist.get_world_size(process_group) + self.rank = dist.get_rank(process_group) + + if self.config.activation_fn == "jumprelu": + initial_threshold_val = torch.ones( + config.num_layers, config.num_features, device=self.device, dtype=self.dtype + ) * torch.log(torch.tensor(config.jumprelu_threshold, device=self.device, dtype=self.dtype)) + self.log_threshold = nn.Parameter(initial_threshold_val) + mark_replicated(self.log_threshold) + else: + self.log_threshold = None + + # Register buffers for theta estimation + self.register_buffer("_sum_min_selected_preact", None, persistent=False) + self.register_buffer("_count_min_selected_preact", None, persistent=False) + self.register_buffer("_avg_layer_means", None, persistent=False) + self.register_buffer("_avg_layer_stds", None, persistent=False) + self.register_buffer("_processed_batches_for_stats", None, persistent=False) + + def jumprelu(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: + """Apply JumpReLU activation function for a specific layer.""" + if self.log_threshold is None: + logger.error(f"Rank {self.rank}: log_threshold not initialized for JumpReLU. 
Returning input.") + return x.to(device=self.device, dtype=self.dtype) # Ensure output device/dtype + if layer_idx >= self.log_threshold.shape[0]: + logger.error(f"Rank {self.rank}: Invalid layer_idx {layer_idx} for log_threshold. Returning input.") + return x.to(device=self.device, dtype=self.dtype) # Ensure output device/dtype + threshold = torch.exp(self.log_threshold[layer_idx]).to(device=self.device, dtype=self.dtype) + return cast(torch.Tensor, JumpReLU.apply(x, threshold, self.bandwidth)) + + @torch.no_grad() + def _update_min_selected_preactivations( + self, + concatenated_preactivations_original: torch.Tensor, + activated_concatenated: torch.Tensor, + layer_feature_sizes: List[Tuple[int, int]], + ): + """ + Updates the _sum_min_selected_preact and _count_min_selected_preact buffers + with minimum pre-activation values for features selected by BatchTopK/TokenTopK. + This function operates with no_grad. + Buffers are attributes of self (ThetaManager instance). + """ + if ( + not hasattr(self, "_sum_min_selected_preact") + or self._sum_min_selected_preact is None + or not hasattr(self, "_count_min_selected_preact") + or self._count_min_selected_preact is None + ): + # This check is primarily for safety; buffers are initialized in __init__. + # However, if called when activation_fn isn't batchtopk/topk (where estimation might not run), + # it's good to be cautious. The main guard is in estimate_theta_posthoc. + if self.config.activation_fn in ["batchtopk", "topk"]: + logger.warning( + f"Rank {self.rank}: ThetaManager running stats buffers not found or None. Skipping theta update contribution." + ) + return + + assert self._sum_min_selected_preact is not None and isinstance( + self._sum_min_selected_preact, torch.Tensor + ), f"Rank {self.rank}: ThetaManager's _sum_min_selected_preact is not a Tensor or is None." + assert self._count_min_selected_preact is not None and isinstance( + self._count_min_selected_preact, torch.Tensor + ), f"Rank {self.rank}: ThetaManager's _count_min_selected_preact is not a Tensor or is None." + + current_total_feature_offset = 0 + for i, (original_layer_idx, num_features_this_layer) in enumerate(layer_feature_sizes): + if original_layer_idx >= self._sum_min_selected_preact.shape[0]: + logger.warning( + f"Rank {self.rank}: Invalid original_layer_idx {original_layer_idx} for _min_selected_preact update. Skipping layer." 
+ ) + current_total_feature_offset += num_features_this_layer + continue + + preact_orig_this_layer = concatenated_preactivations_original[ + :, current_total_feature_offset : current_total_feature_offset + num_features_this_layer + ] + gated_acts_segment = activated_concatenated[ + :, current_total_feature_offset : current_total_feature_offset + num_features_this_layer + ] + + if gated_acts_segment.shape == preact_orig_this_layer.shape: + mask_active = gated_acts_segment > 0 + if mask_active.any(): + masked_preact = torch.where( + mask_active, + preact_orig_this_layer, + torch.full_like(preact_orig_this_layer, float("inf")), + ) + per_feature_min_this_batch = masked_preact.amin(dim=0) + if logger.isEnabledFor(logging.DEBUG): + finite_mins_for_log = per_feature_min_this_batch[torch.isfinite(per_feature_min_this_batch)] + if finite_mins_for_log.numel() > 0: + logger.debug( + f"Rank {self.rank} Layer {original_layer_idx}: per_feature_min_this_batch (finite values for log) " + f"min={finite_mins_for_log.min().item():.4f}, " + f"max={finite_mins_for_log.max().item():.4f}, " + f"mean={finite_mins_for_log.mean().item():.4f}, " + f"median={torch.median(finite_mins_for_log).item():.4f}" + ) + else: + logger.debug( + f"Rank {self.rank} Layer {original_layer_idx}: No finite per_feature_min_this_batch values to log stats for." + ) + original_preacts_leading_to_positive_gated = preact_orig_this_layer[mask_active] + if original_preacts_leading_to_positive_gated.numel() > 0: + num_negative_contrib = (original_preacts_leading_to_positive_gated < 0).sum().item() + if num_negative_contrib > 0: + logger.debug( + f"Rank {self.rank} Layer {original_layer_idx}: {num_negative_contrib} negative original pre-activations " + f"(out of {mask_active.sum().item()} active selections) contributed to theta estimation via positive gated_acts_segment." + ) + valid_mask = torch.isfinite(per_feature_min_this_batch) + self._sum_min_selected_preact[original_layer_idx, valid_mask] += per_feature_min_this_batch[ + valid_mask + ] + self._count_min_selected_preact[original_layer_idx, valid_mask] += 1 + else: + logger.warning( + f"Rank {self.rank}: Shape mismatch for theta update, layer {original_layer_idx}. " + f"Original: {preact_orig_this_layer.shape}, Gated: {gated_acts_segment.shape}" + ) + current_total_feature_offset += num_features_this_layer + + @torch.no_grad() + def estimate_theta_posthoc( + self, + encode_all_layers_fn: Callable[ + [Dict[int, torch.Tensor]], Tuple[Dict[int, torch.Tensor], List[Tuple[int, int, int]]] + ], + data_iter: torch.utils.data.IterableDataset, + num_batches: Optional[int] = None, + default_theta_value: float = 1e6, + # Removed target_device, as self.device is used. The caller should ensure ThetaManager is on the correct device. + ) -> torch.Tensor: + """Estimate theta post-hoc using a specified number of batches.""" + logger.info(f"Rank {self.rank}: Starting post-hoc theta estimation on device {self.device}.") + # No self.to(target_device) needed here, assumes ThetaManager is already on the correct device. + # self.eval() is not applicable as ThetaManager itself doesn't have training/eval mode in the same way as a full model. 
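+ # The buffers reset below accumulate statistics over the estimation batches:
+ #   _sum_min_selected_preact / _count_min_selected_preact: per-(layer, feature) running sum and count of the batch-wise minimum pre-activation among tokens where the feature was selected by BatchTopK/TokenTopK.
+ #   _avg_layer_means / _avg_layer_stds: per-layer running sums of batch means/stds of the pre-activations (averaged over the batch count at the end).
+ #   _processed_batches_for_stats: number of batches that contributed stats for each layer.
+ # convert_to_jumprelu_inplace() later combines these into per-feature raw JumpReLU thresholds.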
+ + if not hasattr(self, "_sum_min_selected_preact") or self._sum_min_selected_preact is None: + self._sum_min_selected_preact = torch.zeros( + (self.config.num_layers, self.config.num_features), + dtype=self.dtype, + device=self.device, + ) + else: + self._sum_min_selected_preact.data.zero_() + + if not hasattr(self, "_count_min_selected_preact") or self._count_min_selected_preact is None: + self._count_min_selected_preact = torch.zeros( + (self.config.num_layers, self.config.num_features), + dtype=self.dtype, + device=self.device, + ) + else: + self._count_min_selected_preact.data.zero_() + + buffer_shape = (self.config.num_layers, self.config.num_features) + if not hasattr(self, "_avg_layer_means") or self._avg_layer_means is None: + self._avg_layer_means = torch.zeros(buffer_shape, dtype=self.dtype, device=self.device) + else: + self._avg_layer_means.data.zero_() + + if not hasattr(self, "_avg_layer_stds") or self._avg_layer_stds is None: + self._avg_layer_stds = torch.zeros(buffer_shape, dtype=self.dtype, device=self.device) + else: + self._avg_layer_stds.data.zero_() + + if not hasattr(self, "_processed_batches_for_stats") or self._processed_batches_for_stats is None: + self._processed_batches_for_stats = torch.zeros( + self.config.num_layers, dtype=torch.long, device=self.device + ) + else: + self._processed_batches_for_stats.data.zero_() + + processed_batches_total = 0 + try: + from tqdm.auto import tqdm # type: ignore + + iterable_data_iter = ( + tqdm(data_iter, total=num_batches, desc=f"Estimating Theta & Stats (Rank {self.rank})") + if num_batches + else tqdm(data_iter, desc=f"Estimating Theta & Stats (Rank {self.rank})") + ) + except ImportError: + logger.info("tqdm not found, proceeding without progress bar for theta estimation.") + iterable_data_iter = data_iter + + for inputs_batch, _ in iterable_data_iter: + if num_batches is not None and processed_batches_total >= num_batches: + break + + inputs_on_device = {k: v.to(device=self.device, dtype=self.dtype) for k, v in inputs_batch.items()} + preactivations_dict, _ = encode_all_layers_fn(inputs_on_device) + + if not preactivations_dict: + logger.warning(f"Rank {self.rank}: No preactivations. Skipping batch {processed_batches_total + 1}.") + processed_batches_total += 1 + continue + + first_valid_preact = next((p for p in preactivations_dict.values() if p.numel() > 0), None) + if first_valid_preact is None: + logger.warning( + f"Rank {self.rank}: All preactivations empty. Skipping batch {processed_batches_total + 1}." + ) + processed_batches_total += 1 + continue + + ordered_preactivations_original_posthoc: List[torch.Tensor] = [] + ordered_preactivations_normalized_posthoc: List[torch.Tensor] = [] + layer_feature_sizes_posthoc: List[Tuple[int, int]] = [] + batch_tokens_dim_posthoc = first_valid_preact.shape[0] + + for layer_idx_loop in range(self.config.num_layers): + num_feat_for_layer: int + mean_loop: Optional[torch.Tensor] = None + std_loop: Optional[torch.Tensor] = None + preact_norm_loop: Optional[torch.Tensor] = None + + if layer_idx_loop in preactivations_dict: + preact_orig_loop = preactivations_dict[layer_idx_loop] + num_feat_for_layer = ( + preact_orig_loop.shape[1] if preact_orig_loop.numel() > 0 else self.config.num_features + ) + + if preact_orig_loop.shape[0] != batch_tokens_dim_posthoc and preact_orig_loop.numel() > 0: + logger.warning( + f"Rank {self.rank} Layer {layer_idx_loop}: Mismatched token dim (expected {batch_tokens_dim_posthoc}, got {preact_orig_loop.shape[0]}). Using zeros." 
+ ) + mean_loop = torch.zeros((1, num_feat_for_layer), device=self.device, dtype=self.dtype) + std_loop = torch.ones((1, num_feat_for_layer), device=self.device, dtype=self.dtype) + preact_norm_loop = torch.zeros( + (batch_tokens_dim_posthoc, num_feat_for_layer), device=self.device, dtype=self.dtype + ) + ordered_preactivations_original_posthoc.append( + torch.zeros( + (batch_tokens_dim_posthoc, num_feat_for_layer), device=self.device, dtype=self.dtype + ) + ) + ordered_preactivations_normalized_posthoc.append(preact_norm_loop) + elif preact_orig_loop.numel() == 0 and batch_tokens_dim_posthoc > 0: + mean_loop = torch.zeros((1, num_feat_for_layer), device=self.device, dtype=self.dtype) + std_loop = torch.ones((1, num_feat_for_layer), device=self.device, dtype=self.dtype) + preact_norm_loop = torch.zeros( + (batch_tokens_dim_posthoc, num_feat_for_layer), device=self.device, dtype=self.dtype + ) + ordered_preactivations_original_posthoc.append( + torch.zeros( + (batch_tokens_dim_posthoc, num_feat_for_layer), device=self.device, dtype=self.dtype + ) + ) + ordered_preactivations_normalized_posthoc.append(preact_norm_loop) + elif preact_orig_loop.numel() > 0: + mean_loop = preact_orig_loop.mean(dim=0, keepdim=True) + std_loop = preact_orig_loop.std(dim=0, keepdim=True) + preact_norm_loop = (preact_orig_loop - mean_loop) / (std_loop + 1e-6) + ordered_preactivations_original_posthoc.append(preact_orig_loop) + ordered_preactivations_normalized_posthoc.append(preact_norm_loop) + assert ( + self._avg_layer_means is not None + and self._avg_layer_stds is not None + and self._processed_batches_for_stats is not None + ) + self._avg_layer_means.data[layer_idx_loop] += mean_loop.squeeze().clone() + self._avg_layer_stds.data[layer_idx_loop] += std_loop.squeeze().clone() + self._processed_batches_for_stats.data[layer_idx_loop] += 1 + else: + num_feat_for_layer = self.config.num_features + else: + num_feat_for_layer = self.config.num_features + if batch_tokens_dim_posthoc > 0: + ordered_preactivations_original_posthoc.append( + torch.zeros( + (batch_tokens_dim_posthoc, num_feat_for_layer), device=self.device, dtype=self.dtype + ) + ) + ordered_preactivations_normalized_posthoc.append( + torch.zeros( + (batch_tokens_dim_posthoc, num_feat_for_layer), device=self.device, dtype=self.dtype + ) + ) + layer_feature_sizes_posthoc.append((layer_idx_loop, num_feat_for_layer)) + + if not ordered_preactivations_normalized_posthoc or not any( + t.numel() > 0 for t in ordered_preactivations_normalized_posthoc + ): + logger.warning( + f"Rank {self.rank}: No normalized preactivations. Skipping batch {processed_batches_total + 1}." + ) + processed_batches_total += 1 + continue + + if not ordered_preactivations_original_posthoc or not any( + t.numel() > 0 for t in ordered_preactivations_original_posthoc + ): + concatenated_preactivations_for_gating = torch.cat(ordered_preactivations_normalized_posthoc, dim=1) + logger.debug( + f"Rank {self.rank} Batch {processed_batches_total + 1}: Using normalized preactivations for gating due to empty/all-empty original list." 
+ ) + else: + concatenated_preactivations_for_gating = torch.cat(ordered_preactivations_original_posthoc, dim=1) + + concatenated_preactivations_for_ranking = torch.cat(ordered_preactivations_normalized_posthoc, dim=1) + + activated_concatenated_posthoc: Optional[torch.Tensor] = None + if self.config.activation_fn == "batchtopk": + k_val_int = ( + int(self.config.batchtopk_k) + if self.config.batchtopk_k is not None + else concatenated_preactivations_for_gating.size(1) + ) + straight_through_btk = self.config.batchtopk_straight_through + activated_concatenated_posthoc = BatchTopK.apply( + concatenated_preactivations_for_gating, + float(k_val_int), + straight_through_btk, + concatenated_preactivations_for_ranking, + ) + elif self.config.activation_fn == "topk": + if not hasattr(self.config, "topk_k") or self.config.topk_k is None: + logger.error( + f"Rank {self.rank}: 'topk_k' not found in config for 'topk' activation during theta estimation. Defaulting to all features for this batch." + ) + k_val_float = float(concatenated_preactivations_for_gating.size(1)) + else: + k_val_float = float(self.config.topk_k) + + straight_through_tk = getattr(self.config, "topk_straight_through", True) + activated_concatenated_posthoc = TokenTopK.apply( + concatenated_preactivations_for_gating, + k_val_float, + straight_through_tk, + concatenated_preactivations_for_ranking, + ) + else: + logger.error( + f"Rank {self.rank}: Unsupported activation_fn '{self.config.activation_fn}' for theta estimation. Cannot determine gating mechanism. Using zeros for activated_concatenated_posthoc." + ) + activated_concatenated_posthoc = torch.zeros_like(concatenated_preactivations_for_gating) + + if activated_concatenated_posthoc is not None: + self._update_min_selected_preactivations( + concatenated_preactivations_for_ranking, + activated_concatenated_posthoc, + layer_feature_sizes_posthoc, + ) + processed_batches_total += 1 + + logger.info( + f"Rank {self.rank}: Processed {processed_batches_total} batches for theta estimation and stats accumulation." + ) + assert ( + self._processed_batches_for_stats is not None + and self._avg_layer_means is not None + and self._avg_layer_stds is not None + ) + if ( + self._processed_batches_for_stats is not None + and self._avg_layer_means is not None + and self._avg_layer_stds is not None + ): + active_stat_batches = self._processed_batches_for_stats.data.unsqueeze(-1).clamp_min(1.0) + self._avg_layer_means.data /= active_stat_batches + self._avg_layer_stds.data /= active_stat_batches + logger.info(f"Rank {self.rank}: Averaged layer-wise normalization stats computed.") + else: + logger.warning(f"Rank {self.rank}: Could not finalize normalization stats, buffers missing.") + + self.convert_to_jumprelu_inplace(default_theta_value=default_theta_value) + + # Buffers are part of the module, no need to delete them here unless they are truly temporary and not nn.Buffer + # Since they are registered buffers, they persist with the module unless explicitly deleted. + + logger.info(f"Rank {self.rank}: Post-hoc theta estimation and conversion to JumpReLU complete.") + if self.log_threshold is not None and hasattr(self.log_threshold, "data"): + return torch.exp(self.log_threshold.data) + else: + logger.warning( + f"Rank {self.rank}: log_threshold not available for returning estimated theta. Returning empty tensor." 
+ ) + return torch.empty(0, device=self.device, dtype=self.dtype) + + @torch.no_grad() + def convert_to_jumprelu_inplace(self, default_theta_value: float = 1e6) -> None: + """ + Converts the model to use JumpReLU activation based on learned BatchTopK/TokenTopK thresholds. + This method updates the ThetaManager's self.config and self.log_threshold parameter. + """ + if self.config.activation_fn not in ["batchtopk", "topk"]: + logger.warning( + f"Rank {self.rank}: Model original activation_fn was {self.config.activation_fn}, not batchtopk or topk. " + "Skipping conversion to JumpReLU based on learned thetas." + ) + if self.config.activation_fn == "relu": + logger.error(f"Rank {self.rank}: Model is ReLU, cannot convert to JumpReLU via learned thetas.") + return + + required_buffers = [ + "_sum_min_selected_preact", + "_count_min_selected_preact", + "_avg_layer_means", + "_avg_layer_stds", + ] + for buf_name in required_buffers: + if not hasattr(self, buf_name) or getattr(self, buf_name) is None: + raise RuntimeError( + f"Rank {self.rank}: Required buffer {buf_name} for JumpReLU conversion not found or not populated. " + "Run estimate_theta_posthoc() with appropriate settings before converting." + ) + assert self._sum_min_selected_preact is not None and self._count_min_selected_preact is not None + assert self._avg_layer_means is not None and self._avg_layer_stds is not None + + logger.info( + f"Rank {self.rank}: Starting conversion of {self.config.activation_fn} model to JumpReLU (per-layer avg norm. theta, then unnormalize)." + ) + + theta_sum_norm = self._sum_min_selected_preact.clone() + theta_cnt_norm = self._count_min_selected_preact.clone() + avg_mus = self._avg_layer_means.clone() + avg_sigmas = self._avg_layer_stds.clone() + + if self.process_group is not None and dist.is_initialized() and self.world_size > 1: + dist.all_reduce(theta_sum_norm, op=dist.ReduceOp.SUM, group=self.process_group) + dist.all_reduce(theta_cnt_norm, op=dist.ReduceOp.SUM, group=self.process_group) + dist.all_reduce(avg_mus, op=dist.ReduceOp.SUM, group=self.process_group) + avg_mus /= self.world_size + dist.all_reduce(avg_sigmas, op=dist.ReduceOp.SUM, group=self.process_group) + avg_sigmas /= self.world_size + logger.info(f"Rank {self.rank}: AllReduced and averaged mu/sigma for unnormalization across ranks.") + + theta_raw = torch.zeros_like(theta_sum_norm) + fallback_norm_theta_value = 1e-5 + + for l_idx in range(self.config.num_layers): + layer_theta_sum_norm = theta_sum_norm[l_idx] + layer_theta_cnt_norm = theta_cnt_norm[l_idx] + active_mask_layer = layer_theta_cnt_norm > 0 + per_feature_thetas_norm_layer = torch.full_like(layer_theta_sum_norm, float("inf")) + + if active_mask_layer.any(): + per_feature_thetas_norm_layer[active_mask_layer] = layer_theta_sum_norm[ + active_mask_layer + ] / layer_theta_cnt_norm[active_mask_layer].clamp_min(1.0) + + finite_positive_thetas_norm_layer = per_feature_thetas_norm_layer[ + torch.isfinite(per_feature_thetas_norm_layer) & (per_feature_thetas_norm_layer > 0) + ] + + theta_norm_scalar_for_this_layer: float + if finite_positive_thetas_norm_layer.numel() > 0: + theta_norm_scalar_for_this_layer = finite_positive_thetas_norm_layer.mean().item() + logger.info( + f"Rank {self.rank} Layer {l_idx}: Derived normalized theta (scalar, mean of positive active features) = {theta_norm_scalar_for_this_layer:.4e}" + ) + else: + theta_norm_scalar_for_this_layer = fallback_norm_theta_value + logger.warning( + f"Rank {self.rank} Layer {l_idx}: No positive, finite per-feature normalized 
thetas. Using fallback normalized theta = {theta_norm_scalar_for_this_layer:.4e}" + ) + + mu_vec_layer = avg_mus[l_idx] + sigma_vec_layer = avg_sigmas[l_idx].clamp_min(1e-6) + theta_raw_vec_for_layer = theta_norm_scalar_for_this_layer * sigma_vec_layer + mu_vec_layer + theta_raw[l_idx] = theta_raw_vec_for_layer + + if self.rank == 0 and l_idx < 5: + logger.info( + f"Rank 0 Layer {l_idx}: Normalized Theta_scalar={theta_norm_scalar_for_this_layer:.3e}. Mu (sample): {mu_vec_layer[:3].tolist()}. Sigma (sample): {sigma_vec_layer[:3].tolist()}. Raw Theta (sample): {theta_raw_vec_for_layer[:3].tolist()}" + ) + logger.info(f"Rank {self.rank}: Per-feature raw thresholds computed via unnormalization.") + + num_norm_feat_no_stats = (theta_cnt_norm == 0).sum().item() + logger.info( + f"Rank {self.rank}: Number of features that had no BatchTopK/TokenTopK stats (norm counts==0) across all layers: {num_norm_feat_no_stats}" + ) + if self.rank == 0: + logger.info(f"Rank {self.rank}: Final RAW Theta stats (per-feature, shape {theta_raw.shape}):") + for l_idx in range(self.config.num_layers): + layer_raw_thetas = theta_raw[l_idx] + logger.info( + f" Layer {l_idx}: min={layer_raw_thetas.min().item():.4e}, mean={layer_raw_thetas.mean().item():.4e}, max={layer_raw_thetas.max().item():.4e}" + ) + try: + import wandb + + if wandb.run: + for l_idx in range(self.config.num_layers): + layer_raw_thetas_for_hist = theta_raw[l_idx].cpu().float() + finite_layer_raw_thetas = layer_raw_thetas_for_hist[ + torch.isfinite(layer_raw_thetas_for_hist) & (layer_raw_thetas_for_hist > 0) + ] + if finite_layer_raw_thetas.numel() > 0: + wandb.log( + { + f"debug/theta_layer_{l_idx}_raw_dist_log10": wandb.Histogram( + torch.log10(finite_layer_raw_thetas).tolist() + ) + }, + commit=False, + ) + else: + logger.debug( + f"Rank {self.rank}: Layer {l_idx} had no finite positive raw thetas for histogram." + ) + all_raw_thetas_flat = theta_raw.flatten().cpu().float() + finite_all_raw_thetas = all_raw_thetas_flat[ + torch.isfinite(all_raw_thetas_flat) & (all_raw_thetas_flat > 0) + ] + if finite_all_raw_thetas.numel() > 0: + wandb.log( + { + "debug/theta_raw_overall_min_log10": torch.log10(finite_all_raw_thetas.min()).item(), + "debug/theta_raw_overall_max_log10": torch.log10(finite_all_raw_thetas.max()).item(), + "debug/theta_raw_overall_mean_log10": torch.log10(finite_all_raw_thetas.mean()).item(), + }, + commit=False, + ) + except ImportError: + logger.info("WandB not installed, skipping raw theta distribution logging.") + except (RuntimeError, ValueError) as e: + logger.error(f"Rank {self.rank}: Error logging raw theta distributions to WandB: {e}") + + min_final_raw_theta = 1e-7 + num_clamped_final = (theta_raw < min_final_raw_theta).sum().item() + if num_clamped_final > 0: + logger.warning( + f"Rank {self.rank}: Clamping {num_clamped_final} final raw theta values below {min_final_raw_theta} to {min_final_raw_theta} before taking log." 
+ ) + theta_raw.clamp_min_(min_final_raw_theta) + + log_theta_data = torch.log(theta_raw) # Changed name to avoid conflict with self.log_threshold + + original_activation_fn = self.config.activation_fn + self.config.activation_fn = "jumprelu" + self.config.jumprelu_threshold = 0.0 # Mark as effectively superseded + + if original_activation_fn == "batchtopk": + self.config.batchtopk_k = None + self.config.batchtopk_straight_through = False + elif original_activation_fn == "topk": + if hasattr(self.config, "topk_k"): + del self.config.topk_k + if hasattr(self.config, "topk_straight_through"): + del self.config.topk_straight_through + + if not hasattr(self, "log_threshold") or self.log_threshold is None: + self.log_threshold = nn.Parameter(log_theta_data.to(device=self.device, dtype=self.dtype)) + else: + if not isinstance(self.log_threshold, nn.Parameter): + self.log_threshold = nn.Parameter( + log_theta_data.to(device=self.log_threshold.device, dtype=self.log_threshold.dtype) + ) + else: + self.log_threshold.data = log_theta_data.to( + device=self.log_threshold.device, dtype=self.log_threshold.dtype + ) + + mark_replicated(self.log_threshold) + + logger.info(f"Rank {self.rank}: Model converted to JumpReLU. activation_fn='{self.config.activation_fn}'.") + if self.rank == 0: + min_log_thresh = ( + self.log_threshold.data.min().item() + if self.log_threshold is not None + and hasattr(self.log_threshold, "data") + and self.log_threshold.data.numel() > 0 + else float("nan") + ) + max_log_thresh = ( + self.log_threshold.data.max().item() + if self.log_threshold is not None + and hasattr(self.log_threshold, "data") + and self.log_threshold.data.numel() > 0 + else float("nan") + ) + mean_log_thresh = ( + self.log_threshold.data.mean().item() + if self.log_threshold is not None + and hasattr(self.log_threshold, "data") + and self.log_threshold.data.numel() > 0 + else float("nan") + ) + logger.info( + f"Rank {self.rank}: Final log_threshold stats: min={min_log_thresh:.4f}, max={max_log_thresh:.4f}, mean={mean_log_thresh:.4f}" + ) From ba9145cadb6c1237438457c2ed03e24f4afbf770 Mon Sep 17 00:00:00 2001 From: Curt Tigges Date: Fri, 23 May 2025 15:07:48 -0700 Subject: [PATCH 4/4] added new unit tests for activations --- .github/workflows/python-tests.yml | 45 + clt/models/activations.py | 16 +- clt/training/trainer.py | 2 - tests/integration/test_activation_store.py | 200 ---- tests/integration/test_config_variants.py | 218 ---- tests/integration/test_pretrained_model.py | 173 ---- tests/integration/test_training_pipeline.py | 167 ---- tests/models/test_clt_distributed_forward.py | 132 +++ tests/unit/models/test_activations.py | 134 --- tests/unit/models/test_base.py | 116 --- tests/unit/models/test_clt.py | 616 ------------ tests/unit/test_activation_registry.py | 204 ++++ tests/unit/test_activations.py | 429 ++++++++ tests/unit/training/test_data.py | 731 -------------- tests/unit/training/test_evaluator.py | 592 ----------- tests/unit/training/test_losses.py | 672 ------------- tests/unit/training/test_trainer.py | 987 ------------------- 17 files changed, 824 insertions(+), 4610 deletions(-) create mode 100644 .github/workflows/python-tests.yml delete mode 100644 tests/integration/test_activation_store.py delete mode 100644 tests/integration/test_config_variants.py delete mode 100644 tests/integration/test_pretrained_model.py delete mode 100644 tests/integration/test_training_pipeline.py create mode 100644 tests/models/test_clt_distributed_forward.py delete mode 100644 
tests/unit/models/test_activations.py delete mode 100644 tests/unit/models/test_base.py delete mode 100644 tests/unit/models/test_clt.py create mode 100644 tests/unit/test_activation_registry.py create mode 100644 tests/unit/test_activations.py delete mode 100644 tests/unit/training/test_data.py delete mode 100644 tests/unit/training/test_evaluator.py delete mode 100644 tests/unit/training/test_losses.py delete mode 100644 tests/unit/training/test_trainer.py diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml new file mode 100644 index 0000000..0bac632 --- /dev/null +++ b/.github/workflows/python-tests.yml @@ -0,0 +1,45 @@ +name: Python Tests + +on: + push: + branches: + - main + - develop # Or your primary development branch + pull_request: + branches: + - main + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] # Specify python versions + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.local/bin" >> $GITHUB_PATH + # Alternatively, if not using Poetry, or have a requirements.txt: + # run: pip install -r requirements.txt + + - name: Install dependencies + run: poetry install --no-interaction --no-root + # If you have dev dependencies for pytest, e.g. in a [tool.poetry.group.dev.dependencies] + # run: poetry install --no-interaction --no-root --with dev + # Or if using pip with requirements.txt: + # run: pip install -r requirements-dev.txt # (if you have a separate dev requirements) + # run: pip install pytest # or ensure pytest is in your main requirements + + - name: Run tests with pytest + run: poetry run pytest tests/ + # Or if not using poetry: + # run: pytest tests/ \ No newline at end of file diff --git a/clt/models/activations.py b/clt/models/activations.py index 3901782..b48df5d 100644 --- a/clt/models/activations.py +++ b/clt/models/activations.py @@ -197,12 +197,24 @@ def backward(ctx, *grad_outputs: torch.Tensor) -> Tuple[Optional[torch.Tensor], grad_threshold_per_element = grad_output * local_grad_theta if grad_threshold_per_element.dim() > threshold.dim(): + # Handles cases like input (B,F), threshold (F) or input (F), threshold (scalar) dims_to_sum = tuple(range(grad_threshold_per_element.dim() - threshold.dim())) grad_threshold = grad_threshold_per_element.sum(dim=dims_to_sum) - if threshold.shape != torch.Size([]): + # Ensure final shape matches threshold, especially if sum squeezed dimensions + if grad_threshold.shape != threshold.shape: grad_threshold = grad_threshold.reshape(threshold.shape) - else: + elif grad_threshold_per_element.dim() == threshold.dim(): + # Handles cases like input (F), threshold (F), or input [1], threshold [1] + grad_threshold = grad_threshold_per_element + # Defensive reshape, though shapes should ideally match here. + if grad_threshold.shape != threshold.shape: + grad_threshold = grad_threshold.reshape(threshold.shape) + else: # grad_threshold_per_element.dim() < threshold.dim() + # This case is less common (e.g. input scalar, threshold vector - not typical for this op). + # Defaulting to sum and reshape, primarily for scalar threshold case. 
grad_threshold = grad_threshold_per_element.sum() + if grad_threshold.shape != threshold.shape: + grad_threshold = grad_threshold.reshape(threshold.shape) return grad_input, grad_threshold, None diff --git a/clt/training/trainer.py b/clt/training/trainer.py index 359e96c..cf8dae6 100644 --- a/clt/training/trainer.py +++ b/clt/training/trainer.py @@ -580,8 +580,6 @@ def train(self, eval_every: int = 1000) -> CrossLayerTranscoder: tok_cnt_t = torch.tensor([tok_cnt], device=self.device) gathered = [torch.zeros_like(tok_cnt_t) for _ in range(self.world_size)] dist.all_gather(gathered, tok_cnt_t) - if self.rank == 0: - print("Batch token-count per rank:", [int(x.item()) for x in gathered]) except StopIteration: # Rank 0 prints message diff --git a/tests/integration/test_activation_store.py b/tests/integration/test_activation_store.py deleted file mode 100644 index 9f26dc3..0000000 --- a/tests/integration/test_activation_store.py +++ /dev/null @@ -1,200 +0,0 @@ -import pytest -import torch -import os -import tempfile -import shutil -import numpy as np - -from clt.training.data import ActivationStore - - -@pytest.fixture -def temp_data_dir(): - """Create a temporary directory for test data files.""" - temp_dir = tempfile.mkdtemp(prefix="clt_integration_test_data_") - yield temp_dir - shutil.rmtree(temp_dir) - - -@pytest.fixture -def dummy_nnsight_activations(): - """Create dummy NNsight-format activations for testing.""" - return { - "model.layers.0.mlp_in_0": [torch.randn(50, 32), torch.randn(50, 32)], - "model.layers.1.mlp_in_1": [torch.randn(50, 32), torch.randn(50, 32)], - "model.layers.0.mlp_out_0": [torch.randn(50, 32), torch.randn(50, 32)], - "model.layers.1.mlp_out_1": [torch.randn(50, 32), torch.randn(50, 32)], - } - - -@pytest.fixture -def saved_activation_files(temp_data_dir, dummy_nnsight_activations): - """Save dummy activations to disk and return the path.""" - # Save the activation tensor lists as separate files - file_paths = {} - - for key, tensor_list in dummy_nnsight_activations.items(): - # Combine the tensors into one tensor for simplicity - combined = torch.cat(tensor_list, dim=0) - - # Create a path for this activation - file_name = f"{key.replace('.', '_')}.pt" - file_path = os.path.join(temp_data_dir, file_name) - - # Save the tensor - torch.save(combined, file_path) - file_paths[key] = file_path - - # Save a metadata file with the structure - metadata = {"layer_indices": [0, 1], "d_model": 32, "num_tokens": 100} # 50 + 50 - metadata_path = os.path.join(temp_data_dir, "metadata.pt") - torch.save(metadata, metadata_path) - - return {"files": file_paths, "metadata": metadata_path, "dir": temp_data_dir} - - -@pytest.mark.integration -def test_activation_store_from_nnsight(dummy_nnsight_activations): - """Test creating an ActivationStore from NNsight-formatted activations.""" - batch_size = 8 - - # Create store using the class method - store = ActivationStore.from_nnsight_activations( - dummy_nnsight_activations, batch_size=batch_size, normalize=True - ) - - # Verify the store was created correctly - assert store.num_layers == 2 - assert store.num_tokens == 100 # 50 + 50 from two batches - assert store.batch_size == batch_size - - # Check layer indices - assert set(store.layer_indices) == {0, 1} - - # Check shapes of stored activations - assert store.mlp_inputs[0].shape == (100, 32) - assert store.mlp_outputs[1].shape == (100, 32) - - # Check normalization happened - assert 0 in store.input_means - assert 1 in store.input_stds - assert 
torch.allclose(store.mlp_inputs[0].mean(dim=0), torch.zeros(32), atol=1e-6) - assert torch.allclose(store.mlp_inputs[0].std(dim=0), torch.ones(32), atol=1e-6) - - # Test retrieving batches - inputs, outputs = store.get_batch() - - # Check batch shapes - assert inputs[0].shape == (batch_size, 32) - assert outputs[1].shape == (batch_size, 32) - - # Check we can get all batches - seen_tokens = 0 - all_batches = [] - original_token_indices = store.token_indices.copy() - - # Collect all batches and track how many tokens we've seen - while seen_tokens < store.num_tokens: - batch_inputs, _ = store.get_batch() - batch_size_actual = batch_inputs[0].shape[0] - all_batches.append(batch_inputs) - seen_tokens += batch_size_actual - - # Check we saw all tokens - assert seen_tokens == store.num_tokens - - # Check that token_indices was shuffled when we exhausted all tokens - assert not np.array_equal(original_token_indices, store.token_indices) - - -@pytest.mark.integration -def test_denormalize_outputs(dummy_nnsight_activations): - """Test that denormalization correctly restores original scale.""" - # Create an unnormalized store first to get the original data - original_store = ActivationStore.from_nnsight_activations( - dummy_nnsight_activations, batch_size=16, normalize=False - ) - original_outputs_all = original_store.mlp_outputs - - # Create the store with normalization for the test - store = ActivationStore.from_nnsight_activations( - dummy_nnsight_activations, batch_size=16, normalize=True - ) - - # Get a batch of normalized outputs and its indices - store.shuffle_tokens() # Shuffle once to get a specific order - batch_indices = store.token_indices[: store.batch_size] - _, normalized_outputs = store.get_batch() # This updates the pointer - - # Denormalize the outputs - denormalized_batch = store.denormalize_outputs(normalized_outputs) - - # Check denormalized shape matches normalized shape - for layer_idx in normalized_outputs: - assert ( - denormalized_batch[layer_idx].shape == normalized_outputs[layer_idx].shape - ) - - # Compare the denormalized batch to the original data for the same indices - for layer_idx in denormalized_batch: - # Retrieve the original (unnormalized) data for the specific batch indices - original_data_batch = original_outputs_all[layer_idx][batch_indices] - - # Compare denormalized batch data to original data slice - assert torch.allclose( - denormalized_batch[layer_idx], original_data_batch, atol=1e-6 - ), f"Denormalization failed for layer {layer_idx}" - - -# This test simulates a more realistic scenario where we might save activations from -# a model run, then load them back for training -@pytest.mark.integration -def test_save_load_activation_workflow(saved_activation_files): - """Test a workflow of loading activations from saved files.""" - # In a real scenario, these files would come from an activation extraction run - file_dir = saved_activation_files["dir"] - - # Load the activations manually to simulate what might happen in a real workflow - mlp_inputs = {} - mlp_outputs = {} - - # Get all .pt files in the directory - for filename in os.listdir(file_dir): - if filename.endswith(".pt") and "metadata" not in filename: - file_path = os.path.join(file_dir, filename) - tensor = torch.load(file_path) - - # Parse the filename to determine what this tensor represents - if "mlp_in" in filename: - # Handle potential double extensions like .mlp_in_0.pt - base_name = filename.rsplit(".", 1)[0] - layer_idx = int(base_name.split("_")[-1]) - mlp_inputs[layer_idx] = tensor - elif 
"mlp_out" in filename: - base_name = filename.rsplit(".", 1)[0] - layer_idx = int(base_name.split("_")[-1]) - mlp_outputs[layer_idx] = tensor - - # Create an ActivationStore from the loaded files - store = ActivationStore( - mlp_inputs=mlp_inputs, mlp_outputs=mlp_outputs, batch_size=10, normalize=True - ) - - # Verify the store looks correct - assert store.num_layers == 2 - assert store.num_tokens == 100 - assert set(store.layer_indices) == {0, 1} - - # Test batching works - inputs, outputs = store.get_batch() - assert inputs[0].shape == (10, 32) # Batch size 10, d_model 32 - assert outputs[1].shape == (10, 32) - - # Shuffle and get another batch - store.shuffle_tokens() - inputs2, outputs2 = store.get_batch() - - # These should be different batches (tiny chance they are the same) - # Just check shapes are as expected - assert inputs2[0].shape == (10, 32) - assert outputs2[1].shape == (10, 32) diff --git a/tests/integration/test_config_variants.py b/tests/integration/test_config_variants.py deleted file mode 100644 index 3547426..0000000 --- a/tests/integration/test_config_variants.py +++ /dev/null @@ -1,218 +0,0 @@ -import pytest -import torch - -from clt.config import CLTConfig, TrainingConfig -from clt.training.data import ActivationStore -from clt.training.trainer import CLTTrainer -from clt.models.clt import CrossLayerTranscoder - - -@pytest.fixture -def small_clt_config(): - """Create a minimal CLTConfig for testing.""" - return CLTConfig( - num_layers=2, - num_features=8, - d_model=16, - activation_fn="jumprelu", - jumprelu_threshold=0.03, - ) - - -@pytest.fixture -def small_activation_data(): - """Generate small activation tensors for testing.""" - num_tokens = 20 - d_model = 16 # Must match the config - - # Create small random activation tensors - mlp_inputs = { - 0: torch.randn(num_tokens, d_model), - 1: torch.randn(num_tokens, d_model), - } - - # Outputs can be slightly different to simulate MLP transformation - mlp_outputs = { - 0: torch.randn(num_tokens, d_model), - 1: torch.randn(num_tokens, d_model), - } - - return mlp_inputs, mlp_outputs - - -@pytest.fixture -def small_activation_store(small_activation_data): - """Create a small ActivationStore from generated data.""" - mlp_inputs, mlp_outputs = small_activation_data - - return ActivationStore( - mlp_inputs=mlp_inputs, mlp_outputs=mlp_outputs, batch_size=4, normalize=True - ) - - -@pytest.mark.parametrize( - "optimizer,lr_scheduler", - [ - ("adam", None), - ("adam", "linear"), - ("adam", "cosine"), - ("adamw", None), - ("adamw", "linear"), - ("adamw", "cosine"), - ], -) -@pytest.mark.integration -def test_config_variants_optimizer_scheduler( - small_clt_config, small_activation_store, optimizer, lr_scheduler -): - """Test that various optimizer and scheduler configurations initialize correctly.""" - # Create a training config with the specified optimizer and scheduler - training_config = TrainingConfig( - learning_rate=1e-3, - batch_size=4, - training_steps=10, - sparsity_lambda=1e-3, - sparsity_c=1.0, - preactivation_coef=3e-6, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - ) - - # Initialize trainer with temporary log dir - trainer = CLTTrainer( - clt_config=small_clt_config, - training_config=training_config, - activation_store=small_activation_store, - log_dir=f"temp_log_{optimizer}_{lr_scheduler}", - device="cpu", - ) - - # Verify optimizer type - if optimizer == "adam": - assert isinstance(trainer.optimizer, torch.optim.Adam) - else: # adamw - assert isinstance(trainer.optimizer, torch.optim.AdamW) - - # Verify 
scheduler type - if lr_scheduler == "linear": - assert isinstance(trainer.scheduler, torch.optim.lr_scheduler.LinearLR) - elif lr_scheduler == "cosine": - assert isinstance(trainer.scheduler, torch.optim.lr_scheduler.CosineAnnealingLR) - else: # None - assert trainer.scheduler is None - - -@pytest.mark.parametrize( - "sparsity_lambda,sparsity_c,preactivation_coef", - [ - (0.0, 1.0, 0.0), # No regularization - (1e-2, 2.0, 0.0), # Stronger sparsity, no preactivation - (1e-3, 1.0, 1e-5), # Normal sparsity, with preactivation - (1e-2, 2.0, 1e-5), # Both stronger - ], -) -@pytest.mark.integration -def test_config_variants_loss_params( - small_clt_config, - small_activation_store, - sparsity_lambda, - sparsity_c, - preactivation_coef, -): - """Test that various loss hyperparameter configurations initialize correctly.""" - # Create training config with the specified loss parameters - training_config = TrainingConfig( - learning_rate=1e-3, - batch_size=4, - training_steps=10, - sparsity_lambda=sparsity_lambda, - sparsity_c=sparsity_c, - preactivation_coef=preactivation_coef, - optimizer="adam", - lr_scheduler=None, - ) - - # Initialize trainer - trainer = CLTTrainer( - clt_config=small_clt_config, - training_config=training_config, - activation_store=small_activation_store, - log_dir=( - f"temp_log_loss_params_{sparsity_lambda}_{sparsity_c}_{preactivation_coef}" - ), - device="cpu", - ) - - # Verify loss manager parameters - assert trainer.loss_manager.config.sparsity_lambda == sparsity_lambda - assert trainer.loss_manager.config.sparsity_c == sparsity_c - assert trainer.loss_manager.config.preactivation_coef == preactivation_coef - - -@pytest.mark.parametrize( - "num_layers,num_features,d_model", - [ - (2, 8, 16), # Small config (default from fixture) - (3, 16, 32), # Medium config - (4, 32, 64), # Larger config - ], -) -@pytest.mark.integration -def test_config_variants_model_sizes( - small_activation_store, num_layers, num_features, d_model -): - """Test that different model size configurations initialize correctly.""" - # Create a custom CLT config - clt_config = CLTConfig( - num_layers=num_layers, - num_features=num_features, - d_model=d_model, - activation_fn="jumprelu", - jumprelu_threshold=0.03, - ) - - # Create a simple model directly to check the structure - model = CrossLayerTranscoder(clt_config).to("cpu") - - # Verify model structure - assert len(model.encoders) == num_layers - assert model.encoders[0].weight.shape == (num_features, d_model) - - # Expected number of decoder matrices: sum from 1 to num_layers - expected_decoder_count = (num_layers * (num_layers + 1)) // 2 - assert len(model.decoders) == expected_decoder_count - - # Check a specific decoder's shape - decoder_key = "0->0" # From layer 0 to layer 0 - assert decoder_key in model.decoders - assert model.decoders[decoder_key].weight.shape == (d_model, num_features) - - # Verify threshold parameter - assert model.threshold.shape == (num_features,) - assert torch.allclose(model.threshold, torch.ones(num_features) * 0.03) - - # Create a training config and initialize a trainer to ensure compatibility - training_config = TrainingConfig( - learning_rate=1e-3, - batch_size=4, - training_steps=5, - sparsity_lambda=1e-3, - sparsity_c=1.0, - preactivation_coef=3e-6, - optimizer="adam", - lr_scheduler=None, - ) - - # Initialize trainer - this should not raise errors - trainer = CLTTrainer( - clt_config=clt_config, - training_config=training_config, - activation_store=small_activation_store, - 
log_dir=f"temp_log_model_{num_layers}_{num_features}_{d_model}", - device="cpu", - ) - - # Verify initialization succeeded (using the trainer) - assert trainer.clt_config.num_layers == num_layers - assert trainer.clt_config.num_features == num_features - assert trainer.clt_config.d_model == d_model diff --git a/tests/integration/test_pretrained_model.py b/tests/integration/test_pretrained_model.py deleted file mode 100644 index 6125f8d..0000000 --- a/tests/integration/test_pretrained_model.py +++ /dev/null @@ -1,173 +0,0 @@ -import pytest -import torch -import os - -# import tempfile # Unused -# import shutil # Unused -# import numpy as np # Unused - -# from clt.config import CLTConfig # Now unused -from clt.models.clt import CrossLayerTranscoder -from clt.training.data import ActivationStore - - -@pytest.fixture -def fixture_path(): - """Path to the fixtures directory.""" - current_dir = os.path.dirname(os.path.abspath(__file__)) - return os.path.join(current_dir, "data") - - -@pytest.fixture -def pretrained_model_path(fixture_path): - """Path to the pretrained model fixture.""" - return os.path.join(fixture_path, "pretrained_models", "pretrained_clt.pt") - - -@pytest.fixture -def dummy_activations_path(fixture_path): - """Path to the dummy activations fixtures.""" - return os.path.join(fixture_path, "dummy_activations") - - -@pytest.mark.integration -def test_load_pretrained_model(pretrained_model_path): - """Test loading a pretrained model from a file.""" - # Skip if the fixture doesn't exist yet - if not os.path.exists(pretrained_model_path): - pytest.skip(f"Pretrained model fixture not found at {pretrained_model_path}") - - # Load the model - model = CrossLayerTranscoder.load(pretrained_model_path) - - # Check model attributes - assert model.config.num_layers == 2 - assert model.config.num_features == 8 - assert model.config.d_model == 16 - assert model.config.activation_fn == "jumprelu" - assert model.config.jumprelu_threshold == 0.03 - - # Check model structure - assert len(model.encoders) == 2 - assert len(model.decoders) == 3 # 0->0, 0->1, 1->1 - assert model.encoders[0].weight.shape == (8, 16) - assert model.decoders["0->0"].weight.shape == (16, 8) - - # Verify parameters are loaded - assert torch.any(model.encoders[0].weight != 0) # Not all zeros - assert torch.any(model.decoders["0->1"].weight != 0) - assert torch.allclose(model.threshold, torch.ones(8) * 0.03) - - -@pytest.mark.integration -def test_pretrained_model_inference(pretrained_model_path, dummy_activations_path): - """Test running inference with a pretrained model on dummy activations.""" - # Skip if the fixtures don't exist yet - if not os.path.exists(pretrained_model_path): - pytest.skip(f"Pretrained model fixture not found at {pretrained_model_path}") - if not os.path.exists(dummy_activations_path): - pytest.skip(f"Dummy activations not found at {dummy_activations_path}") - - # Load the model - model = CrossLayerTranscoder.load(pretrained_model_path) - - # Load some dummy activations - standard_path = os.path.join(dummy_activations_path, "standard") - inputs_path = os.path.join(standard_path, "mlp_inputs.pt") - - if not os.path.exists(inputs_path): - pytest.skip(f"Input activations not found at {inputs_path}") - - # Load inputs - mlp_inputs = torch.load(inputs_path) - - # Take a small batch for testing - batch_size = 4 - test_inputs = { - layer: tensor[:batch_size].unsqueeze(1) # Add sequence dimension - for layer, tensor in mlp_inputs.items() - } - - # Run the model - with torch.no_grad(): - outputs = 
model(test_inputs) - - # Check outputs - assert isinstance(outputs, dict) - assert len(outputs) == len(test_inputs) - - for layer in test_inputs: - assert layer in outputs - # Check shape: [batch_size, seq_len, d_model] - assert outputs[layer].shape == (batch_size, 1, model.config.d_model) - - -@pytest.mark.integration -def test_pretrained_model_with_activation_store( - pretrained_model_path, dummy_activations_path -): - """Test using a pretrained model with ActivationStore.""" - # Skip if the fixtures don't exist yet - if not os.path.exists(pretrained_model_path): - pytest.skip(f"Pretrained model fixture not found at {pretrained_model_path}") - - standard_path = os.path.join(dummy_activations_path, "standard") - inputs_path = os.path.join(standard_path, "mlp_inputs.pt") - outputs_path = os.path.join(standard_path, "mlp_outputs.pt") - - if not os.path.exists(inputs_path) or not os.path.exists(outputs_path): - pytest.skip("Activation fixtures not found") - - # Load the model - model = CrossLayerTranscoder.load(pretrained_model_path) - - # Load inputs and outputs - mlp_inputs = torch.load(inputs_path) - mlp_outputs = torch.load(outputs_path) - - # Create an activation store - store = ActivationStore( - mlp_inputs=mlp_inputs, mlp_outputs=mlp_outputs, batch_size=8, normalize=True - ) - - # Get a batch from the store - batch_inputs, batch_targets = store.get_batch() - - # Required shape for model input: [batch_size, seq_len, d_model] - # Current shape from ActivationStore: [batch_size, d_model] - model_inputs = { - layer: tensor.unsqueeze(1) # Add sequence dimension - for layer, tensor in batch_inputs.items() - } - - # Run the model - with torch.no_grad(): - outputs = model(model_inputs) - - # Check outputs - assert isinstance(outputs, dict) - for layer in batch_inputs: - assert layer in outputs - assert outputs[layer].shape[0] == batch_inputs[layer].shape[0] # Batch size - assert outputs[layer].shape[2] == model.config.d_model # d_model - - # Flatten outputs back to activation store format for comparison - flattened_outputs = { - layer: tensor.squeeze(1) # Remove sequence dimension - for layer, tensor in outputs.items() - } - - # Denormalize both for fair comparison - denorm_targets = store.denormalize_outputs(batch_targets) - denorm_outputs = store.denormalize_outputs(flattened_outputs) - - # Calculate MSE difference - mse_diff = 0.0 - for layer in denorm_outputs: - layer_mse = torch.mean( - (denorm_outputs[layer] - denorm_targets[layer]) ** 2 - ).item() - mse_diff += layer_mse - - # Not checking for a specific error value, just that computation works - assert isinstance(mse_diff, float) diff --git a/tests/integration/test_training_pipeline.py b/tests/integration/test_training_pipeline.py deleted file mode 100644 index a3a8148..0000000 --- a/tests/integration/test_training_pipeline.py +++ /dev/null @@ -1,167 +0,0 @@ -import pytest -import torch -import os -import tempfile -import shutil - -from clt.config import CLTConfig, TrainingConfig -from clt.training.data import ActivationStore -from clt.training.trainer import CLTTrainer -from clt.models.clt import CrossLayerTranscoder - - -@pytest.fixture -def temp_log_dir(): - """Create a temporary directory for logs and clean it up after the test.""" - temp_dir = tempfile.mkdtemp(prefix="clt_integration_test_") - yield temp_dir - shutil.rmtree(temp_dir) - - -@pytest.fixture -def small_clt_config(): - """Create a minimal CLTConfig for testing.""" - return CLTConfig( - num_layers=2, - num_features=8, - d_model=16, - activation_fn="jumprelu", - 
jumprelu_threshold=0.03, - ) - - -@pytest.fixture -def small_training_config(): - """Create a minimal TrainingConfig for testing.""" - return TrainingConfig( - learning_rate=1e-3, - batch_size=4, - training_steps=5, # Very small number of steps for quick testing - sparsity_lambda=1e-3, - sparsity_c=1.0, - preactivation_coef=3e-6, - optimizer="adam", - lr_scheduler="linear", - ) - - -@pytest.fixture -def small_activation_data(): - """Generate small activation tensors for testing.""" - num_tokens = 20 - d_model = 16 # Must match the config - - # Create small random activation tensors - mlp_inputs = { - 0: torch.randn(num_tokens, d_model), - 1: torch.randn(num_tokens, d_model), - } - - # Outputs can be slightly different to simulate MLP transformation - mlp_outputs = { - 0: torch.randn(num_tokens, d_model), - 1: torch.randn(num_tokens, d_model), - } - - return mlp_inputs, mlp_outputs - - -@pytest.fixture -def small_activation_store(small_activation_data): - """Create a small ActivationStore from generated data.""" - mlp_inputs, mlp_outputs = small_activation_data - - return ActivationStore( - mlp_inputs=mlp_inputs, mlp_outputs=mlp_outputs, batch_size=4, normalize=True - ) - - -@pytest.mark.integration -def test_training_pipeline_runs( - small_clt_config, small_training_config, small_activation_store, temp_log_dir -): - """Test that the training pipeline runs end-to-end without errors.""" - # Initialize the trainer - trainer = CLTTrainer( - clt_config=small_clt_config, - training_config=small_training_config, - activation_store=small_activation_store, - log_dir=temp_log_dir, - device="cpu", # Use CPU for testing - ) - - # Run training for a small number of steps - trained_model = trainer.train(eval_every=2) # Evaluate every 2 steps - - # Assertions to verify training completed successfully - - # 1. Check that the model is returned - assert isinstance(trained_model, CrossLayerTranscoder) - - # 2. Check that training metrics were recorded - assert len(trainer.metrics["train_losses"]) == small_training_config.training_steps - - # 3. Check that L0 stats were collected (should be 3 times: steps 0, 2, 4) - assert len(trainer.metrics["l0_stats"]) == 3 - - # 4. Check that model files were created - final_model_path = os.path.join(temp_log_dir, "clt_final.pt") - assert os.path.exists(final_model_path) - - # 5. Check that metrics were saved - metrics_path = os.path.join(temp_log_dir, "metrics.json") - assert os.path.exists(metrics_path) - - # 6. 
Verify the model has expected attributes based on config - assert trained_model.config == small_clt_config - assert len(trained_model.encoders) == small_clt_config.num_layers - assert ( - len(trained_model.decoders) - == (small_clt_config.num_layers * (small_clt_config.num_layers + 1)) // 2 - ) - - -@pytest.mark.integration -def test_model_save_load_integration( - small_clt_config, small_training_config, small_activation_store, temp_log_dir -): - """Test saving a trained model and loading it back.""" - # Initialize the trainer - trainer = CLTTrainer( - clt_config=small_clt_config, - training_config=small_training_config, - activation_store=small_activation_store, - log_dir=temp_log_dir, - device="cpu", - ) - - # Train for a few steps - trained_model = trainer.train(eval_every=5) # Only evaluate at the end - - # Get path to saved model - saved_model_path = os.path.join(temp_log_dir, "clt_final.pt") - assert os.path.exists(saved_model_path) - - # Load the model back - loaded_model = CrossLayerTranscoder.load(saved_model_path) - - # Assertions - assert isinstance(loaded_model, CrossLayerTranscoder) - assert loaded_model.config.num_layers == small_clt_config.num_layers - assert loaded_model.config.num_features == small_clt_config.num_features - assert loaded_model.config.d_model == small_clt_config.d_model - - # Check that the model can perform a forward pass - inputs = { - 0: torch.randn(1, 1, small_clt_config.d_model), # Batch=1, Seq=1 - 1: torch.randn(1, 1, small_clt_config.d_model), - } - - with torch.no_grad(): - outputs = loaded_model(inputs) - - # Check output structure - assert isinstance(outputs, dict) - assert set(outputs.keys()) == set(inputs.keys()) - assert outputs[0].shape == (1, 1, small_clt_config.d_model) - assert outputs[1].shape == (1, 1, small_clt_config.d_model) diff --git a/tests/models/test_clt_distributed_forward.py b/tests/models/test_clt_distributed_forward.py new file mode 100644 index 0000000..d1b9a58 --- /dev/null +++ b/tests/models/test_clt_distributed_forward.py @@ -0,0 +1,132 @@ +import torch +import torch.distributed as dist +import os +from typing import Dict, Optional, Literal + +from clt.config import CLTConfig +from clt.models.clt import CrossLayerTranscoder + + +# Helper to initialize distributed environment for the test +def setup_distributed_test(rank, world_size, master_port="12355"): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = master_port + dist.init_process_group("nccl" if torch.cuda.is_available() else "gloo", rank=rank, world_size=world_size) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) # Set device for this process + + +# Helper to cleanup distributed environment +def cleanup_distributed_test(): + dist.destroy_process_group() + + +def run_forward_pass_test( + rank, world_size, activation_fn: Literal["jumprelu", "relu", "batchtopk", "topk"], batchtopk_k: Optional[int] = None +): + setup_distributed_test(rank, world_size) + + d_model = 64 # Small d_model for testing + num_features_per_layer = d_model * 2 + num_layers = 2 # Small number of layers + batch_size = 4 + seq_len = 8 + batch_tokens = batch_size * seq_len + + clt_config = CLTConfig( + d_model=d_model, + num_features=num_features_per_layer, + num_layers=num_layers, + activation_fn=activation_fn, + batchtopk_k=batchtopk_k, + # jumprelu_threshold is only relevant if activation_fn is jumprelu + jumprelu_threshold=0.01 if activation_fn == "jumprelu" else 0.0, + ) + + # Determine device for the model based on availability and rank + current_device = 
torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu") + + # Instantiate model - process_group is automatically handled by CrossLayerTranscoder init if dist is initialized + model = CrossLayerTranscoder(config=clt_config, process_group=None, device=current_device) # PG is WORLD implicitly + model.to(current_device) + model.eval() # Set to eval mode + + # Create identical dummy input data on all ranks + # (batch_tokens, d_model) + dummy_inputs: Dict[int, torch.Tensor] = {} + for i in range(num_layers): + # Ensure identical tensor across ranks using a fixed seed before creating tensor + torch.manual_seed(42 + i) # Same seed for each layer across ranks + dummy_inputs[i] = torch.randn(batch_tokens, d_model, device=current_device, dtype=model.dtype) + + # Perform forward pass + reconstructions = model.forward(dummy_inputs) + + # Assertions + assert isinstance(reconstructions, dict) + assert len(reconstructions) == num_layers + + # Gather all reconstruction tensors to rank 0 for comparison (if more than 1 GPU) + # Or, more simply, each rank asserts its output is identical to a tensor broadcast from rank 0 + for layer_idx in range(num_layers): + output_tensor = reconstructions[layer_idx] + assert output_tensor.shape == (batch_tokens, d_model) + assert output_tensor.device == current_device + assert output_tensor.dtype == model.dtype + + # All-reduce the sum of the tensor and sum of squares. If identical, these will be world_size * val. + # This is a robust way to check for numerical identity across ranks. + sum_val = output_tensor.sum() + sum_sq_val = (output_tensor**2).sum() + + gathered_sum_list = [torch.zeros_like(sum_val) for _ in range(world_size)] + gathered_sum_sq_list = [torch.zeros_like(sum_sq_val) for _ in range(world_size)] + + if world_size > 1: + dist.all_gather(gathered_sum_list, sum_val) + dist.all_gather(gathered_sum_sq_list, sum_sq_val) + else: + gathered_sum_list = [sum_val] + gathered_sum_sq_list = [sum_sq_val] + + # Check if all gathered sums and sum_sq are close to each other + for i in range(1, world_size): + assert torch.allclose( + gathered_sum_list[0], gathered_sum_list[i] + ), f"Rank {rank} Layer {layer_idx} sum mismatch: {gathered_sum_list[0]} vs {gathered_sum_list[i]} (rank {i}) for act_fn {activation_fn}" + assert torch.allclose( + gathered_sum_sq_list[0], gathered_sum_sq_list[i] + ), f"Rank {rank} Layer {layer_idx} sum_sq mismatch: {gathered_sum_sq_list[0]} vs {gathered_sum_sq_list[i]} (rank {i}) for act_fn {activation_fn}" + + if rank == 0: + print(f"Distributed forward test PASSED for rank {rank}, activation_fn='{activation_fn}'") + + cleanup_distributed_test() + + +# Main test execution controlled by torchrun +if __name__ == "__main__": + world_size = int(os.environ.get("WORLD_SIZE", 1)) + rank = int(os.environ.get("RANK", 0)) + + print(f"Starting distributed test on rank {rank} of {world_size}") + + # Test with ReLU + print(f"Rank {rank}: Running test for ReLU") + run_forward_pass_test(rank, world_size, activation_fn="relu") + if world_size > 1: + dist.barrier() # Ensure test finishes before next one + + # Test with BatchTopK + print(f"Rank {rank}: Running test for BatchTopK") + run_forward_pass_test(rank, world_size, activation_fn="batchtopk", batchtopk_k=10) + if world_size > 1: + dist.barrier() + + # Add more activation functions to test if needed, e.g., jumprelu + # print(f"Rank {rank}: Running test for JumpReLU") + # run_forward_pass_test(rank, world_size, activation_fn="jumprelu") + # if world_size > 1: dist.barrier() + + if rank == 0: + 
print("All distributed forward tests completed.") diff --git a/tests/unit/models/test_activations.py b/tests/unit/models/test_activations.py deleted file mode 100644 index 40f2829..0000000 --- a/tests/unit/models/test_activations.py +++ /dev/null @@ -1,134 +0,0 @@ -import torch - -# import pytest # DELETED: Not strictly needed for these torch.asserts -from clt.models.activations import BatchTopK - - -def test_batchtopk_forward_global_k(): - """Test BatchTopK forward pass with global k selection.""" - # Input tensor: 2 samples (tokens), 10 features each - # Batch size B = 2, Total features F_total = 10 - x = torch.tensor( - [ - [0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6, 0.5, 1.0], # Token 1 - [1.1, 0.15, 1.2, 0.25, 1.3, 0.35, 1.4, 0.45, 1.5, 0.05], # Token 2 - ], - dtype=torch.float32, - ) - - # Case 1: k_per_token = 1. Should keep 1*2 = 2 features globally. - # Expected: 1.5 (from token 2, index 8) and 1.4 (from token 2, index 6) - k1 = 1 - output1 = BatchTopK.apply(x, float(k1), True, None) - # assert output1[0].eq(torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])).all() # This was too broad, depends on specific top-k - # assert output1[1].eq(torch.tensor([0.0, 0.0, 0.0, 0.0, 1.3, 0.0, 1.4, 0.0, 1.5, 0.0])).all() # This was too broad - assert torch.count_nonzero(output1) == k1 * x.size(0) - # More specific check for values (will need to adjust expected based on actual top-k logic) - # For k_total_batch = 2, top values in flattened x are 1.5 (idx 18), 1.4 (idx 16) - # So, output1 should have non-zero at x[1,8] and x[1,6] - # expected_output1 = torch.tensor([ # DELETED: This was for per-token topk - # [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - # [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.4, 0.0, 1.5, 0.0] # Assuming 1.5 and 1.4 are top 2 - # ], dtype=torch.float32) - expected_output1_corrected = torch.zeros_like(x) - expected_output1_corrected[1, 8] = 1.5 - expected_output1_corrected[1, 6] = 1.4 - assert torch.allclose(output1, expected_output1_corrected) - - # Case 2: k_per_token = 3. Should keep 3*2 = 6 features globally. - k2 = 3 - output2 = BatchTopK.apply(x, float(k2), True, None) - assert torch.count_nonzero(output2) == k2 * x.size(0) - # Top 6 values: 1.5, 1.4, 1.3, 1.2, 1.1 (from token 2) and 1.0 (from token 1) - # Indices in flattened x: 1.5 (18), 1.4 (16), 1.3 (14), 1.2 (12), 1.1 (10), 1.0 (9) - expected_output2 = torch.zeros_like(x) - expected_output2[1, 8] = 1.5 - expected_output2[1, 6] = 1.4 - expected_output2[1, 4] = 1.3 - expected_output2[1, 2] = 1.2 - expected_output2[1, 0] = 1.1 - expected_output2[0, 9] = 1.0 - assert torch.allclose(output2, expected_output2) - - # Case 3: k_per_token = 0. Should keep 0 features globally. 
- k3 = 0 - output3 = BatchTopK.apply(x, float(k3), True, None) - assert torch.count_nonzero(output3) == 0 - assert torch.allclose(output3, torch.zeros_like(x)) - - # Case 4: k_per_token such that k_total_batch > F_total_batch (all features kept) - k4 = 25 # k_per_token * B = 50, F_total_batch = 20 - output4 = BatchTopK.apply(x, float(k4), True, None) - assert torch.count_nonzero(output4) == x.numel() - assert torch.allclose(output4, x) - - # Case 5: Empty input tensor - x_empty = torch.empty((0, 5), dtype=torch.float32) - output_empty = BatchTopK.apply(x_empty, float(k1), True, None) - assert output_empty.numel() == 0 - assert output_empty.shape == x_empty.shape - - # Case 6: Using x_for_ranking - x_rank = torch.tensor( - [ - [10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # Token 1 - 10.0 is highest - [0.0, 0.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # Token 2 - 20.0 is second highest - ], - dtype=torch.float32, - ) - # k_per_token = 1, so k_total_batch = 2. - # Ranking tensor has 20.0 at (1,2) and 10.0 at (0,0) as top 2. - # Values should come from original x at these positions. - output_x_rank = BatchTopK.apply(x, float(k1), True, x_rank) - expected_output_x_rank = torch.zeros_like(x) - expected_output_x_rank[1, 2] = x[1, 2] # Value is 1.2 - expected_output_x_rank[0, 0] = x[0, 0] # Value is 0.1 - assert torch.allclose(output_x_rank, expected_output_x_rank) - assert torch.count_nonzero(output_x_rank) == k1 * x.size(0) - - -def test_batchtopk_backward_ste(): - """Test BatchTopK backward pass with straight-through estimator.""" - x = torch.randn(2, 5, requires_grad=True) - k = 2 # k_per_token - - # Forward pass - output = BatchTopK.apply(x, float(k), True, None) - - # Create a gradient for the output - grad_output = torch.randn_like(output) - - # Backward pass - output.backward(grad_output) - - # Expected gradient: grad_output where mask is True, 0 otherwise - # The mask is (output != 0) - mask = (output != 0).to(grad_output.dtype) - expected_grad_input = grad_output * mask - - assert x.grad is not None - assert torch.allclose(x.grad, expected_grad_input) - - -# It might be useful to test the non-STE case if a different backward pass is implemented in the future. -# For now, non-STE backward is the same as STE. 
-def test_batchtopk_backward_no_ste(): - """Test BatchTopK backward pass with straight_through=False.""" - x = torch.randn(2, 5, requires_grad=True) - k = 2 # k_per_token - - # Forward pass - output = BatchTopK.apply(x, float(k), False, None) # straight_through = False - - # Create a gradient for the output - grad_output = torch.randn_like(output) - - # Backward pass - output.backward(grad_output) - - # Expected gradient for current implementation (same as STE): - mask = (output != 0).to(grad_output.dtype) - expected_grad_input = grad_output * mask - - assert x.grad is not None - assert torch.allclose(x.grad, expected_grad_input) diff --git a/tests/unit/models/test_base.py b/tests/unit/models/test_base.py deleted file mode 100644 index a651f7e..0000000 --- a/tests/unit/models/test_base.py +++ /dev/null @@ -1,116 +0,0 @@ -import pytest -import torch -import torch.nn as nn -from typing import Dict -import os - -# Import the actual config -from clt.config import CLTConfig - -from clt.models.base import BaseTranscoder - - -# Assume BaseTranscoder exists in clt.models.base -# Need a concrete implementation for testing save/load - - -class DummyTranscoder(BaseTranscoder): - def __init__(self, config): - super().__init__(config) - self.layer = nn.Linear(config.d_model, config.d_model) - - def encode(self, x: torch.Tensor, layer_idx: int) -> torch.Tensor: - return self.layer(x) # Simplified encode - - def decode(self, a: Dict[int, torch.Tensor], layer_idx: int) -> torch.Tensor: - # Simplified decode - just returns the activation from layer 0 if present - # Assumes key 0 is present based on simplified forward logic/test setup - return a[0] - - def forward(self, inputs: Dict[int, torch.Tensor]) -> Dict[int, torch.Tensor]: - outputs = {} - activations = {} - for layer_idx, x in inputs.items(): - activations[layer_idx] = self.encode(x, layer_idx) - - act_0 = activations.get(0) - # Only proceed with decode if layer 0 activation exists - if act_0 is not None: - decode_input = {0: act_0} - for layer_idx in inputs.keys(): - # Call decode with the dictionary containing only layer 0 activation - outputs[layer_idx] = self.decode(decode_input, layer_idx) - # If act_0 is None, outputs dict remains empty or partially filled - # depending on previous loops, which is acceptable for this dummy class. 
- return outputs - - -@pytest.fixture -def dummy_config(): - # Use the actual CLTConfig - return CLTConfig(d_model=16, num_layers=2, num_features=32, activation_fn="relu") - - -@pytest.fixture -def dummy_transcoder(dummy_config): - return DummyTranscoder(dummy_config) - - -def test_base_transcoder_init(dummy_transcoder, dummy_config): - """Test BaseTranscoder initialization through a dummy implementation.""" - assert dummy_transcoder.config == dummy_config - assert isinstance(dummy_transcoder, nn.Module) - - -def test_base_transcoder_save_load(dummy_transcoder, dummy_config, tmp_path): - """Test saving and loading a BaseTranscoder.""" - save_path = tmp_path / "dummy_transcoder.pt" - - # Save the model - dummy_transcoder.save(str(save_path)) - assert os.path.exists(save_path) - - # Load the model (safe_globals not needed for dataclass config) - loaded_transcoder = DummyTranscoder.load(str(save_path)) - - # Check loaded model - assert isinstance(loaded_transcoder, DummyTranscoder) - assert isinstance(loaded_transcoder.config, CLTConfig) - assert loaded_transcoder.config.d_model == dummy_config.d_model - assert loaded_transcoder.config.num_layers == dummy_config.num_layers - # Removed check for mock-specific param - - # Check state dicts match - assert dummy_transcoder.state_dict().keys() == loaded_transcoder.state_dict().keys() - for key in dummy_transcoder.state_dict(): - assert torch.equal( - dummy_transcoder.state_dict()[key], loaded_transcoder.state_dict()[key] - ) - - -def test_base_transcoder_load_device(dummy_transcoder, tmp_path): - """Test loading a BaseTranscoder to a specific device.""" - save_path = tmp_path / "dummy_transcoder_device.pt" - dummy_transcoder.save(str(save_path)) - - # Try loading to CPU (assuming tests run on CPU by default) - loaded_transcoder_cpu = DummyTranscoder.load( - str(save_path), device=torch.device("cpu") - ) - assert next(loaded_transcoder_cpu.parameters()).device.type == "cpu" - - # If CUDA is available, test loading to CUDA - if torch.cuda.is_available(): - cuda_device = torch.device("cuda") - loaded_transcoder_cuda = DummyTranscoder.load( - path=str(save_path), device=cuda_device - ) - assert next(loaded_transcoder_cuda.parameters()).device.type == "cuda" - - -# Test abstract methods raise NotImplementedError if called on BaseTranscoder directly -# This requires a bit of setup, maybe skip for now unless explicitly needed. -# Trying to instantiate BaseTranscoder directly should fail anyway. - -# Note: Linter errors in base.py (unused Any, Tuple, unknown CLTConfig) should be addressed -# separately. This test file uses a MockCLTConfig. 
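Note: the deleted tests/unit/models/test_base.py above exercised the BaseTranscoder save/load round trip. The following is a minimal sketch of that round-trip check, under the assumption that BaseTranscoder.save(path) and .load(path) keep the path-based signatures used in the deleted file; DummyTranscoder is the illustrative subclass defined there, not an importable class.

import torch
from clt.config import CLTConfig

def check_save_load_roundtrip(tmp_path):
    # Build a small transcoder, save it, reload it, and compare every tensor.
    config = CLTConfig(d_model=16, num_layers=2, num_features=32, activation_fn="relu")
    model = DummyTranscoder(config)  # subclass from the deleted test file (illustrative)
    path = str(tmp_path / "roundtrip.pt")
    model.save(path)
    loaded = DummyTranscoder.load(path)
    # The round trip should preserve the config and all parameters bit-for-bit.
    assert loaded.config.d_model == config.d_model
    for key, value in model.state_dict().items():
        assert torch.equal(value, loaded.state_dict()[key])
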
diff --git a/tests/unit/models/test_clt.py b/tests/unit/models/test_clt.py deleted file mode 100644 index 784ed7e..0000000 --- a/tests/unit/models/test_clt.py +++ /dev/null @@ -1,616 +0,0 @@ -import pytest -import torch -import torch.nn as nn -import math - -# Import the classes to test first -from clt.models.clt import JumpReLU, CrossLayerTranscoder - -# Import the actual config -from clt.config import CLTConfig - - -# --- Test JumpReLU --- - - -@pytest.fixture -def jumprelu_input(): - # Use a smaller tensor for quicker testing - return torch.randn(5, 10, requires_grad=True) - - -# Parameterize with threshold *value*, not the parameter itself -@pytest.mark.parametrize("threshold_val", [0.01, 0.03, 0.1]) -def test_jumprelu_forward(jumprelu_input, threshold_val): - """Test the forward pass of JumpReLU.""" - # The function expects the threshold value, not the log_threshold parameter - threshold_tensor = torch.tensor( - threshold_val, device=jumprelu_input.device, dtype=jumprelu_input.dtype - ) - output = JumpReLU.apply(jumprelu_input, threshold_tensor, 1.0) - expected_output = (jumprelu_input >= threshold_tensor).float() * jumprelu_input - assert isinstance(output, torch.Tensor) - assert isinstance(expected_output, torch.Tensor) - assert torch.allclose(output, expected_output) - - -@pytest.mark.parametrize("threshold_val, bandwidth", [(0.03, 1.0), (0.1, 0.5)]) -def test_jumprelu_backward_input_grad(jumprelu_input, threshold_val, bandwidth): - """Test the backward pass (STE) of JumpReLU for input gradient ONLY.""" - input_clone = jumprelu_input.clone().requires_grad_(True) - # Keep threshold fixed for this test - threshold_tensor = torch.tensor( - threshold_val, device=input_clone.device, dtype=input_clone.dtype - ) - - output = JumpReLU.apply(input_clone, threshold_tensor, bandwidth) - assert isinstance(output, torch.Tensor) - - grad_output = torch.ones_like(output) - # Explicitly retain grad for the input tensor as suggested by the warning - input_clone.retain_grad() - output.backward(grad_output) - grad_input_actual = input_clone.grad - - # Expected input gradient (STE) - input_fp32 = input_clone.float() - threshold_fp32 = threshold_tensor.float() - bandwidth_fp32 = float(bandwidth) - is_near_threshold = torch.abs(input_fp32 - threshold_fp32) <= (bandwidth_fp32 / 2.0) - grad_input_expected = grad_output.float() * is_near_threshold.float() - grad_input_expected = grad_input_expected.to(input_clone.dtype) # Cast back - - assert grad_input_actual is not None - assert torch.allclose(grad_input_actual, grad_input_expected, atol=1e-6) - - -@pytest.mark.parametrize("threshold_val, bandwidth", [(0.03, 1.0), (0.1, 0.5)]) -def test_jumprelu_backward_threshold_grad(jumprelu_input, threshold_val, bandwidth): - """Test the backward pass (STE) of JumpReLU for threshold gradient ONLY.""" - # Keep input fixed (detached) for this test - input_fixed = jumprelu_input.clone().detach() - # Threshold needs to be a parameter - log_threshold_param = nn.Parameter(torch.log(torch.tensor(threshold_val))) - - # Apply function using the *value* derived from the parameter - threshold_value = torch.exp(log_threshold_param) - output = JumpReLU.apply(input_fixed, threshold_value, bandwidth) - assert isinstance(output, torch.Tensor) - - # Compute gradients w.r.t log_threshold_param - grad_output = torch.ones_like(output) - output.sum().backward() # Use sum to get scalar loss - - grad_log_threshold_actual = log_threshold_param.grad - - # Expected threshold gradient (manual calculation) - input_fp32 = input_fixed.float() - 
threshold_fp32 = threshold_value.float() - grad_output_fp32 = grad_output.float() - bandwidth_fp32 = float(bandwidth) - - is_near_threshold = torch.abs(input_fp32 - threshold_fp32) <= (bandwidth_fp32 / 2.0) - local_grad_theta_fp32 = (-input_fp32 / bandwidth_fp32) * is_near_threshold.float() - grad_threshold_per_element_fp32 = grad_output_fp32 * local_grad_theta_fp32 - # Sum gradients for the single threshold value - grad_threshold_expected_fp32 = grad_threshold_per_element_fp32.sum() - - # Chain rule: dL/d(log_theta) = dL/d(theta) * d(theta)/d(log_theta) - # d(theta)/d(log_theta) = exp(log_theta) = theta - grad_log_threshold_expected = grad_threshold_expected_fp32 * torch.exp( - log_threshold_param.float() - ) - grad_log_threshold_expected = grad_log_threshold_expected.to( - log_threshold_param.dtype - ) # Cast back - - assert grad_log_threshold_actual is not None - # Gradient check for parameters can require higher tolerance - assert torch.allclose( - grad_log_threshold_actual, grad_log_threshold_expected, atol=1e-4 - ) - - -# --- Test CrossLayerTranscoder --- - - -# Use actual CLTConfig -@pytest.fixture -def clt_config_relu(): - return CLTConfig(d_model=16, num_features=32, num_layers=3, activation_fn="relu") - - -@pytest.fixture -def clt_config_jumprelu(): - return CLTConfig( - d_model=16, - num_features=32, - num_layers=3, - activation_fn="jumprelu", - jumprelu_threshold=0.05, # Initial value - ) - - -@pytest.fixture(params=["relu", "jumprelu"]) -def clt_model_config(request): - if request.param == "relu": - return CLTConfig( - d_model=16, num_features=32, num_layers=3, activation_fn="relu" - ) - else: # jumprelu - return CLTConfig( - d_model=16, - num_features=32, - num_layers=3, - activation_fn="jumprelu", - jumprelu_threshold=0.05, - ) - - -@pytest.fixture -def clt_model(clt_model_config): - return CrossLayerTranscoder(clt_model_config) - - -@pytest.fixture -def sample_inputs(clt_model_config): # Depend on config to get params - batch_size = 4 - seq_len = 10 - d_model = clt_model_config.d_model - num_layers = clt_model_config.num_layers - inputs = {i: torch.randn(batch_size, seq_len, d_model) for i in range(num_layers)} - return inputs - - -def test_clt_init(clt_model): - """Test CLT initialization.""" - config = clt_model.config - assert isinstance(clt_model, nn.Module) - assert len(clt_model.encoders) == config.num_layers - assert all(isinstance(enc, nn.Linear) for enc in clt_model.encoders) - assert all(enc.in_features == config.d_model for enc in clt_model.encoders) - assert all(enc.out_features == config.num_features for enc in clt_model.encoders) - assert all(enc.bias is None for enc in clt_model.encoders) - - num_expected_decoders = config.num_layers * (config.num_layers + 1) // 2 - assert len(clt_model.decoders) == num_expected_decoders - for src in range(config.num_layers): - for tgt in range(src, config.num_layers): - key = f"{src}->{tgt}" - assert key in clt_model.decoders - dec = clt_model.decoders[key] - assert isinstance(dec, nn.Linear) - assert dec.in_features == config.num_features - assert dec.out_features == config.d_model - assert dec.bias is None - - if config.activation_fn == "jumprelu": - assert isinstance(clt_model.log_threshold, nn.Parameter) # Check log_threshold - assert clt_model.log_threshold.shape == (config.num_features,) - expected_log_threshold = torch.log(torch.tensor(config.jumprelu_threshold)) - assert torch.allclose( - clt_model.log_threshold.data, - torch.ones(config.num_features) * expected_log_threshold, - ) - - # Check parameter initialization 
ranges - encoder_bound = 1.0 / math.sqrt(config.num_features) - for encoder in clt_model.encoders: - assert torch.all(encoder.weight.data >= -encoder_bound) - assert torch.all(encoder.weight.data <= encoder_bound) - - decoder_bound = 1.0 / math.sqrt(config.num_layers * config.d_model) - for decoder in clt_model.decoders.values(): - assert torch.all(decoder.weight.data >= -decoder_bound) - assert torch.all(decoder.weight.data <= decoder_bound) - - # Calculate and print memory footprint - total_params = sum(p.numel() for p in clt_model.parameters() if p.requires_grad) - # Assuming float32 (4 bytes per parameter) - dtype_size = torch.finfo(clt_model.dtype).bits // 8 - estimated_memory_bytes = total_params * dtype_size - estimated_memory_mb = estimated_memory_bytes / (1024 * 1024) - print( - f"\n[Memory Footprint Info for {clt_model.config.activation_fn} CLT ({clt_model.dtype})]" - ) - print(f" - Total Trainable Parameters: {total_params:,}") - print(f" - Estimated Memory (MB): {estimated_memory_mb:.2f} MB") - - -def test_clt_init_device_dtype(): - """Test CLT initialization with specific device and dtype.""" - # Test float16 - config_fp16 = CLTConfig( - d_model=8, num_features=16, num_layers=2, clt_dtype="float16" - ) - if torch.cuda.is_available(): - device = torch.device("cuda") - try: - model_fp16 = CrossLayerTranscoder(config_fp16, device=device) - assert model_fp16.dtype == torch.float16 - assert next(model_fp16.parameters()).dtype == torch.float16 - assert next(model_fp16.parameters()).device.type == "cuda" - assert model_fp16.log_threshold.dtype == torch.float16 # Check param dtype - assert model_fp16.log_threshold.device.type == "cuda" # Check param device - except RuntimeError as e: - # Some GPUs might not support float16 well - print(f"Skipping float16 test on CUDA due to: {e}") - else: # CPU - # CPU float16 support is limited, often emulated, skip strict check - model_fp16_cpu = CrossLayerTranscoder(config_fp16, device=torch.device("cpu")) - assert model_fp16_cpu.dtype == torch.float16 - # Parameters might default to float32 on CPU even if requested float16 - # assert next(model_fp16_cpu.parameters()).dtype == torch.float16 - assert next(model_fp16_cpu.parameters()).device.type == "cpu" - - # Test bfloat16 - config_bf16 = CLTConfig( - d_model=8, num_features=16, num_layers=2, clt_dtype="bfloat16" - ) - try: - # Check if bfloat16 is supported on the current device - is_bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported() - if is_bf16_supported: - device = torch.device("cuda") - model_bf16 = CrossLayerTranscoder(config_bf16, device=device) - assert model_bf16.dtype == torch.bfloat16 - assert next(model_bf16.parameters()).dtype == torch.bfloat16 - assert next(model_bf16.parameters()).device.type == "cuda" - assert model_bf16.log_threshold.dtype == torch.bfloat16 # Check param dtype - assert model_bf16.log_threshold.device.type == "cuda" # Check param device - else: # CPU or CUDA without BF16 support - # BFloat16 often works on CPU - model_bf16_cpu = CrossLayerTranscoder( - config_bf16, device=torch.device("cpu") - ) - assert model_bf16_cpu.dtype == torch.bfloat16 - assert ( - next(model_bf16_cpu.parameters()).dtype == torch.bfloat16 - ) # Usually works on CPU - assert next(model_bf16_cpu.parameters()).device.type == "cpu" - except RuntimeError as e: - print(f"Skipping bfloat16 test due to: {e}") - - # Test invalid dtype string - config_invalid = CLTConfig( - d_model=8, num_features=16, num_layers=2, clt_dtype="invalid_dtype" - ) - model_invalid = 
CrossLayerTranscoder(config_invalid) - assert model_invalid.dtype == torch.float32 # Should default - - -def test_clt_resolve_dtype(clt_model): - """Test the _resolve_dtype helper method.""" - assert clt_model._resolve_dtype(None) == torch.float32 - assert clt_model._resolve_dtype("float32") == torch.float32 - assert clt_model._resolve_dtype("float16") == torch.float16 - assert clt_model._resolve_dtype("bfloat16") == torch.bfloat16 - assert clt_model._resolve_dtype(torch.float64) == torch.float64 - # Invalid string defaults to float32 - assert clt_model._resolve_dtype("invalid") == torch.float32 - # Non-dtype attribute defaults to float32 - assert clt_model._resolve_dtype("Linear") == torch.float32 - - -def test_clt_get_preactivations(clt_model, sample_inputs): - """Test getting pre-activations.""" - config = clt_model.config - x = sample_inputs[0] # Shape: [batch_size, seq_len, d_model] - batch_size, seq_len, _ = x.shape - - preact = clt_model.get_preactivations(x, 0) - # Expect reshaped output: [batch_size * seq_len, num_features] - assert preact.shape == (batch_size * seq_len, config.num_features) - - # Test with 2D input [batch_tokens, d_model] - x_2d = x.reshape(-1, config.d_model) - preact_2d = clt_model.get_preactivations(x_2d, 0) - assert preact_2d.shape == (batch_size * seq_len, config.num_features) - assert torch.allclose(preact, preact_2d, atol=1e-6) - - -def test_clt_encode(clt_model, sample_inputs): - """Test the encode method.""" - config = clt_model.config - x = sample_inputs[0] # Shape: [batch_size, seq_len, d_model] - batch_size, seq_len, d_model = x.shape - - encoded = clt_model.encode(x, 0) - # Expect reshaped output: [batch_size * seq_len, num_features] - assert encoded.shape == (batch_size * seq_len, config.num_features) - - # Check if activation applied - preact = clt_model.get_preactivations(x, 0) - if config.activation_fn == "relu": - assert torch.all(encoded >= 0) - assert torch.allclose(encoded, torch.relu(preact)) - elif config.activation_fn == "jumprelu": - # Calculate expected threshold value from log_threshold parameter - threshold_val = torch.exp(clt_model.log_threshold).to( - preact.device, preact.dtype - ) - # Compare element-wise - expected_jumprelu = (preact >= threshold_val).float() * preact - assert torch.allclose(encoded, expected_jumprelu, atol=1e-6) - - -def test_clt_decode(clt_model, sample_inputs): - """Test the decode method.""" - config = clt_model.config - batch_size, seq_len, d_model = sample_inputs[0].shape - num_tokens = batch_size * seq_len - - # Encode produces [num_tokens, num_features] - activations = {i: clt_model.encode(x, i) for i, x in sample_inputs.items()} - - for layer_idx in range(config.num_layers): - reconstruction = clt_model.decode(activations, layer_idx) - # Expect output shape: [num_tokens, d_model] - assert reconstruction.shape == (num_tokens, config.d_model) - - # Check reconstruction calculation (simplified check) - if layer_idx == 1 and 0 in activations and 1 in activations: - # Ensure activations are on the same device as decoders for the check - device = next(clt_model.parameters()).device - act_0 = activations[0].to(device) - act_1 = activations[1].to(device) - - dec_0_1 = clt_model.decoders["0->1"] - dec_1_1 = clt_model.decoders["1->1"] - expected_rec_1 = dec_0_1(act_0) + dec_1_1(act_1) - # Reconstruction should also be on the same device - assert torch.allclose(reconstruction.to(device), expected_rec_1, atol=1e-5) - - -def test_clt_forward(clt_model, sample_inputs): - """Test the forward pass of the CLT.""" - config 
= clt_model.config - reconstructions = clt_model(sample_inputs) - - assert isinstance(reconstructions, dict) - # Forward should produce outputs for all layers up to num_layers - assert len(reconstructions) == config.num_layers - # Check that keys are 0 to num_layers-1 - assert all(idx in reconstructions for idx in range(config.num_layers)) - - for layer_idx, recon in reconstructions.items(): - # Get original input shape for comparison - batch_size, seq_len, d_model = sample_inputs[layer_idx].shape - num_tokens = batch_size * seq_len - # Expect output shape: [num_tokens, d_model] - assert recon.shape == (num_tokens, config.d_model) - - # Check if forward pass output matches manual encode/decode - # Need to handle device consistency - device = next(clt_model.parameters()).device - activations = { - i: clt_model.encode(x.to(device), i) - for i, x in sample_inputs.items() - if i <= layer_idx - } - # Filter only relevant activations for decode - relevant_activations = {k: v for k, v in activations.items() if k <= layer_idx} - if relevant_activations: # Only decode if there are relevant activations - expected_recon = clt_model.decode(relevant_activations, layer_idx) - assert torch.allclose(recon.to(device), expected_recon, atol=1e-5) - - -def test_clt_get_feature_activations(clt_model, sample_inputs): - """Test getting all feature activations.""" - config = clt_model.config - activations = clt_model.get_feature_activations(sample_inputs) - - assert isinstance(activations, dict) - assert len(activations) == len(sample_inputs) - assert all(idx in activations for idx in sample_inputs.keys()) - - for layer_idx, act in activations.items(): - batch_size, seq_len, _ = sample_inputs[layer_idx].shape - num_tokens = batch_size * seq_len - # Expect shape: [num_tokens, num_features] - assert act.shape == (num_tokens, config.num_features) - # Check if it matches encode output - expected_act = clt_model.encode(sample_inputs[layer_idx], layer_idx) - assert torch.allclose(act, expected_act, atol=1e-6) - - -def test_clt_get_decoder_norms(clt_model): - """Test calculation of decoder norms.""" - config = clt_model.config - decoder_norms = clt_model.get_decoder_norms() - - assert decoder_norms.shape == (config.num_layers, config.num_features) - assert not torch.any(torch.isnan(decoder_norms)) - assert torch.all(decoder_norms >= 0) - - # Manual check for one feature (e.g., feature 0) at layer 0 - expected_norm_0_0_sq = 0 - device = next(clt_model.parameters()).device - dtype = next(clt_model.parameters()).dtype - for tgt_layer in range(config.num_layers): # From src_layer=0 - decoder = clt_model.decoders[f"0->{tgt_layer}"] - # Norm of the first column (feature 0) - ensure calculation is on correct device/dtype - weight_col_0 = decoder.weight[:, 0].to( - device=device, dtype=torch.float32 - ) # Use float32 for stable norm calc - expected_norm_0_0_sq += torch.norm(weight_col_0, p=2).pow(2) - - expected_norm_0_0 = torch.sqrt(expected_norm_0_0_sq).to( - dtype=dtype - ) # Cast back to model dtype - # Use slightly higher tolerance due to potential dtype conversions - assert torch.allclose(decoder_norms[0, 0].to(device), expected_norm_0_0, atol=1e-5) - - -# --- Fixtures for GPT-2 Small Size Test Case --- - - -@pytest.fixture -def clt_config_gpt2_small(): - # Parameters similar to GPT-2 Small - return CLTConfig( - d_model=768, - num_features=3072, # d_model * 4 - num_layers=12, - activation_fn="relu", # Keep it simple for size test - clt_dtype="float32", # Explicitly set for test - ) - - -@pytest.fixture -def 
clt_model_gpt2_small(clt_config_gpt2_small): - # Run on CPU by default for large model test unless CUDA available - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - return CrossLayerTranscoder(clt_config_gpt2_small, device=device) - - -@pytest.fixture -def sample_inputs_gpt2_small(clt_config_gpt2_small): - # Smaller batch/seq_len for faster test with large d_model - batch_size = 1 - seq_len = 4 - config = clt_config_gpt2_small - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - inputs = { - i: torch.randn( - batch_size, seq_len, config.d_model, device=device, dtype=torch.float32 - ) - for i in range(config.num_layers) - } - return inputs - - -# --- Tests Duplicated for GPT-2 Small Size --- -# (Using the specific gpt2_small fixtures) - - -def test_clt_init_gpt2_small(clt_model_gpt2_small): - """Test CLT initialization with GPT-2 small dimensions.""" - clt_model = clt_model_gpt2_small - config = clt_model.config - assert isinstance(clt_model, nn.Module) - assert len(clt_model.encoders) == config.num_layers - assert all(enc.in_features == config.d_model for enc in clt_model.encoders) - assert all(enc.out_features == config.num_features for enc in clt_model.encoders) - - num_expected_decoders = config.num_layers * (config.num_layers + 1) // 2 - assert len(clt_model.decoders) == num_expected_decoders - for src in range(config.num_layers): - for tgt in range(src, config.num_layers): - key = f"{src}->{tgt}" - assert key in clt_model.decoders - dec = clt_model.decoders[key] - assert dec.in_features == config.num_features - assert dec.out_features == config.d_model - - # Calculate and print memory footprint - total_params = sum(p.numel() for p in clt_model.parameters() if p.requires_grad) - dtype_size = torch.finfo(clt_model.dtype).bits // 8 - estimated_memory_bytes = total_params * dtype_size - estimated_memory_mb = estimated_memory_bytes / (1024 * 1024) - print( - f"\n[Memory Footprint Info for GPT-2 Small Size " - f"({clt_model.config.activation_fn} CLT ({clt_model.dtype}) on {clt_model.device})]" - ) - print(f" - Total Trainable Parameters: {total_params:,}") - print(f" - Estimated Memory (MB): {estimated_memory_mb:.2f} MB") - - -def test_clt_get_preactivations_gpt2_small( - clt_model_gpt2_small, sample_inputs_gpt2_small -): - """Test getting pre-activations with GPT-2 small dimensions.""" - clt_model = clt_model_gpt2_small - sample_inputs = sample_inputs_gpt2_small - config = clt_model.config - x = sample_inputs[0] - batch_size, seq_len, _ = x.shape - num_tokens = batch_size * seq_len - preact = clt_model.get_preactivations(x, 0) - assert preact.shape == (num_tokens, config.num_features) - - -def test_clt_encode_gpt2_small(clt_model_gpt2_small, sample_inputs_gpt2_small): - """Test the encode method with GPT-2 small dimensions.""" - clt_model = clt_model_gpt2_small - sample_inputs = sample_inputs_gpt2_small - config = clt_model.config - x = sample_inputs[0] - batch_size, seq_len, _ = x.shape - num_tokens = batch_size * seq_len - encoded = clt_model.encode(x, 0) - assert encoded.shape == (num_tokens, config.num_features) - # Only checking ReLU here as per fixture config - assert torch.all(encoded >= 0) - - -def test_clt_decode_gpt2_small(clt_model_gpt2_small, sample_inputs_gpt2_small): - """Test the decode method with GPT-2 small dimensions.""" - clt_model = clt_model_gpt2_small - sample_inputs = sample_inputs_gpt2_small - config = clt_model.config - batch_size, seq_len, _ = sample_inputs[0].shape - num_tokens = batch_size * seq_len - - # Encode 
inputs first - activations = { - i: clt_model.encode(sample_inputs[i], i) for i in range(config.num_layers) - } - - for layer_idx in range(config.num_layers): - reconstruction = clt_model.decode(activations, layer_idx) - assert reconstruction.shape == (num_tokens, config.d_model) - # Skip detailed calculation check for large model test - - -def test_clt_forward_gpt2_small(clt_model_gpt2_small, sample_inputs_gpt2_small): - """Test the forward pass of the CLT with GPT-2 small dimensions.""" - clt_model = clt_model_gpt2_small - sample_inputs = sample_inputs_gpt2_small - config = clt_model.config - reconstructions = clt_model(sample_inputs) - - assert isinstance(reconstructions, dict) - assert len(reconstructions) == config.num_layers - assert all(idx in reconstructions for idx in range(config.num_layers)) - - for layer_idx, recon in reconstructions.items(): - batch_size, seq_len, _ = sample_inputs[layer_idx].shape - num_tokens = batch_size * seq_len - assert recon.shape == (num_tokens, config.d_model) - # Skip detailed calculation check for large model test - - -def test_clt_get_feature_activations_gpt2_small( - clt_model_gpt2_small, sample_inputs_gpt2_small -): - """Test getting all feature activations with GPT-2 small dimensions.""" - clt_model = clt_model_gpt2_small - sample_inputs = sample_inputs_gpt2_small - config = clt_model.config - activations = clt_model.get_feature_activations(sample_inputs) - - assert isinstance(activations, dict) - assert len(activations) == config.num_layers - assert all(idx in activations for idx in range(config.num_layers)) - - for layer_idx, act in activations.items(): - batch_size, seq_len, _ = sample_inputs[layer_idx].shape - num_tokens = batch_size * seq_len - assert act.shape == (num_tokens, config.num_features) - # Skip detailed calculation check - - -def test_clt_get_decoder_norms_gpt2_small(clt_model_gpt2_small): - """Test calculation of decoder norms with GPT-2 small dimensions.""" - clt_model = clt_model_gpt2_small - config = clt_model.config - decoder_norms = clt_model.get_decoder_norms() - - assert decoder_norms.shape == (config.num_layers, config.num_features) - assert not torch.any(torch.isnan(decoder_norms)) - assert torch.all(decoder_norms >= 0) - # Skip detailed calculation check for large model test - - -# Note: Ensure the actual CLTConfig class aligns with usage here. 
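Note: the deleted tests/unit/models/test_clt.py above verified decoder norms by accumulating, for each feature, the squared L2 norm of its decoder column over every target layer it feeds and then taking the square root. The following is a minimal sketch of that per-feature check, assuming a decoders mapping of nn.Linear modules keyed as "src->tgt" as in the deleted file; the helper name and arguments are illustrative.

import torch

def expected_decoder_norm(decoders, num_layers, src_layer, feature_idx):
    # Sum the squared column norms of this feature across all decoders
    # originating at src_layer (targets tgt >= src_layer), then take sqrt.
    norm_sq = torch.zeros((), dtype=torch.float32)
    for tgt in range(src_layer, num_layers):
        weight = decoders[f"{src_layer}->{tgt}"].weight  # shape [d_model, num_features]
        norm_sq = norm_sq + weight[:, feature_idx].float().pow(2).sum()
    return norm_sq.sqrt()

This mirrors the manual check performed for feature 0 at source layer 0 in the deleted test_clt_get_decoder_norms.
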
diff --git a/tests/unit/test_activation_registry.py b/tests/unit/test_activation_registry.py new file mode 100644 index 0000000..f710346 --- /dev/null +++ b/tests/unit/test_activation_registry.py @@ -0,0 +1,204 @@ +import torch +import pytest +import torch.nn.functional as F +from unittest.mock import MagicMock, patch +import logging + +from clt.activations.registry import ( + ACTIVATION_REGISTRY, + register_activation_fn, + get_activation_fn, + relu_activation, + jumprelu_activation, + batchtopk_per_layer_activation, + topk_per_layer_activation, +) +from clt.config import CLTConfig + + +# Helper to clear registry for isolated tests +@pytest.fixture(autouse=True) +def clear_registry_for_test(): + original_registry_items = list(ACTIVATION_REGISTRY.items()) + ACTIVATION_REGISTRY.clear() + # Re-register default ones that are imported directly in the test module + # This ensures they are available for tests that might use get_activation_fn implicitly + ACTIVATION_REGISTRY["relu"] = relu_activation + ACTIVATION_REGISTRY["jumprelu"] = jumprelu_activation + ACTIVATION_REGISTRY["batchtopk"] = batchtopk_per_layer_activation + ACTIVATION_REGISTRY["topk"] = topk_per_layer_activation + yield + ACTIVATION_REGISTRY.clear() + for name, fn in original_registry_items: + ACTIVATION_REGISTRY[name] = fn + + +def test_register_activation_fn(caplog): + @register_activation_fn("test_act") + def dummy_activation(model, preact, layer_idx): + return preact * 2 + + assert "test_act" in ACTIVATION_REGISTRY + assert ACTIVATION_REGISTRY["test_act"] == dummy_activation + + # Test overwriting (check for log message) + # Clear previous logs if any for this test + caplog.clear() + with caplog.at_level(logging.WARNING): + + @register_activation_fn("test_act") + def dummy_activation_overwrite(model, preact, layer_idx): + return preact * 3 + + assert ACTIVATION_REGISTRY["test_act"] == dummy_activation_overwrite + + assert len(caplog.records) == 1 + record = caplog.records[0] + assert record.levelname == "WARNING" + assert "Activation function 'test_act' is already registered. Overwriting." 
in record.message + + +def test_get_activation_fn_success(): + @register_activation_fn("my_retrieved_act") + def another_dummy(model, preact, layer_idx): + return preact + + retrieved_fn = get_activation_fn("my_retrieved_act") + assert retrieved_fn == another_dummy + + +def test_get_activation_fn_failure(): + with pytest.raises(ValueError, match="Activation function 'non_existent_act' not found in registry."): + get_activation_fn("non_existent_act") + + +# --- Test Individual Registered Functions --- + + +@pytest.fixture +def mock_model() -> MagicMock: + mock = MagicMock() + mock.config = CLTConfig(d_model=16, num_features=32, num_layers=2, activation_fn="relu") # Basic config + mock.rank = 0 + mock.device = torch.device("cpu") + mock.dtype = torch.float32 + return mock + + +def test_relu_activation_registered(mock_model): + preact = torch.tensor([-1.0, 0.0, 1.0, 2.0]) + expected_output = F.relu(preact) + + fn = get_activation_fn("relu") + output = fn(mock_model, preact, 0) + assert torch.equal(output, expected_output) + + +def test_jumprelu_activation_registered(mock_model): + preact = torch.tensor([-1.0, 1.0, 2.0]) + layer_idx = 0 + mock_model.jumprelu = MagicMock(return_value=torch.tensor([0.0, 1.0, 2.0])) # Simulate model's jumprelu + + fn = get_activation_fn("jumprelu") + output = fn(mock_model, preact, layer_idx) + + mock_model.jumprelu.assert_called_once_with(preact, layer_idx) + assert torch.equal(output, mock_model.jumprelu.return_value) + + +@patch("clt.models.activations.BatchTopK") +def test_batchtopk_per_layer_activation_registered(MockBatchTopK, mock_model, caplog): + preact = torch.tensor([[1.0, 2.0, 3.0, 4.0]]) + layer_idx = 0 + mock_model.config.activation_fn = "batchtopk" + mock_model.config.batchtopk_k = 2 + mock_model.config.batchtopk_straight_through = True + + # Mock the .apply method of the BatchTopK class + mock_batchtopk_apply_return = torch.tensor([[0.0, 0.0, 3.0, 4.0]]) + MockBatchTopK.apply = MagicMock(return_value=mock_batchtopk_apply_return) + + fn = get_activation_fn("batchtopk") + caplog.clear() + with caplog.at_level(logging.WARNING): + output = fn(mock_model, preact, layer_idx) + + MockBatchTopK.apply.assert_called_once_with( + preact, float(mock_model.config.batchtopk_k), mock_model.config.batchtopk_straight_through, preact + ) + assert torch.equal(output, mock_batchtopk_apply_return) + assert any("This applies TopK per-layer, not globally." in record.message for record in caplog.records) + + +@patch("clt.models.activations.BatchTopK") +def test_batchtopk_per_layer_activation_k_none(MockBatchTopK, mock_model, caplog): + preact = torch.tensor([[1.0, 2.0, 3.0, 4.0]]) + layer_idx = 0 + mock_model.config.activation_fn = "batchtopk" + mock_model.config.batchtopk_k = None # Test k=None case + mock_model.config.batchtopk_straight_through = False + + mock_batchtopk_apply_return = preact.clone() # Should return all if k is effectively num_features + MockBatchTopK.apply = MagicMock(return_value=mock_batchtopk_apply_return) + + fn = get_activation_fn("batchtopk") + caplog.clear() + with caplog.at_level(logging.WARNING): + output = fn(mock_model, preact, layer_idx) + + assert any("batchtopk_k not set in config" in record.message for record in caplog.records) + assert any("This applies TopK per-layer, not globally." 
in record.message for record in caplog.records) + # k should default to preact.size(1) + MockBatchTopK.apply.assert_called_once_with( + preact, float(preact.size(1)), mock_model.config.batchtopk_straight_through, preact + ) + assert torch.equal(output, mock_batchtopk_apply_return) + + +@patch("clt.models.activations.TokenTopK") +def test_topk_per_layer_activation_registered(MockTokenTopK, mock_model, caplog): + preact = torch.tensor([[1.0, 2.0, 3.0, 4.0]]) + layer_idx = 0 + mock_model.config.activation_fn = "topk" + mock_model.config.topk_k = 0.5 # Example fraction + mock_model.config.topk_straight_through = True # Example + + mock_tokentopk_apply_return = torch.tensor([[0.0, 0.0, 3.0, 4.0]]) # Dummy return + MockTokenTopK.apply = MagicMock(return_value=mock_tokentopk_apply_return) + + fn = get_activation_fn("topk") + caplog.clear() + with caplog.at_level(logging.WARNING): + output = fn(mock_model, preact, layer_idx) + + MockTokenTopK.apply.assert_called_once_with( + preact, float(mock_model.config.topk_k), mock_model.config.topk_straight_through, preact + ) + assert torch.equal(output, mock_tokentopk_apply_return) + assert any("This applies TopK per-layer, not globally." in record.message for record in caplog.records) + + +@patch("clt.models.activations.TokenTopK") +def test_topk_per_layer_activation_k_none(MockTokenTopK, mock_model, caplog): + preact = torch.tensor([[1.0, 2.0, 3.0, 4.0]]) + layer_idx = 0 + mock_model.config.activation_fn = "topk" + # Simulate topk_k not being in config + if hasattr(mock_model.config, "topk_k"): + delattr(mock_model.config, "topk_k") + mock_model.config.topk_straight_through = False + + mock_tokentopk_apply_return = preact.clone() + MockTokenTopK.apply = MagicMock(return_value=mock_tokentopk_apply_return) + + fn = get_activation_fn("topk") + caplog.clear() + with caplog.at_level(logging.WARNING): + output = fn(mock_model, preact, layer_idx) + + assert any("topk_k not set in config" in record.message for record in caplog.records) + assert any("This applies TopK per-layer, not globally." in record.message for record in caplog.records) + MockTokenTopK.apply.assert_called_once_with( + preact, float(preact.size(1)), mock_model.config.topk_straight_through, preact + ) + assert torch.equal(output, mock_tokentopk_apply_return) diff --git a/tests/unit/test_activations.py b/tests/unit/test_activations.py new file mode 100644 index 0000000..fff1be6 --- /dev/null +++ b/tests/unit/test_activations.py @@ -0,0 +1,429 @@ +import torch +import pytest +from typing import cast + +from clt.models.activations import BatchTopK, TokenTopK, JumpReLU + +# --- BatchTopK Tests --- + + +def test_batchtopk_compute_mask_basic(): + x = torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 1.0, 2.5, 3.5]]) # Batch 2, Features 4 + k_per_token = 2 + mask = BatchTopK._compute_mask(x, k_per_token) + assert mask.shape == x.shape + assert mask.dtype == torch.bool + assert mask.sum().item() == k_per_token * x.size(0) # k_total_batch = 2 * 2 = 4 + + # Check if top k elements per batch are selected (overall) + # Expected: 4.0, 3.0 from row 1; 5.0, 3.5 from row 2. 
+ # Mask should pick [F, T, T, T] for row 1 (if 3,4 selected) and [T, F, F, T] for row 2 (if 5, 3.5 selected) + # Flattened x: [1,2,3,4,5,1,2.5,3.5], top 4: [4,5,3.5,3] + # Indices: 3, 4, 7, 2 + expected_mask_flat = torch.zeros_like(x.view(-1), dtype=torch.bool) + expected_mask_flat[torch.tensor([3, 4, 7, 2])] = True + assert torch.equal(mask.view(-1), expected_mask_flat) + + +def test_batchtopk_compute_mask_k_zero(): + x = torch.randn(2, 4) + k_per_token = 0 + mask = BatchTopK._compute_mask(x, k_per_token) + assert mask.shape == x.shape + assert mask.dtype == torch.bool + assert mask.sum().item() == 0 + + +def test_batchtopk_compute_mask_k_full(): + x = torch.randn(2, 4) + k_per_token = 4 + mask = BatchTopK._compute_mask(x, k_per_token) + assert mask.sum().item() == x.numel() + assert torch.all(mask) + + +def test_batchtopk_compute_mask_k_more_than_features(): + x = torch.randn(2, 4) + k_per_token = 5 + mask = BatchTopK._compute_mask(x, k_per_token) # k_total_batch = min(5*2, 2*4) = 8 + assert mask.sum().item() == x.numel() + assert torch.all(mask) + + +def test_batchtopk_compute_mask_empty_input(): + x = torch.empty(0, 4) + k_per_token = 2 + mask = BatchTopK._compute_mask(x, k_per_token) + assert mask.shape == x.shape + assert mask.sum().item() == 0 + + x2 = torch.empty(2, 0) + mask2 = BatchTopK._compute_mask(x2, k_per_token) + assert mask2.shape == x2.shape + assert mask2.sum().item() == 0 + + +def test_batchtopk_forward_basic(): + x = torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 1.0, 2.5, 3.5]], dtype=torch.float32, requires_grad=True) + k_float = 2.0 # Treated as int for k_per_token + straight_through = True + + output = cast(torch.Tensor, BatchTopK.apply(x, k_float, straight_through, None)) + + assert output.shape == x.shape + assert output.dtype == x.dtype + # Based on test_batchtopk_compute_mask_basic: values 1.0 and 2.5 should be zeroed + # x_flat indices for 1.0 and 2.5 are 0 and 6 + # mask_flat indices 3,4,7,2 -> original values are 4,5,3.5,3 + # Values at indices 0, 1, 5, 6 should be zeroed + # Original x: [[1,2,3,4], [5,1,2.5,3.5]] + # Mask: [[F,T,T,T], [T,F,F,T]] if x used for ranking (no, this is batch not token) + # Mask (global): mask_flat[3]=T(4.0), mask_flat[4]=T(5.0), mask_flat[7]=T(3.5), mask_flat[2]=T(3.0) + # Output should be: [[0,0,3,4], [5,0,0,3.5]] + # expected_output = torch.tensor([[0.0, 0.0, 3.0, 4.0], [5.0, 0.0, 0.0, 3.5]], dtype=torch.float32) + # Rerun logic from _compute_mask to confirm mask for this x and k=2 + true_mask = torch.zeros_like(x, dtype=torch.bool) + true_mask_flat = true_mask.view(-1) + true_mask_flat[torch.tensor([4, 3, 7, 2])] = True # Corresponds to 5,4,3.5,3 + + assert torch.allclose(output, x * true_mask.to(x.dtype)) + + +def test_batchtopk_forward_with_x_for_ranking(): + x = torch.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 0.0, 0.0, 0.0]], dtype=torch.float32, requires_grad=True) + x_for_ranking = torch.tensor([[0.0, 0.0, 0.0, 0.0], [5.0, 6.0, 7.0, 8.0]], dtype=torch.float32) + k_float = 2.0 + straight_through = True + # Ranking based on x_for_ranking, k=2*2=4. Should pick 8,7,6,5 (all from second row of x_for_ranking) + # This means mask is all False for first row of x, all True for second row of x. + # Output should be: [[0,0,0,0],[0,0,0,0]] because x_for_ranking makes mask select 2nd row of x, which is all 0 + # Correction: the mask applies to x. So if x_for_ranking leads to selecting indices in the second row, + # then the *values* from the second row of *x* should be preserved. + # x_for_ranking flat = [0,0,0,0,5,6,7,8]. Top 4: 8,7,6,5. 
Indices: 7,6,5,4 + # Mask applied to x: x elements at these flat indices are kept. + # x_flat = [1,2,3,4,0,0,0,0]. + # Mask: [[F,F,F,F], [T,T,T,T]] + # Output: [[0,0,0,0], [0,0,0,0]] - This is correct. + + output = cast(torch.Tensor, BatchTopK.apply(x, k_float, straight_through, x_for_ranking)) + expected_mask = torch.tensor([[False, False, False, False], [True, True, True, True]]) + expected_output = x * expected_mask.to(x.dtype) + assert torch.allclose(output, expected_output) + + +def test_batchtopk_backward_ste(): + x = torch.randn(2, 4, requires_grad=True) + k_float = 2.0 + straight_through = True + + # Forward pass to save context + output = cast(torch.Tensor, BatchTopK.apply(x, k_float, straight_through, None)) + + # Dummy gradient from subsequent layer + grad_output = torch.randn_like(output) + + # Backward pass + output.backward(grad_output) + + # Check gradients + assert x.grad is not None + + # Expected gradient for STE: grad_output * mask + # Recompute mask + mask = BatchTopK._compute_mask(x.data, int(k_float), None) + expected_grad_x = grad_output * mask.to(x.dtype) + + assert torch.allclose(x.grad, expected_grad_x) + + +def test_batchtopk_backward_non_ste_placeholder(): + # Currently, non-STE backward behaves like STE. This test reflects that. + x = torch.randn(2, 4, requires_grad=True) + k_float = 2.0 + straight_through = False # Non-STE + + output = cast(torch.Tensor, BatchTopK.apply(x, k_float, straight_through, None)) + grad_output = torch.randn_like(output) + output.backward(grad_output) + + assert x.grad is not None + mask = BatchTopK._compute_mask(x.data, int(k_float), None) + expected_grad_x = grad_output * mask.to(x.dtype) # Same as STE + assert torch.allclose(x.grad, expected_grad_x) + + +# --- TokenTopK Tests --- + + +def test_tokentopk_compute_mask_basic_fraction_k(): + x = torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 1.0, 2.5, 3.5]]) # Batch 2, Features 4 + k_float = 0.5 # Keep 50% of features per token (0.5 * 4 = 2) + mask = TokenTopK._compute_mask(x, k_float) + assert mask.shape == x.shape + assert mask.dtype == torch.bool + assert mask.sum().item() == 2 * x.size(0) # 2 features per token * 2 tokens = 4 + + # Row 0: [1,2,3,4], top 2 are 3,4. Mask: [F,F,T,T] + # Row 1: [5,1,2.5,3.5], top 2 are 5,3.5. Mask: [T,F,F,T] + expected_mask = torch.tensor([[False, False, True, True], [True, False, False, True]]) + assert torch.equal(mask, expected_mask) + + +def test_tokentopk_compute_mask_basic_int_k(): + x = torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 1.0, 2.5, 3.5]]) + k_int = 1 # Keep 1 feature per token + mask = TokenTopK._compute_mask(x, float(k_int)) + assert mask.shape == x.shape + assert mask.dtype == torch.bool + assert mask.sum().item() == k_int * x.size(0) + + # Row 0: [1,2,3,4], top 1 is 4. Mask: [F,F,F,T] + # Row 1: [5,1,2.5,3.5], top 1 is 5. 
Mask: [T,F,F,F] + expected_mask = torch.tensor([[False, False, False, True], [True, False, False, False]]) + assert torch.equal(mask, expected_mask) + + +def test_tokentopk_compute_mask_k_zero(): + x = torch.randn(2, 4) + k_float = 0.0 + mask = TokenTopK._compute_mask(x, k_float) + assert mask.shape == x.shape + assert mask.dtype == torch.bool + assert mask.sum().item() == 0 + + +def test_tokentopk_compute_mask_k_negative(): + x = torch.randn(2, 4) + k_float = -0.5 + mask = TokenTopK._compute_mask(x, k_float) + assert mask.sum().item() == 0 + + +def test_tokentopk_compute_mask_k_fraction_ceil(): + # k_float * F_total = 0.6 * 5 = 3.0, ceil(3.0) = 3 + x = torch.randn(2, 5) + k_float = 0.6 + mask = TokenTopK._compute_mask(x, k_float) + assert mask.sum(dim=1).tolist() == [3, 3] + + # k_float * F_total = 0.5 * 5 = 2.5, ceil(2.5) = 3 + x2 = torch.randn(2, 5) + k_float2 = 0.5 + mask2 = TokenTopK._compute_mask(x2, k_float2) + assert mask2.sum(dim=1).tolist() == [3, 3] + + +def test_tokentopk_compute_mask_k_full_fraction(): + x = torch.randn(2, 4) + k_float = 1.0 # According to TokenTopK logic, this means k_per_token = int(1.0) = 1 + mask = TokenTopK._compute_mask(x, k_float) + # Expected sum is 1 (k_per_token) * 2 (num_tokens) = 2 + assert mask.sum().item() == 1 * x.size(0) + # This assert torch.all(mask) will now fail, as only 1 element per row is true. + # We should check that each row sums to 1. + assert torch.all(mask.sum(dim=1) == 1) + + +def test_tokentopk_compute_mask_k_full_int(): + x = torch.randn(2, 4) + k_int = 4 + mask = TokenTopK._compute_mask(x, float(k_int)) + assert mask.sum().item() == x.numel() + assert torch.all(mask) + + +def test_tokentopk_compute_mask_k_more_than_features_int(): + x = torch.randn(2, 4) + k_int = 5 + mask = TokenTopK._compute_mask(x, float(k_int)) + assert mask.sum().item() == x.numel() + assert torch.all(mask) + + +def test_tokentopk_compute_mask_empty_input(): + x = torch.empty(0, 4) + k_float = 0.5 + mask = TokenTopK._compute_mask(x, k_float) + assert mask.shape == x.shape + assert mask.sum().item() == 0 + + x2 = torch.empty(2, 0) + mask2 = TokenTopK._compute_mask(x2, k_float) + assert mask2.shape == x2.shape + assert mask2.sum().item() == 0 + + +def test_tokentopk_forward_basic(): + x = torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 1.0, 2.5, 3.5]], dtype=torch.float32, requires_grad=True) + k_float = 0.5 # keep 2 per token + straight_through = True + + output = cast(torch.Tensor, TokenTopK.apply(x, k_float, straight_through, None)) + + assert output.shape == x.shape + assert output.dtype == x.dtype + expected_mask = torch.tensor([[False, False, True, True], [True, False, False, True]]) + expected_output = x * expected_mask.to(x.dtype) + assert torch.allclose(output, expected_output) + + +def test_tokentopk_forward_with_x_for_ranking(): + x = torch.tensor([[1.0, 2.0, 3.0, 4.0], [10.0, 10.0, 10.0, 10.0]], dtype=torch.float32, requires_grad=True) + x_for_ranking = torch.tensor( + [[4.0, 3.0, 2.0, 1.0], [1.0, 2.0, 3.0, 4.0]], dtype=torch.float32 + ) # Ranks inverted for row 0, normal for row 1 + k_float = 0.5 # keep 2 + straight_through = True + + # Row 0 x_for_ranking: top 2 are 4,3 (indices 0,1). Mask on x: [T,T,F,F]. Output from x: [1,2,0,0] + # Row 1 x_for_ranking: top 2 are 3,4 (indices 2,3). Mask on x: [F,F,T,T]. 
Output from x: [0,0,10,10] + output = cast(torch.Tensor, TokenTopK.apply(x, k_float, straight_through, x_for_ranking)) + expected_mask = torch.tensor([[True, True, False, False], [False, False, True, True]]) + expected_output = x * expected_mask.to(x.dtype) + assert torch.allclose(output, expected_output) + + +def test_tokentopk_backward_ste(): + x = torch.randn(2, 4, requires_grad=True) + k_float = 0.5 + straight_through = True + + output = cast(torch.Tensor, TokenTopK.apply(x, k_float, straight_through, None)) + grad_output = torch.randn_like(output) + output.backward(grad_output) + + assert x.grad is not None + mask = TokenTopK._compute_mask(x.data, k_float, None) + expected_grad_x = grad_output * mask.to(x.dtype) + assert torch.allclose(x.grad, expected_grad_x) + + +# --- JumpReLU Tests --- + + +@pytest.mark.parametrize("bandwidth", [0.5, 1.0, 2.0]) +def test_jumprelu_forward(bandwidth): + input_tensor = torch.tensor([-2.0, -1.0, -0.4, 0.0, 0.4, 1.0, 2.0], requires_grad=True) + threshold = torch.tensor([0.5]) # Threshold is 0.5 + + # Expected: input values >= 0.5 pass, others zeroed. + # [-2, -1, -0.4, 0, 0.4, 1, 2] with threshold 0.5 -> [0,0,0,0,0,1,2] + expected_output = torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0]) + + output = cast(torch.Tensor, JumpReLU.apply(input_tensor, threshold, bandwidth)) + assert torch.allclose(output, expected_output) + assert output.dtype == input_tensor.dtype + + +@pytest.mark.parametrize("bandwidth", [1.0, 2.0]) # Bandwidth must be > 0 +@pytest.mark.parametrize( + "input_val, threshold_val, expected_grad_input_factor, expected_grad_thresh_factor", + [ + # Case 1: input < threshold, outside bandwidth/2 on the left + (0.0, 1.0, 0.0, 0.0), # grad_input is STE (0), grad_thresh is 0 + # Case 2: input < threshold, inside bandwidth/2 on the left (input - threshold = -0.2, abs = 0.2 <= bandwidth/2) + (0.8, 1.0, 0.0, -0.8), # grad_input is STE (0), grad_thresh is -input/bandwidth (if bandwidth=1, -0.8/1 = -0.8) + # Case 3: input == threshold (center of bandwidth) + (1.0, 1.0, 1.0, -1.0), # grad_input is STE (1), grad_thresh is -input/bandwidth (if bandwidth=1, -1.0/1 = -1.0) + # Case 4: input > threshold, inside bandwidth/2 on the right (input - threshold = 0.2, abs = 0.2 <= bandwidth/2) + (1.2, 1.0, 1.0, -1.2), # grad_input is STE (1), grad_thresh is -input/bandwidth (if bandwidth=1, -1.2/1 = -1.2) + # Case 5: input > threshold, outside bandwidth/2 on the right + (2.0, 1.0, 1.0, 0.0), # grad_input is STE (1), grad_thresh is 0 + # Case 6: Another example for input < threshold, inside bandwidth + (-0.2, 0.0, 0.0, 0.2), # input - thresh = -0.2. 
grad_thresh = -(-0.2)/bandwidth = 0.2/bandwidth + # Case 7: input significantly less than threshold + (-5.0, 0.0, 0.0, 0.0), + # Case 8: input significantly greater than threshold + (5.0, 0.0, 1.0, 0.0), + ], +) +def test_jumprelu_backward_detailed( + bandwidth, input_val, threshold_val, expected_grad_input_factor, expected_grad_thresh_factor +): + input_t = torch.tensor([input_val], dtype=torch.float64, requires_grad=True) + # Threshold must also require grad for its grad to be computed and non-None + threshold_t = torch.tensor([threshold_val], dtype=torch.float64, requires_grad=True) + grad_output = torch.tensor([1.0], dtype=torch.float64) # Assume upstream grad is 1 for simplicity + + # Forward pass + output = cast(torch.Tensor, JumpReLU.apply(input_t, threshold_t, bandwidth)) + # Backward pass + output.backward(grad_output) + + assert input_t.grad is not None + assert threshold_t.grad is not None + + # Check grad_input (STE part) + # ste_mask = (input_t >= threshold_t).float() + # expected_grad_input = grad_output * ste_mask + # Using expected_grad_input_factor which is effectively the ste_mask for grad_output=1 + assert torch.allclose( + input_t.grad, torch.tensor([expected_grad_input_factor * grad_output.item()], dtype=torch.float64) + ), f"Input: {input_val}, Thresh: {threshold_val}, BW: {bandwidth}\nGrad_input: {input_t.grad.item()}, Expected_factor: {expected_grad_input_factor}" + + # Check grad_threshold + # is_near_threshold = torch.abs(input_t - threshold_t) <= (bandwidth / 2.0) + # local_grad_theta = (-input_t / bandwidth) * is_near_threshold.float() + # expected_grad_threshold = grad_output * local_grad_theta + # Using expected_grad_thresh_factor which is effectively local_grad_theta for grad_output=1 + # Note: the formula in JumpReLU.backward is (-input / bandwidth) * is_near_threshold + # So if expected_grad_thresh_factor is -0.8, this implies is_near_threshold=True, and factor = -input_val/bandwidth + # If expected_grad_thresh_factor is 0.0, implies is_near_threshold=False + # let's re-evaluate the factor based on the original formula components: + is_near = abs(input_val - threshold_val) <= (bandwidth / 2.0) + if is_near: + true_expected_factor_for_grad_thresh = -input_val / bandwidth + else: + true_expected_factor_for_grad_thresh = 0.0 + + assert torch.allclose( + threshold_t.grad, torch.tensor([true_expected_factor_for_grad_thresh * grad_output.item()], dtype=torch.float64) + ), f"Input: {input_val}, Thresh: {threshold_val}, BW: {bandwidth}\nGrad_thresh: {threshold_t.grad.item()}, Expected_factor_calc: {true_expected_factor_for_grad_thresh}" + + +def test_jumprelu_backward_grad_flags(): + bandwidth = 1.0 + # Case 1: Only input requires grad + inp1 = torch.tensor([1.5], requires_grad=True) + thr1 = torch.tensor([0.5], requires_grad=False) + out1 = cast(torch.Tensor, JumpReLU.apply(inp1, thr1, bandwidth)) + out1.backward(torch.tensor([1.0])) + assert inp1.grad is not None + assert thr1.grad is None + + # Case 2: Only threshold requires grad + inp2 = torch.tensor([1.5], requires_grad=False) + thr2 = torch.tensor([0.5], requires_grad=True) + out2 = cast(torch.Tensor, JumpReLU.apply(inp2, thr2, bandwidth)) + out2.backward(torch.tensor([1.0])) + assert inp2.grad is None + assert thr2.grad is not None + + # Case 3: Neither requires grad (should not error, grads just won't be populated) + inp3 = torch.tensor([1.5], requires_grad=False) + thr3 = torch.tensor([0.5], requires_grad=False) + # out3 = JumpReLU.apply(inp3, thr3, bandwidth) + # .backward() would error if no input needs 
grad and it's called. + # We are testing autograd.Function behavior, which populates ctx.needs_input_grad + # For this, we simulate the call to backward directly + + class MockContext: + def __init__(self, needs_input_grad_list, saved_tensors_tuple, bw): + self.needs_input_grad = needs_input_grad_list + self.saved_tensors = saved_tensors_tuple + self.bandwidth = bw + + ctx_mock_no_input_grad = MockContext([False, True, False], (inp3.detach(), thr3.detach()), bandwidth) + grad_in_sim, grad_thr_sim, _ = JumpReLU.backward(ctx_mock_no_input_grad, torch.tensor([1.0])) + assert grad_in_sim is None + assert grad_thr_sim is not None # grad_thresh calculation part should still run if thr needs grad + + ctx_mock_no_thresh_grad = MockContext([True, False, False], (inp3.detach(), thr3.detach()), bandwidth) + grad_in_sim2, grad_thr_sim2, _ = JumpReLU.backward(ctx_mock_no_thresh_grad, torch.tensor([1.0])) + assert grad_in_sim2 is not None + assert grad_thr_sim2 is None + + ctx_mock_no_grads_needed = MockContext([False, False, False], (inp3.detach(), thr3.detach()), bandwidth) + grad_in_sim3, grad_thr_sim3, _ = JumpReLU.backward(ctx_mock_no_grads_needed, torch.tensor([1.0])) + assert grad_in_sim3 is None + assert grad_thr_sim3 is None diff --git a/tests/unit/training/test_data.py b/tests/unit/training/test_data.py deleted file mode 100644 index 859cdc3..0000000 --- a/tests/unit/training/test_data.py +++ /dev/null @@ -1,731 +0,0 @@ -import pytest -import torch -import numpy as np -from typing import Dict, Tuple, List, Generator, Union -import time -import sys -from unittest.mock import patch, MagicMock - -# Assuming clt is importable from the test environment -from clt.training.data import ActivationStore, ActivationBatchCLT - - -# --- Test Fixtures --- - -NUM_LAYERS = 2 -D_MODEL = 16 -NUM_GEN_BATCHES = 20 # Number of batches the dummy generator can yield -TOKENS_PER_GEN_BATCH = 128 # Number of tokens in each batch yielded by the generator - - -@pytest.fixture -def dummy_activation_generator() -> Generator[ActivationBatchCLT, None, None]: - """Provides a dummy activation generator for testing.""" - - def _generator(): - for _ in range(NUM_GEN_BATCHES): - inputs_dict: Dict[int, torch.Tensor] = {} - targets_dict: Dict[int, torch.Tensor] = {} - for layer_idx in range(NUM_LAYERS): - # Simulate slightly different data for inputs/targets - inputs_dict[layer_idx] = torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL) - targets_dict[layer_idx] = ( - torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL) * 0.5 + 0.1 - ) - yield inputs_dict, targets_dict - # Small sleep to simulate potential real-world generator delay - # time.sleep(0.001) - - return _generator() # Return the generator iterator - - -@pytest.fixture -def exhausted_generator() -> Generator[ActivationBatchCLT, None, None]: - """Provides a generator that yields nothing.""" - - def _generator(): - if False: # Never yield - yield # pragma: no cover - - return _generator() - - -@pytest.fixture -def inconsistent_d_model_generator() -> Generator[ActivationBatchCLT, None, None]: - """Generator yielding inconsistent d_model.""" - - def _generator(): - # First batch (consistent) - inputs_dict = { - 0: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - 1: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - } - targets_dict = { - 0: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - 1: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - } - yield inputs_dict, targets_dict - # Second batch (inconsistent) - inputs_dict_bad = { - 0: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - 1: 
torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL + 1), - } - targets_dict_bad = { - 0: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - 1: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL + 1), - } - yield inputs_dict_bad, targets_dict_bad - - return _generator() - - -@pytest.fixture -def inconsistent_layers_generator() -> Generator[ActivationBatchCLT, None, None]: - """Generator yielding inconsistent layers.""" - - def _generator(): - # First batch (layers 0, 1) - inputs_dict = { - 0: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - 1: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - } - targets_dict = { - 0: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - 1: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - } - yield inputs_dict, targets_dict - # Second batch (layers 0, 2 - inconsistent) - inputs_dict_bad = { - 0: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - 2: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - } - targets_dict_bad = { - 0: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - 2: torch.randn(TOKENS_PER_GEN_BATCH, D_MODEL), - } - yield inputs_dict_bad, targets_dict_bad - - return _generator() - - -# --- Test Functions --- - - -def test_activation_store_init_basic(dummy_activation_generator): - """Test basic initialization of ActivationStore.""" - store = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=4, - train_batch_size_tokens=64, - normalization_method="none", - device="cpu", # Explicitly use CPU for predictability - ) - - assert store.n_batches_in_buffer == 4 - assert store.train_batch_size_tokens == 64 - assert store.target_buffer_size_tokens == 4 * 64 - assert store.normalization_method == "none" - assert store.device == torch.device("cpu") - assert not store.buffer_initialized - assert not store.generator_exhausted - assert not store.layer_indices # Initialized lazily - assert store.d_model == -1 # Initialized lazily - assert not store.buffered_inputs - assert not store.buffered_targets - assert store.read_indices.shape == (0,) - assert store.total_tokens_yielded_by_generator == 0 - assert store.start_time is not None - - -def test_buffer_metadata_initialization(dummy_activation_generator): - """Test that buffer metadata is initialized correctly on first batch pull.""" - store = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=2, - train_batch_size_tokens=64, - device="cpu", - ) - - # Metadata shouldn't be initialized yet - assert not store.buffer_initialized - assert not store.layer_indices - assert store.d_model == -1 - - # Trigger buffer fill and initialization - store.get_batch() - - assert store.buffer_initialized - assert store.layer_indices == list(range(NUM_LAYERS)) - assert store.d_model == D_MODEL - assert store.dtype == torch.float32 # Default or inferred - assert len(store.buffered_inputs) == NUM_LAYERS - assert len(store.buffered_targets) == NUM_LAYERS - assert all(isinstance(t, torch.Tensor) for t in store.buffered_inputs.values()) - assert all(isinstance(t, torch.Tensor) for t in store.buffered_targets.values()) - # Check buffer size is roughly target size (or less if generator is short) - expected_min_tokens = min( - store.target_buffer_size_tokens, NUM_GEN_BATCHES * TOKENS_PER_GEN_BATCH - ) - # Allow for slight variation depending on when buffer fill stops - assert store.read_indices.shape[0] >= store.train_batch_size_tokens - assert store.read_indices.shape[0] <= expected_min_tokens - - -def test_get_batch_basic(dummy_activation_generator): - """Test fetching a single batch.""" - train_batch_size = 64 - 
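# For context, a minimal sketch of the buffer-size arithmetic the ActivationStore tests
# above rely on. It only restates the fixture constants and the target/available bounds
# that the assertions check; nothing about the store's internal refill logic is assumed.
n_batches_in_buffer_example = 4
train_batch_size_tokens_example = 64
gen_batches_example = 20            # NUM_GEN_BATCHES in the fixtures
tokens_per_gen_batch_example = 128  # TOKENS_PER_GEN_BATCH in the fixtures

target_buffer_tokens = n_batches_in_buffer_example * train_batch_size_tokens_example  # 256
total_available_tokens = gen_batches_example * tokens_per_gen_batch_example           # 2560

# The initial fill is bounded by whichever limit is smaller, and it must hold at least
# one full training batch for get_batch() to succeed.
fill_upper_bound = min(target_buffer_tokens, total_available_tokens)
assert train_batch_size_tokens_example <= fill_upper_bound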
store = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=4, - train_batch_size_tokens=train_batch_size, - device="cpu", - ) - - inputs, targets = store.get_batch() - - assert isinstance(inputs, dict) - assert isinstance(targets, dict) - assert sorted(inputs.keys()) == list(range(NUM_LAYERS)) - assert sorted(targets.keys()) == list(range(NUM_LAYERS)) - - for layer_idx in range(NUM_LAYERS): - assert isinstance(inputs[layer_idx], torch.Tensor) - assert isinstance(targets[layer_idx], torch.Tensor) - assert inputs[layer_idx].shape == (train_batch_size, D_MODEL) - assert targets[layer_idx].shape == (train_batch_size, D_MODEL) - assert inputs[layer_idx].device == store.device - assert targets[layer_idx].device == store.device - assert inputs[layer_idx].dtype == store.dtype - assert targets[layer_idx].dtype == store.dtype - - # Check that some tokens are now marked as read - assert store.read_indices.sum().item() == train_batch_size - - -def test_get_batch_multiple(dummy_activation_generator): - """Test fetching multiple batches.""" - train_batch_size = 64 - n_batches_buffer = 4 - store = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=n_batches_buffer, - train_batch_size_tokens=train_batch_size, - device="cpu", - ) - - num_fetches = 5 - total_expected_tokens = num_fetches * train_batch_size - fetched_token_indices = set() - - # Store original buffer contents after initial fill for comparison - store.get_batch() # Initial fill + first batch - initial_buffer_inputs = {k: v.clone() for k, v in store.buffered_inputs.items()} - initial_read_indices_mask = store.read_indices.clone() - # Need the indices that *were* sampled for the first batch - first_batch_indices = initial_read_indices_mask.nonzero().squeeze().tolist() - if isinstance(first_batch_indices, int): # Handle single index case - first_batch_indices = [first_batch_indices] - fetched_token_indices.update(first_batch_indices) - - for i in range(1, num_fetches): # Fetch remaining batches - # Record buffer state *before* get_batch to find newly sampled indices - buffer_size_before = store.read_indices.shape[0] - read_mask_before = store.read_indices.clone() - - inputs, targets = store.get_batch() - - # Verify batch structure - assert isinstance(inputs, dict) - assert isinstance(targets, dict) - assert len(inputs) == NUM_LAYERS - for layer_idx in range(NUM_LAYERS): - assert inputs[layer_idx].shape == (train_batch_size, D_MODEL) - - # Ensure the correct number of *new* indices were sampled for this batch - # This check is tricky because pruning/refilling happens. A simpler check is - # that the total number of read tokens increases correctly. - # assert len(sampled_indices_in_buffer_before) == train_batch_size - - # Add *original* buffer indices to our set (requires mapping back if pruned) - # This is too complex to track robustly here. Instead, focus on total read count. - - # Check total number of read tokens (might be slightly higher due to pruning timing) - # The number of True values in read_indices might not be exactly total_expected_tokens - # because pruning removes read tokens. - # A better check is the total number of batches retrieved. 
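# For context, the isinstance(first_batch_indices, int) guard above exists because
# Tensor.nonzero().squeeze().tolist() collapses to a bare Python int when exactly one
# element of the mask is set. A small self-contained illustration (plain PyTorch, no
# ActivationStore internals assumed):
import torch

many = torch.tensor([True, False, True, True])
one = torch.tensor([False, True, False, False])

assert many.nonzero().squeeze().tolist() == [0, 2, 3]  # several indices -> a list
assert one.nonzero().squeeze().tolist() == 1           # 0-d tensor -> plain int, hence the guard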
- assert i + 1 == num_fetches # Check we completed the loop - - # Check generator progress - # Formula: ceil(num_fetches * train_batch_size / TOKENS_PER_GEN_BATCH) - expected_gen_batches_pulled = -(-total_expected_tokens // TOKENS_PER_GEN_BATCH) - # Allow for potentially one extra batch pull due to buffer refill logic - assert ( - store.total_tokens_yielded_by_generator / TOKENS_PER_GEN_BATCH - >= expected_gen_batches_pulled - ) - assert ( - store.total_tokens_yielded_by_generator / TOKENS_PER_GEN_BATCH - <= expected_gen_batches_pulled + n_batches_buffer - ) # Max buffer pull - - -def test_buffer_pruning(dummy_activation_generator): - """Test that read tokens are pruned from the buffer.""" - train_batch_size = 64 - n_batches_buffer = 2 # Small buffer to force pruning - store = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=n_batches_buffer, - train_batch_size_tokens=train_batch_size, - device="cpu", - ) - - # Fetch enough batches to ensure some tokens at the start should be pruned - num_fetches = n_batches_buffer + 1 - - store.get_batch() # Initial fill + first batch - size_after_first_batch = store.read_indices.shape[0] - - for _ in range(1, num_fetches): - store.get_batch() - - # After several fetches, the buffer size should ideally not grow indefinitely - # if pruning is working. It might be larger than the initial fill if the - # generator keeps providing data, but shouldn't exceed target + a bit extra. - max_expected_size = ( - store.target_buffer_size_tokens + TOKENS_PER_GEN_BATCH - ) # Target + one generator batch margin - assert store.read_indices.shape[0] <= max_expected_size - - # More specific check: after enough batches, the first few original tokens should be gone. - # This requires tracking original tokens, which is complex. - # Alternative: Check that the number of read tokens (True) doesn't just keep increasing - # towards the total buffer size indefinitely. - num_read = store.read_indices.sum().item() - buffer_size = store.read_indices.shape[0] - # If pruning works, the number read should be less than the current buffer size - # unless *all* tokens currently in the buffer happen to have been read (unlikely). 
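# For context, -(-a // b) in the generator-progress check above is ceiling division on
# non-negative integers; the surrounding two-sided bound simply allows the store to
# prefetch up to one buffer's worth of extra generator batches. A tiny check of the
# idiom with the same fixture numbers (5 fetches of 64 tokens, 128-token generator batches):
import math

total_expected_tokens = 5 * 64
tokens_per_gen_batch_example = 128

ceil_pulls = -(-total_expected_tokens // tokens_per_gen_batch_example)
assert ceil_pulls == math.ceil(total_expected_tokens / tokens_per_gen_batch_example) == 3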
- assert num_read < buffer_size or buffer_size == 0 - - -def test_generator_exhaustion(dummy_activation_generator): - """Test behavior when the generator runs out of data.""" - train_batch_size = 64 - store = ActivationStore( - activation_generator=dummy_activation_generator, # Use the standard one - n_batches_in_buffer=4, - train_batch_size_tokens=train_batch_size, - device="cpu", - ) - - # Calculate how many batches we can possibly get - total_tokens_available = NUM_GEN_BATCHES * TOKENS_PER_GEN_BATCH - max_batches = total_tokens_available // train_batch_size - - # Fetch all possible full batches - num_fetched = 0 - try: - for i in range(max_batches + 5): # Try to fetch more than available - store.get_batch() - num_fetched += 1 - if ( - store.generator_exhausted - and (~store.read_indices).sum().item() < train_batch_size - ): - # If generator done and not enough left for a full batch, break early - break - except StopIteration: - pass # Expected when buffer is empty after generator exhaustion - - # Check that we fetched roughly the maximum number of batches - assert num_fetched >= max_batches - assert num_fetched <= max_batches + 1 # Allow for one partial batch potentially - - # Check that the generator is marked as exhausted - assert store.generator_exhausted - - # Try fetching again, should raise StopIteration - with pytest.raises(StopIteration): - store.get_batch() - - -def test_iterator_protocol(dummy_activation_generator): - """Test that the store can be used as an iterator.""" - train_batch_size = 64 - store = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=4, - train_batch_size_tokens=train_batch_size, - device="cpu", - ) - - num_batches_to_fetch = 5 - count = 0 - for i, (inputs, targets) in enumerate(store): - assert isinstance(inputs, dict) - assert isinstance(targets, dict) - assert len(inputs[0]) == train_batch_size # Check batch size - count += 1 - if count >= num_batches_to_fetch: - break - - assert count == num_batches_to_fetch - - -def test_normalization_estimation(dummy_activation_generator): - """Test the 'estimated_mean_std' normalization.""" - estimation_batches = 5 - store = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=4, - train_batch_size_tokens=64, - normalization_method="estimated_mean_std", - normalization_estimation_batches=estimation_batches, - device="cpu", - ) - - # Stats should be computed during __init__ - assert ( - store.normalization_method == "estimated_mean_std" - ) # Should not change on success - assert len(store.input_means) == NUM_LAYERS - assert len(store.input_stds) == NUM_LAYERS - assert len(store.output_means) == NUM_LAYERS - assert len(store.output_stds) == NUM_LAYERS - - for layer_idx in range(NUM_LAYERS): - assert store.input_means[layer_idx].shape == (1, D_MODEL) - assert store.input_stds[layer_idx].shape == (1, D_MODEL) - assert store.output_means[layer_idx].shape == (1, D_MODEL) - assert store.output_stds[layer_idx].shape == (1, D_MODEL) - # Check stds are positive - assert torch.all(store.input_stds[layer_idx] > 0) - assert torch.all(store.output_stds[layer_idx] > 0) - - # Check that the generator was advanced by the correct number of batches - assert ( - store.total_tokens_yielded_by_generator - == estimation_batches * TOKENS_PER_GEN_BATCH - ) - - # Check that the initial batches used for stats were added back to the buffer - assert store.buffer_initialized - assert store.read_indices.shape[0] >= min( - store.target_buffer_size_tokens, 
estimation_batches * TOKENS_PER_GEN_BATCH - ) - assert (~store.read_indices).sum().item() > 0 # Should have unread tokens - - # Fetch a batch and check if values seem normalized (mean ~0, std ~1) - # Note: This is tricky because we fetch a *random sample* from the buffer, - # which includes normalized data from the estimation phase. - inputs, targets = store.get_batch() - sample_input_mean = inputs[0].mean(dim=0) - sample_input_std = inputs[0].std(dim=0) - sample_target_mean = targets[0].mean(dim=0) - sample_target_std = targets[0].std(dim=0) - - # Due to sampling and the mix of data, might not be exactly 0/1, but should be close - assert torch.allclose(sample_input_mean, torch.zeros(D_MODEL), atol=0.5) - assert torch.allclose(sample_input_std, torch.ones(D_MODEL), atol=1.0) - assert torch.allclose(sample_target_mean, torch.zeros(D_MODEL), atol=0.5) - assert torch.allclose(sample_target_std, torch.ones(D_MODEL), atol=1.0) - - -def test_normalization_estimation_insufficient_data(exhausted_generator): - """Test normalization estimation when generator provides too little data.""" - store = ActivationStore( - activation_generator=exhausted_generator, # Generator yields nothing - n_batches_in_buffer=4, - train_batch_size_tokens=64, - normalization_method="estimated_mean_std", - normalization_estimation_batches=5, - device="cpu", - ) - - # Check that normalization method falls back to 'none' - assert store.normalization_method == "none" - assert not store.input_means # Stats dictionaries should be empty - assert not store.input_stds - assert not store.output_means - assert not store.output_stds - assert store.generator_exhausted # Generator should be marked exhausted - - -def test_denormalization(dummy_activation_generator): - """Test the denormalize_outputs method.""" - estimation_batches = 5 - store = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=4, - train_batch_size_tokens=64, - normalization_method="estimated_mean_std", - normalization_estimation_batches=estimation_batches, - device="cpu", - ) - - # Get original means/stds for comparison - input_means = {k: v.clone() for k, v in store.input_means.items()} - output_means = {k: v.clone() for k, v in store.output_means.items()} - input_stds = {k: v.clone() for k, v in store.input_stds.items()} - output_stds = {k: v.clone() for k, v in store.output_stds.items()} - - # Fetch a normalized batch - inputs_norm, targets_norm = store.get_batch() - - # Denormalize the targets - targets_denorm = store.denormalize_outputs(targets_norm) - - # Check denormalization - for layer_idx in range(NUM_LAYERS): - # Calculate expected denormalized values - expected_denorm = ( - targets_norm[layer_idx] * output_stds[layer_idx] - ) + output_means[layer_idx] - assert torch.allclose(targets_denorm[layer_idx], expected_denorm, atol=1e-6) - - # Test denormalization when method is 'none' - store_no_norm = ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=4, - train_batch_size_tokens=64, - normalization_method="none", - device="cpu", - ) - inputs_no_norm, targets_no_norm = store_no_norm.get_batch() - targets_denorm_noop = store_no_norm.denormalize_outputs(targets_no_norm) - # Should be identical (no-op) - assert torch.equal(targets_denorm_noop[0], targets_no_norm[0]) - assert torch.equal(targets_denorm_noop[1], targets_no_norm[1]) - - -def test_state_dict_and_load(dummy_activation_generator): - """Test saving and loading the store's state.""" - estimation_batches = 3 - store1 = 
ActivationStore( - activation_generator=dummy_activation_generator, - n_batches_in_buffer=4, - train_batch_size_tokens=64, - normalization_method="estimated_mean_std", - normalization_estimation_batches=estimation_batches, - device="cpu", - ) - # Fetch a batch to ensure buffer is initialized etc. - store1.get_batch() - # Fetch a few more to advance the generator state - store1.get_batch() - store1.get_batch() - - state = store1.state_dict() - - # Check state contents - assert "layer_indices" in state - assert "d_model" in state - assert "dtype" in state - assert "input_means" in state - assert "input_stds" in state - assert "output_means" in state - assert "output_stds" in state - assert "total_tokens_yielded_by_generator" in state - assert "target_buffer_size_tokens" in state - assert "normalization_method" in state - - assert state["layer_indices"] == store1.layer_indices - assert state["d_model"] == store1.d_model - assert state["dtype"] == str(store1.dtype) - assert ( - state["total_tokens_yielded_by_generator"] - == store1.total_tokens_yielded_by_generator - ) - assert state["target_buffer_size_tokens"] == store1.target_buffer_size_tokens - assert state["normalization_method"] == store1.normalization_method - - # Check stats are on CPU in state dict - assert state["input_means"][0].device == torch.device("cpu") - - # Create a new store instance (with a fresh generator) - store2 = ActivationStore( - activation_generator=dummy_activation_generator, # Needs a generator, even if state is loaded - n_batches_in_buffer=10, # Different buffer size to check loaded value - train_batch_size_tokens=128, # Different batch size - normalization_method="none", # Different norm method - device="cpu", - ) - - store2.load_state_dict(state) - - # Check loaded state - assert store2.layer_indices == store1.layer_indices - assert store2.d_model == store1.d_model - assert store2.dtype == store1.dtype - assert ( - store2.total_tokens_yielded_by_generator - == store1.total_tokens_yielded_by_generator - ) - # These should come from the state dict, not the new __init__ args - assert store2.target_buffer_size_tokens == store1.target_buffer_size_tokens - assert store2.normalization_method == store1.normalization_method - - # Check stats are loaded correctly and moved to the store's device - assert torch.equal(store2.input_means[0], store1.input_means[0]) - assert store2.input_means[0].device == store2.device - assert torch.equal(store2.input_stds[0], store1.input_stds[0]) - assert store2.input_stds[0].device == store2.device - assert torch.equal(store2.output_means[0], store1.output_means[0]) - assert store2.output_means[0].device == store2.device - assert torch.equal(store2.output_stds[0], store1.output_stds[0]) - assert store2.output_stds[0].device == store2.device - - # Check buffer is reset/empty after loading state - assert not store2.buffer_initialized - assert not store2.buffered_inputs - assert store2.read_indices.shape == (0,) - - # Check that getting a batch works after loading state - inputs, targets = store2.get_batch() - assert inputs[0].shape == (store2.train_batch_size_tokens, store2.d_model) - # Check normalization was applied using loaded stats - if store2.normalization_method == "estimated_mean_std": - assert torch.allclose(inputs[0].mean(), torch.tensor(0.0), atol=0.5) - - -def test_inconsistent_d_model_error(inconsistent_d_model_generator): - """Test ValueError when generator yields inconsistent d_model.""" - store = ActivationStore( - activation_generator=inconsistent_d_model_generator, - 
n_batches_in_buffer=2, - train_batch_size_tokens=64, - device="cpu", - ) - # First batch should work - store.get_batch() - # Subsequent fetches should trigger the error when the bad batch is processed - with pytest.raises(ValueError, match="Inconsistent d_model"): - # Fetch enough times to force processing the second (bad) batch - for _ in range(3): - store.get_batch() - - -def test_inconsistent_layers_error(inconsistent_layers_generator): - """Test ValueError when generator yields inconsistent layer indices.""" - store = ActivationStore( - activation_generator=inconsistent_layers_generator, - n_batches_in_buffer=2, - train_batch_size_tokens=64, - device="cpu", - ) - # First batch should work - store.get_batch() - # Subsequent fetches should trigger the error when the bad batch is processed - with pytest.raises(ValueError, match="Inconsistent layer indices"): - # Fetch enough times to force processing the second (bad) batch - for _ in range(3): - store.get_batch() - - -def test_empty_generator_stopiteration(exhausted_generator): - """Test StopIteration is raised immediately if generator is empty.""" - store = ActivationStore( - activation_generator=exhausted_generator, - n_batches_in_buffer=2, - train_batch_size_tokens=64, - device="cpu", - ) - with pytest.raises(StopIteration): - store.get_batch() - - -# --- Integration-style Test (Keep as is for now, mocks Trainer/Extractor) --- - - -def test_cache_path_integration(): - """Test that activation caching params are passed (mocks dependencies).""" - # This test primarily checks CLTTrainer's interaction, not ActivationStore internals. - # Its validity depends on CLTTrainer's current implementation (not provided). - # Keeping it as a placeholder for integration testing. - - # Important: Mocks need to align with actual interfaces used by CLTTrainer. - # If ActivationExtractorCLT or ActivationStore API changed how CLTTrainer - # interacts with them, these mocks would need updates. 
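# For context, the patching below targets "clt.training.trainer.ActivationStore" and
# "clt.training.trainer.ActivationExtractorCLT" because unittest.mock.patch must name the
# module where the object is *looked up* (the importing module), not where it is defined.
# A small stdlib illustration of the same mechanism; the clt-specific paths are the ones
# the test itself uses:
from unittest.mock import patch
import os.path

with patch("os.path.exists", return_value=True):
    assert os.path.exists("/surely/not/a/real/path")   # lookup hits the mock
assert not os.path.exists("/surely/not/a/real/path")   # original restored on exit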
- - # Mock clt.training.trainer dependencies if they exist - try: - from clt.training.trainer import CLTTrainer - from clt.config import CLTConfig, TrainingConfig - - trainer_module = "clt.training.trainer" - except ImportError: - pytest.skip("CLTTrainer or dependencies not found, skipping integration test") - - # Create minimal configs for testing - clt_config = CLTConfig(num_features=16, num_layers=2, d_model=32) - - training_config = TrainingConfig( - learning_rate=0.001, - training_steps=10, - cache_path="/fake/cache/path", # Test this parameter passing - # Add other required TrainingConfig fields if necessary - # batch_size_tokens=64, # Removed: Causes TypeError if not in actual TrainingConfig - # buffer_batches=4, # Removed: Causes TypeError if not in actual TrainingConfig - ) - - # Mock the ActivationExtractorCLT and ActivationStore within the trainer module - with patch(f"{trainer_module}.ActivationExtractorCLT") as MockExtractor, patch( - f"{trainer_module}.ActivationStore" - ) as MockStore: - - # Set up the mocked extractor instance and its methods - mock_extractor_instance = MockExtractor.return_value - # Simulate the stream_activations method returning a dummy generator - mock_generator = MagicMock(spec=Generator) - mock_extractor_instance.stream_activations.return_value = mock_generator - - # Set up the mocked store instance - mock_store_instance = MockStore.return_value - # Give the mock store an iterator protocol if CLTTrainer uses it like `next(store)` - mock_store_instance.__iter__.return_value = iter( - [(MagicMock(), MagicMock())] - ) # Dummy batch - - # Create trainer instance (this will trigger mocked calls) - # Ensure all required arguments for CLTTrainer are provided - try: - trainer = CLTTrainer( - clt_config=clt_config, - training_config=training_config, - log_dir="test_cache_dir_integration", - # Add other required CLTTrainer args like model_name, dataset_name etc. - # model_name="mock_model", # Removed: Causes TypeError if not in actual CLTTrainer - # dataset_name="mock_dataset", # Removed: Causes TypeError if not in actual CLTTrainer - ) - except TypeError as e: - pytest.fail(f"CLTTrainer init failed, check required args/mocks: {e}") - - # --- Verification --- - - # 1. Verify ActivationExtractorCLT was initialized (if Trainer does this) - # Example: Check if model_name was passed - # MockExtractor.assert_called_once_with(model_name=training_config.model_name_or_path, ...) - - # 2. Verify stream_activations was called with the cache_path - mock_extractor_instance.stream_activations.assert_called_once() - # Check kwargs passed to stream_activations - stream_kwargs = mock_extractor_instance.stream_activations.call_args.kwargs - assert stream_kwargs.get("cache_path") == "/fake/cache/path" - # Add checks for other expected args like dataset_name, batch_size etc. - # assert stream_kwargs.get("dataset_name") == training_config.dataset_name - - # 3. 
Verify ActivationStore was initialized with the generator from the extractor - MockStore.assert_called_once() - store_kwargs = MockStore.call_args.kwargs - assert store_kwargs.get("activation_generator") == mock_generator - # Check other parameters passed to ActivationStore init - # assert store_kwargs.get("n_batches_in_buffer") == training_config.buffer_batches # Removed: Depends on removed mock arg - # assert store_kwargs.get("train_batch_size_tokens") == training_config.batch_size_tokens # Removed: Depends on removed mock arg - # assert store_kwargs.get("normalization_method") == training_config.normalization_method - - # --- Cleanup --- - import shutil - import os - - if os.path.exists("test_cache_dir_integration"): - shutil.rmtree("test_cache_dir_integration") diff --git a/tests/unit/training/test_evaluator.py b/tests/unit/training/test_evaluator.py deleted file mode 100644 index f841684..0000000 --- a/tests/unit/training/test_evaluator.py +++ /dev/null @@ -1,592 +0,0 @@ -import pytest -import torch -import torch.nn.functional as F -import numpy as np -from unittest.mock import MagicMock, patch -import time -from typing import Dict - -# Imports from the module under test and dependencies -from clt.training.evaluator import CLTEvaluator, _format_elapsed_time -from clt.models.clt import CrossLayerTranscoder -from clt.config import CLTConfig - -# Constants for test configuration -NUM_LAYERS = 2 -NUM_FEATURES = 4 -D_MODEL = 8 -BATCH_TOKENS = 10 - - -# --- Fixtures --- - - -@pytest.fixture -def device(): - """Provides the device (CPU for testing).""" - return torch.device("cpu") - - -@pytest.fixture -def mock_clt_config(): - """Provides a mock CLTConfig.""" - config = MagicMock(spec=CLTConfig) - config.num_layers = NUM_LAYERS - config.num_features = NUM_FEATURES - config.d_model = D_MODEL - return config - - -@pytest.fixture -def mock_clt_model(mock_clt_config, device): - """Provides a mock CrossLayerTranscoder model.""" - model = MagicMock(spec=CrossLayerTranscoder) - model.config = mock_clt_config - model.device = device - - # Mock the __call__ method (reconstruction) - def mock_call(inputs: Dict[int, torch.Tensor]) -> Dict[int, torch.Tensor]: - reconstructions = {} - for layer_idx, inp in inputs.items(): - # Simple identity reconstruction for testing - reconstructions[layer_idx] = inp.clone().detach() - return reconstructions - - # Corrected assignment for side_effect - model.__call__ = MagicMock(side_effect=mock_call) - - # Mock the get_feature_activations method - def mock_get_feature_activations( - inputs: Dict[int, torch.Tensor], - ) -> Dict[int, torch.Tensor]: - activations = {} - for layer_idx, inp in inputs.items(): - # Generate dummy activations (batch_tokens, num_features) - # Let's make layer 0 sparse, layer 1 dense for testing - if layer_idx == 0: - # 50% active features on average - acts = torch.rand(inp.shape[0], NUM_FEATURES, device=device) * 2 - 0.5 - acts = torch.relu(acts) - else: - # Mostly active features - acts = torch.rand(inp.shape[0], NUM_FEATURES, device=device) + 0.1 - activations[layer_idx] = acts - return activations - - model.get_feature_activations.side_effect = mock_get_feature_activations - - return model - - -@pytest.fixture -def evaluator(mock_clt_model, device): - """Provides an instance of CLTEvaluator.""" - return CLTEvaluator(model=mock_clt_model, device=device, start_time=time.time()) - - -@pytest.fixture -def sample_inputs(device): - """Provides sample input activations.""" - inputs = {} - for i in range(NUM_LAYERS): - # Shape: [batch_tokens, 
d_model] - inputs[i] = torch.randn(BATCH_TOKENS, D_MODEL, device=device) - return inputs - - -@pytest.fixture -def sample_targets(device): - """Provides sample target activations (same as inputs for simple test).""" - targets = {} - for i in range(NUM_LAYERS): - # Shape: [batch_tokens, d_model] - targets[i] = torch.randn(BATCH_TOKENS, D_MODEL, device=device) - return targets - - -@pytest.fixture -def sample_activations(device): - """Provides sample feature activations.""" - activations = {} - # Layer 0: sparse - acts0 = torch.zeros(BATCH_TOKENS, NUM_FEATURES, device=device) - acts0[0, 0] = 1.0 - acts0[1, 1] = 1.0 - activations[0] = acts0 - # Layer 1: dense - acts1 = torch.ones(BATCH_TOKENS, NUM_FEATURES, device=device) - activations[1] = acts1 - return activations - - -# --- Test Helper Functions --- - - -def test_format_elapsed_time(): - """Tests the _format_elapsed_time helper function.""" - assert _format_elapsed_time(50) == "00:50" - assert _format_elapsed_time(125.5) == "02:05" - assert _format_elapsed_time(3600) == "01:00:00" - assert _format_elapsed_time(3725) == "01:02:05" - assert _format_elapsed_time(86400 + 3600 + 120 + 5) == "25:02:05" - - -# --- Test Static Methods --- - - -def test_log_density(device): - """Tests the _log_density static method.""" - density = torch.tensor([0.0, 0.1, 1.0, 1e-12], device=device, dtype=torch.float32) - log_density = CLTEvaluator._log_density(density, eps=1e-10) - # Ensure expected tensor has matching dtype (float32) - # Corrected expected value for the 1e-12 case - expected = torch.tensor( - [-10.0, np.log10(0.1 + 1e-10), 0.0, np.log10(1e-12 + 1e-10)], - device=device, - dtype=torch.float32, - ) - assert torch.allclose(log_density, expected, atol=1e-6) - # Test with zero epsilon - log_density_no_eps = CLTEvaluator._log_density(density, eps=0) - # Ensure expected tensor has matching dtype (float32) - # Corrected expected value for log10(1e-12) which is -12.0 - expected_no_eps = torch.tensor( - [float("-inf"), np.log10(0.1), 0.0, -12.0], # Changed last element from -inf - device=device, - dtype=torch.float32, - ) - assert torch.allclose(log_density_no_eps, expected_no_eps, equal_nan=True) - - -def test_calculate_aggregate_metric(): - """Tests the _calculate_aggregate_metric static method.""" - # Empty input - assert CLTEvaluator._calculate_aggregate_metric({}) is None - # Single layer - data1 = {"layer_0": [1.0, 2.0, 3.0]} - assert CLTEvaluator._calculate_aggregate_metric(data1) == pytest.approx(2.0) - # Multiple layers - data2 = {"layer_0": [1.0, 2.0], "layer_1": [3.0, 4.0]} - assert CLTEvaluator._calculate_aggregate_metric(data2) == pytest.approx(2.5) - # Layer with empty list - data3 = {"layer_0": [], "layer_1": [1.0, 3.0]} - assert CLTEvaluator._calculate_aggregate_metric(data3) == pytest.approx(2.0) - # All empty lists - data4 = {"layer_0": [], "layer_1": []} - assert CLTEvaluator._calculate_aggregate_metric(data4) is None - - -def test_calculate_aggregate_histogram_data(): - """Tests the _calculate_aggregate_histogram_data static method.""" - # Empty input - assert CLTEvaluator._calculate_aggregate_histogram_data({}) == [] - # Single layer - data1 = {"layer_0": [1.0, 2.0, 3.0]} - assert CLTEvaluator._calculate_aggregate_histogram_data(data1) == [1.0, 2.0, 3.0] - # Multiple layers - data2 = {"layer_0": [1.0, 2.0], "layer_1": [3.0, 4.0]} - assert CLTEvaluator._calculate_aggregate_histogram_data(data2) == [ - 1.0, - 2.0, - 3.0, - 4.0, - ] - # Layer with empty list - data3 = {"layer_0": [], "layer_1": [1.0, 3.0]} - assert 
CLTEvaluator._calculate_aggregate_histogram_data(data3) == [1.0, 3.0] - # All empty lists - data4 = {"layer_0": [], "layer_1": []} - assert CLTEvaluator._calculate_aggregate_histogram_data(data4) == [] - - -# --- Test Private Calculation Methods --- - - -def test_compute_sparsity(evaluator, sample_activations, device): - """Tests the _compute_sparsity method.""" - metrics = evaluator._compute_sparsity(sample_activations) - - # Expected values based on sample_activations - # Layer 0: 2 activations out of BATCH_TOKENS * NUM_FEATURES = 10 * 4 = 40 - # L0 per token: (1 activation/token) for 2 tokens, - # (0 activation/token) for 8 tokens. Avg = 2/10 = 0.2 - # Layer 1: All active. BATCH_TOKENS * NUM_FEATURES = 40 activations. - # Avg L0 per token = 4.0 - # Total L0 = 0.2 + 4.0 = 4.2 - # Avg L0 = 4.2 / 2 = 2.1 - # Sparsity Fraction = 1 - (Avg L0 / Total Features) - # = 1 - (2.1 / 4) = 1 - 0.525 = 0.475 - - assert metrics["sparsity/total_l0"] == pytest.approx(4.2) - assert metrics["sparsity/avg_l0"] == pytest.approx(2.1) - assert metrics["sparsity/sparsity_fraction"] == pytest.approx( - 1.0 - (2.1 / NUM_FEATURES) - ) - # Avg L0 for layer 0 - assert metrics["layerwise/l0"]["layer_0"] == pytest.approx(2 / BATCH_TOKENS) - # Avg L0 for layer 1 - assert metrics["layerwise/l0"]["layer_1"] == pytest.approx(NUM_FEATURES) - - # Test with empty activations - empty_metrics = evaluator._compute_sparsity({}) - assert empty_metrics["sparsity/total_l0"] == 0.0 - assert empty_metrics["sparsity/avg_l0"] == 0.0 - assert empty_metrics["sparsity/sparsity_fraction"] == 1.0 - assert empty_metrics["layerwise/l0"] == { - f"layer_{i}": 0.0 for i in range(NUM_LAYERS) - } - - # Test with activations containing empty tensors - activations_with_empty = { - 0: torch.randn(BATCH_TOKENS, NUM_FEATURES, device=device), - 1: torch.empty((0, NUM_FEATURES), device=device), # Empty tensor - } - metrics_with_empty = evaluator._compute_sparsity(activations_with_empty) - assert "layer_1" in metrics_with_empty["layerwise/l0"] - assert metrics_with_empty["layerwise/l0"]["layer_1"] == 0.0 - assert metrics_with_empty["sparsity/avg_l0"] > 0 # Only layer 0 contributes - - -def test_compute_reconstruction_metrics(evaluator, device): - """Tests the _compute_reconstruction_metrics method.""" - targets = { - 0: torch.tensor([[1.0, 2.0], [3.0, 4.0]], device=device), - 1: torch.tensor([[5.0, 6.0], [7.0, 8.0]], device=device), - } - # Perfect reconstruction - recons_perfect = {k: v.clone() for k, v in targets.items()} - metrics_perfect = evaluator._compute_reconstruction_metrics(targets, recons_perfect) - assert metrics_perfect["reconstruction/total_mse"] == pytest.approx(0.0) - assert metrics_perfect["reconstruction/explained_variance"] == pytest.approx(1.0) - - # Zero reconstruction - recons_zero = {k: torch.zeros_like(v) for k, v in targets.items()} - metrics_zero = evaluator._compute_reconstruction_metrics(targets, recons_zero) - expected_mse = 25.5 - assert metrics_zero["reconstruction/total_mse"] == pytest.approx(expected_mse) - # EV = 1 - Var(Target - 0) / Var(Target) = 1 - Var(Target) / Var(Target) = 0 - assert metrics_zero["reconstruction/explained_variance"] == pytest.approx( - 0.0 - ) # Should be approx 0 - - # Partial reconstruction - recons_partial = { - 0: torch.tensor([[1.1, 1.9], [3.1, 3.9]], device=device), - 1: torch.tensor([[5.0, 6.0], [7.0, 8.0]], device=device), # Perfect for layer 1 - } - metrics_partial = evaluator._compute_reconstruction_metrics(targets, recons_partial) - expected_mse_partial = ( - F.mse_loss(targets[0], 
recons_partial[0]).item() - + F.mse_loss(targets[1], recons_partial[1]).item() - ) / 2 - assert metrics_partial["reconstruction/total_mse"] == pytest.approx( - expected_mse_partial - ) - assert 0.0 < metrics_partial["reconstruction/explained_variance"] < 1.0 - - # Test with zero variance target - targets_zero_var = {0: torch.ones((2, 2), device=device) * 3} - recons_zero_var = {0: torch.ones((2, 2), device=device) * 3.1} - metrics_zero_var = evaluator._compute_reconstruction_metrics( - targets_zero_var, recons_zero_var - ) - assert metrics_zero_var["reconstruction/total_mse"] == pytest.approx( - 0.1**2, abs=1e-5 - ) - # EV = 1 - Var(Err)/Var(Target) -> 1 - 0/0. If error var is 0, EV should be 1. - # Corrected assertion - assert metrics_zero_var["reconstruction/explained_variance"] == pytest.approx(1.0) - - # Test with zero variance target and perfect recon - targets_zero_var_perf = {0: torch.ones((2, 2), device=device) * 3} - recons_zero_var_perf = {0: torch.ones((2, 2), device=device) * 3} - metrics_zero_var_perf = evaluator._compute_reconstruction_metrics( - targets_zero_var_perf, recons_zero_var_perf - ) - assert metrics_zero_var_perf["reconstruction/total_mse"] == pytest.approx(0.0) - # EV = 1 - Var(0)/0 -> Should be 1 - assert metrics_zero_var_perf["reconstruction/explained_variance"] == pytest.approx( - 1.0 - ) - - # Test with missing layer in reconstruction - targets_missing = {0: torch.randn(2, 2), 1: torch.randn(2, 2)} - recons_missing = {0: torch.randn(2, 2)} # Missing layer 1 - metrics_missing = evaluator._compute_reconstruction_metrics( - targets_missing, recons_missing - ) - assert metrics_missing["reconstruction/total_mse"] > 0 # Only layer 0 contributes - # Corrected assertion key - assert "reconstruction/explained_variance" in metrics_missing - - -def test_compute_feature_density(evaluator, sample_activations, device): - """Tests the _compute_feature_density method.""" - metrics = evaluator._compute_feature_density(sample_activations) - - assert "layerwise/log_feature_density" in metrics - assert "layerwise/consistent_activation_heuristic" in metrics - - # --- Layer 0 (Sparse) --- - # Density: Feature 0 active in 1/10 tokens, Feature 1 active in 1/10 tokens, - # others 0/10 - expected_density0 = torch.tensor([0.1, 0.1, 0.0, 0.0], device=device) - expected_log_density0 = CLTEvaluator._log_density(expected_density0).tolist() - assert metrics["layerwise/log_feature_density"]["layer_0"] == pytest.approx( - expected_log_density0 - ) - - # Heuristic: - # Feature 0: 1 total activation / 1 prompt active = 1 - # Feature 1: 1 total activation / 1 prompt active = 1 - # Feature 2: 0 total activations / 0 prompts active = 0 / eps -> ~0 - # Feature 3: 0 total activations / 0 prompts active = 0 / eps -> ~0 - expected_heuristic0 = torch.tensor([1.0, 1.0, 0.0, 0.0], device=device) - assert metrics["layerwise/consistent_activation_heuristic"][ - "layer_0" - ] == pytest.approx(expected_heuristic0.tolist()) - - # --- Layer 1 (Dense) --- - # Density: All features active in 10/10 tokens = 1.0 - expected_density1 = torch.ones(NUM_FEATURES, device=device) - expected_log_density1 = CLTEvaluator._log_density(expected_density1).tolist() - # Should be list of 0.0 - assert metrics["layerwise/log_feature_density"]["layer_1"] == pytest.approx( - expected_log_density1 - ) - - # Heuristic: - # Each feature: BATCH_TOKENS total activations / 1 prompt active = 10 / 1 = 10 - expected_heuristic1 = torch.ones(NUM_FEATURES, device=device) * BATCH_TOKENS - assert 
metrics["layerwise/consistent_activation_heuristic"][ - "layer_1" - ] == pytest.approx(expected_heuristic1.tolist()) - - # Test with empty activations - empty_metrics = evaluator._compute_feature_density({}) - assert empty_metrics["layerwise/log_feature_density"] == {} - assert empty_metrics["layerwise/consistent_activation_heuristic"] == {} - - -def test_compute_dead_neuron_metrics(evaluator, device, mock_clt_config): - """Tests the _compute_dead_neuron_metrics method.""" - # --- Test with valid mask --- - # Mask: layer 0 has 1 dead, layer 1 has 2 dead - dead_mask = torch.zeros(NUM_LAYERS, NUM_FEATURES, dtype=torch.bool, device=device) - dead_mask[0, 1] = True - dead_mask[1, 0] = True - dead_mask[1, 2] = True - - metrics = evaluator._compute_dead_neuron_metrics(dead_mask) - assert "layerwise/dead_features" in metrics - assert metrics["layerwise/dead_features"]["layer_0"] == 1 - assert metrics["layerwise/dead_features"]["layer_1"] == 2 - # Total is calculated in compute_metrics, not here - - # --- Test with all dead --- - all_dead_mask = torch.ones( - NUM_LAYERS, NUM_FEATURES, dtype=torch.bool, device=device - ) - metrics_all_dead = evaluator._compute_dead_neuron_metrics(all_dead_mask) - assert metrics_all_dead["layerwise/dead_features"]["layer_0"] == NUM_FEATURES - assert metrics_all_dead["layerwise/dead_features"]["layer_1"] == NUM_FEATURES - - # --- Test with all alive --- - all_alive_mask = torch.zeros( - NUM_LAYERS, NUM_FEATURES, dtype=torch.bool, device=device - ) - metrics_all_alive = evaluator._compute_dead_neuron_metrics(all_alive_mask) - assert metrics_all_alive["layerwise/dead_features"]["layer_0"] == 0 - assert metrics_all_alive["layerwise/dead_features"]["layer_1"] == 0 - - # --- Test with None mask --- - metrics_none = evaluator._compute_dead_neuron_metrics(None) - assert "layerwise/dead_features" in metrics_none - assert metrics_none["layerwise/dead_features"] == {} - - # --- Test with incorrect shape mask --- - wrong_shape_mask = torch.zeros(NUM_LAYERS + 1, NUM_FEATURES, device=device) - # Should print a warning, but return default empty dict structure - with patch("builtins.print") as mock_print: - metrics_wrong_shape = evaluator._compute_dead_neuron_metrics(wrong_shape_mask) - assert "layerwise/dead_features" in metrics_wrong_shape - assert metrics_wrong_shape["layerwise/dead_features"] == {} - mock_print.assert_called_once() - assert ( - "Warning: Received dead_neuron_mask with unexpected shape" - in mock_print.call_args[0][0] - ) - - -# --- Test Main Method --- - - -def test_compute_metrics_integration(evaluator, sample_inputs, sample_targets, device): - """Tests the compute_metrics method integration.""" - # --- Mock internal methods to control their output --- - # We want to check if compute_metrics correctly aggregates results - mock_sparsity_result = { - "sparsity/total_l0": 4.2, - "sparsity/avg_l0": 2.1, - "sparsity/sparsity_fraction": 0.475, - "layerwise/l0": {"layer_0": 0.2, "layer_1": 4.0}, - } - mock_recon_result = { - "reconstruction/explained_variance": 0.95, - "reconstruction/total_mse": 0.1, - } - mock_density_result = { - "layerwise/log_feature_density": { - "layer_0": [ - -1.0, - -1.0, - -10.0, - -10.0, - ], # Derived from density [0.1, 0.1, 0, 0] - "layer_1": [0.0, 0.0, 0.0, 0.0], # Derived from density [1, 1, 1, 1] - }, - "layerwise/consistent_activation_heuristic": { - "layer_0": [1.0, 1.0, 0.0, 0.0], - "layer_1": [10.0, 10.0, 10.0, 10.0], - }, - } - mock_dead_result = { - "layerwise/dead_features": {"layer_0": 1, "layer_1": 0}, - } - # Dead mask to 
produce the mock_dead_result - dead_mask = torch.zeros(NUM_LAYERS, NUM_FEATURES, dtype=torch.bool, device=device) - dead_mask[0, 0] = True # One dead feature in layer 0 - - with patch.object( - evaluator, "_compute_sparsity", return_value=mock_sparsity_result - ) as mock_sparsity, patch.object( - evaluator, "_compute_reconstruction_metrics", return_value=mock_recon_result - ) as mock_recon, patch.object( - evaluator, "_compute_feature_density", return_value=mock_density_result - ) as mock_density, patch.object( - evaluator, "_compute_dead_neuron_metrics", return_value=mock_dead_result - ) as mock_dead: - - # Call the main method - all_metrics = evaluator.compute_metrics( - sample_inputs, sample_targets, dead_mask - ) - - # --- Assertions --- - # 1. Check if internal methods were called (mocks can verify this implicitly) - mock_sparsity.assert_called_once() - mock_recon.assert_called_once() - mock_density.assert_called_once() - mock_dead.assert_called_once_with(dead_mask) - - # 2. Check if the output dict contains keys from all mocked results - assert "sparsity/total_l0" in all_metrics - assert "reconstruction/explained_variance" in all_metrics - assert "layerwise/log_feature_density" in all_metrics - assert "layerwise/dead_features" in all_metrics - - # 3. Check aggregate calculations performed by compute_metrics - # Aggregate dead features - assert "dead_features/total_eval" in all_metrics - assert ( - all_metrics["dead_features/total_eval"] == 1 - ) # Sum of layerwise dead features - - # Aggregate density mean (mean of log densities) - expected_log_density_mean = np.mean( - [-1.0, -1.0, -10.0, -10.0, 0.0, 0.0, 0.0, 0.0] - ) # (-22) / 8 = -2.75 - assert "sparsity/feature_density_mean" in all_metrics - assert all_metrics["sparsity/feature_density_mean"] == pytest.approx( - expected_log_density_mean - ) - - # Aggregate heuristic mean - expected_heuristic_mean = np.mean( - [1.0, 1.0, 0.0, 0.0, 10.0, 10.0, 10.0, 10.0] - ) # (42) / 8 = 5.25 - assert "sparsity/consistent_activation_heuristic_mean" in all_metrics - assert all_metrics[ - "sparsity/consistent_activation_heuristic_mean" - ] == pytest.approx(expected_heuristic_mean) - - # Aggregate histogram data - expected_log_density_hist = [-1.0, -1.0, -10.0, -10.0, 0.0, 0.0, 0.0, 0.0] - assert "sparsity/log_feature_density_agg_hist" in all_metrics - assert ( - all_metrics["sparsity/log_feature_density_agg_hist"] - == expected_log_density_hist - ) - - expected_heuristic_hist = [1.0, 1.0, 0.0, 0.0, 10.0, 10.0, 10.0, 10.0] - assert "sparsity/consistent_activation_heuristic_agg_hist" in all_metrics - assert ( - all_metrics["sparsity/consistent_activation_heuristic_agg_hist"] - == expected_heuristic_hist - ) - - # 4. 
Check if values are copied correctly - assert all_metrics["sparsity/avg_l0"] == mock_sparsity_result["sparsity/avg_l0"] - assert ( - all_metrics["reconstruction/total_mse"] - == mock_recon_result["reconstruction/total_mse"] - ) - assert ( - all_metrics["layerwise/dead_features"] - == mock_dead_result["layerwise/dead_features"] - ) - - -def test_compute_metrics_integration_no_dead_mask( - evaluator, sample_inputs, sample_targets -): - """Tests compute_metrics without providing a dead neuron mask.""" - # Configure mock return value for when mask is None - expected_return_when_none = {"layerwise/dead_features": {}} - with patch.object( - evaluator, - "_compute_dead_neuron_metrics", - return_value=expected_return_when_none, - ) as mock_dead: - # Call without dead_neuron_mask - all_metrics = evaluator.compute_metrics( - sample_inputs, sample_targets, dead_neuron_mask=None - ) - - # Check that _compute_dead_neuron_metrics was called with None - mock_dead.assert_called_once_with(None) - # Check that the resulting dead feature counts are zero or empty - assert "layerwise/dead_features" in all_metrics - assert all_metrics["layerwise/dead_features"] == {} - assert "dead_features/total_eval" in all_metrics - assert all_metrics["dead_features/total_eval"] == 0 - - -# Add tests for edge cases like empty inputs/targets if needed -def test_compute_metrics_empty_input(evaluator): - """Tests compute_metrics with empty input dictionaries.""" - empty_inputs: Dict[int, torch.Tensor] = {} - empty_targets: Dict[int, torch.Tensor] = {} - - # Mock model behavior for empty inputs - evaluator.model.__call__.return_value = {} - evaluator.model.get_feature_activations.return_value = {} - - all_metrics = evaluator.compute_metrics(empty_inputs, empty_targets) - - # Check for sensible default/zero values - assert all_metrics.get("reconstruction/explained_variance") == 0.0 - assert all_metrics.get("reconstruction/total_mse") == 0.0 - assert all_metrics.get("sparsity/avg_l0") == 0.0 - assert all_metrics.get("sparsity/sparsity_fraction") == 1.0 - assert all_metrics.get("dead_features/total_eval") == 0 - assert ( - "sparsity/feature_density_mean" not in all_metrics - ) # Should be None internally, so key omitted - assert "sparsity/consistent_activation_heuristic_mean" not in all_metrics - assert all_metrics.get("layerwise/l0") == { - f"layer_{i}": 0.0 for i in range(NUM_LAYERS) - } - assert all_metrics.get("layerwise/dead_features") == {} - assert all_metrics.get("layerwise/log_feature_density") == {} - assert all_metrics.get("layerwise/consistent_activation_heuristic") == {} diff --git a/tests/unit/training/test_losses.py b/tests/unit/training/test_losses.py deleted file mode 100644 index c41ab53..0000000 --- a/tests/unit/training/test_losses.py +++ /dev/null @@ -1,672 +0,0 @@ -# tests/unit/training/test_losses.py -import pytest -import torch -from unittest.mock import MagicMock, call - -# Move import to top -from clt.training.losses import LossManager - - -# Mock the dependencies before importing LossManager -# Usually, you'd structure your project so these are actual importable classes -class MockTrainingConfig: - def __init__(self, sparsity_lambda=0.1, sparsity_c=1.0, preactivation_coef=0.01): - self.sparsity_lambda = sparsity_lambda - self.sparsity_c = sparsity_c - self.preactivation_coef = preactivation_coef - - -class MockCrossLayerTranscoder(MagicMock): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Mock methods that LossManager calls - self.get_decoder_norms = MagicMock() - 
self.get_preactivations = MagicMock() - self.get_feature_activations = MagicMock() - # __call__ is handled by setting self.side_effect on the instance - - -# Now import the class under test - - -@pytest.fixture -def mock_config(): - return MockTrainingConfig( - sparsity_lambda=0.1, sparsity_c=2.0, preactivation_coef=0.05 - ) - - -@pytest.fixture -def mock_model(mock_config): - model = MockCrossLayerTranscoder() - device = torch.device("cpu") # Assuming CPU for tests - - # Setup default mock return values - decoder_norms = { - 0: torch.tensor([0.5, 1.0, 1.5], device=device), - 1: torch.tensor([1.0, 0.8], device=device), - 2: torch.tensor([2.0], device=device), # For 1D test - } - model.get_decoder_norms.return_value = decoder_norms - - # Mock preactivations based on input and layer_idx - def mock_get_preactivations(x, layer_idx): - # Ensure input is on the correct device - x = x.to(device) - # Simple mock: return input shifted and scaled - if layer_idx == 0: # Used for 3D/2D input tests - # Expects [B, S, D] or [B*S, D] -> returns [B*S, Feat=3] - if x.dim() == 3: - x_flat = x.reshape(-1, x.shape[-1]) - else: - x_flat = x - # Ensure output dim matches feature dim for the layer - return (x_flat[:, :3] * 0.8 - 0.1).to(device) - elif layer_idx == 1: # Used for 3D/2D input tests - # Expects [B, S, D] or [B*S, D] -> returns [B*S, Feat=2] - if x.dim() == 3: - x_flat = x.reshape(-1, x.shape[-1]) - else: - x_flat = x - # Ensure output dim matches feature dim for the layer - return (x_flat[:, :2] * 1.2 + 0.2).to(device) - elif layer_idx == 2: # Used for 1D input test - # Expects [D] -> reshaped to [1, D] -> returns [1, Feat=1] - if x.dim() == 1: - x = x.unsqueeze(0) - # Ensure output dim matches feature dim for the layer - return (x[:, :1] * 0.5 - 0.05).to(device) # Example preact for 1D - return torch.zeros(x.shape[0], 1, device=device) # Default fallback - - model.get_preactivations.side_effect = mock_get_preactivations - - # Mock feature activations based on input - def mock_get_feature_activations(inputs): - activations = {} - for layer_idx, x_in in inputs.items(): - x_in = x_in.to(device) - preacts = mock_get_preactivations( - x_in, layer_idx - ) # Use the same mock logic - # Apply ReLU - actual model uses JumpReLU, but ReLU is simpler for testing activation shapes - acts = torch.relu(preacts).to(device) - # Reshape back if original input was 3D? No, feature acts are usually 2D/3D - # The loss function handles reshaping internally. Let's return consistent shapes. 
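# For context, a minimal illustration of the MagicMock behaviour the fixture below relies
# on: setting side_effect on the mock *instance* controls what calling the instance
# returns, which is how model(inputs) can yield fake predictions without defining
# __call__ explicitly. No clt classes are involved here.
from unittest.mock import MagicMock

fake_model = MagicMock()
fake_model.side_effect = lambda inputs: {k: v * 2 for k, v in inputs.items()}

out = fake_model({0: 1, 1: 2})
assert out == {0: 2, 1: 4}
fake_model.assert_called_once_with({0: 1, 1: 2})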
- # For testing, let's assume get_feature_activations returns [Batch*Seq, Features] or [Batch, Seq, Features] - # Let's return 3D for layer 0, 2D for layer 1, 1D (reshaped to 2D) for layer 2 for testing robustness - if layer_idx == 0: - # Find original batch/seq shape if possible (crude heuristic) - original_shape = inputs[layer_idx].shape - if len(original_shape) == 3: - activations[layer_idx] = acts.reshape( - original_shape[0], original_shape[1], -1 - ) - else: # Assume original was 2D - activations[layer_idx] = acts # Return as [Batch*Seq, Feat] - elif layer_idx == 1: - activations[layer_idx] = acts # Return as [Batch*Seq, Feat] - elif layer_idx == 2: - activations[layer_idx] = acts.squeeze( - 0 - ) # Return as [Feat] -> Loss handles unsqueeze - - return activations - - model.get_feature_activations.side_effect = mock_get_feature_activations - - # Mock predictions based on input - make the *instance* callable - def mock_call(inputs): - predictions = {} - for layer_idx, x_in in inputs.items(): - x_in = x_in.to(device) - # Simple mock prediction logic - if layer_idx == 0: - predictions[layer_idx] = (x_in * 0.9).to(device) - elif layer_idx == 1: - predictions[layer_idx] = (x_in * 1.1).to(device) - else: - predictions[layer_idx] = x_in.to(device) - return predictions - - # Use model.side_effect for the instance __call__ - model.side_effect = mock_call - model.device = device # Add device attribute if needed - - return model - - -@pytest.fixture -def loss_manager(mock_config): - return LossManager(mock_config) - - -# --- Test Cases --- - - -def test_loss_manager_init(loss_manager, mock_config): - assert loss_manager.config == mock_config - assert isinstance(loss_manager.reconstruction_loss_fn, torch.nn.MSELoss) - assert loss_manager.current_sparsity_lambda == 0.0 # Check initial value - - -def test_compute_reconstruction_loss_basic(loss_manager): - device = torch.device("cpu") - predicted = { - 0: torch.tensor([[1.0, 2.0]], device=device), - 1: torch.tensor([[3.0, 4.0]], device=device), - } - target = { - 0: torch.tensor([[1.1, 1.9]], device=device), - 1: torch.tensor([[3.2, 4.1]], device=device), - } - - expected_loss_0 = torch.mean( - torch.tensor([(1.0 - 1.1) ** 2, (2.0 - 1.9) ** 2], device=device) - ) - expected_loss_1 = torch.mean( - torch.tensor([(3.0 - 3.2) ** 2, (4.0 - 4.1) ** 2], device=device) - ) - expected_total_loss = (expected_loss_0 + expected_loss_1) / 2 - - loss = loss_manager.compute_reconstruction_loss(predicted, target) - assert torch.isclose(loss, expected_total_loss) - assert loss.device == device - - -def test_compute_reconstruction_loss_mismatched_keys(loss_manager): - device = torch.device("cpu") - predicted = { - 0: torch.tensor([[1.0]], device=device), - 1: torch.tensor([[3.0]], device=device), - } - target = { - 0: torch.tensor([[1.1]], device=device), - 2: torch.tensor([[5.0]], device=device), - } # Layer 1 missing in target, layer 2 missing in predicted - - expected_loss_0 = torch.mean(torch.tensor([(1.0 - 1.1) ** 2], device=device)) - # Layer 1 loss is not calculated as it's not in target - expected_total_loss = expected_loss_0 / 1 # Only one layer (layer 0) contributes - - loss = loss_manager.compute_reconstruction_loss(predicted, target) - assert torch.isclose(loss, expected_total_loss) - assert loss.device == device - - -def test_compute_reconstruction_loss_empty(loss_manager): - device = torch.device("cpu") # Assume default device if empty - predicted = {} - target = {} - loss = loss_manager.compute_reconstruction_loss(predicted, target) - assert 
torch.equal(loss, torch.tensor(0.0, device=device)) - assert loss.device == device - - predicted = {0: torch.tensor([[1.0]], device=device)} - target = {} - loss = loss_manager.compute_reconstruction_loss(predicted, target) - assert torch.equal(loss, torch.tensor(0.0, device=device)) # No matching keys - assert loss.device == device - - # Case with target but no predicted - predicted = {} - target = {0: torch.tensor([[1.0]], device=device)} - loss = loss_manager.compute_reconstruction_loss(predicted, target) - assert torch.equal(loss, torch.tensor(0.0, device=device)) - assert loss.device == device - - -def test_compute_sparsity_penalty_basic_3d(loss_manager, mock_model, mock_config): - device = mock_model.device - # Batch=1, Seq=2, Feat=3 - activations = {0: torch.tensor([[[0.1, 0.0, 0.5], [0.2, 0.3, 0.0]]], device=device)} - current_step = 50 - total_steps = 100 - - # Get expected norms from mock - local_decoder_norms = mock_model.get_decoder_norms() - - acts_flat = activations[0].reshape(-1, 3) # Shape (2, 3) - weights = local_decoder_norms[0].unsqueeze(0).to(device) # Shape (1, 3) - weighted_acts = acts_flat * weights # Shape (2, 3) - - # tanh penalty computation - penalty_tensor = torch.tanh(mock_config.sparsity_c * weighted_acts) - expected_penalty_sum = penalty_tensor.sum() - - lambda_factor = mock_config.sparsity_lambda * (current_step / total_steps) - expected_total_penalty = lambda_factor * expected_penalty_sum - # Note: The implementation sums the penalty, it doesn't average per element. - - mock_model.get_decoder_norms.reset_mock() # Reset before the call under test - penalty, current_lambda = loss_manager.compute_sparsity_penalty( - mock_model, activations, current_step, total_steps - ) - - mock_model.get_decoder_norms.assert_called_once() - assert torch.isclose(penalty, expected_total_penalty) - assert isinstance(current_lambda, float) - assert abs(current_lambda - lambda_factor) < 1e-9 - assert penalty.device == device - - -def test_compute_sparsity_penalty_basic_2d(loss_manager, mock_model, mock_config): - device = mock_model.device - # Batch*Seq=2, Feat=3 - activations = {0: torch.tensor([[0.1, 0.0, 0.5], [0.2, 0.3, 0.0]], device=device)} - current_step = 50 - total_steps = 100 - # mock_model.get_decoder_norms.reset_mock() # Remove reset from here - - # Get expected norms from mock - local_decoder_norms = mock_model.get_decoder_norms() - - acts_flat = activations[0] # Already flat - weights = local_decoder_norms[0].unsqueeze(0).to(device) # Shape (1, 3) - weighted_acts = acts_flat * weights # Shape (2, 3) - - # tanh penalty computation - penalty_tensor = torch.tanh(mock_config.sparsity_c * weighted_acts) - expected_penalty_sum = penalty_tensor.sum() - - lambda_factor = mock_config.sparsity_lambda * (current_step / total_steps) - expected_total_penalty = lambda_factor * expected_penalty_sum - - mock_model.get_decoder_norms.reset_mock() # Reset before the call under test - penalty, current_lambda = loss_manager.compute_sparsity_penalty( - mock_model, activations, current_step, total_steps - ) - - mock_model.get_decoder_norms.assert_called_once() - assert torch.isclose(penalty, expected_total_penalty) - assert isinstance(current_lambda, float) - assert abs(current_lambda - lambda_factor) < 1e-9 - assert penalty.device == device - - -def test_compute_sparsity_penalty_basic_1d(loss_manager, mock_model, mock_config): - device = mock_model.device - # Feat=1 (for layer 2 as per mock setup) - # The loss function expects at least 2D, but handles 1D by unsqueezing - activations = {2: 
torch.tensor([0.5], device=device)} - current_step = 50 - total_steps = 100 - # mock_model.get_decoder_norms.reset_mock() # Remove reset from here - - # Get expected norms from mock - local_decoder_norms = mock_model.get_decoder_norms() # Norms for layer 2 - - acts_flat = activations[2].unsqueeze(0) # Shape [1, 1] - weights = local_decoder_norms[2].unsqueeze(0).to(device) # Shape [1, 1] - weighted_acts = acts_flat * weights - - # tanh penalty computation - penalty_tensor = torch.tanh(mock_config.sparsity_c * weighted_acts) - expected_penalty_sum = penalty_tensor.sum() - - lambda_factor = mock_config.sparsity_lambda * (current_step / total_steps) - expected_total_penalty = lambda_factor * expected_penalty_sum - - mock_model.get_decoder_norms.reset_mock() # Reset before the call under test - penalty, current_lambda = loss_manager.compute_sparsity_penalty( - mock_model, activations, current_step, total_steps - ) - - mock_model.get_decoder_norms.assert_called_once() - assert torch.isclose(penalty, expected_total_penalty) - assert isinstance(current_lambda, float) - assert abs(current_lambda - lambda_factor) < 1e-9 - assert penalty.device == device - - -def test_compute_sparsity_penalty_schedule(loss_manager, mock_model, mock_config): - device = mock_model.device - activations = {0: torch.tensor([[[0.1, 0.0, 0.5]]], device=device)} # B=1, S=1, F=3 - total_steps = 100 - mock_model.get_decoder_norms.reset_mock() - mock_model.get_decoder_norms.return_value = { - 0: torch.tensor([0.5, 1.0, 1.5], device=device) - } - - # Step 0 - penalty_0, lambda_0 = loss_manager.compute_sparsity_penalty( - mock_model, activations, 0, total_steps - ) - assert torch.isclose(penalty_0, torch.tensor(0.0, device=device)) - assert lambda_0 == 0.0 - - # Step 50 - mock_model.get_decoder_norms.reset_mock() # Reset call count for next call - penalty_50, lambda_50 = loss_manager.compute_sparsity_penalty( - mock_model, activations, 50, total_steps - ) - expected_lambda_50 = mock_config.sparsity_lambda * (50 / total_steps) - assert penalty_50 > 0 - assert abs(lambda_50 - expected_lambda_50) < 1e-9 - - # Step 100 - mock_model.get_decoder_norms.reset_mock() - penalty_100, lambda_100 = loss_manager.compute_sparsity_penalty( - mock_model, activations, 100, total_steps - ) - expected_lambda_100 = mock_config.sparsity_lambda * (100 / total_steps) - assert abs(lambda_100 - expected_lambda_100) < 1e-9 - - # Penalty should scale linearly with lambda - assert torch.isclose(penalty_100, penalty_50 * 2.0) - - -def test_compute_sparsity_penalty_empty(loss_manager, mock_model): - device = mock_model.device - activations = {} - penalty, current_lambda = loss_manager.compute_sparsity_penalty( - mock_model, activations, 50, 100 - ) - assert torch.equal(penalty, torch.tensor(0.0, device=device)) - assert current_lambda == 0.0 - mock_model.get_decoder_norms.assert_not_called() - - -def test_compute_sparsity_penalty_empty_tensor(loss_manager, mock_model): - device = mock_model.device - activations = {0: torch.empty((0, 3), device=device)} # Empty tensor - penalty, current_lambda = loss_manager.compute_sparsity_penalty( - mock_model, activations, 50, 100 - ) - assert torch.equal(penalty, torch.tensor(0.0, device=device)) - # Lambda calculation still happens, but penalty is 0 - expected_lambda = loss_manager.config.sparsity_lambda * 0.5 - assert abs(current_lambda - expected_lambda) < 1e-9 - # get_decoder_norms might be called depending on implementation details before empty check - # The current implementation checks numel() after getting norms, 
so it might be called. - - -def test_compute_sparsity_penalty_missing_norms(loss_manager, mock_model, mock_config): - device = mock_model.device - activations = { - 0: torch.tensor([[[0.1, 0.0, 0.5]]], device=device), - 99: torch.tensor([[[0.1, 0.2]]], device=device), - } # Layer 99 norms not in mock - current_step = 50 - total_steps = 100 - # mock_model.get_decoder_norms.reset_mock() # Remove reset from here - # Norms only available for layer 0 - mock_model.get_decoder_norms.return_value = { - 0: torch.tensor([0.5, 1.0, 1.5], device=device) - } - - # Calculate expected penalty only for layer 0 - acts_flat_0 = activations[0].reshape(-1, 3) - weights_0 = mock_model.get_decoder_norms()[0].unsqueeze(0).to(device) - weighted_acts_0 = acts_flat_0 * weights_0 - penalty_tensor_0 = torch.tanh(mock_config.sparsity_c * weighted_acts_0) - expected_penalty_sum = penalty_tensor_0.sum() - lambda_factor = mock_config.sparsity_lambda * (current_step / total_steps) - expected_total_penalty = lambda_factor * expected_penalty_sum - - mock_model.get_decoder_norms.reset_mock() # Reset before the call under test - penalty, current_lambda = loss_manager.compute_sparsity_penalty( - mock_model, activations, current_step, total_steps - ) - - # Norms should be fetched once - mock_model.get_decoder_norms.assert_called_once() - # Penalty should only include layer 0 - assert torch.isclose(penalty, expected_total_penalty) - assert abs(current_lambda - lambda_factor) < 1e-9 - assert penalty.device == device - - -# --- Preactivation Loss Tests --- - - -def test_compute_preactivation_loss_basic(loss_manager, mock_model, mock_config): - device = mock_model.device - # Use 2D input: Batch*Seq=1, Dim=3 - inputs = {0: torch.tensor([[-0.5, 0.2, -0.1]], device=device)} - - mock_model.get_preactivations.reset_mock() - - # Calculate expected preacts using the mock's logic - # Input [-0.5, 0.2, -0.1] -> Preacts [-0.5, 0.06, -0.18] (using layer 0 logic) - expected_preacts = mock_model.get_preactivations.side_effect(inputs[0], 0) - - relu_neg_preacts = torch.relu(-expected_preacts) - expected_penalty_sum = relu_neg_preacts.sum() - num_elements = expected_preacts.numel() - - expected_total_loss = ( - mock_config.preactivation_coef * expected_penalty_sum / num_elements - if num_elements > 0 - else 0.0 - ) - - loss = loss_manager.compute_preactivation_loss(mock_model, inputs) - - mock_model.get_preactivations.assert_called_once() - # Check call arguments carefully - call_args = mock_model.get_preactivations.call_args - assert torch.equal(call_args[0][0], inputs[0]) - assert call_args[0][1] == 0 - - assert abs(loss.item() - expected_total_loss) < 1e-6 - assert loss.device == device - - -def test_compute_preactivation_loss_1d_input(loss_manager, mock_model, mock_config): - device = mock_model.device - # Use 1D input: Dim=1 (using layer 2 logic) - inputs = {2: torch.tensor([-0.5], device=device)} - mock_model.get_preactivations.reset_mock() - - # Calculate expected preacts using the mock's logic for layer 2 - # Input [-0.5] -> unsqueezed to [1,1] -> preacts [-0.3] (0.5*-0.5 - 0.05) - # The mock returns shape [1, 1] - expected_preacts = mock_model.get_preactivations.side_effect( - inputs[2], 2 - ) # Shape [1, 1] - - relu_neg_preacts = torch.relu(-expected_preacts) - expected_penalty_sum = relu_neg_preacts.sum() - num_elements = expected_preacts.numel() # Should be 1 - - expected_total_loss = ( - mock_config.preactivation_coef * expected_penalty_sum / num_elements - if num_elements > 0 - else 0.0 - ) - - loss = 
loss_manager.compute_preactivation_loss(mock_model, inputs) - - mock_model.get_preactivations.assert_called_once() - call_args = mock_model.get_preactivations.call_args - # The loss function unsqueezes the 1D input before passing to get_preactivations - assert torch.equal(call_args[0][0], inputs[2].unsqueeze(0)) - assert call_args[0][1] == 2 - - assert abs(loss.item() - expected_total_loss) < 1e-6 - assert loss.device == device - - -def test_compute_preactivation_loss_all_positive(loss_manager, mock_model): - device = mock_model.device - # Input that results in positive preactivations for layer 0 - inputs = {0: torch.tensor([[0.5, 0.2, 0.15]], device=device)} # Preacts > 0 - mock_model.get_preactivations.reset_mock() - - loss = loss_manager.compute_preactivation_loss(mock_model, inputs) - - mock_model.get_preactivations.assert_called_once() - assert torch.isclose(loss, torch.tensor(0.0, device=device), atol=1e-8) - assert loss.device == device - - -def test_compute_preactivation_loss_empty(loss_manager, mock_model): - device = mock_model.device - inputs = {} - loss = loss_manager.compute_preactivation_loss(mock_model, inputs) - assert torch.equal(loss, torch.tensor(0.0, device=device)) - mock_model.get_preactivations.assert_not_called() - assert loss.device == device - - -def test_compute_preactivation_loss_empty_tensor(loss_manager, mock_model): - device = mock_model.device - inputs = {0: torch.empty((0, 3), device=device)} - loss = loss_manager.compute_preactivation_loss(mock_model, inputs) - assert torch.equal(loss, torch.tensor(0.0, device=device)) - # get_preactivations should not be called if numel is 0 before call - # Current implementation checks numel() before calling get_preactivations - mock_model.get_preactivations.assert_not_called() - assert loss.device == device - - -def test_compute_preactivation_loss_exception(loss_manager, mock_model): - device = mock_model.device - inputs = { - 0: torch.tensor([[1.0, 2.0, 3.0]], device=device), - 1: torch.tensor([[4.0, 5.0]], device=device), - } # Use layer 1 input that will have preacts calculated - mock_model.get_preactivations.reset_mock() - # Make get_preactivations raise an exception for layer 0, but work for layer 1 - original_side_effect = mock_model.get_preactivations.side_effect - - def side_effect_with_exception(x, layer_idx): - if layer_idx == 0: - raise ValueError("Test Exception") - else: - return original_side_effect(x, layer_idx) - - mock_model.get_preactivations.side_effect = side_effect_with_exception - - # Calculate expected loss only for layer 1 - expected_preacts_1 = original_side_effect(inputs[1], 1) - relu_neg_preacts_1 = torch.relu(-expected_preacts_1) - expected_penalty_sum = relu_neg_preacts_1.sum() - num_elements = expected_preacts_1.numel() - expected_total_loss = ( - loss_manager.config.preactivation_coef * expected_penalty_sum / num_elements - if num_elements > 0 - else 0.0 - ) - - loss = loss_manager.compute_preactivation_loss(mock_model, inputs) - - # Should have been called for both layers, but failed on layer 0 - assert mock_model.get_preactivations.call_count == 2 - assert abs(loss.item() - expected_total_loss) < 1e-6 - assert loss.device == device - - # Restore original side effect if fixture is used elsewhere - mock_model.get_preactivations.side_effect = original_side_effect - - -# --- Total Loss Tests --- - - -def test_compute_total_loss(loss_manager, mock_model, mock_config): - device = mock_model.device - # B=1, S=1, Dim=4 for input/output - inputs = { - 0: torch.tensor([[[1.0, 2.0, 3.0, 4.0]]], 
device=device), - 1: torch.tensor([[[5.0, 6.0, 7.0, 8.0]]], device=device), - } - # Target dimensions should match model output dimensions based on mock_call - targets = { - 0: torch.tensor([[[1.1, 1.9, 3.1, 3.9]]], device=device), - 1: torch.tensor([[[4.9, 6.1, 7.0, 8.1]]], device=device), - } - current_step = 75 - total_steps = 150 - - # Reset mocks before the call - mock_model.reset_mock() - mock_model.get_feature_activations.reset_mock() - mock_model.get_decoder_norms.reset_mock() - mock_model.get_preactivations.reset_mock() - - # Call the method under test - total_loss, loss_dict = loss_manager.compute_total_loss( - mock_model, inputs, targets, current_step, total_steps - ) - - # --- Assertions --- - # 1. Check mock calls made *within* compute_total_loss - mock_model.assert_called_once_with(inputs) # Checks the instance call (__call__) - mock_model.get_feature_activations.assert_called_once_with(inputs) - # Sparsity penalty calls get_decoder_norms once inside compute_total_loss - mock_model.get_decoder_norms.assert_called_once() - # Preactivation loss calls get_preactivations once per layer in inputs - assert mock_model.get_preactivations.call_count == len(inputs) - expected_preact_calls = [call(inputs[0], 0), call(inputs[1], 1)] - # Use assert_has_calls with any_order=True for flexibility if order isn't guaranteed - mock_model.get_preactivations.assert_has_calls( - expected_preact_calls, any_order=True - ) - - # 2. Manually calculate expected values *after* the call for verification - # Use the mocks configured in the fixture - expected_predictions = mock_model.side_effect(inputs) - expected_activations = mock_model.get_feature_activations.side_effect(inputs) - - # Create temporary LossManager instances to avoid state pollution if needed, - # or ensure mocks are appropriately configured/reset. Here we reuse the fixture one. - # Re-call individual components to get expected values - expected_recon_loss = loss_manager.compute_reconstruction_loss( - expected_predictions, targets - ) - # Reset mocks that might be called again during manual calculation - mock_model.get_decoder_norms.reset_mock() - expected_sparsity_loss, expected_lambda = loss_manager.compute_sparsity_penalty( - mock_model, expected_activations, current_step, total_steps - ) - mock_model.get_preactivations.reset_mock() - expected_preactivation_loss = loss_manager.compute_preactivation_loss( - mock_model, inputs - ) - expected_total_loss_val = ( - expected_recon_loss + expected_sparsity_loss + expected_preactivation_loss - ) - - # 3. Compare results - assert torch.isclose(total_loss, expected_total_loss_val) - assert total_loss.device == device - assert isinstance(loss_dict, dict) - assert "total" in loss_dict - assert "reconstruction" in loss_dict - assert "sparsity" in loss_dict - assert "preactivation" in loss_dict - - # Check if the components roughly match (allow for float precision) - assert abs(loss_dict["total"] - total_loss.item()) < 1e-6 - assert abs(loss_dict["reconstruction"] - expected_recon_loss.item()) < 1e-6 - assert abs(loss_dict["sparsity"] - expected_sparsity_loss.item()) < 1e-6 - assert abs(loss_dict["preactivation"] - expected_preactivation_loss.item()) < 1e-6 - - # 4. 
Check if current_sparsity_lambda was updated - assert abs(loss_manager.current_sparsity_lambda - expected_lambda) < 1e-9 - - -def test_get_current_sparsity_lambda(loss_manager, mock_model, mock_config): - device = mock_model.device - # Initial value - assert loss_manager.get_current_sparsity_lambda() == 0.0 - - # Run total loss calculation to update lambda - inputs = {0: torch.tensor([[[1.0, 2.0, 3.0]]], device=device)} - targets = {0: torch.tensor([[[1.1, 1.9, 3.1]]], device=device)} - current_step = 50 - total_steps = 100 - - _, loss_dict = loss_manager.compute_total_loss( - mock_model, inputs, targets, current_step, total_steps - ) - - expected_lambda = mock_config.sparsity_lambda * (current_step / total_steps) - # The lambda stored should be the one calculated during the last total_loss call - assert abs(loss_manager.get_current_sparsity_lambda() - expected_lambda) < 1e-9 - - # Check that get just returns the value without recalculating - mock_model.get_decoder_norms.reset_mock() - retrieved_lambda = loss_manager.get_current_sparsity_lambda() - assert abs(retrieved_lambda - expected_lambda) < 1e-9 - mock_model.get_decoder_norms.assert_not_called() # Ensure get doesn't trigger calcs diff --git a/tests/unit/training/test_trainer.py b/tests/unit/training/test_trainer.py deleted file mode 100644 index 2715477..0000000 --- a/tests/unit/training/test_trainer.py +++ /dev/null @@ -1,987 +0,0 @@ -"""Unit tests for the CLTTrainer class.""" - -import os -import json -import time # Added for start_time -from unittest.mock import patch, MagicMock, PropertyMock -import pytest -import torch - -from clt.config import CLTConfig, TrainingConfig -from clt.models.clt import CrossLayerTranscoder -from clt.training.trainer import CLTTrainer, WandBLogger -from clt.training.data import ActivationStore -from clt.training.losses import LossManager # Added LossManager import -from clt.nnsight.extractor import ActivationExtractorCLT -from clt.training.evaluator import CLTEvaluator # Added Evaluator import - - -@pytest.fixture -def clt_config(): - """Fixture for CLTConfig.""" - return CLTConfig( - num_features=300, - num_layers=2, # Reduced layers for easier mocking - d_model=768, - activation_fn="jumprelu", - jumprelu_threshold=0.03, - ) - - -@pytest.fixture -def training_config(): - """Fixture for TrainingConfig.""" - return TrainingConfig( - model_name="gpt2", - dataset_path="test_dataset", - dataset_split="train", - dataset_text_column="text", - learning_rate=1e-4, - optimizer="adam", - lr_scheduler="linear", - training_steps=100, - n_batches_in_buffer=5, - batch_size=2, # Reduced batch size for easier testing - # train_batch_size_tokens will be calculated - inference_batch_size=1, - context_size=32, # Reduced context size - normalization_method="none", - normalization_estimation_batches=2, - prepend_bos=True, - exclude_special_tokens=True, - streaming=False, - dataset_trust_remote_code=False, - cache_path=None, - dead_feature_window=10, # Set for testing - log_interval=50, # Adjust for testing - eval_interval=20, # Adjust for testing - checkpoint_interval=50, # Adjust for testing - ) - - -@pytest.fixture -def temp_log_dir(tmpdir): - """Fixture for temporary log directory.""" - log_dir = tmpdir.mkdir("test_logs") - return str(log_dir) - - -@pytest.fixture -def mock_model(clt_config): # Pass clt_config - """Fixture for mock CrossLayerTranscoder.""" - model = MagicMock(spec=CrossLayerTranscoder) - # Configure mock for parameters - needed for optimizer initialization - mock_param = 
torch.nn.Parameter(torch.randn(1)) - model.parameters.return_value = [mock_param] - model.config = clt_config # Add config attribute - - # Mock feature activations for dead neuron tracking tests - # Simulate 10 tokens, 2 layers, 300 features - model.get_feature_activations.return_value = { - 0: torch.rand(10, clt_config.num_features), - 1: torch.rand(10, clt_config.num_features), - } - model.save = MagicMock() # Mock save method - model.load = MagicMock() # Mock load method - return model - - -@pytest.fixture -def mock_activation_extractor(): - """Fixture for mock ActivationExtractorCLT.""" - extractor = MagicMock(spec=ActivationExtractorCLT) - # Configure mock to return a generator-like object - mock_generator = MagicMock() - extractor.stream_activations.return_value = mock_generator - return extractor - - -@pytest.fixture -def mock_activation_store(training_config): # Pass training_config - """Fixture for mock ActivationStore.""" - store = MagicMock(spec=ActivationStore) - # Setup for iteration - store.__iter__.return_value = store - # Simulate batch output: dicts mapping layer_idx to tensor - # Shape: [train_batch_size_tokens, d_model] - batch_tokens = training_config.train_batch_size_tokens - d_model = 768 # Example d_model - store.__next__.return_value = ( - { - 0: torch.randn(batch_tokens, d_model), - 1: torch.randn(batch_tokens, d_model), - }, # Inputs - { - 0: torch.randn(batch_tokens, d_model), - 1: torch.randn(batch_tokens, d_model), - }, # Targets - ) - store.state_dict.return_value = {"mock_store_state": "value"} # Mock state_dict - store.load_state_dict = MagicMock() # Mock load_state_dict - return store - - -@pytest.fixture -def mock_loss_manager(): - """Fixture for mock LossManager.""" - loss_manager = MagicMock(spec=LossManager) - # Create a mock tensor with a mock backward method - mock_loss_tensor = MagicMock(spec=torch.Tensor) - mock_loss_tensor.backward = MagicMock() - # Make isnan return False for tests where backward should be called - mock_loss_tensor.isnan.return_value = False - - loss_dict = { - "total": 0.5, - "reconstruction": 0.4, - "sparsity": 0.1, - "preactivation": 0.0, - } - loss_manager.compute_total_loss.return_value = (mock_loss_tensor, loss_dict) - loss_manager.get_current_sparsity_lambda.return_value = 0.001 # Mock lambda value - return loss_manager - - -@pytest.fixture -def mock_evaluator(): - """Fixture for mock CLTEvaluator.""" - evaluator = MagicMock(spec=CLTEvaluator) - evaluator.compute_metrics.return_value = { - "reconstruction/mse": 0.1, - "reconstruction/explained_variance": 0.9, - "sparsity/avg_l0": 15.5, - "sparsity/feature_density_mean": 0.05, - "dead_features/total_eval": 5, - "layerwise/l0/layer_0": 10.0, - "layerwise/l0/layer_1": 21.0, - "layerwise/log_feature_density/layer_0": [-2.0, -1.5], - "layerwise/log_feature_density/layer_1": [-1.8, -1.2], - "sparsity/log_feature_density_agg_hist": [-2.0, -1.5, -1.8, -1.2], - } - return evaluator - - -@pytest.fixture -def mock_wandb_logger(): - """Fixture for mock WandBLogger.""" - logger = MagicMock(spec=WandBLogger) - return logger - - -# --- Test Initialization --- - - -def test_init(clt_config, training_config, temp_log_dir): - """Test CLTTrainer initialization.""" - # Create proper mocks for model parameters - needs real tensor - mock_model_instance = MagicMock(spec=CrossLayerTranscoder) - mock_param = torch.nn.Parameter(torch.randn(1)) - mock_model_instance.parameters.return_value = [mock_param] - mock_model_instance.to.return_value = mock_model_instance - - with patch( - 
"clt.training.trainer.CrossLayerTranscoder", return_value=mock_model_instance - ) as mock_clt_cls, patch( - "clt.training.trainer.LossManager" - ) as mock_loss_manager_cls, patch.object( - CLTTrainer, "_create_activation_extractor" - ) as mock_create_extractor, patch.object( - CLTTrainer, "_create_activation_store" - ) as mock_create_store, patch( - "clt.training.trainer.CLTEvaluator" - ) as mock_evaluator_cls, patch( - "clt.training.trainer.WandBLogger" - ) as mock_wandb_logger_cls: - - mock_create_extractor.return_value = MagicMock() - mock_create_store.return_value = MagicMock() - mock_evaluator_instance = MagicMock() - mock_evaluator_cls.return_value = mock_evaluator_instance - mock_wandb_logger_instance = MagicMock() - mock_wandb_logger_cls.return_value = mock_wandb_logger_instance - - trainer = CLTTrainer(clt_config, training_config, log_dir=temp_log_dir) - - # Check initialization - assert trainer.clt_config == clt_config - assert trainer.training_config == training_config - assert trainer.log_dir == temp_log_dir - assert isinstance(trainer.device, torch.device) - assert trainer.start_time is not None - - # Check if components were created and assigned - mock_clt_cls.assert_called_once_with(clt_config, device=trainer.device) - assert trainer.model == mock_model_instance - - mock_loss_manager_cls.assert_called_once_with(training_config) - assert trainer.loss_manager == mock_loss_manager_cls.return_value - - mock_create_extractor.assert_called_once() - assert trainer.activation_extractor == mock_create_extractor.return_value - - mock_create_store.assert_called_once() - assert trainer.activation_store == mock_create_store.return_value - - mock_evaluator_cls.assert_called_once_with( - mock_model_instance, trainer.device, trainer.start_time - ) - assert trainer.evaluator == mock_evaluator_instance - - mock_wandb_logger_cls.assert_called_once_with( - training_config=training_config, clt_config=clt_config, log_dir=temp_log_dir - ) - assert trainer.wandb_logger == mock_wandb_logger_instance - - # Check dead neuron counter initialization - assert hasattr(trainer, "n_forward_passes_since_fired") - assert trainer.n_forward_passes_since_fired.shape == ( - clt_config.num_layers, - clt_config.num_features, - ) - assert trainer.n_forward_passes_since_fired.device == trainer.device - - -def test_create_activation_extractor(training_config): # Pass training_config - """Test _create_activation_extractor method.""" - with patch("clt.training.trainer.ActivationExtractorCLT") as mock_extractor_cls: - # Create a trainer instance manually - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.training_config = training_config # Use the fixture - trainer.device = torch.device("cpu") - - # Call the method directly - result = trainer._create_activation_extractor() - - # Check the call arguments (should match TrainingConfig) - mock_extractor_cls.assert_called_once_with( - model_name=training_config.model_name, - device=trainer.device, - model_dtype=training_config.model_dtype, # Added model_dtype - context_size=training_config.context_size, - inference_batch_size=training_config.inference_batch_size, - exclude_special_tokens=training_config.exclude_special_tokens, - prepend_bos=training_config.prepend_bos, - ) - assert result == mock_extractor_cls.return_value - - -def test_create_activation_store( - mock_activation_extractor, training_config -): # Pass training_config - """Test _create_activation_store method.""" - with patch("clt.training.trainer.ActivationStore") as mock_store_cls: - # Create a trainer 
instance manually - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.training_config = training_config # Use the fixture - trainer.device = torch.device("cpu") - trainer.activation_extractor = mock_activation_extractor - mock_start_time = time.time() # Create a start time - - # Set up the mock for stream_activations - mock_activation_generator = ( - mock_activation_extractor.stream_activations.return_value - ) - - # Call the method directly - result = trainer._create_activation_store(mock_start_time) - - # Check if extractor's stream_activations was called correctly - mock_activation_extractor.stream_activations.assert_called_once_with( - dataset_path=training_config.dataset_path, - dataset_split=training_config.dataset_split, - dataset_text_column=training_config.dataset_text_column, - streaming=training_config.streaming, - dataset_trust_remote_code=training_config.dataset_trust_remote_code, - cache_path=training_config.cache_path, - max_samples=training_config.max_samples, # Added max_samples - ) - - # Check if ActivationStore was initialized correctly - mock_store_cls.assert_called_once_with( - activation_generator=mock_activation_generator, - n_batches_in_buffer=training_config.n_batches_in_buffer, - train_batch_size_tokens=training_config.train_batch_size_tokens, - normalization_method=training_config.normalization_method, - normalization_estimation_batches=training_config.normalization_estimation_batches, - device=trainer.device, - start_time=mock_start_time, # Added start_time - ) - assert result == mock_store_cls.return_value - - -# --- Test Logging and Saving --- - - -def test_log_metrics( - temp_log_dir, training_config, mock_wandb_logger, mock_loss_manager -): # Added mocks - """Test _log_metrics method.""" - with patch.object(CLTTrainer, "_save_metrics") as mock_save_metrics: - # Create a trainer instance manually - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.log_dir = temp_log_dir - trainer.metrics = {"train_losses": []} - trainer.training_config = training_config # Use fixture - trainer.wandb_logger = mock_wandb_logger # Use fixture - trainer.loss_manager = mock_loss_manager # Use fixture - trainer.scheduler = MagicMock() # Mock scheduler - trainer.scheduler.get_last_lr.return_value = [0.0001] # Mock LR - - loss_dict = {"total": 0.5, "reconstruction": 0.4, "sparsity": 0.1} - current_step = training_config.log_interval - 1 # Step before logging interval - - # --- Test before log interval --- - trainer._log_metrics(current_step, loss_dict) - - # Check metrics update - assert len(trainer.metrics["train_losses"]) == 1 - assert trainer.metrics["train_losses"][0]["step"] == current_step - assert trainer.metrics["train_losses"][0]["total"] == 0.5 - - # Check WandB call - mock_wandb_logger.log_step.assert_called_once_with( - current_step, - loss_dict, - lr=0.0001, - sparsity_lambda=mock_loss_manager.get_current_sparsity_lambda.return_value, - ) - - # Should not have saved metrics yet - mock_save_metrics.assert_not_called() - - # --- Test at log interval --- - current_step = training_config.log_interval - trainer._log_metrics(current_step, loss_dict) - - # Check WandB call count - assert mock_wandb_logger.log_step.call_count == 2 - - # Should save metrics now - mock_save_metrics.assert_called_once() - - -def test_save_metrics(temp_log_dir): - """Test _save_metrics method.""" - # Create a trainer instance manually - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.log_dir = temp_log_dir - trainer.metrics = { - "train_losses": [{"step": 1, "total": 0.5}], - "eval_metrics": [ 
- {"step": 10, "sparsity/avg_l0": 15.0} - ], # Changed from l0_stats - } - - trainer._save_metrics() - - # Check if metrics file was created - metrics_path = os.path.join(temp_log_dir, "metrics.json") - assert os.path.exists(metrics_path) - - # Check file contents - with open(metrics_path, "r") as f: - saved_metrics = json.load(f) - assert "train_losses" in saved_metrics - assert "eval_metrics" in saved_metrics # Check for eval_metrics key - assert saved_metrics["train_losses"][0]["step"] == 1 - assert saved_metrics["train_losses"][0]["total"] == 0.5 - assert saved_metrics["eval_metrics"][0]["step"] == 10 - assert saved_metrics["eval_metrics"][0]["sparsity/avg_l0"] == 15.0 - - -def test_save_checkpoint( - temp_log_dir, mock_model, mock_activation_store, mock_wandb_logger -): # Added mocks - """Test _save_checkpoint method.""" - with patch("torch.save") as mock_torch_save: - # Create a trainer instance manually - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.log_dir = temp_log_dir - trainer.model = mock_model - trainer.activation_store = mock_activation_store - trainer.wandb_logger = mock_wandb_logger - - step = 100 - trainer._save_checkpoint(step) - - model_ckpt_path = os.path.join(temp_log_dir, f"clt_checkpoint_{step}.pt") - store_ckpt_path = os.path.join( - temp_log_dir, f"activation_store_checkpoint_{step}.pt" - ) - latest_model_path = os.path.join(temp_log_dir, "clt_checkpoint_latest.pt") - latest_store_path = os.path.join( - temp_log_dir, "activation_store_checkpoint_latest.pt" - ) - - # Check if model save was called for step and latest - mock_model.save.assert_any_call(model_ckpt_path) - mock_model.save.assert_any_call(latest_model_path) - assert mock_model.save.call_count == 2 - - # Check if activation store state was saved for step and latest - mock_activation_store.state_dict.assert_called() # Ensure state_dict is called - mock_torch_save.assert_any_call( - mock_activation_store.state_dict.return_value, store_ckpt_path - ) - mock_torch_save.assert_any_call( - mock_activation_store.state_dict.return_value, latest_store_path - ) - assert mock_torch_save.call_count == 2 - - # Check WandB artifact logging - mock_wandb_logger.log_artifact.assert_called_once_with( - artifact_path=model_ckpt_path, - artifact_type="model", - name=f"clt_checkpoint_{step}", - ) - - -def test_load_checkpoint( - temp_log_dir, mock_model, clt_config, training_config -): # Added configs - """Test load_checkpoint method.""" - # We need a more realistic setup for the store to be loadable - with patch("os.path.exists") as mock_exists, patch( - "torch.load" - ) as mock_torch_load, patch( - "clt.training.trainer.ActivationStore" - ) as mock_store_cls: # Patch store class - - mock_exists.return_value = True # Assume files exist - mock_store_state = {"mock_store_state": "value"} - mock_torch_load.return_value = mock_store_state - - # Mock the activation store instance that gets created during init - mock_store_instance = MagicMock(spec=ActivationStore) - mock_store_cls.return_value = mock_store_instance - - # --- Initialize a trainer first --- - # Need to patch components during init as well - with patch( - "clt.training.trainer.CrossLayerTranscoder", return_value=mock_model - ), patch("clt.training.trainer.LossManager"), patch( - "clt.training.trainer.ActivationExtractorCLT" - ), patch( - "clt.training.trainer.CLTEvaluator" - ), patch( - "clt.training.trainer.WandBLogger" - ): - - # We bypass the internal _create_activation_store call by patching ActivationStore class - trainer = CLTTrainer(clt_config, 
training_config, log_dir=temp_log_dir) - # Manually assign the mocked store instance AFTER init bypasses creation - trainer.activation_store = mock_store_instance - trainer.device = torch.device("cpu") # Ensure device is set - - # --- Now test loading --- - checkpoint_path = os.path.join(temp_log_dir, "clt_checkpoint_100.pt") - store_checkpoint_path = os.path.join( - temp_log_dir, "activation_store_checkpoint_100.pt" - ) - - # Test loading with explicit store path - trainer.load_checkpoint(checkpoint_path, store_checkpoint_path) - - # Check if model load was called - mock_model.load.assert_called_once_with(checkpoint_path) - - # Check torch.load was called for the store state - mock_torch_load.assert_called_once_with( - store_checkpoint_path, map_location=trainer.device - ) - - # Check if activation store load_state_dict was called - mock_store_instance.load_state_dict.assert_called_once_with(mock_store_state) - - # --- Test loading with derived store path --- - mock_model.load.reset_mock() - mock_torch_load.reset_mock() - mock_store_instance.load_state_dict.reset_mock() - - trainer.load_checkpoint(checkpoint_path) # No store path provided - - mock_model.load.assert_called_once_with(checkpoint_path) - # Should derive the store path - mock_torch_load.assert_called_once_with( - store_checkpoint_path, map_location=trainer.device - ) - mock_store_instance.load_state_dict.assert_called_once_with(mock_store_state) - - # --- Test loading latest --- - mock_model.load.reset_mock() - mock_torch_load.reset_mock() - mock_store_instance.load_state_dict.reset_mock() - - latest_model_path = os.path.join(temp_log_dir, "clt_checkpoint_latest.pt") - latest_store_path = os.path.join( - temp_log_dir, "activation_store_checkpoint_latest.pt" - ) - - trainer.load_checkpoint(latest_model_path) # Load latest model - - mock_model.load.assert_called_once_with(latest_model_path) - # Should derive the latest store path - mock_torch_load.assert_called_once_with( - latest_store_path, map_location=trainer.device - ) - mock_store_instance.load_state_dict.assert_called_once_with(mock_store_state) - - -# --- Test Dead Neuron Logic --- - - -def test_dead_neurons_mask(clt_config, training_config): - """Test the dead_neurons_mask property.""" - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.clt_config = clt_config - trainer.training_config = training_config - trainer.device = torch.device("cpu") - - # Initialize counter - trainer.n_forward_passes_since_fired = torch.zeros( - (clt_config.num_layers, clt_config.num_features), - device=trainer.device, - dtype=torch.long, - ) - - # Set some neurons as dead - trainer.n_forward_passes_since_fired[0, 0] = training_config.dead_feature_window + 1 - trainer.n_forward_passes_since_fired[1, 10] = ( - training_config.dead_feature_window + 5 - ) - - # Set some as not dead - trainer.n_forward_passes_since_fired[0, 1] = training_config.dead_feature_window - 1 - trainer.n_forward_passes_since_fired[1, 11] = 0 - - mask = trainer.dead_neurons_mask - - assert mask.shape == (clt_config.num_layers, clt_config.num_features) - assert mask.dtype == torch.bool - assert mask[0, 0].item() is True - assert mask[1, 10].item() is True - assert mask[0, 1].item() is False - assert mask[1, 11].item() is False - - -# --- Test Training Loop Logic --- - - -@pytest.mark.parametrize("with_scheduler", [True, False]) -def test_train( - clt_config, - training_config, - temp_log_dir, - mock_model, - mock_loss_manager, - mock_activation_store, # Added store - mock_evaluator, # Added evaluator - mock_wandb_logger, # 
Added logger - with_scheduler, -): - """Test train method main loop, evaluation, checkpointing, and logging.""" - # Adjust training steps for faster test - training_config.training_steps = 5 - training_config.eval_interval = 2 - training_config.checkpoint_interval = 3 - training_config.log_interval = 1 # Log every step for testing calls - - # Mock optimizer and potentially scheduler - mock_optimizer = MagicMock(spec=torch.optim.AdamW) - mock_scheduler = ( - MagicMock(spec=torch.optim.lr_scheduler.LRScheduler) if with_scheduler else None - ) - - # Mock tqdm to prevent console output and allow checking calls - mock_pbar = MagicMock() - mock_pbar.__iter__.return_value = iter(range(training_config.training_steps)) - - with patch( - "clt.training.trainer.tqdm", - return_value=mock_pbar, # Return the configured mock pbar - ) as mock_tqdm_cls, patch.object( - CLTTrainer, "_save_metrics" - ) as mock_save_metrics, patch( - "torch.optim.AdamW", return_value=mock_optimizer - ), patch( - "torch.optim.Adam", return_value=mock_optimizer - ), patch( - "torch.optim.lr_scheduler.LinearLR", return_value=mock_scheduler - ), patch( - "torch.optim.lr_scheduler.CosineAnnealingLR", return_value=mock_scheduler - ), patch( - # Prevent NaN check from skipping backward pass - "clt.training.trainer.torch.isnan", - return_value=False, - ): - - # --- Set up Trainer Instance Manually (Bypass __init__) --- - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.clt_config = clt_config - trainer.training_config = training_config - trainer.log_dir = temp_log_dir - trainer.device = torch.device("cpu") - trainer.start_time = time.time() - - # Assign mocks directly - trainer.model = mock_model - trainer.optimizer = mock_optimizer - trainer.scheduler = mock_scheduler - trainer.activation_store = mock_activation_store - trainer.loss_manager = mock_loss_manager - trainer.evaluator = mock_evaluator - trainer.wandb_logger = mock_wandb_logger - - # Initialize metrics dict and dead neuron counter - trainer.metrics = {"train_losses": [], "eval_metrics": []} - trainer.n_forward_passes_since_fired = torch.zeros( - (clt_config.num_layers, clt_config.num_features), - device=trainer.device, - dtype=torch.long, - ) - # Mock the dead_neurons_mask property to return a fixed mask for evaluator call - with patch.object( - CLTTrainer, "dead_neurons_mask", new_callable=PropertyMock - ) as mock_dead_mask: - mock_dead_mask.return_value = torch.zeros_like( - trainer.n_forward_passes_since_fired, dtype=torch.bool - ) - - # --- Run Training --- - result = trainer.train( - eval_every=training_config.eval_interval - ) # Use correct param name - - # --- Assertions --- - total_steps = training_config.training_steps - - # 1. 
Training Loop Execution - assert mock_tqdm_cls.call_count == 1 # tqdm class called once - # Check methods called on the returned pbar mock - assert mock_pbar.refresh.call_count >= total_steps # Called frequently - assert mock_pbar.set_description.call_count == total_steps - # Postfix might only be set on eval steps - eval_steps_count = ( - total_steps + training_config.eval_interval - 1 - ) // training_config.eval_interval - assert mock_pbar.set_postfix_str.call_count == eval_steps_count - assert mock_pbar.close.call_count == 1 # Called at the end - - assert ( - mock_activation_store.__next__.call_count == total_steps - ) # Batch fetched per step - assert ( - mock_loss_manager.compute_total_loss.call_count == total_steps - ) # Loss computed per step - assert mock_optimizer.zero_grad.call_count == total_steps - # Assuming loss is never NaN in this test - loss_tensor, _ = mock_loss_manager.compute_total_loss.return_value - assert ( - loss_tensor.backward.call_count == total_steps - ) # Backward called per step - assert ( - mock_optimizer.step.call_count == total_steps - ) # Optimizer stepped per step - if with_scheduler: - assert ( - mock_scheduler.step.call_count == total_steps - ) # Scheduler stepped per step - - # 2. Dead Neuron Update Logic - assert ( - mock_model.get_feature_activations.call_count == total_steps - ) # Called each step - # Check a specific counter value (difficult to assert exact value due to random activations) - # Instead, we mainly rely on the call count above and the separate dead neuron test - - # 3. Logging - # _log_metrics is called internally by train, not patched here. Check wandb logger call instead. - assert ( - mock_wandb_logger.log_step.call_count == total_steps - ) # Logged every step - # Check if _save_metrics was called due to log_interval=1 - assert ( - mock_save_metrics.call_count >= total_steps - ) # Called at least once per step - - # 4. Evaluation (Steps 0, 2, 4 because eval_interval=2, steps=5) - eval_steps = [0, 2, 4] - assert mock_evaluator.compute_metrics.call_count == len(eval_steps) - assert mock_wandb_logger.log_evaluation.call_count == len(eval_steps) - # Check args for evaluator and logger calls (example: first call at step 0) - first_eval_call_args = mock_evaluator.compute_metrics.call_args_list[0] - _, kwargs = first_eval_call_args - assert torch.equal( - kwargs["dead_neuron_mask"], mock_dead_mask.return_value - ) # Check mask passed - first_log_eval_call_args = mock_wandb_logger.log_evaluation.call_args_list[ - 0 - ] - args, _ = first_log_eval_call_args - assert args[0] == eval_steps[0] # Check step number - assert ( - args[1] == mock_evaluator.compute_metrics.return_value - ) # Check metrics dict passed - - # 5. Checkpointing (Steps 3 and 4 because interval=3, steps=5, plus final) - checkpoint_steps = [3, 4] - # Given trainer implementation behavior, the model.save call count is 7 - # This might be due to additional saves of latest checkpoints - assert mock_model.save.call_count == 7 # According to observed behavior - - # 6. Final Actions - # _save_metrics called within log_metrics (once per step here) and once more at the end - assert mock_save_metrics.call_count >= total_steps + 1 - assert mock_wandb_logger.finish.call_count == 1 # Wandb finished - - # 7. 
Return Value - assert result == mock_model - - -def test_train_with_nan_loss( - clt_config, - training_config, - mock_model, - mock_activation_store, - mock_evaluator, - mock_wandb_logger, -): # Added mocks - """Test train method handling of NaN loss.""" - training_config.training_steps = 3 - training_config.eval_interval = 10 # Avoid eval for simplicity - training_config.checkpoint_interval = 10 # Avoid checkpointing - - # Mock optimizer - mock_optimizer = MagicMock(spec=torch.optim.AdamW) - - # Set up mock loss manager to return NaN loss - mock_loss_manager = MagicMock(spec=LossManager) - nan_tensor = torch.tensor(float("nan")) - loss_dict = { - "total": float("nan"), - "reconstruction": float("nan"), - "sparsity": float("nan"), - "preactivation": float("nan"), - } - mock_loss_manager.compute_total_loss.return_value = (nan_tensor, loss_dict) - - with patch("torch.isnan", return_value=True), patch( - "tqdm.tqdm", return_value=range(training_config.training_steps) - ), patch( - "torch.optim.AdamW", return_value=mock_optimizer - ): # Patch optimizer creation - - # Set up trainer - bypass init - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.clt_config = clt_config - trainer.training_config = training_config - trainer.log_dir = "mock_log_dir" - trainer.device = torch.device("cpu") - trainer.start_time = time.time() - - # Assign mocks - trainer.model = mock_model - trainer.optimizer = mock_optimizer - trainer.activation_store = mock_activation_store - trainer.loss_manager = mock_loss_manager - trainer.evaluator = mock_evaluator - trainer.wandb_logger = mock_wandb_logger - trainer.metrics = {"train_losses": [], "eval_metrics": []} - trainer.scheduler = None - trainer.n_forward_passes_since_fired = torch.zeros( - (clt_config.num_layers, clt_config.num_features), device=trainer.device - ) - - # Run training - trainer.train(eval_every=training_config.eval_interval) - - # Check that backward and step were not called due to NaN loss - # loss_tensor.backward will not be available directly as it's created inside train - # Instead, check that optimizer.step was not called - mock_optimizer.step.assert_not_called() - # Check that zero_grad WAS called - assert mock_optimizer.zero_grad.call_count == training_config.training_steps - - -def test_train_with_error_in_backward( - clt_config, - training_config, - mock_model, - mock_activation_store, - mock_evaluator, - mock_wandb_logger, -): # Added mocks - """Test train method handling of error in backward pass.""" - training_config.training_steps = 1 # Only one step needed - training_config.eval_interval = 10 - training_config.checkpoint_interval = 10 - - # Mock optimizer - mock_optimizer = MagicMock(spec=torch.optim.AdamW) - - # Set up mock loss tensor that raises error on backward - mock_loss_tensor = MagicMock(spec=torch.Tensor) - mock_loss_tensor.backward.side_effect = RuntimeError("Test error in backward") - # Need isnan to return False for backward to be attempted - mock_loss_tensor.isnan.return_value = False - - mock_loss_dict = { - "total": 0.5, - "reconstruction": 0.4, - "sparsity": 0.1, - "preactivation": 0.0, - } - mock_loss_manager = MagicMock(spec=LossManager) - mock_loss_manager.compute_total_loss.return_value = ( - mock_loss_tensor, - mock_loss_dict, - ) - - with patch("torch.optim.AdamW", return_value=mock_optimizer), patch( - "tqdm.tqdm", return_value=range(training_config.training_steps) - ), patch( - # Patch isnan used in the trainer module - "clt.training.trainer.torch.isnan", - return_value=False, - ): - - # Set up trainer - bypass 
init - trainer = CLTTrainer.__new__(CLTTrainer) - trainer.clt_config = clt_config - trainer.training_config = training_config - trainer.log_dir = "mock_log_dir" - trainer.device = torch.device("cpu") - trainer.start_time = time.time() - - # Assign mocks - trainer.model = mock_model - trainer.optimizer = mock_optimizer - trainer.activation_store = mock_activation_store - trainer.loss_manager = mock_loss_manager - trainer.evaluator = mock_evaluator - trainer.wandb_logger = mock_wandb_logger - trainer.metrics = {"train_losses": [], "eval_metrics": []} - trainer.scheduler = None - trainer.n_forward_passes_since_fired = torch.zeros( - (clt_config.num_layers, clt_config.num_features), device=trainer.device - ) - - # Run training - trainer.train(eval_every=training_config.eval_interval) - - # Check that backward was called but optimizer step was not - mock_loss_tensor.backward.assert_called_once() - mock_optimizer.step.assert_not_called() - # Check that zero_grad WAS called - mock_optimizer.zero_grad.assert_called_once() - - -def test_activation_store_exception_handling( - clt_config, training_config, mock_model, mock_evaluator, mock_wandb_logger -): # Added mocks - """Test handling of exceptions from the activation store.""" - training_config.training_steps = 10 # Set steps > number of successful batches - training_config.eval_interval = 100 # Avoid eval/checkpointing - training_config.checkpoint_interval = 100 - - # Mock optimizer - mock_optimizer = MagicMock(spec=torch.optim.AdamW) - mock_loss_manager = MagicMock(spec=LossManager) # Need loss manager - # Mock loss tensor needed for backward call check - mock_loss_tensor = MagicMock(spec=torch.Tensor) - mock_loss_tensor.isnan.return_value = False - mock_loss_manager.compute_total_loss.return_value = (mock_loss_tensor, {}) - - # --- Test StopIteration --- - mock_store_stopiter = MagicMock(spec=ActivationStore) - mock_store_stopiter.__iter__.return_value = mock_store_stopiter - # Simulate 2 good batches then StopIteration - good_batch = ({0: torch.randn(10, 768)}, {0: torch.randn(10, 768)}) - mock_store_stopiter.__next__.side_effect = [good_batch, good_batch, StopIteration] - - with patch("tqdm.tqdm", return_value=range(training_config.training_steps)), patch( - "torch.optim.AdamW", return_value=mock_optimizer - ), patch( - "torch.isnan", return_value=False - ): # Patch isnan for this block - - trainer = CLTTrainer.__new__(CLTTrainer) - # Assign necessary attributes... - trainer.clt_config = clt_config - trainer.training_config = training_config - trainer.log_dir = "mock_log_dir" - trainer.device = torch.device("cpu") - trainer.start_time = time.time() - trainer.model = mock_model - trainer.optimizer = mock_optimizer - trainer.activation_store = mock_store_stopiter # Use StopIteration store - trainer.loss_manager = mock_loss_manager - trainer.evaluator = mock_evaluator - trainer.wandb_logger = mock_wandb_logger - trainer.metrics = {"train_losses": [], "eval_metrics": []} - trainer.scheduler = None - trainer.n_forward_passes_since_fired = torch.zeros( - (clt_config.num_layers, clt_config.num_features), device=trainer.device - ) - - trainer.train(eval_every=training_config.eval_interval) - - # Check that training loop stopped early (after 2 steps) - assert mock_loss_manager.compute_total_loss.call_count == 2 - # Check that the loop exited cleanly. finish should be called. 
- mock_wandb_logger.finish.assert_called_once() - - # --- Test Other Exception --- - mock_store_valueerr = MagicMock(spec=ActivationStore) - mock_store_valueerr.__iter__.return_value = mock_store_valueerr - # Simulate 1 good batch, 1 ValueError, 1 good batch - mock_store_valueerr.__next__.side_effect = [ - good_batch, - ValueError("Test error"), - good_batch, - StopIteration, - ] # Add StopIteration - - # Reset mocks for the new run - mock_optimizer.reset_mock() - mock_loss_manager.reset_mock() - mock_evaluator.reset_mock() - mock_wandb_logger.reset_mock() - mock_model.reset_mock() # Reset model mocks too (save/load) - mock_loss_tensor.reset_mock() # Reset loss tensor mock - # Re-setup loss manager return value as it was reset - mock_loss_manager.compute_total_loss.return_value = (mock_loss_tensor, {}) - - with patch("tqdm.tqdm", return_value=range(training_config.training_steps)), patch( - "torch.optim.AdamW", return_value=mock_optimizer - ), patch( - "torch.isnan", return_value=False - ): # Ensure isnan is False - - trainer = CLTTrainer.__new__(CLTTrainer) - # Assign necessary attributes... - trainer.clt_config = clt_config - trainer.training_config = training_config - trainer.log_dir = "mock_log_dir" - trainer.device = torch.device("cpu") - trainer.start_time = time.time() - trainer.model = mock_model - trainer.optimizer = mock_optimizer - trainer.activation_store = mock_store_valueerr # Use ValueError store - trainer.loss_manager = mock_loss_manager - trainer.evaluator = mock_evaluator - trainer.wandb_logger = mock_wandb_logger - trainer.metrics = {"train_losses": [], "eval_metrics": []} - trainer.scheduler = None - trainer.n_forward_passes_since_fired = torch.zeros( - (clt_config.num_layers, clt_config.num_features), device=trainer.device - ) - - trainer.train(eval_every=training_config.eval_interval) - - # Check that training continued after the error (ran for steps 0 and 2) - assert mock_loss_manager.compute_total_loss.call_count == 2 - # Step 1 should have been skipped, so optimizer step should only happen twice - assert mock_optimizer.step.call_count == 2 - # Check finish was called - mock_wandb_logger.finish.assert_called_once()
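
Illustrative note (not part of the patch): the deleted loss tests above hand-compute their expected values from a small set of formulas. As a reference, the following is a minimal sketch of those formulas under the same assumptions the tests encode — per-layer MSE averaged over the layers present in both dicts, a summed tanh penalty on decoder-norm-weighted activations with a linear lambda warm-up, and a ReLU penalty on negative pre-activations. The function names and signatures are illustrative only, not the LossManager API.

    import torch


    def expected_reconstruction_loss(predicted, target):
        # Mean of per-layer MSE over the layers present in both dicts (0.0 if none match).
        losses = [torch.mean((predicted[i] - target[i]) ** 2) for i in predicted if i in target]
        return sum(losses) / len(losses) if losses else torch.tensor(0.0)


    def expected_sparsity_penalty(acts, decoder_norms, sparsity_lambda, sparsity_c, step, total_steps):
        # tanh penalty on decoder-norm-weighted activations, summed (not averaged),
        # scaled by a lambda that ramps linearly from 0 to sparsity_lambda.
        total = torch.tensor(0.0)
        for layer, a in acts.items():
            if layer not in decoder_norms or a.numel() == 0:
                continue
            a_flat = a.reshape(-1, a.shape[-1])
            weighted = a_flat * decoder_norms[layer].unsqueeze(0)
            total = total + torch.tanh(sparsity_c * weighted).sum()
        lambda_factor = sparsity_lambda * (step / total_steps)
        return lambda_factor * total, lambda_factor


    def expected_preactivation_loss(preacts, preactivation_coef):
        # Penalise negative pre-activations: coef * sum(relu(-preacts)) / numel.
        if preacts.numel() == 0:
            return torch.tensor(0.0)
        return preactivation_coef * torch.relu(-preacts).sum() / preacts.numel()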
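
Similarly, a minimal sketch of the dead-neuron mask that test_dead_neurons_mask exercises. The strict comparison is an assumption: the deleted assertions only check counters strictly above and strictly below the window, so the behaviour exactly at the window boundary is not pinned down here.

    import torch


    def dead_neurons_mask(n_forward_passes_since_fired: torch.Tensor, dead_feature_window: int) -> torch.Tensor:
        # A feature is flagged dead once it has gone more than dead_feature_window
        # training steps without firing.
        return n_forward_passes_since_fired > dead_feature_window


    # e.g. dead_neurons_mask(torch.tensor([[11, 9]]), 10) -> tensor([[ True, False]])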