curt-tigges
diff --git a/‎clt/activation_generation/generator.py‎
Lines changed: 31 additions & 12 deletions b/‎clt/activation_generation/generator.py‎
Lines changed: 31 additions & 12 deletions
diff --git a/‎pytest.ini‎
Lines changed: 2 additions & 1 deletion b/‎pytest.ini‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎tests/conftest.py‎
Lines changed: 21 additions & 0 deletions b/‎tests/conftest.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎tests/integration/test_clt_distributed.py‎
Lines changed: 130 additions & 0 deletions b/‎tests/integration/test_clt_distributed.py‎
Lines changed: 130 additions & 0 deletions
diff --git a/‎tests/integration/test_clt_end_to_end.py‎
Lines changed: 101 additions & 0 deletions b/‎tests/integration/test_clt_end_to_end.py‎
Lines changed: 101 additions & 0 deletions
@@ -798,30 +798,40 @@ def _write_chunk(
                                 chunks=(min(rows, 16384), d_model),
                             )
 
+                        # --- Use a SINGLE permutation shared across all layers --- #
+                        if rows > 0:
+                            shared_perm = torch.randperm(rows, device=next(iter(buf_inp_gpu.values()))[0].device)
+                        else:
+                            # Degenerate case – zero-row chunk (should not normally happen)
+                            shared_perm = None
+
                         layer_data_to_write = []
                         for layer_id in layer_ids:
                             with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_data_prep"):
                                 with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_concat"):
                                     layer_inp_gpu = torch.cat(buf_inp_gpu[layer_id], dim=0)
                                     layer_tgt_gpu = torch.cat(buf_tgt_gpu[layer_id], dim=0)
 
-                                with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_permute"):
-                                    perm = torch.randperm(rows, device=layer_inp_gpu.device)
-                                    layer_inp_gpu_perm = layer_inp_gpu[perm]
-                                    layer_tgt_gpu_perm = layer_tgt_gpu[perm]
+                                if shared_perm is not None:
+                                    with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_permute"):
+                                        layer_inp_gpu_perm = layer_inp_gpu[shared_perm]
+                                        layer_tgt_gpu_perm = layer_tgt_gpu[shared_perm]
+                                else:
+                                    layer_inp_gpu_perm = layer_inp_gpu
+                                    layer_tgt_gpu_perm = layer_tgt_gpu
 
                                 with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_cpu_transfer"):
                                     layer_inp_cpu = layer_inp_gpu_perm.cpu()
                                     layer_tgt_cpu = layer_tgt_gpu_perm.cpu()
 
                                 with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_convert_numpy"):
                                     inputs_np = (
-                                        layer_inp_cpu.numpy().view(np.uint16)
+                                        layer_inp_cpu.view(torch.int16).numpy()
                                         if self.torch_dtype == torch.bfloat16
                                         else layer_inp_cpu.numpy()
                                     )
                                     targets_np = (
-                                        layer_tgt_cpu.numpy().view(np.uint16)
+                                        layer_tgt_cpu.view(torch.int16).numpy()
                                         if self.torch_dtype == torch.bfloat16
                                         else layer_tgt_cpu.numpy()
                                     )
@@ -855,29 +865,38 @@ def write_layer_data(layer_id_arg: int, inputs_data: np.ndarray, targets_data: n
 
             elif self.cfg.output_format == "npz":
                 npz_save_dict = {}
+                # --- Use a SINGLE permutation shared across all layers (same as HDF5 path) --- #
+                if rows > 0:
+                    shared_perm = torch.randperm(rows, device=next(iter(buf_inp_gpu.values()))[0].device)
+                else:
+                    shared_perm = None
+
                 for layer_id in layer_ids:
                     with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_data_prep_npz"):
                         with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_concat_npz"):
                             layer_inp_gpu = torch.cat(buf_inp_gpu[layer_id], dim=0)
                             layer_tgt_gpu = torch.cat(buf_tgt_gpu[layer_id], dim=0)
 
-                        with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_permute_npz"):
-                            perm = torch.randperm(rows, device=layer_inp_gpu.device)
-                            layer_inp_gpu_perm = layer_inp_gpu[perm]
-                            layer_tgt_gpu_perm = layer_tgt_gpu[perm]
+                        if shared_perm is not None:
+                            with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_permute_npz"):
+                                layer_inp_gpu_perm = layer_inp_gpu[shared_perm]
+                                layer_tgt_gpu_perm = layer_tgt_gpu[shared_perm]
+                        else:
+                            layer_inp_gpu_perm = layer_inp_gpu
+                            layer_tgt_gpu_perm = layer_tgt_gpu
 
                         with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_cpu_transfer_npz"):
                             layer_inp_cpu = layer_inp_gpu_perm.cpu()
                             layer_tgt_cpu = layer_tgt_gpu_perm.cpu()
 
                         with self._conditional_measure(f"chunk_{chunk_idx}_layer_{layer_id}_convert_numpy_npz"):
                             inputs_np = (
-                                layer_inp_cpu.numpy().view(np.uint16)
+                                layer_inp_cpu.view(torch.int16).numpy()
                                 if self.torch_dtype == torch.bfloat16
                                 else layer_inp_cpu.numpy()
                             )
                             targets_np = (
-                                layer_tgt_cpu.numpy().view(np.uint16)
+                                layer_tgt_cpu.view(torch.int16).numpy()
                                 if self.torch_dtype == torch.bfloat16
                                 else layer_tgt_cpu.numpy()
                             )
 
@@ -1,3 +1,5 @@
 [pytest]
 markers =
-    integration: marks tests as integration tests that verify multiple components working together \
+    integration: marks tests as integration tests that verify multiple components working together \
+    distributed: marks tests that spawn multiple torch.distributed worker processes
+    require_gpu: marks tests that require a GPU (CUDA or MPS) to run \
@@ -0,0 +1,21 @@
+import pytest
+import torch
+
+
+def get_available_devices():
+    """List the torch device names usable on this machine.
+
+    "cpu" always comes first; "cuda" and then "mps" follow when the
+    corresponding torch availability check returns True.
+    """
+    accelerator_probes = (
+        ("cuda", torch.cuda.is_available),
+        ("mps", torch.backends.mps.is_available),
+    )
+    return ["cpu"] + [name for name, is_available in accelerator_probes if is_available()]
+
+
+# Evaluated once when this module is imported; used as the `params` list of the `device` fixture.
+DEVICES = get_available_devices()
+
+
+@pytest.fixture(params=DEVICES)
+def device(request):
+    """Parametrized fixture: one torch.device per entry in DEVICES."""
+    device_name = request.param
+    return torch.device(device_name)
@@ -0,0 +1,130 @@
+import pytest
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import os
+from typing import cast
+
+from clt.config import CLTConfig
+from clt.models.clt import CrossLayerTranscoder
+
+
+def setup_distributed_environment(rank, world_size, port="12356"):
+    """Join the process group for this worker using the gloo backend.
+
+    Sets MASTER_ADDR to "localhost" and MASTER_PORT to ``port`` in this
+    process's environment, then calls ``dist.init_process_group``.
+
+    Args:
+        rank: Rank passed to ``init_process_group``.
+        world_size: World size passed to ``init_process_group``.
+        port: Value stored in MASTER_PORT (a string, as required by os.environ).
+    """
+    os.environ.update({"MASTER_ADDR": "localhost", "MASTER_PORT": port})
+    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)
+
+
+def cleanup_distributed_environment():
+    """Tear down the process group by calling ``dist.destroy_process_group()``."""
+    dist.destroy_process_group()
+
+
+def distributed_test_runner(rank, world_size, test_fn, *args):
+    """Run ``test_fn`` between process-group setup and cleanup.
+
+    Args:
+        rank: Rank of this process; forwarded to setup and to ``test_fn``.
+        world_size: Number of processes; forwarded to setup and to ``test_fn``.
+        test_fn: Callable invoked as ``test_fn(rank, world_size, *args)``.
+        *args: Extra positional arguments forwarded to ``test_fn``.
+    """
+    # Setup sits outside the try block, so cleanup is only attempted after setup has returned.
+    setup_distributed_environment(rank, world_size)
+    try:
+        test_fn(rank, world_size, *args)
+    finally:
+        # Runs whether test_fn returns normally or raises.
+        cleanup_distributed_environment()
+
+
+# --- Test Functions (to be run in separate processes) ---
+
+
+def _test_forward_pass_distributed(rank, world_size):
+    """
+    Checks that every rank computes the same forward-pass result.
+
+    Each rank seeds identically, builds the same model and inputs, reduces the
+    layer-0 reconstruction to its mean, and compares that scalar with the
+    values gathered from all ranks.
+    """
+    cpu = torch.device("cpu")
+
+    # Same seed on every rank so the model is initialized identically
+    torch.manual_seed(42)
+    config = CLTConfig(num_layers=2, d_model=8, num_features=16, activation_fn="relu")
+    transcoder = CrossLayerTranscoder(config=config, process_group=dist.group.WORLD, device=cpu)
+
+    # Re-seed so every rank draws the same inputs (layer 0 first, then layer 1)
+    torch.manual_seed(123)
+    inputs_by_layer = {layer: torch.randn(20, config.d_model, device=cpu) for layer in (0, 1)}
+
+    # Mean of the layer-0 reconstruction: a simple, deterministic scalar
+    local_loss = torch.mean(transcoder.forward(inputs_by_layer)[0])
+
+    # Collect that scalar from all ranks
+    gathered = [torch.zeros_like(local_loss) for _ in range(world_size)]
+    dist.all_gather(gathered, local_loss)
+
+    # Any mismatch means the forward pass diverged between ranks
+    assert all(
+        torch.allclose(local_loss, remote_loss) for remote_loss in gathered
+    ), "Forward pass results (losses) differ across ranks"
+
+
+def _test_sharded_gradient(rank, world_size):
+    """
+    Tests that sharded parameters receive different gradients on each rank.
+
+    Every pair of ranks is compared, so the check does not assume a world
+    size of exactly two.
+
+    Args:
+        rank: Rank of this process; also offsets the weight-initialization seed.
+        world_size: Number of participating processes.
+    """
+    from itertools import combinations
+
+    device = torch.device("cpu")
+    # Use rank-specific seed for weight initialization to ensure different weights
+    torch.manual_seed(42 + rank)
+
+    clt_config = CLTConfig(num_layers=2, d_model=8, num_features=16, activation_fn="relu")
+    model = CrossLayerTranscoder(config=clt_config, process_group=dist.group.WORLD, device=device)
+
+    # All ranks get the same input (and, drawn right after it, the same target)
+    torch.manual_seed(123)
+    sample_inputs = {0: torch.randn(5, clt_config.d_model, device=device)}
+
+    # Forward pass
+    reconstructions = model.forward(sample_inputs)
+
+    # Create a loss that depends on the actual output values
+    # This will produce different gradients for different weight values
+    target = torch.randn_like(reconstructions[0])
+    loss = torch.nn.functional.mse_loss(reconstructions[0], target)
+
+    # Backward pass
+    loss.backward()
+
+    # Test gradients of a SHARDED parameter (e.g., Encoder weights)
+    sharded_grad_optional = model.encoder_module.encoders[0].weight.grad
+    assert sharded_grad_optional is not None, "Gradient for sharded parameter should exist"
+    sharded_grad = cast(torch.Tensor, sharded_grad_optional)
+
+    # Gather all gradients to compare
+    grad_list = [torch.zeros_like(sharded_grad) for _ in range(world_size)]
+    dist.all_gather(grad_list, sharded_grad)
+
+    # The gradients for a sharded parameter should be DIFFERENT on each rank
+    # because each rank has different weights and computes different outputs.
+    # Compare every pair of ranks rather than only ranks 0 and 1.
+    for grad_a, grad_b in combinations(grad_list, 2):
+        assert not torch.allclose(
+            grad_a, grad_b, rtol=1e-5, atol=1e-8
+        ), "Gradients for sharded parameters should be different across ranks"
+
+
+# --- Pytest Test Class ---
+
+
+@pytest.mark.integration
+@pytest.mark.distributed
+@pytest.mark.skipif(not dist.is_available(), reason="torch.distributed not available")
+class TestCLTDistributed:
+    """Runs each distributed check in two spawned worker processes."""
+
+    @staticmethod
+    def _spawn_workers(worker_fn, world_size=2):
+        """Run ``worker_fn`` via ``distributed_test_runner`` on ``world_size`` processes and join them."""
+        mp.spawn(  # type: ignore[attr-defined]
+            distributed_test_runner,
+            args=(world_size, worker_fn),
+            nprocs=world_size,
+            join=True,
+        )
+
+    def test_forward_pass(self):
+        self._spawn_workers(_test_forward_pass_distributed)
+
+    def test_gradient_sharding(self):
+        self._spawn_workers(_test_sharded_gradient)
@@ -0,0 +1,101 @@
+import pytest
+import torch
+
+from clt.config import CLTConfig
+from clt.models.clt import CrossLayerTranscoder
+
+
+def get_available_devices():
+    """Return ["cpu"], followed by "cuda" and then "mps" when torch reports each as available."""
+    # NOTE(review): tests/conftest.py in this change defines the same helper; consider keeping one copy.
+    has_cuda = torch.cuda.is_available()
+    has_mps = torch.backends.mps.is_available()
+    return ["cpu", *(["cuda"] if has_cuda else []), *(["mps"] if has_mps else [])]
+
+
+# Evaluated once when this module is imported; used as the `params` list of the `device` fixture.
+DEVICES = get_available_devices()
+
+
+@pytest.fixture(params=DEVICES)
+def device(request):
+    """Parametrized fixture: one torch.device per entry in DEVICES."""
+    # NOTE(review): tests/conftest.py in this change defines a fixture with the same name;
+    # presumably this module-level one takes precedence here. Confirm whether both are needed.
+    device_name = request.param
+    return torch.device(device_name)
+
+
+@pytest.fixture
+def clt_config():
+    """Small CLTConfig (2 layers, d_model 8, 16 features, ReLU) for the end-to-end tests."""
+    settings = {
+        "num_layers": 2,
+        "d_model": 8,
+        "num_features": 16,
+        "activation_fn": "relu",  # Plain ReLU keeps gradient checking simple
+    }
+    return CLTConfig(**settings)
+
+
+@pytest.fixture
+def clt_model(clt_config, device):
+    """Build a CrossLayerTranscoder with no process group on the parametrized device."""
+    transcoder = CrossLayerTranscoder(config=clt_config, process_group=None, device=device)
+    # The backward-pass test needs every parameter to track gradients
+    for parameter in transcoder.parameters():
+        parameter.requires_grad_(True)
+    return transcoder.to(device)
+
+
+@pytest.fixture
+def sample_inputs(clt_config, device):
+    """Random inputs keyed by layer (0 and 1), each of shape (20, d_model) on ``device``."""
+    token_count = 20
+    feature_dim = clt_config.d_model
+    # Layer 0 is drawn before layer 1, matching dict insertion order
+    return {layer: torch.randn(token_count, feature_dim, device=device) for layer in (0, 1)}
+
+
+class TestCLTEndToEnd:
+    """End-to-end checks on a CrossLayerTranscoder built without a process group."""
+
+    def test_forward_backward_pass(self, clt_model, sample_inputs):
+        """
+        Tests a full forward and backward pass to ensure gradients are computed.
+        """
+        # --- Forward Pass ---
+        reconstructions = clt_model.forward(sample_inputs)
+
+        # --- Loss Calculation ---
+        # Sum over layers of the MSE between each reconstruction and its original input
+        loss = torch.tensor(0.0, device=clt_model.device, dtype=torch.float32)
+        for layer_idx, recon_tensor in reconstructions.items():
+            original_tensor = sample_inputs[layer_idx]
+            loss += torch.mean((recon_tensor - original_tensor) ** 2)
+
+        # --- Backward Pass ---
+        # Deliberately not wrapped in try/except: any exception already fails the
+        # test, and letting it propagate keeps the original traceback in the report.
+        loss.backward()
+
+        # --- Gradient Check ---
+        # Check that some gradients have been computed. We check a few key parameters.
+        # Encoder weights for layer 0
+        encoder_grad = clt_model.encoder_module.encoders[0].weight.grad
+        assert encoder_grad is not None
+        assert torch.all(torch.isfinite(encoder_grad))
+        assert not torch.all(encoder_grad == 0)
+
+        # Decoder weights for 0->1
+        decoder_grad = clt_model.decoder_module.decoders["0->1"].weight.grad
+        assert decoder_grad is not None
+        assert torch.all(torch.isfinite(decoder_grad))
+        assert not torch.all(decoder_grad == 0)
+
+        # Decoder bias for 1->1 (only checked when the decoder has a bias parameter)
+        bias_param = clt_model.decoder_module.decoders["1->1"].bias_param
+        if bias_param is not None:
+            assert bias_param.grad is not None
+            assert torch.all(torch.isfinite(bias_param.grad))
+            # Note: Bias gradients can sometimes be zero in simple cases, so we don't assert non-zero