curt-tigges
diff --git a/‎.github/workflows/python-tests.yml‎
Lines changed: 45 additions & 0 deletions b/‎.github/workflows/python-tests.yml‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎clt/models/activations.py‎
Lines changed: 14 additions & 2 deletions b/‎clt/models/activations.py‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎clt/training/trainer.py‎
Lines changed: 0 additions & 2 deletions b/‎clt/training/trainer.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎tests/integration/test_activation_store.py‎
Lines changed: 0 additions & 200 deletions b/‎tests/integration/test_activation_store.py‎
Lines changed: 0 additions & 200 deletions
@@ -0,0 +1,45 @@
+name: Python Tests
+
+on:
+  push:
+    branches:
+      - main
+      - develop  # or whichever branch is your primary development branch
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would parse as 3.1
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python3 -
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+        # Alternatively, if not using Poetry and the repo has a requirements.txt:
+        # run: pip install -r requirements.txt
+
+      - name: Install dependencies
+        run: poetry install --no-interaction --no-root
+        # Non-optional groups are installed by default; if pytest lives in an OPTIONAL group (e.g. dev):
+        # run: poetry install --no-interaction --no-root --with dev
+        # Or, if using pip with a separate dev requirements file:
+        # run: pip install -r requirements-dev.txt
+        # run: pip install pytest  # unless pytest is already in your main requirements
+
+      - name: Run tests with pytest
+        run: poetry run pytest tests/
+        # Or, if not using Poetry:
+        # run: pytest tests/
@@ -197,12 +197,24 @@ def backward(ctx, *grad_outputs: torch.Tensor) -> Tuple[Optional[torch.Tensor],
             grad_threshold_per_element = grad_output * local_grad_theta
 
             if grad_threshold_per_element.dim() > threshold.dim():
+                # Handles cases like input (B,F), threshold (F) or input (F), threshold (scalar)
                 dims_to_sum = tuple(range(grad_threshold_per_element.dim() - threshold.dim()))
                 grad_threshold = grad_threshold_per_element.sum(dim=dims_to_sum)
-                if threshold.shape != torch.Size([]):
+                # Ensure final shape matches threshold, especially if sum squeezed dimensions
+                if grad_threshold.shape != threshold.shape:
                     grad_threshold = grad_threshold.reshape(threshold.shape)
-            else:
+            elif grad_threshold_per_element.dim() == threshold.dim():
+                # Handles cases like input (F), threshold (F), or input [1], threshold [1]
+                grad_threshold = grad_threshold_per_element
+                # Defensive reshape, though shapes should ideally match here.
+                if grad_threshold.shape != threshold.shape:
+                    grad_threshold = grad_threshold.reshape(threshold.shape)
+            else:  # grad_threshold_per_element.dim() < threshold.dim()
+                # This case is less common (e.g. input scalar, threshold vector - not typical for this op).
+                # Defaulting to sum and reshape, primarily for scalar threshold case.
                 grad_threshold = grad_threshold_per_element.sum()
+                if grad_threshold.shape != threshold.shape:
+                    grad_threshold = grad_threshold.reshape(threshold.shape)
         return grad_input, grad_threshold, None
 
 
 
@@ -580,8 +580,6 @@ def train(self, eval_every: int = 1000) -> CrossLayerTranscoder:
                         tok_cnt_t = torch.tensor([tok_cnt], device=self.device)
                         gathered = [torch.zeros_like(tok_cnt_t) for _ in range(self.world_size)]
                         dist.all_gather(gathered, tok_cnt_t)
-                        if self.rank == 0:
-                            print("Batch token-count per rank:", [int(x.item()) for x in gathered])
 
                 except StopIteration:
                     # Rank 0 prints message