7 changes: 7 additions & 0 deletions dataset/holon_tags.csv
@@ -0,0 +1,7 @@
holon_id,title,description,tags
NAL-001,Next AI Labs,"Next AI Labs is a pioneering center dedicated to developing sentient AI devoted to human flourishing. Focused on research and development across industries via cutting‑edge AI innovations. Suggested next steps include research publications, partnerships, and collaborative projects.",AI Alignment; Human Flourishing; Research Lab
NAL-002,Public Facing Interfaces,Manifesto and other public-facing materials for Next AI Labs.,Public Interfaces; User Vision
NAL-003,Funding for Social Impact Non Profits,Paths to fund an aligned AI lab for human flourishing.,Funding Strategy; Social Impact; Philanthropy
NAL-004,Advisors,Advisory relationships for the lab.,Advisory Network; Partnerships
NAL-005,Relationships,Key collaborators and strategic relationships.,Relationship Building; Partnerships
NAL-006,Personal / Well Being,Founder personal capacity and wellbeing guardrails.,Founder Wellbeing
27 changes: 27 additions & 0 deletions dataset/tags_master.csv
@@ -0,0 +1,27 @@
tag,description
Leverage Hunting,Identify outsized positive-impact changes and compounding loops.
Churn Reduction,Reduce cancellations and early churn.
Go-To-Market,"Positioning, channels, and activation motion."
User Vision,Narrative and promise communicated to users.
Product-Market Fit,Evidence and work toward strong problem–solution fit.
User Retention,Keep existing users active and engaged.
Automated Emails,"Lifecycle, re‑engagement, and triggered emails."
Memory Injection,Persisting and recalling high‑value user memories in AI flows.
Privacy Promise,Comms and guarantees about data privacy.
Major Email Announcement,Big broadcast email moments / launches.
AI Alignment,Safety/alignment research and practices.
Human Flourishing,Explicit aim to benefit human wellbeing.
Research Lab,Institutional R&D context.
Social Impact,Nonprofit/impact orientation.
Funding Strategy,How to finance the org/initiative.
Philanthropy,Foundation-based grants and gifts.
Government Grants,NSF/DARPA/UKRI/ERC and similar funding.
Corporate Partnerships,Partnerships with tech companies and foundations.
Compute Grants,Credits/GPUs/compute access programs.
Venture Capital,"VC sources, terms, and strategy."
Impact Investing,Investment with explicit social outcomes.
Partnerships,Collaboration and ecosystem relationships.
Public Interfaces,"Manifesto, website, and other public-facing touchpoints."
Advisory Network,"Advisors, mentors, and expert board."
Relationship Building,"Allies, collaborators, and stakeholder ties."
Founder Wellbeing,"Personal capacity, health, and sustainability."
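
A quick consistency check one might run over these two files (a sketch; the column names come from the CSV headers above and the dataset/ paths from this diff):

    import pandas as pd

    holons = pd.read_csv("dataset/holon_tags.csv")
    master = set(pd.read_csv("dataset/tags_master.csv")["tag"])

    for _, row in holons.iterrows():
        # Tags are "; "-separated in holon_tags.csv.
        tags = [t.strip() for t in str(row["tags"]).split(";")]
        unknown = [t for t in tags if t not in master]
        if unknown:
            print(f"{row['holon_id']}: unknown tags {unknown}")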
20 changes: 17 additions & 3 deletions evaluate.py
@@ -5,6 +5,19 @@
import torch
import torch.distributed as dist


def get_device():
import torch
if torch.backends.mps.is_available():
return torch.device("mps")
elif torch.cuda.is_available():
return torch.device("cuda")
else:
return torch.device("cpu")

device = get_device()
print(f"Using device: {device}")

import pydantic
from omegaconf import OmegaConf
from pretrain import PretrainConfig, init_train_state, evaluate, create_dataloader
@@ -29,7 +42,8 @@ def launch():
RANK = dist.get_rank()
WORLD_SIZE = dist.get_world_size()

-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+    if device.type == "cuda":
+        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

with open(os.path.join(os.path.dirname(eval_cfg.checkpoint), "all_config.yaml"), "r") as f:
config = PretrainConfig(**yaml.safe_load(f))
@@ -45,9 +59,9 @@ def launch():
train_state = init_train_state(config, train_metadata, world_size=WORLD_SIZE)
# Try unwrap torch.compile
try:
-        train_state.model.load_state_dict(torch.load(eval_cfg.checkpoint, map_location="cuda"), assign=True)
+        train_state.model.load_state_dict(torch.load(eval_cfg.checkpoint, map_location=device), assign=True)
    except:
-        train_state.model.load_state_dict({k.removeprefix("_orig_mod."): v for k, v in torch.load(eval_cfg.checkpoint, map_location="cuda").items()}, assign=True)
+        train_state.model.load_state_dict({k.removeprefix("_orig_mod."): v for k, v in torch.load(eval_cfg.checkpoint, map_location=device).items()}, assign=True)

train_state.step = 0
ckpt_filename = os.path.basename(eval_cfg.checkpoint)
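One portability note on the evaluate.py changes: NCCL is CUDA-only, so a fully device-agnostic launch would presumably also need to pick the process-group backend to match the detected device. A sketch under that assumption (not part of this diff):

    import torch.distributed as dist

    # gloo runs on CPU and Apple Silicon; nccl requires CUDA.
    backend = "nccl" if device.type == "cuda" else "gloo"
    dist.init_process_group(backend=backend)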
24 changes: 18 additions & 6 deletions models/layers.py
@@ -7,8 +7,12 @@
try:
from flash_attn_interface import flash_attn_func # type: ignore[import]
except ImportError:
-    # Fallback to FlashAttention 2
-    from flash_attn import flash_attn_func  # type: ignore[import]
+    try:
+        # Fallback to FlashAttention 2
+        from flash_attn import flash_attn_func  # type: ignore[import]
+    except ImportError:
+        # Conditional fallback for systems without flash_attn (e.g., MPS)
+        flash_attn_func = None

from models.common import trunc_normal_init_

@@ -126,10 +130,18 @@ def forward(self, cos_sin: CosSin, hidden_states: torch.Tensor) -> torch.Tensor:
cos, sin = cos_sin
query, key = apply_rotary_pos_emb(query, key, cos, sin)

-        # flash attn
-        attn_output = flash_attn_func(q=query, k=key, v=value, causal=self.causal)
-        if isinstance(attn_output, tuple):  # fa2 and fa3 compatibility
-            attn_output = attn_output[0]
+        # flash attn with conditional fallback
+        if flash_attn_func is not None:
+            attn_output = flash_attn_func(q=query, k=key, v=value, causal=self.causal)
+            if isinstance(attn_output, tuple):  # fa2 and fa3 compatibility
+                attn_output = attn_output[0]
+        else:
+            # Conditional fallback to PyTorch attention for systems without flash_attn
+            query = query.transpose(1, 2)  # [batch_size, num_heads, seq_len, head_dim]
+            key = key.transpose(1, 2)
+            value = value.transpose(1, 2)
+            attn_output = F.scaled_dot_product_attention(query, key, value, is_causal=self.causal)
+            attn_output = attn_output.transpose(1, 2)  # back to [batch_size, seq_len, num_heads, head_dim]

        # attn_output: [batch_size, seq_len, num_heads, head_dim]
attn_output = attn_output.view(batch_size, seq_len, self.output_size) # type: ignore
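
For reference, a minimal CPU-runnable sketch of the fallback path's shape handling (dimensions here are illustrative, not taken from the repo's configs):

    import torch
    import torch.nn.functional as F

    batch, seq_len, num_heads, head_dim = 2, 16, 4, 8
    q = torch.randn(batch, seq_len, num_heads, head_dim)
    k = torch.randn(batch, seq_len, num_heads, head_dim)
    v = torch.randn(batch, seq_len, num_heads, head_dim)

    # SDPA expects [batch, num_heads, seq_len, head_dim], hence the transposes.
    out = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), is_causal=True
    ).transpose(1, 2)

    assert out.shape == (batch, seq_len, num_heads, head_dim)
    print(out.reshape(batch, seq_len, num_heads * head_dim).shape)  # flattened like output_size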
34 changes: 26 additions & 8 deletions pretrain.py
@@ -10,13 +10,30 @@
from torch import nn
from torch.utils.data import DataLoader


def get_device():
import torch
if torch.backends.mps.is_available():
return torch.device("mps")
elif torch.cuda.is_available():
return torch.device("cuda")
else:
return torch.device("cpu")

device = get_device()
print(f"Using device: {device}")

import tqdm
import wandb
import coolname
import hydra
import pydantic
from omegaconf import DictConfig
-from adam_atan2 import AdamATan2
+try:
+    from adam_atan2 import AdamATan2
+except ImportError:
+    # Fallback to AdamW when adam_atan2_backend is not available
+    from torch.optim import AdamW as AdamATan2
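+    # Note: AdamW is not numerically identical to AdamATan2 (it keeps the eps
+    # term rather than replacing it with an atan2-style update), so training
+    # results may differ slightly under this fallback.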

from puzzle_dataset import PuzzleDataset, PuzzleDatasetConfig, PuzzleDatasetMetadata
from utils.functions import load_model_class, get_model_source_path
@@ -121,7 +138,7 @@ def create_model(config: PretrainConfig, train_metadata: PuzzleDatasetMetadata,
model_cls = load_model_class(config.arch.name)
loss_head_cls = load_model_class(config.arch.loss.name)

with torch.device("cuda"):
with torch.device(device):
model: nn.Module = model_cls(model_cfg)
model = loss_head_cls(model, **config.arch.loss.__pydantic_extra__) # type: ignore
if "DISABLE_COMPILE" not in os.environ:
@@ -212,11 +229,11 @@ def train_batch(config: PretrainConfig, train_state: TrainState, batch: Any, glo
return

# To device
-    batch = {k: v.cuda() for k, v in batch.items()}
+    batch = {k: v.to(device) for k, v in batch.items()}

# Init carry if it is None
if train_state.carry is None:
with torch.device("cuda"):
with torch.device(device):
train_state.carry = train_state.model.initial_carry(batch) # type: ignore

# Forward
@@ -276,8 +293,8 @@ def evaluate(config: PretrainConfig, train_state: TrainState, eval_loader: torch
carry = None
for set_name, batch, global_batch_size in eval_loader:
# To device
-        batch = {k: v.cuda() for k, v in batch.items()}
-        with torch.device("cuda"):
+        batch = {k: v.to(device) for k, v in batch.items()}
+        with torch.device(device):
carry = train_state.model.initial_carry(batch) # type: ignore

# Forward
@@ -300,7 +317,7 @@ def evaluate(config: PretrainConfig, train_state: TrainState, eval_loader: torch

if metric_values is None:
metric_keys = list(sorted(metrics.keys())) # Sort keys to guarantee all processes use the same order.
-            metric_values = torch.zeros((len(set_ids), len(metrics.values())), dtype=torch.float32, device="cuda")
+            metric_values = torch.zeros((len(set_ids), len(metrics.values())), dtype=torch.float32, device=device)

metric_values[set_id] += torch.stack([metrics[k] for k in metric_keys])
metric_global_batch_size[set_id] += global_batch_size
@@ -390,7 +407,8 @@ def launch(hydra_config: DictConfig):
RANK = dist.get_rank()
WORLD_SIZE = dist.get_world_size()

-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+    if device.type == "cuda":
+        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

# Load sync'ed config
config = load_synced_config(hydra_config, rank=RANK, world_size=WORLD_SIZE)
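pretrain.py uses torch.device as a context manager; since PyTorch 2.0 this makes the chosen device the default for tensors created inside the block. A minimal check of that behavior (CPU used here so it runs anywhere):

    import torch

    with torch.device("cpu"):
        t = torch.zeros(3)

    assert t.device.type == "cpu"
    print(t.device)  # cpu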
116 changes: 116 additions & 0 deletions tests/system_adaptability/test_classification.py
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
Classification test script for HRM using holon tags data.
Tests the model's ability to classify text using the tag taxonomy.
"""

import pandas as pd
import torch
import os
import sys
from pathlib import Path

def get_device():
"""Universal device detection for HRM testing"""
if torch.backends.mps.is_available():
return torch.device("mps")
elif torch.cuda.is_available():
return torch.device("cuda")
else:
return torch.device("cpu")

def load_classification_data():
"""Load holon tags and tags master for classification testing"""
dataset_path = Path("dataset")

# Load holon tags (the data to classify)
holon_tags = pd.read_csv(dataset_path / "holon_tags.csv")
print(f"Loaded {len(holon_tags)} holon entries")

# Load tags master (the classification taxonomy)
tags_master = pd.read_csv(dataset_path / "tags_master.csv")
print(f"Loaded {len(tags_master)} tag definitions")

return holon_tags, tags_master

def prepare_classification_examples():
"""Prepare text examples for classification testing"""
holon_tags, tags_master = load_classification_data()

examples = []
for _, row in holon_tags.iterrows():
example = {
'id': row['holon_id'],
'title': row['title'],
'description': row['description'],
'true_tags': row['tags'].split('; ') if pd.notna(row['tags']) else [],
'full_text': f"{row['title']}: {row['description']}"
}
examples.append(example)

print(f"Prepared {len(examples)} classification examples")
return examples, tags_master

def test_device_compatibility():
"""Test basic tensor operations on the detected device"""
device = get_device()
print(f"Testing device compatibility: {device}")

try:
# Test tensor creation and operations
x = torch.randn(10, 10).to(device)
y = torch.randn(10, 10).to(device)
z = torch.matmul(x, y)

print(f"✅ Device test passed - tensor operations work on {device}")
return True
except Exception as e:
print(f"❌ Device test failed: {e}")
return False

def run_classification_test():
"""Main classification test runner"""
print("=" * 60)
print("HRM CLASSIFICATION TEST")
print("=" * 60)

# Test device compatibility
if not test_device_compatibility():
return False

# Load and prepare data
try:
examples, tags_master = prepare_classification_examples()

print(f"\\nClassification Test Data Summary:")

Review comment (outdated): Why \\n ? Maybe \n ?

print(f"- Examples to classify: {len(examples)}")
print(f"- Available tags: {len(tags_master)}")
print(f"- Device: {get_device()}")

# Show sample data
print(f"\\nSample classification example:")
sample = examples[0]
print(f"ID: {sample['id']}")
print(f"Title: {sample['title']}")
print(f"Description: {sample['description'][:100]}...")
print(f"True tags: {sample['true_tags']}")

print(f"\\nAvailable tag categories:")
for _, tag in tags_master.head(10).iterrows():
print(f"- {tag['tag']}: {tag['description']}")

print(f"\\n✅ Classification test data prepared successfully!")
print(f"\\n📋 NEXT STEPS:")
print(f"1. Load a pretrained HRM model checkpoint")
print(f"2. Run inference on the prepared examples")
print(f"3. Compare predicted tags vs true tags")

return True

except Exception as e:
print(f"❌ Classification test failed: {e}")
return False

if __name__ == "__main__":
success = run_classification_test()
sys.exit(0 if success else 1)
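
A sketch of how step 3 in the printed next steps might be scored once real inference exists (predict_tags is a hypothetical stand-in, not a function in this repo):

    def score_predictions(examples, predict_tags):
        """Mean set-based precision/recall of predicted tags vs true tags."""
        precisions, recalls = [], []
        for ex in examples:
            predicted = set(predict_tags(ex["full_text"]))
            true = set(ex["true_tags"])
            if predicted:
                precisions.append(len(predicted & true) / len(predicted))
            if true:
                recalls.append(len(predicted & true) / len(true))
        mean = lambda xs: sum(xs) / len(xs) if xs else 0.0
        return mean(precisions), mean(recalls)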
23 changes: 23 additions & 0 deletions tests/system_adaptability/test_device.py
@@ -0,0 +1,23 @@
#!/usr/bin/env python3
"""Simple test script to verify device detection works correctly."""

def get_device():
import torch
if torch.backends.mps.is_available():
return torch.device("mps")
elif torch.cuda.is_available():
return torch.device("cuda")
else:
return torch.device("cpu")

if __name__ == "__main__":
device = get_device()
print(f"Using device: {device}")

# Test tensor creation and basic operations
import torch
x = torch.randn(3, 3).to(device)
y = torch.randn(3, 3).to(device)
z = x + y
print(f"Tensor operation successful on {device}")
print(f"Result shape: {z.shape}")