from typing import Any, Optional
from composer.loggers import MLFlowLogger
+ import mlflow
import ray
import torch
import torch.distributed as dist
from composer import Trainer
from composer.core import get_precision_context
from composer.optim import DecoupledAdamW
- from composer.utils import dist as composer_dist
+ from composer.utils import create_symlink_file, dist as composer_dist, get_file
from llmfoundry.data import build_dataloader
from omegaconf import OmegaConf as om
from transformers import AutoTokenizer

from compose_rl.controllers import BaseDistributedGPUActor, SPMDActorGroup
from compose_rl.controllers.buffer import Buffer
from compose_rl.algorithms.online.callback_utils import preprocess_batches
+ from databricks.sdk import WorkspaceClient

+ MLFLOW_RUN_NAME = os.environ['COMPOSER_RUN_NAME']  # SHOULD BE SET BY MCLI
+ MLFLOW_EXPERIMENT_NAME = f'/Users/{WorkspaceClient().current_user.me().user_name}/test_single_controller'
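+ # The experiment is placed under the current Databricks user's workspace folder,
+ # resolved at import time via the databricks-sdk WorkspaceClient.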

@contextmanager
def time_it(name: str):
@@ -129,7 +133,7 @@ def build_train_config(self, config: Any):
    'global_train_batch_size': self.device_train_batch_size * self.world_size,
    'device_train_batch_size': self.device_train_batch_size,
    'device_train_microbatch_size': self.device_train_batch_size,
-     'save_folder': self.config.save_folder,
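+     # Assumption: Composer expands the {mlflow_experiment_id} and {mlflow_run_id}
+     # placeholders at runtime, so checkpoints land under this MLflow run's artifact root.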
+     'save_folder': os.path.join('dbfs:/databricks/mlflow-tracking/{mlflow_experiment_id}/{mlflow_run_id}', self.config.save_folder),
    'log_config': self.config.log_config,
    'max_seq_len': self.max_seq_len,
    'python_log_level': self.config.python_log_level,
@@ -220,7 +224,7 @@ def build_ppo_trainer(self):
    loggers=[mlflow_logger],
    device_train_microbatch_size=self.config.device_train_microbatch_size,
    load_path=self.ref_path,
-     save_folder=self.config.save_folder,
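+     # Same placeholder-expanded MLflow artifact path as in build_train_config above.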
+     save_folder=os.path.join('dbfs:/databricks/mlflow-tracking/{mlflow_experiment_id}/{mlflow_run_id}', self.config.save_folder),
    save_interval='1iter',
    autoresume=self.config.autoresume,
)
@@ -338,6 +342,8 @@ def train_1_iter(self):
async def run(self, num_iterations: int, experience_buffer: 'ExperienceBuffer', parameter_buffer: 'ParameterBuffer', inference_server: 'InferenceServer', lock: asyncio.Lock, rollout_semaphore: asyncio.Semaphore, eval_semaphore: asyncio.Semaphore):
    # The overall design: each subcontroller exposes an async `run` function that owns the
    # async primitives, while the rest of the logic stays synchronous and is bridged into
    # the async world with asyncio.to_thread.
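    # Illustrative sketch of the pattern (names are placeholders, not the exact call sites):
    #     result = await asyncio.to_thread(some_sync_method)
    #     async with lock:
    #         experience_buffer.put(result)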
+
+     # TODO: Load the experience buffer from checkpoints; this will make checkpointing work for async.
    for _ in range(num_iterations):
        # Simple example of adding elements to the experience buffer
        # Populate the train actor group with the rollouts and then train
@@ -465,10 +471,12 @@ def __init__(
    self.iter_num = 0

    # Load the latest checkpoint
-     self.latest_checkpoint = os.path.join(self.save_folder, 'latest.symlink')

-     if config.autoresume and os.path.exists(self.latest_checkpoint):
+     self.latest_checkpoint = os.path.join(self.save_folder, 'latest_rollout_agent.symlink')  # TODO: This might need to use the updated path
+
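+     # Existence is checked in the MLflow artifact store (see _artifact_exists below),
+     # not on the local filesystem.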
+     if config.autoresume and _artifact_exists(self.latest_checkpoint):
        print('Autoresuming from checkpoint for RolloutAgent.')
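+         # Materialize the checkpoint symlink locally (fetching it if remote) before unpickling it below.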
+         get_file(self.latest_checkpoint, self.latest_checkpoint, overwrite=True)
        with open(self.latest_checkpoint, 'rb') as f:
            checkpoint = pickle.load(f)
        self.iter_num = checkpoint['iter_num']
@@ -529,9 +537,13 @@ def get_next_iter_rollouts(self):
            'streaming_dataloader': streaming_dataloader_state_dict,
        }, f)

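+     # Upload this iteration's checkpoint into the run's artifact store under save_folder_iter.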
+     mlflow.log_artifact(checkpoint_path, save_folder_iter, run_id=_get_mlflow_run_id())
+
    if os.path.exists(self.latest_checkpoint):
        os.remove(self.latest_checkpoint)
-     os.symlink(checkpoint_path, self.latest_checkpoint)
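+     # Unlike os.symlink, composer.utils.create_symlink_file writes a small text file
+     # containing the target path, which can be uploaded to an object store and later
+     # resolved by get_file.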
+     create_symlink_file(checkpoint_path, self.latest_checkpoint)
+
+     mlflow.log_artifact(self.latest_checkpoint, self.config.save_folder, run_id=_get_mlflow_run_id())
    return iter_data

async def run(self, num_iterations: int, experience_buffer: 'ExperienceBuffer', lock: asyncio.Lock, rollout_semaphore: asyncio.Semaphore):
@@ -728,6 +740,80 @@ async def train_async(self, max_duration: int | str):
    await asyncio.gather(train_task, rollout_task, eval_task)
    self.train_actor.collective_methods.close_trainer()

+ def _get_mlflow_run_id() -> Optional[str]:
+     return os.environ.get('MLFLOW_RUN_ID', None)
+
+ def _setup_mlflow(config: Any):
+     print('setting up mlflow')
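+     # A short-lived gloo process group is used only so all ranks can agree on a
+     # single MLflow run id; it is torn down at the end of this function.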
+     dist.init_process_group(backend='gloo')
+     # Create a new MLFlow run to be used for the entire run
+     mlflow.set_tracking_uri('databricks')
+
+     # Get (or create) the MLflow experiment
+     experiment = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
+     if experiment is None:
+         experiment_id = mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
+     else:
+         experiment_id = experiment.experiment_id
+     mlflow.set_experiment(experiment_id=experiment_id)
+
+     run_id = None
+     if composer_dist.get_global_rank() == 0:
+         # Find a preexisting run if it exists
+         existing_runs = mlflow.search_runs(
+             experiment_ids=[experiment_id],
+             filter_string=f'tags.run_name = "{MLFLOW_RUN_NAME}"',
+             output_format='list',
+         ) if config.autoresume else []
+         if len(existing_runs) > 0:
+             run_id = existing_runs[0].info.run_id
+             print(f'Resuming mlflow run with run id: {run_id}')
+         else:
+             run_id = mlflow.start_run(run_name=MLFLOW_RUN_NAME).info.run_id
+             print(f'Creating new mlflow run with run id: {run_id}')
+         broadcast_list = [run_id]
+
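+     # Rank 0 supplies the run id; every rank participates in the broadcast and reads it back.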
+     composer_dist.broadcast_object_list(broadcast_list, src=0)
+
+     # Set all the right environment variables
+     run_id = broadcast_list[0]
+     assert run_id is not None and experiment_id is not None, 'Run ID and experiment ID must be set'
+     os.environ['MLFLOW_RUN_ID'] = run_id
+     os.environ['MLFLOW_EXPERIMENT_ID'] = experiment_id
+     os.environ['MLFLOW_TRACKING_URI'] = 'databricks'
+
+     dist.destroy_process_group()
+
+
+ def _artifact_exists(artifact_path: str) -> bool:
+     """Return True if artifact_path exists (file or directory) for the run."""
+     client = mlflow.MlflowClient()
+     artifact_path = artifact_path.strip('/')
+
+     run_id = _get_mlflow_run_id()
+     assert run_id is not None, 'Run ID must be set'
+
+     # Walk down the path parts level-by-level
+     parent = ''
+     if artifact_path:
+         parts = artifact_path.split('/')
+         for i, part in enumerate(parts):
+             entries = {os.path.basename(fi.path): fi for fi in client.list_artifacts(run_id, parent)}
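+             # MlflowClient.list_artifacts is non-recursive, so each iteration inspects
+             # exactly one level of the artifact tree.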
+             if part not in entries:
+                 return False
+             fi = entries[part]
+             is_last = (i == len(parts) - 1)
+             if not is_last and not fi.is_dir:
+                 # Trying to descend into a file
+                 return False
+             parent = fi.path  # descend
+
+     # If we got here, the path exists (root or found item).
+     return True
+

def _run_single_controller_ppo(
    config: Any,
@@ -744,6 +830,8 @@ def _run_single_controller_ppo(
    # Disable Ray's setting of CUDA_VISIBLE_DEVICES; we will set it manually
    os.environ['RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES'] = '1'

+     _setup_mlflow(config)
+
    with start_ray_server() as _address:
        # only rank 0 is the master controller
        if dist.get_rank() == 0: