Skip to content

Commit 1c701d7

Browse files
author
Avinash
committed
Fix formatting issues for DataStates-LLM
Signed-off-by: Avinash Maurya <[email protected]>
1 parent 1acf0e3 commit 1c701d7

File tree

8 files changed

+34
-19
lines changed

8 files changed

+34
-19
lines changed

deepspeed/datastates/__init__.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
1-
# Copyright by DataStates Team (Argonne National Laboratory): https://github.com/DataStates/
2-
# Maintained by DataStates Team (Argonne National Laboratory): https://github.com/DataStates/
1+
# Copyright (c) Microsoft Corporation.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# MIT License Copyright (c) UChicago Argonne LLC, operator of Argonne National Laboratory.
5+
6+
# DeepSpeed Team

deepspeed/datastates/config.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
1-
# Copyright DataStates Team (Argonne National Laboratory): https://github.com/DataStates/
2-
# Maintained by DataStates Team (Argonne National Laboratory): https://github.com/DataStates/
1+
# Copyright (c) Microsoft Corporation.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# MIT License Copyright (c) UChicago Argonne LLC, operator of Argonne National Laboratory.
5+
6+
# DeepSpeed Team
37

48
from deepspeed.runtime.config_utils import DeepSpeedConfigObject
9+
10+
511
class DeepSpeedDataStatesConfig(DeepSpeedConfigObject):
12+
613
def __init__(self, param_dict):
714
super(DeepSpeedDataStatesConfig, self).__init__()
815

@@ -11,4 +18,4 @@ def __init__(self, param_dict):
1118

1219
if "datastates_ckpt" in param_dict.keys():
1320
self.enabled = True
14-
self.config = param_dict["datastates_ckpt"]
21+
self.config = param_dict["datastates_ckpt"]

deepspeed/runtime/checkpoint_engine/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class CheckpointEngine(object):
4040

4141
### Asynchronous Lazy Checkpointing using DataStates-LLM
4242

43-
DataStates-LLM is an asynchrnous checkpointing approach optimized for LLM pre-training and can be obtained at https://github.com/DataStates/datastates-llm. To enable datastates-llm checkpointing, specify the `host_cache_size` (in gigabytes) which reserves pinned host memory for asynchronous checkpoint flushing, and `parser_threads` to parse multiple checkpoint file requests in parallel using the following lines in config.json supplied during the launch:
43+
DataStates-LLM is an asynchronous checkpointing approach optimized for LLM pre-training and can be obtained at https://github.com/DataStates/datastates-llm. To enable datastates-llm checkpointing, specify the `host_cache_size` (in gigabytes) which reserves pinned host memory for asynchronous checkpoint flushing, and `parser_threads` to parse multiple checkpoint file requests in parallel using the following lines in config.json supplied during the launch:
4444
```
4545
{
4646
... other deepspeed config options,

deepspeed/runtime/checkpoint_engine/checkpoint_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ def commit(self, tag):
3131

3232
def wait(self):
3333
# To wait in asynchronous checkpoint engines (e.g. DataStates-LLM) for the previous snapshot to finish
34-
pass
34+
pass

deepspeed/runtime/checkpoint_engine/datastates_checkpoint_engine.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# SPDX-License-Identifier: Apache-2.0
13

2-
# Copyright by DataStates Team (Argonne National Laboratory): https://github.com/DataStates/
3-
# Maintained by DataStates Team (Argonne National Laboratory): https://github.com/DataStates/
4+
# MIT License Copyright (c) UChicago Argonne LLC, operator of Argonne National Laboratory.
45

5-
from deepspeed.utils import logger, log_dist
6+
# DeepSpeed Team
7+
8+
from deepspeed.utils import log_dist
69
from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \
710
CheckpointEngine
811
from datastates.llm import Checkpointing
@@ -13,21 +16,19 @@ class DataStatesCheckpointEngine(CheckpointEngine):
1316
def __init__(self, deepspeed_config, rank):
1417
super().__init__(deepspeed_config)
1518
self.ckpt_engine = Checkpointing(deepspeed_config, rank)
16-
19+
1720
def create(self, tag):
1821
log_dist(f"[DataStates] Checkpoint {tag} is about to be saved!", ranks=[0])
1922
return None
2023

2124
def save(self, state_dict, path: str):
2225
return self.ckpt_engine.save(state_dict, path)
23-
26+
2427
def load(self, path: str, map_location=None):
2528
return self.ckpt_engine.load(path, map_location)
26-
29+
2730
def commit(self, tag):
2831
return self.ckpt_engine.commit(tag)
2932

3033
def wait(self):
3134
return self.ckpt_engine.wait()
32-
33-

deepspeed/runtime/engine.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,9 +1064,12 @@ def _configure_checkpointing(self, dist_init_required):
10641064
if self._config is not None and self._config.datastates_config.enabled:
10651065
try:
10661066
from deepspeed.runtime.checkpoint_engine.datastates_checkpoint_engine import DataStatesCheckpointEngine
1067-
self.checkpoint_engine = DataStatesCheckpointEngine(deepspeed_config=self._config, rank=dist.get_rank())
1067+
self.checkpoint_engine = DataStatesCheckpointEngine(deepspeed_config=self._config,
1068+
rank=dist.get_rank())
10681069
except ImportError as err:
1069-
raise Exception(f"The datastates-llm checkpoint engine was not found! Will fall back to torch.save. Details: {err}")
1070+
raise Exception(
1071+
f"The datastates-llm checkpoint engine was not found! Will fall back to torch.save. Details: {err}"
1072+
)
10701073

10711074
dp_rank = groups._get_sequence_data_parallel_rank()
10721075

deepspeed/runtime/pipe/module.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from deepspeed.accelerator import get_accelerator
2323
from deepspeed.checkpoint.utils import clone_tensors_for_torch_save
2424

25+
2526
class PipelineError(Exception):
2627
"""Errors related to the use of deepspeed.PipelineModule """
2728

@@ -617,7 +618,7 @@ def save_state_dict(self, save_dir, checkpoint_engine, exclude_frozen_params=Fal
617618
if exclude_frozen_params:
618619
for n in self._get_frozen_parameter_names(layer):
619620
del orig_state_dict[n]
620-
621+
621622
if debloat_memory:
622623
final_state_dict = clone_tensors_for_torch_save(orig_state_dict)
623624
else:

deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import torch
99
from deepspeed.ops.op_builder import AsyncIOBuilder
1010
from deepspeed import comm as dist
11-
import torch
1211

1312
from deepspeed.runtime.swap_tensor.constants import *
1413
from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object

0 commit comments

Comments (0)