huggingface · Nech-C · Jan 2, 2025 · Jan 2, 2025 · Aug 9, 2025 · Aug 22, 2025
diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
@@ -1295,6 +1295,7 @@ def infer_auto_device_map(
     model: nn.Module,
     max_memory: Optional[dict[Union[int, str], Union[int, str]]] = None,
     no_split_module_classes: Optional[list[str]] = None,
+    reserve_max_layer: bool = False,
     dtype: Optional[Union[str, torch.dtype]] = None,
     special_dtypes: Optional[dict[str, Union[str, torch.dtype]]] = None,
     verbose: bool = False,
@@ -1328,6 +1329,9 @@ def infer_auto_device_map(
         no_split_module_classes (`List[str]`, *optional*):
             A list of layer class names that should never be split across device (for instance any layer that has a
             residual connection).
+        reserve_max_layer (`bool`, *optional*, defaults to `False`):
+            Whether to reserve room for the largest layer on the main devices. Leaving it `False` packs devices more
+            tightly when no offloading is needed; if the resulting map offloads, it is recomputed with `True`.
         dtype (`str` or `torch.dtype`, *optional*):
             If provided, the weights will be converted to that type when loaded.
         special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*):
@@ -1363,21 +1367,24 @@ def infer_auto_device_map(
     device_minimum_assignment_memory = {}
 
     # Initialize maximum largest layer, to know which space to keep in memory
-    max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)
-
+    if reserve_max_layer:
+        max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)
+    else:
+        max_layer_size, max_layer_names = 0, []
     # Ready ? This is going to be a bit messy.
     while len(modules_to_treat) > 0:
         name, module = modules_to_treat.pop(0)
         if verbose:
             print(f"\nTreating module {name}.")
         # Max size in the remaining layers may have changed since we took one, so we maybe update it.
-        max_layer_names = [n for n in max_layer_names if n != name and not n.startswith(name + ".")]
-        if len(max_layer_names) == 0:
-            max_layer_size, max_layer_names = get_max_layer_size(
-                [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
-                module_sizes,
-                no_split_module_classes,
-            )
+        if reserve_max_layer:
+            max_layer_names = [n for n in max_layer_names if n != name and not n.startswith(name + ".")]
+            if len(max_layer_names) == 0:
+                max_layer_size, max_layer_names = get_max_layer_size(
+                    [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
+                    module_sizes,
+                    no_split_module_classes,
+                )
         # Assess size needed
         module_size = module_sizes[name]
 
@@ -1406,8 +1413,8 @@ def infer_auto_device_map(
         device = devices[current_device]
         current_max_size = max_memory[device] if device != "disk" else None
         current_memory_reserved = 0
-        # Reduce max size available by the largest layer.
-        if devices[current_device] in main_devices:
+
+        if devices[current_device] in main_devices and reserve_max_layer:
             current_max_size = current_max_size - max_layer_size
             current_memory_reserved = max_layer_size
 
@@ -1486,11 +1493,12 @@ def infer_auto_device_map(
                     + modules_to_treat[tied_module_index + 1 :]
                 )
                 # Update the max layer size.
-                max_layer_size, max_layer_names = get_max_layer_size(
-                    [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
-                    module_sizes,
-                    no_split_module_classes,
-                )
+                if reserve_max_layer:
+                    max_layer_size, max_layer_names = get_max_layer_size(
+                        [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
+                        module_sizes,
+                        no_split_module_classes,
+                    )
                 split_happened = True
                 break
 
@@ -1526,11 +1534,12 @@ def infer_auto_device_map(
                 modules_children = list(module.named_parameters(recurse=False)) + modules_children
                 modules_to_treat = [(f"{name}.{n}", v) for n, v in modules_children] + modules_to_treat
                 # Update the max layer size.
-                max_layer_size, max_layer_names = get_max_layer_size(
-                    [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
-                    module_sizes,
-                    no_split_module_classes,
-                )
+                if reserve_max_layer:
+                    max_layer_size, max_layer_names = get_max_layer_size(
+                        [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
+                        module_sizes,
+                        no_split_module_classes,
+                    )
                 continue
 
         # If no module is assigned to the current device, we attempt to allocate a fallback module
@@ -1562,6 +1571,33 @@ def infer_auto_device_map(
 
     device_memory_used = {device: mem for device, mem in device_memory_used.items() if mem > 0}
 
+    # Before returning, check whether the device map offloads any module. Offloading needs room for the largest
+    # layer on the main devices, which was not reserved above, so redo the inference with reserve_max_layer=True.
+    if not reserve_max_layer and device_map:
+
+        if set(device_map.values()) == {"cpu"} or set(device_map.values()) == {"cpu", "disk"}:
+            main_device = "cpu"
+        else:
+            main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]]
+            if not main_device:
+                return device_map
+            main_device = main_device[0]
+
+        offloaded_devices = ["disk"] if main_device == "cpu" or main_device == "mps" else ["cpu", "disk"]
+
+        if any(device in offloaded_devices for device in device_map.values()):
+            return infer_auto_device_map(
+                model,
+                max_memory,
+                no_split_module_classes,
+                reserve_max_layer=True,
+                dtype=dtype,
+                special_dtypes=special_dtypes,
+                verbose=verbose,
+                clean_result=clean_result,
+                offload_buffers=offload_buffers,
+                fallback_allocation=fallback_allocation,
+            )
     if clean_result:
         device_map = clean_device_map(device_map)
 

diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
@@ -60,9 +60,9 @@
 class ModelForTest(nn.Module):
     def __init__(self):
         super().__init__()
-        self.linear1 = nn.Linear(3, 4)
-        self.batchnorm = nn.BatchNorm1d(4)
-        self.linear2 = nn.Linear(4, 5)
+        self.linear1 = nn.Linear(3, 4)  # (12 + 4) * 4 = 64
+        self.batchnorm = nn.BatchNorm1d(4)  # (4 * 4) * 4 + 8 = 72
+        self.linear2 = nn.Linear(4, 5)  # (20 + 5) * 4 = 100
 
     def forward(self, x):
         return self.linear2(self.batchnorm(self.linear1(x)))
@@ -555,7 +555,7 @@ def test_infer_auto_device_map(self):
         # model has size 236: linear1 64, batchnorm 72, linear2 100
         try:
             with self.assertLogs() as cm:
-                device_map = infer_auto_device_map(model, max_memory={0: 200, 1: 200})
+                device_map = infer_auto_device_map(model, max_memory={0: 200, 1: 200}, reserve_max_layer=True)
                 self.assertFalse(any("insufficient memory" in out for out in cm.output))
         except AssertionError:
             # No logs exist; test passes implicitly
@@ -564,7 +564,19 @@ def test_infer_auto_device_map(self):
         # only linear1 fits on device 0 as we keep memory available for the maximum layer in case of offload
         assert device_map == {"linear1": 0, "batchnorm": 1, "linear2": 1}
 
-        device_map = infer_auto_device_map(model, max_memory={0: 200, 1: 172, 2: 200})
+        # test with reserve_max_layer=False (default)
+        try:
+            with self.assertLogs() as cm:
+                device_map = infer_auto_device_map(model, max_memory={0: 200, 1: 200})
+                self.assertFalse(any("insufficient memory" in out for out in cm.output))
+        except AssertionError:
+            # No logs exist; test passes implicitly
+            pass
+
+        # since there are no offloaded modules, we can allocate linear1 and batchnorm to device 0
+        assert device_map == {"linear1": 0, "batchnorm": 0, "linear2": 1}
+
+        device_map = infer_auto_device_map(model, max_memory={0: 200, 1: 172, 2: 200}, reserve_max_layer=True)
         # On device 1, we don't care about keeping size available for the max layer, so even if there is just the
         # size available for batchnorm + linear2, they fit here.
         assert device_map == {"linear1": 0, "batchnorm": 1, "linear2": 1}
@@ -576,28 +588,72 @@ def test_infer_auto_device_map(self):
 
         # When splitting a bigger model, the split is done at the layer level
         model = nn.Sequential(ModelForTest(), ModelForTest(), ModelForTest())
-        device_map = infer_auto_device_map(model, max_memory={0: 500, 1: 500})
+        device_map = infer_auto_device_map(model, max_memory={0: 500, 1: 500}, reserve_max_layer=True)
         assert device_map == {"0": 0, "1.linear1": 0, "1.batchnorm": 0, "1.linear2": 1, "2": 1}
 
+        # Splitting is done when not reserving max layer
+        device_map = infer_auto_device_map(model, max_memory={0: 536, 1: 500})
+        assert device_map == {"0": 0, "1": 0, "2.linear1": 0, "2.batchnorm": 1, "2.linear2": 1}
+
         # With no_split_module_classes, it's done at that module level
         model = nn.Sequential(ModelForTest(), ModelForTest(), ModelForTest())
         device_map = infer_auto_device_map(
-            model, max_memory={0: 500, 1: 500}, no_split_module_classes=["ModelForTest"]
+            model,
+            max_memory={0: 500, 1: 500},
+            no_split_module_classes=["ModelForTest"],
+            reserve_max_layer=True
         )
         assert device_map == {"0": 0, "1": 1, "2": 1}
 
+        # Make sure no splitting happens when reserve_max_layer=False
+        model = nn.Sequential(ModelForTest(), ModelForTest(), ModelForTest())
+        device_map = infer_auto_device_map(
+            model,
+            max_memory={0: 500, 1: 500},
+            no_split_module_classes=["ModelForTest"]
+        )
+        assert device_map == {"0": 0, "1": 0, "2": 1}
+
+        model = nn.Sequential(nn.Linear(10, 5), nn.Linear(5, 5), nn.Linear(5, 15))
+        gpu_0_mem = 145 * 4
+        gpu_1_mem = 100 * 4
+        cpu_mem = 700 * 4
+
+        # Setting reserve_max_layer to False prevents unnecessary offloading
+        device_map = infer_auto_device_map(model, max_memory={0: gpu_0_mem, 1: gpu_1_mem, 'cpu': cpu_mem}, reserve_max_layer=True)
+        assert device_map == {'0': 0, '1': 1, '2': 'cpu'}
+
+        device_map = infer_auto_device_map(model, max_memory={0: gpu_0_mem, 1: gpu_1_mem, 'cpu': cpu_mem}, reserve_max_layer=False)
+        assert device_map == {'0': 0, '1': 0, '2': 1}
+
+        # When offloading is necessary, both produce the same device_map with offloading
+        model = nn.Sequential(nn.Linear(10, 5), nn.Linear(5, 5), nn.Linear(5, 15), nn.Linear(5, 15))
+
+        expected_device_map = {'0': 0, '1': 1, '2': 'cpu', '3': 'cpu'}
+        device_map = infer_auto_device_map(model, max_memory={0: gpu_0_mem, 1: gpu_1_mem, 'cpu': cpu_mem}, reserve_max_layer=True)
+        assert device_map == expected_device_map
+
+        device_map = infer_auto_device_map(model, max_memory={0: gpu_0_mem, 1: gpu_1_mem, 'cpu': cpu_mem}, reserve_max_layer=False)
+        assert device_map == expected_device_map
+
     def test_infer_auto_device_map_with_tied_weights(self):
         model = nn.Sequential(
             OrderedDict([("layer1", ModelForTest()), ("layer2", ModelForTest()), ("layer3", ModelForTest())])
         )
+        # With reserve_max_layer=True
         model.layer3.linear2.weight = model.layer1.linear2.weight
-        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
+        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500}, reserve_max_layer=True)
         expected = {"layer1": 0, "layer3.linear2": 0, "layer2": 1, "layer3.linear1": 1, "layer3.batchnorm": 1}
         assert device_map == expected
 
+        # With reserve_max_layer=False
+        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
+        expected = {"layer1": 0, "layer3": 0, "layer2": 1}
+        assert device_map == expected
+
         # With three weights tied together
         model.layer2.linear2.weight = model.layer1.linear2.weight
-        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
+        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500}, reserve_max_layer=True)
         expected = {
             "layer1": 0,
             "layer2.linear2": 0,
@@ -609,9 +665,22 @@ def test_infer_auto_device_map_with_tied_weights(self):
         }
         assert device_map == expected
 
+        # With three weights tied together and reserve_max_layer=False
+        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
+        expected = {
+            "layer1": 0,
+            "layer2.linear2": 0,
+            "layer3.linear2": 0,
+            "layer2.linear1": 0,
+            "layer2.batchnorm": 1,
+            "layer3.linear1": 1,
+            "layer3.batchnorm": 1,
+        }
+        assert device_map == expected
+
         # With two groups of weights tied together
         model.layer2.linear1.weight = model.layer1.linear1.weight
-        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
+        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500}, reserve_max_layer=True)
         expected = {
             "layer1": 0,
             "layer2.linear1": 0,
@@ -623,6 +692,18 @@ def test_infer_auto_device_map_with_tied_weights(self):
         }
         assert device_map == expected
 
+        # With two groups of weights tied together and reserve_max_layer=False
+        model.layer2.linear1.weight = model.layer1.linear1.weight
+        device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 500})
+        expected = {
+            "layer1": 0,
+            "layer2": 0,
+            "layer3.linear2": 0,
+            "layer3.linear1": 1,
+            "layer3.batchnorm": 1,
+        }
+        assert device_map == expected
+
         # With weights ties in the same module
         model = nn.Sequential(
             OrderedDict(
@@ -636,10 +717,14 @@ def test_infer_auto_device_map_with_tied_weights(self):
         )
         model.linear3.weight = model.linear1.weight
         model.linear3.bias = model.linear1.bias
-        device_map = infer_auto_device_map(model, max_memory={0: 250, 1: 400})
+        device_map = infer_auto_device_map(model, max_memory={0: 250, 1: 400}, reserve_max_layer=True)
         expected = {"linear1": 0, "linear2": 1, "linear3": 0, "linear4": 1}
         assert device_map == expected
 
+        device_map = infer_auto_device_map(model, max_memory={0: 250, 1: 400})
+        expected = {"linear1": 0, "linear2": 0, "linear3": 0, "linear4": 1}
+        assert device_map == expected
+
         # With tied weights sharing a same prefix name (`compute.weight` vs `compute.weight_submodule.parameter`)
         class SubModule(torch.nn.Module):
             def __init__(self, ref_to_parameter):
@@ -732,10 +817,18 @@ def test_infer_auto_device_map_with_buffer_check_and_multi_devices(self):
         # Should NOT print a warning in such case
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")
-            device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 400, "cpu": "1GB"})
+            device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 400, "cpu": "1GB"}, reserve_max_layer=True)
         assert len(w) == 0
         assert device_map == {"linear1": 0, "batchnorm": 1, "linear2": "cpu", "linear3": "cpu"}
 
+        # With reserve_max_layer=False, linear1 and batchnorm will fit on device 0, and linear2 and linear3 will fit on device 1.
+        # Should NOT print a warning in such case
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            device_map = infer_auto_device_map(model, max_memory={0: 400, 1: 400})
+        assert len(w) == 0
+        assert device_map == {"linear1": 0, "batchnorm": 0, "linear2": 1, "linear3": 1}
+
         # Now we have two devices, but neither the first nor the second device can hold all remaining buffers
         # Should print a warning as intended in such case
         with self.assertWarns(Warning):
@@ -875,6 +968,43 @@ def test_infer_auto_device_map_with_fallback_allocation_and_buffers(self):
         expected_device_map = {"batchnorm": 0, "linear1": "disk", "linear2": "disk"}
         assert device_map == expected_device_map
 
+    def test_infer_auto_device_map_reserve_max_layer(self):
+        model = ModelForTest()
+        # When there is only one execution device,
+        # having reserve_max_layer=False doesn't change the device map
+        expected = {"linear1": 0, "batchnorm": "cpu", "linear2": "cpu"}
+        max_mem = {0: 184, "cpu": 400}
+        assert infer_auto_device_map(model, max_memory=max_mem, reserve_max_layer=False) == expected
+
+        assert infer_auto_device_map(model, max_memory=max_mem, reserve_max_layer=True) == expected
+
+        # When there are multiple execution devices and reserving for the max
+        # layer is unnecessary, setting reserve_max_layer=False makes efficient allocation
+        max_mem = {0: 184, 1: 100, "cpu": 400}
+        assert infer_auto_device_map(model, max_memory=max_mem, reserve_max_layer=False) == {"linear1": 0, "batchnorm": 0, "linear2": 1}
+
+        # Less efficient allocation due to unnecessary reservation for the same max_memory config
+        assert infer_auto_device_map(model, max_memory=max_mem, reserve_max_layer=True) == {"linear1": 0, "batchnorm": 1, "linear2": "cpu"}
+
+        # When there are multiple execution devices with offloaded modules,
+        # having reserve_max_layer=False has no effect
+        model = nn.Sequential(
+            OrderedDict(
+                [
+                    ("linear1", nn.Linear(10, 10)),  # 440 each
+                    ("linear2", nn.Linear(10, 10)),
+                    ("linear3", nn.Linear(10, 10)),
+                    ("linear4", nn.Linear(10, 10)),
+                    ("linear5", nn.Linear(10, 10))
+                ]
+            )
+        )
+
+        expected = {"linear1": 0, "linear2": 1, "linear3": 1, "linear4": "cpu", "linear5": "cpu"}
+        max_mem = {0: 880, 1: 880, "cpu": 1600}
+        assert infer_auto_device_map(model, max_memory=max_mem, reserve_max_layer=False) == expected
+        assert infer_auto_device_map(model, max_memory=max_mem, reserve_max_layer=True) == expected
+
     @require_non_cpu
     def test_get_balanced_memory(self):
         model = ModelForTest()