from torchao.utils import is_fbcode, is_sm_at_least_89

-_MODEL_NAME_AND_VERSIONS = [
-    ("torchao-testing/opt-125m-float8dq-row-v1-0.13-dev", 1),
+_DEPRECATED_MODEL_INFO = [
+    (
+        "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev",
+        1,
+        "Float8DynamicActivationFloat8WeightConfig",
+    ),
+]
+
+_MODEL_INFO = [
+    ("torchao-testing/opt-125m-FP8-v2-0.13-dev", 2),
+    ("torchao-testing/opt-125m-INT4-preshuffled-v2-0.13-dev", 2),
+    ("torchao-testing/opt-125m-INT4-v2-0.13-dev", 2),
]

    is_fbcode(),
    "Skipping the test in fbcode for now, not sure how to download from transformers",
)
-class TestLoadingDeprecatedCheckpoint(TestCase):
-    @common_utils.parametrize("model_name_and_version", _MODEL_NAME_AND_VERSIONS)
-    def test_load_model_and_run(self, model_name_and_version):
+class TestLoadAndRunCheckpoint(TestCase):
+    @common_utils.parametrize("model_info", _DEPRECATED_MODEL_INFO)
+    def test_load_and_run_deprecated_checkpoints(self, model_info):
        """Test that we print the correct warning message when loading a deprecated checkpoint
        and make sure the deprecated checkpoints can still be loaded
        """
        # Load the pre-quantized model
-        model_name, version = model_name_and_version
+        model_name, version, config_name = model_info
        with warnings.catch_warnings(record=True) as caught_warnings:
            quantized_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype="bfloat16",
-                device_map="cuda",
+                device_map="cuda:0",
            )
        assert any(
            "Stored version is not the same as current default version of the config"
            in str(w.message)
            for w in caught_warnings
        ), "Didn't get expected warning message for version mismatch"

-        # TODO: generalize when we test more checkpoints
        assert any(
-            "Models quantized with version 1 of Float8DynamicActivationFloat8WeightConfig is deprecated"
+            f"Models quantized with version 1 of {config_name} is deprecated"
            in str(w.message)
            for w in caught_warnings
        ), "Didn't get expected warning message for deprecation"
@@ -70,8 +79,35 @@ def test_load_model_and_run(self, model_name_and_version):
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

+    @common_utils.parametrize("model_info", _MODEL_INFO)
+    def test_load_and_run_checkpoints(self, model_info):
+        """Test that checkpoints quantized with the current (non-deprecated) config
+        version can be loaded and run
+        """
+        model_name, version = model_info
+        # Load the pre-quantized model
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype="bfloat16",
+            device_map="cuda:0",
+        )
+        assert isinstance(quantized_model.config.quantization_config, TorchAoConfig)
+        assert quantized_model.config.quantization_config.quant_type.version == version
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        prompt = ("Hello, my name is",)
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+        ).to("cuda")
+        generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
+        # make sure it runs
+        _ = tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+

-common_utils.instantiate_parametrized_tests(TestLoadingDeprecatedCheckpoint)
+common_utils.instantiate_parametrized_tests(TestLoadAndRunCheckpoint)

if __name__ == "__main__":
    run_tests()
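
For context, a minimal sketch (not part of this PR) of how a v2 FP8 test checkpoint like "torchao-testing/opt-125m-FP8-v2-0.13-dev" might be produced via transformers' TorchAoConfig integration; the base model id, output directory, and serialization flag below are illustrative assumptions, not the exact recipe used for these checkpoints:

import torch
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig

# Quantize weights to FP8 (with dynamic FP8 activations) at load time.
quant_config = TorchAoConfig(
    quant_type=Float8DynamicActivationFloat8WeightConfig()
)
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # assumed base checkpoint
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
    quantization_config=quant_config,
)
# torchao tensor subclasses are not safetensors-compatible, so the quantized
# model is saved with safe_serialization=False before uploading to the Hub.
model.save_pretrained("opt-125m-FP8", safe_serialization=False)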