Merged
Changes from 1 commit
@@ -12,7 +12,7 @@

def eval_peak_memory_usage(model_id: str):
    model = AutoModelForCausalLM.from_pretrained(
-        model_id, device_map="auto", torch_dtype=torch.bfloat16
+        model_id, device_map="auto", dtype=torch.bfloat16
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

20 changes: 10 additions & 10 deletions .github/scripts/torchao_model_releases/quantize_and_upload.py
Contributor comment: Actually, can you revert the changes to this folder (.github/scripts/torchao_model_releases/) for now? We might have use cases that are still using older transformers versions.
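
For readers hitting this compatibility concern, here is a minimal sketch (not part of this PR) of how a script could support both keyword spellings. It assumes the `torch_dtype` -> `dtype` rename landed around transformers 4.56.0; the exact cutoff is an assumption and should be checked against the installed release.

# Hypothetical helper, not from this PR: pick the dtype keyword that the
# installed transformers release accepts (the 4.56.0 cutoff is an assumption).
import torch
import transformers
from packaging import version
from transformers import AutoModelForCausalLM

def load_bf16(model_id: str):
    # Newer releases accept `dtype`; older ones only accept `torch_dtype`.
    if version.parse(transformers.__version__) >= version.parse("4.56.0"):
        dtype_kwargs = {"dtype": torch.bfloat16}
    else:
        dtype_kwargs = {"torch_dtype": torch.bfloat16}
    return AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", **dtype_kwargs
    )

# Example usage with a model id that appears elsewhere in this PR:
# model = load_bf16("microsoft/Phi-4-mini-instruct")
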

@@ -36,7 +36,7 @@ def _get_username():

def _untie_weights_and_save_locally(model_id):
    untied_model = AutoModelForCausalLM.from_pretrained(
-        model_id, torch_dtype="auto", device_map="auto"
+        model_id, dtype="auto", device_map="auto"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -209,15 +209,15 @@ def _untie_weights_and_save_locally(model_id):
from torchao.quantization import Int4WeightOnlyConfig
quant_config = Int4WeightOnlyConfig(group_size=128, int4_packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq")
quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
"""

_fp8_quant_code = """
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
"""

@@ -238,7 +238,7 @@ def _untie_weights_and_save_locally(model_id):
)
quant_config = ModuleFqnToConfig({{"_default": linear_config, "model.embed_tokens": embedding_config}})
quantization_config = TorchAoConfig(quant_type=quant_config, include_input_output_embeddings=True, modules_to_not_convert=[])
-quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
"""

@@ -251,7 +251,7 @@ def _untie_weights_and_save_locally(model_id):
model = AutoModelForCausalLM.from_pretrained(
    model_to_quantize,
    device_map="auto",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

@@ -332,7 +332,7 @@ def _untie_weights_and_save_locally(model_id):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
-    torch_dtype="auto",
+    dtype="auto",
    device_map="auto"
)

@@ -394,7 +394,7 @@ def _untie_weights_and_save_locally(model_id):

# use "{base_model}" or "{quantized_model}"
model_id = "{quantized_model}"
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()
@@ -538,7 +538,7 @@ def _untie_weights_and_save_locally(model_id):
import torch

model_id = "{base_model}"
-untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+untied_model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(untied_model)
@@ -668,7 +668,7 @@ def quantize_and_upload(
model = AutoModelForCausalLM.from_pretrained(
    model_to_quantize,
    device_map="auto",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

@@ -708,7 +708,7 @@ def quantize_and_upload(
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_to_quantize,
    device_map="auto",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
2 changes: 1 addition & 1 deletion README.md
@@ -159,7 +159,7 @@ quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=1
# Load and automatically quantize
quantized_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-4-mini-instruct",
-    torch_dtype="auto",
+    dtype="auto",
    device_map="auto",
    quantization_config=quantization_config
)
2 changes: 1 addition & 1 deletion benchmarks/_models/eval_hf_models.py
@@ -25,7 +25,7 @@ def quantize_model_and_save(model_id, quant_config, output_dir="results"):
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
10 changes: 5 additions & 5 deletions docs/source/serving.rst
@@ -35,7 +35,7 @@ For this example, we'll use ``Float8DynamicActivationFloat8WeightConfig`` on the

quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Push the model to hub
@@ -116,7 +116,7 @@ Install the required packages:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -165,7 +165,7 @@ Optionally, we can quantize the embedding and lm_head differently, since those l
from transformers.modeling_utils import find_tied_parameters

model_id = "microsoft/Phi-4-mini-instruct"
-untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+untied_model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(untied_model)
@@ -233,7 +233,7 @@ Quantizing the model for mobile deployment using TorchAO's ``Int8DynamicActivati
quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True, untie_embedding_weights=True, modules_to_not_convert=[])

# either use `untied_model_id` or `untied_model_local_path`
-quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Push to hub
@@ -316,7 +316,7 @@ For Phi-4-mini-instruct, when quantized with float8 dynamic quant, we can reduce

# use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-float8dq"
model_id = "pytorch/Phi-4-mini-instruct-float8dq"
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()
2 changes: 1 addition & 1 deletion docs/source/torchao_vllm_integration.md
@@ -88,7 +88,7 @@ quantization_config = TorchAoConfig(
# Load and automatically quantize the model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
-    torch_dtype="auto",
+    dtype="auto",
    device_map="auto",
    quantization_config=quantization_config
)
4 changes: 2 additions & 2 deletions test/integration/test_load_and_run_checkpoint.py
@@ -193,7 +193,7 @@ def test_deprecated_hf_models(self, model_info):
with warnings.catch_warnings(record=True) as caught_warnings:
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_name,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
        device_map="cuda:0",
    )
    # version mismatch check in config.py
@@ -250,7 +250,7 @@ def test_deprecated_hf_models(self, model_info):
with warnings.catch_warnings(record=True) as caught_warnings:
    _ = AutoModelForCausalLM.from_pretrained(
        _HIGH_PRECISION_MODEL,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
        device_map="cuda:0",
        quantization_config=quantized_model.config.quantization_config,
    )
2 changes: 1 addition & 1 deletion test/integration/test_vllm.py
@@ -153,7 +153,7 @@ def quantize_and_save_model(
# Load and quantize model
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
-    torch_dtype="bfloat16",
+    dtype="bfloat16",
    device_map="cuda",
    quantization_config=quantization_config,
)
2 changes: 1 addition & 1 deletion torchao/prototype/autoround/autoround_llm.py
@@ -88,7 +88,7 @@ def main(args):
# Get the model, tokenizer, and decoder_cls
model_name_or_path = args.model_name_or_path
model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
-    model_name_or_path, torch_dtype=torch.bfloat16
+    model_name_or_path, dtype=torch.bfloat16
)
# Disable the `use_cache` for calibration stage.
model.config.use_cache = False
2 changes: 1 addition & 1 deletion torchao/prototype/autoround/eval_autoround.py
@@ -86,7 +86,7 @@ def main(args):
with torch.no_grad():
    model_name_or_path = args.model_name_or_path
    model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
-        model_name_or_path, torch_dtype=torch.bfloat16
+        model_name_or_path, dtype=torch.bfloat16
    )
    model.eval()
    model_device = args.model_device
4 changes: 2 additions & 2 deletions torchao/prototype/autoround/utils.py
@@ -140,11 +140,11 @@ def _auto_detect_decoder_cls(model):
    return type(first_module)


-def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
+def get_float_model_info(model_name_or_path, dtype=torch.float32):
    import transformers

    model = transformers.AutoModelForCausalLM.from_pretrained(
-        model_name_or_path, torch_dtype=torch_dtype
+        model_name_or_path, dtype=dtype
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
    decoder_cls = _auto_detect_decoder_cls(model)
2 changes: 1 addition & 1 deletion torchao/prototype/awq/example.py
@@ -215,7 +215,7 @@ def quantize_and_eval(
# load any model with torch.nn.linear layers
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = (
-    AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=precision)
+    AutoModelForCausalLM.from_pretrained(repo_id, dtype=precision)
    .eval()
    .to(device)
)
2 changes: 1 addition & 1 deletion torchao/prototype/moe_quant/llama4_quant.py
@@ -58,7 +58,7 @@ def convert_fn(module):


model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-model = Llama4ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+model = Llama4ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

_replace_with_custom_fn_if_matches_filter(
@@ -84,7 +84,7 @@ def main(max_seqlen, checkpoint, nsamples, max_iter, num_layers):

# have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
model = transformers.AutoModelForCausalLM.from_pretrained(
-    checkpoint, torch_dtype=torch.bfloat16
+    checkpoint, dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = model.to(device)
@@ -130,7 +130,7 @@ def main(layer_id, checkpoint, max_seqlen, max_iter, nsamples):
with sdpa_kernel(SDPBackend.MATH):
    # have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
    model = transformers.AutoModelForCausalLM.from_pretrained(
-        checkpoint, torch_dtype=torch.bfloat16
+        checkpoint, dtype=torch.bfloat16
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    model = model.cuda()
@@ -100,7 +100,7 @@ def f(*new_params):
with sdpa_kernel(SDPBackend.MATH):
    # have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
    model = transformers.AutoModelForCausalLM.from_pretrained(
-        checkpoint, torch_dtype=torch.bfloat16
+        checkpoint, dtype=torch.bfloat16
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    model = model.to(device)
@@ -106,7 +106,7 @@ def cal_model_size(model, fqn_to_config):
def load_model(repo_id, device):
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(
-        repo_id, torch_dtype=torch.bfloat16
+        repo_id, dtype=torch.bfloat16
    ).to(device=device)
    return model, tokenizer

6 changes: 3 additions & 3 deletions torchao/prototype/smoothquant/example.py
@@ -88,7 +88,7 @@ def quantize_and_eval(
t0 = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = (
-    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+    AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
    .eval()
    .to(device)
)
@@ -155,7 +155,7 @@ def compare_models(
torch.manual_seed(34)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = (
-    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+    AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
    .eval()
    .to(device)
)
@@ -167,7 +167,7 @@ def compare_models(
print("Benchmarking W8A8-dynamic without SmoothQuant...")
torch.manual_seed(34)
w8a8_model = (
-    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+    AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
    .eval()
    .to(device)
)