Actually, can you revert the changes to this folder (.github/scripts/torchao_model_releases/) for now? We might have use cases that are still using older transformers versions.
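One way to keep those scripts working across both old and new transformers releases is to pick the keyword argument at runtime. Below is a minimal sketch, not taken from this PR; it assumes the rename from `torch_dtype` to `dtype` landed in transformers 4.56.0, which should be verified against the transformers release notes.

```python
# Minimal sketch. Assumption: `dtype` replaced `torch_dtype` in transformers 4.56.0;
# verify the exact cutover version before relying on it.
import torch
import transformers
from packaging import version
from transformers import AutoModelForCausalLM


def load_model(model_id, dtype=torch.bfloat16, **kwargs):
    # Newer transformers accept `dtype`; older releases only understand `torch_dtype`.
    use_new_kwarg = version.parse(transformers.__version__) >= version.parse("4.56.0")
    dtype_kwarg = {"dtype": dtype} if use_new_kwarg else {"torch_dtype": dtype}
    return AutoModelForCausalLM.from_pretrained(model_id, **dtype_kwarg, **kwargs)


model = load_model("microsoft/Phi-4-mini-instruct", device_map="auto")
```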

@@ -592,7 +592,7 @@ def _untie_weights_and_save_locally(model_id):
python -m executorch.examples.models.qwen3.convert_weights $(hf download {quantized_model}) pytorch_model_converted.bin
```

Once we have the checkpoint, we export it to ExecuTorch with a max_seq_length/max_context_length of 1024 to the XNNPACK backend as follows.

[TODO: fix config path in note where necessary]
(Note: ExecuTorch LLM export script requires config.json have certain key names. The correct config to use for the LLM export script is located at examples/models/qwen3/config/4b_config.json within the ExecuTorch repo.)
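For reference, the export step described above might look roughly like the command below. This is a sketch only: the `executorch.examples.models.llama.export_llama` entry point and every flag shown are assumptions that vary between ExecuTorch versions, so check the ExecuTorch LLM export documentation before using it.

```
# Sketch only -- the entry point, flags, and config path are assumptions;
# verify them against the ExecuTorch LLM export docs for your version.
python -m executorch.examples.models.llama.export_llama \
  --checkpoint pytorch_model_converted.bin \
  --params examples/models/qwen3/config/4b_config.json \
  -kv \
  --use_sdpa_with_kv_cache \
  -X \
  -d fp32 \
  --max_seq_length 1024 \
  --max_context_length 1024 \
  --output_name model.pte
```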
2 changes: 1 addition & 1 deletion README.md
@@ -159,7 +159,7 @@ quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=1
# Load and automatically quantize
quantized_model = AutoModelForCausalLM.from_pretrained(
"microsoft/Phi-4-mini-instruct",
torch_dtype="auto",
dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
2 changes: 1 addition & 1 deletion benchmarks/_models/eval_hf_models.py
@@ -25,7 +25,7 @@ def quantize_model_and_save(model_id, quant_config, output_dir="results"):
quantized_model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
- torch_dtype=torch.bfloat16,
+ dtype=torch.bfloat16,
quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
8 changes: 4 additions & 4 deletions docs/source/serving.rst
@@ -85,7 +85,7 @@ Install the required packages:
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map="auto",
torch_dtype="auto",
dtype="auto",
trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -134,7 +134,7 @@ Optionally, we can quantize the embedding and lm_head differently, since those l
from transformers.modeling_utils import find_tied_parameters

model_id = "microsoft/Phi-4-mini-instruct"
- untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+ untied_model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(untied_model)
@@ -202,7 +202,7 @@ Quantizing the model for mobile deployment using TorchAO's ``Int8DynamicActivati
quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True, untie_embedding_weights=True, modules_to_not_convert=[])

# either use `untied_model_id` or `untied_model_local_path`
- quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
+ quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Push to hub
@@ -285,7 +285,7 @@ For Phi-4-mini-instruct, when quantized with float8 dynamic quant, we can reduce

# use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-float8dq"
model_id = "pytorch/Phi-4-mini-instruct-float8dq"
- quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+ quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()
2 changes: 1 addition & 1 deletion docs/source/torchao_vllm_integration.md
@@ -88,7 +88,7 @@ quantization_config = TorchAoConfig(
# Load and automatically quantize the model
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.2-1B",
torch_dtype="auto",
dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
4 changes: 2 additions & 2 deletions test/integration/test_load_and_run_checkpoint.py
@@ -193,7 +193,7 @@ def test_deprecated_hf_models(self, model_info):
with warnings.catch_warnings(record=True) as caught_warnings:
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="bfloat16",
dtype="bfloat16",
device_map="cuda:0",
)
# version mismatch check in config.py
@@ -250,7 +250,7 @@ def test_deprecated_hf_models(self, model_info):
with warnings.catch_warnings(record=True) as caught_warnings:
_ = AutoModelForCausalLM.from_pretrained(
_HIGH_PRECISION_MODEL,
torch_dtype="bfloat16",
dtype="bfloat16",
device_map="cuda:0",
quantization_config=quantized_model.config.quantization_config,
)
2 changes: 1 addition & 1 deletion test/integration/test_vllm.py
@@ -153,7 +153,7 @@ def quantize_and_save_model(
# Load and quantize model
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="bfloat16",
dtype="bfloat16",
device_map="cuda",
quantization_config=quantization_config,
)
2 changes: 1 addition & 1 deletion torchao/prototype/autoround/autoround_llm.py
@@ -88,7 +88,7 @@ def main(args):
# Get the model, tokenizer, and decoder_cls
model_name_or_path = args.model_name_or_path
model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
- model_name_or_path, torch_dtype=torch.bfloat16
+ model_name_or_path, dtype=torch.bfloat16
)
# Disable the `use_cache` for calibration stage.
model.config.use_cache = False
2 changes: 1 addition & 1 deletion torchao/prototype/autoround/eval_autoround.py
@@ -86,7 +86,7 @@ def main(args):
with torch.no_grad():
model_name_or_path = args.model_name_or_path
model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
- model_name_or_path, torch_dtype=torch.bfloat16
+ model_name_or_path, dtype=torch.bfloat16
)
model.eval()
model_device = args.model_device
4 changes: 2 additions & 2 deletions torchao/prototype/autoround/utils.py
@@ -140,11 +140,11 @@ def _auto_detect_decoder_cls(model):
return type(first_module)


- def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
+ def get_float_model_info(model_name_or_path, dtype=torch.float32):
import transformers

model = transformers.AutoModelForCausalLM.from_pretrained(
- model_name_or_path, torch_dtype=torch_dtype
+ model_name_or_path, dtype=dtype
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
decoder_cls = _auto_detect_decoder_cls(model)
4 changes: 1 addition & 3 deletions torchao/prototype/awq/example.py
@@ -215,9 +215,7 @@ def quantize_and_eval(
# load any model with torch.nn.linear layers
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = (
- AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=precision)
- .eval()
- .to(device)
+ AutoModelForCausalLM.from_pretrained(repo_id, dtype=precision).eval().to(device)
)
print(f"Time to load model: {time.time() - t0:.02f} seconds")
if quant.startswith("awq-int4wo"):
2 changes: 1 addition & 1 deletion torchao/prototype/moe_quant/llama4_quant.py
@@ -58,7 +58,7 @@ def convert_fn(module):


model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
- model = Llama4ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ model = Llama4ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

_replace_with_custom_fn_if_matches_filter(
@@ -84,7 +84,7 @@ def main(max_seqlen, checkpoint, nsamples, max_iter, num_layers):

# have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
model = transformers.AutoModelForCausalLM.from_pretrained(
- checkpoint, torch_dtype=torch.bfloat16
+ checkpoint, dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = model.to(device)
@@ -130,7 +130,7 @@ def main(layer_id, checkpoint, max_seqlen, max_iter, nsamples):
with sdpa_kernel(SDPBackend.MATH):
# have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
model = transformers.AutoModelForCausalLM.from_pretrained(
- checkpoint, torch_dtype=torch.bfloat16
+ checkpoint, dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = model.cuda()
@@ -100,7 +100,7 @@ def f(*new_params):
with sdpa_kernel(SDPBackend.MATH):
# have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
model = transformers.AutoModelForCausalLM.from_pretrained(
- checkpoint, torch_dtype=torch.bfloat16
+ checkpoint, dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = model.to(device)
@@ -105,9 +105,9 @@ def cal_model_size(model, fqn_to_config):

def load_model(repo_id, device):
tokenizer = AutoTokenizer.from_pretrained(repo_id)
- model = AutoModelForCausalLM.from_pretrained(
- repo_id, torch_dtype=torch.bfloat16
- ).to(device=device)
+ model = AutoModelForCausalLM.from_pretrained(repo_id, dtype=torch.bfloat16).to(
+ device=device
+ )
return model, tokenizer


6 changes: 3 additions & 3 deletions torchao/prototype/smoothquant/example.py
@@ -88,7 +88,7 @@ def quantize_and_eval(
t0 = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = (
- AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
.eval()
.to(device)
)
@@ -155,7 +155,7 @@ def compare_models(
torch.manual_seed(34)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = (
- AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
.eval()
.to(device)
)
@@ -167,7 +167,7 @@ def compare_models(
print("Benchmarking W8A8-dynamic without SmoothQuant...")
torch.manual_seed(34)
w8a8_model = (
- AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
.eval()
.to(device)
)