diff --git a/.github/scripts/torchao_model_releases/quantize_and_upload.py b/.github/scripts/torchao_model_releases/quantize_and_upload.py
index 22ce6ee6df..1edda90ef6 100644
--- a/.github/scripts/torchao_model_releases/quantize_and_upload.py
+++ b/.github/scripts/torchao_model_releases/quantize_and_upload.py
@@ -592,7 +592,7 @@ def _untie_weights_and_save_locally(model_id):
 python -m executorch.examples.models.qwen3.convert_weights $(hf download {quantized_model}) pytorch_model_converted.bin
 ```
 
-Once we have the checkpoint, we export it to ExecuTorch with a max_seq_length/max_context_length of 1024 to the XNNPACK backend as follows.
+Once we have the checkpoint, we export it to ExecuTorch with a max_seq_length/max_context_length of 1024 to the XNNPACK backend as follows. (Note: ExecuTorch LLM export script requires config.json have certain key names. The correct config to use for the LLM export script is located at examples/models/qwen3/config/4b_config.json within the ExecuTorch repo.)
 
diff --git a/README.md b/README.md
index cd46a3953b..9330900300 100644
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=1
 # Load and automatically quantize
 quantized_model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Phi-4-mini-instruct",
-    torch_dtype="auto",
+    dtype="auto",
     device_map="auto",
     quantization_config=quantization_config
 )
diff --git a/benchmarks/_models/eval_hf_models.py b/benchmarks/_models/eval_hf_models.py
index b0e635c3f0..3cd6887ab6 100644
--- a/benchmarks/_models/eval_hf_models.py
+++ b/benchmarks/_models/eval_hf_models.py
@@ -25,7 +25,7 @@ def quantize_model_and_save(model_id, quant_config, output_dir="results"):
     quantized_model = AutoModelForCausalLM.from_pretrained(
         model_id,
         device_map="auto",
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         quantization_config=quantization_config,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id)
diff --git a/docs/source/serving.rst b/docs/source/serving.rst
index d639a78093..d95132ded7 100644
--- a/docs/source/serving.rst
+++ b/docs/source/serving.rst
@@ -85,7 +85,7 @@ Install the required packages:
     model = AutoModelForCausalLM.from_pretrained(
         model_path,
         device_map="auto",
-        torch_dtype="auto",
+        dtype="auto",
         trust_remote_code=True,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -134,7 +134,7 @@ Optionally, we can quantize the embedding and lm_head differently, since those l
    from transformers.modeling_utils import find_tied_parameters

    model_id = "microsoft/Phi-4-mini-instruct"
-   untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+   untied_model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    print(untied_model)
@@ -202,7 +202,7 @@ Quantizing the model for mobile deployment using TorchAO's ``Int8DynamicActivati
    quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True, untie_embedding_weights=True, modules_to_not_convert=[])

    # either use `untied_model_id` or `untied_model_local_path`
-   quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
+   quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Push to hub
@@ -285,7 +285,7 @@ For Phi-4-mini-instruct, when quantized with float8 dynamic quant, we can reduce

    # use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-float8dq"
    model_id = "pytorch/Phi-4-mini-instruct-float8dq"
-   quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+   quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    torch.cuda.reset_peak_memory_stats()
diff --git a/docs/source/torchao_vllm_integration.md b/docs/source/torchao_vllm_integration.md
index 870a6c2958..be3a8e4f51 100644
--- a/docs/source/torchao_vllm_integration.md
+++ b/docs/source/torchao_vllm_integration.md
@@ -88,7 +88,7 @@ quantization_config = TorchAoConfig(
 # Load and automatically quantize the model
 model = AutoModelForCausalLM.from_pretrained(
     "meta-llama/Llama-3.2-1B",
-    torch_dtype="auto",
+    dtype="auto",
     device_map="auto",
     quantization_config=quantization_config
 )
diff --git a/test/integration/test_load_and_run_checkpoint.py b/test/integration/test_load_and_run_checkpoint.py
index 806565011e..6bdee4a1b8 100644
--- a/test/integration/test_load_and_run_checkpoint.py
+++ b/test/integration/test_load_and_run_checkpoint.py
@@ -193,7 +193,7 @@ def test_deprecated_hf_models(self, model_info):
         with warnings.catch_warnings(record=True) as caught_warnings:
             quantized_model = AutoModelForCausalLM.from_pretrained(
                 model_name,
-                torch_dtype="bfloat16",
+                dtype="bfloat16",
                 device_map="cuda:0",
             )
             # version mismatch check in config.py
@@ -250,7 +250,7 @@ def test_deprecated_hf_models(self, model_info):
         with warnings.catch_warnings(record=True) as caught_warnings:
             _ = AutoModelForCausalLM.from_pretrained(
                 _HIGH_PRECISION_MODEL,
-                torch_dtype="bfloat16",
+                dtype="bfloat16",
                 device_map="cuda:0",
                 quantization_config=quantized_model.config.quantization_config,
             )
diff --git a/test/integration/test_vllm.py b/test/integration/test_vllm.py
index f798a9cd6a..32a7a8b405 100644
--- a/test/integration/test_vllm.py
+++ b/test/integration/test_vllm.py
@@ -153,7 +153,7 @@ def quantize_and_save_model(
     # Load and quantize model
     quantized_model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
         device_map="cuda",
         quantization_config=quantization_config,
     )
diff --git a/torchao/prototype/autoround/autoround_llm.py b/torchao/prototype/autoround/autoround_llm.py
index 822ee6554b..8d29fe3388 100644
--- a/torchao/prototype/autoround/autoround_llm.py
+++ b/torchao/prototype/autoround/autoround_llm.py
@@ -88,7 +88,7 @@ def main(args):
     # Get the model, tokenizer, and decoder_cls
     model_name_or_path = args.model_name_or_path
     model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
-        model_name_or_path, torch_dtype=torch.bfloat16
+        model_name_or_path, dtype=torch.bfloat16
     )
     # Disable the `use_cache` for calibration stage.
     model.config.use_cache = False
diff --git a/torchao/prototype/autoround/eval_autoround.py b/torchao/prototype/autoround/eval_autoround.py
index caebf85a2f..c0c35ea667 100644
--- a/torchao/prototype/autoround/eval_autoround.py
+++ b/torchao/prototype/autoround/eval_autoround.py
@@ -86,7 +86,7 @@ def main(args):
     with torch.no_grad():
         model_name_or_path = args.model_name_or_path
         model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
-            model_name_or_path, torch_dtype=torch.bfloat16
+            model_name_or_path, dtype=torch.bfloat16
         )
         model.eval()
         model_device = args.model_device
diff --git a/torchao/prototype/autoround/utils.py b/torchao/prototype/autoround/utils.py
index 0ca0d83fd3..bac1c494ed 100644
--- a/torchao/prototype/autoround/utils.py
+++ b/torchao/prototype/autoround/utils.py
@@ -140,11 +140,11 @@ def _auto_detect_decoder_cls(model):
     return type(first_module)
 
 
-def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
+def get_float_model_info(model_name_or_path, dtype=torch.float32):
     import transformers
 
     model = transformers.AutoModelForCausalLM.from_pretrained(
-        model_name_or_path, torch_dtype=torch_dtype
+        model_name_or_path, dtype=dtype
     )
     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
     decoder_cls = _auto_detect_decoder_cls(model)
diff --git a/torchao/prototype/awq/example.py b/torchao/prototype/awq/example.py
index cc7f530b6f..2099c55633 100644
--- a/torchao/prototype/awq/example.py
+++ b/torchao/prototype/awq/example.py
@@ -215,9 +215,7 @@ def quantize_and_eval(
     # load any model with torch.nn.linear layers
     tokenizer = AutoTokenizer.from_pretrained(repo_id)
     model = (
-        AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=precision)
-        .eval()
-        .to(device)
+        AutoModelForCausalLM.from_pretrained(repo_id, dtype=precision).eval().to(device)
     )
     print(f"Time to load model: {time.time() - t0:.02f} seconds")
     if quant.startswith("awq-int4wo"):
diff --git a/torchao/prototype/moe_quant/llama4_quant.py b/torchao/prototype/moe_quant/llama4_quant.py
index e38f0a9ca3..ae6abccea5 100644
--- a/torchao/prototype/moe_quant/llama4_quant.py
+++ b/torchao/prototype/moe_quant/llama4_quant.py
@@ -58,7 +58,7 @@ def convert_fn(module):
 
 model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 
-model = Llama4ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+model = Llama4ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 _replace_with_custom_fn_if_matches_filter(
diff --git a/torchao/prototype/quantization/mixed_precision/scripts/fit.py b/torchao/prototype/quantization/mixed_precision/scripts/fit.py
index d8e6be4550..bf663cb1c4 100644
--- a/torchao/prototype/quantization/mixed_precision/scripts/fit.py
+++ b/torchao/prototype/quantization/mixed_precision/scripts/fit.py
@@ -84,7 +84,7 @@ def main(max_seqlen, checkpoint, nsamples, max_iter, num_layers):
 
     # have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
     model = transformers.AutoModelForCausalLM.from_pretrained(
-        checkpoint, torch_dtype=torch.bfloat16
+        checkpoint, dtype=torch.bfloat16
     )
     tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
     model = model.to(device)
diff --git a/torchao/prototype/quantization/mixed_precision/scripts/hessian_grad.py b/torchao/prototype/quantization/mixed_precision/scripts/hessian_grad.py
index 1e7b403e3d..df811829a3 100644
--- a/torchao/prototype/quantization/mixed_precision/scripts/hessian_grad.py
+++ b/torchao/prototype/quantization/mixed_precision/scripts/hessian_grad.py
@@ -130,7 +130,7 @@ def main(layer_id, checkpoint, max_seqlen, max_iter, nsamples):
     with sdpa_kernel(SDPBackend.MATH):
         # have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
         model = transformers.AutoModelForCausalLM.from_pretrained(
-            checkpoint, torch_dtype=torch.bfloat16
+            checkpoint, dtype=torch.bfloat16
         )
         tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
         model = model.cuda()
diff --git a/torchao/prototype/quantization/mixed_precision/scripts/hessian_vhp.py b/torchao/prototype/quantization/mixed_precision/scripts/hessian_vhp.py
index faf46b01eb..2d0a2fb735 100644
--- a/torchao/prototype/quantization/mixed_precision/scripts/hessian_vhp.py
+++ b/torchao/prototype/quantization/mixed_precision/scripts/hessian_vhp.py
@@ -100,7 +100,7 @@ def f(*new_params):
     with sdpa_kernel(SDPBackend.MATH):
         # have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
         model = transformers.AutoModelForCausalLM.from_pretrained(
-            checkpoint, torch_dtype=torch.bfloat16
+            checkpoint, dtype=torch.bfloat16
         )
         tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
         model = model.to(device)
diff --git a/torchao/prototype/quantization/mixed_precision/scripts/utils.py b/torchao/prototype/quantization/mixed_precision/scripts/utils.py
index 5a47664200..b1e0cbca8f 100644
--- a/torchao/prototype/quantization/mixed_precision/scripts/utils.py
+++ b/torchao/prototype/quantization/mixed_precision/scripts/utils.py
@@ -105,9 +105,9 @@ def cal_model_size(model, fqn_to_config):
 
 def load_model(repo_id, device):
     tokenizer = AutoTokenizer.from_pretrained(repo_id)
-    model = AutoModelForCausalLM.from_pretrained(
-        repo_id, torch_dtype=torch.bfloat16
-    ).to(device=device)
+    model = AutoModelForCausalLM.from_pretrained(repo_id, dtype=torch.bfloat16).to(
+        device=device
+    )
     return model, tokenizer
 
 
diff --git a/torchao/prototype/smoothquant/example.py b/torchao/prototype/smoothquant/example.py
index dbf764e526..8602b57e20 100644
--- a/torchao/prototype/smoothquant/example.py
+++ b/torchao/prototype/smoothquant/example.py
@@ -88,7 +88,7 @@ def quantize_and_eval(
     t0 = time.time()
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     model = (
-        AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
         .eval()
         .to(device)
     )
@@ -155,7 +155,7 @@ def compare_models(
     torch.manual_seed(34)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     model = (
-        AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
         .eval()
         .to(device)
     )
@@ -167,7 +167,7 @@
     print("Benchmarking W8A8-dynamic without SmoothQuant...")
     torch.manual_seed(34)
     w8a8_model = (
-        AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
         .eval()
         .to(device)
     )
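
Reviewer note (illustrative addition, not part of the patch): every hunk above makes the same mechanical change, renaming the `torch_dtype` keyword of `from_pretrained` to `dtype`. Below is a minimal sketch of the new-style call; it assumes a transformers release that accepts the `dtype` keyword, and the model id and dtype choice are only examples taken from the docs touched above.

```python
# Sketch of the post-PR calling convention (assumes a transformers version
# whose from_pretrained accepts `dtype`; model id is an example from the docs above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/Phi-4-mini-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,  # previously spelled torch_dtype=torch.bfloat16
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```

Older transformers releases that only recognize `torch_dtype` may still need the previous spelling, so it is worth checking that the pinned transformers version is recent enough for this rename.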