Actually, can you revert the changes to this folder (.github/scripts/torchao_model_releases/) for now? We might have use cases that are still using older transformers versions.
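One way to keep those scripts working across both old and new transformers releases is to pick the keyword argument at runtime. Below is a minimal sketch, not taken from this PR; it assumes the rename from `torch_dtype` to `dtype` landed in transformers 4.56.0, which should be verified against the transformers release notes.

```python
# Minimal sketch. Assumption: `dtype` replaced `torch_dtype` in transformers 4.56.0;
# verify the exact cutover version before relying on it.
import torch
import transformers
from packaging import version
from transformers import AutoModelForCausalLM


def load_model(model_id, dtype=torch.bfloat16, **kwargs):
    # Newer transformers accept `dtype`; older releases only understand `torch_dtype`.
    use_new_kwarg = version.parse(transformers.__version__) >= version.parse("4.56.0")
    dtype_kwarg = {"dtype": dtype} if use_new_kwarg else {"torch_dtype": dtype}
    return AutoModelForCausalLM.from_pretrained(model_id, **dtype_kwarg, **kwargs)


model = load_model("microsoft/Phi-4-mini-instruct", device_map="auto")
```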

@@ -592,7 +592,7 @@ def _untie_weights_and_save_locally(model_id):
python -m executorch.examples.models.qwen3.convert_weights $(hf download {quantized_model}) pytorch_model_converted.bin
```

Once we have the checkpoint, we export it to ExecuTorch with a max_seq_length/max_context_length of 1024 to the XNNPACK backend as follows.

[TODO: fix config path in note where necessary]
(Note: ExecuTorch LLM export script requires config.json have certain key names. The correct config to use for the LLM export script is located at examples/models/qwen3/config/4b_config.json within the ExecuTorch repo.)
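For reference, the export step described above might look roughly like the command below. This is a sketch only: the `executorch.examples.models.llama.export_llama` entry point and every flag shown are assumptions that vary between ExecuTorch versions, so check the ExecuTorch LLM export documentation before using it.

```
# Sketch only -- the entry point, flags, and config path are assumptions;
# verify them against the ExecuTorch LLM export docs for your version.
python -m executorch.examples.models.llama.export_llama \
  --checkpoint pytorch_model_converted.bin \
  --params examples/models/qwen3/config/4b_config.json \
  -kv \
  --use_sdpa_with_kv_cache \
  -X \
  -d fp32 \
  --max_seq_length 1024 \
  --max_context_length 1024 \
  --output_name model.pte
```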
2 changes: 1 addition & 1 deletion README.md
@@ -159,7 +159,7 @@ quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=1
# Load and automatically quantize
quantized_model = AutoModelForCausalLM.from_pretrained(
"microsoft/Phi-4-mini-instruct",
torch_dtype="auto",
dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
2 changes: 1 addition & 1 deletion benchmarks/_models/eval_hf_models.py
@@ -25,7 +25,7 @@ def quantize_model_and_save(model_id, quant_config, output_dir="results"):
quantized_model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
- torch_dtype=torch.bfloat16,
+ dtype=torch.bfloat16,
quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
8 changes: 4 additions & 4 deletions docs/source/serving.rst
@@ -85,7 +85,7 @@ Install the required packages:
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map="auto",
torch_dtype="auto",
dtype="auto",
trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -134,7 +134,7 @@ Optionally, we can quantize the embedding and lm_head differently, since those l
from transformers.modeling_utils import find_tied_parameters

model_id = "microsoft/Phi-4-mini-instruct"
- untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+ untied_model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(untied_model)
@@ -202,7 +202,7 @@ Quantizing the model for mobile deployment using TorchAO's ``Int8DynamicActivati
quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True, untie_embedding_weights=True, modules_to_not_convert=[])

# either use `untied_model_id` or `untied_model_local_path`
- quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
+ quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Push to hub
@@ -285,7 +285,7 @@ For Phi-4-mini-instruct, when quantized with float8 dynamic quant, we can reduce

# use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-float8dq"
model_id = "pytorch/Phi-4-mini-instruct-float8dq"
- quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+ quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()
2 changes: 1 addition & 1 deletion docs/source/torchao_vllm_integration.md
@@ -88,7 +88,7 @@ quantization_config = TorchAoConfig(
# Load and automatically quantize the model
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.2-1B",
torch_dtype="auto",
dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
4 changes: 2 additions & 2 deletions test/integration/test_load_and_run_checkpoint.py
@@ -193,7 +193,7 @@ def test_deprecated_hf_models(self, model_info):
with warnings.catch_warnings(record=True) as caught_warnings:
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="bfloat16",
dtype="bfloat16",
device_map="cuda:0",
)
# version mismatch check in config.py
@@ -250,7 +250,7 @@ def test_deprecated_hf_models(self, model_info):
with warnings.catch_warnings(record=True) as caught_warnings:
_ = AutoModelForCausalLM.from_pretrained(
_HIGH_PRECISION_MODEL,
torch_dtype="bfloat16",
dtype="bfloat16",
device_map="cuda:0",
quantization_config=quantized_model.config.quantization_config,
)
2 changes: 1 addition & 1 deletion test/integration/test_vllm.py
@@ -153,7 +153,7 @@ def quantize_and_save_model(
# Load and quantize model
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="bfloat16",
dtype="bfloat16",
device_map="cuda",
quantization_config=quantization_config,
)
2 changes: 1 addition & 1 deletion torchao/prototype/autoround/autoround_llm.py
@@ -88,7 +88,7 @@ def main(args):
# Get the model, tokenizer, and decoder_cls
model_name_or_path = args.model_name_or_path
model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
- model_name_or_path, torch_dtype=torch.bfloat16
+ model_name_or_path, dtype=torch.bfloat16
)
# Disable the `use_cache` for calibration stage.
model.config.use_cache = False
2 changes: 1 addition & 1 deletion torchao/prototype/autoround/eval_autoround.py
@@ -86,7 +86,7 @@ def main(args):
with torch.no_grad():
model_name_or_path = args.model_name_or_path
model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
- model_name_or_path, torch_dtype=torch.bfloat16
+ model_name_or_path, dtype=torch.bfloat16
)
model.eval()
model_device = args.model_device
4 changes: 2 additions & 2 deletions torchao/prototype/autoround/utils.py
@@ -140,11 +140,11 @@ def _auto_detect_decoder_cls(model):
return type(first_module)


- def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
+ def get_float_model_info(model_name_or_path, dtype=torch.float32):
import transformers

model = transformers.AutoModelForCausalLM.from_pretrained(
- model_name_or_path, torch_dtype=torch_dtype
+ model_name_or_path, dtype=dtype
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
decoder_cls = _auto_detect_decoder_cls(model)
4 changes: 1 addition & 3 deletions torchao/prototype/awq/example.py
@@ -215,9 +215,7 @@ def quantize_and_eval(
# load any model with torch.nn.linear layers
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = (
- AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=precision)
- .eval()
- .to(device)
+ AutoModelForCausalLM.from_pretrained(repo_id, dtype=precision).eval().to(device)
)
print(f"Time to load model: {time.time() - t0:.02f} seconds")
if quant.startswith("awq-int4wo"):
2 changes: 1 addition & 1 deletion torchao/prototype/moe_quant/llama4_quant.py
@@ -58,7 +58,7 @@ def convert_fn(module):


model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
- model = Llama4ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ model = Llama4ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

_replace_with_custom_fn_if_matches_filter(
@@ -84,7 +84,7 @@ def main(max_seqlen, checkpoint, nsamples, max_iter, num_layers):

# have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
model = transformers.AutoModelForCausalLM.from_pretrained(
- checkpoint, torch_dtype=torch.bfloat16
+ checkpoint, dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = model.to(device)
@@ -130,7 +130,7 @@ def main(layer_id, checkpoint, max_seqlen, max_iter, nsamples):
with sdpa_kernel(SDPBackend.MATH):
# have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
model = transformers.AutoModelForCausalLM.from_pretrained(
- checkpoint, torch_dtype=torch.bfloat16
+ checkpoint, dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = model.cuda()
@@ -100,7 +100,7 @@ def f(*new_params):
with sdpa_kernel(SDPBackend.MATH):
# have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
model = transformers.AutoModelForCausalLM.from_pretrained(
- checkpoint, torch_dtype=torch.bfloat16
+ checkpoint, dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = model.to(device)
@@ -105,9 +105,9 @@ def cal_model_size(model, fqn_to_config):

def load_model(repo_id, device):
tokenizer = AutoTokenizer.from_pretrained(repo_id)
- model = AutoModelForCausalLM.from_pretrained(
- repo_id, torch_dtype=torch.bfloat16
- ).to(device=device)
+ model = AutoModelForCausalLM.from_pretrained(repo_id, dtype=torch.bfloat16).to(
+ device=device
+ )
return model, tokenizer


6 changes: 3 additions & 3 deletions torchao/prototype/smoothquant/example.py
@@ -88,7 +88,7 @@ def quantize_and_eval(
t0 = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = (
- AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
.eval()
.to(device)
)
@@ -155,7 +155,7 @@ def compare_models(
torch.manual_seed(34)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = (
- AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
.eval()
.to(device)
)
@@ -167,7 +167,7 @@ def compare_models(
print("Benchmarking W8A8-dynamic without SmoothQuant...")
torch.manual_seed(34)
w8a8_model = (
- AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
.eval()
.to(device)
)