Merged
Changes from 1 commit
@@ -12,7 +12,7 @@

def eval_peak_memory_usage(model_id: str):
    model = AutoModelForCausalLM.from_pretrained(
-        model_id, device_map="auto", torch_dtype=torch.bfloat16
+        model_id, device_map="auto", dtype=torch.bfloat16
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

20 changes: 10 additions & 10 deletions .github/scripts/torchao_model_releases/quantize_and_upload.py
Contributor comment: Actually, can you revert the changes to this folder (.github/scripts/torchao_model_releases/) for now? We might have use cases that are still using older transformers versions.
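
For readers hitting this compatibility concern, here is a minimal sketch (not part of this PR) of how a script could support both keyword spellings. It assumes the `torch_dtype` -> `dtype` rename landed around transformers 4.56.0; the exact cutoff is an assumption and should be checked against the installed release.

# Hypothetical helper, not from this PR: pick the dtype keyword that the
# installed transformers release accepts (the 4.56.0 cutoff is an assumption).
import torch
import transformers
from packaging import version
from transformers import AutoModelForCausalLM

def load_bf16(model_id: str):
    # Newer releases accept `dtype`; older ones only accept `torch_dtype`.
    if version.parse(transformers.__version__) >= version.parse("4.56.0"):
        dtype_kwargs = {"dtype": torch.bfloat16}
    else:
        dtype_kwargs = {"torch_dtype": torch.bfloat16}
    return AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", **dtype_kwargs
    )

# Example usage with a model id that appears elsewhere in this PR:
# model = load_bf16("microsoft/Phi-4-mini-instruct")
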

@@ -36,7 +36,7 @@ def _get_username():

def _untie_weights_and_save_locally(model_id):
    untied_model = AutoModelForCausalLM.from_pretrained(
-        model_id, torch_dtype="auto", device_map="auto"
+        model_id, dtype="auto", device_map="auto"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -209,15 +209,15 @@ def _untie_weights_and_save_locally(model_id):
from torchao.quantization import Int4WeightOnlyConfig
quant_config = Int4WeightOnlyConfig(group_size=128, int4_packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq")
quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
"""

_fp8_quant_code = """
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
"""

@@ -238,7 +238,7 @@ def _untie_weights_and_save_locally(model_id):
)
quant_config = ModuleFqnToConfig({{"_default": linear_config, "model.embed_tokens": embedding_config}})
quantization_config = TorchAoConfig(quant_type=quant_config, include_input_output_embeddings=True, modules_to_not_convert=[])
-quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
"""

@@ -251,7 +251,7 @@ def _untie_weights_and_save_locally(model_id):
model = AutoModelForCausalLM.from_pretrained(
    model_to_quantize,
    device_map="auto",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

@@ -332,7 +332,7 @@ def _untie_weights_and_save_locally(model_id):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
-    torch_dtype="auto",
+    dtype="auto",
    device_map="auto"
)

@@ -394,7 +394,7 @@ def _untie_weights_and_save_locally(model_id):

# use "{base_model}" or "{quantized_model}"
model_id = "{quantized_model}"
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()
@@ -538,7 +538,7 @@ def _untie_weights_and_save_locally(model_id):
import torch

model_id = "{base_model}"
-untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+untied_model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(untied_model)
@@ -668,7 +668,7 @@ def quantize_and_upload(
model = AutoModelForCausalLM.from_pretrained(
    model_to_quantize,
    device_map="auto",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

@@ -708,7 +708,7 @@ def quantize_and_upload(
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_to_quantize,
    device_map="auto",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
2 changes: 1 addition & 1 deletion README.md
@@ -159,7 +159,7 @@ quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=1
# Load and automatically quantize
quantized_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-4-mini-instruct",
-    torch_dtype="auto",
+    dtype="auto",
    device_map="auto",
    quantization_config=quantization_config
)
2 changes: 1 addition & 1 deletion benchmarks/_models/eval_hf_models.py
@@ -25,7 +25,7 @@ def quantize_model_and_save(model_id, quant_config, output_dir="results"):
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
-    torch_dtype=torch.bfloat16,
+    dtype=torch.bfloat16,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
10 changes: 5 additions & 5 deletions docs/source/serving.rst
@@ -35,7 +35,7 @@ For this example, we'll use ``Float8DynamicActivationFloat8WeightConfig`` on the

quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Push the model to hub
@@ -116,7 +116,7 @@ Install the required packages:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -165,7 +165,7 @@ Optionally, we can quantize the embedding and lm_head differently, since those l
from transformers.modeling_utils import find_tied_parameters

model_id = "microsoft/Phi-4-mini-instruct"
-untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+untied_model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(untied_model)
@@ -233,7 +233,7 @@ Quantizing the model for mobile deployment using TorchAO's ``Int8DynamicActivati
quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True, untie_embedding_weights=True, modules_to_not_convert=[])

# either use `untied_model_id` or `untied_model_local_path`
-quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Push to hub
@@ -316,7 +316,7 @@ For Phi-4-mini-instruct, when quantized with float8 dynamic quant, we can reduce

# use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-float8dq"
model_id = "pytorch/Phi-4-mini-instruct-float8dq"
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()
2 changes: 1 addition & 1 deletion docs/source/torchao_vllm_integration.md
@@ -88,7 +88,7 @@ quantization_config = TorchAoConfig(
# Load and automatically quantize the model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
-    torch_dtype="auto",
+    dtype="auto",
    device_map="auto",
    quantization_config=quantization_config
)
4 changes: 2 additions & 2 deletions test/integration/test_load_and_run_checkpoint.py
@@ -193,7 +193,7 @@ def test_deprecated_hf_models(self, model_info):
with warnings.catch_warnings(record=True) as caught_warnings:
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_name,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
        device_map="cuda:0",
    )
    # version mismatch check in config.py
@@ -250,7 +250,7 @@ def test_deprecated_hf_models(self, model_info):
with warnings.catch_warnings(record=True) as caught_warnings:
    _ = AutoModelForCausalLM.from_pretrained(
        _HIGH_PRECISION_MODEL,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
        device_map="cuda:0",
        quantization_config=quantized_model.config.quantization_config,
    )
2 changes: 1 addition & 1 deletion test/integration/test_vllm.py
@@ -153,7 +153,7 @@ def quantize_and_save_model(
# Load and quantize model
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
-    torch_dtype="bfloat16",
+    dtype="bfloat16",
    device_map="cuda",
    quantization_config=quantization_config,
)
2 changes: 1 addition & 1 deletion torchao/prototype/autoround/autoround_llm.py
@@ -88,7 +88,7 @@ def main(args):
# Get the model, tokenizer, and decoder_cls
model_name_or_path = args.model_name_or_path
model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
-    model_name_or_path, torch_dtype=torch.bfloat16
+    model_name_or_path, dtype=torch.bfloat16
)
# Disable the `use_cache` for calibration stage.
model.config.use_cache = False
2 changes: 1 addition & 1 deletion torchao/prototype/autoround/eval_autoround.py
@@ -86,7 +86,7 @@ def main(args):
with torch.no_grad():
    model_name_or_path = args.model_name_or_path
    model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
-        model_name_or_path, torch_dtype=torch.bfloat16
+        model_name_or_path, dtype=torch.bfloat16
    )
    model.eval()
    model_device = args.model_device
4 changes: 2 additions & 2 deletions torchao/prototype/autoround/utils.py
@@ -140,11 +140,11 @@ def _auto_detect_decoder_cls(model):
    return type(first_module)


-def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
+def get_float_model_info(model_name_or_path, dtype=torch.float32):
    import transformers

    model = transformers.AutoModelForCausalLM.from_pretrained(
-        model_name_or_path, torch_dtype=torch_dtype
+        model_name_or_path, dtype=dtype
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
    decoder_cls = _auto_detect_decoder_cls(model)
2 changes: 1 addition & 1 deletion torchao/prototype/awq/example.py
@@ -215,7 +215,7 @@ def quantize_and_eval(
# load any model with torch.nn.linear layers
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = (
-    AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=precision)
+    AutoModelForCausalLM.from_pretrained(repo_id, dtype=precision)
    .eval()
    .to(device)
)
2 changes: 1 addition & 1 deletion torchao/prototype/moe_quant/llama4_quant.py
@@ -58,7 +58,7 @@ def convert_fn(module):


model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-model = Llama4ForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+model = Llama4ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

_replace_with_custom_fn_if_matches_filter(
@@ -84,7 +84,7 @@ def main(max_seqlen, checkpoint, nsamples, max_iter, num_layers):

# have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
model = transformers.AutoModelForCausalLM.from_pretrained(
-    checkpoint, torch_dtype=torch.bfloat16
+    checkpoint, dtype=torch.bfloat16
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = model.to(device)
@@ -130,7 +130,7 @@ def main(layer_id, checkpoint, max_seqlen, max_iter, nsamples):
with sdpa_kernel(SDPBackend.MATH):
    # have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
    model = transformers.AutoModelForCausalLM.from_pretrained(
-        checkpoint, torch_dtype=torch.bfloat16
+        checkpoint, dtype=torch.bfloat16
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    model = model.cuda()
@@ -100,7 +100,7 @@ def f(*new_params):
with sdpa_kernel(SDPBackend.MATH):
    # have been tested models Llama-3-8B, Llama-2-7B, Mistral-7B, and stories110M
    model = transformers.AutoModelForCausalLM.from_pretrained(
-        checkpoint, torch_dtype=torch.bfloat16
+        checkpoint, dtype=torch.bfloat16
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    model = model.to(device)
@@ -106,7 +106,7 @@ def cal_model_size(model, fqn_to_config):
def load_model(repo_id, device):
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(
-        repo_id, torch_dtype=torch.bfloat16
+        repo_id, dtype=torch.bfloat16
    ).to(device=device)
    return model, tokenizer

6 changes: 3 additions & 3 deletions torchao/prototype/smoothquant/example.py
@@ -88,7 +88,7 @@ def quantize_and_eval(
t0 = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = (
-    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+    AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
    .eval()
    .to(device)
)
@@ -155,7 +155,7 @@ def compare_models(
torch.manual_seed(34)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = (
-    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+    AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
    .eval()
    .to(device)
)
@@ -167,7 +167,7 @@ def compare_models(
print("Benchmarking W8A8-dynamic without SmoothQuant...")
torch.manual_seed(34)
w8a8_model = (
-    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+    AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
    .eval()
    .to(device)
)