From d32fd51739f5de7bd9d05895bc2c78fa82855ed2 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 12 Sep 2025 13:28:38 +0800
Subject: [PATCH 1/7] support autoround v0.7

Signed-off-by: Kaihui-intel
---
 .../torch/algorithms/weight_only/autoround.py | 20 +++++++++++++------
 .../torch/quantization/algorithm_entry.py     |  4 ++++
 .../torch/quantization/config.py              |  7 +++++++
 .../weight_only/test_autoround.py             | 19 ++++++++++++++++--
 4 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
index e3b6a8a590c..40ce3d0444e 100644
--- a/neural_compressor/torch/algorithms/weight_only/autoround.py
+++ b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -37,6 +37,7 @@ def _is_auto_round_available():
 from auto_round.export.export_to_itrex.export import pack_model  # pylint: disable=E0401
 from auto_round.mllm import lmms_eval, mllm_eval
 from auto_round.mllm.template import Template, get_template
+from auto_round.schemes import QuantizationScheme

 from neural_compressor.torch.algorithms import Quantizer
 from neural_compressor.torch.utils import get_accelerator, logger
@@ -53,7 +54,7 @@ def __init__(
         enable_full_range: bool = False,  ##for symmetric, TODO support later
         batch_size: int = 8,
         amp: bool = True,
-        device: str = None,
+        device_map: str = None,
         lr_scheduler=None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         enable_quanted_input: bool = True,
@@ -91,6 +92,8 @@ def __init__(
         processor=None,
         template: Union[str, Template] = None,
         truncation: bool = False,
+        # v0.7
+        scheme: Union[str, dict, QuantizationScheme] = "W4A16",
         **kwargs,
     ):
         """Init a AutQRoundQuantizer object.
@@ -122,7 +125,7 @@ def __init__(
             enable_full_range (bool): Whether to enable full range quantization (default is False).
             batch_size (int): Batch size for training (default is 8).
             amp (bool): Whether to use automatic mixed precision (default is True).
-            device: The device to be used for tuning (default is "auto").
+            device_map: The device to be used for tuning (default is None).
             lr_scheduler: The learning rate scheduler to be used.
             dataset (str): The default dataset name (default is "NeelNanda/pile-10k").
             enable_quanted_input (bool): Whether to use the output of the previous quantized block as
@@ -161,6 +164,7 @@ def __init__(
             image_processor (Processor): Image processor for special model like llava.
             template (Template): The template to specify process for different mllms.
             truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            scheme (str | dict | QuantizationScheme): A preset scheme that defines the quantization configurations.

         Returns:
             The quantized model.
@@ -205,6 +209,8 @@ def __init__(
         self.image_processor = image_processor
         self.template = template
         self.truncation = truncation
+        self.scheme = scheme
+        self.device_map = device_map
         self.enable_w4afp8 = self._is_w4afp8()

     def _is_w4afp8(self):
@@ -237,12 +243,13 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             rounder = AutoRoundMLLM(
                 model,
                 tokenizer=self.tokenizer,
+                scheme=self.scheme,
                 processor=self.processor,
                 image_processor=self.image_processor,
                 layer_config=self.quant_config,
                 batch_size=self.batch_size,
                 amp=self.amp,
-                device=self.device,
+                device_map=self.device_map,
                 lr_scheduler=self.lr_scheduler,
                 dataset=dataloader,
                 extra_data_dir=self.extra_data_dir,
@@ -278,12 +285,13 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             rounder = AutoRound(
                 model=model,
                 tokenizer=self.tokenizer,
+                scheme=self.scheme,
                 dataset=dataloader,
                 layer_config=self.quant_config or {},
                 enable_full_range=self.enable_full_range,
                 batch_size=self.batch_size,
                 amp=self.amp,
-                device=self.device,
+                device_map=self.device_map,
                 lr_scheduler=self.lr_scheduler,
                 enable_quanted_input=self.enable_quanted_input,
                 enable_minmax_tuning=self.enable_minmax_tuning,
@@ -317,7 +325,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
         elif "itrex" in self.export_format:
             model = pack_model(model, weight_config, device=self.device, inplace=True)
         else:  # pragma: no cover
-            model = rounder.save_quantized(output_dir=None, format=self.export_format, device=self.device, inplace=True)
+            model = rounder.save_quantized(output_dir="temp_auto_round", format=self.export_format, inplace=True)

         return model
@@ -342,7 +350,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
     from auto_round.calib_dataset import get_dataloader  # pylint: disable=E0401

     dataloader = get_dataloader(
-        tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, nsamples=nsamples
+        tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples
     )
     return dataloader

diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index 1df11392636..5bb11fb69bc 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -629,6 +629,8 @@ def autoround_quantize_entry(
     image_processor = quant_config.image_processor
     template = quant_config.template
     truncation = quant_config.truncation
+    scheme = quant_config.scheme
+    device_map = quant_config.device_map
     kwargs.pop("example_inputs")

     quantizer = get_quantizer(
@@ -666,6 +668,8 @@ def autoround_quantize_entry(
         image_processor=image_processor,
         template=template,
         truncation=truncation,
+        scheme=scheme,
+        device_map=device_map,
     )
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
     model.qconfig = configs_mapping

diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py
index 48a682a00a7..27e5a85551e 100644
--- a/neural_compressor/torch/quantization/config.py
+++ b/neural_compressor/torch/quantization/config.py
@@ -971,6 +971,9 @@ def __init__(
         # v0.4
         enable_norm_bias_tuning: bool = False,
         enable_torch_compile: bool = None,
+        # v0.7
+        scheme: str | dict = "W4A16",
+        device_map: str = None,
         # mllm
         is_mllm: bool = False,
         quant_nontext_module: bool = False,
@@ -1029,6 +1032,8 @@ def __init__(
             image_processor (Processor): Image processor for special model like llava.
             template (Template): The template to specify process for different mllms.
            truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            device_map: The device to be used for tuning.
+            scheme (str | dict | QuantizationScheme): A preset scheme that defines the quantization configurations.
             white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
                 Default is DEFAULT_WHITE_LIST.
         """
@@ -1073,6 +1078,8 @@ def __init__(
         self.image_processor = image_processor
         self.template = template
         self.truncation = truncation
+        self.scheme = scheme
+        self.device_map = device_map
         self._post_init()

     @classmethod
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 839427c4828..c9a5e35ebb8 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -247,7 +247,7 @@ def test_mllm(self):
             seed=42,
             nsamples=1,
             gradient_accumulate_steps=1,
-            quant_nontext_module=False,
+            quant_nontext_module=True,
             processor=processor,
         )
         quant_config = AutoRoundConfig(
@@ -258,7 +258,7 @@ def test_mllm(self):
             batch_size=batch_size,
             iters=1,
             seqlen=seqlen,
-            quant_nontext_module=False,
+            quant_nontext_module=True,
             truncation=truncation,
             gradient_accumulate_steps=gradient_accumulate_steps,
         )
@@ -283,6 +283,21 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

+    @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
+    def test_scheme(self, scheme):
+        fp32_model = copy.deepcopy(self.gptj)
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16",
+                                       scheme=scheme, export_format="llm_compressor")
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # quantizer execute
+        model = prepare(model=fp32_model, quant_config=quant_config)
+        run_fn(model, self.dataloader)
+        q_model = convert(model)
+        out = q_model(self.inp)[0]
+        assert q_model is not None, "Quantization failed!"
+        assert q_model.transformer.h[0].attn.k_proj.bits == 4
+        assert torch.allclose(out, self.label, atol=1e-1)

 @pytest.mark.skipif(not is_habana_framework_installed(), reason="Habana framework is not installed")
 @pytest.mark.skipif(os.getenv("PT_HPU_LAZY_MODE", "0") == "1", reason="Lazy mode is enabled")

From 0f525fbdfe03dc14fd3d54701331f34cbbe3b734 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Sep 2025 07:01:52 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/torch/algorithms/weight_only/autoround.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
index 40ce3d0444e..b3c6b292831 100644
--- a/neural_compressor/torch/algorithms/weight_only/autoround.py
+++ b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -349,9 +349,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
     """
     from auto_round.calib_dataset import get_dataloader  # pylint: disable=E0401

-    dataloader = get_dataloader(
-        tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples
-    )
+    dataloader = get_dataloader(tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples)
     return dataloader

From 78a0bdf98fc23620172762b73398454dfbd1f773 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Mon, 15 Sep 2025 15:09:26 +0800
Subject: [PATCH 3/7] update ut

Signed-off-by: Kaihui-intel
---
 test/3x/torch/quantization/weight_only/test_autoround.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index c9a5e35ebb8..f7fb1fb4303 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -57,6 +57,13 @@ def set_hpu_torch_compile_envs():
     auto_round_installed = True
 except ImportError:
     auto_round_installed = False
+
+try:
+    import compressed_tensors
+
+    ct_installed = True
+except ImportError:
+    ct_installed = False


 @torch.no_grad()
@@ -283,6 +290,7 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

+    @pytest.mark.skipif(not ct_installed, reason="compressed-tensors module is not installed")
     @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
     def test_scheme(self, scheme):
         fp32_model = copy.deepcopy(self.gptj)

From a571afa93ab1d68b5af13c7c1eba49fa5cfc2e71 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Mon, 15 Sep 2025 15:40:11 +0800
Subject: [PATCH 4/7] add ut requirements

Signed-off-by: Kaihui-intel
---
 test/3x/torch/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt
index 16b7e508083..9b090217af0 100644
--- a/test/3x/torch/requirements.txt
+++ b/test/3x/torch/requirements.txt
@@ -1,4 +1,5 @@
 auto_round
+compressed-tensors
 datasets
 deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.21.0
 expecttest

From 97724ddba4fece48664a6574d1694c918137b654 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Tue, 16 Sep 2025 09:11:38 +0800
Subject: [PATCH 5/7] skip test_scheme

Signed-off-by: Kaihui-intel
---
 test/3x/torch/quantization/weight_only/test_autoround.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index f7fb1fb4303..07632520a17 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -4,7 +4,7 @@
 import pytest
 import torch
 import transformers
-from packaging.version import Version
+from packaging.version import Version, parse

 import os
 from functools import lru_cache
@@ -290,6 +290,7 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

+    @pytest.mark.skipif(parse(auto_round.__version__) > parse("0.7.0"))
     @pytest.mark.skipif(not ct_installed, reason="compressed-tensors module is not installed")
     @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
     def test_scheme(self, scheme):

From e00c060973aa80eeb44390964a0bacd3eabea397 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Tue, 16 Sep 2025 09:14:04 +0800
Subject: [PATCH 6/7] skip test_scheme

Signed-off-by: Kaihui-intel
---
 test/3x/torch/quantization/weight_only/test_autoround.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 07632520a17..8d66dac1a46 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -290,7 +290,7 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

-    @pytest.mark.skipif(parse(auto_round.__version__) > parse("0.7.0"))
+    @pytest.mark.skipif(parse(auto_round.__version__) <= parse("0.7.0"))
     @pytest.mark.skipif(not ct_installed, reason="compressed-tensors module is not installed")
     @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
     def test_scheme(self, scheme):

From 46374171cb9b673b980283a2ba11647d34ca7737 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Tue, 16 Sep 2025 12:15:32 +0800
Subject: [PATCH 7/7] fix ut skip

Signed-off-by: Kaihui-intel
---
 .../quantization/weight_only/test_autoround.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 8d66dac1a46..b919e56cf4d 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -290,13 +290,21 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

-    @pytest.mark.skipif(parse(auto_round.__version__) <= parse("0.7.0"))
-    @pytest.mark.skipif(not ct_installed, reason="compressed-tensors module is not installed")
+    @pytest.mark.skipif(parse(auto_round.__version__) <= parse("0.7.0"),
+                        reason="Export with llm_compressor format does not return a model.")
+    @pytest.mark.skipif(not ct_installed, reason="The compressed-tensors module is not installed.")
     @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
     def test_scheme(self, scheme):
         fp32_model = copy.deepcopy(self.gptj)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16",
-                                       scheme=scheme, export_format="llm_compressor")
+        quant_config = AutoRoundConfig(
+            nsamples=32,
+            seqlen=10,
+            iters=10,
+            amp=False,
+            scale_dtype="fp16",
+            scheme=scheme,
+            export_format="llm_compressor",
+        )
         logger.info(f"Test AutoRound with config {quant_config}")

         # quantizer execute
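
A minimal usage sketch of what this series adds, for readers following along: the new `scheme` and `device_map` arguments flow from `AutoRoundConfig` through `autoround_quantize_entry` into the `AutoRound`/`AutoRoundMLLM` rounders. The checkpoint name and calibration settings below are illustrative assumptions that mirror `test_scheme`, not part of the patches; the entry points and argument names come from the diffs above.

    # Sketch only, not part of the patch series; checkpoint and settings are assumed.
    import torch
    import transformers

    from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader
    from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

    model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"  # illustrative checkpoint
    model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # `scheme` accepts a preset name ("W4A16", "MXFP4", "NVFP4", ...), a dict, or an
    # auto_round QuantizationScheme; `device_map` replaces the former `device` argument.
    quant_config = AutoRoundConfig(scheme="W4A16", device_map=None, nsamples=32, seqlen=10, iters=10)

    # Calibration data via the thin wrapper kept in autoround.py (defaults to NeelNanda/pile-10k).
    dataloader = get_dataloader(tokenizer, seqlen=10, nsamples=32)

    model = prepare(model=model, quant_config=quant_config)
    with torch.no_grad():
        for data in dataloader:  # calibration forward passes collected by AutoRound
            if isinstance(data, dict):
                model(**data)  # batches may be dicts of tensors (input_ids, ...)
            else:
                model(data)
    q_model = convert(model)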