From d32fd51739f5de7bd9d05895bc2c78fa82855ed2 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 12 Sep 2025 13:28:38 +0800
Subject: [PATCH 1/7] support autoround v0.7

Signed-off-by: Kaihui-intel
---
 .../torch/algorithms/weight_only/autoround.py | 20 +++++++++++++------
 .../torch/quantization/algorithm_entry.py     |  4 ++++
 .../torch/quantization/config.py              |  7 +++++++
 .../weight_only/test_autoround.py             | 19 ++++++++++++++++--
 4 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
index e3b6a8a590c..40ce3d0444e 100644
--- a/neural_compressor/torch/algorithms/weight_only/autoround.py
+++ b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -37,6 +37,7 @@ def _is_auto_round_available():
 from auto_round.export.export_to_itrex.export import pack_model  # pylint: disable=E0401
 from auto_round.mllm import lmms_eval, mllm_eval
 from auto_round.mllm.template import Template, get_template
+from auto_round.schemes import QuantizationScheme

 from neural_compressor.torch.algorithms import Quantizer
 from neural_compressor.torch.utils import get_accelerator, logger
@@ -53,7 +54,7 @@ def __init__(
         enable_full_range: bool = False,  ##for symmetric, TODO support later
         batch_size: int = 8,
         amp: bool = True,
-        device: str = None,
+        device_map: str = None,
         lr_scheduler=None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         enable_quanted_input: bool = True,
@@ -91,6 +92,8 @@ def __init__(
         processor=None,
         template: Union[str, Template] = None,
         truncation: bool = False,
+        # v0.7
+        scheme: Union[str, dict, QuantizationScheme] = "W4A16",
         **kwargs,
     ):
         """Init a AutQRoundQuantizer object.
@@ -122,7 +125,7 @@ def __init__(
             enable_full_range (bool): Whether to enable full range quantization (default is False).
             batch_size (int): Batch size for training (default is 8).
             amp (bool): Whether to use automatic mixed precision (default is True).
-            device: The device to be used for tuning (default is "auto").
+            device_map: The device to be used for tuning (default is None).
             lr_scheduler: The learning rate scheduler to be used.
             dataset (str): The default dataset name (default is "NeelNanda/pile-10k").
             enable_quanted_input (bool): Whether to use the output of the previous quantized block as
@@ -161,6 +164,7 @@ def __init__(
             image_processor (Processor): Image processor for special model like llava.
             template (Template): The template to specify process for different mllms.
             truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            scheme (str | dict | QuantizationScheme): A preset scheme that defines the quantization configurations.

         Returns:
             The quantized model.
@@ -205,6 +209,8 @@ def __init__(
         self.image_processor = image_processor
         self.template = template
         self.truncation = truncation
+        self.scheme = scheme
+        self.device_map = device_map
         self.enable_w4afp8 = self._is_w4afp8()

     def _is_w4afp8(self):
@@ -237,12 +243,13 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             rounder = AutoRoundMLLM(
                 model,
                 tokenizer=self.tokenizer,
+                scheme=self.scheme,
                 processor=self.processor,
                 image_processor=self.image_processor,
                 layer_config=self.quant_config,
                 batch_size=self.batch_size,
                 amp=self.amp,
-                device=self.device,
+                device_map=self.device_map,
                 lr_scheduler=self.lr_scheduler,
                 dataset=dataloader,
                 extra_data_dir=self.extra_data_dir,
@@ -278,12 +285,13 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             rounder = AutoRound(
                 model=model,
                 tokenizer=self.tokenizer,
+                scheme=self.scheme,
                 dataset=dataloader,
                 layer_config=self.quant_config or {},
                 enable_full_range=self.enable_full_range,
                 batch_size=self.batch_size,
                 amp=self.amp,
-                device=self.device,
+                device_map=self.device_map,
                 lr_scheduler=self.lr_scheduler,
                 enable_quanted_input=self.enable_quanted_input,
                 enable_minmax_tuning=self.enable_minmax_tuning,
@@ -317,7 +325,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
         elif "itrex" in self.export_format:
             model = pack_model(model, weight_config, device=self.device, inplace=True)
         else:  # pragma: no cover
-            model = rounder.save_quantized(output_dir=None, format=self.export_format, device=self.device, inplace=True)
+            model = rounder.save_quantized(output_dir="temp_auto_round", format=self.export_format, inplace=True)

         return model
@@ -342,7 +350,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
     from auto_round.calib_dataset import get_dataloader  # pylint: disable=E0401

     dataloader = get_dataloader(
-        tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, nsamples=nsamples
+        tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples
     )
     return dataloader

diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index 1df11392636..5bb11fb69bc 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -629,6 +629,8 @@ def autoround_quantize_entry(
     image_processor = quant_config.image_processor
     template = quant_config.template
     truncation = quant_config.truncation
+    scheme = quant_config.scheme
+    device_map = quant_config.device_map
     kwargs.pop("example_inputs")

     quantizer = get_quantizer(
@@ -666,6 +668,8 @@ def autoround_quantize_entry(
         image_processor=image_processor,
         template=template,
         truncation=truncation,
+        scheme=scheme,
+        device_map=device_map,
     )
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
     model.qconfig = configs_mapping

diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py
index 48a682a00a7..27e5a85551e 100644
--- a/neural_compressor/torch/quantization/config.py
+++ b/neural_compressor/torch/quantization/config.py
@@ -971,6 +971,9 @@ def __init__(
         # v0.4
         enable_norm_bias_tuning: bool = False,
         enable_torch_compile: bool = None,
+        # v0.7
+        scheme: str | dict = "W4A16",
+        device_map: str = None,
         # mllm
         is_mllm: bool = False,
         quant_nontext_module: bool = False,
@@ -1029,6 +1032,8 @@ def __init__(
             image_processor (Processor): Image processor for special model like llava.
             template (Template): The template to specify process for different mllms.
            truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            device_map: The device to be used for tuning.
+            scheme (str | dict | QuantizationScheme): A preset scheme that defines the quantization configurations.
             white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
                 Default is DEFAULT_WHITE_LIST.
         """
@@ -1073,6 +1078,8 @@ def __init__(
         self.image_processor = image_processor
         self.template = template
         self.truncation = truncation
+        self.scheme = scheme
+        self.device_map = device_map
         self._post_init()

     @classmethod
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 839427c4828..c9a5e35ebb8 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -247,7 +247,7 @@ def test_mllm(self):
             seed=42,
             nsamples=1,
             gradient_accumulate_steps=1,
-            quant_nontext_module=False,
+            quant_nontext_module=True,
             processor=processor,
         )
         quant_config = AutoRoundConfig(
@@ -258,7 +258,7 @@ def test_mllm(self):
             batch_size=batch_size,
             iters=1,
             seqlen=seqlen,
-            quant_nontext_module=False,
+            quant_nontext_module=True,
             truncation=truncation,
             gradient_accumulate_steps=gradient_accumulate_steps,
         )
@@ -283,6 +283,21 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

+    @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
+    def test_scheme(self, scheme):
+        fp32_model = copy.deepcopy(self.gptj)
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16",
+                                       scheme=scheme, export_format="llm_compressor")
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # quantizer execute
+        model = prepare(model=fp32_model, quant_config=quant_config)
+        run_fn(model, self.dataloader)
+        q_model = convert(model)
+        out = q_model(self.inp)[0]
+        assert q_model is not None, "Quantization failed!"
+        assert q_model.transformer.h[0].attn.k_proj.bits == 4
+        assert torch.allclose(out, self.label, atol=1e-1)

 @pytest.mark.skipif(not is_habana_framework_installed(), reason="Habana framework is not installed")
 @pytest.mark.skipif(os.getenv("PT_HPU_LAZY_MODE", "0") == "1", reason="Lazy mode is enabled")

From 0f525fbdfe03dc14fd3d54701331f34cbbe3b734 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Sep 2025 07:01:52 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/torch/algorithms/weight_only/autoround.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
index 40ce3d0444e..b3c6b292831 100644
--- a/neural_compressor/torch/algorithms/weight_only/autoround.py
+++ b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -349,9 +349,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
     """
     from auto_round.calib_dataset import get_dataloader  # pylint: disable=E0401

-    dataloader = get_dataloader(
-        tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples
-    )
+    dataloader = get_dataloader(tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples)
     return dataloader

From 78a0bdf98fc23620172762b73398454dfbd1f773 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Mon, 15 Sep 2025 15:09:26 +0800
Subject: [PATCH 3/7] update ut

Signed-off-by: Kaihui-intel
---
 test/3x/torch/quantization/weight_only/test_autoround.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index c9a5e35ebb8..f7fb1fb4303 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -57,6 +57,13 @@ def set_hpu_torch_compile_envs():
     auto_round_installed = True
 except ImportError:
     auto_round_installed = False
+
+try:
+    import compressed_tensors
+
+    ct_installed = True
+except ImportError:
+    ct_installed = False


 @torch.no_grad()
@@ -283,6 +290,7 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

+    @pytest.mark.skipif(not ct_installed, reason="compressed-tensors module is not installed")
     @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
     def test_scheme(self, scheme):
         fp32_model = copy.deepcopy(self.gptj)

From a571afa93ab1d68b5af13c7c1eba49fa5cfc2e71 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Mon, 15 Sep 2025 15:40:11 +0800
Subject: [PATCH 4/7] add ut requirements

Signed-off-by: Kaihui-intel
---
 test/3x/torch/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt
index 16b7e508083..9b090217af0 100644
--- a/test/3x/torch/requirements.txt
+++ b/test/3x/torch/requirements.txt
@@ -1,4 +1,5 @@
 auto_round
+compressed-tensors
 datasets
 deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.21.0
 expecttest

From 97724ddba4fece48664a6574d1694c918137b654 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Tue, 16 Sep 2025 09:11:38 +0800
Subject: [PATCH 5/7] skip test_scheme

Signed-off-by: Kaihui-intel
---
 test/3x/torch/quantization/weight_only/test_autoround.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index f7fb1fb4303..07632520a17 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -4,7 +4,7 @@
 import pytest
 import torch
 import transformers
-from packaging.version import Version
+from packaging.version import Version, parse

 import os
 from functools import lru_cache
@@ -290,6 +290,7 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

+    @pytest.mark.skipif(parse(auto_round.__version__) > parse("0.7.0"))
     @pytest.mark.skipif(not ct_installed, reason="compressed-tensors module is not installed")
     @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
     def test_scheme(self, scheme):

From e00c060973aa80eeb44390964a0bacd3eabea397 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Tue, 16 Sep 2025 09:14:04 +0800
Subject: [PATCH 6/7] skip test_scheme

Signed-off-by: Kaihui-intel
---
 test/3x/torch/quantization/weight_only/test_autoround.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 07632520a17..8d66dac1a46 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -290,7 +290,7 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

-    @pytest.mark.skipif(parse(auto_round.__version__) > parse("0.7.0"))
+    @pytest.mark.skipif(parse(auto_round.__version__) <= parse("0.7.0"))
     @pytest.mark.skipif(not ct_installed, reason="compressed-tensors module is not installed")
     @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
     def test_scheme(self, scheme):

From 46374171cb9b673b980283a2ba11647d34ca7737 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Tue, 16 Sep 2025 12:15:32 +0800
Subject: [PATCH 7/7] fix ut skip

Signed-off-by: Kaihui-intel
---
 .../quantization/weight_only/test_autoround.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 8d66dac1a46..b919e56cf4d 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -290,13 +290,21 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

-    @pytest.mark.skipif(parse(auto_round.__version__) <= parse("0.7.0"))
-    @pytest.mark.skipif(not ct_installed, reason="compressed-tensors module is not installed")
+    @pytest.mark.skipif(parse(auto_round.__version__) <= parse("0.7.0"),
+                        reason="Export with llm_compressor format does not return a model.")
+    @pytest.mark.skipif(not ct_installed, reason="The compressed-tensors module is not installed.")
     @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
     def test_scheme(self, scheme):
         fp32_model = copy.deepcopy(self.gptj)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16",
-                                       scheme=scheme, export_format="llm_compressor")
+        quant_config = AutoRoundConfig(
+            nsamples=32,
+            seqlen=10,
+            iters=10,
+            amp=False,
+            scale_dtype="fp16",
+            scheme=scheme,
+            export_format="llm_compressor",
+        )
         logger.info(f"Test AutoRound with config {quant_config}")

         # quantizer execute
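
A minimal usage sketch of what this series adds, for readers following along: the new `scheme` and `device_map` arguments flow from `AutoRoundConfig` through `autoround_quantize_entry` into the `AutoRound`/`AutoRoundMLLM` rounders. The checkpoint name and calibration settings below are illustrative assumptions that mirror `test_scheme`, not part of the patches; the entry points and argument names come from the diffs above.

    # Sketch only, not part of the patch series; checkpoint and settings are assumed.
    import torch
    import transformers

    from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader
    from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

    model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"  # illustrative checkpoint
    model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # `scheme` accepts a preset name ("W4A16", "MXFP4", "NVFP4", ...), a dict, or an
    # auto_round QuantizationScheme; `device_map` replaces the former `device` argument.
    quant_config = AutoRoundConfig(scheme="W4A16", device_map=None, nsamples=32, seqlen=10, iters=10)

    # Calibration data via the thin wrapper kept in autoround.py (defaults to NeelNanda/pile-10k).
    dataloader = get_dataloader(tokenizer, seqlen=10, nsamples=32)

    model = prepare(model=model, quant_config=quant_config)
    with torch.no_grad():
        for data in dataloader:  # calibration forward passes collected by AutoRound
            if isinstance(data, dict):
                model(**data)  # batches may be dicts of tensors (input_ids, ...)
            else:
                model(data)
    q_model = convert(model)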