vllm-project · kylesayrs · Aug 20, 2025 · Aug 21, 2025 · Aug 21, 2025 · Aug 21, 2025
diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py
@@ -6,7 +6,7 @@
 from llmcompressor.utils import dispatch_for_generation
 
 # Select model and load it.
-model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+model_id = "meta-llama/Llama-3.1-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
@@ -51,7 +51,10 @@ def tokenize(sample):
 
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
-recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+recipe = [
+    GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+    GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+]
 
 # Apply algorithms.
 oneshot(

diff --git a/examples/transform/spinquant_example.py b/examples/transform/spinquant_example.py
@@ -1,29 +1,63 @@
+from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.modifiers.transform import SpinQuantModifier
 from llmcompressor.utils import dispatch_for_generation
 
 # Select model and load it.
-MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+# TODO: change back
+MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
 
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
-# NOTE: currently only fused rotations (R1 & R2) are available
-# Learned rotations and online rotations (R3 & R4) will be added
-# in a future release.
-# Configure the quantization algorithm to run.
-#   * apply spinquant transforms to model to reduce quantization loss
-#   * quantize the weights to 4 bit with group size 128
+# Select calibration dataset.
+DATASET_ID = "mit-han-lab/pile-val-backup"
+DATASET_SPLIT = "validation"
+
+# Select number of samples. 256 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 256
+MAX_SEQUENCE_LENGTH = 512
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            [{"role": "user", "content": example["text"]}],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+# TODO
 recipe = [
-    SpinQuantModifier(rotations=["R1", "R2"], transform_type="hadamard"),
-    QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+    SpinQuantModifier(rotations=["R3"], transform_type="random-hadamard"),
+    GPTQModifier(targets=["Linear"], scheme="W4A16", ignore=["lm_head"]),
 ]
 
 # Apply algorithms.
-oneshot(model=model, recipe=recipe, pipeline="datafree")
+oneshot(model=model, dataset=ds, recipe=recipe)
 
 # Confirm generations of the quantized model look sane.
 print("\n\n")
@@ -35,6 +69,6 @@
 print("==========================================\n\n")
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-spinquantR1R2-w4a16"
+SAVE_DIR = MODEL_ID.split("/")[1] + "-spinquant-W4A16"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/setup.py b/setup.py
@@ -189,7 +189,7 @@ def localversion_func(version: ScmVersion) -> str:
             "torchvision",
             "librosa",
             "soundfile",
-            "torchcodec",
+            #"torchcodec",
             # linting, formatting, and type checking
             "black~=24.4.2",
             "isort~=5.13.2",

diff --git a/src/llmcompressor/modifiers/quantization/__init__.py b/src/llmcompressor/modifiers/quantization/__init__.py
@@ -1,5 +1,4 @@
 # flake8: noqa
 
-from .cache import *
 from .gptq import *
 from .quantization import *
diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py