Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions examples/quantization_w4a16/llama3_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model_id = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

Expand Down Expand Up @@ -51,7 +51,10 @@ def tokenize(sample):

# Configure the quantization algorithm to run.
# * quantize the weights to 4 bit with GPTQ with a group size 128
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
recipe = [
GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
]

# Apply algorithms.
oneshot(
Expand Down
58 changes: 46 additions & 12 deletions examples/transform/spinquant_example.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,63 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.transform import SpinQuantModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
# TODO: change back to "meta-llama/Meta-Llama-3-8B-Instruct" before merging
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# NOTE: currently only fused rotations (R1 & R2) are available
# Learned rotations and online rotations (R3 & R4) will be added
# in a future release.
# Configure the quantization algorithm to run.
# * apply spinquant transforms to model to reduce quantization loss
# * quantize the weights to 4 bit with group size 128
# Select calibration dataset.
DATASET_ID = "mit-han-lab/pile-val-backup"
DATASET_SPLIT = "validation"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    """Render one dataset row as a single-turn chat prompt.

    The row's ``"text"`` field is wrapped in a lone ``user`` message and
    passed through ``tokenizer.apply_chat_template`` with ``tokenize=False``,
    so templating happens here and tokenization is left to a later step.

    :param example: dataset row; only its ``"text"`` entry is read
    :return: dict with a single ``"text"`` key holding the templated prompt
    """
    return {
        "text": tokenizer.apply_chat_template(
            [{"role": "user", "content": example["text"]}],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    """Tokenize the ``"text"`` field of one calibration sample.

    Sequences are truncated to ``MAX_SEQUENCE_LENGTH`` and are not padded.
    ``add_special_tokens=False`` is passed, presumably because the chat
    template applied in ``preprocess`` already inserted them (TODO confirm).

    NOTE(review): no call to this function is visible between its definition
    and the ``oneshot(...)`` call in this view; confirm whether the dataset
    is expected to be mapped through it.

    :param sample: dataset row; only its ``"text"`` entry is read
    :return: whatever ``tokenizer(...)`` returns for the given text
    """
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


# TODO
recipe = [
SpinQuantModifier(rotations=["R1", "R2"], transform_type="hadamard"),
QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
SpinQuantModifier(rotations=["R3"], transform_type="random-hadamard"),
GPTQModifier(targets=["Linear"], scheme="W4A16", ignore=["lm_head"]),
]

# Apply algorithms.
oneshot(model=model, recipe=recipe, pipeline="datafree")
oneshot(model=model, dataset=ds, recipe=recipe)

# Confirm generations of the quantized model look sane.
print("\n\n")
Expand All @@ -35,6 +69,6 @@
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-spinquantR1R2-w4a16"
SAVE_DIR = MODEL_ID.split("/")[1] + "-spinquant-W4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def localversion_func(version: ScmVersion) -> str:
"torchvision",
"librosa",
"soundfile",
"torchcodec",
#"torchcodec",
# linting, formatting, and type checking
"black~=24.4.2",
"isort~=5.13.2",
Expand Down
1 change: 0 additions & 1 deletion src/llmcompressor/modifiers/quantization/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# flake8: noqa

from .cache import *
from .gptq import *
from .quantization import *
208 changes: 0 additions & 208 deletions src/llmcompressor/modifiers/quantization/cache.py

This file was deleted.

Loading
Loading