From 33dba6ce0231f052cdd4b7e9f02e90747733cf48 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 6 Nov 2025 20:52:33 -0500 Subject: [PATCH 1/6] convert : handle compressed-tensors quant method --- convert_hf_to_gguf.py | 87 +++++++++++++++++++++++++++++++++++++++---- gguf-py/gguf/lazy.py | 11 ++++-- 2 files changed, 88 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 222f6ed6dc40f..9a3e56a9c8796 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -278,15 +278,14 @@ def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor: # The scale is inverted return data / scale.float() - def dequant_simple(weight: Tensor, scale: Tensor) -> Tensor: + def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor: scale = scale.float() - if (weight_block_size := quant_config.get("weight_block_size")): - # TODO: make sure it's a list of integers - for i, size in enumerate(weight_block_size): + if block_size is not None: + for i, size in enumerate(block_size): scale = scale.repeat_interleave(size, i) - # unpad the scale (e.g. when the tensor size isn't a multiple of the block size) - scale = scale[tuple(slice(0, size) for size in weight.shape)] + # unpad the scale (e.g. when the tensor size isn't a multiple of the block size) + scale = scale[tuple(slice(0, size) for size in weight.shape)] return weight.float() * scale @@ -333,6 +332,40 @@ def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T + def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int): + assert w.dtype == torch.int32 + shape = tuple(shape_tensor.tolist()) + assert len(shape) == 2 + mask = (1 << num_bits) - 1 + + shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32) + if self.lazy: + shifts = LazyTorchTensor.from_eager(shifts) + + if zero_point is None: + offset = 1 << (num_bits - 1) + else: + assert len(zero_point.shape) == 2 + offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask + offset = offset.reshape(-1, zero_point.shape[1]) + # trim padding, and prepare for broadcast + # NOTE: the zero-point is packed along dim 0 + offset = offset[:shape[0], :].unsqueeze(-1) + + # extract values + # NOTE: the weights are packed along dim 1 + unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask + unpacked = unpacked.reshape(shape[0], -1) + + # trim padding + unpacked = unpacked[:, :shape[1]] + + # prepare for broadcast of the scale + unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size) + unpacked = unpacked - offset + + return (unpacked * scale.unsqueeze(-1)).reshape(shape) + if quant_method == "bitnet": for name in self.model_tensors.keys(): if name.endswith(".weight_scale"): @@ -342,12 +375,13 @@ def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s()) tensors_to_remove.append(name) elif quant_method == "fp8": + block_size = quant_config.get("weight_block_size") for name in self.model_tensors.keys(): if name.endswith(".weight_scale_inv"): weight_name = name.removesuffix("_scale_inv") w = self.model_tensors[weight_name] s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s()) + self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) tensors_to_remove.append(name) elif quant_method == "gptq": for name in self.model_tensors.keys(): @@ -371,6 +405,45 @@ def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) ".scales", ) ] + elif quant_method == "compressed-tensors": + quant_format = quant_config["format"] + groups = quant_config["config_groups"] + if len(groups) > 1: + raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet") + weight_config = tuple(groups.values())[0]["weights"] + if quant_format == "float-quantized": + block_size = weight_config.get("block_structure", None) + assert weight_config.get("strategy") == "channel" + assert weight_config.get("group_size") == None # didn't find a model using this yet + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size) + tensors_to_remove.append(name) + elif quant_format == "pack-quantized": + assert weight_config.get("strategy") == "group" + assert weight_config.get("type", "int") == "int" + num_bits = weight_config.get("num_bits") + group_size = weight_config.get("group_size") + assert isinstance(num_bits, int) + assert isinstance(group_size, int) + for name in self.model_tensors.keys(): + if name.endswith(".weight_packed"): + base_name = name.removesuffix("_packed") + w = self.model_tensors[name] + scale = self.model_tensors[base_name + "_scale"] + shape = self.model_tensors[base_name + "_shape"] + zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None) + new_tensors[base_name] = ( + lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed( + w(), scale(), shape(), zero_point(), num_bits, group_size, + ) + ) + tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")] + if (base_name + "_zero_point") in self.model_tensors: + tensors_to_remove.append(base_name + "_zero_point") else: raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index f9bcadae0224b..6d0f1cbbbdef3 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -48,13 +48,18 @@ def wrapped_special_op(self, *args, **kwargs): # NOTE: doing this from a metaclass is very convenient # TODO: make this even more comprehensive for binary_op in ( - "lt", "le", "eq", "ne", "ge", "gt", "not" - "abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul", - "neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor", + "lt", "le", "eq", "ne", "ge", "gt", + "add", "and", "floordiv", "lshift", "mod", "mul", "matmul", + "or", "pos", "pow", "rshift", "sub", "truediv", "xor", "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor", "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor", ): attr_name = f"__{binary_op}__" + # evaluation on the meta tensor is needed in case there's broadcasting + namespace[attr_name] = mk_wrap(attr_name, meta_noop=False) + + for unary_op in ("not", "abs", "invert", "neg"): + attr_name = f"__{unary_op}__" # the result of these operators usually has the same shape and dtype as the input, # so evaluation on the meta tensor can be skipped. namespace[attr_name] = mk_wrap(attr_name, meta_noop=True) From d23bdd57b04824b14e5e52b70af738c544c8c3cd Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 6 Nov 2025 21:11:52 -0500 Subject: [PATCH 2/6] convert : handle int-quantized models --- convert_hf_to_gguf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9a3e56a9c8796..14ccb24942b7a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -411,9 +411,11 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T if len(groups) > 1: raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet") weight_config = tuple(groups.values())[0]["weights"] - if quant_format == "float-quantized": + + if quant_format == "float-quantized" or quant_format == "int-quantized": block_size = weight_config.get("block_structure", None) - assert weight_config.get("strategy") == "channel" + strategy = weight_config.get("strategy") + assert strategy == "channel" or strategy == "block" assert weight_config.get("group_size") == None # didn't find a model using this yet for name in self.model_tensors.keys(): if name.endswith(".weight_scale"): @@ -444,6 +446,8 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")] if (base_name + "_zero_point") in self.model_tensors: tensors_to_remove.append(base_name + "_zero_point") + else: + raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported") else: raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") From 33dcb44aa20bbbebb687cbe058ca50768d170fc6 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 6 Nov 2025 21:34:21 -0500 Subject: [PATCH 3/6] convert : handle naive-quantized models --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 14ccb24942b7a..284916312308c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -412,7 +412,7 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet") weight_config = tuple(groups.values())[0]["weights"] - if quant_format == "float-quantized" or quant_format == "int-quantized": + if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized": block_size = weight_config.get("block_structure", None) strategy = weight_config.get("strategy") assert strategy == "channel" or strategy == "block" From 987862ad8c218cdb459ca03d5c60727c4a20cc26 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 6 Nov 2025 21:51:20 -0500 Subject: [PATCH 4/6] gguf-py : __pos__ is also unary --- gguf-py/gguf/lazy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index 6d0f1cbbbdef3..c126f09c5091b 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -50,7 +50,7 @@ def wrapped_special_op(self, *args, **kwargs): for binary_op in ( "lt", "le", "eq", "ne", "ge", "gt", "add", "and", "floordiv", "lshift", "mod", "mul", "matmul", - "or", "pos", "pow", "rshift", "sub", "truediv", "xor", + "or", "pow", "rshift", "sub", "truediv", "xor", "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor", "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor", ): @@ -58,7 +58,7 @@ def wrapped_special_op(self, *args, **kwargs): # evaluation on the meta tensor is needed in case there's broadcasting namespace[attr_name] = mk_wrap(attr_name, meta_noop=False) - for unary_op in ("not", "abs", "invert", "neg"): + for unary_op in ("not", "abs", "invert", "neg", "pos"): attr_name = f"__{unary_op}__" # the result of these operators usually has the same shape and dtype as the input, # so evaluation on the meta tensor can be skipped. From 3770d9410d4e0358baac3d52c748dd2c7ceffe40 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 6 Nov 2025 21:52:27 -0500 Subject: [PATCH 5/6] convert : fix flake8 lint --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 284916312308c..bc68be066c93d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -416,7 +416,7 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T block_size = weight_config.get("block_structure", None) strategy = weight_config.get("strategy") assert strategy == "channel" or strategy == "block" - assert weight_config.get("group_size") == None # didn't find a model using this yet + assert weight_config.get("group_size") is None # didn't find a model using this yet for name in self.model_tensors.keys(): if name.endswith(".weight_scale"): weight_name = name.removesuffix("_scale") From 128118fdbed9f07bf85849edd810daf12f70e92a Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 6 Nov 2025 21:59:32 -0500 Subject: [PATCH 6/6] convert : use F32 for dequant of pack-quantized tensors --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bc68be066c93d..b155d112b1ace 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -364,7 +364,7 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size) unpacked = unpacked - offset - return (unpacked * scale.unsqueeze(-1)).reshape(shape) + return (unpacked * scale.unsqueeze(-1).float()).reshape(shape) if quant_method == "bitnet": for name in self.model_tensors.keys():