Merged
Changes from 7 commits
Commits (52)
f30ed49
marlin
Jan 17, 2024
afd6547
Merge branch 'marlin' of github.com:neuralmagic/vllm into marlin
Jan 17, 2024
837d344
added marlin
Jan 17, 2024
7a43b29
trying to load packed weights turning out to be tricky
Jan 18, 2024
e034640
trying to load packed weights turning out to be tricky due to qkv
Jan 18, 2024
15e8f9c
integrated marlin for single gpu
Jan 18, 2024
d8286fb
Update llama.py
robertgshaw2-redhat Jan 19, 2024
8bc625f
Fixes to Marlin quantization to allow execution via CUDA graphs captu…
alexm-redhat Jan 19, 2024
2691e89
Integrate @efrantar's changes for CUDA graphs
alexm-redhat Jan 19, 2024
92f7290
review comments based on zhyncs
alexm-redhat Jan 19, 2024
bc10e4b
(1) Integrate the latest changes from Elias that improve large batch …
alexm-redhat Jan 30, 2024
47987da
add bug fix
alexm-redhat Jan 30, 2024
43aa818
refactored some of alex's work to be consistent with the gptq config
Feb 1, 2024
5906a60
updated to load model based on hf_config from AutoGPTQ
Feb 1, 2024
8dfeaa2
Reduce Marlin's kernel limitation of thread_n from 256 to 64 (to avoi…
alexm-redhat Feb 2, 2024
c7fb928
Update checks related to MarlinConfig
alexm-redhat Feb 2, 2024
1ea85f3
formatting
alexm-redhat Feb 2, 2024
c876b79
Merge branch 'main' into marlin
robertgshaw2-redhat Feb 7, 2024
a435c97
Update pybind.cpp
robertgshaw2-redhat Feb 7, 2024
90e8b8f
Update ops.h
robertgshaw2-redhat Feb 7, 2024
b03af7d
Update ops.h
robertgshaw2-redhat Feb 7, 2024
9192287
readded marlin
Feb 7, 2024
ce50dd4
Bug fix for determination of the scales size in marlin layer
alexm-redhat Feb 8, 2024
5a305d3
Ensure marlin only compiles for GPU compute capability >= 8.0
alexm-redhat Feb 8, 2024
b1773aa
fix marlin compilation again
alexm-redhat Feb 8, 2024
036e0ca
Merge branch 'vllm-project:main' into marlin
robertgshaw2-redhat Feb 9, 2024
d63627e
added marlin test
Feb 18, 2024
18981b1
added marlin test
Feb 18, 2024
828c621
updated skipping logic
Feb 18, 2024
4f1759b
updated skipping logic
Feb 18, 2024
f1714e9
added memory profiling
Feb 18, 2024
e3a4706
added memory profiling
Feb 18, 2024
efd886c
test wout memory utilization
Feb 18, 2024
70f5850
updating memory profiling
Feb 18, 2024
567fe38
adding more profiling
Feb 18, 2024
01f5e40
updating memory profiling
Feb 18, 2024
fc5310c
removed memory profiling
Feb 18, 2024
99ab19d
cleaned up
Feb 18, 2024
eabeea6
added newline
Feb 18, 2024
d064595
ran ./format.sh
Feb 18, 2024
721351e
Merge branch 'upstream-main' into marlin
Feb 18, 2024
9b1bc5f
merged into upstream main
Feb 18, 2024
013f10f
Update test_marlin.py
robertgshaw2-redhat Feb 18, 2024
7f2165e
Update test_marlin.py
robertgshaw2-redhat Feb 18, 2024
79081ff
Merge branch 'main' into marlin
robertgshaw2-redhat Feb 19, 2024
7a9b828
updated retry testing to use pytest-flaky rather than implementing th…
Feb 19, 2024
c23902f
missed newline
Feb 19, 2024
e7aba66
formatting
Feb 19, 2024
2403f7d
removed silly print
Feb 19, 2024
aabaed2
added license
Feb 29, 2024
a67dc8d
format
Feb 29, 2024
8ff42c0
minor change for ruff
Feb 29, 2024
7 changes: 7 additions & 0 deletions csrc/ops.h
@@ -70,6 +70,13 @@ torch::Tensor awq_gemm(
torch::Tensor _scaling_factors,
torch::Tensor _zeros,
int split_k_iters);

void marlin_gemm(
const torch::Tensor& input,
const torch::Tensor& weights,
torch::Tensor& output,
const torch::Tensor& scales,
torch::Tensor& workspace);
#endif

void squeezellm_gemm(
3 changes: 2 additions & 1 deletion csrc/pybind.cpp
@@ -51,11 +51,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
#ifndef USE_ROCM
// Quantization ops
ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
#endif
ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");

// Cache ops
pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
cache_ops.def(
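With the binding above, the kernel is reachable from Python as vllm._C.ops.marlin_gemm, taking the five arguments declared in csrc/ops.h (input, packed weights, output, scales, workspace). A minimal sketch of a call, assuming the extension is built with the Marlin kernel; shapes are illustrative and the uninitialized tensors only demonstrate the expected layouts, not a real quantized checkpoint:

import torch
from vllm._C import ops  # compiled CUDA extension built by setup.py below

K, N = 4096, 4096          # illustrative input/output feature sizes
TILE, PACK = 16, 32 // 4   # Marlin tile size and 4-bit pack factor (see marlin.py below)

x = torch.randn(8, K, dtype=torch.float16, device="cuda")
qweight = torch.empty(K // TILE, N * TILE // PACK, dtype=torch.int32, device="cuda")
scales = torch.empty(K // 128, N, dtype=torch.float16, device="cuda")  # group_size = 128
output = torch.empty(8, N, dtype=torch.float16, device="cuda")
workspace = torch.zeros(256, dtype=torch.int, device="cuda")  # MAX_SMS ints, placed on GPU here

# Positional call mirroring the declaration:
# marlin_gemm(input, weights, output, scales, workspace)
ops.marlin_gemm(x, qweight, output, scales, workspace)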
837 changes: 837 additions & 0 deletions csrc/quantization/marlin/marlin_cuda_kernel.cu

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions setup.py
@@ -226,6 +226,8 @@ def get_torch_arch_list() -> Set[str]:

if _is_cuda():
vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
print("\n\n HERE \n\n")
vllm_extension_sources.append("csrc/quantization/marlin/marlin_cuda_kernel.cu")

vllm_extension = CUDAExtension(
name="vllm._C",
11 changes: 6 additions & 5 deletions vllm/config.py
@@ -144,8 +144,8 @@ def _verify_tokenizer_mode(self) -> None:
self.tokenizer_mode = tokenizer_mode

def _verify_quantization(self) -> None:
supported_quantization = ["awq", "gptq", "squeezellm"]
rocm_not_supported_quantization = ["awq"]
supported_quantization = ["awq", "gptq", "squeezellm", "marlin"]
rocm_not_supported_quantization = ["awq", "marlin"]
if self.quantization is not None:
self.quantization = self.quantization.lower()

@@ -172,9 +172,10 @@ def _verify_quantization(self) -> None:
raise ValueError(
f"{self.quantization} quantization is currently not supported "
f"in ROCm.")
logger.warning(f"{self.quantization} quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.")
if self.quantization != "marlin":
logger.warning(f"{self.quantization} quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.")

def _verify_cuda_graph(self) -> None:
if self.max_context_len_to_capture is None:
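With "marlin" accepted by _verify_quantization, a Marlin-format checkpoint can be loaded through the normal vLLM entry point. A hedged usage sketch; the model id is a placeholder for any checkpoint serialized in Marlin format with a quantize_config.json:

from vllm import LLM, SamplingParams

# Placeholder model id; substitute any Marlin-serialized GPTQ checkpoint.
llm = LLM(model="org/llama-2-7b-marlin", quantization="marlin", dtype="float16")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)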
35 changes: 35 additions & 0 deletions vllm/model_executor/layers/linear.py
@@ -280,6 +280,14 @@ def weight_loader(self,
if packed_dim == output_dim:
shard_size = shard_size // param.pack_factor
shard_offset = shard_offset // param.pack_factor

# If marlin, we need to adjust the offset and size to account
# for the tiling.
marlin_tile_size = getattr(param, "tile_size", None)
if marlin_tile_size is not None:
shard_size = shard_size * marlin_tile_size
shard_offset = shard_offset * marlin_tile_size

loaded_weight_shard = loaded_weight.narrow(
output_dim, shard_offset, shard_size)
self.weight_loader(param, loaded_weight_shard, shard_id)
@@ -297,6 +305,14 @@ def weight_loader(self,
if packed_dim == output_dim:
shard_size = shard_size // param.pack_factor
shard_offset = shard_offset // param.pack_factor

# If marlin, we need to adjust the offset and size to account
# for the tiling.
marlin_tile_size = getattr(param, "tile_size", None)
if marlin_tile_size is not None:
shard_size = shard_size * marlin_tile_size
shard_offset = shard_offset * marlin_tile_size

param_data = param_data.narrow(output_dim, shard_offset,
shard_size)
start_idx = tp_rank * shard_size
@@ -376,7 +392,10 @@ def weight_loader(self,
loaded_shard_id: Optional[str] = None):
param_data = param.data
output_dim = getattr(param, "output_dim", None)

if loaded_shard_id is None:
print("--------- HERE 2")

# Loaded weight is already packed.
if output_dim is None:
assert param_data.shape == loaded_weight.shape
@@ -397,6 +416,14 @@ def weight_loader(self,
if packed_dim == output_dim:
shard_size = shard_size // param.pack_factor
shard_offset = shard_offset // param.pack_factor

# If marlin, we need to adjust the offset and size to account
# for the tiling.
marlin_tile_size = getattr(param, "tile_size", None)
if marlin_tile_size is not None:
shard_size = shard_size * marlin_tile_size
shard_offset = shard_offset * marlin_tile_size

loaded_weight_shard = loaded_weight.narrow(
output_dim, shard_offset, shard_size)
self.weight_loader(param, loaded_weight_shard, shard_id)
@@ -421,6 +448,14 @@ def weight_loader(self,
if packed_dim == output_dim:
shard_size = shard_size // param.pack_factor
shard_offset = shard_offset // param.pack_factor

# If marlin, we need to adjust the offset and size to account
# for the tiling
marlin_tile_size = getattr(param, "tile_size", None)
if marlin_tile_size is not None:
shard_size = shard_size * marlin_tile_size
shard_offset = shard_offset * marlin_tile_size

param_data = param_data.narrow(output_dim, shard_offset,
shard_size)
if loaded_shard_id == "q":
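The adjustment repeated in each hunk above follows from the Marlin weight layout created in marlin.py: the packed tensor has shape (K // 16, N * 16 // 8), so after an output-dim offset is divided by pack_factor it must be multiplied back by tile_size. A small worked sketch of that index arithmetic, with an assumed shard of 11008 output features:

PACK_FACTOR = 32 // 4  # eight 4-bit values per int32 (see marlin.py)
TILE_SIZE = 16         # Marlin tile size

shard_size, shard_offset = 11008, 11008  # assumed gate/up shard, in output features

# Packed output dim: count int32 columns instead of logical features.
shard_size //= PACK_FACTOR      # 1376
shard_offset //= PACK_FACTOR    # 1376

# Marlin tiling widens the packed dim by tile_size, so scale back up.
shard_size *= TILE_SIZE         # 22016
shard_offset *= TILE_SIZE       # 22016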
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/quantization/__init__.py
@@ -4,11 +4,13 @@
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
from vllm.model_executor.layers.quantization.marlin import MarlinConfig

_QUANTIZATION_CONFIG_REGISTRY = {
"awq": AWQConfig,
"gptq": GPTQConfig,
"squeezellm": SqueezeLLMConfig,
"marlin": MarlinConfig,
}


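Registering MarlinConfig under the "marlin" key lets the existing name-based lookup resolve it. A minimal sketch using the registry dict shown above; the values printed come from this PR's MarlinConfig defaults:

from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY

marlin_cls = _QUANTIZATION_CONFIG_REGISTRY["marlin"]
config = marlin_cls.from_config({"group_size": 128})  # as read from quantize_config.json
print(config.get_name(), config.pack_factor, config.tile_size)  # marlin 8 16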
188 changes: 188 additions & 0 deletions vllm/model_executor/layers/quantization/marlin.py
@@ -0,0 +1,188 @@
import numpy as np
from typing import Any, Dict, List, Optional

import torch
from torch.nn.parameter import Parameter

from vllm._C import ops
from vllm.model_executor.layers.linear import (LinearMethodBase,
set_weight_attrs)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)

# Essentially all reasonable GPUs have fewer than 256 SMs, so this should be safe for now.
MAX_SMS = 256
# Tile size used by the Marlin kernels.
TILE_SIZE = 16
# Eight 4-bit values packed into a 32-bit dtype.
PACK_FACTOR = 32 // 4

class MarlinConfig(QuantizationConfig):
"""Config class for Marlin.
Reference: https://github.com/IST-DASLab/marlin/tree/master
"""

def __init__(
self,
group_size: int,
) -> None:
self.group_size = group_size
# Eight 4-bit values packed into an int32.
self.pack_factor = 32 // 4
# Tile size of 16 used by Marlin.
self.tile_size = 16

# todo(rib-2): add channelwise support (-1).
if self.group_size != 128:
raise ValueError(
"Currently, only group size 128 is supported for Marlin "
f"but got {self.group_size} bits.")

def __repr__(self) -> str:
return (f"MarlinConfig(group_size={self.group_size}")

@classmethod
def get_name(cls) -> str:
return "marlin"

@classmethod
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
return [torch.half]

@classmethod
# Need to figure it out
Review comment (Collaborator), suggested change: remove the "# Need to figure it out" comment.
def get_min_capability(cls) -> int:
return 60

@classmethod
def get_config_filenames(cls) -> List[str]:
return ["quantize_config.json"]

@classmethod
def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig":
group_size = cls.get_from_keys(config, ["group_size"])
return cls(group_size)

def get_linear_method(self) -> "MarlinLinearMethod":
return MarlinLinearMethod(self)

def get_scaled_act_names(self) -> List[str]:
return []

class MarlinLinearMethod(LinearMethodBase):
"""Linear method for Marlin.
Args:
quant_config: The Marlin quantization config.
"""

def __init__(self, quant_config: MarlinConfig):
self.quant_config = quant_config
self._perm_len = 1024

def create_weights(
self,
input_size_per_partition: int,
output_size_per_partition: int,
input_size: int,
output_size: int,
params_dtype: torch.dtype,
) -> Dict[str, Any]:
del output_size # Unused.
if params_dtype != torch.float16:
raise ValueError(
f"The params dtype must be float16, but got {params_dtype}")
if input_size_per_partition % self.quant_config.group_size != 0:
raise ValueError(
"The input size is not aligned with the quantized "
"weight shape. This can be caused by too large "
"tensor parallel size.")
if output_size_per_partition % self.quant_config.pack_factor != 0:
raise ValueError(
"The output size is not aligned with the quantized "
"weight shape. This can be caused by too large "
"tensor parallel size.")
if input_size_per_partition % 128 != 0:
raise ValueError(
"The input_size_per_partition must be divisible by 128, "
f"but got {input_size_per_partition}")

if output_size_per_partition % 256 != 0:
raise ValueError(
"The output_size_per_partition must be divisible by 256, "
f"but got {output_size_per_partition}")

# check that we have at least 4 tiles horizontally in the shard
num_tiles_per_perm = self._perm_len // (self.quant_config.tile_size ** 2)
if output_size_per_partition % num_tiles_per_perm != 0:
raise ValueError(
"Each permutation group must reside on the same gpu"
)

# Quantized 4Bit weights packed into Int32.
qweight = Parameter(
torch.empty(
input_size_per_partition // self.quant_config.tile_size,
output_size_per_partition * self.quant_config.tile_size // self.quant_config.pack_factor,
device="cuda",
dtype=torch.int32
),
requires_grad=False,
)
set_weight_attrs(
qweight, {
"input_dim": 0,
"output_dim": 1,
"packed_dim": 1,
"pack_factor": self.quant_config.pack_factor,
"tile_size": TILE_SIZE,
})

# Scales in Float16.
scales = Parameter(
torch.empty(
input_size_per_partition // self.quant_config.group_size,
output_size_per_partition,
device="cuda",
dtype=params_dtype,
),
requires_grad=False,
)
set_weight_attrs(scales, {
"input_dim": None if input_size == input_size_per_partition else 0,
"output_dim": 1,
})

# Workspace for the marlin kernels.
self.workspace = torch.empty(MAX_SMS, dtype=torch.int)

return {
"B": qweight,
"s": scales,
}

def apply_weights(self,
weights: Dict[str, Any],
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
qweight = weights["B"]
scales = weights["s"]

output = torch.empty(
x.shape[:-1] + (scales.shape[1],),
dtype=x.dtype,
device=x.device
)
ops.marlin_gemm(
x.view(-1, x.shape[-1]),
qweight,
output.view(-1, output.shape[-1]),
scales,
self.workspace
)

if bias is not None:
output = output + bias
return output
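For concreteness, the shapes create_weights produces for an illustrative 4096x4096 projection on a single GPU (so input_size == input_size_per_partition) work out as in this sketch; the dimensions are assumptions for illustration, not tied to a specific model:

TILE_SIZE = 16
PACK_FACTOR = 32 // 4
GROUP_SIZE = 128

K, N = 4096, 4096  # input_size_per_partition, output_size_per_partition

qweight_shape = (K // TILE_SIZE, N * TILE_SIZE // PACK_FACTOR)  # (256, 8192) int32 weight "B"
scales_shape = (K // GROUP_SIZE, N)                             # (32, 4096) float16 scales "s"

# apply_weights: x of shape (num_tokens, 4096) float16 -> output of shape (num_tokens, 4096)
assert qweight_shape == (256, 8192) and scales_shape == (32, 4096)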