Merged
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -27,9 +27,9 @@ repos:
rev: v0.11.7
hooks:
- id: ruff
- args: [--select=F401, --fixable=F401]
- files: ^(benchmark/|docs/|examples/)
- exclude: \.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
+ args: [--select=F401,F821, --fixable=F401]
+ files: ^(benchmark/|docs/|examples/|python/sglang/)
+ exclude: __init__\.py$|\.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
- repo: https://github.com/psf/black
rev: 24.10.0
hooks:
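A note on the rule change above: F401 flags imports that are never used, while the newly added F821 flags names that are referenced but never defined. Only F401 is listed under `--fixable`, so ruff auto-removes unused imports but merely reports undefined names. The new `__init__.py` exclusion presumably avoids F401 false positives on packages that import names purely to re-export them. A minimal illustrative snippet, not taken from this repository:

```python
import os  # F401: imported but never used; `ruff --fix` would delete this line


def greet() -> str:
    # F821: `name` is referenced but never defined; reported, never auto-fixed
    return f"Hello, {name}"
```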
2 changes: 1 addition & 1 deletion python/sglang/srt/_custom_ops.py
@@ -15,7 +15,7 @@
# ROCm does not use vllm custom allreduce
if use_vllm_custom_allreduce and not is_hip():
try:
- import vllm._C
+ import vllm._C  # noqa: F401
except ImportError as e:
logger.warning("Failed to import from vllm._C with %r", e)
else:
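A recurring pattern in this and several hunks below: the import exists only for its side effect, since importing the module loads a compiled extension and registers its custom ops, and no name from the module is referenced afterwards. Without the suppression, the now-broader F401 fix would delete a load-bearing line. A minimal sketch of the pattern, using a hypothetical extension module name:

```python
try:
    # Imported purely for its side effect: import-time code in the module
    # loads the native extension and registers its kernels. No attribute
    # is referenced afterwards, so ruff would flag the import as unused.
    import my_native_ext  # noqa: F401  (hypothetical module)

    ext_available = True
except ImportError:
    ext_available = False
```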
1 change: 0 additions & 1 deletion python/sglang/srt/compilation/cuda_piecewise_backend.py
@@ -9,7 +9,6 @@
import torch
import torch.fx as fx

- import sglang.srt.compilation.weak_ref_tensor_jit
Collaborator Author: Not sure about this one.
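(If `weak_ref_tensor_jit` is imported only for its side effect, compiling and registering the weak-ref tensor op at import time, dropping the import would silently skip that registration; presumably that is the source of the hesitation here.)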

from sglang.srt.compilation.compilation_config import CompilationConfig
from sglang.srt.compilation.compilation_counter import compilation_counter

1 change: 0 additions & 1 deletion python/sglang/srt/configs/deepseekvl2.py
@@ -1,5 +1,4 @@
import math
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

9 changes: 2 additions & 7 deletions python/sglang/srt/configs/dots_vlm.py
@@ -1,10 +1,5 @@
from typing import Any, List, Optional, Union

- from transformers import AutoProcessor, LlamaTokenizerFast, PretrainedConfig
- from transformers.feature_extraction_utils import BatchFeature
- from transformers.image_utils import ImageInput
- from transformers.processing_utils import ProcessingKwargs, Unpack
- from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ from transformers import AutoProcessor, PretrainedConfig
+ from transformers.processing_utils import ProcessingKwargs

try:
from transformers import Qwen2_5_VLProcessor
7 changes: 1 addition & 6 deletions python/sglang/srt/configs/falcon_h1.py
@@ -14,17 +14,12 @@
# limitations under the License.
"""Falcon-H1 model configuration"""

- import enum

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging

from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
- from sglang.srt.layers.dp_attention import (
-     get_attention_tp_size,
-     get_tensor_model_parallel_world_size,
- )
+ from sglang.srt.layers.dp_attention import get_tensor_model_parallel_world_size

logger = logging.get_logger(__name__)

1 change: 0 additions & 1 deletion python/sglang/srt/configs/qwen3_next.py
@@ -21,7 +21,6 @@
from transformers.utils import logging

from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
- from sglang.srt.distributed.utils import divide
from sglang.srt.layers.dp_attention import get_attention_tp_size

logger = logging.get_logger(__name__)
2 changes: 1 addition & 1 deletion python/sglang/srt/connector/remote_instance.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

import logging
- from typing import Generator, List, Optional, Tuple
+ from typing import Generator, Optional, Tuple
from urllib.parse import urlparse

import torch
@@ -1,6 +1,6 @@
import logging
import os
- from typing import List, Optional
+ from typing import List

import torch

8 changes: 2 additions & 6 deletions python/sglang/srt/disaggregation/decode.py
@@ -25,7 +25,7 @@
from collections import deque
from dataclasses import dataclass
from http import HTTPStatus
- from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union

import torch
from torch.distributed import ProcessGroup
@@ -48,10 +48,7 @@
)
from sglang.srt.layers.dp_attention import get_attention_tp_size
from sglang.srt.managers.schedule_batch import FINISH_ABORT, RequestStage, ScheduleBatch
- from sglang.srt.mem_cache.allocator import (
-     BaseTokenToKVPoolAllocator,
-     SWATokenToKVPoolAllocator,
- )
+ from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
from sglang.srt.mem_cache.memory_pool import (
HybridLinearKVPool,
@@ -61,7 +58,6 @@
ReqToTokenPool,
SWAKVPool,
)
- from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.utils import get_int_env_var, require_mlp_sync
from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter

3 changes: 1 addition & 2 deletions python/sglang/srt/disaggregation/prefill.py
@@ -20,7 +20,6 @@
from __future__ import annotations

import logging
- import threading
import time
from collections import deque
from http import HTTPStatus
@@ -54,7 +53,7 @@
NSATokenToKVPool,
SWAKVPool,
)
- from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
+ from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
from sglang.srt.utils import (
DynamicGradMode,
broadcast_pyobj,
@@ -32,7 +32,7 @@
ops.meta_size()
else:
# Use custom allreduce from sgl kernel (ROCM and TRT-LLM)
- import sgl_kernel
+ import sgl_kernel  # noqa: F401
custom_ar = True
except Exception:
# For CPUs
@@ -4,7 +4,7 @@
import os
from contextlib import contextmanager
from enum import IntEnum
- from typing import Any, Callable, List, Optional, TypeVar, Union
+ from typing import Optional, Union

import torch
import torch.distributed as dist
@@ -24,7 +24,7 @@
mscclpp_is_available = False
if _is_cuda:
try:
- import sgl_kernel
+ import sgl_kernel  # noqa: F401

mscclpp_is_available = True
except:
@@ -9,7 +9,7 @@
from sglang.srt.distributed.device_communicators.all_reduce_utils import (
SYMM_MEM_ALL_REDUCE_MAX_SIZES,
)
- from sglang.srt.utils import get_device_capability, is_cuda, is_hip
+ from sglang.srt.utils import is_cuda, is_hip

try:
import torch.distributed._symmetric_memory as torch_symm_mem
1 change: 0 additions & 1 deletion python/sglang/srt/distributed/naive_distributed.py
@@ -1,5 +1,4 @@
import base64
import os
import pickle
import time
from pathlib import Path
1 change: 0 additions & 1 deletion python/sglang/srt/entrypoints/context.py
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copied from vLLM
- import json
import logging
from abc import ABC, abstractmethod
from typing import Union
1 change: 0 additions & 1 deletion python/sglang/srt/entrypoints/harmony_utils.py
@@ -3,7 +3,6 @@
# Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
# Slight differences in processing chat messages
import datetime
- import json
from collections.abc import Iterable
from typing import Literal, Optional, Union

1 change: 0 additions & 1 deletion python/sglang/srt/entrypoints/http_server.py
@@ -19,7 +19,6 @@

import asyncio
import dataclasses
- import json
import logging
import multiprocessing as multiprocessing
import os
8 changes: 1 addition & 7 deletions python/sglang/srt/entrypoints/http_server_engine.py
@@ -1,15 +1,9 @@
- import copy
- import dataclasses
- import multiprocessing
- import pickle
- import threading
- import time
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import List, Optional, Tuple

import pybase64
import requests
import torch
import torch.distributed as dist

from sglang.srt.entrypoints.EngineBase import EngineBase
from sglang.srt.entrypoints.http_server import launch_server
2 changes: 0 additions & 2 deletions python/sglang/srt/eplb/eplb_algorithms/deepseek.py
@@ -3,8 +3,6 @@

import torch

- from sglang.srt.utils import get_bool_env_var


def balanced_packing(
weight: torch.Tensor, num_packs: int
6 changes: 1 addition & 5 deletions python/sglang/srt/function_call/glm4_moe_detector.py
@@ -6,11 +6,7 @@

from sglang.srt.entrypoints.openai.protocol import Tool
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
- from sglang.srt.function_call.core_types import (
-     StreamingParseResult,
-     StructureInfo,
-     _GetInfoFunc,
- )
+ from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc
from sglang.srt.function_call.ebnf_composer import EBNFComposer

logger = logging.getLogger(__name__)
2 changes: 0 additions & 2 deletions python/sglang/srt/function_call/json_array_parser.py
@@ -1,5 +1,3 @@
- import json
- import re
from typing import List

from sglang.srt.entrypoints.openai.protocol import Tool
1 change: 0 additions & 1 deletion python/sglang/srt/function_call/utils.py
@@ -1,4 +1,3 @@
- import json
from json import JSONDecodeError, JSONDecoder
from json.decoder import WHITESPACE
from typing import Any, List, Literal, Optional, Tuple, Union
2 changes: 1 addition & 1 deletion python/sglang/srt/grpc/compile_proto.py
@@ -70,7 +70,7 @@ def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> b

# Check if grpc_tools is available
try:
- import grpc_tools.protoc
+ import grpc_tools.protoc  # noqa: F401
except ImportError:
print("Error: grpcio-tools not installed")
print(
1 change: 0 additions & 1 deletion python/sglang/srt/grpc/grpc_request_manager.py
@@ -27,7 +27,6 @@
TokenizedEmbeddingReqInput,
TokenizedGenerateReqInput,
)
- from sglang.srt.managers.scheduler import is_health_check_generate_req
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import get_zmq_socket, kill_process_tree
from sglang.utils import get_exception_traceback
5 changes: 4 additions & 1 deletion python/sglang/srt/layers/activation.py
@@ -380,4 +380,7 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
logger.info(
"sgl-kernel is not available on Non-NV, Non-AMD platforms or Non-AMX CPUs. Fallback to other kernel libraries."
)
- from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul
+ from vllm.model_executor.layers.activation import (  # noqa: F401
+     GeluAndMul,
+     SiluAndMul,
+ )
1 change: 0 additions & 1 deletion python/sglang/srt/layers/attention/ascend_backend.py
@@ -20,7 +20,6 @@
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.model_executor.model_runner import ModelRunner

- import os

import numpy as np

2 changes: 1 addition & 1 deletion python/sglang/srt/layers/attention/base_attn_backend.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, Optional, Union
+ from typing import TYPE_CHECKING, Optional

import torch

1 change: 0 additions & 1 deletion python/sglang/srt/layers/attention/fla/chunk.py
@@ -2,7 +2,6 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

- import warnings
from typing import Optional

import torch
2 changes: 1 addition & 1 deletion python/sglang/srt/layers/attention/fla/chunk_o.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

- from typing import Optional, Tuple
+ from typing import Optional

import torch
import triton
2 changes: 0 additions & 2 deletions python/sglang/srt/layers/attention/fla/index.py
@@ -3,9 +3,7 @@
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

import torch
- import torch.nn.functional as F
import triton
- import triton.language as tl

from sglang.srt.layers.attention.fla.utils import tensor_cache

@@ -5,7 +5,6 @@
# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.

- import math

import torch
import torch.nn.functional as F
2 changes: 0 additions & 2 deletions python/sglang/srt/layers/attention/fla/wy_fast.py
@@ -9,8 +9,6 @@
import triton.language as tl

from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
- from sglang.srt.layers.attention.fla.op import safe_exp
- from sglang.srt.layers.attention.fla.utils import check_shared_mem


@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
1 change: 0 additions & 1 deletion python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -50,7 +50,6 @@
fast_decode_plan,
)
from flashinfer.cascade import merge_state
- from flashinfer.decode import _get_range_buf, get_seq_lens


class WrapperDispatch(Enum):
@@ -1,4 +1,4 @@
- from typing import Optional, Union
+ from typing import Optional

import torch

@@ -1,9 +1,6 @@
from dataclasses import astuple, dataclass
- from functools import lru_cache
- from typing import Optional, Union

import torch
- import torch.nn.functional as F

from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule
2 changes: 1 addition & 1 deletion python/sglang/srt/layers/attention/intel_amx_backend.py
@@ -14,7 +14,7 @@

class IntelAMXAttnBackend(AttentionBackend):
def __init__(self, model_runner: ModelRunner):
- import sgl_kernel
+ import sgl_kernel  # noqa: F401

super().__init__()
self.forward_metadata = None
@@ -4,7 +4,6 @@

from typing import List, Optional, Union

- import numpy as np
import torch
import triton
import triton.language as tl
@@ -10,7 +10,6 @@

import torch
import triton
- import triton.language as tl
from einops import rearrange
from packaging import version

@@ -13,7 +13,7 @@ def is_mla_preprocess_enabled() -> bool:


if is_mla_preprocess_enabled():
- import sgl_kernel_npu
Collaborator Author: Not sure about this one.
Member: we need to keep this
Collaborator Author: Added back in 17b44c5

+ import sgl_kernel_npu  # noqa: F401
import torch_npu

torch.npu.config.allow_internal_format = True
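The thread above resolves the same way as the other side-effect imports: `sgl_kernel_npu` looks unused to ruff, but importing it presumably registers the NPU kernels that this MLA-preprocess path relies on, so the line is restored with a `# noqa: F401` suppression instead of being deleted.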