diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 04eb1ecc304..1e09712abb7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,9 +27,9 @@ repos:
     rev: v0.11.7
     hooks:
       - id: ruff
-        args: [--select=F401, --fixable=F401]
-        files: ^(benchmark/|docs/|examples/)
-        exclude: \.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
+        args: [--select=F401,F821, --fixable=F401]
+        files: ^(benchmark/|docs/|examples/|python/sglang/)
+        exclude: __init__\.py$|\.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
   - repo: https://github.com/psf/black
     rev: 24.10.0
     hooks:
diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py
index 5ed175312c9..de47707c18a 100644
--- a/python/sglang/srt/_custom_ops.py
+++ b/python/sglang/srt/_custom_ops.py
@@ -15,7 +15,7 @@
 # ROCm does not use vllm custom allreduce
 if use_vllm_custom_allreduce and not is_hip():
     try:
-        import vllm._C
+        import vllm._C  # noqa: F401
     except ImportError as e:
         logger.warning("Failed to import from vllm._C with %r", e)
 else:
diff --git a/python/sglang/srt/compilation/cuda_piecewise_backend.py b/python/sglang/srt/compilation/cuda_piecewise_backend.py
index 9f4b8cc8e8a..44e3803ff5d 100644
--- a/python/sglang/srt/compilation/cuda_piecewise_backend.py
+++ b/python/sglang/srt/compilation/cuda_piecewise_backend.py
@@ -9,7 +9,6 @@
 import torch
 import torch.fx as fx
 
-import sglang.srt.compilation.weak_ref_tensor_jit
 from sglang.srt.compilation.compilation_config import CompilationConfig
 from sglang.srt.compilation.compilation_counter import compilation_counter
diff --git a/python/sglang/srt/configs/deepseekvl2.py b/python/sglang/srt/configs/deepseekvl2.py
index bcb0afe5ae7..9621f058bf6 100644
--- a/python/sglang/srt/configs/deepseekvl2.py
+++ b/python/sglang/srt/configs/deepseekvl2.py
@@ -1,5 +1,4 @@
 import math
-import os
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
diff --git a/python/sglang/srt/configs/dots_vlm.py b/python/sglang/srt/configs/dots_vlm.py
index 155d6ee47c1..dc921582ccf 100644
--- a/python/sglang/srt/configs/dots_vlm.py
+++ b/python/sglang/srt/configs/dots_vlm.py
@@ -1,10 +1,5 @@
-from typing import Any, List, Optional, Union
-
-from transformers import AutoProcessor, LlamaTokenizerFast, PretrainedConfig
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput
-from transformers.processing_utils import ProcessingKwargs, Unpack
-from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers import AutoProcessor, PretrainedConfig
+from transformers.processing_utils import ProcessingKwargs
 
 try:
     from transformers import Qwen2_5_VLProcessor
diff --git a/python/sglang/srt/configs/falcon_h1.py b/python/sglang/srt/configs/falcon_h1.py
index d323b056db2..b8869b4ffa3 100644
--- a/python/sglang/srt/configs/falcon_h1.py
+++ b/python/sglang/srt/configs/falcon_h1.py
@@ -14,17 +14,12 @@
 # limitations under the License.
"""Falcon-H1 model configuration""" -import enum from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_rope_utils import rope_config_validation from transformers.utils import logging from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape -from sglang.srt.layers.dp_attention import ( - get_attention_tp_size, - get_tensor_model_parallel_world_size, -) +from sglang.srt.layers.dp_attention import get_tensor_model_parallel_world_size logger = logging.get_logger(__name__) diff --git a/python/sglang/srt/configs/qwen3_next.py b/python/sglang/srt/configs/qwen3_next.py index 09c9b5a1b3e..630227a2c62 100644 --- a/python/sglang/srt/configs/qwen3_next.py +++ b/python/sglang/srt/configs/qwen3_next.py @@ -21,7 +21,6 @@ from transformers.utils import logging from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape -from sglang.srt.distributed.utils import divide from sglang.srt.layers.dp_attention import get_attention_tp_size logger = logging.get_logger(__name__) diff --git a/python/sglang/srt/connector/remote_instance.py b/python/sglang/srt/connector/remote_instance.py index e1f00037f8c..0a4e67cfd2f 100644 --- a/python/sglang/srt/connector/remote_instance.py +++ b/python/sglang/srt/connector/remote_instance.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Generator, List, Optional, Tuple +from typing import Generator, Optional, Tuple from urllib.parse import urlparse import torch diff --git a/python/sglang/srt/disaggregation/ascend/transfer_engine.py b/python/sglang/srt/disaggregation/ascend/transfer_engine.py index a1fe58ce605..a701838b6a6 100644 --- a/python/sglang/srt/disaggregation/ascend/transfer_engine.py +++ b/python/sglang/srt/disaggregation/ascend/transfer_engine.py @@ -1,6 +1,6 @@ import logging import os -from typing import List, Optional +from typing import List import torch diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py index 45589ec51fb..5e05cdd7408 100644 --- a/python/sglang/srt/disaggregation/decode.py +++ b/python/sglang/srt/disaggregation/decode.py @@ -25,7 +25,7 @@ from collections import deque from dataclasses import dataclass from http import HTTPStatus -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union import torch from torch.distributed import ProcessGroup @@ -48,10 +48,7 @@ ) from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.managers.schedule_batch import FINISH_ABORT, RequestStage, ScheduleBatch -from sglang.srt.mem_cache.allocator import ( - BaseTokenToKVPoolAllocator, - SWATokenToKVPoolAllocator, -) +from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache from sglang.srt.mem_cache.memory_pool import ( HybridLinearKVPool, @@ -61,7 +58,6 @@ ReqToTokenPool, SWAKVPool, ) -from sglang.srt.model_executor.forward_batch_info import ForwardMode from sglang.srt.utils import get_int_env_var, require_mlp_sync from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py index 23cd0dd1754..86ef0498fc5 100644 --- a/python/sglang/srt/disaggregation/prefill.py +++ b/python/sglang/srt/disaggregation/prefill.py @@ -20,7 +20,6 @@ from __future__ import annotations import logging -import threading 
 import time
 from collections import deque
 from http import HTTPStatus
@@ -54,7 +53,7 @@
     NSATokenToKVPool,
     SWAKVPool,
 )
-from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
+from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
 from sglang.srt.utils import (
     DynamicGradMode,
     broadcast_pyobj,
diff --git a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
index bb7128206a5..72668bf2e26 100644
--- a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
+++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
@@ -32,7 +32,7 @@
         ops.meta_size()
     else:
         # Use custom allreduce from sgl kernel (ROCM and TRT-LLM)
-        import sgl_kernel
+        import sgl_kernel  # noqa: F401
 
     custom_ar = True
 except Exception:
     # For CPUs
diff --git a/python/sglang/srt/distributed/device_communicators/pymscclpp.py b/python/sglang/srt/distributed/device_communicators/pymscclpp.py
index 78269ed05a3..5d7511c2c2a 100644
--- a/python/sglang/srt/distributed/device_communicators/pymscclpp.py
+++ b/python/sglang/srt/distributed/device_communicators/pymscclpp.py
@@ -4,7 +4,7 @@
 import os
 from contextlib import contextmanager
 from enum import IntEnum
-from typing import Any, Callable, List, Optional, TypeVar, Union
+from typing import Optional, Union
 
 import torch
 import torch.distributed as dist
@@ -24,7 +24,7 @@
 mscclpp_is_available = False
 if _is_cuda:
     try:
-        import sgl_kernel
+        import sgl_kernel  # noqa: F401
 
         mscclpp_is_available = True
     except:
diff --git a/python/sglang/srt/distributed/device_communicators/symm_mem.py b/python/sglang/srt/distributed/device_communicators/symm_mem.py
index 0d69a33a28f..48e20627e90 100644
--- a/python/sglang/srt/distributed/device_communicators/symm_mem.py
+++ b/python/sglang/srt/distributed/device_communicators/symm_mem.py
@@ -9,7 +9,7 @@
 from sglang.srt.distributed.device_communicators.all_reduce_utils import (
     SYMM_MEM_ALL_REDUCE_MAX_SIZES,
 )
-from sglang.srt.utils import get_device_capability, is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip
 
 try:
     import torch.distributed._symmetric_memory as torch_symm_mem
diff --git a/python/sglang/srt/distributed/naive_distributed.py b/python/sglang/srt/distributed/naive_distributed.py
index 61165d90c05..b340ff44d6e 100644
--- a/python/sglang/srt/distributed/naive_distributed.py
+++ b/python/sglang/srt/distributed/naive_distributed.py
@@ -1,5 +1,4 @@
 import base64
-import os
 import pickle
 import time
 from pathlib import Path
diff --git a/python/sglang/srt/entrypoints/context.py b/python/sglang/srt/entrypoints/context.py
index 9314083b4c1..972c0f4f3ca 100644
--- a/python/sglang/srt/entrypoints/context.py
+++ b/python/sglang/srt/entrypoints/context.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copied from vLLM
-import json
 import logging
 from abc import ABC, abstractmethod
 from typing import Union
diff --git a/python/sglang/srt/entrypoints/harmony_utils.py b/python/sglang/srt/entrypoints/harmony_utils.py
index ad6350d165f..68bbbf09467 100644
--- a/python/sglang/srt/entrypoints/harmony_utils.py
+++ b/python/sglang/srt/entrypoints/harmony_utils.py
@@ -3,7 +3,6 @@
 # Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
 # Slight differences in processing chat messages
 import datetime
-import json
 from collections.abc import Iterable
 from typing import Literal, Optional, Union
diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py
index 335be026d09..00fe4ca17e7 100644
--- a/python/sglang/srt/entrypoints/http_server.py
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -19,7 +19,6 @@
 
 import asyncio
 import dataclasses
-import json
 import logging
 import multiprocessing as multiprocessing
 import os
diff --git a/python/sglang/srt/entrypoints/http_server_engine.py b/python/sglang/srt/entrypoints/http_server_engine.py
index d1db80d656f..9ab665a05a7 100644
--- a/python/sglang/srt/entrypoints/http_server_engine.py
+++ b/python/sglang/srt/entrypoints/http_server_engine.py
@@ -1,15 +1,9 @@
-import copy
-import dataclasses
 import multiprocessing
-import pickle
-import threading
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple
 
-import pybase64
 import requests
 import torch
-import torch.distributed as dist
 
 from sglang.srt.entrypoints.EngineBase import EngineBase
 from sglang.srt.entrypoints.http_server import launch_server
diff --git a/python/sglang/srt/eplb/eplb_algorithms/deepseek.py b/python/sglang/srt/eplb/eplb_algorithms/deepseek.py
index 180ccdee452..34bbc491027 100644
--- a/python/sglang/srt/eplb/eplb_algorithms/deepseek.py
+++ b/python/sglang/srt/eplb/eplb_algorithms/deepseek.py
@@ -3,8 +3,6 @@
 
 import torch
 
-from sglang.srt.utils import get_bool_env_var
-
 
 def balanced_packing(
     weight: torch.Tensor, num_packs: int
diff --git a/python/sglang/srt/function_call/glm4_moe_detector.py b/python/sglang/srt/function_call/glm4_moe_detector.py
index 845b5d41fd6..301d0e0dedc 100644
--- a/python/sglang/srt/function_call/glm4_moe_detector.py
+++ b/python/sglang/srt/function_call/glm4_moe_detector.py
@@ -6,11 +6,7 @@
 
 from sglang.srt.entrypoints.openai.protocol import Tool
 from sglang.srt.function_call.base_format_detector import BaseFormatDetector
-from sglang.srt.function_call.core_types import (
-    StreamingParseResult,
-    StructureInfo,
-    _GetInfoFunc,
-)
+from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc
 from sglang.srt.function_call.ebnf_composer import EBNFComposer
 
 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/function_call/json_array_parser.py b/python/sglang/srt/function_call/json_array_parser.py
index 5144cb83b7d..6d6bffc996c 100644
--- a/python/sglang/srt/function_call/json_array_parser.py
+++ b/python/sglang/srt/function_call/json_array_parser.py
@@ -1,5 +1,3 @@
-import json
-import re
 from typing import List
 
 from sglang.srt.entrypoints.openai.protocol import Tool
diff --git a/python/sglang/srt/function_call/utils.py b/python/sglang/srt/function_call/utils.py
index 5ad3f6e89a0..d85e5e6c030 100644
--- a/python/sglang/srt/function_call/utils.py
+++ b/python/sglang/srt/function_call/utils.py
@@ -1,4 +1,3 @@
-import json
 from json import JSONDecodeError, JSONDecoder
 from json.decoder import WHITESPACE
 from typing import Any, List, Literal, Optional, Tuple, Union
diff --git a/python/sglang/srt/grpc/compile_proto.py b/python/sglang/srt/grpc/compile_proto.py
index 7aa145075c9..c2c4c0aa64f 100755
--- a/python/sglang/srt/grpc/compile_proto.py
+++ b/python/sglang/srt/grpc/compile_proto.py
@@ -70,7 +70,7 @@ def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> b
 
     # Check if grpc_tools is available
     try:
-        import grpc_tools.protoc
+        import grpc_tools.protoc  # noqa: F401
     except ImportError:
         print("Error: grpcio-tools not installed")
         print(
diff --git a/python/sglang/srt/grpc/grpc_request_manager.py b/python/sglang/srt/grpc/grpc_request_manager.py
index a8acb4bc411..81845388b02 100644
--- a/python/sglang/srt/grpc/grpc_request_manager.py
+++ b/python/sglang/srt/grpc/grpc_request_manager.py
@@ -27,7 +27,6 @@
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
 )
-from sglang.srt.managers.scheduler import is_health_check_generate_req
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import get_zmq_socket, kill_process_tree
 from sglang.utils import get_exception_traceback
diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py
index 5dc48821adc..f9bb6d6f57d 100644
--- a/python/sglang/srt/layers/activation.py
+++ b/python/sglang/srt/layers/activation.py
@@ -380,4 +380,7 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
     logger.info(
         "sgl-kernel is not available on Non-NV, Non-AMD platforms or Non-AMX CPUs. Fallback to other kernel libraries."
     )
-    from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul
+    from vllm.model_executor.layers.activation import (  # noqa: F401
+        GeluAndMul,
+        SiluAndMul,
+    )
diff --git a/python/sglang/srt/layers/attention/ascend_backend.py b/python/sglang/srt/layers/attention/ascend_backend.py
index bc118d6c505..f795c65d0f5 100644
--- a/python/sglang/srt/layers/attention/ascend_backend.py
+++ b/python/sglang/srt/layers/attention/ascend_backend.py
@@ -20,7 +20,6 @@
     from sglang.srt.layers.radix_attention import RadixAttention
     from sglang.srt.model_executor.model_runner import ModelRunner
 
-import os
 
 import numpy as np
diff --git a/python/sglang/srt/layers/attention/base_attn_backend.py b/python/sglang/srt/layers/attention/base_attn_backend.py
index d0ab5ca82b7..dcbf1c8fdf1 100644
--- a/python/sglang/srt/layers/attention/base_attn_backend.py
+++ b/python/sglang/srt/layers/attention/base_attn_backend.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
diff --git a/python/sglang/srt/layers/attention/fla/chunk.py b/python/sglang/srt/layers/attention/fla/chunk.py
index a48a9e649f3..21d93ac0044 100644
--- a/python/sglang/srt/layers/attention/fla/chunk.py
+++ b/python/sglang/srt/layers/attention/fla/chunk.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
-import warnings
 from typing import Optional
 
 import torch
diff --git a/python/sglang/srt/layers/attention/fla/chunk_o.py b/python/sglang/srt/layers/attention/fla/chunk_o.py
index d672c646beb..b2ae826f760 100644
--- a/python/sglang/srt/layers/attention/fla/chunk_o.py
+++ b/python/sglang/srt/layers/attention/fla/chunk_o.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 import triton
diff --git a/python/sglang/srt/layers/attention/fla/index.py b/python/sglang/srt/layers/attention/fla/index.py
index 754b9871462..31b2e524e2a 100644
--- a/python/sglang/srt/layers/attention/fla/index.py
+++ b/python/sglang/srt/layers/attention/fla/index.py
@@ -3,9 +3,7 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
 import torch
-import torch.nn.functional as F
 import triton
-import triton.language as tl
 
 from sglang.srt.layers.attention.fla.utils import tensor_cache
 
diff --git a/python/sglang/srt/layers/attention/fla/layernorm_gated.py b/python/sglang/srt/layers/attention/fla/layernorm_gated.py
index 50b7244c6e9..b7dd39b1292 100644
--- a/python/sglang/srt/layers/attention/fla/layernorm_gated.py
+++ b/python/sglang/srt/layers/attention/fla/layernorm_gated.py
@@ -5,7 +5,6 @@
 # This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
 # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
 
-import math
 
 import torch
 import torch.nn.functional as F
diff --git a/python/sglang/srt/layers/attention/fla/wy_fast.py b/python/sglang/srt/layers/attention/fla/wy_fast.py
index d51500eb459..fa39312df21 100644
--- a/python/sglang/srt/layers/attention/fla/wy_fast.py
+++ b/python/sglang/srt/layers/attention/fla/wy_fast.py
@@ -9,8 +9,6 @@
 import triton.language as tl
 
 from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
-from sglang.srt.layers.attention.fla.op import safe_exp
-from sglang.srt.layers.attention.fla.utils import check_shared_mem
 
 
 @triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
index ab4398b0b4d..33ff82ca6b2 100644
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -50,7 +50,6 @@
         fast_decode_plan,
     )
     from flashinfer.cascade import merge_state
-    from flashinfer.decode import _get_range_buf, get_seq_lens
 
 
 class WrapperDispatch(Enum):
diff --git a/python/sglang/srt/layers/attention/hybrid_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_attn_backend.py
index 7a78fd4d1c6..4f1439c264a 100644
--- a/python/sglang/srt/layers/attention/hybrid_attn_backend.py
+++ b/python/sglang/srt/layers/attention/hybrid_attn_backend.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 
diff --git a/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py
index 7f2e90255fd..5ea9e6c8e43 100644
--- a/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py
+++ b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py
@@ -1,9 +1,6 @@
-from dataclasses import astuple, dataclass
-from functools import lru_cache
 from typing import Optional, Union
 
 import torch
-import torch.nn.functional as F
 
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule
diff --git a/python/sglang/srt/layers/attention/intel_amx_backend.py b/python/sglang/srt/layers/attention/intel_amx_backend.py
index 39e5c7428ad..4b2974c44e0 100644
--- a/python/sglang/srt/layers/attention/intel_amx_backend.py
+++ b/python/sglang/srt/layers/attention/intel_amx_backend.py
@@ -14,7 +14,7 @@ class IntelAMXAttnBackend(AttentionBackend):
     def __init__(self, model_runner: ModelRunner):
-        import sgl_kernel
+        import sgl_kernel  # noqa: F401
 
         super().__init__()
         self.forward_metadata = None
diff --git a/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py b/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py
index dbd9dac347a..88a65ddd0a1 100644
--- a/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py
+++ b/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py
@@ -4,7 +4,6 @@
 
 from typing import List, Optional, Union
 
-import numpy as np
 import torch
 import triton
 import triton.language as tl
diff --git a/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py b/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py
index d27fc562ea7..6e2e74752ba 100644
--- a/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py
+++ b/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py
@@ -10,7 +10,6 @@
 
 import torch
 import triton
-import triton.language as tl
 from einops import rearrange
 from packaging import version
 
diff --git a/python/sglang/srt/layers/attention/npu_ops/mla_preprocess.py b/python/sglang/srt/layers/attention/npu_ops/mla_preprocess.py
index 06a55254529..76f802bd291 100644
--- a/python/sglang/srt/layers/attention/npu_ops/mla_preprocess.py
+++ b/python/sglang/srt/layers/attention/npu_ops/mla_preprocess.py
@@ -13,7 +13,7 @@ def is_mla_preprocess_enabled() -> bool:
 
 
 if is_mla_preprocess_enabled():
-    import sgl_kernel_npu
+    import sgl_kernel_npu  # noqa: F401
    import torch_npu
 
     torch.npu.config.allow_internal_format = True
diff --git a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
index ebb5b85da26..b9f399899e2 100644
--- a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
+++ b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 import torch.nn.functional as F
@@ -547,7 +547,7 @@ def forward_npu(
         forward_batch: ForwardBatch,
         layer_id: int,
     ) -> torch.Tensor:
-        import custom_ops
+        import custom_ops  # noqa: F401
         import torch_npu
 
         from sglang.srt.layers.dp_attention import (
diff --git a/python/sglang/srt/layers/attention/nsa_backend.py b/python/sglang/srt/layers/attention/nsa_backend.py
index 74d293fd310..6ec4652f415 100644
--- a/python/sglang/srt/layers/attention/nsa_backend.py
+++ b/python/sglang/srt/layers/attention/nsa_backend.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import sys
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Literal, Optional, TypeAlias
 
@@ -34,18 +33,18 @@
 
 if _is_hip:
     try:
-        from aiter import (
+        from aiter import (  # noqa: F401
             flash_attn_varlen_func,
             mha_batch_prefill_func,
             paged_attention_ragged,
         )
-        from aiter.mla import mla_decode_fwd, mla_prefill_fwd
+        from aiter.mla import mla_decode_fwd, mla_prefill_fwd  # noqa: F401
     except ImportError:
         print(
             "aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device."
         )
 else:
-    from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+    from sgl_kernel.flash_attn import flash_attn_with_kvcache
 
 
 @dataclass(frozen=True)
diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py
index a0b75780bd7..c60314ad913 100644
--- a/python/sglang/srt/layers/layernorm.py
+++ b/python/sglang/srt/layers/layernorm.py
@@ -372,4 +372,4 @@ def extra_repr(self):
     logger.info(
         "sgl-kernel layernorm implementation is not available on current platform. Fallback to other kernel libraries."
     )
-    from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
+    from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm  # noqa: F401
diff --git a/python/sglang/srt/layers/moe/cutlass_moe.py b/python/sglang/srt/layers/moe/cutlass_moe.py
index d0fb4e3ef48..870749d4799 100755
--- a/python/sglang/srt/layers/moe/cutlass_moe.py
+++ b/python/sglang/srt/layers/moe/cutlass_moe.py
@@ -116,8 +116,6 @@ def cutlass_fused_experts_fp8(
 
 if is_cuda:
     from sglang.srt.layers.quantization.fp8_kernel import (
-        per_group_transpose,
-        per_token_group_quant_fp8_hopper_moe_mn_major,
         sglang_per_token_group_quant_fp8,
     )
diff --git a/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py b/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py
index 2a84dedc4bf..800c8c83a6b 100644
--- a/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py
+++ b/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 """Cutlass W4A8 MoE kernel."""
-import logging
 from typing import Optional
 
 import torch
diff --git a/python/sglang/srt/layers/moe/ep_moe/kernels.py b/python/sglang/srt/layers/moe/ep_moe/kernels.py
index ef4262a1c1c..89bab802cf0 100644
--- a/python/sglang/srt/layers/moe/ep_moe/kernels.py
+++ b/python/sglang/srt/layers/moe/ep_moe/kernels.py
@@ -1,12 +1,9 @@
 import logging
-from typing import List, Optional
 
 import torch
 import triton
 
-from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.utils import ceil_div, dispose_tensor, is_cuda
-from sglang.utils import is_in_ci
+from sglang.srt.utils import ceil_div, is_cuda
 
 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py b/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py
index 1d37236e020..8026b1e67c5 100644
--- a/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py
+++ b/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Union
+from typing import Optional, Union
 
 import torch
 from flashinfer.cute_dsl.blockscaled_gemm import grouped_gemm_nt_masked
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
index 1ff77818421..0eb2a917036 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -43,13 +43,7 @@
 )
 
 if is_flashinfer_available():
-    from flashinfer import (
-        RoutingMethodType,
-        fp4_quantize,
-        reorder_rows_for_gated_act_gemm,
-        shuffle_matrix_a,
-        shuffle_matrix_sf_a,
-    )
+    from flashinfer import RoutingMethodType, fp4_quantize
 
 _is_hip = is_hip()
 _is_cpu_amx_available = cpu_has_amx_support()
diff --git a/python/sglang/srt/layers/moe/moe_runner/triton.py b/python/sglang/srt/layers/moe/moe_runner/triton.py
index 116fdcaa019..8c77d758043 100644
--- a/python/sglang/srt/layers/moe/moe_runner/triton.py
+++ b/python/sglang/srt/layers/moe/moe_runner/triton.py
@@ -51,7 +51,9 @@
 
 
 if _is_cuda or _is_hip:
-    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+    from sgl_kernel import (  # noqa: F401
+        moe_align_block_size as sgl_moe_align_block_size,
+    )
 
 
 @dataclass
diff --git a/python/sglang/srt/layers/moe/rocm_moe_utils.py b/python/sglang/srt/layers/moe/rocm_moe_utils.py
index 5fe2de1e584..efa6bb1bb23 100644
--- a/python/sglang/srt/layers/moe/rocm_moe_utils.py
+++ b/python/sglang/srt/layers/moe/rocm_moe_utils.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import IntEnum
-from functools import cache
 from typing import Optional
 
 import torch
diff --git a/python/sglang/srt/layers/moe/token_dispatcher/deepep.py b/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
index 618c4cf9eb1..8667d8747c5 100644
--- a/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
+++ b/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
@@ -3,7 +3,7 @@
 import logging
 from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Union
+from typing import TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Union
 
 from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
 from sglang.srt.layers.moe.token_dispatcher.base import (
diff --git a/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py b/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py
index d6d56186563..54ba8f1b562 100644
--- a/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py
+++ b/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py
@@ -22,7 +22,7 @@
 except ImportError:
     use_mooncake_ep = False
 
-from enum import Enum, IntEnum, auto
+from enum import Enum, auto
 
 import torch
 import torch.distributed as dist
diff --git a/python/sglang/srt/layers/quantization/awq.py b/python/sglang/srt/layers/quantization/awq.py
index 9cba60c2b53..d796008c888 100644
--- a/python/sglang/srt/layers/quantization/awq.py
+++ b/python/sglang/srt/layers/quantization/awq.py
@@ -3,7 +3,7 @@
 
 import logging
 import warnings
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import torch
 
diff --git a/python/sglang/srt/layers/quantization/base_config.py b/python/sglang/srt/layers/quantization/base_config.py
index 4a5b7905eee..18300517702 100644
--- a/python/sglang/srt/layers/quantization/base_config.py
+++ b/python/sglang/srt/layers/quantization/base_config.py
@@ -3,7 +3,6 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
 
 import torch
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index e2ff25e6868..3517bc5e267 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -5,7 +5,7 @@
 import enum
 import logging
 from enum import Enum
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, List
 
 import torch
 from compressed_tensors import CompressionFormat
@@ -21,14 +21,7 @@
     per_tensor_dequantize,
     replace_parameter,
 )
-from sglang.srt.utils import (
-    get_bool_env_var,
-    is_cpu,
-    is_cuda,
-    is_hip,
-    is_npu,
-    set_weight_attrs,
-)
+from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
@@ -49,7 +42,7 @@
     from sglang.srt.layers.moe.rocm_moe_utils import rocm_fused_experts_tkw1
 
 try:
-    import vllm
+    import vllm  # noqa: F401
 
     VLLM_AVAILABLE = True
 except ImportError:
diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
index 6a7ae00d0d4..9bb34046d51 100644
--- a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
@@ -12,7 +12,7 @@ def _compute_enable_deep_gemm():
         return False
 
     try:
-        import deep_gemm
+        import deep_gemm  # noqa: F401
     except ImportError:
         return False
 
diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
index 02945f44961..1f2f4542a94 100644
--- a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
@@ -5,7 +5,7 @@
 import torch
 
 from sglang.srt.layers.quantization.deep_gemm_wrapper import compile_utils
-from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (  # noqa: F401
     DEEPGEMM_BLACKWELL,
     DEEPGEMM_SCALE_UE8M0,
     ENABLE_JIT_DEEPGEMM,
@@ -17,7 +17,7 @@
 
 if ENABLE_JIT_DEEPGEMM:
     import deep_gemm
-    from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor
+    from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor  # noqa: F401
 
 _SANITY_CHECK = get_bool_env_var("SGLANG_DEEPGEMM_SANITY_CHECK")
diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py
index 580f103f212..bd962891663 100644
--- a/python/sglang/srt/layers/quantization/fp8_kernel.py
+++ b/python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -67,7 +67,7 @@
         raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
 else:
     try:
-        import vllm._C
+        import vllm._C  # noqa: F401
     except ImportError:
         raise ImportError("vllm is required when SGLANG_USE_AITER is set to False")
 
diff --git a/python/sglang/srt/layers/quantization/fpgemm_fp8.py b/python/sglang/srt/layers/quantization/fpgemm_fp8.py
index 5a78626ff3c..0c703010179 100644
--- a/python/sglang/srt/layers/quantization/fpgemm_fp8.py
+++ b/python/sglang/srt/layers/quantization/fpgemm_fp8.py
@@ -11,7 +11,6 @@
 from sglang.srt.layers.linear import LinearBase
 from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
 from sglang.srt.layers.quantization.base_config import (
-    FusedMoEMethodBase,
     LinearMethodBase,
     QuantizationConfig,
     QuantizeMethodBase,
@@ -28,7 +27,7 @@
     prepare_fp8_layer_for_marlin,
 )
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
-from sglang.srt.layers.quantization.utils import is_layer_skipped, replace_parameter
+from sglang.srt.layers.quantization.utils import is_layer_skipped
 from sglang.srt.utils import get_bool_env_var, is_cuda
 
 _is_cuda = is_cuda()
diff --git a/python/sglang/srt/layers/quantization/gptq.py b/python/sglang/srt/layers/quantization/gptq.py
index ccd3d46f705..be28f07f8c1 100644
--- a/python/sglang/srt/layers/quantization/gptq.py
+++ b/python/sglang/srt/layers/quantization/gptq.py
@@ -199,7 +199,6 @@ def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional[LinearMethodBase]:
         # Delay the import to avoid circular dependency
-        from sglang.srt.layers.linear import LinearBase
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 
         if isinstance(layer, FusedMoE):
diff --git a/python/sglang/srt/layers/quantization/int8_kernel.py b/python/sglang/srt/layers/quantization/int8_kernel.py
index 9e92412ac9d..91cba1c3278 100644
--- a/python/sglang/srt/layers/quantization/int8_kernel.py
+++ b/python/sglang/srt/layers/quantization/int8_kernel.py
@@ -8,7 +8,7 @@
 import triton
 import triton.language as tl
 
-from sglang.srt.utils import get_bool_env_var, get_device_name, is_cuda
+from sglang.srt.utils import get_device_name, is_cuda
 
 _is_cuda = is_cuda()
 if _is_cuda:
diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
index d5c1db3a84c..f1c6dafb592 100755
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -1059,16 +1059,7 @@ def prepare_static_weights_for_kernel(
         intermediate_size,
         num_experts,
     ):
-        from flashinfer import (
-            RoutingMethodType,
-            e2m1_and_ufp8sf_scale_to_float,
-            fp4_quantize,
-            next_positive_power_of_2,
-            nvfp4_block_scale_interleave,
-            reorder_rows_for_gated_act_gemm,
-            shuffle_matrix_a,
-            shuffle_matrix_sf_a,
-        )
+        from flashinfer import nvfp4_block_scale_interleave
         from flashinfer.fused_moe.core import (
             _maybe_get_cached_w2_permute_indices,
             _maybe_get_cached_w3_w1_permute_indices,
diff --git a/python/sglang/srt/layers/quantization/petit.py b/python/sglang/srt/layers/quantization/petit.py
index 2c608507c9c..daac52ee2e0 100644
--- a/python/sglang/srt/layers/quantization/petit.py
+++ b/python/sglang/srt/layers/quantization/petit.py
@@ -2,7 +2,7 @@
 
 import logging
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 import regex as re
 import torch
diff --git a/python/sglang/srt/layers/quantization/quark/quark_moe.py b/python/sglang/srt/layers/quantization/quark/quark_moe.py
index d1ad13f4810..3d2d52cd22e 100644
--- a/python/sglang/srt/layers/quantization/quark/quark_moe.py
+++ b/python/sglang/srt/layers/quantization/quark/quark_moe.py
@@ -3,16 +3,16 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
-from aiter import ActivationType, QuantType, biased_grouped_topk
+from aiter import ActivationType, QuantType
 from aiter.fused_moe import fused_moe
 from aiter.utility.fp4_utils import e8m0_shuffle
 
 from sglang.srt.layers.moe import MoeRunnerConfig
 from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
-from sglang.srt.utils import get_bool_env_var, is_hip, mxfp_supported, set_weight_attrs
+from sglang.srt.utils import is_hip, set_weight_attrs
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import (
diff --git a/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
index a0787baaf0f..a8322b4963d 100644
--- a/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
+++ b/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
@@ -2,20 +2,13 @@
 
 from typing import Any, Callable, Optional
 
-import aiter
 import torch
-import torch.nn.functional as F
-from aiter.ops.gemm_op_a4w4 import gemm_a4w4
-from aiter.ops.shuffle import shuffle_weight
 from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
 from aiter.ops.triton.gemm_afp4wfp4_pre_quant_atomic import gemm_afp4wfp4_pre_quant
 from aiter.ops.triton.quant import dynamic_mxfp4_quant
-from aiter.utility import dtypes
-from aiter.utility.fp4_utils import e8m0_shuffle
 
 from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter
 from sglang.srt.layers.quantization.quark.schemes import QuarkScheme
-from sglang.srt.utils import get_bool_env_var
 
 __all__ = ["QuarkW4A4MXFP4"]
diff --git a/python/sglang/srt/layers/quantization/utils.py b/python/sglang/srt/layers/quantization/utils.py
index 63b8b6eb797..d407b95f277 100644
--- a/python/sglang/srt/layers/quantization/utils.py
+++ b/python/sglang/srt/layers/quantization/utils.py
@@ -11,7 +11,6 @@
 import torch
 
 from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-from sglang.srt.utils import is_cuda
 
 if TYPE_CHECKING:
     from sglang.srt.layers.quantization.base_config import QuantizationConfig
diff --git a/python/sglang/srt/layers/quantization/w4afp8.py b/python/sglang/srt/layers/quantization/w4afp8.py
index e97de07d799..7c5d4554a67 100644
--- a/python/sglang/srt/layers/quantization/w4afp8.py
+++ b/python/sglang/srt/layers/quantization/w4afp8.py
@@ -1,14 +1,13 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import torch
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 
-from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
-from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
+from sglang.srt.layers.linear import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.base_config import (
     FusedMoEMethodBase,
     QuantizationConfig,
@@ -17,11 +16,11 @@
 from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.utils import is_layer_skipped
-from sglang.srt.utils import is_npu, set_weight_attrs
+from sglang.srt.utils import set_weight_attrs
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe import MoeRunnerConfig
-    from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, EPMoE
+    from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE
     from sglang.srt.layers.moe.token_dispatcher import (
         CombineInput,
         DeepEPNormalOutput,
diff --git a/python/sglang/srt/layers/quantization/w8a8_int8.py b/python/sglang/srt/layers/quantization/w8a8_int8.py
index 17a79190df7..77be31163ec 100644
--- a/python/sglang/srt/layers/quantization/w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/w8a8_int8.py
@@ -1,28 +1,12 @@
 from __future__ import annotations
 
-import importlib
-import sys
 from types import MappingProxyType
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    List,
-    Mapping,
-    Optional,
-    Tuple,
-    Union,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, cast
 
 import torch
 from torch.nn.parameter import Parameter
 
-from sglang.srt.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
 from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
 from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
diff --git a/python/sglang/srt/layers/utils.py b/python/sglang/srt/layers/utils.py
index 45e15479128..e88f3a938ad 100644
--- a/python/sglang/srt/layers/utils.py
+++ b/python/sglang/srt/layers/utils.py
@@ -1,6 +1,5 @@
 import logging
 import re
-from functools import lru_cache
 
 import torch
 
diff --git a/python/sglang/srt/lora/backend/triton_backend.py b/python/sglang/srt/lora/backend/triton_backend.py
index f99e2c006c7..722915efc51 100644
--- a/python/sglang/srt/lora/backend/triton_backend.py
+++ b/python/sglang/srt/lora/backend/triton_backend.py
@@ -11,7 +11,6 @@
 )
 from sglang.srt.lora.utils import LoRABatchInfo
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.server_args import ServerArgs
 
 
 class TritonLoRABackend(BaseLoRABackend):
diff --git a/python/sglang/srt/lora/eviction_policy.py b/python/sglang/srt/lora/eviction_policy.py
index 7d1f5f91adf..d4b29612f06 100644
--- a/python/sglang/srt/lora/eviction_policy.py
+++ b/python/sglang/srt/lora/eviction_policy.py
@@ -20,7 +20,7 @@
 import time
 from abc import ABC, abstractmethod
 from collections import OrderedDict
-from typing import Any, Dict, List, Optional, Set
+from typing import Optional, Set
 
 logger = logging.getLogger(__name__)
 
diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py
index 30d3386e28d..19ff874dc1d 100644
--- a/python/sglang/srt/lora/lora_manager.py
+++ b/python/sglang/srt/lora/lora_manager.py
@@ -16,7 +16,7 @@
 # and "Punica: Multi-Tenant LoRA Serving"
 
 import logging
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional
 
 import torch
 
diff --git a/python/sglang/srt/managers/cache_controller.py b/python/sglang/srt/managers/cache_controller.py
index f36d61ee09a..b5c4aa17234 100644
--- a/python/sglang/srt/managers/cache_controller.py
+++ b/python/sglang/srt/managers/cache_controller.py
@@ -14,11 +14,10 @@
 """
 
 import logging
-import math
 import threading
 import time
-from queue import Empty, Full, PriorityQueue, Queue
-from typing import TYPE_CHECKING, List, NamedTuple, Optional, Set, Tuple
+from queue import Empty, Full, Queue
+from typing import TYPE_CHECKING, List, NamedTuple, Optional
 
 import torch
 
@@ -41,7 +40,7 @@
     get_attention_tp_size,
     is_dp_attention_enabled,
 )
-from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPool
+from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
 
 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 264d89bb9d0..a39a7a53536 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -59,11 +59,10 @@
     SWATokenToKVPoolAllocator,
 )
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
-from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
+from sglang.srt.mem_cache.chunk_cache import SWAChunkCache
 from sglang.srt.mem_cache.common import (
     alloc_for_decode,
     alloc_for_extend,
-    alloc_token_slots,
     evict_from_tree_cache,
 )
 from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
@@ -76,7 +75,6 @@
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs, get_global_server_args
 from sglang.srt.utils import flatten_nested_list
-from sglang.srt.utils.common import next_power_of_2
 
 if TYPE_CHECKING:
     from sglang.srt.configs.model_config import ModelConfig
diff --git a/python/sglang/srt/managers/scheduler_metrics_mixin.py b/python/sglang/srt/managers/scheduler_metrics_mixin.py
index 91fff9e9b36..34832f3e316 100644
--- a/python/sglang/srt/managers/scheduler_metrics_mixin.py
+++ b/python/sglang/srt/managers/scheduler_metrics_mixin.py
@@ -3,13 +3,10 @@
 import logging
 import time
 from collections import defaultdict
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
-
-import torch
+from typing import TYPE_CHECKING, List, Optional
 
 from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
 from sglang.srt.disaggregation.utils import DisaggregationMode
-from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
 from sglang.srt.managers.schedule_policy import PrefillAdder
 from sglang.srt.managers.scheduler import Req, ScheduleBatch
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 03c15fde952..3e325ca4d0b 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -16,7 +16,6 @@
 import asyncio
 import copy
 import dataclasses
-import json
 import logging
 import math
 import os
diff --git a/python/sglang/srt/managers/utils.py b/python/sglang/srt/managers/utils.py
index ccd3f0fe2d8..fa343519849 100644
--- a/python/sglang/srt/managers/utils.py
+++ b/python/sglang/srt/managers/utils.py
@@ -1,8 +1,7 @@
 from __future__ import annotations
 
 import logging
-import multiprocessing as mp
-from typing import TYPE_CHECKING, Dict, List, Optional
+from typing import TYPE_CHECKING, Optional
 
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import Req
diff --git a/python/sglang/srt/mem_cache/allocator_ascend.py b/python/sglang/srt/mem_cache/allocator_ascend.py
index 2c606187a95..4adbf592a24 100644
--- a/python/sglang/srt/mem_cache/allocator_ascend.py
+++ b/python/sglang/srt/mem_cache/allocator_ascend.py
@@ -92,7 +92,7 @@ def alloc_extend(
         )
 
         if num_new_pages_item < 200:
-            import sgl_kernel_npu
+            import sgl_kernel_npu  # noqa: F401
 
             torch.ops.npu.alloc_extend(
                 prefix_lens,
diff --git a/python/sglang/srt/mem_cache/base_prefix_cache.py b/python/sglang/srt/mem_cache/base_prefix_cache.py
index 34df996893f..fb85497c329 100644
--- a/python/sglang/srt/mem_cache/base_prefix_cache.py
+++ b/python/sglang/srt/mem_cache/base_prefix_cache.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, List, NamedTuple, Optional, Tuple
+from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Tuple
 
 import torch
 
diff --git a/python/sglang/srt/mem_cache/evict_policy.py b/python/sglang/srt/mem_cache/evict_policy.py
index ddd2ab6c31a..491d3d846be 100644
--- a/python/sglang/srt/mem_cache/evict_policy.py
+++ b/python/sglang/srt/mem_cache/evict_policy.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Tuple, Union
 
 if TYPE_CHECKING:
     from sglang.srt.mem_cache.radix_cache import TreeNode
diff --git a/python/sglang/srt/mem_cache/mamba_radix_cache.py b/python/sglang/srt/mem_cache/mamba_radix_cache.py
index 7467daa5d56..739b204ed99 100644
--- a/python/sglang/srt/mem_cache/mamba_radix_cache.py
+++ b/python/sglang/srt/mem_cache/mamba_radix_cache.py
@@ -22,7 +22,6 @@
 import heapq
 import time
 from collections import defaultdict
-from functools import partial
 from typing import TYPE_CHECKING, List, Optional, Tuple
 
 import torch
@@ -33,7 +32,6 @@
 from sglang.srt.mem_cache.radix_cache import (
     RadixKey,
     _key_match_page_size1,
-    _key_match_paged,
     get_child_key,
 )
diff --git a/python/sglang/srt/mem_cache/memory_pool_host.py b/python/sglang/srt/mem_cache/memory_pool_host.py
index f6d655af095..edfae2cfe94 100644
--- a/python/sglang/srt/mem_cache/memory_pool_host.py
+++ b/python/sglang/srt/mem_cache/memory_pool_host.py
@@ -1,7 +1,6 @@
 import abc
 import logging
 import threading
-from enum import IntEnum
 from functools import wraps
 from typing import Optional
diff --git a/python/sglang/srt/mem_cache/multimodal_cache.py b/python/sglang/srt/mem_cache/multimodal_cache.py
index 63a1775430c..42c31a8e866 100644
--- a/python/sglang/srt/mem_cache/multimodal_cache.py
+++ b/python/sglang/srt/mem_cache/multimodal_cache.py
@@ -1,6 +1,5 @@
 import logging
 from collections import OrderedDict
-from typing import Dict
 
 import torch
 
diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py
index f8259433092..9009d4e926b 100644
--- a/python/sglang/srt/mem_cache/radix_cache.py
+++ b/python/sglang/srt/mem_cache/radix_cache.py
@@ -23,7 +23,7 @@
 import time
 from collections import defaultdict
 from functools import lru_cache, partial
-from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Iterator, List, Optional, Tuple, Union
 
 import torch
 
diff --git a/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py b/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py
index 2e54e9816f9..14494d81980 100644
--- a/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py
+++ b/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py
@@ -3,20 +3,8 @@
 
 import torch
 import torch.distributed
-from aibrix_kvcache import (
-    BaseKVCacheManager,
-    GroupAwareKVCacheManager,
-    KVCacheBlockLayout,
-    KVCacheBlockSpec,
-    KVCacheConfig,
-    KVCacheMetrics,
-    KVCacheTensorSpec,
-    ModelSpec,
-    TokenListView,
-)
-from aibrix_kvcache.common.absl_logging import getLogger, log_every_n_seconds, log_if
+from aibrix_kvcache.common.absl_logging import log_every_n_seconds
 from aibrix_kvcache_storage import AibrixKVCacheStorage
-from torch.distributed import Backend, ProcessGroup
 
 from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
 from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool
diff --git a/python/sglang/srt/mem_cache/storage/eic/eic_storage.py b/python/sglang/srt/mem_cache/storage/eic/eic_storage.py
index 0acd5b65fd3..f3cc1563257 100644
--- a/python/sglang/srt/mem_cache/storage/eic/eic_storage.py
+++ b/python/sglang/srt/mem_cache/storage/eic/eic_storage.py
@@ -2,21 +2,18 @@
 import logging
 import os
 import time
-import uuid
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, List, Optional, Tuple
 
 import eic
 import torch
 import yaml
 
-from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
 from sglang.srt.mem_cache.hicache_storage import (
     HiCacheStorage,
     HiCacheStorageConfig,
     HiCacheStorageExtraInfo,
 )
-from sglang.srt.mem_cache.memory_pool_host import HostKVCache, MLATokenToKVPoolHost
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
 
 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py
index c7a485fa048..d789a205348 100644
--- a/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py
@@ -1,6 +1,5 @@
 import logging
 import os
-import threading
 from abc import ABC, abstractmethod
 from typing import List
 
diff --git a/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py b/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py
index bf31cbb3894..9fdadf6ac2b 100644
--- a/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py
+++ b/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py
@@ -2,7 +2,7 @@
 
 import logging
 import threading
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 
 import torch
diff --git a/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py b/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py
index 55b3dd976a0..8965acb4aaa 100644
--- a/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py
+++ b/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py
@@ -1,9 +1,8 @@
-import hashlib
 import logging
 import os
 import time
 import uuid
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, List, Optional, Union
 
 import torch
diff --git a/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py b/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py
index 6e3d2a900cc..b04f9e58d84 100644
--- a/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py
+++ b/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py
@@ -1,6 +1,6 @@
 import logging
 import os
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, List, Optional, Tuple, Union
 
 import torch
 
diff --git a/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py b/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py
index 3784ab91ad1..aea004a6d72 100755
--- a/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py
+++ b/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py
@@ -2,7 +2,7 @@
 
 import os
 import unittest
-from typing import List, Optional
+from typing import List
 from unittest.mock import MagicMock
 
 import torch
diff --git a/python/sglang/srt/metrics/func_timer.py b/python/sglang/srt/metrics/func_timer.py
index fbb01bac806..51d445ab44e 100644
--- a/python/sglang/srt/metrics/func_timer.py
+++ b/python/sglang/srt/metrics/func_timer.py
@@ -18,7 +18,7 @@
 import asyncio
 import time
 from functools import wraps
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, Optional
 
 from sglang.srt.metrics.utils import exponential_buckets
 
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index b1b8b7ff3fb..ef780899dd9 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -104,11 +104,7 @@
 )
 from sglang.srt.model_executor.cpu_graph_runner import CPUGraphRunner
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
-from sglang.srt.model_executor.forward_batch_info import (
-    ForwardBatch,
-    ForwardMode,
-    PPProxyTensors,
-)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_executor.npu_graph_runner import NPUGraphRunner
 from sglang.srt.model_executor.piecewise_cuda_graph_runner import (
     PiecewiseCudaGraphRunner,
diff --git a/python/sglang/srt/model_executor/npu_graph_runner.py b/python/sglang/srt/model_executor/npu_graph_runner.py
index db7dcd15943..cfd9abbcf21 100644
--- a/python/sglang/srt/model_executor/npu_graph_runner.py
+++ b/python/sglang/srt/model_executor/npu_graph_runner.py
@@ -19,10 +19,9 @@
 import threading
 from typing import TYPE_CHECKING, Optional, Union
 
-import numpy as np
 import torch
 
-from sglang.srt.configs.model_config import AttentionArch, is_deepseek_nsa
+from sglang.srt.configs.model_config import is_deepseek_nsa
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 
 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/models/bailing_moe.py b/python/sglang/srt/models/bailing_moe.py
index 2cb7d596104..e768c0a53a9 100644
--- a/python/sglang/srt/models/bailing_moe.py
+++ b/python/sglang/srt/models/bailing_moe.py
@@ -19,7 +19,7 @@
 # limitations under the License.
 """SGLang BailingMoE model."""
 import logging
-from typing import Any, Dict, Iterable, Optional, Tuple, Union
+from typing import Iterable, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
@@ -59,7 +59,6 @@
 from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
-from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
diff --git a/python/sglang/srt/models/bert.py b/python/sglang/srt/models/bert.py
index d7f3301c656..45494423fe8 100644
--- a/python/sglang/srt/models/bert.py
+++ b/python/sglang/srt/models/bert.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Any, Dict, Iterable, Optional, Set, Tuple
+from typing import Iterable, Optional, Set, Tuple
 
 import torch
 from torch import nn
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index fb9cd4f6c9f..f24923a73a5 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -183,9 +183,9 @@
         awq_dequantize_triton as awq_dequantize,
     )
 elif _is_npu:
-    import custom_ops
-    import sgl_kernel_npu
-    import torch_npu
+    import custom_ops  # noqa: F401
+    import sgl_kernel_npu  # noqa: F401
+    import torch_npu  # noqa: F401
 else:
     pass
diff --git a/python/sglang/srt/models/dots_ocr.py b/python/sglang/srt/models/dots_ocr.py
index ee48909ed18..d1f60feccb5 100644
--- a/python/sglang/srt/models/dots_ocr.py
+++ b/python/sglang/srt/models/dots_ocr.py
@@ -6,7 +6,6 @@
 
 import torch
 import torch.nn as nn
-from transformers.activations import ACT2FN
 
 from sglang.srt.configs import DotsOCRConfig
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -22,7 +21,6 @@
 from sglang.srt.models.dots_vlm_vit import DotsVisionTransformer
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM
 from sglang.srt.utils import add_prefix
-from sglang.srt.utils.hf_transformers_utils import get_processor
 
 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/models/dots_vlm.py b/python/sglang/srt/models/dots_vlm.py
index 95475058f5e..d626b1ef6ad 100644
--- a/python/sglang/srt/models/dots_vlm.py
+++ b/python/sglang/srt/models/dots_vlm.py
@@ -23,7 +23,6 @@
 from torch import nn
 
 from sglang.srt.configs.dots_vlm import DotsVLMConfig
-from sglang.srt.distributed import parallel_state
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternMultimodalTokens,
diff --git a/python/sglang/srt/models/falcon_h1.py b/python/sglang/srt/models/falcon_h1.py
index c35613bcb2e..0fab9e410d0 100644
--- a/python/sglang/srt/models/falcon_h1.py
+++ b/python/sglang/srt/models/falcon_h1.py
@@ -1,4 +1,3 @@
-import enum
 import logging
 from typing import Any, Iterable, List, Optional, Set, Tuple
 
diff --git a/python/sglang/srt/models/gemma3n_mm.py b/python/sglang/srt/models/gemma3n_mm.py
index 3c52635dd9e..86f7fd516dc 100644
--- a/python/sglang/srt/models/gemma3n_mm.py
+++ b/python/sglang/srt/models/gemma3n_mm.py
@@ -14,8 +14,7 @@
 )
 from transformers.models.auto.modeling_auto import AutoModel
 
-from sglang.srt.layers.layernorm import RMSNorm
-from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear +from sglang.srt.layers.linear import RowParallelLinear from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding diff --git a/python/sglang/srt/models/glm4_moe.py b/python/sglang/srt/models/glm4_moe.py index 35ce0c40db5..2d4bf41f12d 100644 --- a/python/sglang/srt/models/glm4_moe.py +++ b/python/sglang/srt/models/glm4_moe.py @@ -44,10 +44,8 @@ ) from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( - ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from sglang.srt.layers.logits_processor import LogitsProcessor @@ -78,16 +76,12 @@ BumpAllocator, LazyValue, add_prefix, - bind_or_assign, cpu_has_amx_support, get_bool_env_var, get_device_sm, - get_int_env_var, is_cpu, is_cuda, - is_flashinfer_available, is_hip, - is_non_idle_and_non_empty, log_info_on_rank0, use_intel_amx_backend, ) diff --git a/python/sglang/srt/models/gpt_oss.py b/python/sglang/srt/models/gpt_oss.py index 1f280f37ef9..6d80adf0fac 100644 --- a/python/sglang/srt/models/gpt_oss.py +++ b/python/sglang/srt/models/gpt_oss.py @@ -85,7 +85,7 @@ if _is_cuda: - from sgl_kernel import FusedSetKVBufferArg + from sgl_kernel import FusedSetKVBufferArg # noqa: F401 class GptOssConfig(PretrainedConfig): diff --git a/python/sglang/srt/models/hunyuan.py b/python/sglang/srt/models/hunyuan.py index c1ed2543c62..7c6fd9e48a7 100644 --- a/python/sglang/srt/models/hunyuan.py +++ b/python/sglang/srt/models/hunyuan.py @@ -12,18 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only HunYuan model compatible with HuggingFace weights.""" -import logging import re -from dataclasses import dataclass -from enum import Enum, auto -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, Optional, Tuple import torch from torch import nn from transformers import PretrainedConfig from sglang.srt.distributed import ( - get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, @@ -46,7 +42,6 @@ from sglang.srt.layers.rotary_embedding import get_rope from sglang.srt.layers.sampler import Sampler from sglang.srt.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -56,7 +51,7 @@ kv_cache_scales_loader, maybe_remap_kv_scale_name, ) -from sglang.srt.utils import add_prefix, is_hip +from sglang.srt.utils import is_hip expert_distribution_recorder = ExpertDistributionRecorder() diff --git a/python/sglang/srt/models/interns1.py b/python/sglang/srt/models/interns1.py index c7383ed2583..e896843ff02 100644 --- a/python/sglang/srt/models/interns1.py +++ b/python/sglang/srt/models/interns1.py @@ -5,7 +5,6 @@ from transformers import PretrainedConfig from sglang.srt.layers.attention import vision_utils -from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.managers.mm_utils import ( diff --git a/python/sglang/srt/models/llama_eagle3.py b/python/sglang/srt/models/llama_eagle3.py index 87ae7ade5d5..d0605d08de5 100644 --- a/python/sglang/srt/models/llama_eagle3.py +++ b/python/sglang/srt/models/llama_eagle3.py @@ -27,7 +27,7 @@ from sglang.srt.distributed import get_pp_group from sglang.srt.layers.layernorm import RMSNorm -from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear +from sglang.srt.layers.linear import QKVParallelLinear from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.vocab_parallel_embedding import ( diff --git a/python/sglang/srt/models/longcat_flash.py b/python/sglang/srt/models/longcat_flash.py index edfadfa0a1b..ffca2bad09a 100644 --- a/python/sglang/srt/models/longcat_flash.py +++ b/python/sglang/srt/models/longcat_flash.py @@ -44,9 +44,7 @@ ) from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation -from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo from sglang.srt.layers.activation import SiluAndMul -from sglang.srt.layers.amx_utils import PackWeightMethod from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes from sglang.srt.layers.dp_attention import ( get_attention_tp_rank, @@ -87,20 +85,15 @@ from sglang.srt.server_args import get_global_server_args from sglang.srt.utils import ( BumpAllocator, - LazyValue, add_prefix, bind_or_assign, cpu_has_amx_support, get_bool_env_var, get_device_sm, - get_int_env_var, is_cpu, is_cuda, - is_flashinfer_available, is_hip, - is_non_idle_and_non_empty, is_npu, - is_sm100_supported, ) _is_hip = is_hip() @@ -113,13 +106,7 @@ _device_sm = get_device_sm() if _is_cuda: - from sgl_kernel import ( - awq_dequantize, - bmm_fp8, - dsv3_fused_a_gemm, - dsv3_router_gemm, - merge_state_v2, - ) + from sgl_kernel 
import awq_dequantize elif _is_cpu and _is_cpu_amx_available: pass elif _is_hip: diff --git a/python/sglang/srt/models/longcat_flash_nextn.py b/python/sglang/srt/models/longcat_flash_nextn.py index 69bd1548d4e..a6092785acc 100644 --- a/python/sglang/srt/models/longcat_flash_nextn.py +++ b/python/sglang/srt/models/longcat_flash_nextn.py @@ -32,14 +32,10 @@ import concurrent.futures import logging -import os -from enum import IntEnum, auto -from typing import Any, Dict, Iterable, Optional, Tuple, Union +from typing import Iterable, Optional, Tuple import torch -import torch.nn.functional as F from torch import nn -from tqdm import tqdm from sglang.srt.configs import LongcatFlashConfig from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder @@ -75,7 +71,6 @@ from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP from sglang.srt.utils import ( BumpAllocator, - LazyValue, add_prefix, bind_or_assign, cpu_has_amx_support, @@ -97,13 +92,7 @@ _device_sm = get_device_sm() if _is_cuda: - from sgl_kernel import ( - awq_dequantize, - bmm_fp8, - dsv3_fused_a_gemm, - dsv3_router_gemm, - merge_state_v2, - ) + from sgl_kernel import awq_dequantize elif _is_cpu and _is_cpu_amx_available: pass elif _is_hip: diff --git a/python/sglang/srt/models/mimo.py b/python/sglang/srt/models/mimo.py index 2a89e7706e3..15aad8f41c5 100644 --- a/python/sglang/srt/models/mimo.py +++ b/python/sglang/srt/models/mimo.py @@ -1,28 +1,17 @@ # Adapted from qwen2.py -from functools import partial -from typing import Any, Dict, Iterable, Optional, Tuple +from typing import Iterable, Optional, Tuple import torch from torch import nn -from sglang.srt.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - split_tensor_along_last_dim, - tensor_model_parallel_all_gather, -) -from sglang.srt.layers.layernorm import RMSNorm -from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig -from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.rotary_embedding import get_rope from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader -from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP, Qwen2Model +from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2Model from sglang.srt.utils import add_prefix MiMoConfig = None diff --git a/python/sglang/srt/models/mimo_mtp.py b/python/sglang/srt/models/mimo_mtp.py index 89e8c02cd62..2702a637d46 100644 --- a/python/sglang/srt/models/mimo_mtp.py +++ b/python/sglang/srt/models/mimo_mtp.py @@ -1,7 +1,6 @@ # Adapted from https://github.com/vllm-project/vllm/pull/17433/files and deepseek_nextn.py -from functools import partial -from typing import Any, Dict, Iterable, Optional, Tuple +from typing import Iterable, Optional, Tuple import torch from torch import nn diff --git a/python/sglang/srt/models/minicpmo.py b/python/sglang/srt/models/minicpmo.py index 2f8271c6cbd..b83a86e221e 100644 --- a/python/sglang/srt/models/minicpmo.py +++ b/python/sglang/srt/models/minicpmo.py @@ -43,7 +43,6 @@ general_mm_embed_routine, ) from sglang.srt.managers.schedule_batch import ( - Modality, MultimodalDataItem, MultimodalInputs, 
flatten_nested_list, @@ -59,8 +58,6 @@ try: from transformers import LogitsWarper from vector_quantize_pytorch import GroupedResidualFSQ - from vocos import Vocos - from vocos.pretrained import instantiate_class _tts_deps = True except: diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 81026f9bb83..cb55848cfc7 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -24,7 +24,6 @@ from transformers import MixtralConfig from sglang.srt.distributed import ( - get_moe_expert_parallel_world_size, get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, diff --git a/python/sglang/srt/models/opt.py b/python/sglang/srt/models/opt.py index a571e8937be..bf989f6e89e 100644 --- a/python/sglang/srt/models/opt.py +++ b/python/sglang/srt/models/opt.py @@ -17,7 +17,6 @@ from typing import Optional, Union import torch -import torch.nn.functional as F from torch import nn from transformers import OPTConfig @@ -26,10 +25,8 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) -from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.linear import ( ColumnParallelLinear, - MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear, @@ -38,7 +35,7 @@ from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.utils import PPMissingLayer, get_layer_id +from sglang.srt.layers.utils import get_layer_id from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, @@ -47,7 +44,6 @@ from sglang.srt.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, - maybe_remap_kv_scale_name, ) from sglang.srt.utils import add_prefix, make_layers diff --git a/python/sglang/srt/models/phi.py b/python/sglang/srt/models/phi.py index f48895c67f3..5679bc98781 100644 --- a/python/sglang/srt/models/phi.py +++ b/python/sglang/srt/models/phi.py @@ -1,5 +1,5 @@ # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/phi.py -from typing import Iterable, Optional, Union +from typing import Iterable, Optional import torch from torch import nn diff --git a/python/sglang/srt/models/phi4mm.py b/python/sglang/srt/models/phi4mm.py index 37a638acb5c..6d00144d2db 100644 --- a/python/sglang/srt/models/phi4mm.py +++ b/python/sglang/srt/models/phi4mm.py @@ -24,7 +24,7 @@ import numpy as np import torch from torch import nn -from transformers import PretrainedConfig, SiglipVisionConfig +from transformers import PretrainedConfig from sglang.srt.layers.quantization import QuantizationConfig from sglang.srt.managers.mm_utils import ( diff --git a/python/sglang/srt/models/phimoe.py b/python/sglang/srt/models/phimoe.py index 4604aeef989..0d147c2b178 100644 --- a/python/sglang/srt/models/phimoe.py +++ b/python/sglang/srt/models/phimoe.py @@ -18,7 +18,6 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.rotary_embedding import get_rope -from sglang.srt.layers.utils import PPMissingLayer from sglang.srt.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, diff --git a/python/sglang/srt/models/pixtral.py b/python/sglang/srt/models/pixtral.py index 04a7362d8cb..209b40645a6 100644 --- 
a/python/sglang/srt/models/pixtral.py +++ b/python/sglang/srt/models/pixtral.py @@ -16,13 +16,10 @@ Using mistral-community/pixtral-12b as reference. """ -import logging -import math from typing import Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn as nn -import torch.nn.functional as F from transformers import PixtralVisionConfig, PretrainedConfig from transformers.models.pixtral.modeling_pixtral import PixtralRotaryEmbedding from transformers.models.pixtral.modeling_pixtral import ( diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index 009650411e3..206908b4900 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -15,7 +15,6 @@ # Adapted from # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/qwen.py#L1 -import time from typing import Any, Dict, Iterable, Optional, Tuple import torch diff --git a/python/sglang/srt/models/qwen2_audio.py b/python/sglang/srt/models/qwen2_audio.py index 8609758a958..98f30636aba 100644 --- a/python/sglang/srt/models/qwen2_audio.py +++ b/python/sglang/srt/models/qwen2_audio.py @@ -23,30 +23,18 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" import logging -import math -from functools import lru_cache, partial -from typing import Any, Iterable, List, Optional, Tuple, Type, TypedDict +from typing import Any, Iterable, List, Optional, Tuple import torch import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from transformers import AutoTokenizer, Qwen2AudioEncoderConfig, Qwen2Config -from transformers.activations import ACT2FN +from transformers import Qwen2AudioEncoderConfig, Qwen2Config from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioConfig from transformers.models.qwen2_audio.modeling_qwen2_audio import ( Qwen2AudioEncoder, Qwen2AudioMultiModalProjector, ) -from sglang.srt.layers.activation import QuickGELU -from sglang.srt.layers.attention.vision import VisionAttention -from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear -from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig -from sglang.srt.layers.utils import get_layer_id -from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead from sglang.srt.managers.mm_utils import ( MultiModalityDataPaddingPatternMultimodalTokens, general_mm_embed_routine, @@ -60,7 +48,6 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen2 import Qwen2ForCausalLM from sglang.srt.utils import add_prefix -from sglang.srt.utils.hf_transformers_utils import get_processor logger = logging.getLogger(__name__) diff --git a/python/sglang/srt/models/qwen2_vl.py b/python/sglang/srt/models/qwen2_vl.py index 7a42829e834..73a212f5b31 100644 --- a/python/sglang/srt/models/qwen2_vl.py +++ b/python/sglang/srt/models/qwen2_vl.py @@ -28,7 +28,6 @@ import torch import torch.nn as nn -import torch.nn.functional as F from einops import rearrange from transformers import Qwen2VLConfig from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig diff --git a/python/sglang/srt/models/qwen3_next.py b/python/sglang/srt/models/qwen3_next.py index 1b11aa30bf3..9fe9e774848 100644 --- a/python/sglang/srt/models/qwen3_next.py +++ 
b/python/sglang/srt/models/qwen3_next.py @@ -1,18 +1,12 @@ import enum import logging -from typing import Any, Dict, Iterable, Optional, Set, Tuple +from typing import Any, Iterable, Optional, Set, Tuple import torch -import torch.nn.functional as F from torch import nn from sglang.srt.configs.qwen3_next import Qwen3NextConfig -from sglang.srt.distributed import ( - divide, - get_pp_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) +from sglang.srt.distributed import divide, get_pp_group from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation from sglang.srt.layers.attention.fla.layernorm_gated import RMSNorm as RMSNormGated @@ -23,10 +17,9 @@ get_attention_tp_size, is_dp_attention_enabled, ) -from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm +from sglang.srt.layers.layernorm import GemmaRMSNorm from sglang.srt.layers.linear import ( ColumnParallelLinear, - MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear, ) diff --git a/python/sglang/srt/models/qwen3_vl.py b/python/sglang/srt/models/qwen3_vl.py index c41eb040316..be81eef62b6 100644 --- a/python/sglang/srt/models/qwen3_vl.py +++ b/python/sglang/srt/models/qwen3_vl.py @@ -20,18 +20,13 @@ import numpy as np import torch import torch.nn as nn -import torch.nn.functional as F from einops import rearrange from transformers.activations import ACT2FN from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( Qwen2_5_VisionRotaryEmbedding, ) -from sglang.srt.configs.qwen3_vl import ( - Qwen3VLConfig, - Qwen3VLTextConfig, - Qwen3VLVisionConfig, -) +from sglang.srt.configs.qwen3_vl import Qwen3VLConfig, Qwen3VLVisionConfig from sglang.srt.layers.attention.vision import VisionAttention from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear from sglang.srt.layers.logits_processor import LogitsProcessor @@ -47,11 +42,7 @@ MultimodalDataItem, MultimodalInputs, ) -from sglang.srt.model_executor.forward_batch_info import ( - ForwardBatch, - ForwardMode, - PPProxyTensors, -) +from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen3 import Qwen3Model from sglang.srt.utils import add_prefix diff --git a/python/sglang/srt/models/qwen3_vl_moe.py b/python/sglang/srt/models/qwen3_vl_moe.py index c4d56a25701..3bf0b11239f 100644 --- a/python/sglang/srt/models/qwen3_vl_moe.py +++ b/python/sglang/srt/models/qwen3_vl_moe.py @@ -25,12 +25,8 @@ get_moe_expert_parallel_world_size, get_tensor_model_parallel_rank, ) -from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE from sglang.srt.layers.quantization.base_config import QuantizationConfig -from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead -from sglang.srt.managers.mm_utils import general_mm_embed_routine -from sglang.srt.managers.schedule_batch import MultimodalDataItem from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen3_moe import Qwen3MoeModel diff --git a/python/sglang/srt/models/roberta.py b/python/sglang/srt/models/roberta.py index 209be1296b5..9fad5cfa3cc 100644 --- a/python/sglang/srt/models/roberta.py +++ b/python/sglang/srt/models/roberta.py @@ -1,6 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 -import itertools from typing import Iterable, Optional, Tuple import torch diff --git a/python/sglang/srt/models/sarashina2_vision.py b/python/sglang/srt/models/sarashina2_vision.py index eae34134923..f58908b5d15 100644 --- a/python/sglang/srt/models/sarashina2_vision.py +++ b/python/sglang/srt/models/sarashina2_vision.py @@ -17,7 +17,6 @@ from typing import Iterable, List, Optional, Tuple import torch -import torch.nn.functional as F from torch import nn from transformers import LlamaConfig diff --git a/python/sglang/srt/models/step3_vl.py b/python/sglang/srt/models/step3_vl.py index 14d277f9f38..5a9e74ab622 100644 --- a/python/sglang/srt/models/step3_vl.py +++ b/python/sglang/srt/models/step3_vl.py @@ -1,8 +1,7 @@ import logging import math -from collections.abc import Iterable from math import sqrt -from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, TypedDict, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple import torch from torch import nn diff --git a/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py b/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py index b09402d0be1..26708e8dc01 100644 --- a/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py +++ b/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py @@ -18,9 +18,6 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. from typing import List, Union -import torch - -from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor, diff --git a/python/sglang/srt/multimodal/processors/dots_vlm.py b/python/sglang/srt/multimodal/processors/dots_vlm.py index 3b95beff3a8..5f095d150f5 100644 --- a/python/sglang/srt/multimodal/processors/dots_vlm.py +++ b/python/sglang/srt/multimodal/processors/dots_vlm.py @@ -1,5 +1,4 @@ import asyncio -import math import re from typing import Dict, List, Union diff --git a/python/sglang/srt/multimodal/processors/glm4v.py b/python/sglang/srt/multimodal/processors/glm4v.py index e3c8edc9283..2051a426fa0 100644 --- a/python/sglang/srt/multimodal/processors/glm4v.py +++ b/python/sglang/srt/multimodal/processors/glm4v.py @@ -1,4 +1,3 @@ -import re from typing import List, Union from decord import VideoReader @@ -9,10 +8,7 @@ from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor as SGLangBaseProcessor, ) -from sglang.srt.multimodal.processors.base_processor import ( - BaseMultiModalProcessorOutput, - MultimodalSpecialTokens, -) +from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens class Glm4vImageProcessor(SGLangBaseProcessor): diff --git a/python/sglang/srt/multimodal/processors/internvl.py b/python/sglang/srt/multimodal/processors/internvl.py index c9a2d97ef28..a1ef6b67554 100644 --- a/python/sglang/srt/multimodal/processors/internvl.py +++ b/python/sglang/srt/multimodal/processors/internvl.py @@ -4,10 +4,8 @@ import numpy as np import torch -import torchvision.transforms as T from decord import VideoReader, cpu, gpu from PIL import Image -from torchvision.transforms import InterpolationMode from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.interns1 import InternS1ForConditionalGeneration diff --git a/python/sglang/srt/multimodal/processors/janus_pro.py b/python/sglang/srt/multimodal/processors/janus_pro.py 
index 54d6c197884..044e31dd29a 100644 --- a/python/sglang/srt/multimodal/processors/janus_pro.py +++ b/python/sglang/srt/multimodal/processors/janus_pro.py @@ -1,6 +1,5 @@ from typing import List, Union -from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor, diff --git a/python/sglang/srt/multimodal/processors/mllama4.py b/python/sglang/srt/multimodal/processors/mllama4.py index 6a01f2aebff..4f04688b8ec 100644 --- a/python/sglang/srt/multimodal/processors/mllama4.py +++ b/python/sglang/srt/multimodal/processors/mllama4.py @@ -1,13 +1,5 @@ from typing import List, Union -import torch -from transformers.image_utils import SizeDict -from transformers.models.llama4.image_processing_llama4_fast import ( - find_supported_resolutions, - get_best_fit, -) - -from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor, diff --git a/python/sglang/srt/multimodal/processors/phi4mm.py b/python/sglang/srt/multimodal/processors/phi4mm.py index 1487d2ca2f7..c59a41685a2 100644 --- a/python/sglang/srt/multimodal/processors/phi4mm.py +++ b/python/sglang/srt/multimodal/processors/phi4mm.py @@ -3,7 +3,6 @@ from transformers.processing_utils import ProcessorMixin -from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.models.phi4mm import Phi4MMForCausalLM from sglang.srt.multimodal.processors.base_processor import ( BaseMultimodalProcessor, diff --git a/python/sglang/srt/multimodal/processors/step3_vl.py b/python/sglang/srt/multimodal/processors/step3_vl.py index ee537e68e7a..6bd691ecf3f 100644 --- a/python/sglang/srt/multimodal/processors/step3_vl.py +++ b/python/sglang/srt/multimodal/processors/step3_vl.py @@ -1,7 +1,7 @@ import math import re from itertools import product -from typing import List, Literal, Optional, TypedDict, Union +from typing import List, Optional, Union import numpy as np import torch diff --git a/python/sglang/srt/parser/reasoning_parser.py b/python/sglang/srt/parser/reasoning_parser.py index f50368aed9c..0c01ede9cba 100644 --- a/python/sglang/srt/parser/reasoning_parser.py +++ b/python/sglang/srt/parser/reasoning_parser.py @@ -1,4 +1,3 @@ -import re from typing import Dict, Optional, Tuple, Type from sglang.srt.parser.harmony_parser import HarmonyParser diff --git a/python/sglang/srt/server_args_config_parser.py b/python/sglang/srt/server_args_config_parser.py index 74dc676778a..2fee7fc0ce8 100644 --- a/python/sglang/srt/server_args_config_parser.py +++ b/python/sglang/srt/server_args_config_parser.py @@ -5,7 +5,7 @@ import logging from pathlib import Path -from typing import Any, Dict, List, Union +from typing import Any, Dict, List import yaml diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index e141a02386a..cb59b31f820 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -56,7 +56,7 @@ ) if is_cuda(): - from sgl_kernel import segment_packbits + from sgl_kernel import segment_packbits # noqa: F401 logger = logging.getLogger(__name__) SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB") diff --git a/python/sglang/srt/speculative/spec_utils.py 
b/python/sglang/srt/speculative/spec_utils.py index d89236dbe83..c00391bcb56 100644 --- a/python/sglang/srt/speculative/spec_utils.py +++ b/python/sglang/srt/speculative/spec_utils.py @@ -22,8 +22,6 @@ from sglang.srt.utils import is_cuda, is_hip if TYPE_CHECKING: - from sglang.srt.mem_cache.allocator import TokenToKVPoolAllocator - from sglang.srt.mem_cache.memory_pool import ReqToTokenPool from sglang.srt.speculative.eagle_info import EagleVerifyInput diff --git a/python/sglang/srt/speculative/standalone_worker.py b/python/sglang/srt/speculative/standalone_worker.py index 23f9b9dd2c9..302799cc613 100644 --- a/python/sglang/srt/speculative/standalone_worker.py +++ b/python/sglang/srt/speculative/standalone_worker.py @@ -11,7 +11,7 @@ from sglang.srt.utils import empty_context, get_bool_env_var, is_cuda if is_cuda(): - from sgl_kernel import segment_packbits + from sgl_kernel import segment_packbits # noqa: F401 logger = logging.getLogger(__name__) SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB") diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py index 51ee7d10ee1..e2e6798c9f1 100644 --- a/python/sglang/srt/utils/common.py +++ b/python/sglang/srt/utils/common.py @@ -228,7 +228,7 @@ def support_triton(backend: str) -> bool: try: - import sgl_kernel + import sgl_kernel # noqa: F401 is_intel_amx_backend_available = hasattr( torch.ops.sgl_kernel, "convert_weight_packed" @@ -1556,7 +1556,7 @@ def get_hpu_memory_capacity(): def get_npu_memory_capacity(): try: - import torch_npu + import torch_npu # noqa: F401 return torch.npu.mem_get_info()[1] // 1024 // 1024 # unit: MB except ImportError as e: @@ -1743,7 +1743,7 @@ def get_device(device_id: Optional[int] = None) -> str: if is_habana_available(): try: - import habana_frameworks.torch.hpu + import habana_frameworks.torch.hpu # noqa: F401 if torch.hpu.is_available(): if device_id == None: @@ -1773,7 +1773,7 @@ def get_device_count() -> int: if is_habana_available(): try: - import habana_frameworks.torch.hpu + import habana_frameworks.torch.hpu # noqa: F401 if torch.hpu.is_available(): return torch.hpu.device_count() diff --git a/python/sglang/srt/utils/host_shared_memory.py b/python/sglang/srt/utils/host_shared_memory.py index c599527f9b8..20ddf8fc7ef 100644 --- a/python/sglang/srt/utils/host_shared_memory.py +++ b/python/sglang/srt/utils/host_shared_memory.py @@ -1,5 +1,4 @@ import logging -import os from dataclasses import dataclass from multiprocessing import shared_memory from pathlib import Path diff --git a/python/sglang/test/attention/test_flashattn_mla_backend.py b/python/sglang/test/attention/test_flashattn_mla_backend.py index ebfd0b39544..16f94a2b234 100644 --- a/python/sglang/test/attention/test_flashattn_mla_backend.py +++ b/python/sglang/test/attention/test_flashattn_mla_backend.py @@ -4,7 +4,6 @@ from sglang.srt.configs.model_config import AttentionArch from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend -from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode diff --git a/python/sglang/test/attention/test_prefix_chunk_info.py b/python/sglang/test/attention/test_prefix_chunk_info.py index c02d4d1d68f..2b85b695b8c 100644 --- a/python/sglang/test/attention/test_prefix_chunk_info.py +++ 
b/python/sglang/test/attention/test_prefix_chunk_info.py @@ -2,8 +2,6 @@ import torch -from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend -from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.test.test_utils import CustomTestCase diff --git a/python/sglang/test/few_shot_gsm8k_engine.py b/python/sglang/test/few_shot_gsm8k_engine.py index 05b095713d0..567816cfcf7 100644 --- a/python/sglang/test/few_shot_gsm8k_engine.py +++ b/python/sglang/test/few_shot_gsm8k_engine.py @@ -1,16 +1,13 @@ import argparse import ast import asyncio -import json import re import time import numpy as np import sglang as sgl -from sglang.lang.api import set_default_backend -from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint -from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl +from sglang.utils import download_and_cache_file, read_jsonl INVALID = -9999999 diff --git a/python/sglang/test/simple_eval_gpqa.py b/python/sglang/test/simple_eval_gpqa.py index b77ca773e32..b39366ef5df 100644 --- a/python/sglang/test/simple_eval_gpqa.py +++ b/python/sglang/test/simple_eval_gpqa.py @@ -18,7 +18,6 @@ HTML_JINJA, Eval, EvalResult, - MessageList, SamplerBase, SingleEvalResult, format_multichoice_question, diff --git a/python/sglang/test/simple_eval_humaneval.py b/python/sglang/test/simple_eval_humaneval.py index 25dcdd53af6..efd03af3825 100644 --- a/python/sglang/test/simple_eval_humaneval.py +++ b/python/sglang/test/simple_eval_humaneval.py @@ -11,8 +11,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Dict, List, Optional -import tqdm - try: from human_eval.data import read_problems from human_eval.evaluation import estimate_pass_at_k @@ -41,7 +39,6 @@ def evaluate_functional_correctness( Evaluates the functional correctness of generated samples, and writes results to f"{sample_file}_results.jsonl.gz" """ - import copy # Check the generated samples against test suites. 
with ThreadPoolExecutor(max_workers=n_workers) as executor: diff --git a/python/sglang/test/test_block_fp8.py b/python/sglang/test/test_block_fp8.py index 80202d15e07..2390489cad4 100644 --- a/python/sglang/test/test_block_fp8.py +++ b/python/sglang/test/test_block_fp8.py @@ -1,5 +1,4 @@ import itertools -import os import unittest import torch @@ -577,7 +576,7 @@ def setUpClass(cls): if not torch.cuda.is_available(): raise unittest.SkipTest("CUDA is not available") try: - import deep_gemm + import deep_gemm # noqa: F401 except ImportError: raise unittest.SkipTest("DeepGEMM is not available") torch.set_default_device("cuda") diff --git a/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py b/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py index 36d7acddbcd..ac7239ea0f3 100644 --- a/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py +++ b/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py @@ -1,5 +1,4 @@ import itertools -import os import unittest from typing import List, Tuple diff --git a/python/sglang/test/test_cutlass_moe.py b/python/sglang/test/test_cutlass_moe.py index 377534a495d..fdab5a3acb0 100755 --- a/python/sglang/test/test_cutlass_moe.py +++ b/python/sglang/test/test_cutlass_moe.py @@ -1,5 +1,4 @@ import argparse -import time import torch import triton # Added import diff --git a/python/sglang/test/test_cutlass_w4a8_moe.py b/python/sglang/test/test_cutlass_w4a8_moe.py index 7d96cccd5e0..e75154ef4b3 100644 --- a/python/sglang/test/test_cutlass_w4a8_moe.py +++ b/python/sglang/test/test_cutlass_w4a8_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Literal, Optional +from typing import Optional import pytest import torch diff --git a/python/sglang/test/test_marlin_moe.py b/python/sglang/test/test_marlin_moe.py index 77b0109dff7..d58200edd7e 100644 --- a/python/sglang/test/test_marlin_moe.py +++ b/python/sglang/test/test_marlin_moe.py @@ -1,4 +1,3 @@ -import types from typing import Optional import pytest