Merged
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -27,9 +27,9 @@ repos:
rev: v0.11.7
hooks:
- id: ruff
- args: [--select=F401, --fixable=F401]
- files: ^(benchmark/|docs/|examples/)
- exclude: \.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
+ args: [--select=F401,F821, --fixable=F401]
+ files: ^(benchmark/|docs/|examples/|python/sglang/)
+ exclude: __init__\.py$|\.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
- repo: https://github.com/psf/black
rev: 24.10.0
hooks:
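A note on the rule change above: F401 flags imports that are never used, while the newly added F821 flags names that are referenced but never defined. Only F401 is listed under `--fixable`, so ruff auto-removes unused imports but merely reports undefined names. The new `__init__.py` exclusion presumably avoids F401 false positives on packages that import names purely to re-export them. A minimal illustrative snippet, not taken from this repository:

```python
import os  # F401: imported but never used; `ruff --fix` would delete this line


def greet() -> str:
    # F821: `name` is referenced but never defined; reported, never auto-fixed
    return f"Hello, {name}"
```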
2 changes: 1 addition & 1 deletion python/sglang/srt/_custom_ops.py
@@ -15,7 +15,7 @@
# ROCm does not use vllm custom allreduce
if use_vllm_custom_allreduce and not is_hip():
try:
- import vllm._C
+ import vllm._C  # noqa: F401
except ImportError as e:
logger.warning("Failed to import from vllm._C with %r", e)
else:
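A recurring pattern in this and several hunks below: the import exists only for its side effect, since importing the module loads a compiled extension and registers its custom ops, and no name from the module is referenced afterwards. Without the suppression, the now-broader F401 fix would delete a load-bearing line. A minimal sketch of the pattern, using a hypothetical extension module name:

```python
try:
    # Imported purely for its side effect: import-time code in the module
    # loads the native extension and registers its kernels. No attribute
    # is referenced afterwards, so ruff would flag the import as unused.
    import my_native_ext  # noqa: F401  (hypothetical module)

    ext_available = True
except ImportError:
    ext_available = False
```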
1 change: 0 additions & 1 deletion python/sglang/srt/compilation/cuda_piecewise_backend.py
@@ -9,7 +9,6 @@
import torch
import torch.fx as fx

- import sglang.srt.compilation.weak_ref_tensor_jit
Collaborator Author: Not sure about this one.
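(If `weak_ref_tensor_jit` is imported only for its side effect, compiling and registering the weak-ref tensor op at import time, dropping the import would silently skip that registration; presumably that is the source of the hesitation here.)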

from sglang.srt.compilation.compilation_config import CompilationConfig
from sglang.srt.compilation.compilation_counter import compilation_counter

1 change: 0 additions & 1 deletion python/sglang/srt/configs/deepseekvl2.py
@@ -1,5 +1,4 @@
import math
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

9 changes: 2 additions & 7 deletions python/sglang/srt/configs/dots_vlm.py
@@ -1,10 +1,5 @@
from typing import Any, List, Optional, Union

- from transformers import AutoProcessor, LlamaTokenizerFast, PretrainedConfig
- from transformers.feature_extraction_utils import BatchFeature
- from transformers.image_utils import ImageInput
- from transformers.processing_utils import ProcessingKwargs, Unpack
- from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ from transformers import AutoProcessor, PretrainedConfig
+ from transformers.processing_utils import ProcessingKwargs

try:
from transformers import Qwen2_5_VLProcessor
7 changes: 1 addition & 6 deletions python/sglang/srt/configs/falcon_h1.py
@@ -14,17 +14,12 @@
# limitations under the License.
"""Falcon-H1 model configuration"""

- import enum

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging

from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
- from sglang.srt.layers.dp_attention import (
-     get_attention_tp_size,
-     get_tensor_model_parallel_world_size,
- )
+ from sglang.srt.layers.dp_attention import get_tensor_model_parallel_world_size

logger = logging.get_logger(__name__)

1 change: 0 additions & 1 deletion python/sglang/srt/configs/qwen3_next.py
@@ -21,7 +21,6 @@
from transformers.utils import logging

from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
- from sglang.srt.distributed.utils import divide
from sglang.srt.layers.dp_attention import get_attention_tp_size

logger = logging.get_logger(__name__)
2 changes: 1 addition & 1 deletion python/sglang/srt/connector/remote_instance.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

import logging
- from typing import Generator, List, Optional, Tuple
+ from typing import Generator, Optional, Tuple
from urllib.parse import urlparse

import torch
@@ -1,6 +1,6 @@
import logging
import os
- from typing import List, Optional
+ from typing import List

import torch

8 changes: 2 additions & 6 deletions python/sglang/srt/disaggregation/decode.py
@@ -25,7 +25,7 @@
from collections import deque
from dataclasses import dataclass
from http import HTTPStatus
- from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union

import torch
from torch.distributed import ProcessGroup
@@ -48,10 +48,7 @@
)
from sglang.srt.layers.dp_attention import get_attention_tp_size
from sglang.srt.managers.schedule_batch import FINISH_ABORT, RequestStage, ScheduleBatch
- from sglang.srt.mem_cache.allocator import (
-     BaseTokenToKVPoolAllocator,
-     SWATokenToKVPoolAllocator,
- )
+ from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
from sglang.srt.mem_cache.memory_pool import (
HybridLinearKVPool,
@@ -61,7 +58,6 @@
ReqToTokenPool,
SWAKVPool,
)
- from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.utils import get_int_env_var, require_mlp_sync
from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter

3 changes: 1 addition & 2 deletions python/sglang/srt/disaggregation/prefill.py
@@ -20,7 +20,6 @@
from __future__ import annotations

import logging
- import threading
import time
from collections import deque
from http import HTTPStatus
@@ -54,7 +53,7 @@
NSATokenToKVPool,
SWAKVPool,
)
- from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
+ from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
from sglang.srt.utils import (
DynamicGradMode,
broadcast_pyobj,
@@ -32,7 +32,7 @@
ops.meta_size()
else:
# Use custom allreduce from sgl kernel (ROCM and TRT-LLM)
- import sgl_kernel
+ import sgl_kernel  # noqa: F401
custom_ar = True
except Exception:
# For CPUs
@@ -4,7 +4,7 @@
import os
from contextlib import contextmanager
from enum import IntEnum
- from typing import Any, Callable, List, Optional, TypeVar, Union
+ from typing import Optional, Union

import torch
import torch.distributed as dist
@@ -24,7 +24,7 @@
mscclpp_is_available = False
if _is_cuda:
try:
- import sgl_kernel
+ import sgl_kernel  # noqa: F401

mscclpp_is_available = True
except:
@@ -9,7 +9,7 @@
from sglang.srt.distributed.device_communicators.all_reduce_utils import (
SYMM_MEM_ALL_REDUCE_MAX_SIZES,
)
- from sglang.srt.utils import get_device_capability, is_cuda, is_hip
+ from sglang.srt.utils import is_cuda, is_hip

try:
import torch.distributed._symmetric_memory as torch_symm_mem
1 change: 0 additions & 1 deletion python/sglang/srt/distributed/naive_distributed.py
@@ -1,5 +1,4 @@
import base64
import os
import pickle
import time
from pathlib import Path
1 change: 0 additions & 1 deletion python/sglang/srt/entrypoints/context.py
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copied from vLLM
- import json
import logging
from abc import ABC, abstractmethod
from typing import Union
1 change: 0 additions & 1 deletion python/sglang/srt/entrypoints/harmony_utils.py
@@ -3,7 +3,6 @@
# Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
# Slight differences in processing chat messages
import datetime
- import json
from collections.abc import Iterable
from typing import Literal, Optional, Union

1 change: 0 additions & 1 deletion python/sglang/srt/entrypoints/http_server.py
@@ -19,7 +19,6 @@

import asyncio
import dataclasses
- import json
import logging
import multiprocessing as multiprocessing
import os
8 changes: 1 addition & 7 deletions python/sglang/srt/entrypoints/http_server_engine.py
@@ -1,15 +1,9 @@
- import copy
- import dataclasses
- import multiprocessing
- import pickle
- import threading
- import time
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import List, Optional, Tuple

import pybase64
import requests
import torch
import torch.distributed as dist

from sglang.srt.entrypoints.EngineBase import EngineBase
from sglang.srt.entrypoints.http_server import launch_server
2 changes: 0 additions & 2 deletions python/sglang/srt/eplb/eplb_algorithms/deepseek.py
@@ -3,8 +3,6 @@

import torch

- from sglang.srt.utils import get_bool_env_var


def balanced_packing(
weight: torch.Tensor, num_packs: int
6 changes: 1 addition & 5 deletions python/sglang/srt/function_call/glm4_moe_detector.py
@@ -6,11 +6,7 @@

from sglang.srt.entrypoints.openai.protocol import Tool
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
- from sglang.srt.function_call.core_types import (
-     StreamingParseResult,
-     StructureInfo,
-     _GetInfoFunc,
- )
+ from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc
from sglang.srt.function_call.ebnf_composer import EBNFComposer

logger = logging.getLogger(__name__)
2 changes: 0 additions & 2 deletions python/sglang/srt/function_call/json_array_parser.py
@@ -1,5 +1,3 @@
- import json
- import re
from typing import List

from sglang.srt.entrypoints.openai.protocol import Tool
1 change: 0 additions & 1 deletion python/sglang/srt/function_call/utils.py
@@ -1,4 +1,3 @@
- import json
from json import JSONDecodeError, JSONDecoder
from json.decoder import WHITESPACE
from typing import Any, List, Literal, Optional, Tuple, Union
2 changes: 1 addition & 1 deletion python/sglang/srt/grpc/compile_proto.py
@@ -70,7 +70,7 @@ def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> b

# Check if grpc_tools is available
try:
- import grpc_tools.protoc
+ import grpc_tools.protoc  # noqa: F401
except ImportError:
print("Error: grpcio-tools not installed")
print(
1 change: 0 additions & 1 deletion python/sglang/srt/grpc/grpc_request_manager.py
@@ -27,7 +27,6 @@
TokenizedEmbeddingReqInput,
TokenizedGenerateReqInput,
)
- from sglang.srt.managers.scheduler import is_health_check_generate_req
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import get_zmq_socket, kill_process_tree
from sglang.utils import get_exception_traceback
5 changes: 4 additions & 1 deletion python/sglang/srt/layers/activation.py
@@ -380,4 +380,7 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
logger.info(
"sgl-kernel is not available on Non-NV, Non-AMD platforms or Non-AMX CPUs. Fallback to other kernel libraries."
)
- from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul
+ from vllm.model_executor.layers.activation import (  # noqa: F401
+     GeluAndMul,
+     SiluAndMul,
+ )
1 change: 0 additions & 1 deletion python/sglang/srt/layers/attention/ascend_backend.py
@@ -20,7 +20,6 @@
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.model_executor.model_runner import ModelRunner

- import os

import numpy as np

2 changes: 1 addition & 1 deletion python/sglang/srt/layers/attention/base_attn_backend.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, Optional, Union
+ from typing import TYPE_CHECKING, Optional

import torch

1 change: 0 additions & 1 deletion python/sglang/srt/layers/attention/fla/chunk.py
@@ -2,7 +2,6 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

- import warnings
from typing import Optional

import torch
2 changes: 1 addition & 1 deletion python/sglang/srt/layers/attention/fla/chunk_o.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

- from typing import Optional, Tuple
+ from typing import Optional

import torch
import triton
2 changes: 0 additions & 2 deletions python/sglang/srt/layers/attention/fla/index.py
@@ -3,9 +3,7 @@
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

import torch
- import torch.nn.functional as F
import triton
- import triton.language as tl

from sglang.srt.layers.attention.fla.utils import tensor_cache

@@ -5,7 +5,6 @@
# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.

- import math

import torch
import torch.nn.functional as F
2 changes: 0 additions & 2 deletions python/sglang/srt/layers/attention/fla/wy_fast.py
@@ -9,8 +9,6 @@
import triton.language as tl

from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
- from sglang.srt.layers.attention.fla.op import safe_exp
- from sglang.srt.layers.attention.fla.utils import check_shared_mem


@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
1 change: 0 additions & 1 deletion python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -50,7 +50,6 @@
fast_decode_plan,
)
from flashinfer.cascade import merge_state
- from flashinfer.decode import _get_range_buf, get_seq_lens


class WrapperDispatch(Enum):
@@ -1,4 +1,4 @@
- from typing import Optional, Union
+ from typing import Optional

import torch

@@ -1,9 +1,6 @@
from dataclasses import astuple, dataclass
- from functools import lru_cache
- from typing import Optional, Union

import torch
- import torch.nn.functional as F

from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule
2 changes: 1 addition & 1 deletion python/sglang/srt/layers/attention/intel_amx_backend.py
@@ -14,7 +14,7 @@

class IntelAMXAttnBackend(AttentionBackend):
def __init__(self, model_runner: ModelRunner):
- import sgl_kernel
+ import sgl_kernel  # noqa: F401

super().__init__()
self.forward_metadata = None
@@ -4,7 +4,6 @@

from typing import List, Optional, Union

- import numpy as np
import torch
import triton
import triton.language as tl
@@ -10,7 +10,6 @@

import torch
import triton
- import triton.language as tl
from einops import rearrange
from packaging import version

@@ -13,7 +13,7 @@ def is_mla_preprocess_enabled() -> bool:


if is_mla_preprocess_enabled():
- import sgl_kernel_npu
Collaborator Author: Not sure about this one.
Member: we need to keep this
Collaborator Author: Added back in 17b44c5

+ import sgl_kernel_npu  # noqa: F401
import torch_npu

torch.npu.config.allow_internal_format = True
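The thread above resolves the same way as the other side-effect imports: `sgl_kernel_npu` looks unused to ruff, but importing it presumably registers the NPU kernels that this MLA-preprocess path relies on, so the line is restored with a `# noqa: F401` suppression instead of being deleted.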