From ad40a477642fec8682ecdd3ab3a34a4cdd019189 Mon Sep 17 00:00:00 2001 From: ZHENYU <550149470@qq.com> Date: Wed, 6 Aug 2025 17:47:28 -0700 Subject: [PATCH 1/2] samples: Add KV cache event synchronization configuration examples Add sample configurations for enabling KV cache event synchronization: - Network policy to allow ZMQ traffic on ports 5557-5558 - vLLM deployment with KV events enabled via CLI arguments - vLLM deployment with KV events enabled via environment variables These samples demonstrate how to configure vLLM pods to publish KV cache events via ZMQ for real-time cache state synchronization with AIBrix gateway. Signed-off-by: ZHENYU <550149470@qq.com> --- samples/network-policies/allow-kv-events.yaml | 22 +++ .../quickstart/model-with-kv-events-env.yaml | 65 +++++++++ samples/quickstart/model-with-kv-events.yaml | 130 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 samples/network-policies/allow-kv-events.yaml create mode 100644 samples/quickstart/model-with-kv-events-env.yaml create mode 100644 samples/quickstart/model-with-kv-events.yaml diff --git a/samples/network-policies/allow-kv-events.yaml b/samples/network-policies/allow-kv-events.yaml new file mode 100644 index 000000000..3751a1af5 --- /dev/null +++ b/samples/network-policies/allow-kv-events.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-kv-events + namespace: default +spec: + podSelector: + matchLabels: + app: gateway-plugins + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + model.aibrix.ai/kv-events-enabled: "true" + ports: + - protocol: TCP + port: 5557 + - protocol: TCP + port: 5558 diff --git a/samples/quickstart/model-with-kv-events-env.yaml b/samples/quickstart/model-with-kv-events-env.yaml new file mode 100644 index 000000000..deebc4147 --- /dev/null +++ b/samples/quickstart/model-with-kv-events-env.yaml @@ -0,0 +1,65 @@ +--- +apiVersion: apps/v1 +kind: Deployment 
+metadata: + labels: + model.aibrix.ai/name: llama-8b-instruct + model.aibrix.ai/kv-events-enabled: "true" + name: llama-8b-instruct + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + model.aibrix.ai/name: llama-8b-instruct + template: + metadata: + labels: + model.aibrix.ai/name: llama-8b-instruct + model.aibrix.ai/kv-events-enabled: "true" + spec: + containers: + - name: vllm-openai + image: vllm/vllm-openai:v0.7.1 + command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + - --host + - "0.0.0.0" + - --port + - "8000" + - --model + - meta-llama/Llama-3.1-8B-Instruct + - --served-model-name + - llama-8b-instruct + env: + # NEW: KV event configuration via environment + - name: VLLM_ENABLE_KV_CACHE_EVENTS + value: "true" + - name: VLLM_KV_EVENTS_PUBLISHER + value: "zmq" + - name: VLLM_KV_EVENTS_ENDPOINT + value: "tcp://*:5557" + - name: VLLM_KV_EVENTS_REPLAY_ENDPOINT + value: "tcp://*:5558" + - name: VLLM_KV_EVENTS_BUFFER_STEPS + value: "10000" + # Performance tuning + - name: VLLM_KV_EVENTS_HWM + value: "100000" # ZMQ high water mark + ports: + - containerPort: 8000 + protocol: TCP + name: api + - containerPort: 5557 + protocol: TCP + name: kv-events + - containerPort: 5558 + protocol: TCP + name: kv-replay + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" diff --git a/samples/quickstart/model-with-kv-events.yaml b/samples/quickstart/model-with-kv-events.yaml new file mode 100644 index 000000000..99649a9ed --- /dev/null +++ b/samples/quickstart/model-with-kv-events.yaml @@ -0,0 +1,130 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + model.aibrix.ai/port: "8000" + model.aibrix.ai/kv-events-enabled: "true" # NEW: Enable KV events + name: deepseek-r1-distill-llama-8b + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + template: + metadata: + labels: + 
model.aibrix.ai/name: deepseek-r1-distill-llama-8b + model.aibrix.ai/kv-events-enabled: "true" # NEW: Required for discovery + spec: + containers: + - name: vllm-openai + image: vllm/vllm-openai:v0.7.1 + command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + - --host + - "0.0.0.0" + - --port + - "8000" + - --uvicorn-log-level + - warning + - --model + - deepseek-ai/DeepSeek-R1-Distill-Llama-8B + - --served-model-name + - deepseek-r1-distill-llama-8b + - --max-model-len + - "12288" + # NEW: KV event publishing configuration + - --enable-kv-cache-events + - --kv-events-publisher + - zmq + - --kv-events-endpoint + - "tcp://*:5557" + - --kv-events-replay-endpoint + - "tcp://*:5558" + - --kv-events-buffer-steps + - "10000" + ports: + - containerPort: 8000 + protocol: TCP + name: api + # NEW: KV event ports + - containerPort: 5557 + protocol: TCP + name: kv-events + - containerPort: 5558 + protocol: TCP + name: kv-replay + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + # Health checks remain the same + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 3 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + startupProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 30 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: deepseek-r1-distill-llama-8b + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: TCP + targetPort: 8000 + - name: metrics + port: 8080 + protocol: TCP + targetPort: 8080 + # NEW: Expose KV event ports (optional, for 
debugging) + - name: kv-events + port: 5557 + protocol: TCP + targetPort: 5557 + - name: kv-replay + port: 5558 + protocol: TCP + targetPort: 5558 + selector: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + type: ClusterIP From c09d475e0296f8e499659e28c625b3b3e0a1922b Mon Sep 17 00:00:00 2001 From: ZHENYU <550149470@qq.com> Date: Wed, 6 Aug 2025 18:17:52 -0700 Subject: [PATCH 2/2] samples: Improve KV event sync sample configurations - Add explanatory comments for namespace field in all samples Clarifies that 'default' namespace is used for quickstart purposes and provides guidance on deploying to different namespaces - Add health probes to model-with-kv-events-env.yaml Ensures consistency with other quickstart samples and provides production-ready configuration with liveness, readiness, and startup probes These improvements address code review feedback while maintaining consistency with existing AIBrix sample conventions. Signed-off-by: ZHENYU <550149470@qq.com> --- samples/network-policies/allow-kv-events.yaml | 2 ++ .../quickstart/model-with-kv-events-env.yaml | 29 +++++++++++++++++++ samples/quickstart/model-with-kv-events.yaml | 4 +++ 3 files changed, 35 insertions(+) diff --git a/samples/network-policies/allow-kv-events.yaml b/samples/network-policies/allow-kv-events.yaml index 3751a1af5..cb4999a95 100644 --- a/samples/network-policies/allow-kv-events.yaml +++ b/samples/network-policies/allow-kv-events.yaml @@ -3,6 +3,8 @@ apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: name: allow-kv-events + # Note: This sample uses 'default' namespace for quickstart purposes. 
+ # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace> namespace: default spec: podSelector: diff --git a/samples/quickstart/model-with-kv-events-env.yaml b/samples/quickstart/model-with-kv-events-env.yaml index deebc4147..24ae08106 100644 --- a/samples/quickstart/model-with-kv-events-env.yaml +++ b/samples/quickstart/model-with-kv-events-env.yaml @@ -6,6 +6,8 @@ metadata: model.aibrix.ai/name: llama-8b-instruct model.aibrix.ai/kv-events-enabled: "true" name: llama-8b-instruct + # Note: This sample uses 'default' namespace for quickstart purposes. + # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace> namespace: default spec: replicas: 2 @@ -63,3 +65,30 @@ spec: nvidia.com/gpu: "1" requests: nvidia.com/gpu: "1" + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 3 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + startupProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 30 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 diff --git a/samples/quickstart/model-with-kv-events.yaml b/samples/quickstart/model-with-kv-events.yaml index 99649a9ed..dbda468f2 100644 --- a/samples/quickstart/model-with-kv-events.yaml +++ b/samples/quickstart/model-with-kv-events.yaml @@ -7,6 +7,8 @@ metadata: model.aibrix.ai/port: "8000" model.aibrix.ai/kv-events-enabled: "true" # NEW: Enable KV events name: deepseek-r1-distill-llama-8b + # Note: This sample uses 'default' namespace for quickstart purposes. 
+ # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace> namespace: default spec: replicas: 1 @@ -105,6 +107,8 @@ metadata: prometheus.io/scrape: "true" prometheus.io/port: "8080" name: deepseek-r1-distill-llama-8b + # Note: This sample uses 'default' namespace for quickstart purposes. + # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace> namespace: default spec: ports: