From ad40a477642fec8682ecdd3ab3a34a4cdd019189 Mon Sep 17 00:00:00 2001 From: ZHENYU <550149470@qq.com> Date: Wed, 6 Aug 2025 17:47:28 -0700 Subject: [PATCH 1/2] samples: Add KV cache event synchronization configuration examples Add sample configurations for enabling KV cache event synchronization: - Network policy to allow ZMQ traffic on ports 5557-5558 - vLLM deployment with KV events enabled via CLI arguments - vLLM deployment with KV events enabled via environment variables These samples demonstrate how to configure vLLM pods to publish KV cache events via ZMQ for real-time cache state synchronization with AIBrix gateway. Signed-off-by: ZHENYU <550149470@qq.com> --- samples/network-policies/allow-kv-events.yaml | 22 +++ .../quickstart/model-with-kv-events-env.yaml | 65 +++++++++ samples/quickstart/model-with-kv-events.yaml | 130 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 samples/network-policies/allow-kv-events.yaml create mode 100644 samples/quickstart/model-with-kv-events-env.yaml create mode 100644 samples/quickstart/model-with-kv-events.yaml diff --git a/samples/network-policies/allow-kv-events.yaml b/samples/network-policies/allow-kv-events.yaml new file mode 100644 index 000000000..3751a1af5 --- /dev/null +++ b/samples/network-policies/allow-kv-events.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-kv-events + namespace: default +spec: + podSelector: + matchLabels: + app: gateway-plugins + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + model.aibrix.ai/kv-events-enabled: "true" + ports: + - protocol: TCP + port: 5557 + - protocol: TCP + port: 5558 diff --git a/samples/quickstart/model-with-kv-events-env.yaml b/samples/quickstart/model-with-kv-events-env.yaml new file mode 100644 index 000000000..deebc4147 --- /dev/null +++ b/samples/quickstart/model-with-kv-events-env.yaml @@ -0,0 +1,65 @@ +--- +apiVersion: apps/v1 +kind: Deployment 
+metadata: + labels: + model.aibrix.ai/name: llama-8b-instruct + model.aibrix.ai/kv-events-enabled: "true" + name: llama-8b-instruct + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + model.aibrix.ai/name: llama-8b-instruct + template: + metadata: + labels: + model.aibrix.ai/name: llama-8b-instruct + model.aibrix.ai/kv-events-enabled: "true" + spec: + containers: + - name: vllm-openai + image: vllm/vllm-openai:v0.7.1 + command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + - --host + - "0.0.0.0" + - --port + - "8000" + - --model + - meta-llama/Llama-3.1-8B-Instruct + - --served-model-name + - llama-8b-instruct + env: + # NEW: KV event configuration via environment + - name: VLLM_ENABLE_KV_CACHE_EVENTS + value: "true" + - name: VLLM_KV_EVENTS_PUBLISHER + value: "zmq" + - name: VLLM_KV_EVENTS_ENDPOINT + value: "tcp://*:5557" + - name: VLLM_KV_EVENTS_REPLAY_ENDPOINT + value: "tcp://*:5558" + - name: VLLM_KV_EVENTS_BUFFER_STEPS + value: "10000" + # Performance tuning + - name: VLLM_KV_EVENTS_HWM + value: "100000" # ZMQ high water mark + ports: + - containerPort: 8000 + protocol: TCP + name: api + - containerPort: 5557 + protocol: TCP + name: kv-events + - containerPort: 5558 + protocol: TCP + name: kv-replay + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" diff --git a/samples/quickstart/model-with-kv-events.yaml b/samples/quickstart/model-with-kv-events.yaml new file mode 100644 index 000000000..99649a9ed --- /dev/null +++ b/samples/quickstart/model-with-kv-events.yaml @@ -0,0 +1,130 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + model.aibrix.ai/port: "8000" + model.aibrix.ai/kv-events-enabled: "true" # NEW: Enable KV events + name: deepseek-r1-distill-llama-8b + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + template: + metadata: + labels: + 
model.aibrix.ai/name: deepseek-r1-distill-llama-8b + model.aibrix.ai/kv-events-enabled: "true" # NEW: Required for discovery + spec: + containers: + - name: vllm-openai + image: vllm/vllm-openai:v0.7.1 + command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + - --host + - "0.0.0.0" + - --port + - "8000" + - --uvicorn-log-level + - warning + - --model + - deepseek-ai/DeepSeek-R1-Distill-Llama-8B + - --served-model-name + - deepseek-r1-distill-llama-8b + - --max-model-len + - "12288" + # NEW: KV event publishing configuration + - --enable-kv-cache-events + - --kv-events-publisher + - zmq + - --kv-events-endpoint + - "tcp://*:5557" + - --kv-events-replay-endpoint + - "tcp://*:5558" + - --kv-events-buffer-steps + - "10000" + ports: + - containerPort: 8000 + protocol: TCP + name: api + # NEW: KV event ports + - containerPort: 5557 + protocol: TCP + name: kv-events + - containerPort: 5558 + protocol: TCP + name: kv-replay + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + # Health checks remain the same + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 3 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + startupProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 30 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: deepseek-r1-distill-llama-8b + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: TCP + targetPort: 8000 + - name: metrics + port: 8080 + protocol: TCP + targetPort: 8080 + # NEW: Expose KV event ports (optional, for 
debugging) + - name: kv-events + port: 5557 + protocol: TCP + targetPort: 5557 + - name: kv-replay + port: 5558 + protocol: TCP + targetPort: 5558 + selector: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + type: ClusterIP From c09d475e0296f8e499659e28c625b3b3e0a1922b Mon Sep 17 00:00:00 2001 From: ZHENYU <550149470@qq.com> Date: Wed, 6 Aug 2025 18:17:52 -0700 Subject: [PATCH 2/2] samples: Improve KV event sync sample configurations - Add explanatory comments for namespace field in all samples Clarifies that 'default' namespace is used for quickstart purposes and provides guidance on deploying to different namespaces - Add health probes to model-with-kv-events-env.yaml Ensures consistency with other quickstart samples and provides production-ready configuration with liveness, readiness, and startup probes These improvements address code review feedback while maintaining consistency with existing AIBrix sample conventions. Signed-off-by: ZHENYU <550149470@qq.com> --- samples/network-policies/allow-kv-events.yaml | 2 ++ .../quickstart/model-with-kv-events-env.yaml | 29 +++++++++++++++++++ samples/quickstart/model-with-kv-events.yaml | 4 +++ 3 files changed, 35 insertions(+) diff --git a/samples/network-policies/allow-kv-events.yaml b/samples/network-policies/allow-kv-events.yaml index 3751a1af5..cb4999a95 100644 --- a/samples/network-policies/allow-kv-events.yaml +++ b/samples/network-policies/allow-kv-events.yaml @@ -3,6 +3,8 @@ apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: name: allow-kv-events + # Note: This sample uses 'default' namespace for quickstart purposes. 
+ # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace> namespace: default spec: podSelector: diff --git a/samples/quickstart/model-with-kv-events-env.yaml b/samples/quickstart/model-with-kv-events-env.yaml index deebc4147..24ae08106 100644 --- a/samples/quickstart/model-with-kv-events-env.yaml +++ b/samples/quickstart/model-with-kv-events-env.yaml @@ -6,6 +6,8 @@ metadata: model.aibrix.ai/name: llama-8b-instruct model.aibrix.ai/kv-events-enabled: "true" name: llama-8b-instruct + # Note: This sample uses 'default' namespace for quickstart purposes. + # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace> namespace: default spec: replicas: 2 @@ -63,3 +65,30 @@ spec: nvidia.com/gpu: "1" requests: nvidia.com/gpu: "1" + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 3 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + startupProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + failureThreshold: 30 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 diff --git a/samples/quickstart/model-with-kv-events.yaml b/samples/quickstart/model-with-kv-events.yaml index 99649a9ed..dbda468f2 100644 --- a/samples/quickstart/model-with-kv-events.yaml +++ b/samples/quickstart/model-with-kv-events.yaml @@ -7,6 +7,8 @@ metadata: model.aibrix.ai/port: "8000" model.aibrix.ai/kv-events-enabled: "true" # NEW: Enable KV events name: deepseek-r1-distill-llama-8b + # Note: This sample uses 'default' namespace for quickstart purposes. 
+ # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace> namespace: default spec: replicas: 1 @@ -105,6 +107,8 @@ metadata: prometheus.io/scrape: "true" prometheus.io/port: "8080" name: deepseek-r1-distill-llama-8b + # Note: This sample uses 'default' namespace for quickstart purposes. + # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace> namespace: default spec: ports: