diff --git a/samples/network-policies/allow-kv-events.yaml b/samples/network-policies/allow-kv-events.yaml
new file mode 100644
index 000000000..cb4999a95
--- /dev/null
+++ b/samples/network-policies/allow-kv-events.yaml
@@ -0,0 +1,24 @@
+---
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-kv-events
+  # Note: This sample uses 'default' namespace for quickstart purposes.
+  # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace>
+  namespace: default
+spec:
+  podSelector:
+    matchLabels:
+      app: gateway-plugins
+  policyTypes:
+    - Ingress
+  ingress:
+    - from:
+        - podSelector:
+            matchLabels:
+              model.aibrix.ai/kv-events-enabled: "true"
+      ports:
+        - protocol: TCP
+          port: 5557
+        - protocol: TCP
+          port: 5558
diff --git a/samples/quickstart/model-with-kv-events-env.yaml b/samples/quickstart/model-with-kv-events-env.yaml
new file mode 100644
index 000000000..24ae08106
--- /dev/null
+++ b/samples/quickstart/model-with-kv-events-env.yaml
@@ -0,0 +1,94 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    model.aibrix.ai/name: llama-8b-instruct
+    model.aibrix.ai/kv-events-enabled: "true"
+  name: llama-8b-instruct
+  # Note: This sample uses 'default' namespace for quickstart purposes.
+  # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace>
+  namespace: default
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: llama-8b-instruct
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: llama-8b-instruct
+        model.aibrix.ai/kv-events-enabled: "true"
+    spec:
+      containers:
+        - name: vllm-openai
+          image: vllm/vllm-openai:v0.7.1
+          command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"
+            - --model
+            - meta-llama/Llama-3.1-8B-Instruct
+            - --served-model-name
+            - llama-8b-instruct
+          env:
+            # NEW: KV event configuration via environment
+            - name: VLLM_ENABLE_KV_CACHE_EVENTS
+              value: "true"
+            - name: VLLM_KV_EVENTS_PUBLISHER
+              value: "zmq"
+            - name: VLLM_KV_EVENTS_ENDPOINT
+              value: "tcp://*:5557"
+            - name: VLLM_KV_EVENTS_REPLAY_ENDPOINT
+              value: "tcp://*:5558"
+            - name: VLLM_KV_EVENTS_BUFFER_STEPS
+              value: "10000"
+            # Performance tuning
+            - name: VLLM_KV_EVENTS_HWM
+              value: "100000"  # ZMQ high water mark
+          ports:
+            - containerPort: 8000
+              protocol: TCP
+              name: api
+            - containerPort: 5557
+              protocol: TCP
+              name: kv-events
+            - containerPort: 5558
+              protocol: TCP
+              name: kv-replay
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 3
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 5
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 30
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
diff --git a/samples/quickstart/model-with-kv-events.yaml b/samples/quickstart/model-with-kv-events.yaml
new file mode 100644
index 000000000..dbda468f2
--- /dev/null
+++ b/samples/quickstart/model-with-kv-events.yaml
@@ -0,0 +1,134 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+    model.aibrix.ai/port: "8000"
+    model.aibrix.ai/kv-events-enabled: "true"  # NEW: Enable KV events
+  name: deepseek-r1-distill-llama-8b
+  # Note: This sample uses 'default' namespace for quickstart purposes.
+  # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace>
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+        model.aibrix.ai/kv-events-enabled: "true"  # NEW: Required for discovery
+    spec:
+      containers:
+        - name: vllm-openai
+          image: vllm/vllm-openai:v0.7.1
+          command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"
+            - --uvicorn-log-level
+            - warning
+            - --model
+            - deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+            - --served-model-name
+            - deepseek-r1-distill-llama-8b
+            - --max-model-len
+            - "12288"
+            # NEW: KV event publishing configuration
+            - --enable-kv-cache-events
+            - --kv-events-publisher
+            - zmq
+            - --kv-events-endpoint
+            - "tcp://*:5557"
+            - --kv-events-replay-endpoint
+            - "tcp://*:5558"
+            - --kv-events-buffer-steps
+            - "10000"
+          ports:
+            - containerPort: 8000
+              protocol: TCP
+              name: api
+            # NEW: KV event ports
+            - containerPort: 5557
+              protocol: TCP
+              name: kv-events
+            - containerPort: 5558
+              protocol: TCP
+              name: kv-replay
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+          # Health checks remain the same
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 3
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 5
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            failureThreshold: 30
+            periodSeconds: 5
+            successThreshold: 1
+            timeoutSeconds: 1
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+    prometheus-discovery: "true"
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "8080"
+  name: deepseek-r1-distill-llama-8b
+  # Note: This sample uses 'default' namespace for quickstart purposes.
+  # To deploy to a different namespace, change this value or use: kubectl apply -f <file> -n <namespace>
+  namespace: default
+spec:
+  ports:
+    - name: serve
+      port: 8000
+      protocol: TCP
+      targetPort: 8000
+    - name: metrics
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+    # NEW: Expose KV event ports (optional, for debugging)
+    - name: kv-events
+      port: 5557
+      protocol: TCP
+      targetPort: 5557
+    - name: kv-replay
+      port: 5558
+      protocol: TCP
+      targetPort: 5558
+  selector:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+  type: ClusterIP