Deployment hardening for the tensorizer/vLLM example.

What this patch changes, by file:
  * .github/workflows/build-and-deploy.yml: action references move from tags
    to 40-character commit refs; kubectl v1.28.3 and Helm v3.12.3 are
    installed; the kubeconfig is decoded from the KUBECONFIG_DATA secret;
    the helm command is split across continuation lines.
  * docs/observability.md: adds a step about scraping the vLLM service on
    port 8000 and renumbers the following steps.
  * examples/tensorizer/serialize_and_load.py: serve_file passes the
    directory to the request handler via functools.partial instead of
    calling os.chdir.
  * gitops/argocd/app.yaml: targetRevision HEAD becomes main.
  * helm/tensorizer-vllm: image tag latest becomes 0.2.2; optional S3
    credential env vars are read from the secret named by s3.secretName.
  * k8s/knative-service.yaml: image tag latest becomes 0.2.2; S3 credential
    env vars are read from the s3-credentials secret.
  * k8s/networkpolicy.yaml, k8s/rbac.yaml: new manifests.

Review adjustments relative to the patch as first submitted:
  * the kubeconfig file is restricted to mode 600 after it is written;
  * the Helm secretKeyRef names are rendered through the quote function;
  * the NetworkPolicy egress also allows DNS (port 53) and HTTPS to
    addresses outside the cluster;
  * the RoleBinding ServiceAccount subject carries a namespace.
The index lines are carried over unchanged, so the post-image blob ids of
the four adjusted files are no longer exact.

diff --git a/.github/workflows/build-and-deploy.yml b/.github/workflows/build-and-deploy.yml
index f28c60b..38c933e 100644
--- a/.github/workflows/build-and-deploy.yml
+++ b/.github/workflows/build-and-deploy.yml
@@ -8,17 +8,30 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@b4ffde82f28162d9a60b3dfb39e4f2447a3b1c7f
       - name: Set up Docker
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@f3821f4794d9a373d160d5e86f922b622d21b008
+      - name: Set up kubectl
+        uses: azure/setup-kubectl@bd24c49a951a6eb2de75e8b4785905a9a05fd85e
+        with:
+          version: v1.28.3
+      - name: Set up Helm
+        uses: azure/setup-helm@e9a68a7554547a8bb11e9a56ce3cfd31c8eaa974
+        with:
+          version: v3.12.3
+      - name: Configure kubeconfig
+        run: |
+          mkdir -p ~/.kube
+          echo "${{ secrets.KUBECONFIG_DATA }}" | base64 -d > ~/.kube/config
+          chmod 600 ~/.kube/config
       - name: Build
         run: docker build -t example/vllm:${{ github.sha }} .
       - name: Scan
-        uses: aquasecurity/trivy-action@0.20.0
+        uses: aquasecurity/trivy-action@4d1a13b66e041b35769128a8b06845050806e06e
         with:
           image-ref: example/vllm:${{ github.sha }}
       - name: Login
-        uses: docker/login-action@v3
+        uses: docker/login-action@ee0af82ac35b689a7dce1aa3e9e4b0943aa53d25
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
@@ -26,4 +39,8 @@ jobs:
       - name: Push
         run: docker push example/vllm:${{ github.sha }}
       - name: Helm Upgrade
-        run: helm upgrade --install tensorizer helm/tensorizer-vllm --set image=example/vllm:${{ github.sha }} --namespace test --create-namespace
+        run: |
+          helm upgrade --install tensorizer helm/tensorizer-vllm \
+            --set image=example/vllm:${{ github.sha }} \
+            --namespace test \
+            --create-namespace
diff --git a/docs/observability.md b/docs/observability.md
index 8314661..93efe5f 100644
--- a/docs/observability.md
+++ b/docs/observability.md
@@ -15,8 +15,9 @@ invoking the model.
 
 1. The `kube-state-metrics` and `node-exporter` dashboards show cluster health.
 2. vLLM exports Prometheus metrics such as `vllm_engine_execution_time`.
-3. Logs are collected via Loki; search by `app=vllm`.
-4. For a local demo use the [`observability/` example](../examples/observability/grafana/README.md)
+3. Ensure Prometheus scrapes the vLLM service on port `8000` to populate Grafana.
+4. Logs are collected via Loki; search by `app=vllm`.
+5. For a local demo use the [`observability/` example](../examples/observability/grafana/README.md)
    which spins up Prometheus and Grafana with Docker Compose.
 
 Screenshots can be added to `docs/img/` for presentations.
diff --git a/examples/tensorizer/serialize_and_load.py b/examples/tensorizer/serialize_and_load.py
index 033c841..ddbfe55 100644
--- a/examples/tensorizer/serialize_and_load.py
+++ b/examples/tensorizer/serialize_and_load.py
@@ -7,6 +7,7 @@
 import argparse
 import os
 import threading
+from functools import partial
 from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
 
 import torch
@@ -30,8 +31,8 @@ def upload_to_s3(path: str, bucket: str, key: str) -> None:
 
 def serve_file(path: str, port: int) -> threading.Thread:
     directory = os.path.dirname(os.path.abspath(path))
-    os.chdir(directory)
-    server = ThreadingHTTPServer(("0.0.0.0", port), SimpleHTTPRequestHandler)
+    handler = partial(SimpleHTTPRequestHandler, directory=directory)
+    server = ThreadingHTTPServer(("0.0.0.0", port), handler)
     thread = threading.Thread(target=server.serve_forever, daemon=True)
     thread.start()
     return thread
diff --git a/gitops/argocd/app.yaml b/gitops/argocd/app.yaml
index 07a80df..2d64c48 100644
--- a/gitops/argocd/app.yaml
+++ b/gitops/argocd/app.yaml
@@ -9,7 +9,7 @@ spec:
   source:
     repoURL: https://github.com/coreweave/tensorizer
     path: helm/tensorizer-vllm
-    targetRevision: HEAD
+    targetRevision: main
   project: default
   syncPolicy:
     automated:
diff --git a/helm/tensorizer-vllm/templates/deployment.yaml b/helm/tensorizer-vllm/templates/deployment.yaml
index c68ed75..5bc95c8 100644
--- a/helm/tensorizer-vllm/templates/deployment.yaml
+++ b/helm/tensorizer-vllm/templates/deployment.yaml
@@ -18,3 +18,16 @@ spec:
           args: ["serve", "--model", "{{ .Values.modelURI }}", "--tensorizer"]
           ports:
             - containerPort: 8000
+          {{- if .Values.s3.secretName }}
+          env:
+            - name: AWS_ACCESS_KEY_ID
+              valueFrom:
+                secretKeyRef:
+                  name: {{ .Values.s3.secretName | quote }}
+                  key: accessKeyId
+            - name: AWS_SECRET_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: {{ .Values.s3.secretName | quote }}
+                  key: secretAccessKey
+          {{- end }}
diff --git a/helm/tensorizer-vllm/values.yaml b/helm/tensorizer-vllm/values.yaml
index bfe6722..38b8c37 100644
--- a/helm/tensorizer-vllm/values.yaml
+++ b/helm/tensorizer-vllm/values.yaml
@@ -1,3 +1,5 @@
-image: "vllm/vllm:latest"
+image: "vllm/vllm:0.2.2"
 modelURI: "s3://my-bucket/models/tiny-gpt2.tensors"
 host: "vllm.example.com"
+s3:
+  secretName: ""
diff --git a/k8s/knative-service.yaml b/k8s/knative-service.yaml
index 35c168c..9f3083b 100644
--- a/k8s/knative-service.yaml
+++ b/k8s/knative-service.yaml
@@ -9,7 +9,18 @@ spec:
         autoscaling.knative.dev/minScale: "0"
     spec:
       containers:
-        - image: vllm/vllm:latest
+        - image: vllm/vllm:0.2.2
           args: ["serve", "--model", "s3://my-bucket/models/tiny-gpt2.tensors", "--tensorizer"]
+          env:
+            - name: AWS_ACCESS_KEY_ID
+              valueFrom:
+                secretKeyRef:
+                  name: s3-credentials
+                  key: accessKeyId
+            - name: AWS_SECRET_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: s3-credentials
+                  key: secretAccessKey
           ports:
             - containerPort: 8000
diff --git a/k8s/networkpolicy.yaml b/k8s/networkpolicy.yaml
new file mode 100644
index 0000000..f38a7c1
--- /dev/null
+++ b/k8s/networkpolicy.yaml
@@ -0,0 +1,33 @@
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: tensorizer-allow
+spec:
+  podSelector:
+    matchLabels:
+      app: tensorizer
+  policyTypes:
+    - Ingress
+    - Egress
+  ingress:
+    - from:
+        - namespaceSelector: {}
+      ports:
+        - protocol: TCP
+          port: 8000
+  egress:
+    # DNS lookups; with Egress listed in policyTypes, port 53 is denied unless allowed here.
+    - ports:
+        - protocol: UDP
+          port: 53
+        - protocol: TCP
+          port: 53
+    # HTTPS to in-cluster pods and to addresses outside the cluster (a namespaceSelector
+    # only matches pods). NOTE(review): narrow the CIDR if the object store range is known.
+    - to:
+        - namespaceSelector: {}
+        - ipBlock:
+            cidr: 0.0.0.0/0
+      ports:
+        - protocol: TCP
+          port: 443
diff --git a/k8s/rbac.yaml b/k8s/rbac.yaml
new file mode 100644
index 0000000..97e9a61
--- /dev/null
+++ b/k8s/rbac.yaml
@@ -0,0 +1,28 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: tensorizer-sa
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: tensorizer-role
+rules:
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: tensorizer-rolebinding
+subjects:
+  - kind: ServiceAccount
+    name: tensorizer-sa
+    # ServiceAccount subjects must name a namespace. NOTE(review): `test` mirrors the
+    # workflow's --namespace; it must match where tensorizer-sa is actually applied.
+    namespace: test
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: tensorizer-role