diff --git a/inference/a4/llama3.1-8b-instruct/vllm/Chart.yaml b/inference/a4/llama3.1-8b-instruct/vllm/Chart.yaml
new file mode 100644
index 00000000..4344ef89
--- /dev/null
+++ b/inference/a4/llama3.1-8b-instruct/vllm/Chart.yaml
@@ -0,0 +1,6 @@
apiVersion: v2
name: a4x-benchmark
description: a4x-benchmark
type: application
version: 0.1.0
appVersion: "1.16.0"
diff --git a/inference/a4/llama3.1-8b-instruct/vllm/README.md b/inference/a4/llama3.1-8b-instruct/vllm/README.md
new file mode 100644
index 00000000..d041aa70
--- /dev/null
+++ b/inference/a4/llama3.1-8b-instruct/vllm/README.md
@@ -0,0 +1,92 @@
# Run vLLM Llama 3.1 8B Instruct on A4 on GKE

This recipe covers running a vLLM inference benchmark for Llama 3.1 8B
Instruct on A4 nodes on GKE.

## Create the GKE Cluster

Create your A4 cluster using [XPK](https://github.com/AI-Hypercomputer/xpk).
The next sections assume you have created a cluster with A4 nodes.

## Deploy vLLM Workload on GKE

### Configure kubectl to communicate with your cluster

```
gcloud container clusters get-credentials ${CLUSTER_NAME} --location=${LOCATION}
```

### Generate a new Hugging Face token if you don't already have one

On the Hugging Face website, create an account if necessary, and go to Your
Profile > Settings > Access Tokens. Select "Create new token", specify a name
of your choice and a role with at least Read permissions, then select
"Generate a token", follow the prompts, and save your token.

Note: also ensure that your account has access to the model on Hugging Face.
Gated models such as Llama 3.1 require you to explicitly request access on
the model page.

### Run the benchmark

In this directory, run:

```
helm install ${RUN_NAME} . --set hf_token=${HF_TOKEN}
```

The benchmark launches a server pod and a client pod.

On the server pod, at the end of server startup you'll see logs such as:

```
$ kubectl logs -f deployment/${RUN_NAME}-gpu-server

(APIServer pid=1) INFO: Started server process [1]
(APIServer pid=1) INFO: Waiting for application startup.
(APIServer pid=1) INFO: Application startup complete.
```

The client pod waits until the server is up and then starts the benchmark.
On the client pod, you'll see logs such as:

```
$ kubectl logs -f ${RUN_NAME}-client

============ Serving Benchmark Result ============
Successful requests: 10
Failed requests: 0
Benchmark duration (s): xx
Total input tokens: xxx
Total generated tokens: xxx
Request throughput (req/s): xx
Output token throughput (tok/s): xxx
Peak output token throughput (tok/s): xxx
Peak concurrent requests: 10.00
Total Token throughput (tok/s): xxx
---------------Time to First Token----------------
Mean TTFT (ms): xxx
Median TTFT (ms): xxx
P99 TTFT (ms): xxx
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): xxx
Median TPOT (ms): xxx
P99 TPOT (ms): xxx
---------------Inter-token Latency----------------
Mean ITL (ms): xxx
Median ITL (ms): xxx
P99 ITL (ms): xxx
==================================================
```

### Customizing the benchmark

To change the parameters of the benchmark, such as the model or the number of
prompts, modify the settings in `values.yaml`.

### Cleanup

The server keeps running after the benchmark completes. When you are done,
clean up by running:

```
helm uninstall ${RUN_NAME}
```
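The README commands assume a few shell variables are already set. A minimal sketch with placeholder values (the cluster name, location, and run name below are illustrative, not defaults):

```bash
# Illustrative values only -- substitute your own cluster, region, and token.
export CLUSTER_NAME=my-a4-cluster   # name used when creating the cluster with XPK
export LOCATION=us-central1         # region or zone of the cluster
export RUN_NAME=llama31-bench       # Helm release name; prefixes all created resources
export HF_TOKEN=hf_...              # Hugging Face access token with Read permission
```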
diff --git a/inference/a4/llama3.1-8b-instruct/vllm/templates/benchmark.yaml b/inference/a4/llama3.1-8b-instruct/vllm/templates/benchmark.yaml
new file mode 100644
index 00000000..0f526143
--- /dev/null
+++ b/inference/a4/llama3.1-8b-instruct/vllm/templates/benchmark.yaml
@@ -0,0 +1,167 @@
# yamllint disable
{{- if .Values.is_multi_host }}
{{- fail "Multi-host for GPU is not supported in this template." }}
{{- end }}
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: "{{ .Release.Name }}-hyperdisk-balanced"
provisioner: pd.csi.storage.gke.io
parameters:
  type: hyperdisk-balanced
reclaimPolicy: Delete
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{ .Release.Name }}-hd-claim"
spec:
  storageClassName: "{{ .Release.Name }}-hyperdisk-balanced"
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{.Values.server.data_disk_size}}
---
apiVersion: v1
kind: Secret
metadata:
  name: "{{ .Release.Name }}-hf-secret"
type: Opaque
stringData:
  hf_api_token: {{.Values.hf_token}}
---
{{- if .Values.gcp_service_account }}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ .Values.k8s_service_account | quote }}
  namespace: default
  annotations:
    iam.gke.io/gcp-service-account: {{ .Values.gcp_service_account }}
{{- end }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{ .Release.Name }}-gpu-server"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: "{{ .Release.Name }}-gpu-pod"
  template:
    metadata:
      labels:
        app: "{{ .Release.Name }}-gpu-pod"
        {{- if .Values.kueue_local_queue }}
        kueue.x-k8s.io/queue-name: {{ .Values.kueue_local_queue | quote }}
        {{- end }}
    spec:
      {{- if .Values.gcp_service_account }}
      serviceAccountName: {{ .Values.k8s_service_account | quote }}
      {{- end }}
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-b200
      tolerations:
        - operator: "Exists"
          key: nvidia.com/gpu
        - operator: "Exists"
          key: cloud.google.com/impending-node-termination
        - key: "kubernetes.io/arch"
          operator: "Equal"
          value: "arm64"
          effect: "NoSchedule"
      containers:
        - name: vllm-gpu
          image: {{.Values.server.image}}
          command: ["/bin/bash", "-c"]
          args:
            - {{ .Values.server.bash_command | quote }}
          env:
            - name: LD_LIBRARY_PATH
              value: /usr/local/gib/lib64:/usr/local/nvidia/lib64
            - name: NCCL_DEBUG
              value: INFO
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
            - name: HF_HOME
              value: /data
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: "{{ .Release.Name }}-hf-secret"
                  key: hf_api_token
          ports:
            - containerPort: 8000
          resources:
            limits:
              nvidia.com/gpu: "{{.Values.server.num_chips_per_node}}"
            requests:
              nvidia.com/gpu: "{{.Values.server.num_chips_per_node}}"
          readinessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 15
            periodSeconds: 10
          volumeMounts:
            - mountPath: "/data"
              name: data-volume
            - mountPath: /dev/shm
              name: dshm
      volumes:
        - emptyDir:
            medium: Memory
          name: dshm
        - name: data-volume
          persistentVolumeClaim:
            claimName: "{{ .Release.Name }}-hd-claim"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{ .Release.Name }}-vllm-service"
spec:
  selector:
    app: "{{ .Release.Name }}-gpu-pod"
  type: LoadBalancer
  ports:
    - name: http
      protocol: TCP
      port: 8000
      targetPort: 8000
---
apiVersion: v1
kind: Pod
metadata:
  name: "{{ .Release.Name }}-client"
  labels:
    {{- if .Values.kueue_local_queue }}
    kueue.x-k8s.io/queue-name: {{ .Values.kueue_local_queue | quote }}
    {{- end }}
spec:
  {{- if .Values.gcp_service_account }}
  serviceAccountName: {{ .Values.k8s_service_account | quote }}
  {{- end }}
  tolerations:
    - operator: "Exists"
      key: nvidia.com/gpu
    - operator: "Exists"
      key: cloud.google.com/impending-node-termination
  terminationGracePeriodSeconds: 60
  containers:
    - name: vllm-bench
      image: {{.Values.client.image}}
      command: ["/bin/bash", "-c"]
      args: ["{{.Values.client.bash_command}}"]
      env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: "{{ .Release.Name }}-hf-secret"
        - name: SERVER_HOSTNAME
          value: "{{ .Release.Name }}-vllm-service.default.svc.cluster.local"
  restartPolicy: Never
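The template wires the client to the server through the `{{ .Release.Name }}-vllm-service` Service: the client polls the server over the Service's cluster DNS name (injected as `SERVER_HOSTNAME`) before starting. A quick way to verify both sides are up, using standard kubectl commands against the resource names the template creates:

```bash
# Block until the server Deployment reports an available replica
# (model download plus startup can take a while on first run).
kubectl wait --for=condition=Available deployment/${RUN_NAME}-gpu-server --timeout=30m

# Confirm the Service and the client pod exist.
kubectl get service ${RUN_NAME}-vllm-service
kubectl get pod ${RUN_NAME}-client
```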
diff --git a/inference/a4/llama3.1-8b-instruct/vllm/values.yaml b/inference/a4/llama3.1-8b-instruct/vllm/values.yaml
new file mode 100644
index 00000000..74fe9f51
--- /dev/null
+++ b/inference/a4/llama3.1-8b-instruct/vllm/values.yaml
@@ -0,0 +1,54 @@
"client":
  "bash_command": |-
    while ! curl http://${SERVER_HOSTNAME}:8000/ping; do sleep 30 && echo 'Waiting for server...'; done

    vllm bench serve \
      --dataset-name=random \
      --random-input-len=1024 \
      --random-output-len=128 \
      --host=${SERVER_HOSTNAME} \
      --port=8000 \
      --model=meta-llama/Llama-3.1-8B-Instruct \
      --tokenizer=meta-llama/Llama-3.1-8B-Instruct \
      --num-prompts=100
  "client_type": |-
    VLLM_BENCH
  "image": |-
    vllm/vllm-openai:latest
"gcp_service_account": !!null |-
  null
"hf_token": !!null |-
  null
"is_multi_host": !!bool |-
  false
"kueue_local_queue": !!null |-
  null
"kueue_priority_class": |-
  medium
"server":
  "bash_command": |-
    VLLM_CUSTOM_ALL_REDUCE=0 NCCL_DEBUG=INFO python3 -m vllm.entrypoints.openai.api_server \
      --host=0.0.0.0 \
      --port=8000 \
      --download-dir=/data \
      --tensor-parallel-size=8 \
      --max-model-len=8192 \
      --gpu-memory-utilization=0.95 \
      --max-num-batched-tokens=8192 \
      --max-num-seqs=256 \
      --model=meta-llama/Llama-3.1-8B-Instruct \
      --tokenizer=meta-llama/Llama-3.1-8B-Instruct \
      --no-enable-prefix-caching \
      --enforce-eager \
      --disable-custom-all-reduce
  "data_disk_size": |-
    100Gi
  "image": |-
    vllm/vllm-openai:latest
  "num_chips_per_node": !!int |-
    8
  "num_nodes": !!int |-
    1
  "profile": !!bool |-
    false
  "worker_bash_command": !!null |-
    null
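Scalar settings in values.yaml can also be overridden at install time rather than by editing the file; the multi-line `bash_command` scripts are easier to change in the file itself. A sketch using keys defined above (the queue name is illustrative and assumes Kueue is installed on the cluster):

```bash
# Override the data disk size and Kueue queue at install time
# using keys from values.yaml.
helm install ${RUN_NAME} . \
  --set hf_token=${HF_TOKEN} \
  --set server.data_disk_size=200Gi \
  --set kueue_local_queue=my-local-queue
```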