diff --git a/inference/a4/llama3.1-8b-instruct/vllm/Chart.yaml b/inference/a4/llama3.1-8b-instruct/vllm/Chart.yaml
new file mode 100644
index 00000000..4344ef89
--- /dev/null
+++ b/inference/a4/llama3.1-8b-instruct/vllm/Chart.yaml
@@ -0,0 +1,6 @@
apiVersion: v2
name: a4x-benchmark
description: a4x-benchmark
type: application
version: 0.1.0
appVersion: "1.16.0"
diff --git a/inference/a4/llama3.1-8b-instruct/vllm/README.md b/inference/a4/llama3.1-8b-instruct/vllm/README.md
new file mode 100644
index 00000000..d041aa70
--- /dev/null
+++ b/inference/a4/llama3.1-8b-instruct/vllm/README.md
@@ -0,0 +1,92 @@
# Run vLLM Llama 3.1 8B Instruct on A4 on GKE

This recipe covers running a vLLM inference benchmark for Llama 3.1 8B
Instruct on A4 nodes on GKE.

## Create the GKE Cluster

Create your A4 cluster using [XPK](https://github.com/AI-Hypercomputer/xpk).
The next sections assume you have created a cluster with A4 nodes.

## Deploy vLLM Workload on GKE

### Configure kubectl to communicate with your cluster

```
gcloud container clusters get-credentials ${CLUSTER_NAME} --location=${LOCATION}
```

### Generate a new Hugging Face token if you don't already have one

On the Hugging Face website, create an account if necessary, and go to Your
Profile > Settings > Access Tokens. Select "Create new token", specify a name
of your choice and a role with at least Read permissions, then select
"Generate a token", follow the prompts, and save your token.

Note: also ensure that your account has access to the model on Hugging Face.
Gated models such as Llama 3.1 require you to explicitly request access on
the model page.

### Run the benchmark

In this directory, run:

```
helm install ${RUN_NAME} . --set hf_token=${HF_TOKEN}
```

The benchmark launches a server pod and a client pod.

On the server pod, at the end of server startup you'll see logs such as:

```
$ kubectl logs -f deployment/${RUN_NAME}-gpu-server

(APIServer pid=1) INFO: Started server process [1]
(APIServer pid=1) INFO: Waiting for application startup.
(APIServer pid=1) INFO: Application startup complete.
```

The client pod waits until the server is up and then starts the benchmark.
On the client pod, you'll see logs such as:

```
$ kubectl logs -f ${RUN_NAME}-client

============ Serving Benchmark Result ============
Successful requests: 10
Failed requests: 0
Benchmark duration (s): xx
Total input tokens: xxx
Total generated tokens: xxx
Request throughput (req/s): xx
Output token throughput (tok/s): xxx
Peak output token throughput (tok/s): xxx
Peak concurrent requests: 10.00
Total Token throughput (tok/s): xxx
---------------Time to First Token----------------
Mean TTFT (ms): xxx
Median TTFT (ms): xxx
P99 TTFT (ms): xxx
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): xxx
Median TPOT (ms): xxx
P99 TPOT (ms): xxx
---------------Inter-token Latency----------------
Mean ITL (ms): xxx
Median ITL (ms): xxx
P99 ITL (ms): xxx
==================================================
```

### Customizing the benchmark

To change the parameters of the benchmark, such as the model or the number of
prompts, modify the settings in `values.yaml`.

### Cleanup

The server keeps running after the benchmark completes. When you are done,
clean up by running:

```
helm uninstall ${RUN_NAME}
```
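The README commands assume a few shell variables are already set. A minimal sketch with placeholder values (the cluster name, location, and run name below are illustrative, not defaults):

```bash
# Illustrative values only -- substitute your own cluster, region, and token.
export CLUSTER_NAME=my-a4-cluster   # name used when creating the cluster with XPK
export LOCATION=us-central1         # region or zone of the cluster
export RUN_NAME=llama31-bench       # Helm release name; prefixes all created resources
export HF_TOKEN=hf_...              # Hugging Face access token with Read permission
```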
diff --git a/inference/a4/llama3.1-8b-instruct/vllm/templates/benchmark.yaml b/inference/a4/llama3.1-8b-instruct/vllm/templates/benchmark.yaml
new file mode 100644
index 00000000..0f526143
--- /dev/null
+++ b/inference/a4/llama3.1-8b-instruct/vllm/templates/benchmark.yaml
@@ -0,0 +1,167 @@
# yamllint disable
{{- if .Values.is_multi_host }}
{{- fail "Multi-host for GPU is not supported in this template." }}
{{- end }}
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: "{{ .Release.Name }}-hyperdisk-balanced"
provisioner: pd.csi.storage.gke.io
parameters:
  type: hyperdisk-balanced
reclaimPolicy: Delete
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{ .Release.Name }}-hd-claim"
spec:
  storageClassName: "{{ .Release.Name }}-hyperdisk-balanced"
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{.Values.server.data_disk_size}}
---
apiVersion: v1
kind: Secret
metadata:
  name: "{{ .Release.Name }}-hf-secret"
type: Opaque
stringData:
  hf_api_token: {{.Values.hf_token}}
---
{{- if .Values.gcp_service_account }}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ .Values.k8s_service_account | quote }}
  namespace: default
  annotations:
    iam.gke.io/gcp-service-account: {{ .Values.gcp_service_account }}
{{- end }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{ .Release.Name }}-gpu-server"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: "{{ .Release.Name }}-gpu-pod"
  template:
    metadata:
      labels:
        app: "{{ .Release.Name }}-gpu-pod"
        {{- if .Values.kueue_local_queue }}
        kueue.x-k8s.io/queue-name: {{ .Values.kueue_local_queue | quote }}
        {{- end }}
    spec:
      {{- if .Values.gcp_service_account }}
      serviceAccountName: {{ .Values.k8s_service_account | quote }}
      {{- end }}
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-b200
      tolerations:
        - operator: "Exists"
          key: nvidia.com/gpu
        - operator: "Exists"
          key: cloud.google.com/impending-node-termination
        - key: "kubernetes.io/arch"
          operator: "Equal"
          value: "arm64"
          effect: "NoSchedule"
      containers:
        - name: vllm-gpu
          image: {{.Values.server.image}}
          command: ["/bin/bash", "-c"]
          args:
            - {{ .Values.server.bash_command | quote }}
          env:
            - name: LD_LIBRARY_PATH
              value: /usr/local/gib/lib64:/usr/local/nvidia/lib64
            - name: NCCL_DEBUG
              value: INFO
            - name: NCCL_IGNORE_DISABLED_P2P
              value: "1"
            - name: HF_HOME
              value: /data
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: "{{ .Release.Name }}-hf-secret"
                  key: hf_api_token
          ports:
            - containerPort: 8000
          resources:
            limits:
              nvidia.com/gpu: "{{.Values.server.num_chips_per_node}}"
            requests:
              nvidia.com/gpu: "{{.Values.server.num_chips_per_node}}"
          readinessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 15
            periodSeconds: 10
          volumeMounts:
            - mountPath: "/data"
              name: data-volume
            - mountPath: /dev/shm
              name: dshm
      volumes:
        - emptyDir:
            medium: Memory
          name: dshm
        - name: data-volume
          persistentVolumeClaim:
            claimName: "{{ .Release.Name }}-hd-claim"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{ .Release.Name }}-vllm-service"
spec:
  selector:
    app: "{{ .Release.Name }}-gpu-pod"
  type: LoadBalancer
  ports:
    - name: http
      protocol: TCP
      port: 8000
      targetPort: 8000
---
apiVersion: v1
kind: Pod
metadata:
  name: "{{ .Release.Name }}-client"
  labels:
    {{- if .Values.kueue_local_queue }}
    kueue.x-k8s.io/queue-name: {{ .Values.kueue_local_queue | quote }}
    {{- end }}
spec:
  {{- if .Values.gcp_service_account }}
  serviceAccountName: {{ .Values.k8s_service_account | quote }}
  {{- end }}
  tolerations:
    - operator: "Exists"
      key: nvidia.com/gpu
    - operator: "Exists"
      key: cloud.google.com/impending-node-termination
  terminationGracePeriodSeconds: 60
  containers:
    - name: vllm-bench
      image: {{.Values.client.image}}
      command: ["/bin/bash", "-c"]
      args: ["{{.Values.client.bash_command}}"]
      env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: "{{ .Release.Name }}-hf-secret"
        - name: SERVER_HOSTNAME
          value: "{{ .Release.Name }}-vllm-service.default.svc.cluster.local"
  restartPolicy: Never
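The template wires the client to the server through the `{{ .Release.Name }}-vllm-service` Service: the client polls the server over the Service's cluster DNS name (injected as `SERVER_HOSTNAME`) before starting. A quick way to verify both sides are up, using standard kubectl commands against the resource names the template creates:

```bash
# Block until the server Deployment reports an available replica
# (model download plus startup can take a while on first run).
kubectl wait --for=condition=Available deployment/${RUN_NAME}-gpu-server --timeout=30m

# Confirm the Service and the client pod exist.
kubectl get service ${RUN_NAME}-vllm-service
kubectl get pod ${RUN_NAME}-client
```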
diff --git a/inference/a4/llama3.1-8b-instruct/vllm/values.yaml b/inference/a4/llama3.1-8b-instruct/vllm/values.yaml
new file mode 100644
index 00000000..74fe9f51
--- /dev/null
+++ b/inference/a4/llama3.1-8b-instruct/vllm/values.yaml
@@ -0,0 +1,54 @@
"client":
  "bash_command": |-
    while ! curl http://${SERVER_HOSTNAME}:8000/ping; do sleep 30 && echo 'Waiting for server...'; done

    vllm bench serve \
      --dataset-name=random \
      --random-input-len=1024 \
      --random-output-len=128 \
      --host=${SERVER_HOSTNAME} \
      --port=8000 \
      --model=meta-llama/Llama-3.1-8B-Instruct \
      --tokenizer=meta-llama/Llama-3.1-8B-Instruct \
      --num-prompts=100
  "client_type": |-
    VLLM_BENCH
  "image": |-
    vllm/vllm-openai:latest
"gcp_service_account": !!null |-
  null
"hf_token": !!null |-
  null
"is_multi_host": !!bool |-
  false
"kueue_local_queue": !!null |-
  null
"kueue_priority_class": |-
  medium
"server":
  "bash_command": |-
    VLLM_CUSTOM_ALL_REDUCE=0 NCCL_DEBUG=INFO python3 -m vllm.entrypoints.openai.api_server \
      --host=0.0.0.0 \
      --port=8000 \
      --download-dir=/data \
      --tensor-parallel-size=8 \
      --max-model-len=8192 \
      --gpu-memory-utilization=0.95 \
      --max-num-batched-tokens=8192 \
      --max-num-seqs=256 \
      --model=meta-llama/Llama-3.1-8B-Instruct \
      --tokenizer=meta-llama/Llama-3.1-8B-Instruct \
      --no-enable-prefix-caching \
      --enforce-eager \
      --disable-custom-all-reduce
  "data_disk_size": |-
    100Gi
  "image": |-
    vllm/vllm-openai:latest
  "num_chips_per_node": !!int |-
    8
  "num_nodes": !!int |-
    1
  "profile": !!bool |-
    false
  "worker_bash_command": !!null |-
    null
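Scalar settings in values.yaml can also be overridden at install time rather than by editing the file; the multi-line `bash_command` scripts are easier to change in the file itself. A sketch using keys defined above (the queue name is illustrative and assumes Kueue is installed on the cluster):

```bash
# Override the data disk size and Kueue queue at install time
# using keys from values.yaml.
helm install ${RUN_NAME} . \
  --set hf_token=${HF_TOKEN} \
  --set server.data_disk_size=200Gi \
  --set kueue_local_queue=my-local-queue
```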