From 1f6051ef321886dc84029253ff1a9e642653725e Mon Sep 17 00:00:00 2001 From: Pawel Date: Wed, 11 Feb 2026 11:43:25 +0100 Subject: [PATCH 01/11] save --- .../long_context/README.md | 70 +++++++++++++++---- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index 41b75b062e..1660053a88 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -28,12 +28,19 @@ Export the model Qwen/Qwen2.5-7B-Instruct-1M which has the max context length of curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt mkdir models -python export_model.py text_generation --source_model Qwen/Qwen2.5-7B-Instruct-1M --weight-format int4 --config_file_path models/config.json --model_repository_path models +python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config.json --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss +curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/4/extras/chat_template_examples/chat_template_gpt_oss_multiturn.jinja ``` Start OVMS: ```bash -docker run -it --rm -u $(id -u) -p 8000:8000 -v $(pwd)/models/:/models:rw openvino/model_server:latest --rest_port 8000 --source_model Qwen/Qwen2.5-7B-Instruct-1M --model_repository_path /models --task text_generation --enable_prefix_caching true --kv_cache_precision u8 --target_device CPU +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly --rest_port 8000 --source_model openai/gpt-oss-20b 
--model_repository_path models --tool_parser gptoss --task text_generation --cache_dir /models/.cache --enable_prefix_caching true +``` + +```bash +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +--rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models \ +--tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation --enable_prefix_caching true ``` ## Dataset for experiments @@ -55,19 +62,56 @@ Let's check the performance git clone --branch v0.9.1 --depth 1 https://github.com/vllm-project/vllm cd vllm pip3 install -r requirements/cpu.txt . --extra-index-url https://download.pytorch.org/whl/cpu -python benchmarks/benchmark_serving.py --host localhost --port 8000 --endpoint /v3/chat/completions --backend openai-chat --model Qwen/Qwen2.5-7B-Instruct-1M --dataset-name custom --dataset-path ../dataset.jsonl --num-prompts 10 --max-concurrency 1 --custom-output-len 50 +python benchmarks/benchmark_serving.py --host localhost --port 8000 --endpoint /v3/chat/completions --backend openai-chat --model openai/gpt-oss-20b --dataset-name custom --dataset-path ../dataset.jsonl --num-prompts 10 --max-concurrency 1 --custom-output-len 50 +``` + +# GPU +``` +============ Serving Benchmark Result ============ +Successful requests: 10 +Benchmark duration (s): 33.49 +Total input tokens: 49774 +Total generated tokens: 500 +Request throughput (req/s): 0.30 +Output token throughput (tok/s): 14.93 +Total Token throughput (tok/s): 1501.34 +---------------Time to First Token---------------- +Mean TTFT (ms): 126.35 +Median TTFT (ms): 125.42 +P99 TTFT (ms): 135.13 +-----Time per Output Token (excl. 
1st token)------ +Mean TPOT (ms): 65.75 +Median TPOT (ms): 65.69 +P99 TPOT (ms): 66.04 +---------------Inter-token Latency---------------- +Mean ITL (ms): 87.07 +Median ITL (ms): 65.98 +P99 ITL (ms): 199.35 +================================================== +``` +# CPU +``` ============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 31.44 -Total input tokens: 500414 -Total generated tokens: 500 -Request throughput (req/s): 0.32 -Output token throughput (tok/s): 15.91 -Total Token throughput (tok/s): 15934.81 +Successful requests: 10 +Benchmark duration (s): 29.54 +Total input tokens: 49774 +Total generated tokens: 500 +Request throughput (req/s): 0.34 +Output token throughput (tok/s): 16.92 +Total Token throughput (tok/s): 1701.74 ---------------Time to First Token---------------- -Mean TTFT (ms): 1551.46 -Median TTFT (ms): 518.46 -P99 TTFT (ms): 3260.48 +Mean TTFT (ms): 173.65 +Median TTFT (ms): 171.61 +P99 TTFT (ms): 190.84 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 56.74 +Median TPOT (ms): 56.82 +P99 TPOT (ms): 56.87 +---------------Inter-token Latency---------------- +Mean ITL (ms): 75.14 +Median ITL (ms): 56.81 +P99 ITL (ms): 171.99 +================================================== ``` The results shown above, despite very long context, have much lower TTFT latency with prefix caching. As long as the beginning of the request prompt is reused, KV cache can be also reused to speed up prompt processing. 
From 80e97a91429d95e33f37e37d3a539e1545afbc6e Mon Sep 17 00:00:00 2001 From: Pawel Date: Wed, 11 Feb 2026 15:41:11 +0100 Subject: [PATCH 02/11] save --- .../long_context/README.md | 69 +++++++++++-------- .../long_context/custom_dataset.py | 2 +- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index 1660053a88..b925ee060b 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -22,7 +22,7 @@ Compression reduces this memory usage, enabling longer prompts or more parallel Let's demonstrate all the optimizations combined and test it with the real life scenario of sending multiple various questions in the same context. It will illustrate the gain from the prefix caching on the first token latency, improved second token latency thanks to prompt lookup and moderate memory consumption despite very long prompts and parallel execution. -Export the model Qwen/Qwen2.5-7B-Instruct-1M which has the max context length of 1 million tokens! +Export the model openai/gpt-oss-20b which has the max context length of 131k tokens. 
```bash curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py @@ -33,15 +33,23 @@ curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubuserc ``` Start OVMS: + +::::{tab-set} +:::{tab-item} CPU +:sync: CPU ```bash docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --task text_generation --cache_dir /models/.cache --enable_prefix_caching true ``` - +::: +:::{tab-item} GPU +:sync: GPU ```bash docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models \ --tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation --enable_prefix_caching true ``` +::: +:::: ## Dataset for experiments @@ -65,31 +73,10 @@ pip3 install -r requirements/cpu.txt . --extra-index-url https://download.pytorc python benchmarks/benchmark_serving.py --host localhost --port 8000 --endpoint /v3/chat/completions --backend openai-chat --model openai/gpt-oss-20b --dataset-name custom --dataset-path ../dataset.jsonl --num-prompts 10 --max-concurrency 1 --custom-output-len 50 ``` -# GPU -``` -============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 33.49 -Total input tokens: 49774 -Total generated tokens: 500 -Request throughput (req/s): 0.30 -Output token throughput (tok/s): 14.93 -Total Token throughput (tok/s): 1501.34 ----------------Time to First Token---------------- -Mean TTFT (ms): 126.35 -Median TTFT (ms): 125.42 -P99 TTFT (ms): 135.13 ------Time per Output Token (excl. 
1st token)------ -Mean TPOT (ms): 65.75 -Median TPOT (ms): 65.69 -P99 TPOT (ms): 66.04 ----------------Inter-token Latency---------------- -Mean ITL (ms): 87.07 -Median ITL (ms): 65.98 -P99 ITL (ms): 199.35 -================================================== -``` -# CPU + +::::{tab-set} +:::{tab-item} CPU +:sync: CPU ``` ============ Serving Benchmark Result ============ Successful requests: 10 @@ -113,6 +100,34 @@ Median ITL (ms): 56.81 P99 ITL (ms): 171.99 ================================================== ``` +::: +:::{tab-item} GPU +:sync: GPU +``` +============ Serving Benchmark Result ============ +Successful requests: 10 +Benchmark duration (s): 33.49 +Total input tokens: 49774 +Total generated tokens: 500 +Request throughput (req/s): 0.30 +Output token throughput (tok/s): 14.93 +Total Token throughput (tok/s): 1501.34 +---------------Time to First Token---------------- +Mean TTFT (ms): 126.35 +Median TTFT (ms): 125.42 +P99 TTFT (ms): 135.13 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 65.75 +Median TPOT (ms): 65.69 +P99 TPOT (ms): 66.04 +---------------Inter-token Latency---------------- +Mean ITL (ms): 87.07 +Median ITL (ms): 65.98 +P99 ITL (ms): 199.35 +================================================== +``` +::: +:::: The results shown above, despite very long context, have much lower TTFT latency with prefix caching. As long as the beginning of the request prompt is reused, KV cache can be also reused to speed up prompt processing. 
diff --git a/demos/continuous_batching/long_context/custom_dataset.py b/demos/continuous_batching/long_context/custom_dataset.py index bc166d8978..bf9ecc1b1e 100644 --- a/demos/continuous_batching/long_context/custom_dataset.py +++ b/demos/continuous_batching/long_context/custom_dataset.py @@ -50,7 +50,7 @@ def download_file(url): parser = argparse.ArgumentParser(description="Generate a dataset of long context examples.") parser.add_argument("--file_url", type=str, default="https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/2011/donquix-2011.txt", help="URL of the file to download") -parser.add_argument("--model_name", type=str, default="Qwen/Qwen2.5-7B-Instruct-1M", help="Model name for the tokenizer") +parser.add_argument("--model_name", type=str, default="openai/gpt-oss-20b", help="Model name for the tokenizer") parser.add_argument("--limit_context_tokens", type=int, default=50000, help="Maximum number of tokens to use for the context") args = parser.parse_args() From 7b6c4595ea1ef821592709cf23e0da35752d4d08 Mon Sep 17 00:00:00 2001 From: Pawel Date: Wed, 18 Feb 2026 13:57:43 +0100 Subject: [PATCH 03/11] save --- .../long_context/README.md | 56 +++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index b925ee060b..7c090ffe18 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -106,24 +106,24 @@ P99 ITL (ms): 171.99 ``` ============ Serving Benchmark Result ============ Successful requests: 10 -Benchmark duration (s): 33.49 +Benchmark duration (s): 26.56 Total input tokens: 49774 Total generated tokens: 500 -Request throughput (req/s): 0.30 -Output token throughput (tok/s): 14.93 -Total Token throughput (tok/s): 1501.34 +Request throughput (req/s): 0.38 +Output token throughput (tok/s): 18.82 +Total Token throughput (tok/s): 1892.75 
---------------Time to First Token---------------- -Mean TTFT (ms): 126.35 -Median TTFT (ms): 125.42 -P99 TTFT (ms): 135.13 +Mean TTFT (ms): 101.70 +Median TTFT (ms): 101.62 +P99 TTFT (ms): 102.47 -----Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 65.75 -Median TPOT (ms): 65.69 -P99 TPOT (ms): 66.04 +Mean TPOT (ms): 52.12 +Median TPOT (ms): 51.96 +P99 TPOT (ms): 53.55 ---------------Inter-token Latency---------------- -Mean ITL (ms): 87.07 -Median ITL (ms): 65.98 -P99 ITL (ms): 199.35 +Mean ITL (ms): 69.02 +Median ITL (ms): 52.13 +P99 ITL (ms): 160.02 ================================================== ``` ::: @@ -133,17 +133,29 @@ The results shown above, despite very long context, have much lower TTFT latency ## Performance Comparison Table -| Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | KV Cache Usage (GB) | -|------------------------|----------------------|--------------------------|---------------------| -| 1,000 | 785 | 141 | 0.1 | -| 5,000 | 4160 | 172 | 0.2 | -| 10,000 | 9570 | 217 | 0.4 | -| 50,000 | 152,589 | 795 | 1.5 | -| 100,000 | 624,713 | 1097 | 3.1 | -| 200,000 | | 5406 | 6.2 | +::::{tab-set} +:::{tab-item} CPU +| Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | +|------------------------|------------------|---------------------| +| 10,000 | 176.89 | 170.42 | +| 50,000 | 177.75 | 171.19 | +| 100,000 | 179.16 | 172.79 | +| 200,000 | 181.29 | 175.05 | + +::: +:::{tab-item} GPU +:sync: GPU +| Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | +|------------------------|-------------------|-------------------------| +| 10,000 | 101.82 | 101.47 | +| 50,000 | 103.93 | 101.98 | +| 100,000 | 105.23 | 104.67 | +| 200,000 | 127.61 | 111.49 | + +::: +:::: The results show that the cache usage grows linearly with the context length. -First token generation without prefix caching is growing significantly with the prompt size. 
Prefix caching is very effective in reducing the first token generation making the long context calls practical even on slower HW. ## Testing accuracy From 9a3f16e19a5436b9676d8693252ece2701ba909a Mon Sep 17 00:00:00 2001 From: Pawel Date: Thu, 19 Feb 2026 12:04:21 +0100 Subject: [PATCH 04/11] save --- .../long_context/README.md | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index 7c090ffe18..a182912af5 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -172,11 +172,25 @@ Such experiment can confirm the impact on accuracy from the model quantization a ## Cache Precision Comparison -| Cache Precision | Plugin Config | Accuracy (longbench_gov_report, concurrency 50) | Max Cache Usage (GB) | Duration (s for 100 requests) | -|-----------------|--------------|-----------------------------------------------|----------------------|-------------------------------| -| INT8 | "KV_CACHE_PRECISION":"u8" | 0.3374 | 11 | 41m6.993s | -| BF16 | "KV_CACHE_PRECISION":"bf16" | 0.3297 | 20 | 40m15.359s | -| FP32 | "KV_CACHE_PRECISION":"FP32","EXECUTION_MODE_HINT": "ACCURACY" | 0.331 | 37 | 105m15.876s | +::::{tab-set} +:::{tab-item} CPU +| Cache Precision | Plugin Config | Accuracy (longbench_gov_report, concurrency 50) | Duration (s for 100 requests) | +|-----------------|--------------|-----------------------------------------------|-------------------------------| +| INT8 | "KV_CACHE_PRECISION":"u8" | | | +| BF16 | "KV_CACHE_PRECISION":"bf16" | | | +| FP32 | "KV_CACHE_PRECISION":"FP32","EXECUTION_MODE_HINT": "ACCURACY" | | | + +::: +:::{tab-item} GPU +:sync: GPU +| Cache Precision | Plugin Config | Accuracy (longbench_gov_report, concurrency 50) | Duration (s for 100 requests) | 
+|-----------------|--------------|-----------------------------------------------|-------------------------------| +| INT8 | "KV_CACHE_PRECISION":"u8" | | | +| BF16 | "KV_CACHE_PRECISION":"bf16" | | | +| FP32 | "KV_CACHE_PRECISION":"FP32","EXECUTION_MODE_HINT": "ACCURACY" | | | + +::: +:::: The results in an experiment captured on Xeon Gen4 server show that KV cache compression has minimal impact on accuracy and significantly reduces memory consumption. Slower execution with FP32 precision is a result of disabled AMX acceleration. From 6c7de302cb40d2fed601292ef5a5d8253422583b Mon Sep 17 00:00:00 2001 From: Pawel Date: Wed, 25 Feb 2026 08:52:30 +0100 Subject: [PATCH 05/11] save --- .../long_context/README.md | 69 +++++++++++-------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index a182912af5..12325cfac0 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -22,6 +22,10 @@ Compression reduces this memory usage, enabling longer prompts or more parallel Let's demonstrate all the optimizations combined and test it with the real life scenario of sending multiple various questions in the same context. It will illustrate the gain from the prefix caching on the first token latency, improved second token latency thanks to prompt lookup and moderate memory consumption despite very long prompts and parallel execution. +::::{tab-set} +:::{tab-item} CPU and GPU +:sync:CPU + Export the model openai/gpt-oss-20b which has the max context length of 131k tokens. 
```bash @@ -31,6 +35,13 @@ mkdir models python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config.json --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/4/extras/chat_template_examples/chat_template_gpt_oss_multiturn.jinja ``` +::: +:::{tab-item} NPU +```bash +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_num_batch_tokens 16000 --tool_parser hermes3 +``` +::: +:::: Start OVMS: @@ -38,7 +49,7 @@ Start OVMS: :::{tab-item} CPU :sync: CPU ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --task text_generation --cache_dir /models/.cache --enable_prefix_caching true +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss --task text_generation --cache_dir /models/.cache --enable_prefix_caching true ``` ::: :::{tab-item} GPU @@ -49,6 +60,13 @@ docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/model --tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation --enable_prefix_caching true ``` ::: +:::{tab-item} NPU +:sync: NPU +```bash +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ 
+--rest_port 8000 --model_name OpenVINO/Qwen3-8B-int4-cw-ov --model_repository_path models +``` +::: :::: ## Dataset for experiments @@ -127,6 +145,12 @@ P99 ITL (ms): 160.02 ================================================== ``` ::: +:::{tab-item} NPU +:sync: NPU +``` +TODO +``` +::: :::: The results shown above, despite very long context, have much lower TTFT latency with prefix caching. As long as the beginning of the request prompt is reused, KV cache can be also reused to speed up prompt processing. @@ -137,21 +161,28 @@ The results shown above, despite very long context, have much lower TTFT latency :::{tab-item} CPU | Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | |------------------------|------------------|---------------------| -| 10,000 | 176.89 | 170.42 | -| 50,000 | 177.75 | 171.19 | -| 100,000 | 179.16 | 172.79 | -| 200,000 | 181.29 | 175.05 | +| 10,000 | TODO | 170.42 | +| 50,000 | TODO | 171.19 | +| 100,000 | TODO | 172.79 | +| 200,000 | TODO | 175.05 | ::: :::{tab-item} GPU :sync: GPU | Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | |------------------------|-------------------|-------------------------| -| 10,000 | 101.82 | 101.47 | -| 50,000 | 103.93 | 101.98 | -| 100,000 | 105.23 | 104.67 | -| 200,000 | 127.61 | 111.49 | +| 10,000 | TODO | 101.47 | +| 50,000 | TODO | 101.98 | +| 100,000 | TODO | 104.67 | +| 200,000 | TODO | 111.49 | +::: +:::{tab-item} NPU +:sync: NPU +| Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | +|------------------------|-------------------|-------------------------| +| 10,000 | TODO | TODO | +| 15,000 | TODO | TODO | ::: :::: @@ -172,26 +203,6 @@ Such experiment can confirm the impact on accuracy from the model quantization a ## Cache Precision Comparison -::::{tab-set} -:::{tab-item} CPU -| Cache Precision | Plugin Config | Accuracy (longbench_gov_report, concurrency 50) | Duration (s for 100 requests) | 
-|-----------------|--------------|-----------------------------------------------|-------------------------------| -| INT8 | "KV_CACHE_PRECISION":"u8" | | | -| BF16 | "KV_CACHE_PRECISION":"bf16" | | | -| FP32 | "KV_CACHE_PRECISION":"FP32","EXECUTION_MODE_HINT": "ACCURACY" | | | - -::: -:::{tab-item} GPU -:sync: GPU -| Cache Precision | Plugin Config | Accuracy (longbench_gov_report, concurrency 50) | Duration (s for 100 requests) | -|-----------------|--------------|-----------------------------------------------|-------------------------------| -| INT8 | "KV_CACHE_PRECISION":"u8" | | | -| BF16 | "KV_CACHE_PRECISION":"bf16" | | | -| FP32 | "KV_CACHE_PRECISION":"FP32","EXECUTION_MODE_HINT": "ACCURACY" | | | - -::: -:::: - The results in an experiment captured on Xeon Gen4 server show that KV cache compression has minimal impact on accuracy and significantly reduces memory consumption. Slower execution with FP32 precision is a result of disabled AMX acceleration. From 43da05f72ede9a46718846c118be6ac07d1d61a6 Mon Sep 17 00:00:00 2001 From: Pawel Date: Mon, 2 Mar 2026 07:36:47 +0100 Subject: [PATCH 06/11] benchmarks --- .../long_context/README.md | 71 +++++++++++++------ 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index 12325cfac0..3ebf4d8bbc 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -38,7 +38,7 @@ curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubuserc ::: :::{tab-item} NPU ```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_num_batch_tokens 16000 --tool_parser hermes3 +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw 
openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_num_batched_tokens 16000 --tool_parser hermes3 ``` ::: :::: @@ -63,7 +63,7 @@ docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/model :::{tab-item} NPU :sync: NPU ```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ +docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \ --rest_port 8000 --model_name OpenVINO/Qwen3-8B-int4-cw-ov --model_repository_path models ``` ::: @@ -76,7 +76,7 @@ To test the performance using vllm benchmarking script, let's create a custom da ```bash curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/3/demos/continuous_batching/long_context/custom_dataset.py -o custom_dataset.py pip install requests transformers -python custom_dataset.py --limit_context_tokens 50000 +python custom_dataset.py --limit_context_tokens 5000 ``` It will create a file called `dataset.jsonl` with 10 requests of shared context body limited to 50000 tokens. @@ -148,7 +148,27 @@ P99 ITL (ms): 160.02 :::{tab-item} NPU :sync: NPU ``` -TODO +============ Serving Benchmark Result ============ +Successful requests: 10 +Benchmark duration (s): 57.14 +Total input tokens: 50294 +Total generated tokens: 500 +Request throughput (req/s): 0.18 +Output token throughput (tok/s): 8.75 +Total Token throughput (tok/s): 888.98 +---------------Time to First Token---------------- +Mean TTFT (ms): 2292.98 +Median TTFT (ms): 2306.25 +P99 TTFT (ms): 2317.77 +-----Time per Output Token (excl. 
1st token)------ +Mean TPOT (ms): 69.77 +Median TPOT (ms): 69.92 +P99 TPOT (ms): 70.45 +---------------Inter-token Latency---------------- +Mean ITL (ms): 72.89 +Median ITL (ms): 71.35 +P99 ITL (ms): 214.61 +================================================== ``` ::: :::: @@ -159,30 +179,41 @@ The results shown above, despite very long context, have much lower TTFT latency ::::{tab-set} :::{tab-item} CPU -| Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | -|------------------------|------------------|---------------------| -| 10,000 | TODO | 170.42 | -| 50,000 | TODO | 171.19 | -| 100,000 | TODO | 172.79 | -| 200,000 | TODO | 175.05 | +Platform: Intel(R) Xeon(R) Platinum 8480+ +| Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | KV Cache Usage (GB) | +|------------------------|------------------|---------------------|-----------------------| +| 1,000 | 4 420 | 190.84 | 0.03 | +| 2,500 | 9 627 | 272.56 | 0.07 | +| 5,000 | 17 736 | 369.66 | 0.1 | +| 10,000 | 36 684 | 680.28 | 0.2 | +| 25,000 | 100 807 | 1570.07 | 0.6 | +| 50,000 | 287 788 | 5133.87 | 1.3 | ::: -:::{tab-item} GPU +:::{tab-item} iGPU :sync: GPU -| Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | -|------------------------|-------------------|-------------------------| -| 10,000 | TODO | 101.47 | -| 50,000 | TODO | 101.98 | -| 100,000 | TODO | 104.67 | -| 200,000 | TODO | 111.49 | +Platform: Intel(R) Core(TM) Ultra 5 338H +| Context Length (tokens) | TTFT No Caching (ms) | TTFT Prefix Caching (ms) | KV Cache Usage (GB) | +|------------------------|------------------|---------------------|-----------------------| +| 1,000 | 1 729 | 279.75 | 0.03 | +| 2,500 | 3 752 | 367.02 | 0.07 | +| 5,000 | 7 215 | 364.82 | 0.1 | +| 10,000 | 17 380 | 599.86 | 0.2 | +| 25,000 | 59 201 | 991.01 | 0.6 | +| 50,000 | 160 138 | 2305.10 | 1.3 | ::: :::{tab-item} NPU :sync: NPU +Platform: Intel(R) Core(TM) Ultra 5 338H | Context Length (tokens) | 
TTFT No Caching (ms) | TTFT Prefix Caching (ms) | -|------------------------|-------------------|-------------------------| -| 10,000 | TODO | TODO | -| 15,000 | TODO | TODO | +|------------------------|------------------|---------------------| +| 500 | 1521.75 | 1489.22 | +| 1,000 | 3061.18 | 1729.39 | +| 2,000 | 3072.92 | 1806.56 | +| 4,000 | 6697.62 | 2421.26 | +| 8,000 | 16046.92 | 3232.11 | +| 16,000 | 53378.22 | 6585.93 | ::: :::: From 31800694bc7e0208b6a63d8d53c873ea3f446d18 Mon Sep 17 00:00:00 2001 From: Pawel Date: Mon, 2 Mar 2026 11:03:20 +0100 Subject: [PATCH 07/11] ov repo models --- .../long_context/README.md | 44 ++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index 3ebf4d8bbc..fc725ec525 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -22,24 +22,29 @@ Compression reduces this memory usage, enabling longer prompts or more parallel Let's demonstrate all the optimizations combined and test it with the real life scenario of sending multiple various questions in the same context. It will illustrate the gain from the prefix caching on the first token latency, improved second token latency thanks to prompt lookup and moderate memory consumption despite very long prompts and parallel execution. +Prepare models directory: +```bash +mkdir models +``` + ::::{tab-set} -:::{tab-item} CPU and GPU +:::{tab-item} CPU :sync:CPU - -Export the model openai/gpt-oss-20b which has the max context length of 131k tokens. 
- ```bash -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py -pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt -mkdir models -python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config.json --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss -curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/4/extras/chat_template_examples/chat_template_gpt_oss_multiturn.jinja +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true +``` +::: +::: {tab-item} GPU +:sync: GPU +```bash +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --pull --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true --target_device GPU ``` ::: :::{tab-item} NPU ```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_num_batched_tokens 16000 --tool_parser hermes3 +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 16000 
--tool_parser hermes3 --plugin_config "{\"NPUW_LLM_PREFILL_ATTENTION_HINT\": \"PYRAMID\"}"
 ```
+**Note:** It's recommended to set the `--max_prompt_len` value as low as possible. This will improve performance, but limits the number of tokens the model will accept.
 :::
 ::::
 
 Start OVMS:
 
@@ -49,22 +54,21 @@ Start OVMS:
 :::{tab-item} CPU
 :sync: CPU
 ```bash
-docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:weekly --rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss --task text_generation --cache_dir /models/.cache --enable_prefix_caching true
+docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:latest --rest_port 8000 --model_name OpenVINO/gpt-oss-20b-int4-ov --model_path /models/OpenVINO/gpt-oss-20b-int4-ov
 ```
 :::
 :::{tab-item} GPU
 :sync: GPU
 ```bash
 docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \
---rest_port 8000 --source_model openai/gpt-oss-20b --model_repository_path models \
---tool_parser gptoss --reasoning_parser gptoss --target_device GPU --task text_generation --enable_prefix_caching true
+--rest_port 8000 --model_name OpenVINO/gpt-oss-20b-int4-ov --model_path /models/OpenVINO/gpt-oss-20b-int4-ov
 ```
 :::
 :::{tab-item} NPU
 :sync: NPU
 ```bash
-docker run -d --user $(id -u):$(id -g) --rm -p 
8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ +--rest_port 8000 --model_name OpenVINO/Qwen3-8B-int4-cw-ov --model_path /models/OpenVINO/Qwen3-8B-int4-cw-ov ``` ::: :::: @@ -88,7 +92,7 @@ Let's check the performance git clone --branch v0.9.1 --depth 1 https://github.com/vllm-project/vllm cd vllm pip3 install -r requirements/cpu.txt . --extra-index-url https://download.pytorch.org/whl/cpu -python benchmarks/benchmark_serving.py --host localhost --port 8000 --endpoint /v3/chat/completions --backend openai-chat --model openai/gpt-oss-20b --dataset-name custom --dataset-path ../dataset.jsonl --num-prompts 10 --max-concurrency 1 --custom-output-len 50 +python benchmarks/benchmark_serving.py --host localhost --port 8000 --endpoint /v3/chat/completions --backend openai-chat --model OpenVINO/gpt-oss-20b-int4-ov --dataset-name custom --dataset-path ../dataset.jsonl --num-prompts 10 --max-concurrency 1 --custom-output-len 50 ``` @@ -212,7 +216,7 @@ Platform: Intel(R) Core(TM) Ultra 5 338H | 1,000 | 3061.18 | 1729.39 | | 2,000 | 3072.92 | 1806.56 | | 4,000 | 6697.62 | 2421.26 | -| 8,000 | 16046.92 | 3232.11 | +| 8,000 | 16046.92 | 3232.11 | | 16,000 | 53378.22 | 6585.93 | ::: :::: @@ -227,7 +231,7 @@ The only difference is that the configured testing task should include a relevan For example: ``` -lm-eval --model local-chat-completions --tasks longbench_gov_report --model_args model=Qwen/Qwen2.5-7B-Instruct-1M,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=10,tokenized_requests=False,timeout=3000 --verbosity DEBUG --seed 1 --apply_chat_template +lm-eval --model local-chat-completions --tasks longbench_gov_report --model_args model=OpenVINO/gpt-oss-20b-int4-ov,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=10,tokenized_requests=False,timeout=3000 --verbosity DEBUG --seed 1 --apply_chat_template ``` Such experiment can confirm the impact on 
accuracy from the model quantization and KV cache compression. @@ -243,7 +247,7 @@ Enable prefix caching feature with `--enable_prefix_caching` parameter when you Use KV cache compression as INT8 which is the default setting. -Set the KV cache size via `--cache_size` parameter based on the available memory, expected concurrency and context length. It will improve the performance. +Set the KV cache size via `--cache_size` parameter based on the available memory, expected concurrency and context length or use default value (`0`) to make it dynamic. It will improve the performance. **Note** You can force reducing the concurrency on the server using a parameter `--rest_workers` which by default allows number of connections the same like number of CPU cores. Alternatively the limit can be set on the model level in `--max_num_seqs`. From 56c3c198a3805f2af696686186c4ee1a559f342f Mon Sep 17 00:00:00 2001 From: Pawel Date: Mon, 2 Mar 2026 11:23:41 +0100 Subject: [PATCH 08/11] model change --- demos/continuous_batching/long_context/custom_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/continuous_batching/long_context/custom_dataset.py b/demos/continuous_batching/long_context/custom_dataset.py index bf9ecc1b1e..a552443754 100644 --- a/demos/continuous_batching/long_context/custom_dataset.py +++ b/demos/continuous_batching/long_context/custom_dataset.py @@ -50,7 +50,7 @@ def download_file(url): parser = argparse.ArgumentParser(description="Generate a dataset of long context examples.") parser.add_argument("--file_url", type=str, default="https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/2011/donquix-2011.txt", help="URL of the file to download") -parser.add_argument("--model_name", type=str, default="openai/gpt-oss-20b", help="Model name for the tokenizer") +parser.add_argument("--model_name", type=str, default="OpenVINO/gpt-oss-20b-int4-ov", help="Model name for the tokenizer") 
parser.add_argument("--limit_context_tokens", type=int, default=50000, help="Maximum number of tokens to use for the context") args = parser.parse_args() From 6e9b2345ea4bc8735f2fafb2dbb7078179a3ad6f Mon Sep 17 00:00:00 2001 From: Pawel Date: Mon, 2 Mar 2026 11:52:27 +0100 Subject: [PATCH 09/11] tokenizer missmatch fix --- demos/continuous_batching/long_context/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index fc725ec525..37aeb48d7b 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -80,8 +80,21 @@ To test the performance using vllm benchmarking script, let's create a custom da ```bash curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/3/demos/continuous_batching/long_context/custom_dataset.py -o custom_dataset.py pip install requests transformers +``` + +::::{tab-set} +:::{tab-item} CPU and GPU +:sync:CPU +```bash python custom_dataset.py --limit_context_tokens 5000 ``` +::: +:::{tab-item} NPU +```bash +python custom_dataset.py --limit_context_tokens 5000 --model_name OpenVINO/Qwen3-8B-int4-cw-ov +``` +::: +:::: It will create a file called `dataset.jsonl` with 10 requests of shared context body limited to 50000 tokens. 
From b8b7216b3ac39f0c0247b8261069ac632dc2fb1b Mon Sep 17 00:00:00 2001 From: Pawel Date: Mon, 2 Mar 2026 13:17:42 +0100 Subject: [PATCH 10/11] pull and start commands --- .../long_context/README.md | 32 ++----------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index 37aeb48d7b..a7e44e414a 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -31,48 +31,22 @@ mkdir models :::{tab-item} CPU :sync:CPU ```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true +docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --rest_port 8000 --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true ``` ::: ::: {tab-item} GPU :sync: GPU ```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --pull --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true --target_device GPU +docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true --target_device GPU ``` ::: :::{tab-item} NPU ```bash -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --pull 
--model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 16000 --tool_parser hermes3 --plugin_config "{\"NPUW_LLM_PREFILL_ATTENTION_HINT\": \"PYRAMID\"}" +docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 16000 --tool_parser hermes3 --plugin_config "{\"NPUW_LLM_PREFILL_ATTENTION_HINT\": \"PYRAMID\"}" ``` **Note:** It's recommended to set `--max_prompt_len` value to as low as possible. This will improve performance, but limit the number of tokens the model will accept. ::: :::: - -Start OVMS: - -::::{tab-set} -:::{tab-item} CPU -:sync: CPU -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:latest --rest_port 8000 --model_name OpenVINO/gpt-oss-20b-int4-ov --model_path /models/OpenVINO/gpt-oss-20b-int4-ov -``` -::: -:::{tab-item} GPU -:sync: GPU -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ ---rest_port 8000 --model_name OpenVINO/gpt-oss-20b-int4-ov --model_path /models/OpenVINO/gpt-oss-20b-int4-ov -``` -::: -:::{tab-item} NPU -:sync: NPU -```bash -docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:latest-gpu \ ---rest_port 8000 --model_name OpenVINO/Qwen3-8B-int4-cw-ov --model_path /models/OpenVINO/Qwen3-8B-int4-cw-ov -``` -::: -:::: - ## Dataset for experiments To test the performance using vllm benchmarking script, let's create a custom dataset with long shared context and a set of questions 
in each request. That way we can create a dataset with identical very long context with different queries related to the context. That is a common scenario for RAG applications which generates response based on a complete knowledge base. To make this experiment similar to real live, the context is not synthetic but build with the content of Don Quixote story with 10 different questions related to the story. Because the context is reused, it is a perfect case for benefitting from prefix caching. From 47fb5f177df5a0e7761f43526de3d4d6c38e5abf Mon Sep 17 00:00:00 2001 From: Pawel Date: Tue, 3 Mar 2026 13:47:14 +0100 Subject: [PATCH 11/11] improvement --- .../long_context/README.md | 137 ++---------------- .../long_context/custom_dataset.py | 99 ------------- 2 files changed, 10 insertions(+), 226 deletions(-) delete mode 100644 demos/continuous_batching/long_context/custom_dataset.py diff --git a/demos/continuous_batching/long_context/README.md b/demos/continuous_batching/long_context/README.md index a7e44e414a..7c481e4c2f 100644 --- a/demos/continuous_batching/long_context/README.md +++ b/demos/continuous_batching/long_context/README.md @@ -31,140 +31,30 @@ mkdir models :::{tab-item} CPU :sync:CPU ```bash -docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --rest_port 8000 --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true +docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw -p 8000:8000 openvino/model_server:latest --rest_port 8000 --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true ``` ::: ::: {tab-item} GPU :sync: GPU ```bash -docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 
--model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true --target_device GPU +docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw -p 8000:8000 openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/gpt-oss-20b-int4-ov --tool_parser gptoss --reasoning_parser gptoss --task text_generation --enable_prefix_caching true --target_device GPU ``` ::: :::{tab-item} NPU ```bash -docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 16000 --tool_parser hermes3 --plugin_config "{\"NPUW_LLM_PREFILL_ATTENTION_HINT\": \"PYRAMID\"}" +docker run --user $(id -u):$(id -g) -d --rm -v $(pwd)/models:/models:rw -p 8000:8000 openvino/model_server:latest-gpu --rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-cw-ov --target_device NPU --task text_generation --enable_prefix_caching true --max_prompt_len 16000 --tool_parser hermes3 --plugin_config "{\"NPUW_LLM_PREFILL_ATTENTION_HINT\": \"PYRAMID\"}" ``` **Note:** It's recommended to set `--max_prompt_len` value to as low as possible. This will improve performance, but limit the number of tokens the model will accept. ::: :::: -## Dataset for experiments - -To test the performance using vllm benchmarking script, let's create a custom dataset with long shared context and a set of questions in each request. That way we can create a dataset with identical very long context with different queries related to the context. That is a common scenario for RAG applications which generates response based on a complete knowledge base. 
To make this experiment similar to real live, the context is not synthetic but build with the content of Don Quixote story with 10 different questions related to the story. Because the context is reused, it is a perfect case for benefitting from prefix caching. - -```bash -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/3/demos/continuous_batching/long_context/custom_dataset.py -o custom_dataset.py -pip install requests transformers -``` - -::::{tab-set} -:::{tab-item} CPU and GPU -:sync:CPU -```bash -python custom_dataset.py --limit_context_tokens 5000 -``` -::: -:::{tab-item} NPU -```bash -python custom_dataset.py --limit_context_tokens 5000 --model_name OpenVINO/Qwen3-8B-int4-cw-ov -``` -::: -:::: - -It will create a file called `dataset.jsonl` with 10 requests of shared context body limited to 50000 tokens. ## Testing performance -Let's check the performance +Using the `vllm` benchmark it's possible to check the performance of the model with the desired context length. It is also possible to set prefix parameters to check the performance benefit from prefix caching. ```bash -git clone --branch v0.9.1 --depth 1 https://github.com/vllm-project/vllm -cd vllm -pip3 install -r requirements/cpu.txt . 
--extra-index-url https://download.pytorch.org/whl/cpu -python benchmarks/benchmark_serving.py --host localhost --port 8000 --endpoint /v3/chat/completions --backend openai-chat --model OpenVINO/gpt-oss-20b-int4-ov --dataset-name custom --dataset-path ../dataset.jsonl --num-prompts 10 --max-concurrency 1 --custom-output-len 50 -``` - - -::::{tab-set} -:::{tab-item} CPU -:sync: CPU -``` -============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 29.54 -Total input tokens: 49774 -Total generated tokens: 500 -Request throughput (req/s): 0.34 -Output token throughput (tok/s): 16.92 -Total Token throughput (tok/s): 1701.74 ----------------Time to First Token---------------- -Mean TTFT (ms): 173.65 -Median TTFT (ms): 171.61 -P99 TTFT (ms): 190.84 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 56.74 -Median TPOT (ms): 56.82 -P99 TPOT (ms): 56.87 ----------------Inter-token Latency---------------- -Mean ITL (ms): 75.14 -Median ITL (ms): 56.81 -P99 ITL (ms): 171.99 -================================================== -``` -::: -:::{tab-item} GPU -:sync: GPU -``` -============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 26.56 -Total input tokens: 49774 -Total generated tokens: 500 -Request throughput (req/s): 0.38 -Output token throughput (tok/s): 18.82 -Total Token throughput (tok/s): 1892.75 ----------------Time to First Token---------------- -Mean TTFT (ms): 101.70 -Median TTFT (ms): 101.62 -P99 TTFT (ms): 102.47 ------Time per Output Token (excl. 
1st token)------ -Mean TPOT (ms): 52.12 -Median TPOT (ms): 51.96 -P99 TPOT (ms): 53.55 ----------------Inter-token Latency---------------- -Mean ITL (ms): 69.02 -Median ITL (ms): 52.13 -P99 ITL (ms): 160.02 -================================================== +pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu +vllm bench serve --backend openai --base-url http://localhost:8000/ --endpoint v3/completions --model OpenVINO/gpt-oss-20b-int4-ov --tokenizer openai/gpt-oss-20b --prefix-repetition-prefix-len 50000 --prefix-repetition-suffix-len 10 --prefix-repetition-output-len 20 --prefix-repetition-num-prefixes 1 --num-prompts 2 --max_concurrency 1 --dataset-name prefix_repetition --num-warmups 1 ``` -::: -:::{tab-item} NPU -:sync: NPU -``` -============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 57.14 -Total input tokens: 50294 -Total generated tokens: 500 -Request throughput (req/s): 0.18 -Output token throughput (tok/s): 8.75 -Total Token throughput (tok/s): 888.98 ----------------Time to First Token---------------- -Mean TTFT (ms): 2292.98 -Median TTFT (ms): 2306.25 -P99 TTFT (ms): 2317.77 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 69.77 -Median TPOT (ms): 69.92 -P99 TPOT (ms): 70.45 ----------------Inter-token Latency---------------- -Mean ITL (ms): 72.89 -Median ITL (ms): 71.35 -P99 ITL (ms): 214.61 -================================================== -``` -::: -:::: - -The results shown above, despite very long context, have much lower TTFT latency with prefix caching. As long as the beginning of the request prompt is reused, KV cache can be also reused to speed up prompt processing. ## Performance Comparison Table @@ -208,7 +98,7 @@ Platform: Intel(R) Core(TM) Ultra 5 338H ::: :::: -The results show that the cache usage grows linearly with the context length. +The results show that the cache usage grows exponentially with the context length. 
Prefix caching is very effective in reducing the first token generation making the long context calls practical even on slower HW. ## Testing accuracy @@ -216,17 +106,10 @@ Prefix caching is very effective in reducing the first token generation making t Testing accuracy for use cases with long context can be done via [lm-eval_harness](../accuracy/README.md). The only difference is that the configured testing task should include a relevant dataset. -For example: -``` -lm-eval --model local-chat-completions --tasks longbench_gov_report --model_args model=OpenVINO/gpt-oss-20b-int4-ov,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=10,tokenized_requests=False,timeout=3000 --verbosity DEBUG --seed 1 --apply_chat_template -``` - -Such experiment can confirm the impact on accuracy from the model quantization and KV cache compression. - -## Cache Precision Comparison +## Cache Precision -The results in an experiment captured on Xeon Gen4 server show that KV cache compression has minimal impact on accuracy and significantly reduces memory consumption. -Slower execution with FP32 precision is a result of disabled AMX acceleration. +KV cache compression has minimal impact on accuracy and significantly reduces memory consumption and benchmark time. +It's recommended to use default KV cache precision which is INT8. ## Recommendations diff --git a/demos/continuous_batching/long_context/custom_dataset.py b/demos/continuous_batching/long_context/custom_dataset.py deleted file mode 100644 index a552443754..0000000000 --- a/demos/continuous_batching/long_context/custom_dataset.py +++ /dev/null @@ -1,99 +0,0 @@ -# -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -# This script generates a dataset of long context examples for performance evaluation -import os -import json -import requests -from transformers import AutoTokenizer -import argparse - -# function to download a file from a URL and convert it to text -def download_file(url): - output_path = os.path.basename(url) - headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"} - response = requests.get(url, headers=headers) - if response.status_code == 200: - with open(output_path, "wb") as file: - file.write(response.content) - print(f"File downloaded and saved as {output_path}") - else: - print(f"Failed to download file. Status code: {response.status_code}") - if url.endswith(".txt"): - with open(output_path, "r", encoding="utf-8") as file: - text = file.read() - print(f"Text file read successfully. Length of text: {len(text)} characters") - return text - elif url.endswith(".pdf"): - with open(output_path, "rb") as file: - pdf = PyPDF2.PdfReader(file) - text = "" - for page in pdf.pages: - text += page.extract_text() - return text - else: - raise ValueError("Unsupported file type. 
Only .txt and .pdf files are supported.") - -parser = argparse.ArgumentParser(description="Generate a dataset of long context examples.") -parser.add_argument("--file_url", type=str, default="https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/2011/donquix-2011.txt", help="URL of the file to download") -parser.add_argument("--model_name", type=str, default="OpenVINO/gpt-oss-20b-int4-ov", help="Model name for the tokenizer") -parser.add_argument("--limit_context_tokens", type=int, default=50000, help="Maximum number of tokens to use for the context") -args = parser.parse_args() - -file_url = args.file_url -model_name = args.model_name -limit_context_tokens = args.limit_context_tokens - -text = download_file(file_url) - -# Initialize the tokenizer -tokenizer = AutoTokenizer.from_pretrained(model_name) - -# Tokenize the text -tokens = tokenizer(text)['input_ids'] -print(f"Number of tokens: {len(tokens)}") - -if limit_context_tokens is not None: - if len(tokens) > limit_context_tokens: - tokens = tokens[:limit_context_tokens] - print(f"Tokens truncated to {limit_context_tokens} tokens") - text = tokenizer.decode(tokens) - -list_of_questions = [ - "Summarize the text in few sentences.", - "What are the main points discussed in the text?", - "What is the main theme of the text?", - "What are the key arguments presented in the text?", - "Who is the main character in the text?", - "Describe shortly the main character.", - "What was the most funny part of the text?", - "What was the most sad part of the text?", - "What was the most interesting part of the text?", - "Summarize shortly the first paragraph of the text.", -] -dataset = "" -for question in list_of_questions: - prompt = f"For the given CONTEXT answer the QUESTION. 
\n CONTEXT: {text}\n QUESTION {question}\n" - item = {"prompt": prompt } - dataset += json.dumps(item, ensure_ascii=False) + "\n" - - -# Save the dataset to a JSON file -output_file = "dataset.jsonl" -with open(output_file, "w", encoding="utf-8") as file: - file.write(dataset) -print(f"Dataset saved to {output_file}")