Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions Dockerfile.redhat
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,37 @@ WORKDIR /ovms/third_party/opencv
RUN if [ "$VERBOSE_LOGS" == "ON" ] ; then export VERBOSE=1 ; fi && ./install_opencv.sh
####### End of OpenCV

# Build espeak-ng from sources (static library + phoneme data used by TTS).
# Done in a dedicated stage so the autotools toolchain never reaches later stages.
FROM base_build as espeak_build

ARG ESPEAK_NG_VERSION=1.51.1
WORKDIR /tmp/espeak_build

# Autotools toolchain is only needed inside this throwaway stage.
# hadolint ignore=DL3041
RUN dnf install -y libtool automake autoconf pkgconfig && \
    dnf clean all

# Shallow-clone the pinned release tag (WORKDIR is already /tmp/espeak_build).
RUN git clone --depth 1 --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \
    ls -lah /tmp/espeak_build/

# Bootstrap autotools (AUTHORS/NEWS exist in release tarballs but not in a git
# checkout, so create empty ones for automake), then build a static, audio-less
# espeak-ng and install it under /usr/local. Sources are removed in the SAME
# layer — a later "RUN rm -rf" would not shrink the already-committed layer.
# hadolint ignore=DL3003
RUN cd /tmp/espeak_build/espeak-ng-src && \
    touch AUTHORS NEWS && \
    libtoolize --force --copy && \
    aclocal && \
    autoheader && \
    autoconf && \
    automake --add-missing --copy && \
    ./configure --prefix=/usr/local \
    --disable-shared \
    --enable-static \
    --disable-mbrola \
    --disable-klatt \
    --without-audio && \
    make -j$(nproc) && \
    make install && \
    cd / && rm -rf /tmp/espeak_build

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
FROM base_build as build
ARG BASE_IMAGE
Expand Down Expand Up @@ -404,6 +435,9 @@ LABEL base-image=${RELEASE_BASE_IMAGE}
ENV PYTHONPATH=/ovms/lib/python:/ovms/python_deps

WORKDIR /
# espeak-ng was built and installed in the dedicated `espeak_build` stage
# (FROM base_build as espeak_build), so the data must be copied from there —
# base_build itself never receives /usr/local/share/espeak-ng-data.
COPY --from=espeak_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data
ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data

SHELL ["/bin/bash", "-o", "pipefail", "-c"]
COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh
# hadolint ignore=DL3003,DL3041,SC2164,SC1091
Expand Down
39 changes: 38 additions & 1 deletion Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,37 @@ WORKDIR /ovms/third_party/opencv
RUN ./install_opencv.sh
####### End of OpenCV

# Build espeak-ng from sources (static library + phoneme data used by TTS).
# NOTE: built directly inside base_build here (no separate stage), so the
# release stage's "COPY --from=base_build .../espeak-ng-data" works as-is.

ARG ESPEAK_NG_VERSION=1.51.1
WORKDIR /tmp/espeak_build

# Autotools toolchain needed only for this build; apt lists removed in-layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libtool automake autoconf pkg-config && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Shallow-clone the pinned release tag (WORKDIR is already /tmp/espeak_build).
RUN git clone --depth 1 --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \
    ls -lah /tmp/espeak_build/

# Bootstrap autotools (AUTHORS/NEWS exist in release tarballs but not in a git
# checkout, so create empty ones for automake), then build a static, audio-less
# espeak-ng and install it under /usr/local. Sources are removed in the SAME
# layer — a later "RUN rm -rf" would not shrink the already-committed layer.
# hadolint ignore=DL3003
RUN cd /tmp/espeak_build/espeak-ng-src && \
    touch AUTHORS NEWS && \
    libtoolize --force --copy && \
    aclocal && \
    autoheader && \
    autoconf && \
    automake --add-missing --copy && \
    ./configure --prefix=/usr/local \
    --disable-shared \
    --enable-static \
    --disable-mbrola \
    --disable-klatt \
    --without-audio && \
    make -j$(nproc) && \
    make install && \
    cd / && rm -rf /tmp/espeak_build

################### BASE BUILD ##########################
FROM base_build as build
ARG BASE_IMAGE
Expand All @@ -99,6 +130,7 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \
apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \
apt-get clean && rm -rf /var/lib/apt/lists/* ; fi
ENV PIP_BREAK_SYSTEM_PACKAGES=1

RUN apt-get update && apt-get install --no-install-recommends -y \
libgflags-dev \
bc \
Expand Down Expand Up @@ -262,7 +294,6 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/openvino/runtime/lib/intel64/:
# FROM BASE BUILD
COPY --from=base_build /opt/opencv /opt/opencv/
COPY third_party /ovms/third_party/

# Mediapipe
COPY BUILD.bazel /ovms/
COPY *\.bzl /ovms/
Expand Down Expand Up @@ -392,6 +423,9 @@ RUN if [ -f /ovms_release/lib/libovms_shared.so ] ; then mv /ovms_release/lib/li
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

FROM $BASE_IMAGE as release
ARG http_proxy
ARG https_proxy
ARG no_proxy
ARG INSTALL_RPMS_FROM_URL=
ARG INSTALL_DRIVER_VERSION="24.26.30049"
ARG GPU=0
Expand All @@ -408,6 +442,9 @@ SHELL ["/bin/bash", "-c"]
WORKDIR /

COPY release_files/drivers /drivers
COPY --from=base_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data
ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data

SHELL ["/bin/bash", "-o", "pipefail", "-c"]
ARG INSTALL_DRIVER_VERSION="24.39.31294"
COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh
Expand Down
141 changes: 141 additions & 0 deletions demos/audio/export_kokoro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#
# Copyright (C) 2026 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

import torch
import json
import time
from pathlib import Path
from kokoro.model import KModel
from kokoro import KPipeline
import openvino as ov
import shutil

# Static sequence length the exported model is padded/reshaped to (tokens per
# request); must match the static-shape conversion in OVKModel.convert_to_static.
MAX_SEQ_LENGTH = 500


class KokoroTTSPipeline:
    """Thin wrapper around kokoro.KPipeline for single-utterance TTS.

    The repo and language were previously hard-coded; they are now defaulted
    parameters, so existing ``KokoroTTSPipeline()`` callers are unaffected.
    """

    def __init__(self, repo_id: str = "hexgrad/Kokoro-82M", lang_code: str = "a"):
        # Defaults reproduce the original hard-coded English Kokoro-82M setup.
        self.pipeline = KPipeline(lang_code=lang_code, repo_id=repo_id)

    def __call__(self, text: str, voice: str = "af_heart"):
        """Synthesize `text` and return the audio of the FIRST generated chunk only."""
        with torch.no_grad():
            generator = self.pipeline(text, voice=voice)
            result = next(generator)
        return result.audio


class OVKModel(KModel):
    """Kokoro acoustic model backed by a compiled OpenVINO network.

    Replaces KModel's torch inference with an OpenVINO CompiledModel loaded
    from ``model_dir/openvino_model.xml``; vocab and context length are read
    from the exported config.json.
    """

    def __init__(self, model_dir: Path, device: str, plugin_config: dict = None,
                 repo_id: str = "hexgrad/Kokoro-82M-v1.1-zh"):
        # Deliberately skip KModel.__init__ (it would download/load torch weights).
        torch.nn.Module.__init__(self)

        core = ov.Core()

        # Fix: previously read the module-level global `model_id`, which raised
        # NameError whenever this class was imported from another module.
        self.repo_id = repo_id
        with (model_dir / "config.json").open("r", encoding="utf-8") as f:
            config = json.load(f)
        self.vocab = config["vocab"]
        print("Starting to compile OpenVINO model on device:", device)

        start = time.time()
        # `plugin_config or {}` replaces the shared mutable default argument.
        self.model = core.compile_model(model_dir / "openvino_model.xml", device.upper(),
                                        config=plugin_config or {})
        print(f"Model compiled successfully in {time.time() - start:.2f}s.")
        self.context_length = config["plbert"]["max_position_embeddings"]

    @property
    def device(self):
        # Data is exchanged with the pipeline as CPU torch tensors regardless
        # of which OpenVINO device executes the network.
        return torch.device("cpu")

    def forward_with_tokens(self, input_ids: torch.LongTensor, ref_s: torch.FloatTensor, speed: float = 1) -> tuple[torch.FloatTensor, torch.LongTensor]:
        """Run OpenVINO inference; pads input to MAX_SEQ_LENGTH, trims output.

        Returns a tuple ``(audio, pred_dur)`` of torch tensors.
        """
        text_len = input_ids.shape[-1]

        if text_len < MAX_SEQ_LENGTH:
            # 0 in this model context is acting as BOS/EOS/PAD.
            # Since 0 causes artifacts, we might consider space (16) or period (4).
            padding_value = 16
            input_ids = torch.nn.functional.pad(input_ids, (0, MAX_SEQ_LENGTH - text_len), value=padding_value)
        # NOTE(review): text_len > MAX_SEQ_LENGTH is not handled; a static-shape
        # model would reject such input — confirm upstream chunking prevents it.

        start = time.time()
        print("Running inference on OpenVINO model...")
        outputs = self.model([input_ids, ref_s, torch.tensor(speed)])
        print(f"Inference completed in {time.time() - start:.2f}s.")

        audio = torch.from_numpy(outputs[0])
        pred_dur = torch.from_numpy(outputs[1])

        if text_len < MAX_SEQ_LENGTH:
            pred_dur = pred_dur[:text_len]
            # Approximate audio trimming based on duration ratio
            total_dur = outputs[1].sum()
            valid_dur = pred_dur.sum()
            if total_dur > 0:
                # assumes audio is 1-D (samples,) — TODO confirm exporter output rank
                audio_keep = int(audio.shape[-1] * (valid_dur / total_dur))
                audio = audio[:audio_keep]

        return audio, pred_dur

    @staticmethod
    def download_and_convert(model_dir: Path, repo_id: str, ttsPipeline: KokoroTTSPipeline):
        """Trace the pipeline's torch model to OpenVINO IR (no-op if IR exists)."""
        from huggingface_hub import hf_hub_download
        import gc

        if not (model_dir / "openvino_model.xml").exists():
            print(f"Converting Kokoro model to OpenVINO format at {model_dir}...")
            model = ttsPipeline.pipeline.model
            model.forward = model.forward_with_tokens
            # Example input: 48 random tokens wrapped in BOS/EOS (0), a 256-dim
            # style vector and a scalar speed, matching forward_with_tokens.
            input_ids = torch.randint(1, 100, (48,)).numpy()
            input_ids = torch.LongTensor([[0, *input_ids, 0]])
            style = torch.randn(1, 256)
            speed = torch.randint(1, 10, (1,), dtype=torch.float32)

            ov_model = ov.convert_model(model, example_input=(input_ids, style, speed), input=[
                ov.PartialShape("[1, 2..]"), ov.PartialShape([1, -1])])
            ov.save_model(ov_model, model_dir / "openvino_model.xml")
            # Fix: use the `repo_id` argument — previously this read the
            # module-level global `model_id`, silently ignoring the parameter.
            hf_hub_download(repo_id=repo_id, filename="config.json", local_dir=model_dir)
        else:
            print(f"OpenVINO model already exists at {model_dir}, skipping conversion.")

        gc.collect()

    @staticmethod
    def convert_to_static(input_model_dir: Path, output_model_dir: Path):
        """Reshape the dynamic IR to static shapes (required for NPU) and save it."""
        print(f"Converting OpenVINO model to static shapes at {input_model_dir}...")
        # Ensure the destination exists before ov.save_model / shutil.copy.
        output_model_dir.mkdir(parents=True, exist_ok=True)
        core = ov.Core()
        model = core.read_model(input_model_dir / "openvino_model.xml")
        static_shape = {"input_ids": [1, MAX_SEQ_LENGTH], "ref_s": [1, 256], "speed": [1], }
        model.reshape(static_shape)
        print("Reshaped model inputs:", model.inputs)
        ov.save_model(model, output_model_dir / "openvino_model.xml")
        print("Conversion to static shapes completed.")
        # Copy config file so the serving side can read vocab/context length.
        shutil.copy(input_model_dir / "config.json", output_model_dir / "config.json")


if __name__ == "__main__":

    # Hugging Face repo of the Chinese Kokoro variant to export.
    model_id = "hexgrad/Kokoro-82M-v1.1-zh"

    # Download model from Hugging Face and convert to OpenVINO format.
    # NOTE(review): KokoroTTSPipeline internally loads "hexgrad/Kokoro-82M",
    # not model_id above — confirm which repo the export is meant to trace.
    pipeline = KokoroTTSPipeline()

    # Convert and save the Kokoro model to OpenVINO format
    OVKModel.download_and_convert(Path("./kokoro_openvino_model_zh"), repo_id=model_id, ttsPipeline=pipeline)

    # To run inference on NPU, model must have static input shapes
    OVKModel.convert_to_static(Path("./kokoro_openvino_model_zh"), Path("./kokoro_static_openvino_model_zh"))
    # # Execution on NPU requires a plugin config like the one below:
    # config = {
    #     "NPU": {
    #         "NPU_USE_NPUW": "YES",
    #         "NPUW_DEVICES": "NPU,CPU",
    #         "NPUW_KOKORO": "YES",
    #     }
    # }

    # # NPUW_CACHE_DIR can be used to avoid compilation on every run
    # config["NPU"]["NPUW_CACHE_DIR"] = "./npu_cache_kokoro"
125 changes: 125 additions & 0 deletions demos/audio/tts_test_strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""Send a battery of tricky TTS test strings to an OpenAI-compatible
speech endpoint, one by one, and save each result as a numbered WAV file.

Usage:
    python tts_test_strings.py --endpoint http://localhost:8000/v3 \
        --model kokoro \
        [--voice VOICE] \
        [--output-dir tts_output]

If --voice is omitted, None is sent and the server's default voice is used.
"""

import argparse
import os
import sys
import time

from openai import OpenAI

# Battery of tricky TTS inputs: abbreviations, dates/times/zones, ordinals,
# currencies, fractions, Roman numerals, dosage units, numbers in scientific
# notation, URLs/emails/paths, initialisms, heteronyms, tongue twisters,
# diacritics, emoji/ZWJ sequences, confusable scripts, RTL text, IPA/prosody
# marks, nested quotes, math/code, chemical nomenclature. Several strings use
# explicit \uXXXX escapes so the exact codepoints are unambiguous in review.
TEST_STRINGS = [
    'Dr. A. B. Carter Jr. met Sen. O\'Neill at 5 p.m., Wed., in Washington, D.C.',
    'Mr. Smith, Ph.D., arrived on Fri. at 6:30 a.m.; Mrs. Jones left at noon.',
    'We meet on 01/02/2025 at 05:30 IST; is that India or Israel time?',
    'The deadline is 2025\u201102\u201101 23:59 UTC\u221205:00 (EST).',
    'He finished 1st; she was 22nd\u2014barely.',
    'Prices: $1,234.56 vs \u20ac1.234,56; also \u00a512 345 (thin space).',
    'Add \u00be cup, then \u00bd tsp; total \u2248 1\u00bc cups.',
    'Chapter XLIV starts on page ix; version v2.0.0 follows v1.12.9.',
    'Dose: 5 mg vs 5 \u03bcg\u2014don\'t confuse micrograms with milligrams.',
    'Avogadro\'s number is 6.022e23; \u03c0 \u2248 3.14159; \u221a2 \u2248 1.4142.',
    'Temperature dropped to \u221210 \u00b0C (14 \u00b0F) with 90% RH.',
    'Visit https://example.com/a/b?x=1&y=2#frag or email ops+alerts@example.org.',
    'Open C:\\Program Files\\Project\\config.yaml or /usr/local/bin/run.sh.',
    '.NET, Node.js, C#, C++17, and Rust\'s crate\u2011names\u2011with\u2011hyphens.',
    '"WYSIWYG," "GIF" (hard or soft g?), "SQL" (sequel or S\u2011Q\u2011L?).',
    'I will present the present to the lead singer who stepped on the lead.',
    'They desert the desert; the dove dove; he wound the wound.',
    'Please record the record before the minute is up in a minute.',
    'She sells seashells by the seashore; truly Irish wristwatch.',
    'Unique New York, toy boat, red leather yellow leather.',
    'A na\u00efve co\u00f6perative fa\u00e7ade in S\u00e3o Paulo; \u0141\u00f3d\u017a and Krak\u00f3w in Poland.',
    'Pi\u00f1ata, jalape\u00f1o, cr\u00e8me br\u00fbl\u00e9e, bouillabaisse, d\u00e9j\u00e0 vu.',
    '\U0001f44d\U0001f3fb is a thumbs\u2011up with light skin tone; \U0001f9d1\u200d\U0001f4bb writes code; \U0001f468\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466 is a family; \U0001f1f5\U0001f1f1 is a flag.',
    'Faces: \U0001f642\U0001f609\U0001f610\U0001f611\U0001f636; hearts: \u2764\ufe0f\U0001f9e1\U0001f49b\U0001f49a\U0001f499; mixed: \U0001f937\u200d\u2642\ufe0f\U0001f926\u200d\u2640\ufe0f.',
    'Latin "A" vs Cyrillic "\u0410"; Greek "\u03c1" vs Latin "p"; micro "\u00b5" vs Greek "\u03bc".',
    '\u05e9\u05dc\u05d5\u05dd and \u0645\u0631\u062d\u0628\u064b\u0627 appear with left\u2011to\u2011right text in one line.',
    'Prosody markers: \u02c8primary, \u02ccsecondary, and length \u02d0 are tricky for tokenizers.',
    'Arrows for intonation: \u2197 rising, \u2198 falling, \u2193 drop.',
    'He said, "She replied, \'no\u2014never\u2026\'," then left\u2014silently.',
    'Parentheticals (like this\u2014really!) and em\u2011dashes\u2014here\u2014confuse prosody.',
    'Let f(x)=x^2; then d/dx x^2=2x; \u2202/\u2202x is the operator.',
    'Inline code x += 1; and TeX E=mc^2 should be read clearly.',
    'N,N\u2011Diethyl\u2011meta\u2011toluamide (DEET) differs from p\u2011xylene and m\u2011cresol.',
    'The RFC 7231/HTTP\u2011semantics "GET" vs "HEAD" distinction matters.',
    'Read "macOS" vs "Mac OS", "iOS", "SQL", "URL", and "S3" correctly.',
]


def main():
    """Parse CLI args, synthesize every test string, and save numbered WAVs.

    Each string is sent individually; failures are reported but do not stop
    the run. Exits with status 1 if any request failed (CI-friendly).
    """
    parser = argparse.ArgumentParser(
        description="Send TTS test strings to an OpenAI-compatible speech endpoint."
    )
    parser.add_argument(
        "--endpoint", required=True,
        help="Base URL of the API (e.g. http://localhost:8000/v3)"
    )
    parser.add_argument(
        "--model", required=True,
        help="Model name to use for speech generation"
    )
    parser.add_argument(
        # Fix: help text previously claimed "default: voice1" while the actual
        # default is None (server-side default voice).
        "--voice", default=None,
        help="Voice name (default: None, i.e. use the server's default voice)"
    )
    parser.add_argument(
        "--output-dir", default="tts_output",
        help="Directory to save output WAV files (default: tts_output)"
    )
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # The endpoint performs no auth, but the client requires some api_key value.
    client = OpenAI(base_url=args.endpoint, api_key="unused")

    total = len(TEST_STRINGS)
    print(f"Sending {total} test strings to {args.endpoint} (model={args.model}, voice={args.voice})")
    print(f"Output directory: {args.output_dir}\n")

    succeeded = 0
    failed = 0
    total_size_kb = 0.0
    t_start = time.time()

    for idx, text in enumerate(TEST_STRINGS, start=1):
        preview = text[:80] + ("..." if len(text) > 80 else "")
        print(f"[{idx:2d}/{total}] {preview}")

        out_path = os.path.join(args.output_dir, f"{idx:02d}.wav")
        t0 = time.time()
        try:
            response = client.audio.speech.create(
                model=args.model,
                voice=args.voice,
                input=text,
            )
            response.write_to_file(out_path)
            elapsed = time.time() - t0
            size_kb = os.path.getsize(out_path) / 1024
            total_size_kb += size_kb
            succeeded += 1
            print(f"  -> {out_path} ({size_kb:.1f} KB, {elapsed:.2f}s)")
        except Exception as exc:
            # Best-effort: log and continue with the remaining strings.
            elapsed = time.time() - t0
            failed += 1
            print(f"  !! FAILED after {elapsed:.2f}s: {exc}", file=sys.stderr)

    total_elapsed = time.time() - t_start
    print(f"\n{'='*60}")
    print(f"Summary: {succeeded} succeeded, {failed} failed out of {total}")
    # max(total, 1) guards the average if TEST_STRINGS is ever emptied.
    print(f"Total time: {total_elapsed:.2f}s (avg {total_elapsed/max(total, 1):.2f}s per string)")
    print(f"Total audio size: {total_size_kb:.1f} KB")
    print(f"{'='*60}")

    # Fix: previously the script always exited 0, even if every request failed.
    if failed:
        sys.exit(1)


# Entry point when executed as a script (no side effects on import).
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ ovms_cc_library(
"//src/image_gen:image_gen_calculator",
"//src/audio/speech_to_text:s2t_calculator",
"//src/audio/text_to_speech:t2s_calculator",
"//src/audio/kokoro:kokoro_calculator",
"//src/audio:audio_utils",
"//src/image_gen:imagegen_init",
"//src/llm:openai_completions_api_handler",
Expand Down
Loading