From 7047b8fb2f9ce858a6bfdc8746a5b7505d6476e8 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 17 Dec 2025 17:24:27 +0100 Subject: [PATCH 01/11] Initiali suppor of Kokoro model --- Dockerfile.redhat | 5 +- Dockerfile.ubuntu | 4 +- src/BUILD | 1 + src/audio/audio_utils.cpp | 31 ++ src/audio/audio_utils.hpp | 2 + src/audio/kokoro/BUILD | 60 ++++ src/audio/kokoro/kokoro_calculator.cc | 320 ++++++++++++++++++ src/audio/kokoro/kokoro_calculator.proto | 33 ++ src/audio/kokoro/kokoro_servable.hpp | 197 +++++++++++ src/logging.cpp | 4 + src/logging.hpp | 1 + .../mediapipegraphdefinition.cpp | 23 ++ .../mediapipegraphdefinition.hpp | 8 +- .../mediapipegraphexecutor.cpp | 4 +- .../mediapipegraphexecutor.hpp | 3 + 15 files changed, 692 insertions(+), 4 deletions(-) create mode 100644 src/audio/kokoro/BUILD create mode 100644 src/audio/kokoro/kokoro_calculator.cc create mode 100644 src/audio/kokoro/kokoro_calculator.proto create mode 100644 src/audio/kokoro/kokoro_servable.hpp diff --git a/Dockerfile.redhat b/Dockerfile.redhat index bc574eaaf2..41e02ecc12 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -127,7 +127,9 @@ RUN dnf install -y -d6 \ python3.12 \ python3.12-devel \ python3.12-pip \ - libicu-devel && \ + libicu-devel \ + espeak-ng \ + espeak-ng-devel && \ dnf clean all WORKDIR / @@ -416,6 +418,7 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do if ! 
[[ $debug_bazel_flags == *"py_off"* ]]; then \ $DNF_TOOL install -y python3.12-libs --setopt=install_weak_deps=0 --nodocs; \ fi ; \ + $DNF_TOOL install -y espeak-ng --setopt=install_weak_deps=0 --nodocs; \ $DNF_TOOL install -y shadow-utils; \ $DNF_TOOL clean all ; \ cp -v /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt ; \ diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index 15e47daf20..d80087c646 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -100,6 +100,8 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN apt-get update && apt-get install --no-install-recommends -y \ + espeak-ng \ + libespeak-ng-dev \ libgflags-dev \ bc \ ca-certificates \ @@ -413,7 +415,7 @@ ARG INSTALL_DRIVER_VERSION="24.39.31294" COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh # hadolint ignore=DL3003,SC2164 RUN apt-get update ; \ - apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \ + apt-get install -y --no-install-recommends curl ca-certificates libxml2 espeak-ng espeak-ng-data || exit 1; \ if [ "$GPU" == "1" ] ; then \ /tmp/install_gpu_drivers.sh ; \ fi ; \ diff --git a/src/BUILD b/src/BUILD index 71321ca7ee..0318099727 100644 --- a/src/BUILD +++ b/src/BUILD @@ -563,6 +563,7 @@ ovms_cc_library( "//src/image_gen:image_gen_calculator", "//src/audio/speech_to_text:s2t_calculator", "//src/audio/text_to_speech:t2s_calculator", + "//src/audio/kokoro:kokoro_calculator", "//src/audio:audio_utils", "//src/image_gen:imagegen_init", "//src/llm:openai_completions_api_handler", diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 77b38e70df..01daafb351 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -188,3 +188,34 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; 
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); } + + +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr) { + enum : unsigned int { + OUTPUT_PREPARATION, + TIMER_END + }; + Timer timer; + timer.start(OUTPUT_PREPARATION); + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + format.channels = 1; + format.sampleRate = 24000; // assume it is always 24 KHz + format.bitsPerSample = bitsPerSample; + drwav wav; + size_t totalSamples = speechSize * format.channels; + + auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr); + if (status == DRWAV_FALSE) { + throw std::runtime_error("Failed to write all frames"); + } + drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, totalSamples, waveformPtr); + if (framesWritten != totalSamples) { + throw std::runtime_error("Failed to write all frames"); + } + drwav_uninit(&wav); + timer.stop(OUTPUT_PREPARATION); + auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); +} \ No newline at end of file diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp index cbeea8b457..874e83dca4 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -25,3 +25,5 @@ bool isWavBuffer(const std::string buf); std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); + diff --git a/src/audio/kokoro/BUILD b/src/audio/kokoro/BUILD new file mode 100644 index 
0000000000..d7d3b64b1a --- /dev/null +++ b/src/audio/kokoro/BUILD @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") +load("//:common_settings.bzl", "ovms_cc_library") + +ovms_cc_library( + name = "kokoro_servable", + hdrs = ["kokoro_servable.hpp"], + deps= ["//third_party:openvino", + "//src:libovms_ovinferrequestsqueue", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "kokoro_calculator", + srcs = ["kokoro_calculator.cc"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//src:httppayload", + "//src:libovmslogging", + "kokoro_calculator_cc_proto", + "//src/port:dr_audio", + "//src/port:rapidjson_stringbuffer", + "//src/port:rapidjson_writer", + ":kokoro_servable", + "//third_party:genai", + "//src/audio:audio_utils", + "//src:executingstreamidguard", + "//src:model_metric_reporter", + "//third_party/espeak_ng:espeak_ng", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "kokoro_calculator_proto", + srcs = ["kokoro_calculator.proto"], + visibility = ["//visibility:private"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_options_proto", + "@mediapipe//mediapipe/framework:calculator_proto", + ], +) diff --git a/src/audio/kokoro/kokoro_calculator.cc 
b/src/audio/kokoro/kokoro_calculator.cc new file mode 100644 index 0000000000..986dd92fab --- /dev/null +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -0,0 +1,320 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#include +#include +#include +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/canonical_errors.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "src/audio/audio_utils.hpp" +#include "src/http_payload.hpp" +#include "src/logging.hpp" +#include "src/port/dr_audio.hpp" + +#include "../../model_metric_reporter.hpp" +#include "../../executingstreamidguard.hpp" + +#pragma warning(push) +#pragma warning(disable : 6001 4324 6385 6386) +#include "absl/strings/escaping.h" +#include "absl/strings/str_cat.h" +#pragma warning(pop) + +#include + +#include "kokoro_servable.hpp" + +#ifdef _WIN32 +#include +#include +#endif + +using namespace ovms; + +namespace { + +#ifndef espeakPHONEMES_IPA +#define espeakPHONEMES_IPA 0x02 +#endif +#ifndef espeakPHONEMES_NO_STRESS +#define 
espeakPHONEMES_NO_STRESS 0x08 +#endif + +void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool noStress = true) { + outIpa.clear(); + auto& espeak = ovms::EspeakInstance::instance(); + if (!espeak.isReady()) { + SPDLOG_ERROR("eSpeak not initialized"); + return; + } + + std::lock_guard guard(espeak.mutex()); + + const int mode = espeakPHONEMES_IPA | (noStress ? espeakPHONEMES_NO_STRESS : 0); + const void* pos = static_cast(textUtf8.c_str()); + const char* endPtr = static_cast(pos) + textUtf8.size(); + std::string rawIpa; + + while (pos && static_cast(pos) < endPtr) { + const char* ipaChunk = espeak_TextToPhonemes(&pos, espeakCHARS_UTF8, mode); + if (ipaChunk && *ipaChunk) { + if (!rawIpa.empty()) { + rawIpa.push_back(' '); + } + rawIpa.append(ipaChunk); + } + } + + // Strip combining diacriticals (U+0300..U+036F) and collapse spaces + std::string cleaned; + for (size_t i = 0; i < rawIpa.size(); ++i) { + unsigned char c = static_cast(rawIpa[i]); + if (i + 1 < rawIpa.size()) { + unsigned char next = static_cast(rawIpa[i + 1]); + if ((c == 0xCC && next >= 0x80) || (c == 0xCD && next <= 0xAF)) { + i++; + continue; + } + } + cleaned.push_back(c); + } + + bool lastSpace = false; + for (char c : cleaned) { + if (std::isspace(static_cast(c))) { + if (!lastSpace) { + outIpa.push_back(' '); + lastSpace = true; + } + } else { + outIpa.push_back(c); + lastSpace = false; + } + } + + if (!outIpa.empty() && std::isspace(static_cast(outIpa.back()))) { + outIpa.pop_back(); + } + + SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, outIpa.size()); +} + +size_t utf8CharLen(unsigned char lead) { + if (lead < 0x80) return 1; + if ((lead >> 5) == 0x6) return 2; + if ((lead >> 4) == 0xE) return 3; + if ((lead >> 3) == 0x1E) return 4; + return 1; +} + +void tokenize(const std::string& textUtf8, + std::vector& tokenIds, + const ovms::VocabIndex& ix) { + tokenIds.clear(); + size_t pos = 0; + const size_t n = textUtf8.size(); + + while (pos < n) { + size_t maxTry 
= std::min(ix.max_token_bytes, n - pos); + int foundId = -1; + size_t foundLen = 0; + + for (size_t len = maxTry; len > 0; --len) { + auto it = ix.by_token.find(std::string(textUtf8.data() + pos, len)); + if (it != ix.by_token.end()) { + foundId = it->second; + foundLen = len; + break; + } + } + + if (foundId >= 0) { + tokenIds.push_back(foundId); + pos += foundLen; + } else { + const unsigned char lead = static_cast(textUtf8[pos]); + const size_t adv = utf8CharLen(lead); + SPDLOG_WARN("Tokenizer: unknown bytes at pos {}: '{}'", + pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); + pos += std::min(adv, n - pos); + } + } + SPDLOG_DEBUG("Tokenize: produced {} ids", tokenIds.size()); +} +} // namespace + +namespace mediapipe { + +const std::string KOKORO_SESSION_SIDE_PACKET_TAG = "KOKORO_NODE_RESOURCES"; + +class KokoroCalculator : public CalculatorBase { + static const std::string INPUT_TAG_NAME; + static const std::string OUTPUT_TAG_NAME; + +public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + cc->Inputs().Tag(INPUT_TAG_NAME).Set(); + cc->InputSidePackets().Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Set(); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); + return absl::OkStatus(); + } + + absl::Status Close(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Close", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Open", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName()); + + KokoroServableMap servablesMap = cc->InputSidePackets() + .Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Get(); + auto servableIt = 
servablesMap.find(cc->NodeName()); + RET_CHECK(servableIt != servablesMap.end()) + << "Could not find initialized Kokoro node named: " << cc->NodeName(); + auto servable = servableIt->second; + + const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + auto it = payload.parsedJson->FindMember("input"); + RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request"; + RET_CHECK(it->value.IsString()) << "'input' must be a string"; + const std::string text = it->value.GetString(); + + // Text -> IPA phonemization + std::string phonemes; + espeakPhonemizeAll(text, phonemes, /*noStress=*/true); + SPDLOG_DEBUG("Input text: '{}', IPA phonemes ({} chars): '{}'", text, phonemes.size(), phonemes); + + // IPA -> Kokoro token IDs + const auto& vocabIx = servable->getVocabIndex(); + std::vector> inputTokens(1); + tokenize(phonemes, inputTokens[0], vocabIx); + + // Prepend PAD token (id=0) - Kokoro model requires BOS/PAD at start + inputTokens[0].insert(inputTokens[0].begin(), 0); + + // Append EOS (period token = 4) if not already present + if (inputTokens[0].empty() || inputTokens[0].back() != 4) { + inputTokens[0].push_back(4); + } + + // Voice embedding + std::vector voice = { + -0.2296, 0.1835, -0.0069, -0.1240, -0.2505, 0.0112, -0.0759, -0.1650, + -0.2665, -0.1965, 0.0242, -0.1667, 0.3524, 0.2140, 0.3069, -0.3377, + -0.0878, -0.0477, 0.0813, -0.2135, -0.2340, -0.1971, 0.0200, 0.0145, + 0.0016, 0.2596, -0.2665, 0.1434, 0.0503, 0.0867, 0.1905, -0.1281, + 0.0658, -0.0639, -0.0920, 0.2444, -0.1506, -0.2197, 0.1385, 0.2133, + -0.0755, -0.0188, -0.0142, 0.2301, -0.0776, -0.0748, 0.0172, 0.0430, + -0.1009, 0.1519, 0.1137, 0.0641, 0.2264, 0.1911, -0.0205, 0.2578, + 0.2210, -0.0784, -0.0235, -0.0547, 0.2191, -0.1623, -0.2416, 0.0076, + 0.0574, 0.2186, 0.0080, 0.0473, 0.0972, 0.0286, 0.1324, 0.0686, + 0.2652, -0.2237, -0.0980, -0.1693, -0.1866, 0.2273, 0.2008, -0.0683, + 0.0957, 0.0623, -0.1891, 0.1620, 0.1811, -0.0516, -0.0800, -0.1416, + -0.2374, 
-0.1892, 0.1726, -0.0690, -0.0300, 0.0467, -0.2811, -0.1603, + 0.0342, -0.1054, -0.0604, -0.0475, -0.0908, -0.1286, 0.1105, -0.1186, + 0.0582, 0.1887, 0.0345, 0.2081, 0.1404, -0.2532, 0.0026, 0.0402, + 0.0812, -0.0512, 0.0128, 0.0084, -0.0970, -0.0362, 0.0036, -0.0720, + -0.0850, 0.0221, -0.1037, 0.0569, 0.0187, -0.0649, -0.0288, -0.1795, + 0.0045, 0.2535, 0.6751, 0.1578, -0.0966, 0.1516, 0.2109, 0.2033, + -0.2155, -0.1783, 0.0836, -0.1050, 0.0676, -0.0237, 0.0387, -0.2564, + 0.1891, 0.1305, -0.3239, -0.1312, 0.2723, 0.0745, 0.1335, 0.0302, + 0.0172, 0.2207, 0.0215, -0.0379, -0.1954, 0.4944, 0.2905, -0.0306, + 0.2858, 0.2341, 0.0545, 0.4626, 0.2947, 0.3802, 0.2820, 0.1557, + 0.1743, -0.1410, 0.0986, 0.4751, -0.2146, 0.3530, -0.2357, -0.5626, + -0.0617, 0.2190, 0.0992, -0.2365, 0.3726, 0.2092, 0.1660, 0.1928, + 0.5731, -0.1734, -0.0816, -0.3191, -0.1871, -0.2217, -0.0112, 0.1261, + 0.1601, 0.3835, 0.0451, -0.1927, -0.1116, 0.2204, -0.0379, -0.0094, + -0.0455, -0.4831, -0.3345, -0.2119, 0.4803, 0.1214, 0.1723, 0.2605, + 0.0051, -0.2587, 0.0511, -0.1318, 0.0227, -0.0645, 0.2573, -0.0205, + 0.0665, -0.3562, -0.6070, 0.4191, 0.0351, 0.2033, -0.5508, -0.1415, + -0.1249, -0.0986, -0.1120, -0.1187, 0.0600, 0.1974, 0.5017, -0.0247, + -0.2986, 0.3983, -0.1159, -0.4275, -0.0164, -0.3783, 0.0717, 0.1478, + -0.1144, 0.2292, 0.2741, 0.4309, -0.1611, 0.0755, -0.0981, 0.4584, + -0.2061, -0.0787, -0.1779, 0.2275, -0.1742, -0.2230, -0.1739, 0.0646 + }; + + auto& ids = inputTokens[0]; + + auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, ids.size()}}; + auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, voice.size()}}; + auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; + + *reinterpret_cast(speed.data()) = 0.5f; + std::copy(ids.data(), ids.data() + ids.size(), + reinterpret_cast(inputIdsTensor.data())); + std::copy(voice.data(), voice.data() + voice.size(), + reinterpret_cast(refS.data())); + + // Inference + ModelMetricReporter unused(nullptr, nullptr, 
"unused", 1); + auto executingStreamIdGuard = + std::make_unique(servable->getInferRequestsQueue(), unused); + ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest(); + + inferRequest.set_tensor("input_ids", inputIdsTensor); + inferRequest.set_tensor("103", refS); + inferRequest.set_tensor("speed", speed); + inferRequest.start_async(); + inferRequest.wait(); + + // Collect audio output + auto out = inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]); + RET_CHECK(out.get_shape().size() == 1); + RET_CHECK(out.get_element_type() == ov::element::f32); + const size_t samples = out.get_shape()[0]; + const float* data = out.data(); + + SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", + samples, static_cast(samples) / 24000.0f); + + void* wavDataPtr = nullptr; + size_t wavSize = 0; + prepareAudioOutputKokoro(&wavDataPtr, wavSize, 32, samples, data); + + auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); + drwav_free(wavDataPtr, NULL); + + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process end", cc->NodeName()); + return absl::OkStatus(); + } +}; + +const std::string KokoroCalculator::INPUT_TAG_NAME{"HTTP_REQUEST_PAYLOAD"}; +const std::string KokoroCalculator::OUTPUT_TAG_NAME{"HTTP_RESPONSE_PAYLOAD"}; + +REGISTER_CALCULATOR(KokoroCalculator); + +} // namespace mediapipe diff --git a/src/audio/kokoro/kokoro_calculator.proto b/src/audio/kokoro/kokoro_calculator.proto new file mode 100644 index 0000000000..d9fc1b4bd9 --- /dev/null +++ b/src/audio/kokoro/kokoro_calculator.proto @@ -0,0 +1,33 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +syntax = "proto2"; +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + + +message KokoroCalculatorOptions { + extend mediapipe.CalculatorOptions { + // https://github.com/google/mediapipe/issues/634 have to be unique in app + // no rule to obtain this + optional KokoroCalculatorOptions ext = 116423799; + } + + required string models_path = 1; + optional string target_device = 2; + optional string plugin_config = 3; +} diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp new file mode 100644 index 0000000000..3e42bd0db4 --- /dev/null +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -0,0 +1,197 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_graph.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "openvino/runtime/core.hpp" +#include "../../ovinferrequestsqueue.hpp" + +#include +#include + +#include "src/audio/kokoro/kokoro_calculator.pb.h" +#include "src/logging.hpp" + +namespace ovms { + +struct VocabIndex { + std::unordered_map by_token; + size_t max_token_bytes = 1; +}; + +class EspeakInstance { +public: + static EspeakInstance& instance() { + static EspeakInstance inst; + return inst; + } + + bool isReady() const { return ready_; } + std::mutex& mutex() { return mutex_; } + +private: + EspeakInstance() { + ready_ = tryInit(); + if (!ready_) { + SPDLOG_ERROR("eSpeak-NG initialization failed (data path or voice not found)"); + } else { + SPDLOG_INFO("eSpeak-NG initialized successfully"); + } + } + + ~EspeakInstance() { + if (ready_) { + espeak_Terminate(); + } + } + + EspeakInstance(const EspeakInstance&) = delete; + EspeakInstance& operator=(const EspeakInstance&) = delete; + + bool tryInit() { + auto try_path = [](const char* path) -> bool { + int sr = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, + 0, path, + espeakINITIALIZE_DONT_EXIT); + if (sr <= 0) return false; + if (espeak_SetVoiceByName("en") != EE_OK && + espeak_SetVoiceByName("en-us") != EE_OK) { + return false; + } + return true; + }; + + if (try_path(nullptr)) return true; + + static const char* ngPaths[] = { + "/usr/share/espeak-ng-data", + "/opt/homebrew/share/espeak-ng-data", + "/usr/local/share/espeak-ng-data", + "espeak-ng-data", + nullptr + }; + for (int i = 0; ngPaths[i]; ++i) + if (try_path(ngPaths[i])) return true; + + 
static const char* esPaths[] = { + "/usr/share/espeak-data", + "/usr/local/share/espeak-data", + "espeak-data", + nullptr + }; + for (int i = 0; esPaths[i]; ++i) + if (try_path(esPaths[i])) return true; + + return false; + } + + bool ready_ = false; + std::mutex mutex_; +}; + +struct KokoroServable { + std::filesystem::path parsedModelsPath; + std::shared_ptr model; + ov::CompiledModel compiledModel; + std::unique_ptr inferRequestsQueue; + VocabIndex vocabIndex; + + KokoroServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) { + EspeakInstance::instance(); + + auto fsModelsPath = std::filesystem::path(modelDir); + if (fsModelsPath.is_relative()) { + parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); + } else { + parsedModelsPath = fsModelsPath; + } + + vocabIndex = loadVocabFromConfig(parsedModelsPath); + + ov::AnyMap properties; + ov::Core core; + auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties); + compiledModel = core.compile_model(m_model, targetDevice, properties); + inferRequestsQueue = std::make_unique(compiledModel, 5); + } + + OVInferRequestsQueue& getInferRequestsQueue() { + return *inferRequestsQueue; + } + + const VocabIndex& getVocabIndex() const { + return vocabIndex; + } + +private: + static VocabIndex loadVocabFromConfig(const std::filesystem::path& modelDir) { + VocabIndex ix; + auto configPath = modelDir / "config.json"; + std::ifstream ifs(configPath); + if (!ifs.is_open()) { + SPDLOG_ERROR("Failed to open Kokoro config: {}", configPath.string()); + return ix; + } + + std::stringstream buffer; + buffer << ifs.rdbuf(); + std::string jsonStr = buffer.str(); + + rapidjson::Document doc; + doc.Parse(jsonStr.c_str()); + if (doc.HasParseError()) { + SPDLOG_ERROR("Failed to parse Kokoro config JSON: {}", configPath.string()); + return ix; + } + + if (!doc.HasMember("vocab") || !doc["vocab"].IsObject()) { + SPDLOG_ERROR("Kokoro 
config missing 'vocab' object: {}", configPath.string()); + return ix; + } + + const auto& vocab = doc["vocab"]; + ix.by_token.reserve(vocab.MemberCount()); + for (auto it = vocab.MemberBegin(); it != vocab.MemberEnd(); ++it) { + if (!it->name.IsString() || !it->value.IsInt()) continue; + std::string token = it->name.GetString(); + int id = it->value.GetInt(); + ix.by_token.emplace(token, id); + ix.max_token_bytes = std::max(ix.max_token_bytes, token.size()); + } + + SPDLOG_INFO("Loaded Kokoro vocabulary: {} tokens, max_token_bytes={}", + ix.by_token.size(), ix.max_token_bytes); + return ix; + } +}; + +using KokoroServableMap = std::unordered_map>; +} // namespace ovms diff --git a/src/logging.cpp b/src/logging.cpp index e89fce9a07..9d058d82dc 100644 --- a/src/logging.cpp +++ b/src/logging.cpp @@ -35,6 +35,7 @@ std::shared_ptr llm_executor_logger = std::make_shared llm_calculator_logger = std::make_shared("llm_calculator"); std::shared_ptr s2t_calculator_logger = std::make_shared("s2t_calculator"); std::shared_ptr t2s_calculator_logger = std::make_shared("t2s_calculator"); +std::shared_ptr kokoro_calculator_logger = std::make_shared("kokoro_calculator"); std::shared_ptr embeddings_calculator_logger = std::make_shared("embeddings_calculator"); std::shared_ptr rerank_calculator_logger = std::make_shared("rerank_calculator"); #endif @@ -78,6 +79,7 @@ static void register_loggers(const std::string& log_level, std::vectorset_pattern(default_pattern); s2t_calculator_logger->set_pattern(default_pattern); t2s_calculator_logger->set_pattern(default_pattern); + kokoro_calculator_logger->set_pattern(default_pattern); rerank_calculator_logger->set_pattern(default_pattern); embeddings_calculator_logger->set_pattern(default_pattern); #endif @@ -98,6 +100,7 @@ static void register_loggers(const std::string& log_level, std::vectorsinks().push_back(sink); s2t_calculator_logger->sinks().push_back(sink); t2s_calculator_logger->sinks().push_back(sink); + 
kokoro_calculator_logger->sinks().push_back(sink); rerank_calculator_logger->sinks().push_back(sink); embeddings_calculator_logger->sinks().push_back(sink); #endif @@ -119,6 +122,7 @@ static void register_loggers(const std::string& log_level, std::vector llm_executor_logger; extern std::shared_ptr llm_calculator_logger; extern std::shared_ptr s2t_calculator_logger; extern std::shared_ptr t2s_calculator_logger; +extern std::shared_ptr kokoro_calculator_logger; extern std::shared_ptr embeddings_calculator_logger; extern std::shared_ptr rerank_calculator_logger; #endif diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index 9047765e75..e1436b5891 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -63,6 +63,7 @@ const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalcula const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"}; const std::string MediapipeGraphDefinition::STT_NODE_CALCULATOR_NAME{"S2tCalculator"}; const std::string MediapipeGraphDefinition::TTS_NODE_CALCULATOR_NAME{"T2sCalculator"}; +const std::string MediapipeGraphDefinition::KOKORO_NODE_CALCULATOR_NAME{"KokoroCalculator"}; const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"}; const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"}; @@ -625,6 +626,28 @@ Status MediapipeGraphDefinition::initializeNodes() { return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; } } + if (endsWith(config.node(i).calculator(), KOKORO_NODE_CALCULATOR_NAME)) { + auto& kokoroServableMap = this->sidePacketMaps.kokoroServableMap; + ResourcesCleaningGuard kokoroServablesCleaningGuard(kokoroServableMap); + if (!config.node(i).node_options().size()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node missing options in graph: {}. 
", this->name); + return StatusCode::LLM_NODE_MISSING_OPTIONS; + } + if (config.node(i).name().empty()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name is missing in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_NAME; + } + std::string nodeName = config.node(i).name(); + if (kokoroServableMap.find(nodeName) != kokoroServableMap.end()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name: {} already used in graph: {}. ", nodeName, this->name); + return StatusCode::LLM_NODE_NAME_ALREADY_EXISTS; + } + mediapipe::KokoroCalculatorOptions nodeOptions; + config.node(i).node_options(0).UnpackTo(&nodeOptions); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath()); + kokoroServableMap.insert(std::pair>(nodeName, std::move(servable))); + kokoroServablesCleaningGuard.disableCleaning(); + } } return StatusCode::OK; } diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp index 14c9e0679f..1067ca7d42 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.hpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.hpp @@ -48,6 +48,7 @@ #include "../rerank/rerank_servable.hpp" #include "../audio/speech_to_text/s2t_servable.hpp" #include "../audio/text_to_speech/t2s_servable.hpp" +#include "../audio/kokoro/kokoro_servable.hpp" namespace ovms { class MediapipeGraphDefinitionUnloadGuard; @@ -66,6 +67,7 @@ using GenAiServableMap = std::unordered_map>; using SttServableMap = std::unordered_map>; using TtsServableMap = std::unordered_map>; +using KokoroServableMap = std::unordered_map>; using EmbeddingsServableMap = std::unordered_map>; using ImageGenerationPipelinesMap = std::unordered_map>; @@ -77,6 +79,7 @@ struct GraphSidePackets { RerankServableMap rerankServableMap; SttServableMap sttServableMap; TtsServableMap ttsServableMap; + KokoroServableMap kokoroServableMap; void clear() { 
pythonNodeResourcesMap.clear(); genAiServableMap.clear(); @@ -85,6 +88,7 @@ struct GraphSidePackets { rerankServableMap.clear(); sttServableMap.clear(); ttsServableMap.clear(); + kokoroServableMap.clear(); } bool empty() { return (pythonNodeResourcesMap.empty() && @@ -93,7 +97,8 @@ struct GraphSidePackets { embeddingsServableMap.empty() && rerankServableMap.empty() && sttServableMap.empty() && - ttsServableMap.empty()); + ttsServableMap.empty() && + kokoroServableMap.empty()); } }; @@ -136,6 +141,7 @@ class MediapipeGraphDefinition { static const std::string RERANK_NODE_CALCULATOR_NAME; static const std::string STT_NODE_CALCULATOR_NAME; static const std::string TTS_NODE_CALCULATOR_NAME; + static const std::string KOKORO_NODE_CALCULATOR_NAME; Status waitForLoaded(std::unique_ptr& unloadGuard, const uint32_t waitForLoadedTimeoutMicroseconds = WAIT_FOR_LOADED_DEFAULT_TIMEOUT_MICROSECONDS); // Pipelines are not versioned and any available definition has constant version equal 1. diff --git a/src/mediapipe_internal/mediapipegraphexecutor.cpp b/src/mediapipe_internal/mediapipegraphexecutor.cpp index 93b53fdf8e..b2016ac3aa 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.cpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.cpp @@ -49,6 +49,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( const RerankServableMap& rerankServableMap, const SttServableMap& sttServableMap, const TtsServableMap& ttsServableMap, + const KokoroServableMap& kokoroServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : name(name), @@ -58,7 +59,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( outputTypes(std::move(outputTypes)), inputNames(std::move(inputNames)), outputNames(std::move(outputNames)), - sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap}), + sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, 
embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap, kokoroServableMap}), pythonBackend(pythonBackend), currentStreamTimestamp(STARTING_TIMESTAMP), mediapipeServableMetricReporter(mediapipeServableMetricReporter) {} @@ -92,6 +93,7 @@ const std::string MediapipeGraphExecutor::EMBEDDINGS_SESSION_SIDE_PACKET_TAG = " const std::string MediapipeGraphExecutor::RERANK_SESSION_SIDE_PACKET_TAG = "rerank_servable"; const std::string MediapipeGraphExecutor::STT_SESSION_SIDE_PACKET_TAG = "s2t_servable"; const std::string MediapipeGraphExecutor::TTS_SESSION_SIDE_PACKET_TAG = "t2s_servable"; +const std::string MediapipeGraphExecutor::KOKORO_SESSION_SIDE_PACKET_TAG = "kokoro_servable"; const ::mediapipe::Timestamp MediapipeGraphExecutor::STARTING_TIMESTAMP = ::mediapipe::Timestamp(0); } // namespace ovms diff --git a/src/mediapipe_internal/mediapipegraphexecutor.hpp b/src/mediapipe_internal/mediapipegraphexecutor.hpp index c165469395..af2e8d08e6 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.hpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.hpp @@ -95,6 +95,7 @@ class MediapipeGraphExecutor { static const std::string RERANK_SESSION_SIDE_PACKET_TAG; static const std::string STT_SESSION_SIDE_PACKET_TAG; static const std::string TTS_SESSION_SIDE_PACKET_TAG; + static const std::string KOKORO_SESSION_SIDE_PACKET_TAG; static const ::mediapipe::Timestamp STARTING_TIMESTAMP; MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -107,6 +108,7 @@ class MediapipeGraphExecutor { const RerankServableMap& rerankServableMap, const SttServableMap& sttServableMap, const TtsServableMap& ttsServableMap, + const KokoroServableMap& kokoroServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter); MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -157,6 +159,7 @@ 
class MediapipeGraphExecutor { inputSidePackets[RERANK_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.rerankServableMap).At(STARTING_TIMESTAMP); inputSidePackets[STT_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.sttServableMap).At(STARTING_TIMESTAMP); inputSidePackets[TTS_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.ttsServableMap).At(STARTING_TIMESTAMP); + inputSidePackets[KOKORO_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.kokoroServableMap).At(STARTING_TIMESTAMP); MP_RETURN_ON_FAIL(graph.StartRun(inputSidePackets), std::string("start MediaPipe graph: ") + this->name, StatusCode::MEDIAPIPE_GRAPH_START_ERROR); From fe10782011604a17460c6b384e70663bc84025cd Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 20 Feb 2026 09:32:04 +0100 Subject: [PATCH 02/11] speed --- src/audio/kokoro/kokoro_calculator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc index 986dd92fab..3bfb6e0634 100644 --- a/src/audio/kokoro/kokoro_calculator.cc +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -271,7 +271,7 @@ class KokoroCalculator : public CalculatorBase { auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, voice.size()}}; auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; - *reinterpret_cast(speed.data()) = 0.5f; + *reinterpret_cast(speed.data()) = 0.8f; std::copy(ids.data(), ids.data() + ids.size(), reinterpret_cast(inputIdsTensor.data())); std::copy(voice.data(), voice.data() + voice.size(), From 983613da58aab63c8e68b47d0e3a03940c88c749 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 24 Feb 2026 13:35:49 +0100 Subject: [PATCH 03/11] add espeak --- third_party/espeak_ng/BUILD | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 third_party/espeak_ng/BUILD diff --git a/third_party/espeak_ng/BUILD b/third_party/espeak_ng/BUILD 
new file mode 100644 index 0000000000..2c0a1cb09a --- /dev/null +++ b/third_party/espeak_ng/BUILD @@ -0,0 +1,31 @@ + +# third_party/espeak_ng/BUILD + +config_setting( + name = "is_macos", + values = {"cpu": "darwin"}, +) + +cc_library( + name = "espeak_ng", + copts = select({ + ":is_macos": [ + # Adjust to where Homebrew (or your installer) puts headers + "-I" + "$(HOME)/.brew/opt/espeak-ng/include", + "-I" + "$(HOME)/.brew/opt/espeak-ng/include/espeak-ng", + ], + "//conditions:default": [ + # Typical on Debian/Ubuntu when installing libespeak-ng-dev + "-I/usr/include", + "-I/usr/include/espeak-ng", + ], + }), + linkopts = select({ + ":is_macos": [ + "-L" + "$(HOME)/.brew/opt/espeak-ng/lib", + "-lespeak-ng", + ], + "//conditions:default": ["-lespeak-ng"], + }), + visibility = ["//visibility:public"], +) From f07991158078009687f1e8a8a0701873e4cc29bb Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 24 Feb 2026 15:14:24 +0100 Subject: [PATCH 04/11] fixes --- src/audio/audio_utils.cpp | 15 +-- src/audio/audio_utils.hpp | 2 +- src/audio/kokoro/kokoro_calculator.cc | 139 +++++++++++++++++--------- src/audio/kokoro/kokoro_servable.hpp | 87 +++++++++++++++- 4 files changed, 185 insertions(+), 58 deletions(-) diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 01daafb351..7636d8afe7 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -22,6 +22,7 @@ #include "src/logging.hpp" #include #include +#include #include #include #pragma warning(push) @@ -190,28 +191,28 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample } -void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr) { +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr) { enum : unsigned int { OUTPUT_PREPARATION, TIMER_END }; Timer timer; timer.start(OUTPUT_PREPARATION); + drwav_data_format format; 
format.container = drwav_container_riff; format.format = DR_WAVE_FORMAT_IEEE_FLOAT; format.channels = 1; - format.sampleRate = 24000; // assume it is always 24 KHz - format.bitsPerSample = bitsPerSample; + format.sampleRate = 24000; // Kokoro native sample rate + format.bitsPerSample = 32; drwav wav; - size_t totalSamples = speechSize * format.channels; auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr); if (status == DRWAV_FALSE) { - throw std::runtime_error("Failed to write all frames"); + throw std::runtime_error("Failed to initialize WAV writer"); } - drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, totalSamples, waveformPtr); - if (framesWritten != totalSamples) { + drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, speechSize, waveformPtr); + if (framesWritten != speechSize) { throw std::runtime_error("Failed to write all frames"); } drwav_uninit(&wav); diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp index 874e83dca4..ca0ce00a7c 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -25,5 +25,5 @@ bool isWavBuffer(const std::string buf); std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); -void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr); diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc index 3bfb6e0634..f5b3cb011b 100644 --- a/src/audio/kokoro/kokoro_calculator.cc +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -123,6 +123,71 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, 
outIpa.size()); } +// Post-process eSpeak IPA into Kokoro/misaki phoneme alphabet. +// Mirrors misaki.espeak.EspeakFallback.E2M for American English. +// void espeakIpaToKokoro(std::string& ps) { +// // Helper: replace all occurrences of `from` with `to` in `s`. +// auto replaceAll = [](std::string& s, const std::string& from, const std::string& to) { +// if (from.empty()) return; +// size_t pos = 0; +// while ((pos = s.find(from, pos)) != std::string::npos) { +// s.replace(pos, from.size(), to); +// pos += to.size(); +// } +// }; + +// // --- Multi-char replacements (longest first) --- +// // Syllabic n with glottal stop +// replaceAll(ps, "\xca\x94\xcb\x8c\x6e\xcc\xa9", "\xca\x94\x6e"); // ʔˌn̩ → ʔn +// replaceAll(ps, "\xca\x94\x6e\xcc\xa9", "\xca\x94\x6e"); // ʔn̩ → ʔn +// // Syllabic mark before consonant → ᵊ + consonant +// // ə̩l → ᵊl (syllabic l) +// replaceAll(ps, "\xc9\x99\xcc\xa9\x6c", "\xe1\xb5\x8a\x6c"); // əl̩ → ᵊl (approximation) + +// // Diphthongs +// replaceAll(ps, "a\xc9\xaa", "I"); // aɪ → I +// replaceAll(ps, "a\xca\x8a", "W"); // aʊ → W +// replaceAll(ps, "e\xc9\xaa", "A"); // eɪ → A +// replaceAll(ps, "\xc9\x94\xc9\xaa", "Y"); // ɔɪ → Y +// replaceAll(ps, "o\xca\x8a", "O"); // oʊ → O (American) +// replaceAll(ps, "\xc9\x99\xca\x8a", "O"); // əʊ → O (British) + +// // Affricates +// replaceAll(ps, "d\xca\x92", "\xca\xa4"); // dʒ → ʤ +// replaceAll(ps, "t\xca\x83", "\xca\xa7"); // tʃ → ʧ + +// // Palatalization +// replaceAll(ps, "\xca\xb2\x6f", "jo"); // ʲo → jo +// replaceAll(ps, "\xca\xb2\xc9\x99", "j\xc9\x99"); // ʲə → jə +// replaceAll(ps, "\xca\xb2", ""); // ʲ → (delete) + +// // R-colored vowels and vowel length +// replaceAll(ps, "\xc9\x9c\xcb\x90\xc9\xb9", "\xc9\x9c\xc9\xb9"); // ɜːɹ → ɜɹ +// replaceAll(ps, "\xc9\x9c\xcb\x90", "\xc9\x9c\xc9\xb9"); // ɜː → ɜɹ +// replaceAll(ps, "\xc9\xaa\xc9\x99", "i\xc9\x99"); // ɪə → iə + +// // --- Single-char replacements --- +// replaceAll(ps, "\xc9\x9a", "\xc9\x99\xc9\xb9"); // ɚ → əɹ +// 
replaceAll(ps, "\xc9\x90", "\xc9\x99"); // ɐ → ə +// replaceAll(ps, "\xc9\xac", "l"); // ɬ → l +// replaceAll(ps, "\xc3\xa7", "k"); // ç → k +// replaceAll(ps, "x", "k"); // x → k +// replaceAll(ps, "r", "\xc9\xb9"); // r → ɹ +// replaceAll(ps, "\xcb\x90", ""); // ː → (strip length marks) +// replaceAll(ps, "\xcc\x83", ""); // ̃ → (strip nasal tilde) + +// // British vowel mappings (in case eSpeak uses 'en' voice) +// replaceAll(ps, "\xc9\x92", "\xc9\x94"); // ɒ → ɔ + +// // Remaining standalone vowels (must be AFTER diphthong replacements) +// replaceAll(ps, "o", "\xc9\x94"); // o → ɔ (for espeak < 1.52) +// replaceAll(ps, "e", "A"); // e → A + +// // Flap and glottal stop (misaki version != 2.0) +// replaceAll(ps, "\xc9\xbe", "T"); // ɾ → T +// replaceAll(ps, "\xca\x94", "t"); // ʔ → t +// } + size_t utf8CharLen(unsigned char lead) { if (lead < 0x80) return 1; if ((lead >> 5) == 0x6) return 2; @@ -211,70 +276,50 @@ class KokoroCalculator : public CalculatorBase { RET_CHECK(it->value.IsString()) << "'input' must be a string"; const std::string text = it->value.GetString(); + // Read optional "voice" parameter (OpenAI TTS API) + std::string voiceName; + auto voiceIt = payload.parsedJson->FindMember("voice"); + if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { + voiceName = voiceIt->value.GetString(); + } + // Text -> IPA phonemization std::string phonemes; - espeakPhonemizeAll(text, phonemes, /*noStress=*/true); + espeakPhonemizeAll(text, phonemes, /*noStress=*/false); SPDLOG_DEBUG("Input text: '{}', IPA phonemes ({} chars): '{}'", text, phonemes.size(), phonemes); + // Preserve trailing punctuation from original text (eSpeak strips it) + // if (!text.empty()) { + // char last = text.back(); + // if (last == '.' || last == '!' || last == '?' 
|| last == ';' || last == ':' || last == ',') { + // phonemes.push_back(last); + // } + // } + SPDLOG_DEBUG("After E2M mapping ({} chars): '{}'", phonemes.size(), phonemes); // IPA -> Kokoro token IDs const auto& vocabIx = servable->getVocabIndex(); std::vector> inputTokens(1); tokenize(phonemes, inputTokens[0], vocabIx); - // Prepend PAD token (id=0) - Kokoro model requires BOS/PAD at start + // Wrap with PAD token (id=0) at both ends — matches official + // forward_with_tokens: input_ids = [[0, *tokens, 0]] inputTokens[0].insert(inputTokens[0].begin(), 0); + inputTokens[0].push_back(0); - // Append EOS (period token = 4) if not already present - if (inputTokens[0].empty() || inputTokens[0].back() != 4) { - inputTokens[0].push_back(4); - } - - // Voice embedding - std::vector voice = { - -0.2296, 0.1835, -0.0069, -0.1240, -0.2505, 0.0112, -0.0759, -0.1650, - -0.2665, -0.1965, 0.0242, -0.1667, 0.3524, 0.2140, 0.3069, -0.3377, - -0.0878, -0.0477, 0.0813, -0.2135, -0.2340, -0.1971, 0.0200, 0.0145, - 0.0016, 0.2596, -0.2665, 0.1434, 0.0503, 0.0867, 0.1905, -0.1281, - 0.0658, -0.0639, -0.0920, 0.2444, -0.1506, -0.2197, 0.1385, 0.2133, - -0.0755, -0.0188, -0.0142, 0.2301, -0.0776, -0.0748, 0.0172, 0.0430, - -0.1009, 0.1519, 0.1137, 0.0641, 0.2264, 0.1911, -0.0205, 0.2578, - 0.2210, -0.0784, -0.0235, -0.0547, 0.2191, -0.1623, -0.2416, 0.0076, - 0.0574, 0.2186, 0.0080, 0.0473, 0.0972, 0.0286, 0.1324, 0.0686, - 0.2652, -0.2237, -0.0980, -0.1693, -0.1866, 0.2273, 0.2008, -0.0683, - 0.0957, 0.0623, -0.1891, 0.1620, 0.1811, -0.0516, -0.0800, -0.1416, - -0.2374, -0.1892, 0.1726, -0.0690, -0.0300, 0.0467, -0.2811, -0.1603, - 0.0342, -0.1054, -0.0604, -0.0475, -0.0908, -0.1286, 0.1105, -0.1186, - 0.0582, 0.1887, 0.0345, 0.2081, 0.1404, -0.2532, 0.0026, 0.0402, - 0.0812, -0.0512, 0.0128, 0.0084, -0.0970, -0.0362, 0.0036, -0.0720, - -0.0850, 0.0221, -0.1037, 0.0569, 0.0187, -0.0649, -0.0288, -0.1795, - 0.0045, 0.2535, 0.6751, 0.1578, -0.0966, 0.1516, 0.2109, 0.2033, - -0.2155, 
-0.1783, 0.0836, -0.1050, 0.0676, -0.0237, 0.0387, -0.2564, - 0.1891, 0.1305, -0.3239, -0.1312, 0.2723, 0.0745, 0.1335, 0.0302, - 0.0172, 0.2207, 0.0215, -0.0379, -0.1954, 0.4944, 0.2905, -0.0306, - 0.2858, 0.2341, 0.0545, 0.4626, 0.2947, 0.3802, 0.2820, 0.1557, - 0.1743, -0.1410, 0.0986, 0.4751, -0.2146, 0.3530, -0.2357, -0.5626, - -0.0617, 0.2190, 0.0992, -0.2365, 0.3726, 0.2092, 0.1660, 0.1928, - 0.5731, -0.1734, -0.0816, -0.3191, -0.1871, -0.2217, -0.0112, 0.1261, - 0.1601, 0.3835, 0.0451, -0.1927, -0.1116, 0.2204, -0.0379, -0.0094, - -0.0455, -0.4831, -0.3345, -0.2119, 0.4803, 0.1214, 0.1723, 0.2605, - 0.0051, -0.2587, 0.0511, -0.1318, 0.0227, -0.0645, 0.2573, -0.0205, - 0.0665, -0.3562, -0.6070, 0.4191, 0.0351, 0.2033, -0.5508, -0.1415, - -0.1249, -0.0986, -0.1120, -0.1187, 0.0600, 0.1974, 0.5017, -0.0247, - -0.2986, 0.3983, -0.1159, -0.4275, -0.0164, -0.3783, 0.0717, 0.1478, - -0.1144, 0.2292, 0.2741, 0.4309, -0.1611, 0.0755, -0.0981, 0.4584, - -0.2061, -0.0787, -0.1779, 0.2275, -0.1742, -0.2230, -0.1739, 0.0646 - }; - + // Voice embedding — select slice from voice pack based on content token count auto& ids = inputTokens[0]; + size_t numContentTokens = ids.size() >= 2 ? 
ids.size() - 2 : 0; // exclude BOS pad + EOS + const float* voiceSlice = servable->getVoiceSlice(voiceName, numContentTokens); + RET_CHECK(voiceSlice != nullptr) << "No voice pack loaded (place .bin files in /voices/)"; auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, ids.size()}}; - auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, voice.size()}}; + auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, KokoroServable::STYLE_DIM}}; auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; - *reinterpret_cast(speed.data()) = 0.8f; + *reinterpret_cast(speed.data()) = 1.0f; std::copy(ids.data(), ids.data() + ids.size(), reinterpret_cast(inputIdsTensor.data())); - std::copy(voice.data(), voice.data() + voice.size(), + std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, reinterpret_cast(refS.data())); // Inference @@ -301,7 +346,7 @@ class KokoroCalculator : public CalculatorBase { void* wavDataPtr = nullptr; size_t wavSize = 0; - prepareAudioOutputKokoro(&wavDataPtr, wavSize, 32, samples, data); + prepareAudioOutputKokoro(&wavDataPtr, wavSize, samples, data); auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); drwav_free(wavDataPtr, NULL); diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index 3e42bd0db4..c06f88cfac 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -15,6 +15,8 @@ //***************************************************************************** #pragma once +#include +#include #include #include #include @@ -47,6 +49,11 @@ struct VocabIndex { size_t max_token_bytes = 1; }; +struct VoicePack { + std::vector data; // flat [numEntries * STYLE_DIM] + size_t numEntries = 0; +}; + class EspeakInstance { public: static EspeakInstance& instance() { @@ -82,8 +89,8 @@ class EspeakInstance { 0, path, espeakINITIALIZE_DONT_EXIT); if (sr <= 0) return false; - if (espeak_SetVoiceByName("en") != EE_OK && - espeak_SetVoiceByName("en-us") != 
EE_OK) { + if (espeak_SetVoiceByName("en-us") != EE_OK && + espeak_SetVoiceByName("en") != EE_OK) { return false; } return true; @@ -118,11 +125,15 @@ class EspeakInstance { }; struct KokoroServable { + static constexpr size_t STYLE_DIM = 256; + std::filesystem::path parsedModelsPath; std::shared_ptr model; ov::CompiledModel compiledModel; std::unique_ptr inferRequestsQueue; VocabIndex vocabIndex; + std::unordered_map voicePacks; + std::string defaultVoiceName; KokoroServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) { EspeakInstance::instance(); @@ -135,8 +146,13 @@ struct KokoroServable { } vocabIndex = loadVocabFromConfig(parsedModelsPath); + loadVoicePacks(parsedModelsPath); - ov::AnyMap properties; + ov::AnyMap properties = { + // Use ACCURACY execution mode to avoid fast-math approximation errors + // that accumulate in the deep decoder network and cause energy fade. + ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY), + }; ov::Core core; auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties); compiledModel = core.compile_model(m_model, targetDevice, properties); @@ -151,6 +167,30 @@ struct KokoroServable { return vocabIndex; } + // Returns pointer to 256 floats for the given voice and token count. + // voiceName: requested voice (e.g. "af_alloy"). Falls back to default voice if not found. + // numContentTokens: number of token IDs excluding BOS/EOS padding. 
+ const float* getVoiceSlice(const std::string& voiceName, size_t numContentTokens) const { + auto it = voicePacks.find(voiceName); + if (it == voicePacks.end()) { + it = voicePacks.find(defaultVoiceName); + if (it == voicePacks.end()) { + return nullptr; + } + } + const auto& pack = it->second; + size_t idx = std::min(numContentTokens, pack.numEntries - 1); + return pack.data.data() + (idx * STYLE_DIM); + } + + bool hasVoice(const std::string& voiceName) const { + return voicePacks.count(voiceName) > 0; + } + + const std::string& getDefaultVoiceName() const { + return defaultVoiceName; + } + private: static VocabIndex loadVocabFromConfig(const std::filesystem::path& modelDir) { VocabIndex ix; @@ -191,6 +231,47 @@ struct KokoroServable { ix.by_token.size(), ix.max_token_bytes); return ix; } + + void loadVoicePacks(const std::filesystem::path& modelDir) { + auto voicesDir = modelDir / "voices"; + if (!std::filesystem::exists(voicesDir) || !std::filesystem::is_directory(voicesDir)) { + SPDLOG_WARN("No voices directory found at: {}", voicesDir.string()); + return; + } + + for (const auto& entry : std::filesystem::directory_iterator(voicesDir)) { + if (!entry.is_regular_file() || entry.path().extension() != ".bin") + continue; + + std::string name = entry.path().stem().string(); + auto fileSize = std::filesystem::file_size(entry.path()); + if (fileSize == 0 || fileSize % (STYLE_DIM * sizeof(float)) != 0) { + SPDLOG_ERROR("Voice file {} has invalid size {} (must be multiple of {})", + entry.path().string(), fileSize, STYLE_DIM * sizeof(float)); + continue; + } + + VoicePack pack; + pack.numEntries = fileSize / (STYLE_DIM * sizeof(float)); + pack.data.resize(pack.numEntries * STYLE_DIM); + + std::ifstream ifs(entry.path(), std::ios::binary); + if (!ifs.read(reinterpret_cast(pack.data.data()), fileSize)) { + SPDLOG_ERROR("Failed to read voice file: {}", entry.path().string()); + continue; + } + + SPDLOG_INFO("Loaded voice pack '{}': {} entries x {} dims from {}", + name, 
pack.numEntries, STYLE_DIM, entry.path().string()); + + if (defaultVoiceName.empty()) { + defaultVoiceName = name; + } + voicePacks.emplace(name, std::move(pack)); + } + + SPDLOG_INFO("Loaded {} voice pack(s), default: '{}'", voicePacks.size(), defaultVoiceName); + } }; using KokoroServableMap = std::unordered_map>; From f82bf7f0d52ea733f129f724d0bac78898611912 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 24 Feb 2026 15:31:20 +0100 Subject: [PATCH 05/11] fix --- Dockerfile.ubuntu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index d80087c646..d7d2ace9f8 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -394,6 +394,9 @@ RUN if [ -f /ovms_release/lib/libovms_shared.so ] ; then mv /ovms_release/lib/li # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # FROM $BASE_IMAGE as release +ARG http_proxy +ARG https_proxy +ARG no_proxy ARG INSTALL_RPMS_FROM_URL= ARG INSTALL_DRIVER_VERSION="24.26.30049" ARG GPU=0 From 59d1b3110ff87265aeaab7e6857a2c35b0577ed1 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 25 Feb 2026 13:22:59 +0100 Subject: [PATCH 06/11] style --- src/audio/audio_utils.cpp | 1 - src/audio/audio_utils.hpp | 1 - src/audio/kokoro/kokoro_calculator.cc | 27 +++++++++++++--------- src/audio/kokoro/kokoro_servable.hpp | 33 +++++++++++++++------------ 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 7636d8afe7..1707b45cd2 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -190,7 +190,6 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); } - void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr) { enum : unsigned int { OUTPUT_PREPARATION, diff --git a/src/audio/audio_utils.hpp 
b/src/audio/audio_utils.hpp index ca0ce00a7c..0928d03f3d 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -26,4 +26,3 @@ std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr); - diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc index f5b3cb011b..1747b18081 100644 --- a/src/audio/kokoro/kokoro_calculator.cc +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -189,16 +189,20 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n // } size_t utf8CharLen(unsigned char lead) { - if (lead < 0x80) return 1; - if ((lead >> 5) == 0x6) return 2; - if ((lead >> 4) == 0xE) return 3; - if ((lead >> 3) == 0x1E) return 4; + if (lead < 0x80) + return 1; + if ((lead >> 5) == 0x6) + return 2; + if ((lead >> 4) == 0xE) + return 3; + if ((lead >> 3) == 0x1E) + return 4; return 1; } void tokenize(const std::string& textUtf8, - std::vector& tokenIds, - const ovms::VocabIndex& ix) { + std::vector& tokenIds, + const ovms::VocabIndex& ix) { tokenIds.clear(); size_t pos = 0; const size_t n = textUtf8.size(); @@ -224,7 +228,7 @@ void tokenize(const std::string& textUtf8, const unsigned char lead = static_cast(textUtf8[pos]); const size_t adv = utf8CharLen(lead); SPDLOG_WARN("Tokenizer: unknown bytes at pos {}: '{}'", - pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); + pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); pos += std::min(adv, n - pos); } } @@ -264,7 +268,8 @@ class KokoroCalculator : public CalculatorBase { SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName()); KokoroServableMap servablesMap = cc->InputSidePackets() - 
.Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Get(); + .Tag(KOKORO_SESSION_SIDE_PACKET_TAG) + .Get(); auto servableIt = servablesMap.find(cc->NodeName()); RET_CHECK(servableIt != servablesMap.end()) << "Could not find initialized Kokoro node named: " << cc->NodeName(); @@ -318,9 +323,9 @@ class KokoroCalculator : public CalculatorBase { *reinterpret_cast(speed.data()) = 1.0f; std::copy(ids.data(), ids.data() + ids.size(), - reinterpret_cast(inputIdsTensor.data())); + reinterpret_cast(inputIdsTensor.data())); std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, - reinterpret_cast(refS.data())); + reinterpret_cast(refS.data())); // Inference ModelMetricReporter unused(nullptr, nullptr, "unused", 1); @@ -342,7 +347,7 @@ class KokoroCalculator : public CalculatorBase { const float* data = out.data(); SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", - samples, static_cast(samples) / 24000.0f); + samples, static_cast(samples) / 24000.0f); void* wavDataPtr = nullptr; size_t wavSize = 0; diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index c06f88cfac..5c668ae05d 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -50,7 +50,7 @@ struct VocabIndex { }; struct VoicePack { - std::vector data; // flat [numEntries * STYLE_DIM] + std::vector data; // flat [numEntries * STYLE_DIM] size_t numEntries = 0; }; @@ -86,9 +86,10 @@ class EspeakInstance { bool tryInit() { auto try_path = [](const char* path) -> bool { int sr = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, - 0, path, - espeakINITIALIZE_DONT_EXIT); - if (sr <= 0) return false; + 0, path, + espeakINITIALIZE_DONT_EXIT); + if (sr <= 0) + return false; if (espeak_SetVoiceByName("en-us") != EE_OK && espeak_SetVoiceByName("en") != EE_OK) { return false; @@ -96,26 +97,27 @@ class EspeakInstance { return true; }; - if (try_path(nullptr)) return true; + if (try_path(nullptr)) + return true; static const char* ngPaths[] = { 
"/usr/share/espeak-ng-data", "/opt/homebrew/share/espeak-ng-data", "/usr/local/share/espeak-ng-data", "espeak-ng-data", - nullptr - }; + nullptr}; for (int i = 0; ngPaths[i]; ++i) - if (try_path(ngPaths[i])) return true; + if (try_path(ngPaths[i])) + return true; static const char* esPaths[] = { "/usr/share/espeak-data", "/usr/local/share/espeak-data", "espeak-data", - nullptr - }; + nullptr}; for (int i = 0; esPaths[i]; ++i) - if (try_path(esPaths[i])) return true; + if (try_path(esPaths[i])) + return true; return false; } @@ -220,7 +222,8 @@ struct KokoroServable { const auto& vocab = doc["vocab"]; ix.by_token.reserve(vocab.MemberCount()); for (auto it = vocab.MemberBegin(); it != vocab.MemberEnd(); ++it) { - if (!it->name.IsString() || !it->value.IsInt()) continue; + if (!it->name.IsString() || !it->value.IsInt()) + continue; std::string token = it->name.GetString(); int id = it->value.GetInt(); ix.by_token.emplace(token, id); @@ -228,7 +231,7 @@ struct KokoroServable { } SPDLOG_INFO("Loaded Kokoro vocabulary: {} tokens, max_token_bytes={}", - ix.by_token.size(), ix.max_token_bytes); + ix.by_token.size(), ix.max_token_bytes); return ix; } @@ -247,7 +250,7 @@ struct KokoroServable { auto fileSize = std::filesystem::file_size(entry.path()); if (fileSize == 0 || fileSize % (STYLE_DIM * sizeof(float)) != 0) { SPDLOG_ERROR("Voice file {} has invalid size {} (must be multiple of {})", - entry.path().string(), fileSize, STYLE_DIM * sizeof(float)); + entry.path().string(), fileSize, STYLE_DIM * sizeof(float)); continue; } @@ -262,7 +265,7 @@ struct KokoroServable { } SPDLOG_INFO("Loaded voice pack '{}': {} entries x {} dims from {}", - name, pack.numEntries, STYLE_DIM, entry.path().string()); + name, pack.numEntries, STYLE_DIM, entry.path().string()); if (defaultVoiceName.empty()) { defaultVoiceName = name; From b50e4c60a9538ee3851f866fb6a8b8ab61dad7d5 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 25 Feb 2026 13:27:08 +0100 Subject: [PATCH 07/11] style --- 
src/audio/audio_utils.cpp | 2 +- src/audio/kokoro/kokoro_servable.hpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 1707b45cd2..59668be23f 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -218,4 +218,4 @@ void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSiz timer.stop(OUTPUT_PREPARATION); auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); -} \ No newline at end of file +} diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index 5c668ae05d..ccee9f30cd 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #pragma warning(push) #pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) From 31f06cc486e7305ee7a0f17d45a503ab3dbe10ff Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 27 Feb 2026 10:34:02 +0100 Subject: [PATCH 08/11] fix --- demos/audio/export_kokoro.py | 141 +++++++++++++++++++++++++++ demos/audio/tts_test_strings.py | 125 ++++++++++++++++++++++++ src/audio/kokoro/kokoro_servable.hpp | 45 ++++++++- 3 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 demos/audio/export_kokoro.py create mode 100644 demos/audio/tts_test_strings.py diff --git a/demos/audio/export_kokoro.py b/demos/audio/export_kokoro.py new file mode 100644 index 0000000000..d2615a7aa5 --- /dev/null +++ b/demos/audio/export_kokoro.py @@ -0,0 +1,141 @@ +# +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: Apache 2.0 +# + +import torch +import json +import time +from pathlib import Path +from kokoro.model import KModel +from kokoro import KPipeline +import openvino as ov +import shutil + +MAX_SEQ_LENGTH = 500 + + +class KokoroTTSPipeline: + def 
__init__(self): + model_id = "hexgrad/Kokoro-82M" + self.pipeline = KPipeline(lang_code="a", repo_id=model_id) + + def __call__(self, text: str, voice: str = "af_heart"): + with torch.no_grad(): + generator = self.pipeline(text, voice=voice) + result = next(generator) + return result.audio + + +class OVKModel(KModel): + def __init__(self, model_dir: Path, device: str, plugin_config: dict = {}): + torch.nn.Module.__init__(self) + + core = ov.Core() + + self.repo_id = model_id + with (model_dir / "config.json").open("r", encoding="utf-8") as f: + config = json.load(f) + self.vocab = config["vocab"] + print("Starting to compile OpenVINO model on device:", device) + + start = time.time() + self.model = core.compile_model(model_dir / "openvino_model.xml", device.upper(), config=plugin_config) + print(f"Model compiled successfully in {time.time() - start:.2f}s.") + self.context_length = config["plbert"]["max_position_embeddings"] + + @property + def device(self): + return torch.device("cpu") + + def forward_with_tokens(self, input_ids: torch.LongTensor, ref_s: torch.FloatTensor, speed: float = 1) -> tuple[torch.FloatTensor, torch.LongTensor]: + text_len = input_ids.shape[-1] + + if text_len < MAX_SEQ_LENGTH: + # 0 in this model context is acting as BOS/EOS/PAD. + # Since 0 causes artifacts, we might consider space (16) or period (4). 
+ padding_value = 16 + input_ids = torch.nn.functional.pad(input_ids, (0, MAX_SEQ_LENGTH - text_len), value=padding_value) + + start = time.time() + print("Running inference on OpenVINO model...") + outputs = self.model([input_ids, ref_s, torch.tensor(speed)]) + print(f"Inference completed in {time.time() - start:.2f}s.") + + audio = torch.from_numpy(outputs[0]) + pred_dur = torch.from_numpy(outputs[1]) + + if text_len < MAX_SEQ_LENGTH: + pred_dur = pred_dur[:text_len] + # Approximate audio trimming based on duration ratio + total_dur = outputs[1].sum() + valid_dur = pred_dur.sum() + if total_dur > 0: + audio_keep = int(audio.shape[-1] * (valid_dur / total_dur)) + audio = audio[:audio_keep] + + return audio, pred_dur + + @staticmethod + def download_and_convert(model_dir: Path, repo_id: str, ttsPipeline: KokoroTTSPipeline): + import openvino as ov + from huggingface_hub import hf_hub_download + import gc + + if not (model_dir / "openvino_model.xml").exists(): + print(f"Converting Kokoro model to OpenVINO format at {model_dir}...") + model = ttsPipeline.pipeline.model + model.forward = model.forward_with_tokens + input_ids = torch.randint(1, 100, (48,)).numpy() + input_ids = torch.LongTensor([[0, *input_ids, 0]]) + style = torch.randn(1, 256) + speed = torch.randint(1, 10, (1,), dtype=torch.float32) + + ov_model = ov.convert_model(model, example_input=(input_ids, style, speed), input=[ + ov.PartialShape("[1, 2..]"), ov.PartialShape([1, -1])]) + ov.save_model(ov_model, model_dir / "openvino_model.xml") + hf_hub_download(repo_id=model_id, filename="config.json", local_dir=model_dir) + else: + print(f"OpenVINO model already exists at {model_dir}, skipping conversion.") + + gc.collect() + + @staticmethod + def convert_to_static(input_model_dir: Path, output_model_dir: Path): + import openvino as ov + + print(f"Converting OpenVINO model to static shapes at {input_model_dir}...") + core = ov.Core() + model = core.read_model(input_model_dir / "openvino_model.xml") + 
static_shape = {"input_ids": [1, MAX_SEQ_LENGTH], "ref_s": [1, 256], "speed": [1], } + model.reshape(static_shape) + print("Reshaped model inputs:", model.inputs) + ov.save_model(model, output_model_dir / "openvino_model.xml") + print("Conversion to static shapes completed.") + # Copy config file + shutil.copy(input_model_dir / "config.json", output_model_dir / "config.json") + + +if __name__ == "__main__": + + model_id = "hexgrad/Kokoro-82M" + + # Download model from Hugging Face and convert to OpenVINO format. + pipeline = KokoroTTSPipeline() + + # Convert and save the Kokoro model to OpenVINO format + OVKModel.download_and_convert(Path("./kokoro_openvino_model"), repo_id=model_id, ttsPipeline=pipeline) + + # To run inference on NPU, the model must have static input shapes + OVKModel.convert_to_static(Path("./kokoro_openvino_model"), Path("./kokoro_static_openvino_model")) + # # Execution on NPU requires a config file + # config = { + # "NPU": { + # "NPU_USE_NPUW": "YES", + # "NPUW_DEVICES": "NPU,CPU", + # "NPUW_KOKORO": "YES", + # } + # } + + # # NPUW_CACHE_DIR can be used to avoid compilation on every run + # config["NPU"]["NPUW_CACHE_DIR"] = "./npu_cache_kokoro" \ No newline at end of file diff --git a/demos/audio/tts_test_strings.py b/demos/audio/tts_test_strings.py new file mode 100644 index 0000000000..79b1194a3e --- /dev/null +++ b/demos/audio/tts_test_strings.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +"""Send a battery of tricky TTS test strings to an OpenAI-compatible +speech endpoint, one by one, and save each result as a numbered WAV file. + +Usage: + python tts_test_strings.py --endpoint http://localhost:8000/v3 \ + --model kokoro \ + [--voice None] \ + [--output-dir tts_output] +""" + +import argparse +import os +import sys +import time + +from openai import OpenAI + +TEST_STRINGS = [ + 'Dr. A. B. Carter Jr. met Sen. O\'Neill at 5 p.m., Wed., in Washington, D.C.', + 'Mr. Smith, Ph.D., arrived on Fri. at 6:30 a.m.; Mrs. 
Jones left at noon.', + 'We meet on 01/02/2025 at 05:30 IST; is that India or Israel time?', + 'The deadline is 2025\u201102\u201101 23:59 UTC\u221205:00 (EST).', + 'He finished 1st; she was 22nd\u2014barely.', + 'Prices: $1,234.56 vs \u20ac1.234,56; also \u00a512 345 (thin space).', + 'Add \u00be cup, then \u00bd tsp; total \u2248 1\u00bc cups.', + 'Chapter XLIV starts on page ix; version v2.0.0 follows v1.12.9.', + 'Dose: 5 mg vs 5 \u03bcg\u2014don\'t confuse micrograms with milligrams.', + 'Avogadro\'s number is 6.022e23; \u03c0 \u2248 3.14159; \u221a2 \u2248 1.4142.', + 'Temperature dropped to \u221210 \u00b0C (14 \u00b0F) with 90% RH.', + 'Visit https://example.com/a/b?x=1&y=2#frag or email ops+alerts@example.org.', + 'Open C:\\Program Files\\Project\\config.yaml or /usr/local/bin/run.sh.', + '.NET, Node.js, C#, C++17, and Rust\'s crate\u2011names\u2011with\u2011hyphens.', + '"WYSIWYG," "GIF" (hard or soft g?), "SQL" (sequel or S\u2011Q\u2011L?).', + 'I will present the present to the lead singer who stepped on the lead.', + 'They desert the desert; the dove dove; he wound the wound.', + 'Please record the record before the minute is up in a minute.', + 'She sells seashells by the seashore; truly Irish wristwatch.', + 'Unique New York, toy boat, red leather yellow leather.', + 'A na\u00efve co\u00f6perative fa\u00e7ade in S\u00e3o Paulo; \u0141\u00f3d\u017a and Krak\u00f3w in Poland.', + 'Pi\u00f1ata, jalape\u00f1o, cr\u00e8me br\u00fbl\u00e9e, bouillabaisse, d\u00e9j\u00e0 vu.', + '\U0001f44d\U0001f3fb is a thumbs\u2011up with light skin tone; \U0001f9d1\u200d\U0001f4bb writes code; \U0001f468\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466 is a family; \U0001f1f5\U0001f1f1 is a flag.', + 'Faces: \U0001f642\U0001f609\U0001f610\U0001f611\U0001f636; hearts: \u2764\ufe0f\U0001f9e1\U0001f49b\U0001f49a\U0001f499; mixed: \U0001f937\u200d\u2642\ufe0f\U0001f926\u200d\u2640\ufe0f.', + 'Latin "A" vs Cyrillic "\u0410"; Greek "\u03c1" vs Latin "p"; micro "\u00b5" vs 
Greek "\u03bc".', + '\u05e9\u05dc\u05d5\u05dd and \u0645\u0631\u062d\u0628\u064b\u0627 appear with left\u2011to\u2011right text in one line.', + 'Prosody markers: \u02c8primary, \u02ccsecondary, and length \u02d0 are tricky for tokenizers.', + 'Arrows for intonation: \u2197 rising, \u2198 falling, \u2193 drop.', + 'He said, "She replied, \'no\u2014never\u2026\'," then left\u2014silently.', + 'Parentheticals (like this\u2014really!) and em\u2011dashes\u2014here\u2014confuse prosody.', + 'Let f(x)=x^2; then d/dx x^2=2x; \u2202/\u2202x is the operator.', + 'Inline code x += 1; and TeX E=mc^2 should be read clearly.', + 'N,N\u2011Diethyl\u2011meta\u2011toluamide (DEET) differs from p\u2011xylene and m\u2011cresol.', + 'The RFC 7231/HTTP\u2011semantics "GET" vs "HEAD" distinction matters.', + 'Read "macOS" vs "Mac OS", "iOS", "SQL", "URL", and "S3" correctly.', +] + + +def main(): + parser = argparse.ArgumentParser( + description="Send TTS test strings to an OpenAI-compatible speech endpoint." + ) + parser.add_argument( + "--endpoint", required=True, + help="Base URL of the API (e.g. http://localhost:8000/v3)" + ) + parser.add_argument( + "--model", required=True, + help="Model name to use for speech generation" + ) + parser.add_argument( + "--voice", default=None, + help="Voice name (default: voice1)" + ) + parser.add_argument( + "--output-dir", default="tts_output", + help="Directory to save output WAV files (default: tts_output)" + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + client = OpenAI(base_url=args.endpoint, api_key="unused") + + total = len(TEST_STRINGS) + print(f"Sending {total} test strings to {args.endpoint} (model={args.model}, voice={args.voice})") + print(f"Output directory: {args.output_dir}\n") + + succeeded = 0 + failed = 0 + total_size_kb = 0.0 + t_start = time.time() + + for idx, text in enumerate(TEST_STRINGS, start=1): + preview = text[:80] + ("..." 
if len(text) > 80 else "") + print(f"[{idx:2d}/{total}] {preview}") + + out_path = os.path.join(args.output_dir, f"{idx:02d}.wav") + t0 = time.time() + try: + response = client.audio.speech.create( + model=args.model, + voice=args.voice, + input=text, + ) + response.write_to_file(out_path) + elapsed = time.time() - t0 + size_kb = os.path.getsize(out_path) / 1024 + total_size_kb += size_kb + succeeded += 1 + print(f" -> {out_path} ({size_kb:.1f} KB, {elapsed:.2f}s)") + except Exception as exc: + elapsed = time.time() - t0 + failed += 1 + print(f" !! FAILED after {elapsed:.2f}s: {exc}", file=sys.stderr) + + total_elapsed = time.time() - t_start + print(f"\n{'='*60}") + print(f"Summary: {succeeded} succeeded, {failed} failed out of {total}") + print(f"Total time: {total_elapsed:.2f}s (avg {total_elapsed/total:.2f}s per string)") + print(f"Total audio size: {total_size_kb:.1f} KB") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index ccee9f30cd..73dff4b104 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -156,10 +156,21 @@ struct KokoroServable { // that accumulate in the deep decoder network and cause energy fade. ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY), }; + //properties["INFERENCE_PRECISION_HINT"] = "f32"; ov::Core core; auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties); compiledModel = core.compile_model(m_model, targetDevice, properties); - inferRequestsQueue = std::make_unique(compiledModel, 5); + uint32_t numberOfParallelInferRequests = 1; + try { + numberOfParallelInferRequests = compiledModel.get_property(ov::optimal_number_of_infer_requests); + } catch (const ov::Exception& ex) { + SPDLOG_WARN("Failed to query OPTIMAL_NUMBER_OF_INFER_REQUESTS with error {}. 
Using 1 nireq.", ex.what()); + numberOfParallelInferRequests = 1u; + } + inferRequestsQueue = std::make_unique(compiledModel, numberOfParallelInferRequests); + + // Warm up model with dummy inference + //warmUpModel(); } OVInferRequestsQueue& getInferRequestsQueue() { @@ -276,6 +287,38 @@ struct KokoroServable { SPDLOG_INFO("Loaded {} voice pack(s), default: '{}'", voicePacks.size(), defaultVoiceName); } + + // void warmUpModel() { + // try { + // SPDLOG_INFO("Warming up Kokoro model with dummy inference..."); + + // // Create dummy tensors with minimal sequence length + // constexpr size_t dummySeqLen = 3; // [0, token, 0] pattern + // auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, dummySeqLen}}; + // auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, STYLE_DIM}}; + // auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; + + // // Fill with dummy values + // auto* idsData = reinterpret_cast(inputIdsTensor.data()); + // idsData[0] = 0; // PAD token + // idsData[1] = 1; // arbitrary token ID + // idsData[2] = 0; // PAD token + + // std::fill_n(reinterpret_cast(refS.data()), STYLE_DIM, 0.0f); + // *reinterpret_cast(speed.data()) = 1.0f; + + // // Get infer request and run warm-up inference + // ov::InferRequest inferRequest = compiledModel.create_infer_request(); + // inferRequest.set_tensor("input_ids", inputIdsTensor); + // inferRequest.set_tensor("103", refS); + // inferRequest.set_tensor("speed", speed); + // inferRequest.infer(); + + // SPDLOG_INFO("Kokoro model warm-up completed successfully"); + // } catch (const std::exception& ex) { + // SPDLOG_WARN("Kokoro model warm-up failed: {}. 
Continuing anyway...", ex.what()); + // } + // } }; using KokoroServableMap = std::unordered_map>; From 962e3d34ad7e813e9dc1b5a8ce7f8598d3ffc7b5 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 27 Feb 2026 15:52:27 +0100 Subject: [PATCH 09/11] Improvements --- demos/audio/export_kokoro.py | 6 +- src/audio/kokoro/kokoro_calculator.cc | 387 +++++++++++++---------- src/audio/kokoro/kokoro_calculator.proto | 1 + src/audio/kokoro/kokoro_servable.hpp | 47 +-- 4 files changed, 239 insertions(+), 202 deletions(-) diff --git a/demos/audio/export_kokoro.py b/demos/audio/export_kokoro.py index d2615a7aa5..8ca3ed89f6 100644 --- a/demos/audio/export_kokoro.py +++ b/demos/audio/export_kokoro.py @@ -118,16 +118,16 @@ def convert_to_static(input_model_dir: Path, output_model_dir: Path): if __name__ == "__main__": - model_id = "hexgrad/Kokoro-82M" + model_id = "hexgrad/Kokoro-82M-v1.1-zh" # Download model from Hugging Face and convert to OpenVINO format. pipeline = KokoroTTSPipeline() # Convert and save the Kokoro model to OpenVINO format - OVKModel.download_and_convert(Path("./kokoro_openvino_model"), repo_id=model_id, ttsPipeline=pipeline) + OVKModel.download_and_convert(Path("./kokoro_openvino_model_zh"), repo_id=model_id, ttsPipeline=pipeline) # To run inference on NPU, model must have static input shapes - OVKModel.convert_to_static(Path("./kokoro_openvino_model"), Path("./kokoro_static_openvino_model")) + OVKModel.convert_to_static(Path("./kokoro_openvino_model_zh"), Path("./kokoro_static_openvino_model_zh")) # # Execution on NPU require config file # config = { # "NPU": { diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc index 1747b18081..728e0f88b4 100644 --- a/src/audio/kokoro/kokoro_calculator.cc +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -14,11 +14,8 @@ // limitations under the License. 
//***************************************************************************** #include -#include -#include #include #include -#include #include #pragma warning(push) @@ -40,7 +37,6 @@ #pragma warning(push) #pragma warning(disable : 6001 4324 6385 6386) -#include "absl/strings/escaping.h" #include "absl/strings/str_cat.h" #pragma warning(pop) @@ -64,7 +60,67 @@ namespace { #define espeakPHONEMES_NO_STRESS 0x08 #endif -void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool noStress = true) { +std::string retone(const std::string& p) { + std::string result = p; + + auto replaceAll = [](std::string& s, const std::string& from, const std::string& to) { + size_t pos = 0; + while ((pos = s.find(from, pos)) != std::string::npos) { + s.replace(pos, from.size(), to); + pos += to.size(); + } + }; + + // Tone mark replacements + replaceAll(result, "˧˩˧", "↓"); // third tone + replaceAll(result, "˧˥", "↗"); // second tone + replaceAll(result, "˥˩", "↘"); // fourth tone + replaceAll(result, "˥", "→"); // first tone + + // Unicode character replacements (UTF-8 encoded) + replaceAll(result, "\xCA\x97\xCC\x89", "ɨ"); // chr(635)+chr(809) + replaceAll(result, "\xCA\x91\xCC\x89", "ɨ"); // chr(633)+chr(809) + + // Verify chr(809) removed + if (result.find("\xCC\x89") != std::string::npos) { + SPDLOG_WARN("Combining diacritic (chr 809) still present: {}", result); + } + + return result; +} + +std::string getEspeakVoice(const std::string& isoLanguageCode) { + // ISO 639-1 codes with optional region codes + if (isoLanguageCode == "en-us") { + return "en-us"; // American English (default for 'en') + } else if (isoLanguageCode == "en-gb") { + return "en"; // British English + } else if (isoLanguageCode == "en") { + return "en-us"; // Default to American English when only 'en' specified + } else if (isoLanguageCode == "es") { + return "es"; + } else if (isoLanguageCode == "fr") { + return "fr"; + } else if (isoLanguageCode == "hi") { + return "hi"; + } else if 
(isoLanguageCode == "it") { + return "it"; + } else if (isoLanguageCode == "ja") { + return "ja"; + } else if (isoLanguageCode == "pt-br") { + return "pt"; // Brazilian Portuguese + } else if (isoLanguageCode == "zh" || isoLanguageCode == "zh-cn") { + return "cmn-latn-pinyin"; // Mandarin Chinese + } + return ""; // Unsupported +} + +bool isSupportedLanguage(const std::string& isoLanguageCode) { + // Only accept ISO 639-1 codes and regional variants + return !getEspeakVoice(isoLanguageCode).empty(); +} + +void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, const std::string& language = "en", bool noStress = true) { outIpa.clear(); auto& espeak = ovms::EspeakInstance::instance(); if (!espeak.isReady()) { @@ -74,6 +130,23 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n std::lock_guard guard(espeak.mutex()); + // Get the eSpeak voice name from the ISO language code + // Kokoro supports 9 languages: American English, British English, Spanish, French, Hindi, Italian, Japanese, Brazilian Portuguese, Mandarin Chinese + std::string voiceName = getEspeakVoice(language); + if (voiceName.empty()) { + // This should not happen if validation was done, but fallback just in case + SPDLOG_ERROR("Invalid language code '{}' passed to espeakPhonemizeAll", language); + voiceName = "en-us"; + } + if (espeak_SetVoiceByName(voiceName.c_str()) != EE_OK) { + SPDLOG_ERROR("Failed to set eSpeak voice '{}'", voiceName); + if (voiceName != "en-us" && espeak_SetVoiceByName("en-us") == EE_OK) { + voiceName = "en-us"; + } else { + return; + } + } + const int mode = espeakPHONEMES_IPA | (noStress ? 
espeakPHONEMES_NO_STRESS : 0); const void* pos = static_cast(textUtf8.c_str()); const char* endPtr = static_cast(pos) + textUtf8.size(); @@ -91,6 +164,7 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n // Strip combining diacriticals (U+0300..U+036F) and collapse spaces std::string cleaned; + cleaned.reserve(rawIpa.size()); for (size_t i = 0; i < rawIpa.size(); ++i) { unsigned char c = static_cast(rawIpa[i]); if (i + 1 < rawIpa.size()) { @@ -103,6 +177,7 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n cleaned.push_back(c); } + outIpa.reserve(cleaned.size()); bool lastSpace = false; for (char c : cleaned) { if (std::isspace(static_cast(c))) { @@ -123,70 +198,6 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, outIpa.size()); } -// Post-process eSpeak IPA into Kokoro/misaki phoneme alphabet. -// Mirrors misaki.espeak.EspeakFallback.E2M for American English. -// void espeakIpaToKokoro(std::string& ps) { -// // Helper: replace all occurrences of `from` with `to` in `s`. 
-// auto replaceAll = [](std::string& s, const std::string& from, const std::string& to) { -// if (from.empty()) return; -// size_t pos = 0; -// while ((pos = s.find(from, pos)) != std::string::npos) { -// s.replace(pos, from.size(), to); -// pos += to.size(); -// } -// }; - -// // --- Multi-char replacements (longest first) --- -// // Syllabic n with glottal stop -// replaceAll(ps, "\xca\x94\xcb\x8c\x6e\xcc\xa9", "\xca\x94\x6e"); // ʔˌn̩ → ʔn -// replaceAll(ps, "\xca\x94\x6e\xcc\xa9", "\xca\x94\x6e"); // ʔn̩ → ʔn -// // Syllabic mark before consonant → ᵊ + consonant -// // ə̩l → ᵊl (syllabic l) -// replaceAll(ps, "\xc9\x99\xcc\xa9\x6c", "\xe1\xb5\x8a\x6c"); // əl̩ → ᵊl (approximation) - -// // Diphthongs -// replaceAll(ps, "a\xc9\xaa", "I"); // aɪ → I -// replaceAll(ps, "a\xca\x8a", "W"); // aʊ → W -// replaceAll(ps, "e\xc9\xaa", "A"); // eɪ → A -// replaceAll(ps, "\xc9\x94\xc9\xaa", "Y"); // ɔɪ → Y -// replaceAll(ps, "o\xca\x8a", "O"); // oʊ → O (American) -// replaceAll(ps, "\xc9\x99\xca\x8a", "O"); // əʊ → O (British) - -// // Affricates -// replaceAll(ps, "d\xca\x92", "\xca\xa4"); // dʒ → ʤ -// replaceAll(ps, "t\xca\x83", "\xca\xa7"); // tʃ → ʧ - -// // Palatalization -// replaceAll(ps, "\xca\xb2\x6f", "jo"); // ʲo → jo -// replaceAll(ps, "\xca\xb2\xc9\x99", "j\xc9\x99"); // ʲə → jə -// replaceAll(ps, "\xca\xb2", ""); // ʲ → (delete) - -// // R-colored vowels and vowel length -// replaceAll(ps, "\xc9\x9c\xcb\x90\xc9\xb9", "\xc9\x9c\xc9\xb9"); // ɜːɹ → ɜɹ -// replaceAll(ps, "\xc9\x9c\xcb\x90", "\xc9\x9c\xc9\xb9"); // ɜː → ɜɹ -// replaceAll(ps, "\xc9\xaa\xc9\x99", "i\xc9\x99"); // ɪə → iə - -// // --- Single-char replacements --- -// replaceAll(ps, "\xc9\x9a", "\xc9\x99\xc9\xb9"); // ɚ → əɹ -// replaceAll(ps, "\xc9\x90", "\xc9\x99"); // ɐ → ə -// replaceAll(ps, "\xc9\xac", "l"); // ɬ → l -// replaceAll(ps, "\xc3\xa7", "k"); // ç → k -// replaceAll(ps, "x", "k"); // x → k -// replaceAll(ps, "r", "\xc9\xb9"); // r → ɹ -// replaceAll(ps, "\xcb\x90", ""); // ː → 
(strip length marks) -// replaceAll(ps, "\xcc\x83", ""); // ̃ → (strip nasal tilde) - -// // British vowel mappings (in case eSpeak uses 'en' voice) -// replaceAll(ps, "\xc9\x92", "\xc9\x94"); // ɒ → ɔ - -// // Remaining standalone vowels (must be AFTER diphthong replacements) -// replaceAll(ps, "o", "\xc9\x94"); // o → ɔ (for espeak < 1.52) -// replaceAll(ps, "e", "A"); // e → A - -// // Flap and glottal stop (misaki version != 2.0) -// replaceAll(ps, "\xc9\xbe", "T"); // ɾ → T -// replaceAll(ps, "\xca\x94", "t"); // ʔ → t -// } size_t utf8CharLen(unsigned char lead) { if (lead < 0x80) @@ -202,10 +213,15 @@ size_t utf8CharLen(unsigned char lead) { void tokenize(const std::string& textUtf8, std::vector& tokenIds, - const ovms::VocabIndex& ix) { + const ovms::VocabIndex& ix, + const std::string& language = "en") { tokenIds.clear(); + // Reserve estimated capacity to avoid reallocations + tokenIds.reserve(textUtf8.size() / 2); + size_t pos = 0; const size_t n = textUtf8.size(); + size_t unknownCount = 0; while (pos < n) { size_t maxTry = std::min(ix.max_token_bytes, n - pos); @@ -227,12 +243,20 @@ void tokenize(const std::string& textUtf8, } else { const unsigned char lead = static_cast(textUtf8[pos]); const size_t adv = utf8CharLen(lead); - SPDLOG_WARN("Tokenizer: unknown bytes at pos {}: '{}'", - pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); + std::string unknownBytes(textUtf8.data() + pos, std::min(adv, n - pos)); + unknownCount++; + SPDLOG_DEBUG("Tokenizer [lang={}]: unknown phoneme at pos {}: '{}' (skipping)", + language, pos, unknownBytes); pos += std::min(adv, n - pos); } } - SPDLOG_DEBUG("Tokenize: produced {} ids", tokenIds.size()); + if (unknownCount > 0) { + SPDLOG_WARN("Tokenize [lang={}]: {} unknown phonemes found. Produced {} token ids. 
" + "Consider updating vocabulary for better {} speech quality.", + language, unknownCount, tokenIds.size(), language); + } else { + SPDLOG_DEBUG("Tokenize [lang={}]: produced {} ids without unknown phonemes", language, tokenIds.size()); + } } } // namespace @@ -243,6 +267,7 @@ const std::string KOKORO_SESSION_SIDE_PACKET_TAG = "KOKORO_NODE_RESOURCES"; class KokoroCalculator : public CalculatorBase { static const std::string INPUT_TAG_NAME; static const std::string OUTPUT_TAG_NAME; + std::string defaultLanguage; // Language configured in graph pbtxt public: static absl::Status GetContract(CalculatorContract* cc) { @@ -261,102 +286,140 @@ class KokoroCalculator : public CalculatorBase { absl::Status Open(CalculatorContext* cc) final { SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Open", cc->NodeName()); + + // Read language from graph configuration + const auto& options = cc->Options(); + this->defaultLanguage = options.has_language() ? options.language() : "en"; + + // Normalize language code to lowercase + std::transform(this->defaultLanguage.begin(), this->defaultLanguage.end(), this->defaultLanguage.begin(), ::tolower); + + // Validate language is supported + if (!isSupportedLanguage(this->defaultLanguage)) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid language in graph config: '", this->defaultLanguage, "'. ", + "Supported ISO 639-1 language codes: en, es, fr, hi, it, ja, pt-br, zh. 
", + "Regional variants: en-us, en-gb, pt-br, zh-cn" + )); + } + + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, + "KokoroCalculator [Node: {}] configured for language: {}", + cc->NodeName(), this->defaultLanguage); + return absl::OkStatus(); } absl::Status Process(CalculatorContext* cc) final { SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName()); + try { + KokoroServableMap servablesMap = cc->InputSidePackets() + .Tag(KOKORO_SESSION_SIDE_PACKET_TAG) + .Get(); + auto servableIt = servablesMap.find(cc->NodeName()); + RET_CHECK(servableIt != servablesMap.end()) + << "Could not find initialized Kokoro node named: " << cc->NodeName(); + auto servable = servableIt->second; + + const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + auto it = payload.parsedJson->FindMember("input"); + RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request"; + RET_CHECK(it->value.IsString()) << "'input' must be a string"; + const std::string text = it->value.GetString(); + + // Read optional "voice" parameter (OpenAI TTS API) + std::string voiceName; + auto voiceIt = payload.parsedJson->FindMember("voice"); + if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { + voiceName = voiceIt->value.GetString(); + } - KokoroServableMap servablesMap = cc->InputSidePackets() - .Tag(KOKORO_SESSION_SIDE_PACKET_TAG) - .Get(); - auto servableIt = servablesMap.find(cc->NodeName()); - RET_CHECK(servableIt != servablesMap.end()) - << "Could not find initialized Kokoro node named: " << cc->NodeName(); - auto servable = servableIt->second; - - const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); - auto it = payload.parsedJson->FindMember("input"); - RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request"; - RET_CHECK(it->value.IsString()) << "'input' must be a string"; - const std::string text = it->value.GetString(); - - // Read optional "voice" parameter (OpenAI 
TTS API) - std::string voiceName; - auto voiceIt = payload.parsedJson->FindMember("voice"); - if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { - voiceName = voiceIt->value.GetString(); + // Language is configured in the graph pbtxt, not from request + // Use the defaultLanguage set during Open() + const std::string language = this->defaultLanguage; + SPDLOG_DEBUG("Using configured language: {}", language); + + // Text -> IPA phonemization + std::string phonemes; + + // Use eSpeak for all languages + espeakPhonemizeAll(text, phonemes, language, /*noStress=*/false); + if(language == "zh" || language == "zh-cn"){ + phonemes = retone(phonemes); + } + + SPDLOG_DEBUG("Input text: '{}' (language: {}), IPA phonemes ({} chars): '{}'", text, language, phonemes.size(), phonemes); + + // Preserve trailing punctuation from original text (eSpeak strips it) + // if (!text.empty()) { + // char last = text.back(); + // if (last == '.' || last == '!' || last == '?' || last == ';' || last == ':' || last == ',') { + // phonemes.push_back(last); + // } + // } + SPDLOG_DEBUG("After E2M mapping ({} chars): '{}'", phonemes.size(), phonemes); + // IPA -> Kokoro token IDs + const auto& vocabIx = servable->getVocabIndex(); + std::vector tokenIds; + tokenize(phonemes, tokenIds, vocabIx, language); + + // Wrap with PAD token (id=0) at both ends — matches official + // forward_with_tokens: input_ids = [[0, *tokens, 0]] + tokenIds.insert(tokenIds.begin(), 0); + tokenIds.push_back(0); + + // Voice embedding — select slice from voice pack based on content token count + size_t numContentTokens = tokenIds.size() >= 2 ? 
tokenIds.size() - 2 : 0; // exclude BOS pad + EOS + const float* voiceSlice = servable->getVoiceSlice(voiceName, numContentTokens); + RET_CHECK(voiceSlice != nullptr) << "No voice pack loaded (place .bin files in /voices/)"; + + auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, tokenIds.size()}}; + auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, KokoroServable::STYLE_DIM}}; + auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; + + *reinterpret_cast(speed.data()) = 1.0f; + std::copy(tokenIds.data(), tokenIds.data() + tokenIds.size(), + reinterpret_cast(inputIdsTensor.data())); + std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, + reinterpret_cast(refS.data())); + + // Inference + ModelMetricReporter unused(nullptr, nullptr, "unused", 1); + auto executingStreamIdGuard = + std::make_unique(servable->getInferRequestsQueue(), unused); + ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest(); + + inferRequest.set_tensor("input_ids", inputIdsTensor); + inferRequest.set_tensor("103", refS); + inferRequest.set_tensor("speed", speed); + inferRequest.start_async(); + inferRequest.wait(); + + // Collect audio output + auto out = inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]); + RET_CHECK(out.get_shape().size() == 1); + RET_CHECK(out.get_element_type() == ov::element::f32); + const size_t samples = out.get_shape()[0]; + const float* data = out.data(); + + SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", + samples, static_cast(samples) / 24000.0f); + + void* wavDataPtr = nullptr; + size_t wavSize = 0; + prepareAudioOutputKokoro(&wavDataPtr, wavSize, samples, data); + + auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); + drwav_free(wavDataPtr, NULL); + + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + } catch (const std::exception& e) { + SPDLOG_ERROR("KokoroCalculator [Node: {}] Process failed: {}", cc->NodeName(), e.what()); + 
return absl::InvalidArgumentError(e.what()); + } catch (...) { + SPDLOG_ERROR("KokoroCalculator [Node: {}] Process failed: unknown error", cc->NodeName()); + return absl::InvalidArgumentError("Kokoro processing failed"); } - - // Text -> IPA phonemization - std::string phonemes; - espeakPhonemizeAll(text, phonemes, /*noStress=*/false); - SPDLOG_DEBUG("Input text: '{}', IPA phonemes ({} chars): '{}'", text, phonemes.size(), phonemes); - - // Preserve trailing punctuation from original text (eSpeak strips it) - // if (!text.empty()) { - // char last = text.back(); - // if (last == '.' || last == '!' || last == '?' || last == ';' || last == ':' || last == ',') { - // phonemes.push_back(last); - // } - // } - SPDLOG_DEBUG("After E2M mapping ({} chars): '{}'", phonemes.size(), phonemes); - // IPA -> Kokoro token IDs - const auto& vocabIx = servable->getVocabIndex(); - std::vector> inputTokens(1); - tokenize(phonemes, inputTokens[0], vocabIx); - - // Wrap with PAD token (id=0) at both ends — matches official - // forward_with_tokens: input_ids = [[0, *tokens, 0]] - inputTokens[0].insert(inputTokens[0].begin(), 0); - inputTokens[0].push_back(0); - - // Voice embedding — select slice from voice pack based on content token count - auto& ids = inputTokens[0]; - size_t numContentTokens = ids.size() >= 2 ? 
ids.size() - 2 : 0; // exclude BOS pad + EOS - const float* voiceSlice = servable->getVoiceSlice(voiceName, numContentTokens); - RET_CHECK(voiceSlice != nullptr) << "No voice pack loaded (place .bin files in /voices/)"; - - auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, ids.size()}}; - auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, KokoroServable::STYLE_DIM}}; - auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; - - *reinterpret_cast(speed.data()) = 1.0f; - std::copy(ids.data(), ids.data() + ids.size(), - reinterpret_cast(inputIdsTensor.data())); - std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, - reinterpret_cast(refS.data())); - - // Inference - ModelMetricReporter unused(nullptr, nullptr, "unused", 1); - auto executingStreamIdGuard = - std::make_unique(servable->getInferRequestsQueue(), unused); - ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest(); - - inferRequest.set_tensor("input_ids", inputIdsTensor); - inferRequest.set_tensor("103", refS); - inferRequest.set_tensor("speed", speed); - inferRequest.start_async(); - inferRequest.wait(); - - // Collect audio output - auto out = inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]); - RET_CHECK(out.get_shape().size() == 1); - RET_CHECK(out.get_element_type() == ov::element::f32); - const size_t samples = out.get_shape()[0]; - const float* data = out.data(); - - SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", - samples, static_cast(samples) / 24000.0f); - - void* wavDataPtr = nullptr; - size_t wavSize = 0; - prepareAudioOutputKokoro(&wavDataPtr, wavSize, samples, data); - - auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); - drwav_free(wavDataPtr, NULL); - - cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process end", cc->NodeName()); return absl::OkStatus(); } diff --git 
a/src/audio/kokoro/kokoro_calculator.proto b/src/audio/kokoro/kokoro_calculator.proto index d9fc1b4bd9..8ec0f43341 100644 --- a/src/audio/kokoro/kokoro_calculator.proto +++ b/src/audio/kokoro/kokoro_calculator.proto @@ -30,4 +30,5 @@ message KokoroCalculatorOptions { required string models_path = 1; optional string target_device = 2; optional string plugin_config = 3; + optional string language = 4; // ISO 639-1 language code (en, es, fr, hi, it, ja, pt-br, zh) } diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index 73dff4b104..9a81f8f527 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -24,7 +24,6 @@ #include #include #include -#include #pragma warning(push) #pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) @@ -91,8 +90,17 @@ class EspeakInstance { espeakINITIALIZE_DONT_EXIT); if (sr <= 0) return false; + // Try to initialize with Kokoro's supported language voices + // Kokoro supports: en-us (American English), en (British English), es (Spanish), fr (French), hi (Hindi), it (Italian), ja (Japanese), pt (Brazilian Portuguese), cmn (Mandarin Chinese) if (espeak_SetVoiceByName("en-us") != EE_OK && - espeak_SetVoiceByName("en") != EE_OK) { + espeak_SetVoiceByName("en") != EE_OK && + espeak_SetVoiceByName("es") != EE_OK && + espeak_SetVoiceByName("fr") != EE_OK && + espeak_SetVoiceByName("hi") != EE_OK && + espeak_SetVoiceByName("it") != EE_OK && + espeak_SetVoiceByName("ja") != EE_OK && + espeak_SetVoiceByName("pt") != EE_OK && + espeak_SetVoiceByName("cmn") != EE_OK) { return false; } return true; @@ -168,9 +176,6 @@ struct KokoroServable { numberOfParallelInferRequests = 1u; } inferRequestsQueue = std::make_unique(compiledModel, numberOfParallelInferRequests); - - // Warm up model with dummy inference - //warmUpModel(); } OVInferRequestsQueue& getInferRequestsQueue() { @@ -287,38 +292,6 @@ struct KokoroServable { SPDLOG_INFO("Loaded {} voice pack(s), 
default: '{}'", voicePacks.size(), defaultVoiceName); } - - // void warmUpModel() { - // try { - // SPDLOG_INFO("Warming up Kokoro model with dummy inference..."); - - // // Create dummy tensors with minimal sequence length - // constexpr size_t dummySeqLen = 3; // [0, token, 0] pattern - // auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, dummySeqLen}}; - // auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, STYLE_DIM}}; - // auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; - - // // Fill with dummy values - // auto* idsData = reinterpret_cast(inputIdsTensor.data()); - // idsData[0] = 0; // PAD token - // idsData[1] = 1; // arbitrary token ID - // idsData[2] = 0; // PAD token - - // std::fill_n(reinterpret_cast(refS.data()), STYLE_DIM, 0.0f); - // *reinterpret_cast(speed.data()) = 1.0f; - - // // Get infer request and run warm-up inference - // ov::InferRequest inferRequest = compiledModel.create_infer_request(); - // inferRequest.set_tensor("input_ids", inputIdsTensor); - // inferRequest.set_tensor("103", refS); - // inferRequest.set_tensor("speed", speed); - // inferRequest.infer(); - - // SPDLOG_INFO("Kokoro model warm-up completed successfully"); - // } catch (const std::exception& ex) { - // SPDLOG_WARN("Kokoro model warm-up failed: {}. 
Continuing anyway...", ex.what()); - // } - // } }; using KokoroServableMap = std::unordered_map>; From a12dc60f2d3e465c5e907557cdd5e985bef53fc7 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 2 Mar 2026 13:06:23 +0100 Subject: [PATCH 10/11] change way espeak is built --- Dockerfile.redhat | 60 +++++++++++++++++++++++++++++++++--- Dockerfile.ubuntu | 61 +++++++++++++++++++++++++++++++++++-- third_party/espeak_ng/BUILD | 31 ++++++++----------- 3 files changed, 126 insertions(+), 26 deletions(-) diff --git a/Dockerfile.redhat b/Dockerfile.redhat index 41e02ecc12..da9885a92d 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -100,6 +100,37 @@ WORKDIR /ovms/third_party/opencv RUN if [ "$VERBOSE_LOGS" == "ON" ] ; then export VERBOSE=1 ; fi && ./install_opencv.sh ####### End of OpenCV +# Build espeak-ng from sources +FROM base_build as espeak_build + +ARG ESPEAK_NG_VERSION=1.51.1 +WORKDIR /tmp/espeak_build + +RUN dnf install -y libtool automake autoconf pkgconfig && \ + dnf clean all + +RUN cd /tmp/espeak_build && \ + git clone --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \ + ls -lah /tmp/espeak_build/ + +RUN cd /tmp/espeak_build/espeak-ng-src && \ + touch AUTHORS NEWS && \ + libtoolize --force --copy && \ + aclocal && \ + autoheader && \ + autoconf && \ + automake --add-missing --copy && \ + ./configure --prefix=/opt/espeak-ng \ + --disable-shared \ + --enable-static \ + --disable-mbrola \ + --disable-klatt \ + --without-audio && \ + make -j$(nproc) && \ + make install + +RUN rm -rf /tmp/espeak_build + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # FROM base_build as build ARG BASE_IMAGE @@ -127,11 +158,15 @@ RUN dnf install -y -d6 \ python3.12 \ python3.12-devel \ python3.12-pip \ - libicu-devel \ - espeak-ng \ - espeak-ng-devel && \ + libicu-devel && \ dnf clean all +# Copy espeak-ng built from sources +COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng +ENV 
PATH="/opt/espeak-ng/bin:${PATH}" +ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" +ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" + WORKDIR / ARG INSTALL_DRIVER_VERSION="24.52.32224" @@ -258,6 +293,17 @@ RUN ln -s /usr/lib64 /usr/lib/x86_64-linux-gnu COPY external /ovms/external/ COPY third_party /ovms/third_party +# Provide espeak-ng headers and static library inside workspace for Bazel +RUN mkdir -p /ovms/third_party/espeak_ng/include /ovms/third_party/espeak_ng/lib && \ + cp -r /opt/espeak-ng/include/* /ovms/third_party/espeak_ng/include/ && \ + mkdir -p /ovms/third_party/espeak_ng/include/espeak-ng && \ + if [ -d /opt/espeak-ng/include/espeak-ng ]; then \ + cp -r /opt/espeak-ng/include/espeak-ng/* /ovms/third_party/espeak_ng/include/espeak-ng/; \ + else \ + cp -v /opt/espeak-ng/include/*.h /ovms/third_party/espeak_ng/include/espeak-ng/; \ + fi && \ + cp -v /opt/espeak-ng/lib/libespeak-ng.a /ovms/third_party/espeak_ng/lib/ + # This path is required for namespace to setup Python dependencies for testing the binding COPY src/BUILD /ovms/src/BUILD COPY src/python/binding/BUILD /ovms/src/python/binding/BUILD @@ -406,6 +452,13 @@ LABEL base-image=${RELEASE_BASE_IMAGE} ENV PYTHONPATH=/ovms/lib/python:/ovms/python_deps WORKDIR / + +# Copy espeak-ng built from sources +COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng +ENV PATH="/opt/espeak-ng/bin:${PATH}" +ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" +ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" + SHELL ["/bin/bash", "-o", "pipefail", "-c"] COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh # hadolint ignore=DL3003,DL3041,SC2164,SC1091 @@ -418,7 +471,6 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do if ! 
[[ $debug_bazel_flags == *"py_off"* ]]; then \ $DNF_TOOL install -y python3.12-libs --setopt=install_weak_deps=0 --nodocs; \ fi ; \ - $DNF_TOOL install -y espeak-ng --setopt=install_weak_deps=0 --nodocs; \ $DNF_TOOL install -y shadow-utils; \ $DNF_TOOL clean all ; \ cp -v /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt ; \ diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index d7d2ace9f8..021d77f4ff 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -87,6 +87,38 @@ WORKDIR /ovms/third_party/opencv RUN ./install_opencv.sh ####### End of OpenCV +# Build espeak-ng from sources +FROM base_build as espeak_build + +ARG ESPEAK_NG_VERSION=1.51.1 +WORKDIR /tmp/espeak_build + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libtool automake autoconf pkg-config && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN cd /tmp/espeak_build && \ + git clone --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \ + ls -lah /tmp/espeak_build/ + +RUN cd /tmp/espeak_build/espeak-ng-src && \ + touch AUTHORS NEWS && \ + libtoolize --force --copy && \ + aclocal && \ + autoheader && \ + autoconf && \ + automake --add-missing --copy && \ + ./configure --prefix=/opt/espeak-ng \ + --disable-shared \ + --enable-static \ + --disable-mbrola \ + --disable-klatt \ + --without-audio && \ + make -j$(nproc) && \ + make install + +RUN rm -rf /tmp/espeak_build + ################### BASE BUILD ########################## FROM base_build as build ARG BASE_IMAGE @@ -99,9 +131,14 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi ENV PIP_BREAK_SYSTEM_PACKAGES=1 + +# Copy espeak-ng built from sources +COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng +ENV PATH="/opt/espeak-ng/bin:${PATH}" +ENV 
LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" +ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" + RUN apt-get update && apt-get install --no-install-recommends -y \ - espeak-ng \ - libespeak-ng-dev \ libgflags-dev \ bc \ ca-certificates \ @@ -265,6 +302,17 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/openvino/runtime/lib/intel64/: COPY --from=base_build /opt/opencv /opt/opencv/ COPY third_party /ovms/third_party/ +# Provide espeak-ng headers and static library inside workspace for Bazel +RUN mkdir -p /ovms/third_party/espeak_ng/include /ovms/third_party/espeak_ng/lib && \ + cp -r /opt/espeak-ng/include/* /ovms/third_party/espeak_ng/include/ && \ + mkdir -p /ovms/third_party/espeak_ng/include/espeak-ng && \ + if [ -d /opt/espeak-ng/include/espeak-ng ]; then \ + cp -r /opt/espeak-ng/include/espeak-ng/* /ovms/third_party/espeak_ng/include/espeak-ng/; \ + else \ + cp -v /opt/espeak-ng/include/*.h /ovms/third_party/espeak_ng/include/espeak-ng/; \ + fi && \ + cp -v /opt/espeak-ng/lib/libespeak-ng.a /ovms/third_party/espeak_ng/lib/ + # Mediapipe COPY BUILD.bazel /ovms/ COPY *\.bzl /ovms/ @@ -413,12 +461,19 @@ SHELL ["/bin/bash", "-c"] WORKDIR / COPY release_files/drivers /drivers + +# Copy espeak-ng built from sources +COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng +ENV PATH="/opt/espeak-ng/bin:${PATH}" +ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" +ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" + SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG INSTALL_DRIVER_VERSION="24.39.31294" COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh # hadolint ignore=DL3003,SC2164 RUN apt-get update ; \ - apt-get install -y --no-install-recommends curl ca-certificates libxml2 espeak-ng espeak-ng-data || exit 1; \ + apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \ if [ "$GPU" == "1" ] ; then \ /tmp/install_gpu_drivers.sh ; \ fi ; \ diff --git a/third_party/espeak_ng/BUILD 
b/third_party/espeak_ng/BUILD index 2c0a1cb09a..71f736f09c 100644 --- a/third_party/espeak_ng/BUILD +++ b/third_party/espeak_ng/BUILD @@ -8,24 +8,17 @@ config_setting( cc_library( name = "espeak_ng", - copts = select({ - ":is_macos": [ - # Adjust to where Homebrew (or your installer) puts headers - "-I" + "$(HOME)/.brew/opt/espeak-ng/include", - "-I" + "$(HOME)/.brew/opt/espeak-ng/include/espeak-ng", - ], - "//conditions:default": [ - # Typical on Debian/Ubuntu when installing libespeak-ng-dev - "-I/usr/include", - "-I/usr/include/espeak-ng", - ], - }), - linkopts = select({ - ":is_macos": [ - "-L" + "$(HOME)/.brew/opt/espeak-ng/lib", - "-lespeak-ng", - ], - "//conditions:default": ["-lespeak-ng"], - }), + hdrs = glob(["include/**/*.h"]), + includes = [ + "include", + "include/espeak-ng", + ], + deps = [":espeak_ng_lib"], visibility = ["//visibility:public"], ) + +cc_import( + name = "espeak_ng_lib", + static_library = "lib/libespeak-ng.a", + visibility = ["//visibility:private"], +) From d62263e5b4832b0a3d6aff271248ca74765e773a Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 2 Mar 2026 17:17:56 +0100 Subject: [PATCH 11/11] Build espeak from source --- Dockerfile.redhat | 27 +++------------------------ Dockerfile.ubuntu | 29 +++-------------------------- third_party/espeak_ng/BUILD | 16 ++++++---------- 3 files changed, 12 insertions(+), 60 deletions(-) diff --git a/Dockerfile.redhat b/Dockerfile.redhat index da9885a92d..3686d98493 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -120,7 +120,7 @@ RUN cd /tmp/espeak_build/espeak-ng-src && \ autoheader && \ autoconf && \ automake --add-missing --copy && \ - ./configure --prefix=/opt/espeak-ng \ + ./configure --prefix=/usr/local \ --disable-shared \ --enable-static \ --disable-mbrola \ @@ -161,12 +161,6 @@ RUN dnf install -y -d6 \ libicu-devel && \ dnf clean all -# Copy espeak-ng built from sources -COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng -ENV PATH="/opt/espeak-ng/bin:${PATH}" -ENV 
LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" -ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" - WORKDIR / ARG INSTALL_DRIVER_VERSION="24.52.32224" @@ -293,17 +287,6 @@ RUN ln -s /usr/lib64 /usr/lib/x86_64-linux-gnu COPY external /ovms/external/ COPY third_party /ovms/third_party -# Provide espeak-ng headers and static library inside workspace for Bazel -RUN mkdir -p /ovms/third_party/espeak_ng/include /ovms/third_party/espeak_ng/lib && \ - cp -r /opt/espeak-ng/include/* /ovms/third_party/espeak_ng/include/ && \ - mkdir -p /ovms/third_party/espeak_ng/include/espeak-ng && \ - if [ -d /opt/espeak-ng/include/espeak-ng ]; then \ - cp -r /opt/espeak-ng/include/espeak-ng/* /ovms/third_party/espeak_ng/include/espeak-ng/; \ - else \ - cp -v /opt/espeak-ng/include/*.h /ovms/third_party/espeak_ng/include/espeak-ng/; \ - fi && \ - cp -v /opt/espeak-ng/lib/libespeak-ng.a /ovms/third_party/espeak_ng/lib/ - # This path is required for namespace to setup Python dependencies for testing the binding COPY src/BUILD /ovms/src/BUILD COPY src/python/binding/BUILD /ovms/src/python/binding/BUILD @@ -452,12 +435,8 @@ LABEL base-image=${RELEASE_BASE_IMAGE} ENV PYTHONPATH=/ovms/lib/python:/ovms/python_deps WORKDIR / - -# Copy espeak-ng built from sources -COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng -ENV PATH="/opt/espeak-ng/bin:${PATH}" -ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" -ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" +COPY --from=base_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data +ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data SHELL ["/bin/bash", "-o", "pipefail", "-c"] COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index 021d77f4ff..33f6cbc4e0 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -88,7 +88,6 @@ RUN ./install_opencv.sh ####### End of OpenCV # Build espeak-ng from sources -FROM base_build as espeak_build ARG 
ESPEAK_NG_VERSION=1.51.1 WORKDIR /tmp/espeak_build @@ -108,7 +107,7 @@ RUN cd /tmp/espeak_build/espeak-ng-src && \ autoheader && \ autoconf && \ automake --add-missing --copy && \ - ./configure --prefix=/opt/espeak-ng \ + ./configure --prefix=/usr/local \ --disable-shared \ --enable-static \ --disable-mbrola \ @@ -132,12 +131,6 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi ENV PIP_BREAK_SYSTEM_PACKAGES=1 -# Copy espeak-ng built from sources -COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng -ENV PATH="/opt/espeak-ng/bin:${PATH}" -ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" -ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" - RUN apt-get update && apt-get install --no-install-recommends -y \ libgflags-dev \ bc \ @@ -301,18 +294,6 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/openvino/runtime/lib/intel64/: # FROM BASE BUILD COPY --from=base_build /opt/opencv /opt/opencv/ COPY third_party /ovms/third_party/ - -# Provide espeak-ng headers and static library inside workspace for Bazel -RUN mkdir -p /ovms/third_party/espeak_ng/include /ovms/third_party/espeak_ng/lib && \ - cp -r /opt/espeak-ng/include/* /ovms/third_party/espeak_ng/include/ && \ - mkdir -p /ovms/third_party/espeak_ng/include/espeak-ng && \ - if [ -d /opt/espeak-ng/include/espeak-ng ]; then \ - cp -r /opt/espeak-ng/include/espeak-ng/* /ovms/third_party/espeak_ng/include/espeak-ng/; \ - else \ - cp -v /opt/espeak-ng/include/*.h /ovms/third_party/espeak_ng/include/espeak-ng/; \ - fi && \ - cp -v /opt/espeak-ng/lib/libespeak-ng.a /ovms/third_party/espeak_ng/lib/ - # Mediapipe COPY BUILD.bazel /ovms/ COPY *\.bzl /ovms/ @@ -461,12 +442,8 @@ SHELL ["/bin/bash", "-c"] WORKDIR / COPY release_files/drivers /drivers - -# Copy espeak-ng built from sources -COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng -ENV PATH="/opt/espeak-ng/bin:${PATH}" -ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" 
-ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" +COPY --from=base_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data +ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG INSTALL_DRIVER_VERSION="24.39.31294" diff --git a/third_party/espeak_ng/BUILD b/third_party/espeak_ng/BUILD index 71f736f09c..31f51b73da 100644 --- a/third_party/espeak_ng/BUILD +++ b/third_party/espeak_ng/BUILD @@ -8,17 +8,13 @@ config_setting( cc_library( name = "espeak_ng", - hdrs = glob(["include/**/*.h"]), + linkopts = [ + "-L/usr/local/lib", + "-lespeak-ng", + ], includes = [ - "include", - "include/espeak-ng", + "/usr/local/include", + "/usr/local/include/espeak-ng", ], - deps = [":espeak_ng_lib"], visibility = ["//visibility:public"], ) - -cc_import( - name = "espeak_ng_lib", - static_library = "lib/libespeak-ng.a", - visibility = ["//visibility:private"], -)