From 7047b8fb2f9ce858a6bfdc8746a5b7505d6476e8 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 17 Dec 2025 17:24:27 +0100 Subject: [PATCH 01/11] Initiali suppor of Kokoro model --- Dockerfile.redhat | 5 +- Dockerfile.ubuntu | 4 +- src/BUILD | 1 + src/audio/audio_utils.cpp | 31 ++ src/audio/audio_utils.hpp | 2 + src/audio/kokoro/BUILD | 60 ++++ src/audio/kokoro/kokoro_calculator.cc | 320 ++++++++++++++++++ src/audio/kokoro/kokoro_calculator.proto | 33 ++ src/audio/kokoro/kokoro_servable.hpp | 197 +++++++++++ src/logging.cpp | 4 + src/logging.hpp | 1 + .../mediapipegraphdefinition.cpp | 23 ++ .../mediapipegraphdefinition.hpp | 8 +- .../mediapipegraphexecutor.cpp | 4 +- .../mediapipegraphexecutor.hpp | 3 + 15 files changed, 692 insertions(+), 4 deletions(-) create mode 100644 src/audio/kokoro/BUILD create mode 100644 src/audio/kokoro/kokoro_calculator.cc create mode 100644 src/audio/kokoro/kokoro_calculator.proto create mode 100644 src/audio/kokoro/kokoro_servable.hpp diff --git a/Dockerfile.redhat b/Dockerfile.redhat index bc574eaaf2..41e02ecc12 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -127,7 +127,9 @@ RUN dnf install -y -d6 \ python3.12 \ python3.12-devel \ python3.12-pip \ - libicu-devel && \ + libicu-devel \ + espeak-ng \ + espeak-ng-devel && \ dnf clean all WORKDIR / @@ -416,6 +418,7 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do if ! 
[[ $debug_bazel_flags == *"py_off"* ]]; then \ $DNF_TOOL install -y python3.12-libs --setopt=install_weak_deps=0 --nodocs; \ fi ; \ + $DNF_TOOL install -y espeak-ng --setopt=install_weak_deps=0 --nodocs; \ $DNF_TOOL install -y shadow-utils; \ $DNF_TOOL clean all ; \ cp -v /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt ; \ diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index 15e47daf20..d80087c646 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -100,6 +100,8 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN apt-get update && apt-get install --no-install-recommends -y \ + espeak-ng \ + libespeak-ng-dev \ libgflags-dev \ bc \ ca-certificates \ @@ -413,7 +415,7 @@ ARG INSTALL_DRIVER_VERSION="24.39.31294" COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh # hadolint ignore=DL3003,SC2164 RUN apt-get update ; \ - apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \ + apt-get install -y --no-install-recommends curl ca-certificates libxml2 espeak-ng espeak-ng-data || exit 1; \ if [ "$GPU" == "1" ] ; then \ /tmp/install_gpu_drivers.sh ; \ fi ; \ diff --git a/src/BUILD b/src/BUILD index 71321ca7ee..0318099727 100644 --- a/src/BUILD +++ b/src/BUILD @@ -563,6 +563,7 @@ ovms_cc_library( "//src/image_gen:image_gen_calculator", "//src/audio/speech_to_text:s2t_calculator", "//src/audio/text_to_speech:t2s_calculator", + "//src/audio/kokoro:kokoro_calculator", "//src/audio:audio_utils", "//src/image_gen:imagegen_init", "//src/llm:openai_completions_api_handler", diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 77b38e70df..01daafb351 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -188,3 +188,34 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; 
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); } + + +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr) { + enum : unsigned int { + OUTPUT_PREPARATION, + TIMER_END + }; + Timer timer; + timer.start(OUTPUT_PREPARATION); + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + format.channels = 1; + format.sampleRate = 24000; // assume it is always 24 KHz + format.bitsPerSample = bitsPerSample; + drwav wav; + size_t totalSamples = speechSize * format.channels; + + auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr); + if (status == DRWAV_FALSE) { + throw std::runtime_error("Failed to write all frames"); + } + drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, totalSamples, waveformPtr); + if (framesWritten != totalSamples) { + throw std::runtime_error("Failed to write all frames"); + } + drwav_uninit(&wav); + timer.stop(OUTPUT_PREPARATION); + auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); +} \ No newline at end of file diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp index cbeea8b457..874e83dca4 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -25,3 +25,5 @@ bool isWavBuffer(const std::string buf); std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); + diff --git a/src/audio/kokoro/BUILD b/src/audio/kokoro/BUILD new file mode 100644 index 
0000000000..d7d3b64b1a --- /dev/null +++ b/src/audio/kokoro/BUILD @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") +load("//:common_settings.bzl", "ovms_cc_library") + +ovms_cc_library( + name = "kokoro_servable", + hdrs = ["kokoro_servable.hpp"], + deps= ["//third_party:openvino", + "//src:libovms_ovinferrequestsqueue", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "kokoro_calculator", + srcs = ["kokoro_calculator.cc"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//src:httppayload", + "//src:libovmslogging", + "kokoro_calculator_cc_proto", + "//src/port:dr_audio", + "//src/port:rapidjson_stringbuffer", + "//src/port:rapidjson_writer", + ":kokoro_servable", + "//third_party:genai", + "//src/audio:audio_utils", + "//src:executingstreamidguard", + "//src:model_metric_reporter", + "//third_party/espeak_ng:espeak_ng", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "kokoro_calculator_proto", + srcs = ["kokoro_calculator.proto"], + visibility = ["//visibility:private"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_options_proto", + "@mediapipe//mediapipe/framework:calculator_proto", + ], +) diff --git a/src/audio/kokoro/kokoro_calculator.cc 
b/src/audio/kokoro/kokoro_calculator.cc new file mode 100644 index 0000000000..986dd92fab --- /dev/null +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -0,0 +1,320 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#include +#include +#include +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/canonical_errors.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "src/audio/audio_utils.hpp" +#include "src/http_payload.hpp" +#include "src/logging.hpp" +#include "src/port/dr_audio.hpp" + +#include "../../model_metric_reporter.hpp" +#include "../../executingstreamidguard.hpp" + +#pragma warning(push) +#pragma warning(disable : 6001 4324 6385 6386) +#include "absl/strings/escaping.h" +#include "absl/strings/str_cat.h" +#pragma warning(pop) + +#include + +#include "kokoro_servable.hpp" + +#ifdef _WIN32 +#include +#include +#endif + +using namespace ovms; + +namespace { + +#ifndef espeakPHONEMES_IPA +#define espeakPHONEMES_IPA 0x02 +#endif +#ifndef espeakPHONEMES_NO_STRESS +#define 
espeakPHONEMES_NO_STRESS 0x08 +#endif + +void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool noStress = true) { + outIpa.clear(); + auto& espeak = ovms::EspeakInstance::instance(); + if (!espeak.isReady()) { + SPDLOG_ERROR("eSpeak not initialized"); + return; + } + + std::lock_guard guard(espeak.mutex()); + + const int mode = espeakPHONEMES_IPA | (noStress ? espeakPHONEMES_NO_STRESS : 0); + const void* pos = static_cast(textUtf8.c_str()); + const char* endPtr = static_cast(pos) + textUtf8.size(); + std::string rawIpa; + + while (pos && static_cast(pos) < endPtr) { + const char* ipaChunk = espeak_TextToPhonemes(&pos, espeakCHARS_UTF8, mode); + if (ipaChunk && *ipaChunk) { + if (!rawIpa.empty()) { + rawIpa.push_back(' '); + } + rawIpa.append(ipaChunk); + } + } + + // Strip combining diacriticals (U+0300..U+036F) and collapse spaces + std::string cleaned; + for (size_t i = 0; i < rawIpa.size(); ++i) { + unsigned char c = static_cast(rawIpa[i]); + if (i + 1 < rawIpa.size()) { + unsigned char next = static_cast(rawIpa[i + 1]); + if ((c == 0xCC && next >= 0x80) || (c == 0xCD && next <= 0xAF)) { + i++; + continue; + } + } + cleaned.push_back(c); + } + + bool lastSpace = false; + for (char c : cleaned) { + if (std::isspace(static_cast(c))) { + if (!lastSpace) { + outIpa.push_back(' '); + lastSpace = true; + } + } else { + outIpa.push_back(c); + lastSpace = false; + } + } + + if (!outIpa.empty() && std::isspace(static_cast(outIpa.back()))) { + outIpa.pop_back(); + } + + SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, outIpa.size()); +} + +size_t utf8CharLen(unsigned char lead) { + if (lead < 0x80) return 1; + if ((lead >> 5) == 0x6) return 2; + if ((lead >> 4) == 0xE) return 3; + if ((lead >> 3) == 0x1E) return 4; + return 1; +} + +void tokenize(const std::string& textUtf8, + std::vector& tokenIds, + const ovms::VocabIndex& ix) { + tokenIds.clear(); + size_t pos = 0; + const size_t n = textUtf8.size(); + + while (pos < n) { + size_t maxTry 
= std::min(ix.max_token_bytes, n - pos); + int foundId = -1; + size_t foundLen = 0; + + for (size_t len = maxTry; len > 0; --len) { + auto it = ix.by_token.find(std::string(textUtf8.data() + pos, len)); + if (it != ix.by_token.end()) { + foundId = it->second; + foundLen = len; + break; + } + } + + if (foundId >= 0) { + tokenIds.push_back(foundId); + pos += foundLen; + } else { + const unsigned char lead = static_cast(textUtf8[pos]); + const size_t adv = utf8CharLen(lead); + SPDLOG_WARN("Tokenizer: unknown bytes at pos {}: '{}'", + pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); + pos += std::min(adv, n - pos); + } + } + SPDLOG_DEBUG("Tokenize: produced {} ids", tokenIds.size()); +} +} // namespace + +namespace mediapipe { + +const std::string KOKORO_SESSION_SIDE_PACKET_TAG = "KOKORO_NODE_RESOURCES"; + +class KokoroCalculator : public CalculatorBase { + static const std::string INPUT_TAG_NAME; + static const std::string OUTPUT_TAG_NAME; + +public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + cc->Inputs().Tag(INPUT_TAG_NAME).Set(); + cc->InputSidePackets().Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Set(); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); + return absl::OkStatus(); + } + + absl::Status Close(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Close", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Open", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName()); + + KokoroServableMap servablesMap = cc->InputSidePackets() + .Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Get(); + auto servableIt = 
servablesMap.find(cc->NodeName()); + RET_CHECK(servableIt != servablesMap.end()) + << "Could not find initialized Kokoro node named: " << cc->NodeName(); + auto servable = servableIt->second; + + const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + auto it = payload.parsedJson->FindMember("input"); + RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request"; + RET_CHECK(it->value.IsString()) << "'input' must be a string"; + const std::string text = it->value.GetString(); + + // Text -> IPA phonemization + std::string phonemes; + espeakPhonemizeAll(text, phonemes, /*noStress=*/true); + SPDLOG_DEBUG("Input text: '{}', IPA phonemes ({} chars): '{}'", text, phonemes.size(), phonemes); + + // IPA -> Kokoro token IDs + const auto& vocabIx = servable->getVocabIndex(); + std::vector> inputTokens(1); + tokenize(phonemes, inputTokens[0], vocabIx); + + // Prepend PAD token (id=0) - Kokoro model requires BOS/PAD at start + inputTokens[0].insert(inputTokens[0].begin(), 0); + + // Append EOS (period token = 4) if not already present + if (inputTokens[0].empty() || inputTokens[0].back() != 4) { + inputTokens[0].push_back(4); + } + + // Voice embedding + std::vector voice = { + -0.2296, 0.1835, -0.0069, -0.1240, -0.2505, 0.0112, -0.0759, -0.1650, + -0.2665, -0.1965, 0.0242, -0.1667, 0.3524, 0.2140, 0.3069, -0.3377, + -0.0878, -0.0477, 0.0813, -0.2135, -0.2340, -0.1971, 0.0200, 0.0145, + 0.0016, 0.2596, -0.2665, 0.1434, 0.0503, 0.0867, 0.1905, -0.1281, + 0.0658, -0.0639, -0.0920, 0.2444, -0.1506, -0.2197, 0.1385, 0.2133, + -0.0755, -0.0188, -0.0142, 0.2301, -0.0776, -0.0748, 0.0172, 0.0430, + -0.1009, 0.1519, 0.1137, 0.0641, 0.2264, 0.1911, -0.0205, 0.2578, + 0.2210, -0.0784, -0.0235, -0.0547, 0.2191, -0.1623, -0.2416, 0.0076, + 0.0574, 0.2186, 0.0080, 0.0473, 0.0972, 0.0286, 0.1324, 0.0686, + 0.2652, -0.2237, -0.0980, -0.1693, -0.1866, 0.2273, 0.2008, -0.0683, + 0.0957, 0.0623, -0.1891, 0.1620, 0.1811, -0.0516, -0.0800, -0.1416, + -0.2374, 
-0.1892, 0.1726, -0.0690, -0.0300, 0.0467, -0.2811, -0.1603, + 0.0342, -0.1054, -0.0604, -0.0475, -0.0908, -0.1286, 0.1105, -0.1186, + 0.0582, 0.1887, 0.0345, 0.2081, 0.1404, -0.2532, 0.0026, 0.0402, + 0.0812, -0.0512, 0.0128, 0.0084, -0.0970, -0.0362, 0.0036, -0.0720, + -0.0850, 0.0221, -0.1037, 0.0569, 0.0187, -0.0649, -0.0288, -0.1795, + 0.0045, 0.2535, 0.6751, 0.1578, -0.0966, 0.1516, 0.2109, 0.2033, + -0.2155, -0.1783, 0.0836, -0.1050, 0.0676, -0.0237, 0.0387, -0.2564, + 0.1891, 0.1305, -0.3239, -0.1312, 0.2723, 0.0745, 0.1335, 0.0302, + 0.0172, 0.2207, 0.0215, -0.0379, -0.1954, 0.4944, 0.2905, -0.0306, + 0.2858, 0.2341, 0.0545, 0.4626, 0.2947, 0.3802, 0.2820, 0.1557, + 0.1743, -0.1410, 0.0986, 0.4751, -0.2146, 0.3530, -0.2357, -0.5626, + -0.0617, 0.2190, 0.0992, -0.2365, 0.3726, 0.2092, 0.1660, 0.1928, + 0.5731, -0.1734, -0.0816, -0.3191, -0.1871, -0.2217, -0.0112, 0.1261, + 0.1601, 0.3835, 0.0451, -0.1927, -0.1116, 0.2204, -0.0379, -0.0094, + -0.0455, -0.4831, -0.3345, -0.2119, 0.4803, 0.1214, 0.1723, 0.2605, + 0.0051, -0.2587, 0.0511, -0.1318, 0.0227, -0.0645, 0.2573, -0.0205, + 0.0665, -0.3562, -0.6070, 0.4191, 0.0351, 0.2033, -0.5508, -0.1415, + -0.1249, -0.0986, -0.1120, -0.1187, 0.0600, 0.1974, 0.5017, -0.0247, + -0.2986, 0.3983, -0.1159, -0.4275, -0.0164, -0.3783, 0.0717, 0.1478, + -0.1144, 0.2292, 0.2741, 0.4309, -0.1611, 0.0755, -0.0981, 0.4584, + -0.2061, -0.0787, -0.1779, 0.2275, -0.1742, -0.2230, -0.1739, 0.0646 + }; + + auto& ids = inputTokens[0]; + + auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, ids.size()}}; + auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, voice.size()}}; + auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; + + *reinterpret_cast(speed.data()) = 0.5f; + std::copy(ids.data(), ids.data() + ids.size(), + reinterpret_cast(inputIdsTensor.data())); + std::copy(voice.data(), voice.data() + voice.size(), + reinterpret_cast(refS.data())); + + // Inference + ModelMetricReporter unused(nullptr, nullptr, 
"unused", 1); + auto executingStreamIdGuard = + std::make_unique(servable->getInferRequestsQueue(), unused); + ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest(); + + inferRequest.set_tensor("input_ids", inputIdsTensor); + inferRequest.set_tensor("103", refS); + inferRequest.set_tensor("speed", speed); + inferRequest.start_async(); + inferRequest.wait(); + + // Collect audio output + auto out = inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]); + RET_CHECK(out.get_shape().size() == 1); + RET_CHECK(out.get_element_type() == ov::element::f32); + const size_t samples = out.get_shape()[0]; + const float* data = out.data(); + + SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", + samples, static_cast(samples) / 24000.0f); + + void* wavDataPtr = nullptr; + size_t wavSize = 0; + prepareAudioOutputKokoro(&wavDataPtr, wavSize, 32, samples, data); + + auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); + drwav_free(wavDataPtr, NULL); + + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process end", cc->NodeName()); + return absl::OkStatus(); + } +}; + +const std::string KokoroCalculator::INPUT_TAG_NAME{"HTTP_REQUEST_PAYLOAD"}; +const std::string KokoroCalculator::OUTPUT_TAG_NAME{"HTTP_RESPONSE_PAYLOAD"}; + +REGISTER_CALCULATOR(KokoroCalculator); + +} // namespace mediapipe diff --git a/src/audio/kokoro/kokoro_calculator.proto b/src/audio/kokoro/kokoro_calculator.proto new file mode 100644 index 0000000000..d9fc1b4bd9 --- /dev/null +++ b/src/audio/kokoro/kokoro_calculator.proto @@ -0,0 +1,33 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +syntax = "proto2"; +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + + +message KokoroCalculatorOptions { + extend mediapipe.CalculatorOptions { + // https://github.com/google/mediapipe/issues/634 have to be unique in app + // no rule to obtain this + optional KokoroCalculatorOptions ext = 116423799; + } + + required string models_path = 1; + optional string target_device = 2; + optional string plugin_config = 3; +} diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp new file mode 100644 index 0000000000..3e42bd0db4 --- /dev/null +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -0,0 +1,197 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_graph.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "openvino/runtime/core.hpp" +#include "../../ovinferrequestsqueue.hpp" + +#include +#include + +#include "src/audio/kokoro/kokoro_calculator.pb.h" +#include "src/logging.hpp" + +namespace ovms { + +struct VocabIndex { + std::unordered_map by_token; + size_t max_token_bytes = 1; +}; + +class EspeakInstance { +public: + static EspeakInstance& instance() { + static EspeakInstance inst; + return inst; + } + + bool isReady() const { return ready_; } + std::mutex& mutex() { return mutex_; } + +private: + EspeakInstance() { + ready_ = tryInit(); + if (!ready_) { + SPDLOG_ERROR("eSpeak-NG initialization failed (data path or voice not found)"); + } else { + SPDLOG_INFO("eSpeak-NG initialized successfully"); + } + } + + ~EspeakInstance() { + if (ready_) { + espeak_Terminate(); + } + } + + EspeakInstance(const EspeakInstance&) = delete; + EspeakInstance& operator=(const EspeakInstance&) = delete; + + bool tryInit() { + auto try_path = [](const char* path) -> bool { + int sr = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, + 0, path, + espeakINITIALIZE_DONT_EXIT); + if (sr <= 0) return false; + if (espeak_SetVoiceByName("en") != EE_OK && + espeak_SetVoiceByName("en-us") != EE_OK) { + return false; + } + return true; + }; + + if (try_path(nullptr)) return true; + + static const char* ngPaths[] = { + "/usr/share/espeak-ng-data", + "/opt/homebrew/share/espeak-ng-data", + "/usr/local/share/espeak-ng-data", + "espeak-ng-data", + nullptr + }; + for (int i = 0; ngPaths[i]; ++i) + if (try_path(ngPaths[i])) return true; + + 
static const char* esPaths[] = { + "/usr/share/espeak-data", + "/usr/local/share/espeak-data", + "espeak-data", + nullptr + }; + for (int i = 0; esPaths[i]; ++i) + if (try_path(esPaths[i])) return true; + + return false; + } + + bool ready_ = false; + std::mutex mutex_; +}; + +struct KokoroServable { + std::filesystem::path parsedModelsPath; + std::shared_ptr model; + ov::CompiledModel compiledModel; + std::unique_ptr inferRequestsQueue; + VocabIndex vocabIndex; + + KokoroServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) { + EspeakInstance::instance(); + + auto fsModelsPath = std::filesystem::path(modelDir); + if (fsModelsPath.is_relative()) { + parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); + } else { + parsedModelsPath = fsModelsPath; + } + + vocabIndex = loadVocabFromConfig(parsedModelsPath); + + ov::AnyMap properties; + ov::Core core; + auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties); + compiledModel = core.compile_model(m_model, targetDevice, properties); + inferRequestsQueue = std::make_unique(compiledModel, 5); + } + + OVInferRequestsQueue& getInferRequestsQueue() { + return *inferRequestsQueue; + } + + const VocabIndex& getVocabIndex() const { + return vocabIndex; + } + +private: + static VocabIndex loadVocabFromConfig(const std::filesystem::path& modelDir) { + VocabIndex ix; + auto configPath = modelDir / "config.json"; + std::ifstream ifs(configPath); + if (!ifs.is_open()) { + SPDLOG_ERROR("Failed to open Kokoro config: {}", configPath.string()); + return ix; + } + + std::stringstream buffer; + buffer << ifs.rdbuf(); + std::string jsonStr = buffer.str(); + + rapidjson::Document doc; + doc.Parse(jsonStr.c_str()); + if (doc.HasParseError()) { + SPDLOG_ERROR("Failed to parse Kokoro config JSON: {}", configPath.string()); + return ix; + } + + if (!doc.HasMember("vocab") || !doc["vocab"].IsObject()) { + SPDLOG_ERROR("Kokoro 
config missing 'vocab' object: {}", configPath.string()); + return ix; + } + + const auto& vocab = doc["vocab"]; + ix.by_token.reserve(vocab.MemberCount()); + for (auto it = vocab.MemberBegin(); it != vocab.MemberEnd(); ++it) { + if (!it->name.IsString() || !it->value.IsInt()) continue; + std::string token = it->name.GetString(); + int id = it->value.GetInt(); + ix.by_token.emplace(token, id); + ix.max_token_bytes = std::max(ix.max_token_bytes, token.size()); + } + + SPDLOG_INFO("Loaded Kokoro vocabulary: {} tokens, max_token_bytes={}", + ix.by_token.size(), ix.max_token_bytes); + return ix; + } +}; + +using KokoroServableMap = std::unordered_map>; +} // namespace ovms diff --git a/src/logging.cpp b/src/logging.cpp index e89fce9a07..9d058d82dc 100644 --- a/src/logging.cpp +++ b/src/logging.cpp @@ -35,6 +35,7 @@ std::shared_ptr llm_executor_logger = std::make_shared llm_calculator_logger = std::make_shared("llm_calculator"); std::shared_ptr s2t_calculator_logger = std::make_shared("s2t_calculator"); std::shared_ptr t2s_calculator_logger = std::make_shared("t2s_calculator"); +std::shared_ptr kokoro_calculator_logger = std::make_shared("kokoro_calculator"); std::shared_ptr embeddings_calculator_logger = std::make_shared("embeddings_calculator"); std::shared_ptr rerank_calculator_logger = std::make_shared("rerank_calculator"); #endif @@ -78,6 +79,7 @@ static void register_loggers(const std::string& log_level, std::vectorset_pattern(default_pattern); s2t_calculator_logger->set_pattern(default_pattern); t2s_calculator_logger->set_pattern(default_pattern); + kokoro_calculator_logger->set_pattern(default_pattern); rerank_calculator_logger->set_pattern(default_pattern); embeddings_calculator_logger->set_pattern(default_pattern); #endif @@ -98,6 +100,7 @@ static void register_loggers(const std::string& log_level, std::vectorsinks().push_back(sink); s2t_calculator_logger->sinks().push_back(sink); t2s_calculator_logger->sinks().push_back(sink); + 
kokoro_calculator_logger->sinks().push_back(sink); rerank_calculator_logger->sinks().push_back(sink); embeddings_calculator_logger->sinks().push_back(sink); #endif @@ -119,6 +122,7 @@ static void register_loggers(const std::string& log_level, std::vector llm_executor_logger; extern std::shared_ptr llm_calculator_logger; extern std::shared_ptr s2t_calculator_logger; extern std::shared_ptr t2s_calculator_logger; +extern std::shared_ptr kokoro_calculator_logger; extern std::shared_ptr embeddings_calculator_logger; extern std::shared_ptr rerank_calculator_logger; #endif diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index 9047765e75..e1436b5891 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -63,6 +63,7 @@ const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalcula const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"}; const std::string MediapipeGraphDefinition::STT_NODE_CALCULATOR_NAME{"S2tCalculator"}; const std::string MediapipeGraphDefinition::TTS_NODE_CALCULATOR_NAME{"T2sCalculator"}; +const std::string MediapipeGraphDefinition::KOKORO_NODE_CALCULATOR_NAME{"KokoroCalculator"}; const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"}; const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"}; @@ -625,6 +626,28 @@ Status MediapipeGraphDefinition::initializeNodes() { return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; } } + if (endsWith(config.node(i).calculator(), KOKORO_NODE_CALCULATOR_NAME)) { + auto& kokoroServableMap = this->sidePacketMaps.kokoroServableMap; + ResourcesCleaningGuard kokoroServablesCleaningGuard(kokoroServableMap); + if (!config.node(i).node_options().size()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node missing options in graph: {}. 
", this->name); + return StatusCode::LLM_NODE_MISSING_OPTIONS; + } + if (config.node(i).name().empty()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name is missing in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_NAME; + } + std::string nodeName = config.node(i).name(); + if (kokoroServableMap.find(nodeName) != kokoroServableMap.end()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name: {} already used in graph: {}. ", nodeName, this->name); + return StatusCode::LLM_NODE_NAME_ALREADY_EXISTS; + } + mediapipe::KokoroCalculatorOptions nodeOptions; + config.node(i).node_options(0).UnpackTo(&nodeOptions); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath()); + kokoroServableMap.insert(std::pair>(nodeName, std::move(servable))); + kokoroServablesCleaningGuard.disableCleaning(); + } } return StatusCode::OK; } diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp index 14c9e0679f..1067ca7d42 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.hpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.hpp @@ -48,6 +48,7 @@ #include "../rerank/rerank_servable.hpp" #include "../audio/speech_to_text/s2t_servable.hpp" #include "../audio/text_to_speech/t2s_servable.hpp" +#include "../audio/kokoro/kokoro_servable.hpp" namespace ovms { class MediapipeGraphDefinitionUnloadGuard; @@ -66,6 +67,7 @@ using GenAiServableMap = std::unordered_map>; using SttServableMap = std::unordered_map>; using TtsServableMap = std::unordered_map>; +using KokoroServableMap = std::unordered_map>; using EmbeddingsServableMap = std::unordered_map>; using ImageGenerationPipelinesMap = std::unordered_map>; @@ -77,6 +79,7 @@ struct GraphSidePackets { RerankServableMap rerankServableMap; SttServableMap sttServableMap; TtsServableMap ttsServableMap; + KokoroServableMap kokoroServableMap; void clear() { 
pythonNodeResourcesMap.clear(); genAiServableMap.clear(); @@ -85,6 +88,7 @@ struct GraphSidePackets { rerankServableMap.clear(); sttServableMap.clear(); ttsServableMap.clear(); + kokoroServableMap.clear(); } bool empty() { return (pythonNodeResourcesMap.empty() && @@ -93,7 +97,8 @@ struct GraphSidePackets { embeddingsServableMap.empty() && rerankServableMap.empty() && sttServableMap.empty() && - ttsServableMap.empty()); + ttsServableMap.empty() && + kokoroServableMap.empty()); } }; @@ -136,6 +141,7 @@ class MediapipeGraphDefinition { static const std::string RERANK_NODE_CALCULATOR_NAME; static const std::string STT_NODE_CALCULATOR_NAME; static const std::string TTS_NODE_CALCULATOR_NAME; + static const std::string KOKORO_NODE_CALCULATOR_NAME; Status waitForLoaded(std::unique_ptr& unloadGuard, const uint32_t waitForLoadedTimeoutMicroseconds = WAIT_FOR_LOADED_DEFAULT_TIMEOUT_MICROSECONDS); // Pipelines are not versioned and any available definition has constant version equal 1. diff --git a/src/mediapipe_internal/mediapipegraphexecutor.cpp b/src/mediapipe_internal/mediapipegraphexecutor.cpp index 93b53fdf8e..b2016ac3aa 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.cpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.cpp @@ -49,6 +49,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( const RerankServableMap& rerankServableMap, const SttServableMap& sttServableMap, const TtsServableMap& ttsServableMap, + const KokoroServableMap& kokoroServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : name(name), @@ -58,7 +59,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( outputTypes(std::move(outputTypes)), inputNames(std::move(inputNames)), outputNames(std::move(outputNames)), - sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap}), + sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, 
embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap, kokoroServableMap}), pythonBackend(pythonBackend), currentStreamTimestamp(STARTING_TIMESTAMP), mediapipeServableMetricReporter(mediapipeServableMetricReporter) {} @@ -92,6 +93,7 @@ const std::string MediapipeGraphExecutor::EMBEDDINGS_SESSION_SIDE_PACKET_TAG = " const std::string MediapipeGraphExecutor::RERANK_SESSION_SIDE_PACKET_TAG = "rerank_servable"; const std::string MediapipeGraphExecutor::STT_SESSION_SIDE_PACKET_TAG = "s2t_servable"; const std::string MediapipeGraphExecutor::TTS_SESSION_SIDE_PACKET_TAG = "t2s_servable"; +const std::string MediapipeGraphExecutor::KOKORO_SESSION_SIDE_PACKET_TAG = "kokoro_servable"; const ::mediapipe::Timestamp MediapipeGraphExecutor::STARTING_TIMESTAMP = ::mediapipe::Timestamp(0); } // namespace ovms diff --git a/src/mediapipe_internal/mediapipegraphexecutor.hpp b/src/mediapipe_internal/mediapipegraphexecutor.hpp index c165469395..af2e8d08e6 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.hpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.hpp @@ -95,6 +95,7 @@ class MediapipeGraphExecutor { static const std::string RERANK_SESSION_SIDE_PACKET_TAG; static const std::string STT_SESSION_SIDE_PACKET_TAG; static const std::string TTS_SESSION_SIDE_PACKET_TAG; + static const std::string KOKORO_SESSION_SIDE_PACKET_TAG; static const ::mediapipe::Timestamp STARTING_TIMESTAMP; MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -107,6 +108,7 @@ class MediapipeGraphExecutor { const RerankServableMap& rerankServableMap, const SttServableMap& sttServableMap, const TtsServableMap& ttsServableMap, + const KokoroServableMap& kokoroServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter); MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -157,6 +159,7 @@ 
class MediapipeGraphExecutor { inputSidePackets[RERANK_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.rerankServableMap).At(STARTING_TIMESTAMP); inputSidePackets[STT_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.sttServableMap).At(STARTING_TIMESTAMP); inputSidePackets[TTS_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.ttsServableMap).At(STARTING_TIMESTAMP); + inputSidePackets[KOKORO_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.kokoroServableMap).At(STARTING_TIMESTAMP); MP_RETURN_ON_FAIL(graph.StartRun(inputSidePackets), std::string("start MediaPipe graph: ") + this->name, StatusCode::MEDIAPIPE_GRAPH_START_ERROR); From fe10782011604a17460c6b384e70663bc84025cd Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 20 Feb 2026 09:32:04 +0100 Subject: [PATCH 02/11] speed --- src/audio/kokoro/kokoro_calculator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc index 986dd92fab..3bfb6e0634 100644 --- a/src/audio/kokoro/kokoro_calculator.cc +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -271,7 +271,7 @@ class KokoroCalculator : public CalculatorBase { auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, voice.size()}}; auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; - *reinterpret_cast(speed.data()) = 0.5f; + *reinterpret_cast(speed.data()) = 0.8f; std::copy(ids.data(), ids.data() + ids.size(), reinterpret_cast(inputIdsTensor.data())); std::copy(voice.data(), voice.data() + voice.size(), From 983613da58aab63c8e68b47d0e3a03940c88c749 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 24 Feb 2026 13:35:49 +0100 Subject: [PATCH 03/11] add espeak --- third_party/espeak_ng/BUILD | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 third_party/espeak_ng/BUILD diff --git a/third_party/espeak_ng/BUILD b/third_party/espeak_ng/BUILD 
new file mode 100644 index 0000000000..2c0a1cb09a --- /dev/null +++ b/third_party/espeak_ng/BUILD @@ -0,0 +1,31 @@ + +# third_party/espeak_ng/BUILD + +config_setting( + name = "is_macos", + values = {"cpu": "darwin"}, +) + +cc_library( + name = "espeak_ng", + copts = select({ + ":is_macos": [ + # Adjust to where Homebrew (or your installer) puts headers + "-I" + "$(HOME)/.brew/opt/espeak-ng/include", + "-I" + "$(HOME)/.brew/opt/espeak-ng/include/espeak-ng", + ], + "//conditions:default": [ + # Typical on Debian/Ubuntu when installing libespeak-ng-dev + "-I/usr/include", + "-I/usr/include/espeak-ng", + ], + }), + linkopts = select({ + ":is_macos": [ + "-L" + "$(HOME)/.brew/opt/espeak-ng/lib", + "-lespeak-ng", + ], + "//conditions:default": ["-lespeak-ng"], + }), + visibility = ["//visibility:public"], +) From f07991158078009687f1e8a8a0701873e4cc29bb Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 24 Feb 2026 15:14:24 +0100 Subject: [PATCH 04/11] fixes --- src/audio/audio_utils.cpp | 15 +-- src/audio/audio_utils.hpp | 2 +- src/audio/kokoro/kokoro_calculator.cc | 139 +++++++++++++++++--------- src/audio/kokoro/kokoro_servable.hpp | 87 +++++++++++++++- 4 files changed, 185 insertions(+), 58 deletions(-) diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 01daafb351..7636d8afe7 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -22,6 +22,7 @@ #include "src/logging.hpp" #include #include +#include #include #include #pragma warning(push) @@ -190,28 +191,28 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample } -void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr) { +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr) { enum : unsigned int { OUTPUT_PREPARATION, TIMER_END }; Timer timer; timer.start(OUTPUT_PREPARATION); + drwav_data_format format; 
format.container = drwav_container_riff; format.format = DR_WAVE_FORMAT_IEEE_FLOAT; format.channels = 1; - format.sampleRate = 24000; // assume it is always 24 KHz - format.bitsPerSample = bitsPerSample; + format.sampleRate = 24000; // Kokoro native sample rate + format.bitsPerSample = 32; drwav wav; - size_t totalSamples = speechSize * format.channels; auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr); if (status == DRWAV_FALSE) { - throw std::runtime_error("Failed to write all frames"); + throw std::runtime_error("Failed to initialize WAV writer"); } - drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, totalSamples, waveformPtr); - if (framesWritten != totalSamples) { + drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, speechSize, waveformPtr); + if (framesWritten != speechSize) { throw std::runtime_error("Failed to write all frames"); } drwav_uninit(&wav); diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp index 874e83dca4..ca0ce00a7c 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -25,5 +25,5 @@ bool isWavBuffer(const std::string buf); std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); -void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr); diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc index 3bfb6e0634..f5b3cb011b 100644 --- a/src/audio/kokoro/kokoro_calculator.cc +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -123,6 +123,71 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, 
outIpa.size()); } +// Post-process eSpeak IPA into Kokoro/misaki phoneme alphabet. +// Mirrors misaki.espeak.EspeakFallback.E2M for American English. +// void espeakIpaToKokoro(std::string& ps) { +// // Helper: replace all occurrences of `from` with `to` in `s`. +// auto replaceAll = [](std::string& s, const std::string& from, const std::string& to) { +// if (from.empty()) return; +// size_t pos = 0; +// while ((pos = s.find(from, pos)) != std::string::npos) { +// s.replace(pos, from.size(), to); +// pos += to.size(); +// } +// }; + +// // --- Multi-char replacements (longest first) --- +// // Syllabic n with glottal stop +// replaceAll(ps, "\xca\x94\xcb\x8c\x6e\xcc\xa9", "\xca\x94\x6e"); // ʔˌn̩ → ʔn +// replaceAll(ps, "\xca\x94\x6e\xcc\xa9", "\xca\x94\x6e"); // ʔn̩ → ʔn +// // Syllabic mark before consonant → ᵊ + consonant +// // ə̩l → ᵊl (syllabic l) +// replaceAll(ps, "\xc9\x99\xcc\xa9\x6c", "\xe1\xb5\x8a\x6c"); // əl̩ → ᵊl (approximation) + +// // Diphthongs +// replaceAll(ps, "a\xc9\xaa", "I"); // aɪ → I +// replaceAll(ps, "a\xca\x8a", "W"); // aʊ → W +// replaceAll(ps, "e\xc9\xaa", "A"); // eɪ → A +// replaceAll(ps, "\xc9\x94\xc9\xaa", "Y"); // ɔɪ → Y +// replaceAll(ps, "o\xca\x8a", "O"); // oʊ → O (American) +// replaceAll(ps, "\xc9\x99\xca\x8a", "O"); // əʊ → O (British) + +// // Affricates +// replaceAll(ps, "d\xca\x92", "\xca\xa4"); // dʒ → ʤ +// replaceAll(ps, "t\xca\x83", "\xca\xa7"); // tʃ → ʧ + +// // Palatalization +// replaceAll(ps, "\xca\xb2\x6f", "jo"); // ʲo → jo +// replaceAll(ps, "\xca\xb2\xc9\x99", "j\xc9\x99"); // ʲə → jə +// replaceAll(ps, "\xca\xb2", ""); // ʲ → (delete) + +// // R-colored vowels and vowel length +// replaceAll(ps, "\xc9\x9c\xcb\x90\xc9\xb9", "\xc9\x9c\xc9\xb9"); // ɜːɹ → ɜɹ +// replaceAll(ps, "\xc9\x9c\xcb\x90", "\xc9\x9c\xc9\xb9"); // ɜː → ɜɹ +// replaceAll(ps, "\xc9\xaa\xc9\x99", "i\xc9\x99"); // ɪə → iə + +// // --- Single-char replacements --- +// replaceAll(ps, "\xc9\x9a", "\xc9\x99\xc9\xb9"); // ɚ → əɹ +// 
replaceAll(ps, "\xc9\x90", "\xc9\x99"); // ɐ → ə +// replaceAll(ps, "\xc9\xac", "l"); // ɬ → l +// replaceAll(ps, "\xc3\xa7", "k"); // ç → k +// replaceAll(ps, "x", "k"); // x → k +// replaceAll(ps, "r", "\xc9\xb9"); // r → ɹ +// replaceAll(ps, "\xcb\x90", ""); // ː → (strip length marks) +// replaceAll(ps, "\xcc\x83", ""); // ̃ → (strip nasal tilde) + +// // British vowel mappings (in case eSpeak uses 'en' voice) +// replaceAll(ps, "\xc9\x92", "\xc9\x94"); // ɒ → ɔ + +// // Remaining standalone vowels (must be AFTER diphthong replacements) +// replaceAll(ps, "o", "\xc9\x94"); // o → ɔ (for espeak < 1.52) +// replaceAll(ps, "e", "A"); // e → A + +// // Flap and glottal stop (misaki version != 2.0) +// replaceAll(ps, "\xc9\xbe", "T"); // ɾ → T +// replaceAll(ps, "\xca\x94", "t"); // ʔ → t +// } + size_t utf8CharLen(unsigned char lead) { if (lead < 0x80) return 1; if ((lead >> 5) == 0x6) return 2; @@ -211,70 +276,50 @@ class KokoroCalculator : public CalculatorBase { RET_CHECK(it->value.IsString()) << "'input' must be a string"; const std::string text = it->value.GetString(); + // Read optional "voice" parameter (OpenAI TTS API) + std::string voiceName; + auto voiceIt = payload.parsedJson->FindMember("voice"); + if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { + voiceName = voiceIt->value.GetString(); + } + // Text -> IPA phonemization std::string phonemes; - espeakPhonemizeAll(text, phonemes, /*noStress=*/true); + espeakPhonemizeAll(text, phonemes, /*noStress=*/false); SPDLOG_DEBUG("Input text: '{}', IPA phonemes ({} chars): '{}'", text, phonemes.size(), phonemes); + // Preserve trailing punctuation from original text (eSpeak strips it) + // if (!text.empty()) { + // char last = text.back(); + // if (last == '.' || last == '!' || last == '?' 
|| last == ';' || last == ':' || last == ',') { + // phonemes.push_back(last); + // } + // } + SPDLOG_DEBUG("After E2M mapping ({} chars): '{}'", phonemes.size(), phonemes); // IPA -> Kokoro token IDs const auto& vocabIx = servable->getVocabIndex(); std::vector> inputTokens(1); tokenize(phonemes, inputTokens[0], vocabIx); - // Prepend PAD token (id=0) - Kokoro model requires BOS/PAD at start + // Wrap with PAD token (id=0) at both ends — matches official + // forward_with_tokens: input_ids = [[0, *tokens, 0]] inputTokens[0].insert(inputTokens[0].begin(), 0); + inputTokens[0].push_back(0); - // Append EOS (period token = 4) if not already present - if (inputTokens[0].empty() || inputTokens[0].back() != 4) { - inputTokens[0].push_back(4); - } - - // Voice embedding - std::vector voice = { - -0.2296, 0.1835, -0.0069, -0.1240, -0.2505, 0.0112, -0.0759, -0.1650, - -0.2665, -0.1965, 0.0242, -0.1667, 0.3524, 0.2140, 0.3069, -0.3377, - -0.0878, -0.0477, 0.0813, -0.2135, -0.2340, -0.1971, 0.0200, 0.0145, - 0.0016, 0.2596, -0.2665, 0.1434, 0.0503, 0.0867, 0.1905, -0.1281, - 0.0658, -0.0639, -0.0920, 0.2444, -0.1506, -0.2197, 0.1385, 0.2133, - -0.0755, -0.0188, -0.0142, 0.2301, -0.0776, -0.0748, 0.0172, 0.0430, - -0.1009, 0.1519, 0.1137, 0.0641, 0.2264, 0.1911, -0.0205, 0.2578, - 0.2210, -0.0784, -0.0235, -0.0547, 0.2191, -0.1623, -0.2416, 0.0076, - 0.0574, 0.2186, 0.0080, 0.0473, 0.0972, 0.0286, 0.1324, 0.0686, - 0.2652, -0.2237, -0.0980, -0.1693, -0.1866, 0.2273, 0.2008, -0.0683, - 0.0957, 0.0623, -0.1891, 0.1620, 0.1811, -0.0516, -0.0800, -0.1416, - -0.2374, -0.1892, 0.1726, -0.0690, -0.0300, 0.0467, -0.2811, -0.1603, - 0.0342, -0.1054, -0.0604, -0.0475, -0.0908, -0.1286, 0.1105, -0.1186, - 0.0582, 0.1887, 0.0345, 0.2081, 0.1404, -0.2532, 0.0026, 0.0402, - 0.0812, -0.0512, 0.0128, 0.0084, -0.0970, -0.0362, 0.0036, -0.0720, - -0.0850, 0.0221, -0.1037, 0.0569, 0.0187, -0.0649, -0.0288, -0.1795, - 0.0045, 0.2535, 0.6751, 0.1578, -0.0966, 0.1516, 0.2109, 0.2033, - -0.2155, 
-0.1783, 0.0836, -0.1050, 0.0676, -0.0237, 0.0387, -0.2564, - 0.1891, 0.1305, -0.3239, -0.1312, 0.2723, 0.0745, 0.1335, 0.0302, - 0.0172, 0.2207, 0.0215, -0.0379, -0.1954, 0.4944, 0.2905, -0.0306, - 0.2858, 0.2341, 0.0545, 0.4626, 0.2947, 0.3802, 0.2820, 0.1557, - 0.1743, -0.1410, 0.0986, 0.4751, -0.2146, 0.3530, -0.2357, -0.5626, - -0.0617, 0.2190, 0.0992, -0.2365, 0.3726, 0.2092, 0.1660, 0.1928, - 0.5731, -0.1734, -0.0816, -0.3191, -0.1871, -0.2217, -0.0112, 0.1261, - 0.1601, 0.3835, 0.0451, -0.1927, -0.1116, 0.2204, -0.0379, -0.0094, - -0.0455, -0.4831, -0.3345, -0.2119, 0.4803, 0.1214, 0.1723, 0.2605, - 0.0051, -0.2587, 0.0511, -0.1318, 0.0227, -0.0645, 0.2573, -0.0205, - 0.0665, -0.3562, -0.6070, 0.4191, 0.0351, 0.2033, -0.5508, -0.1415, - -0.1249, -0.0986, -0.1120, -0.1187, 0.0600, 0.1974, 0.5017, -0.0247, - -0.2986, 0.3983, -0.1159, -0.4275, -0.0164, -0.3783, 0.0717, 0.1478, - -0.1144, 0.2292, 0.2741, 0.4309, -0.1611, 0.0755, -0.0981, 0.4584, - -0.2061, -0.0787, -0.1779, 0.2275, -0.1742, -0.2230, -0.1739, 0.0646 - }; - + // Voice embedding — select slice from voice pack based on content token count auto& ids = inputTokens[0]; + size_t numContentTokens = ids.size() >= 2 ? 
ids.size() - 2 : 0; // exclude BOS pad + EOS + const float* voiceSlice = servable->getVoiceSlice(voiceName, numContentTokens); + RET_CHECK(voiceSlice != nullptr) << "No voice pack loaded (place .bin files in /voices/)"; auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, ids.size()}}; - auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, voice.size()}}; + auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, KokoroServable::STYLE_DIM}}; auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; - *reinterpret_cast(speed.data()) = 0.8f; + *reinterpret_cast(speed.data()) = 1.0f; std::copy(ids.data(), ids.data() + ids.size(), reinterpret_cast(inputIdsTensor.data())); - std::copy(voice.data(), voice.data() + voice.size(), + std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, reinterpret_cast(refS.data())); // Inference @@ -301,7 +346,7 @@ class KokoroCalculator : public CalculatorBase { void* wavDataPtr = nullptr; size_t wavSize = 0; - prepareAudioOutputKokoro(&wavDataPtr, wavSize, 32, samples, data); + prepareAudioOutputKokoro(&wavDataPtr, wavSize, samples, data); auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); drwav_free(wavDataPtr, NULL); diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index 3e42bd0db4..c06f88cfac 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -15,6 +15,8 @@ //***************************************************************************** #pragma once +#include +#include #include #include #include @@ -47,6 +49,11 @@ struct VocabIndex { size_t max_token_bytes = 1; }; +struct VoicePack { + std::vector data; // flat [numEntries * STYLE_DIM] + size_t numEntries = 0; +}; + class EspeakInstance { public: static EspeakInstance& instance() { @@ -82,8 +89,8 @@ class EspeakInstance { 0, path, espeakINITIALIZE_DONT_EXIT); if (sr <= 0) return false; - if (espeak_SetVoiceByName("en") != EE_OK && - espeak_SetVoiceByName("en-us") != 
EE_OK) { + if (espeak_SetVoiceByName("en-us") != EE_OK && + espeak_SetVoiceByName("en") != EE_OK) { return false; } return true; @@ -118,11 +125,15 @@ class EspeakInstance { }; struct KokoroServable { + static constexpr size_t STYLE_DIM = 256; + std::filesystem::path parsedModelsPath; std::shared_ptr model; ov::CompiledModel compiledModel; std::unique_ptr inferRequestsQueue; VocabIndex vocabIndex; + std::unordered_map voicePacks; + std::string defaultVoiceName; KokoroServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) { EspeakInstance::instance(); @@ -135,8 +146,13 @@ struct KokoroServable { } vocabIndex = loadVocabFromConfig(parsedModelsPath); + loadVoicePacks(parsedModelsPath); - ov::AnyMap properties; + ov::AnyMap properties = { + // Use ACCURACY execution mode to avoid fast-math approximation errors + // that accumulate in the deep decoder network and cause energy fade. + ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY), + }; ov::Core core; auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties); compiledModel = core.compile_model(m_model, targetDevice, properties); @@ -151,6 +167,30 @@ struct KokoroServable { return vocabIndex; } + // Returns pointer to 256 floats for the given voice and token count. + // voiceName: requested voice (e.g. "af_alloy"). Falls back to default voice if not found. + // numContentTokens: number of token IDs excluding BOS/EOS padding. 
+ const float* getVoiceSlice(const std::string& voiceName, size_t numContentTokens) const { + auto it = voicePacks.find(voiceName); + if (it == voicePacks.end()) { + it = voicePacks.find(defaultVoiceName); + if (it == voicePacks.end()) { + return nullptr; + } + } + const auto& pack = it->second; + size_t idx = std::min(numContentTokens, pack.numEntries - 1); + return pack.data.data() + (idx * STYLE_DIM); + } + + bool hasVoice(const std::string& voiceName) const { + return voicePacks.count(voiceName) > 0; + } + + const std::string& getDefaultVoiceName() const { + return defaultVoiceName; + } + private: static VocabIndex loadVocabFromConfig(const std::filesystem::path& modelDir) { VocabIndex ix; @@ -191,6 +231,47 @@ struct KokoroServable { ix.by_token.size(), ix.max_token_bytes); return ix; } + + void loadVoicePacks(const std::filesystem::path& modelDir) { + auto voicesDir = modelDir / "voices"; + if (!std::filesystem::exists(voicesDir) || !std::filesystem::is_directory(voicesDir)) { + SPDLOG_WARN("No voices directory found at: {}", voicesDir.string()); + return; + } + + for (const auto& entry : std::filesystem::directory_iterator(voicesDir)) { + if (!entry.is_regular_file() || entry.path().extension() != ".bin") + continue; + + std::string name = entry.path().stem().string(); + auto fileSize = std::filesystem::file_size(entry.path()); + if (fileSize == 0 || fileSize % (STYLE_DIM * sizeof(float)) != 0) { + SPDLOG_ERROR("Voice file {} has invalid size {} (must be multiple of {})", + entry.path().string(), fileSize, STYLE_DIM * sizeof(float)); + continue; + } + + VoicePack pack; + pack.numEntries = fileSize / (STYLE_DIM * sizeof(float)); + pack.data.resize(pack.numEntries * STYLE_DIM); + + std::ifstream ifs(entry.path(), std::ios::binary); + if (!ifs.read(reinterpret_cast(pack.data.data()), fileSize)) { + SPDLOG_ERROR("Failed to read voice file: {}", entry.path().string()); + continue; + } + + SPDLOG_INFO("Loaded voice pack '{}': {} entries x {} dims from {}", + name, 
pack.numEntries, STYLE_DIM, entry.path().string()); + + if (defaultVoiceName.empty()) { + defaultVoiceName = name; + } + voicePacks.emplace(name, std::move(pack)); + } + + SPDLOG_INFO("Loaded {} voice pack(s), default: '{}'", voicePacks.size(), defaultVoiceName); + } }; using KokoroServableMap = std::unordered_map>; From f82bf7f0d52ea733f129f724d0bac78898611912 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 24 Feb 2026 15:31:20 +0100 Subject: [PATCH 05/11] fix --- Dockerfile.ubuntu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index d80087c646..d7d2ace9f8 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -394,6 +394,9 @@ RUN if [ -f /ovms_release/lib/libovms_shared.so ] ; then mv /ovms_release/lib/li # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # FROM $BASE_IMAGE as release +ARG http_proxy +ARG https_proxy +ARG no_proxy ARG INSTALL_RPMS_FROM_URL= ARG INSTALL_DRIVER_VERSION="24.26.30049" ARG GPU=0 From 59d1b3110ff87265aeaab7e6857a2c35b0577ed1 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 25 Feb 2026 13:22:59 +0100 Subject: [PATCH 06/11] style --- src/audio/audio_utils.cpp | 1 - src/audio/audio_utils.hpp | 1 - src/audio/kokoro/kokoro_calculator.cc | 27 +++++++++++++--------- src/audio/kokoro/kokoro_servable.hpp | 33 +++++++++++++++------------ 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 7636d8afe7..1707b45cd2 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -190,7 +190,6 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); } - void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr) { enum : unsigned int { OUTPUT_PREPARATION, diff --git a/src/audio/audio_utils.hpp 
b/src/audio/audio_utils.hpp index ca0ce00a7c..0928d03f3d 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -26,4 +26,3 @@ std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr); - diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc index f5b3cb011b..1747b18081 100644 --- a/src/audio/kokoro/kokoro_calculator.cc +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -189,16 +189,20 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n // } size_t utf8CharLen(unsigned char lead) { - if (lead < 0x80) return 1; - if ((lead >> 5) == 0x6) return 2; - if ((lead >> 4) == 0xE) return 3; - if ((lead >> 3) == 0x1E) return 4; + if (lead < 0x80) + return 1; + if ((lead >> 5) == 0x6) + return 2; + if ((lead >> 4) == 0xE) + return 3; + if ((lead >> 3) == 0x1E) + return 4; return 1; } void tokenize(const std::string& textUtf8, - std::vector& tokenIds, - const ovms::VocabIndex& ix) { + std::vector& tokenIds, + const ovms::VocabIndex& ix) { tokenIds.clear(); size_t pos = 0; const size_t n = textUtf8.size(); @@ -224,7 +228,7 @@ void tokenize(const std::string& textUtf8, const unsigned char lead = static_cast(textUtf8[pos]); const size_t adv = utf8CharLen(lead); SPDLOG_WARN("Tokenizer: unknown bytes at pos {}: '{}'", - pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); + pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); pos += std::min(adv, n - pos); } } @@ -264,7 +268,8 @@ class KokoroCalculator : public CalculatorBase { SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName()); KokoroServableMap servablesMap = cc->InputSidePackets() - 
.Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Get(); + .Tag(KOKORO_SESSION_SIDE_PACKET_TAG) + .Get(); auto servableIt = servablesMap.find(cc->NodeName()); RET_CHECK(servableIt != servablesMap.end()) << "Could not find initialized Kokoro node named: " << cc->NodeName(); @@ -318,9 +323,9 @@ class KokoroCalculator : public CalculatorBase { *reinterpret_cast(speed.data()) = 1.0f; std::copy(ids.data(), ids.data() + ids.size(), - reinterpret_cast(inputIdsTensor.data())); + reinterpret_cast(inputIdsTensor.data())); std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, - reinterpret_cast(refS.data())); + reinterpret_cast(refS.data())); // Inference ModelMetricReporter unused(nullptr, nullptr, "unused", 1); @@ -342,7 +347,7 @@ class KokoroCalculator : public CalculatorBase { const float* data = out.data(); SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", - samples, static_cast(samples) / 24000.0f); + samples, static_cast(samples) / 24000.0f); void* wavDataPtr = nullptr; size_t wavSize = 0; diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index c06f88cfac..5c668ae05d 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -50,7 +50,7 @@ struct VocabIndex { }; struct VoicePack { - std::vector data; // flat [numEntries * STYLE_DIM] + std::vector data; // flat [numEntries * STYLE_DIM] size_t numEntries = 0; }; @@ -86,9 +86,10 @@ class EspeakInstance { bool tryInit() { auto try_path = [](const char* path) -> bool { int sr = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, - 0, path, - espeakINITIALIZE_DONT_EXIT); - if (sr <= 0) return false; + 0, path, + espeakINITIALIZE_DONT_EXIT); + if (sr <= 0) + return false; if (espeak_SetVoiceByName("en-us") != EE_OK && espeak_SetVoiceByName("en") != EE_OK) { return false; @@ -96,26 +97,27 @@ class EspeakInstance { return true; }; - if (try_path(nullptr)) return true; + if (try_path(nullptr)) + return true; static const char* ngPaths[] = { 
"/usr/share/espeak-ng-data", "/opt/homebrew/share/espeak-ng-data", "/usr/local/share/espeak-ng-data", "espeak-ng-data", - nullptr - }; + nullptr}; for (int i = 0; ngPaths[i]; ++i) - if (try_path(ngPaths[i])) return true; + if (try_path(ngPaths[i])) + return true; static const char* esPaths[] = { "/usr/share/espeak-data", "/usr/local/share/espeak-data", "espeak-data", - nullptr - }; + nullptr}; for (int i = 0; esPaths[i]; ++i) - if (try_path(esPaths[i])) return true; + if (try_path(esPaths[i])) + return true; return false; } @@ -220,7 +222,8 @@ struct KokoroServable { const auto& vocab = doc["vocab"]; ix.by_token.reserve(vocab.MemberCount()); for (auto it = vocab.MemberBegin(); it != vocab.MemberEnd(); ++it) { - if (!it->name.IsString() || !it->value.IsInt()) continue; + if (!it->name.IsString() || !it->value.IsInt()) + continue; std::string token = it->name.GetString(); int id = it->value.GetInt(); ix.by_token.emplace(token, id); @@ -228,7 +231,7 @@ struct KokoroServable { } SPDLOG_INFO("Loaded Kokoro vocabulary: {} tokens, max_token_bytes={}", - ix.by_token.size(), ix.max_token_bytes); + ix.by_token.size(), ix.max_token_bytes); return ix; } @@ -247,7 +250,7 @@ struct KokoroServable { auto fileSize = std::filesystem::file_size(entry.path()); if (fileSize == 0 || fileSize % (STYLE_DIM * sizeof(float)) != 0) { SPDLOG_ERROR("Voice file {} has invalid size {} (must be multiple of {})", - entry.path().string(), fileSize, STYLE_DIM * sizeof(float)); + entry.path().string(), fileSize, STYLE_DIM * sizeof(float)); continue; } @@ -262,7 +265,7 @@ struct KokoroServable { } SPDLOG_INFO("Loaded voice pack '{}': {} entries x {} dims from {}", - name, pack.numEntries, STYLE_DIM, entry.path().string()); + name, pack.numEntries, STYLE_DIM, entry.path().string()); if (defaultVoiceName.empty()) { defaultVoiceName = name; From b50e4c60a9538ee3851f866fb6a8b8ab61dad7d5 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 25 Feb 2026 13:27:08 +0100 Subject: [PATCH 07/11] style --- 
src/audio/audio_utils.cpp | 2 +- src/audio/kokoro/kokoro_servable.hpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 1707b45cd2..59668be23f 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -218,4 +218,4 @@ void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSiz timer.stop(OUTPUT_PREPARATION); auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); -} \ No newline at end of file +} diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index 5c668ae05d..ccee9f30cd 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #pragma warning(push) #pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) From 31f06cc486e7305ee7a0f17d45a503ab3dbe10ff Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 27 Feb 2026 10:34:02 +0100 Subject: [PATCH 08/11] fix --- demos/audio/export_kokoro.py | 141 +++++++++++++++++++++++++++ demos/audio/tts_test_strings.py | 125 ++++++++++++++++++++++++ src/audio/kokoro/kokoro_servable.hpp | 45 ++++++++- 3 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 demos/audio/export_kokoro.py create mode 100644 demos/audio/tts_test_strings.py diff --git a/demos/audio/export_kokoro.py b/demos/audio/export_kokoro.py new file mode 100644 index 0000000000..d2615a7aa5 --- /dev/null +++ b/demos/audio/export_kokoro.py @@ -0,0 +1,141 @@ +# +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: Apache 2.0 +# + +import torch +import json +import time +from pathlib import Path +from kokoro.model import KModel +from kokoro import KPipeline +import openvino as ov +import shutil + +MAX_SEQ_LENGTH = 500 + + +class KokoroTTSPipeline: + def 
__init__(self): + model_id = "hexgrad/Kokoro-82M" + self.pipeline = KPipeline(lang_code="a", repo_id=model_id) + + def __call__(self, text: str, voice: str = "af_heart"): + with torch.no_grad(): + generator = self.pipeline(text, voice=voice) + result = next(generator) + return result.audio + + +class OVKModel(KModel): + def __init__(self, model_dir: Path, device: str, plugin_config: dict = {}): + torch.nn.Module.__init__(self) + + core = ov.Core() + + self.repo_id = model_id + with (model_dir / "config.json").open("r", encoding="utf-8") as f: + config = json.load(f) + self.vocab = config["vocab"] + print("Starting to compile OpenVINO model on device:", device) + + start = time.time() + self.model = core.compile_model(model_dir / "openvino_model.xml", device.upper(), config=plugin_config) + print(f"Model compiled successfully in {time.time() - start:.2f}s.") + self.context_length = config["plbert"]["max_position_embeddings"] + + @property + def device(self): + return torch.device("cpu") + + def forward_with_tokens(self, input_ids: torch.LongTensor, ref_s: torch.FloatTensor, speed: float = 1) -> tuple[torch.FloatTensor, torch.LongTensor]: + text_len = input_ids.shape[-1] + + if text_len < MAX_SEQ_LENGTH: + # 0 in this model context is acting as BOS/EOS/PAD. + # Since 0 causes artifacts, we might consider space (16) or period (4). 
+ padding_value = 16 + input_ids = torch.nn.functional.pad(input_ids, (0, MAX_SEQ_LENGTH - text_len), value=padding_value) + + start = time.time() + print("Running inference on OpenVINO model...") + outputs = self.model([input_ids, ref_s, torch.tensor(speed)]) + print(f"Inference completed in {time.time() - start:.2f}s.") + + audio = torch.from_numpy(outputs[0]) + pred_dur = torch.from_numpy(outputs[1]) + + if text_len < MAX_SEQ_LENGTH: + pred_dur = pred_dur[:text_len] + # Approximate audio trimming based on duration ratio + total_dur = outputs[1].sum() + valid_dur = pred_dur.sum() + if total_dur > 0: + audio_keep = int(audio.shape[-1] * (valid_dur / total_dur)) + audio = audio[:audio_keep] + + return audio, pred_dur + + @staticmethod + def download_and_convert(model_dir: Path, repo_id: str, ttsPipeline: KokoroTTSPipeline): + import openvino as ov + from huggingface_hub import hf_hub_download + import gc + + if not (model_dir / "openvino_model.xml").exists(): + print(f"Converting Kokoro model to OpenVINO format at {model_dir}...") + model = ttsPipeline.pipeline.model + model.forward = model.forward_with_tokens + input_ids = torch.randint(1, 100, (48,)).numpy() + input_ids = torch.LongTensor([[0, *input_ids, 0]]) + style = torch.randn(1, 256) + speed = torch.randint(1, 10, (1,), dtype=torch.float32) + + ov_model = ov.convert_model(model, example_input=(input_ids, style, speed), input=[ + ov.PartialShape("[1, 2..]"), ov.PartialShape([1, -1])]) + ov.save_model(ov_model, model_dir / "openvino_model.xml") + hf_hub_download(repo_id=model_id, filename="config.json", local_dir=model_dir) + else: + print(f"OpenVINO model already exists at {model_dir}, skipping conversion.") + + gc.collect() + + @staticmethod + def convert_to_static(input_model_dir: Path, output_model_dir: Path): + import openvino as ov + + print(f"Converting OpenVINO model to static shapes at {input_model_dir}...") + core = ov.Core() + model = core.read_model(input_model_dir / "openvino_model.xml") + 
static_shape = {"input_ids": [1, MAX_SEQ_LENGTH], "ref_s": [1, 256], "speed": [1], } + model.reshape(static_shape) + print("Reshaped model inputs:", model.inputs) + ov.save_model(model, output_model_dir / "openvino_model.xml") + print("Conversion to static shapes completed.") + # Copy config file + shutil.copy(input_model_dir / "config.json", output_model_dir / "config.json") + + +if __name__ == "__main__": + + model_id = "hexgrad/Kokoro-82M" + + # Download model from Hugging Face and convert to OpenVINO format. + pipeline = KokoroTTSPipeline() + + # Convert and save the Kokoro model to OpenVINO format + OVKModel.download_and_convert(Path("./kokoro_openvino_model"), repo_id=model_id, ttsPipeline=pipeline) + + # To run inference on NPU, the model must have static input shapes + OVKModel.convert_to_static(Path("./kokoro_openvino_model"), Path("./kokoro_static_openvino_model")) + # # Execution on NPU requires a config file + # config = { + # "NPU": { + # "NPU_USE_NPUW": "YES", + # "NPUW_DEVICES": "NPU,CPU", + # "NPUW_KOKORO": "YES", + # } + # } + + # # NPUW_CACHE_DIR can be used to avoid compilation on every run + # config["NPU"]["NPUW_CACHE_DIR"] = "./npu_cache_kokoro" \ No newline at end of file diff --git a/demos/audio/tts_test_strings.py b/demos/audio/tts_test_strings.py new file mode 100644 index 0000000000..79b1194a3e --- /dev/null +++ b/demos/audio/tts_test_strings.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +"""Send a battery of tricky TTS test strings to an OpenAI-compatible +speech endpoint, one by one, and save each result as a numbered WAV file. + +Usage: + python tts_test_strings.py --endpoint http://localhost:8000/v3 \ + --model kokoro \ + [--voice None] \ + [--output-dir tts_output] +""" + +import argparse +import os +import sys +import time + +from openai import OpenAI + +TEST_STRINGS = [ + 'Dr. A. B. Carter Jr. met Sen. O\'Neill at 5 p.m., Wed., in Washington, D.C.', + 'Mr. Smith, Ph.D., arrived on Fri. at 6:30 a.m.; Mrs. 
Jones left at noon.', + 'We meet on 01/02/2025 at 05:30 IST; is that India or Israel time?', + 'The deadline is 2025\u201102\u201101 23:59 UTC\u221205:00 (EST).', + 'He finished 1st; she was 22nd\u2014barely.', + 'Prices: $1,234.56 vs \u20ac1.234,56; also \u00a512 345 (thin space).', + 'Add \u00be cup, then \u00bd tsp; total \u2248 1\u00bc cups.', + 'Chapter XLIV starts on page ix; version v2.0.0 follows v1.12.9.', + 'Dose: 5 mg vs 5 \u03bcg\u2014don\'t confuse micrograms with milligrams.', + 'Avogadro\'s number is 6.022e23; \u03c0 \u2248 3.14159; \u221a2 \u2248 1.4142.', + 'Temperature dropped to \u221210 \u00b0C (14 \u00b0F) with 90% RH.', + 'Visit https://example.com/a/b?x=1&y=2#frag or email ops+alerts@example.org.', + 'Open C:\\Program Files\\Project\\config.yaml or /usr/local/bin/run.sh.', + '.NET, Node.js, C#, C++17, and Rust\'s crate\u2011names\u2011with\u2011hyphens.', + '"WYSIWYG," "GIF" (hard or soft g?), "SQL" (sequel or S\u2011Q\u2011L?).', + 'I will present the present to the lead singer who stepped on the lead.', + 'They desert the desert; the dove dove; he wound the wound.', + 'Please record the record before the minute is up in a minute.', + 'She sells seashells by the seashore; truly Irish wristwatch.', + 'Unique New York, toy boat, red leather yellow leather.', + 'A na\u00efve co\u00f6perative fa\u00e7ade in S\u00e3o Paulo; \u0141\u00f3d\u017a and Krak\u00f3w in Poland.', + 'Pi\u00f1ata, jalape\u00f1o, cr\u00e8me br\u00fbl\u00e9e, bouillabaisse, d\u00e9j\u00e0 vu.', + '\U0001f44d\U0001f3fb is a thumbs\u2011up with light skin tone; \U0001f9d1\u200d\U0001f4bb writes code; \U0001f468\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466 is a family; \U0001f1f5\U0001f1f1 is a flag.', + 'Faces: \U0001f642\U0001f609\U0001f610\U0001f611\U0001f636; hearts: \u2764\ufe0f\U0001f9e1\U0001f49b\U0001f49a\U0001f499; mixed: \U0001f937\u200d\u2642\ufe0f\U0001f926\u200d\u2640\ufe0f.', + 'Latin "A" vs Cyrillic "\u0410"; Greek "\u03c1" vs Latin "p"; micro "\u00b5" vs 
Greek "\u03bc".', + '\u05e9\u05dc\u05d5\u05dd and \u0645\u0631\u062d\u0628\u064b\u0627 appear with left\u2011to\u2011right text in one line.', + 'Prosody markers: \u02c8primary, \u02ccsecondary, and length \u02d0 are tricky for tokenizers.', + 'Arrows for intonation: \u2197 rising, \u2198 falling, \u2193 drop.', + 'He said, "She replied, \'no\u2014never\u2026\'," then left\u2014silently.', + 'Parentheticals (like this\u2014really!) and em\u2011dashes\u2014here\u2014confuse prosody.', + 'Let f(x)=x^2; then d/dx x^2=2x; \u2202/\u2202x is the operator.', + 'Inline code x += 1; and TeX E=mc^2 should be read clearly.', + 'N,N\u2011Diethyl\u2011meta\u2011toluamide (DEET) differs from p\u2011xylene and m\u2011cresol.', + 'The RFC 7231/HTTP\u2011semantics "GET" vs "HEAD" distinction matters.', + 'Read "macOS" vs "Mac OS", "iOS", "SQL", "URL", and "S3" correctly.', +] + + +def main(): + parser = argparse.ArgumentParser( + description="Send TTS test strings to an OpenAI-compatible speech endpoint." + ) + parser.add_argument( + "--endpoint", required=True, + help="Base URL of the API (e.g. http://localhost:8000/v3)" + ) + parser.add_argument( + "--model", required=True, + help="Model name to use for speech generation" + ) + parser.add_argument( + "--voice", default=None, + help="Voice name (default: voice1)" + ) + parser.add_argument( + "--output-dir", default="tts_output", + help="Directory to save output WAV files (default: tts_output)" + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + client = OpenAI(base_url=args.endpoint, api_key="unused") + + total = len(TEST_STRINGS) + print(f"Sending {total} test strings to {args.endpoint} (model={args.model}, voice={args.voice})") + print(f"Output directory: {args.output_dir}\n") + + succeeded = 0 + failed = 0 + total_size_kb = 0.0 + t_start = time.time() + + for idx, text in enumerate(TEST_STRINGS, start=1): + preview = text[:80] + ("..." 
if len(text) > 80 else "") + print(f"[{idx:2d}/{total}] {preview}") + + out_path = os.path.join(args.output_dir, f"{idx:02d}.wav") + t0 = time.time() + try: + response = client.audio.speech.create( + model=args.model, + voice=args.voice, + input=text, + ) + response.write_to_file(out_path) + elapsed = time.time() - t0 + size_kb = os.path.getsize(out_path) / 1024 + total_size_kb += size_kb + succeeded += 1 + print(f" -> {out_path} ({size_kb:.1f} KB, {elapsed:.2f}s)") + except Exception as exc: + elapsed = time.time() - t0 + failed += 1 + print(f" !! FAILED after {elapsed:.2f}s: {exc}", file=sys.stderr) + + total_elapsed = time.time() - t_start + print(f"\n{'='*60}") + print(f"Summary: {succeeded} succeeded, {failed} failed out of {total}") + print(f"Total time: {total_elapsed:.2f}s (avg {total_elapsed/total:.2f}s per string)") + print(f"Total audio size: {total_size_kb:.1f} KB") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index ccee9f30cd..73dff4b104 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -156,10 +156,21 @@ struct KokoroServable { // that accumulate in the deep decoder network and cause energy fade. ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY), }; + //properties["INFERENCE_PRECISION_HINT"] = "f32"; ov::Core core; auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties); compiledModel = core.compile_model(m_model, targetDevice, properties); - inferRequestsQueue = std::make_unique(compiledModel, 5); + uint32_t numberOfParallelInferRequests = 1; + try { + numberOfParallelInferRequests = compiledModel.get_property(ov::optimal_number_of_infer_requests); + } catch (const ov::Exception& ex) { + SPDLOG_WARN("Failed to query OPTIMAL_NUMBER_OF_INFER_REQUESTS with error {}. 
Using 1 nireq.", ex.what()); + numberOfParallelInferRequests = 1u; + } + inferRequestsQueue = std::make_unique(compiledModel, numberOfParallelInferRequests); + + // Warm up model with dummy inference + //warmUpModel(); } OVInferRequestsQueue& getInferRequestsQueue() { @@ -276,6 +287,38 @@ struct KokoroServable { SPDLOG_INFO("Loaded {} voice pack(s), default: '{}'", voicePacks.size(), defaultVoiceName); } + + // void warmUpModel() { + // try { + // SPDLOG_INFO("Warming up Kokoro model with dummy inference..."); + + // // Create dummy tensors with minimal sequence length + // constexpr size_t dummySeqLen = 3; // [0, token, 0] pattern + // auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, dummySeqLen}}; + // auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, STYLE_DIM}}; + // auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; + + // // Fill with dummy values + // auto* idsData = reinterpret_cast(inputIdsTensor.data()); + // idsData[0] = 0; // PAD token + // idsData[1] = 1; // arbitrary token ID + // idsData[2] = 0; // PAD token + + // std::fill_n(reinterpret_cast(refS.data()), STYLE_DIM, 0.0f); + // *reinterpret_cast(speed.data()) = 1.0f; + + // // Get infer request and run warm-up inference + // ov::InferRequest inferRequest = compiledModel.create_infer_request(); + // inferRequest.set_tensor("input_ids", inputIdsTensor); + // inferRequest.set_tensor("103", refS); + // inferRequest.set_tensor("speed", speed); + // inferRequest.infer(); + + // SPDLOG_INFO("Kokoro model warm-up completed successfully"); + // } catch (const std::exception& ex) { + // SPDLOG_WARN("Kokoro model warm-up failed: {}. 
Continuing anyway...", ex.what()); + // } + // } }; using KokoroServableMap = std::unordered_map>; From 962e3d34ad7e813e9dc1b5a8ce7f8598d3ffc7b5 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 27 Feb 2026 15:52:27 +0100 Subject: [PATCH 09/11] Improvements --- demos/audio/export_kokoro.py | 6 +- src/audio/kokoro/kokoro_calculator.cc | 387 +++++++++++++---------- src/audio/kokoro/kokoro_calculator.proto | 1 + src/audio/kokoro/kokoro_servable.hpp | 47 +-- 4 files changed, 239 insertions(+), 202 deletions(-) diff --git a/demos/audio/export_kokoro.py b/demos/audio/export_kokoro.py index d2615a7aa5..8ca3ed89f6 100644 --- a/demos/audio/export_kokoro.py +++ b/demos/audio/export_kokoro.py @@ -118,16 +118,16 @@ def convert_to_static(input_model_dir: Path, output_model_dir: Path): if __name__ == "__main__": - model_id = "hexgrad/Kokoro-82M" + model_id = "hexgrad/Kokoro-82M-v1.1-zh" # Download model from Hugging Face and convert to OpenVINO format. pipeline = KokoroTTSPipeline() # Convert and save the Kokoro model to OpenVINO format - OVKModel.download_and_convert(Path("./kokoro_openvino_model"), repo_id=model_id, ttsPipeline=pipeline) + OVKModel.download_and_convert(Path("./kokoro_openvino_model_zh"), repo_id=model_id, ttsPipeline=pipeline) # To run inference on NPU, model must have static input shapes - OVKModel.convert_to_static(Path("./kokoro_openvino_model"), Path("./kokoro_static_openvino_model")) + OVKModel.convert_to_static(Path("./kokoro_openvino_model_zh"), Path("./kokoro_static_openvino_model_zh")) # # Execution on NPU require config file # config = { # "NPU": { diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc index 1747b18081..728e0f88b4 100644 --- a/src/audio/kokoro/kokoro_calculator.cc +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -14,11 +14,8 @@ // limitations under the License. 
//***************************************************************************** #include -#include -#include #include #include -#include #include #pragma warning(push) @@ -40,7 +37,6 @@ #pragma warning(push) #pragma warning(disable : 6001 4324 6385 6386) -#include "absl/strings/escaping.h" #include "absl/strings/str_cat.h" #pragma warning(pop) @@ -64,7 +60,67 @@ namespace { #define espeakPHONEMES_NO_STRESS 0x08 #endif -void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool noStress = true) { +std::string retone(const std::string& p) { + std::string result = p; + + auto replaceAll = [](std::string& s, const std::string& from, const std::string& to) { + size_t pos = 0; + while ((pos = s.find(from, pos)) != std::string::npos) { + s.replace(pos, from.size(), to); + pos += to.size(); + } + }; + + // Tone mark replacements + replaceAll(result, "˧˩˧", "↓"); // third tone + replaceAll(result, "˧˥", "↗"); // second tone + replaceAll(result, "˥˩", "↘"); // fourth tone + replaceAll(result, "˥", "→"); // first tone + + // Unicode character replacements (UTF-8 encoded) + replaceAll(result, "\xCA\x97\xCC\x89", "ɨ"); // chr(635)+chr(809) + replaceAll(result, "\xCA\x91\xCC\x89", "ɨ"); // chr(633)+chr(809) + + // Verify chr(809) removed + if (result.find("\xCC\x89") != std::string::npos) { + SPDLOG_WARN("Combining diacritic (chr 809) still present: {}", result); + } + + return result; +} + +std::string getEspeakVoice(const std::string& isoLanguageCode) { + // ISO 639-1 codes with optional region codes + if (isoLanguageCode == "en-us") { + return "en-us"; // American English (default for 'en') + } else if (isoLanguageCode == "en-gb") { + return "en"; // British English + } else if (isoLanguageCode == "en") { + return "en-us"; // Default to American English when only 'en' specified + } else if (isoLanguageCode == "es") { + return "es"; + } else if (isoLanguageCode == "fr") { + return "fr"; + } else if (isoLanguageCode == "hi") { + return "hi"; + } else if 
(isoLanguageCode == "it") { + return "it"; + } else if (isoLanguageCode == "ja") { + return "ja"; + } else if (isoLanguageCode == "pt-br") { + return "pt"; // Brazilian Portuguese + } else if (isoLanguageCode == "zh" || isoLanguageCode == "zh-cn") { + return "cmn-latn-pinyin"; // Mandarin Chinese + } + return ""; // Unsupported +} + +bool isSupportedLanguage(const std::string& isoLanguageCode) { + // Only accept ISO 639-1 codes and regional variants + return !getEspeakVoice(isoLanguageCode).empty(); +} + +void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, const std::string& language = "en", bool noStress = true) { outIpa.clear(); auto& espeak = ovms::EspeakInstance::instance(); if (!espeak.isReady()) { @@ -74,6 +130,23 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n std::lock_guard guard(espeak.mutex()); + // Get the eSpeak voice name from the ISO language code + // Kokoro supports 9 languages: American English, British English, Spanish, French, Hindi, Italian, Japanese, Brazilian Portuguese, Mandarin Chinese + std::string voiceName = getEspeakVoice(language); + if (voiceName.empty()) { + // This should not happen if validation was done, but fallback just in case + SPDLOG_ERROR("Invalid language code '{}' passed to espeakPhonemizeAll", language); + voiceName = "en-us"; + } + if (espeak_SetVoiceByName(voiceName.c_str()) != EE_OK) { + SPDLOG_ERROR("Failed to set eSpeak voice '{}'", voiceName); + if (voiceName != "en-us" && espeak_SetVoiceByName("en-us") == EE_OK) { + voiceName = "en-us"; + } else { + return; + } + } + const int mode = espeakPHONEMES_IPA | (noStress ? 
espeakPHONEMES_NO_STRESS : 0); const void* pos = static_cast(textUtf8.c_str()); const char* endPtr = static_cast(pos) + textUtf8.size(); @@ -91,6 +164,7 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n // Strip combining diacriticals (U+0300..U+036F) and collapse spaces std::string cleaned; + cleaned.reserve(rawIpa.size()); for (size_t i = 0; i < rawIpa.size(); ++i) { unsigned char c = static_cast(rawIpa[i]); if (i + 1 < rawIpa.size()) { @@ -103,6 +177,7 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n cleaned.push_back(c); } + outIpa.reserve(cleaned.size()); bool lastSpace = false; for (char c : cleaned) { if (std::isspace(static_cast(c))) { @@ -123,70 +198,6 @@ void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, bool n SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, outIpa.size()); } -// Post-process eSpeak IPA into Kokoro/misaki phoneme alphabet. -// Mirrors misaki.espeak.EspeakFallback.E2M for American English. -// void espeakIpaToKokoro(std::string& ps) { -// // Helper: replace all occurrences of `from` with `to` in `s`. 
-// auto replaceAll = [](std::string& s, const std::string& from, const std::string& to) { -// if (from.empty()) return; -// size_t pos = 0; -// while ((pos = s.find(from, pos)) != std::string::npos) { -// s.replace(pos, from.size(), to); -// pos += to.size(); -// } -// }; - -// // --- Multi-char replacements (longest first) --- -// // Syllabic n with glottal stop -// replaceAll(ps, "\xca\x94\xcb\x8c\x6e\xcc\xa9", "\xca\x94\x6e"); // ʔˌn̩ → ʔn -// replaceAll(ps, "\xca\x94\x6e\xcc\xa9", "\xca\x94\x6e"); // ʔn̩ → ʔn -// // Syllabic mark before consonant → ᵊ + consonant -// // ə̩l → ᵊl (syllabic l) -// replaceAll(ps, "\xc9\x99\xcc\xa9\x6c", "\xe1\xb5\x8a\x6c"); // əl̩ → ᵊl (approximation) - -// // Diphthongs -// replaceAll(ps, "a\xc9\xaa", "I"); // aɪ → I -// replaceAll(ps, "a\xca\x8a", "W"); // aʊ → W -// replaceAll(ps, "e\xc9\xaa", "A"); // eɪ → A -// replaceAll(ps, "\xc9\x94\xc9\xaa", "Y"); // ɔɪ → Y -// replaceAll(ps, "o\xca\x8a", "O"); // oʊ → O (American) -// replaceAll(ps, "\xc9\x99\xca\x8a", "O"); // əʊ → O (British) - -// // Affricates -// replaceAll(ps, "d\xca\x92", "\xca\xa4"); // dʒ → ʤ -// replaceAll(ps, "t\xca\x83", "\xca\xa7"); // tʃ → ʧ - -// // Palatalization -// replaceAll(ps, "\xca\xb2\x6f", "jo"); // ʲo → jo -// replaceAll(ps, "\xca\xb2\xc9\x99", "j\xc9\x99"); // ʲə → jə -// replaceAll(ps, "\xca\xb2", ""); // ʲ → (delete) - -// // R-colored vowels and vowel length -// replaceAll(ps, "\xc9\x9c\xcb\x90\xc9\xb9", "\xc9\x9c\xc9\xb9"); // ɜːɹ → ɜɹ -// replaceAll(ps, "\xc9\x9c\xcb\x90", "\xc9\x9c\xc9\xb9"); // ɜː → ɜɹ -// replaceAll(ps, "\xc9\xaa\xc9\x99", "i\xc9\x99"); // ɪə → iə - -// // --- Single-char replacements --- -// replaceAll(ps, "\xc9\x9a", "\xc9\x99\xc9\xb9"); // ɚ → əɹ -// replaceAll(ps, "\xc9\x90", "\xc9\x99"); // ɐ → ə -// replaceAll(ps, "\xc9\xac", "l"); // ɬ → l -// replaceAll(ps, "\xc3\xa7", "k"); // ç → k -// replaceAll(ps, "x", "k"); // x → k -// replaceAll(ps, "r", "\xc9\xb9"); // r → ɹ -// replaceAll(ps, "\xcb\x90", ""); // ː → 
(strip length marks) -// replaceAll(ps, "\xcc\x83", ""); // ̃ → (strip nasal tilde) - -// // British vowel mappings (in case eSpeak uses 'en' voice) -// replaceAll(ps, "\xc9\x92", "\xc9\x94"); // ɒ → ɔ - -// // Remaining standalone vowels (must be AFTER diphthong replacements) -// replaceAll(ps, "o", "\xc9\x94"); // o → ɔ (for espeak < 1.52) -// replaceAll(ps, "e", "A"); // e → A - -// // Flap and glottal stop (misaki version != 2.0) -// replaceAll(ps, "\xc9\xbe", "T"); // ɾ → T -// replaceAll(ps, "\xca\x94", "t"); // ʔ → t -// } size_t utf8CharLen(unsigned char lead) { if (lead < 0x80) @@ -202,10 +213,15 @@ size_t utf8CharLen(unsigned char lead) { void tokenize(const std::string& textUtf8, std::vector& tokenIds, - const ovms::VocabIndex& ix) { + const ovms::VocabIndex& ix, + const std::string& language = "en") { tokenIds.clear(); + // Reserve estimated capacity to avoid reallocations + tokenIds.reserve(textUtf8.size() / 2); + size_t pos = 0; const size_t n = textUtf8.size(); + size_t unknownCount = 0; while (pos < n) { size_t maxTry = std::min(ix.max_token_bytes, n - pos); @@ -227,12 +243,20 @@ void tokenize(const std::string& textUtf8, } else { const unsigned char lead = static_cast(textUtf8[pos]); const size_t adv = utf8CharLen(lead); - SPDLOG_WARN("Tokenizer: unknown bytes at pos {}: '{}'", - pos, std::string(textUtf8.data() + pos, std::min(adv, n - pos))); + std::string unknownBytes(textUtf8.data() + pos, std::min(adv, n - pos)); + unknownCount++; + SPDLOG_DEBUG("Tokenizer [lang={}]: unknown phoneme at pos {}: '{}' (skipping)", + language, pos, unknownBytes); pos += std::min(adv, n - pos); } } - SPDLOG_DEBUG("Tokenize: produced {} ids", tokenIds.size()); + if (unknownCount > 0) { + SPDLOG_WARN("Tokenize [lang={}]: {} unknown phonemes found. Produced {} token ids. 
" + "Consider updating vocabulary for better {} speech quality.", + language, unknownCount, tokenIds.size(), language); + } else { + SPDLOG_DEBUG("Tokenize [lang={}]: produced {} ids without unknown phonemes", language, tokenIds.size()); + } } } // namespace @@ -243,6 +267,7 @@ const std::string KOKORO_SESSION_SIDE_PACKET_TAG = "KOKORO_NODE_RESOURCES"; class KokoroCalculator : public CalculatorBase { static const std::string INPUT_TAG_NAME; static const std::string OUTPUT_TAG_NAME; + std::string defaultLanguage; // Language configured in graph pbtxt public: static absl::Status GetContract(CalculatorContract* cc) { @@ -261,102 +286,140 @@ class KokoroCalculator : public CalculatorBase { absl::Status Open(CalculatorContext* cc) final { SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Open", cc->NodeName()); + + // Read language from graph configuration + const auto& options = cc->Options(); + this->defaultLanguage = options.has_language() ? options.language() : "en"; + + // Normalize language code to lowercase + std::transform(this->defaultLanguage.begin(), this->defaultLanguage.end(), this->defaultLanguage.begin(), ::tolower); + + // Validate language is supported + if (!isSupportedLanguage(this->defaultLanguage)) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid language in graph config: '", this->defaultLanguage, "'. ", + "Supported ISO 639-1 language codes: en, es, fr, hi, it, ja, pt-br, zh. 
", + "Regional variants: en-us, en-gb, pt-br, zh-cn" + )); + } + + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, + "KokoroCalculator [Node: {}] configured for language: {}", + cc->NodeName(), this->defaultLanguage); + return absl::OkStatus(); } absl::Status Process(CalculatorContext* cc) final { SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName()); + try { + KokoroServableMap servablesMap = cc->InputSidePackets() + .Tag(KOKORO_SESSION_SIDE_PACKET_TAG) + .Get(); + auto servableIt = servablesMap.find(cc->NodeName()); + RET_CHECK(servableIt != servablesMap.end()) + << "Could not find initialized Kokoro node named: " << cc->NodeName(); + auto servable = servableIt->second; + + const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + auto it = payload.parsedJson->FindMember("input"); + RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request"; + RET_CHECK(it->value.IsString()) << "'input' must be a string"; + const std::string text = it->value.GetString(); + + // Read optional "voice" parameter (OpenAI TTS API) + std::string voiceName; + auto voiceIt = payload.parsedJson->FindMember("voice"); + if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { + voiceName = voiceIt->value.GetString(); + } - KokoroServableMap servablesMap = cc->InputSidePackets() - .Tag(KOKORO_SESSION_SIDE_PACKET_TAG) - .Get(); - auto servableIt = servablesMap.find(cc->NodeName()); - RET_CHECK(servableIt != servablesMap.end()) - << "Could not find initialized Kokoro node named: " << cc->NodeName(); - auto servable = servableIt->second; - - const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); - auto it = payload.parsedJson->FindMember("input"); - RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request"; - RET_CHECK(it->value.IsString()) << "'input' must be a string"; - const std::string text = it->value.GetString(); - - // Read optional "voice" parameter (OpenAI 
TTS API) - std::string voiceName; - auto voiceIt = payload.parsedJson->FindMember("voice"); - if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { - voiceName = voiceIt->value.GetString(); + // Language is configured in the graph pbtxt, not from request + // Use the defaultLanguage set during Open() + const std::string language = this->defaultLanguage; + SPDLOG_DEBUG("Using configured language: {}", language); + + // Text -> IPA phonemization + std::string phonemes; + + // Use eSpeak for all languages + espeakPhonemizeAll(text, phonemes, language, /*noStress=*/false); + if(language == "zh" || language == "zh-cn"){ + phonemes = retone(phonemes); + } + + SPDLOG_DEBUG("Input text: '{}' (language: {}), IPA phonemes ({} chars): '{}'", text, language, phonemes.size(), phonemes); + + // Preserve trailing punctuation from original text (eSpeak strips it) + // if (!text.empty()) { + // char last = text.back(); + // if (last == '.' || last == '!' || last == '?' || last == ';' || last == ':' || last == ',') { + // phonemes.push_back(last); + // } + // } + SPDLOG_DEBUG("After E2M mapping ({} chars): '{}'", phonemes.size(), phonemes); + // IPA -> Kokoro token IDs + const auto& vocabIx = servable->getVocabIndex(); + std::vector tokenIds; + tokenize(phonemes, tokenIds, vocabIx, language); + + // Wrap with PAD token (id=0) at both ends — matches official + // forward_with_tokens: input_ids = [[0, *tokens, 0]] + tokenIds.insert(tokenIds.begin(), 0); + tokenIds.push_back(0); + + // Voice embedding — select slice from voice pack based on content token count + size_t numContentTokens = tokenIds.size() >= 2 ? 
tokenIds.size() - 2 : 0; // exclude BOS pad + EOS + const float* voiceSlice = servable->getVoiceSlice(voiceName, numContentTokens); + RET_CHECK(voiceSlice != nullptr) << "No voice pack loaded (place .bin files in /voices/)"; + + auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, tokenIds.size()}}; + auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, KokoroServable::STYLE_DIM}}; + auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; + + *reinterpret_cast(speed.data()) = 1.0f; + std::copy(tokenIds.data(), tokenIds.data() + tokenIds.size(), + reinterpret_cast(inputIdsTensor.data())); + std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, + reinterpret_cast(refS.data())); + + // Inference + ModelMetricReporter unused(nullptr, nullptr, "unused", 1); + auto executingStreamIdGuard = + std::make_unique(servable->getInferRequestsQueue(), unused); + ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest(); + + inferRequest.set_tensor("input_ids", inputIdsTensor); + inferRequest.set_tensor("103", refS); + inferRequest.set_tensor("speed", speed); + inferRequest.start_async(); + inferRequest.wait(); + + // Collect audio output + auto out = inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]); + RET_CHECK(out.get_shape().size() == 1); + RET_CHECK(out.get_element_type() == ov::element::f32); + const size_t samples = out.get_shape()[0]; + const float* data = out.data(); + + SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", + samples, static_cast(samples) / 24000.0f); + + void* wavDataPtr = nullptr; + size_t wavSize = 0; + prepareAudioOutputKokoro(&wavDataPtr, wavSize, samples, data); + + auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); + drwav_free(wavDataPtr, NULL); + + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + } catch (const std::exception& e) { + SPDLOG_ERROR("KokoroCalculator [Node: {}] Process failed: {}", cc->NodeName(), e.what()); + 
return absl::InvalidArgumentError(e.what()); + } catch (...) { + SPDLOG_ERROR("KokoroCalculator [Node: {}] Process failed: unknown error", cc->NodeName()); + return absl::InvalidArgumentError("Kokoro processing failed"); } - - // Text -> IPA phonemization - std::string phonemes; - espeakPhonemizeAll(text, phonemes, /*noStress=*/false); - SPDLOG_DEBUG("Input text: '{}', IPA phonemes ({} chars): '{}'", text, phonemes.size(), phonemes); - - // Preserve trailing punctuation from original text (eSpeak strips it) - // if (!text.empty()) { - // char last = text.back(); - // if (last == '.' || last == '!' || last == '?' || last == ';' || last == ':' || last == ',') { - // phonemes.push_back(last); - // } - // } - SPDLOG_DEBUG("After E2M mapping ({} chars): '{}'", phonemes.size(), phonemes); - // IPA -> Kokoro token IDs - const auto& vocabIx = servable->getVocabIndex(); - std::vector> inputTokens(1); - tokenize(phonemes, inputTokens[0], vocabIx); - - // Wrap with PAD token (id=0) at both ends — matches official - // forward_with_tokens: input_ids = [[0, *tokens, 0]] - inputTokens[0].insert(inputTokens[0].begin(), 0); - inputTokens[0].push_back(0); - - // Voice embedding — select slice from voice pack based on content token count - auto& ids = inputTokens[0]; - size_t numContentTokens = ids.size() >= 2 ? 
ids.size() - 2 : 0; // exclude BOS pad + EOS - const float* voiceSlice = servable->getVoiceSlice(voiceName, numContentTokens); - RET_CHECK(voiceSlice != nullptr) << "No voice pack loaded (place .bin files in /voices/)"; - - auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, ids.size()}}; - auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, KokoroServable::STYLE_DIM}}; - auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; - - *reinterpret_cast(speed.data()) = 1.0f; - std::copy(ids.data(), ids.data() + ids.size(), - reinterpret_cast(inputIdsTensor.data())); - std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, - reinterpret_cast(refS.data())); - - // Inference - ModelMetricReporter unused(nullptr, nullptr, "unused", 1); - auto executingStreamIdGuard = - std::make_unique(servable->getInferRequestsQueue(), unused); - ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest(); - - inferRequest.set_tensor("input_ids", inputIdsTensor); - inferRequest.set_tensor("103", refS); - inferRequest.set_tensor("speed", speed); - inferRequest.start_async(); - inferRequest.wait(); - - // Collect audio output - auto out = inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]); - RET_CHECK(out.get_shape().size() == 1); - RET_CHECK(out.get_element_type() == ov::element::f32); - const size_t samples = out.get_shape()[0]; - const float* data = out.data(); - - SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", - samples, static_cast(samples) / 24000.0f); - - void* wavDataPtr = nullptr; - size_t wavSize = 0; - prepareAudioOutputKokoro(&wavDataPtr, wavSize, samples, data); - - auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); - drwav_free(wavDataPtr, NULL); - - cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process end", cc->NodeName()); return absl::OkStatus(); } diff --git 
a/src/audio/kokoro/kokoro_calculator.proto b/src/audio/kokoro/kokoro_calculator.proto index d9fc1b4bd9..8ec0f43341 100644 --- a/src/audio/kokoro/kokoro_calculator.proto +++ b/src/audio/kokoro/kokoro_calculator.proto @@ -30,4 +30,5 @@ message KokoroCalculatorOptions { required string models_path = 1; optional string target_device = 2; optional string plugin_config = 3; + optional string language = 4; // ISO 639-1 language code (en, es, fr, hi, it, ja, pt-br, zh) } diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp index 73dff4b104..9a81f8f527 100644 --- a/src/audio/kokoro/kokoro_servable.hpp +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -24,7 +24,6 @@ #include #include #include -#include #pragma warning(push) #pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) @@ -91,8 +90,17 @@ class EspeakInstance { espeakINITIALIZE_DONT_EXIT); if (sr <= 0) return false; + // Try to initialize with Kokoro's supported language voices + // Kokoro supports: en-us (American English), en (British English), es (Spanish), fr (French), hi (Hindi), it (Italian), ja (Japanese), pt (Brazilian Portuguese), cmn (Mandarin Chinese) if (espeak_SetVoiceByName("en-us") != EE_OK && - espeak_SetVoiceByName("en") != EE_OK) { + espeak_SetVoiceByName("en") != EE_OK && + espeak_SetVoiceByName("es") != EE_OK && + espeak_SetVoiceByName("fr") != EE_OK && + espeak_SetVoiceByName("hi") != EE_OK && + espeak_SetVoiceByName("it") != EE_OK && + espeak_SetVoiceByName("ja") != EE_OK && + espeak_SetVoiceByName("pt") != EE_OK && + espeak_SetVoiceByName("cmn") != EE_OK) { return false; } return true; @@ -168,9 +176,6 @@ struct KokoroServable { numberOfParallelInferRequests = 1u; } inferRequestsQueue = std::make_unique(compiledModel, numberOfParallelInferRequests); - - // Warm up model with dummy inference - //warmUpModel(); } OVInferRequestsQueue& getInferRequestsQueue() { @@ -287,38 +292,6 @@ struct KokoroServable { SPDLOG_INFO("Loaded {} voice pack(s), 
default: '{}'", voicePacks.size(), defaultVoiceName); } - - // void warmUpModel() { - // try { - // SPDLOG_INFO("Warming up Kokoro model with dummy inference..."); - - // // Create dummy tensors with minimal sequence length - // constexpr size_t dummySeqLen = 3; // [0, token, 0] pattern - // auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, dummySeqLen}}; - // auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, STYLE_DIM}}; - // auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; - - // // Fill with dummy values - // auto* idsData = reinterpret_cast(inputIdsTensor.data()); - // idsData[0] = 0; // PAD token - // idsData[1] = 1; // arbitrary token ID - // idsData[2] = 0; // PAD token - - // std::fill_n(reinterpret_cast(refS.data()), STYLE_DIM, 0.0f); - // *reinterpret_cast(speed.data()) = 1.0f; - - // // Get infer request and run warm-up inference - // ov::InferRequest inferRequest = compiledModel.create_infer_request(); - // inferRequest.set_tensor("input_ids", inputIdsTensor); - // inferRequest.set_tensor("103", refS); - // inferRequest.set_tensor("speed", speed); - // inferRequest.infer(); - - // SPDLOG_INFO("Kokoro model warm-up completed successfully"); - // } catch (const std::exception& ex) { - // SPDLOG_WARN("Kokoro model warm-up failed: {}. 
Continuing anyway...", ex.what()); - // } - // } }; using KokoroServableMap = std::unordered_map>; From a12dc60f2d3e465c5e907557cdd5e985bef53fc7 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 2 Mar 2026 13:06:23 +0100 Subject: [PATCH 10/11] change way espeak is built --- Dockerfile.redhat | 60 +++++++++++++++++++++++++++++++++--- Dockerfile.ubuntu | 61 +++++++++++++++++++++++++++++++++++-- third_party/espeak_ng/BUILD | 31 ++++++++----------- 3 files changed, 126 insertions(+), 26 deletions(-) diff --git a/Dockerfile.redhat b/Dockerfile.redhat index 41e02ecc12..da9885a92d 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -100,6 +100,37 @@ WORKDIR /ovms/third_party/opencv RUN if [ "$VERBOSE_LOGS" == "ON" ] ; then export VERBOSE=1 ; fi && ./install_opencv.sh ####### End of OpenCV +# Build espeak-ng from sources +FROM base_build as espeak_build + +ARG ESPEAK_NG_VERSION=1.51.1 +WORKDIR /tmp/espeak_build + +RUN dnf install -y libtool automake autoconf pkgconfig && \ + dnf clean all + +RUN cd /tmp/espeak_build && \ + git clone --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \ + ls -lah /tmp/espeak_build/ + +RUN cd /tmp/espeak_build/espeak-ng-src && \ + touch AUTHORS NEWS && \ + libtoolize --force --copy && \ + aclocal && \ + autoheader && \ + autoconf && \ + automake --add-missing --copy && \ + ./configure --prefix=/opt/espeak-ng \ + --disable-shared \ + --enable-static \ + --disable-mbrola \ + --disable-klatt \ + --without-audio && \ + make -j$(nproc) && \ + make install + +RUN rm -rf /tmp/espeak_build + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # FROM base_build as build ARG BASE_IMAGE @@ -127,11 +158,15 @@ RUN dnf install -y -d6 \ python3.12 \ python3.12-devel \ python3.12-pip \ - libicu-devel \ - espeak-ng \ - espeak-ng-devel && \ + libicu-devel && \ dnf clean all +# Copy espeak-ng built from sources +COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng +ENV 
PATH="/opt/espeak-ng/bin:${PATH}" +ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" +ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" + WORKDIR / ARG INSTALL_DRIVER_VERSION="24.52.32224" @@ -258,6 +293,17 @@ RUN ln -s /usr/lib64 /usr/lib/x86_64-linux-gnu COPY external /ovms/external/ COPY third_party /ovms/third_party +# Provide espeak-ng headers and static library inside workspace for Bazel +RUN mkdir -p /ovms/third_party/espeak_ng/include /ovms/third_party/espeak_ng/lib && \ + cp -r /opt/espeak-ng/include/* /ovms/third_party/espeak_ng/include/ && \ + mkdir -p /ovms/third_party/espeak_ng/include/espeak-ng && \ + if [ -d /opt/espeak-ng/include/espeak-ng ]; then \ + cp -r /opt/espeak-ng/include/espeak-ng/* /ovms/third_party/espeak_ng/include/espeak-ng/; \ + else \ + cp -v /opt/espeak-ng/include/*.h /ovms/third_party/espeak_ng/include/espeak-ng/; \ + fi && \ + cp -v /opt/espeak-ng/lib/libespeak-ng.a /ovms/third_party/espeak_ng/lib/ + # This path is required for namespace to setup Python dependencies for testing the binding COPY src/BUILD /ovms/src/BUILD COPY src/python/binding/BUILD /ovms/src/python/binding/BUILD @@ -406,6 +452,13 @@ LABEL base-image=${RELEASE_BASE_IMAGE} ENV PYTHONPATH=/ovms/lib/python:/ovms/python_deps WORKDIR / + +# Copy espeak-ng built from sources +COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng +ENV PATH="/opt/espeak-ng/bin:${PATH}" +ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" +ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" + SHELL ["/bin/bash", "-o", "pipefail", "-c"] COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh # hadolint ignore=DL3003,DL3041,SC2164,SC1091 @@ -418,7 +471,6 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do if ! 
[[ $debug_bazel_flags == *"py_off"* ]]; then \ $DNF_TOOL install -y python3.12-libs --setopt=install_weak_deps=0 --nodocs; \ fi ; \ - $DNF_TOOL install -y espeak-ng --setopt=install_weak_deps=0 --nodocs; \ $DNF_TOOL install -y shadow-utils; \ $DNF_TOOL clean all ; \ cp -v /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt ; \ diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index d7d2ace9f8..021d77f4ff 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -87,6 +87,38 @@ WORKDIR /ovms/third_party/opencv RUN ./install_opencv.sh ####### End of OpenCV +# Build espeak-ng from sources +FROM base_build as espeak_build + +ARG ESPEAK_NG_VERSION=1.51.1 +WORKDIR /tmp/espeak_build + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libtool automake autoconf pkg-config && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN cd /tmp/espeak_build && \ + git clone --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \ + ls -lah /tmp/espeak_build/ + +RUN cd /tmp/espeak_build/espeak-ng-src && \ + touch AUTHORS NEWS && \ + libtoolize --force --copy && \ + aclocal && \ + autoheader && \ + autoconf && \ + automake --add-missing --copy && \ + ./configure --prefix=/opt/espeak-ng \ + --disable-shared \ + --enable-static \ + --disable-mbrola \ + --disable-klatt \ + --without-audio && \ + make -j$(nproc) && \ + make install + +RUN rm -rf /tmp/espeak_build + ################### BASE BUILD ########################## FROM base_build as build ARG BASE_IMAGE @@ -99,9 +131,14 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi ENV PIP_BREAK_SYSTEM_PACKAGES=1 + +# Copy espeak-ng built from sources +COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng +ENV PATH="/opt/espeak-ng/bin:${PATH}" +ENV 
LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" +ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" + RUN apt-get update && apt-get install --no-install-recommends -y \ - espeak-ng \ - libespeak-ng-dev \ libgflags-dev \ bc \ ca-certificates \ @@ -265,6 +302,17 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/openvino/runtime/lib/intel64/: COPY --from=base_build /opt/opencv /opt/opencv/ COPY third_party /ovms/third_party/ +# Provide espeak-ng headers and static library inside workspace for Bazel +RUN mkdir -p /ovms/third_party/espeak_ng/include /ovms/third_party/espeak_ng/lib && \ + cp -r /opt/espeak-ng/include/* /ovms/third_party/espeak_ng/include/ && \ + mkdir -p /ovms/third_party/espeak_ng/include/espeak-ng && \ + if [ -d /opt/espeak-ng/include/espeak-ng ]; then \ + cp -r /opt/espeak-ng/include/espeak-ng/* /ovms/third_party/espeak_ng/include/espeak-ng/; \ + else \ + cp -v /opt/espeak-ng/include/*.h /ovms/third_party/espeak_ng/include/espeak-ng/; \ + fi && \ + cp -v /opt/espeak-ng/lib/libespeak-ng.a /ovms/third_party/espeak_ng/lib/ + # Mediapipe COPY BUILD.bazel /ovms/ COPY *\.bzl /ovms/ @@ -413,12 +461,19 @@ SHELL ["/bin/bash", "-c"] WORKDIR / COPY release_files/drivers /drivers + +# Copy espeak-ng built from sources +COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng +ENV PATH="/opt/espeak-ng/bin:${PATH}" +ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" +ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" + SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG INSTALL_DRIVER_VERSION="24.39.31294" COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh # hadolint ignore=DL3003,SC2164 RUN apt-get update ; \ - apt-get install -y --no-install-recommends curl ca-certificates libxml2 espeak-ng espeak-ng-data || exit 1; \ + apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \ if [ "$GPU" == "1" ] ; then \ /tmp/install_gpu_drivers.sh ; \ fi ; \ diff --git a/third_party/espeak_ng/BUILD 
b/third_party/espeak_ng/BUILD index 2c0a1cb09a..71f736f09c 100644 --- a/third_party/espeak_ng/BUILD +++ b/third_party/espeak_ng/BUILD @@ -8,24 +8,17 @@ config_setting( cc_library( name = "espeak_ng", - copts = select({ - ":is_macos": [ - # Adjust to where Homebrew (or your installer) puts headers - "-I" + "$(HOME)/.brew/opt/espeak-ng/include", - "-I" + "$(HOME)/.brew/opt/espeak-ng/include/espeak-ng", - ], - "//conditions:default": [ - # Typical on Debian/Ubuntu when installing libespeak-ng-dev - "-I/usr/include", - "-I/usr/include/espeak-ng", - ], - }), - linkopts = select({ - ":is_macos": [ - "-L" + "$(HOME)/.brew/opt/espeak-ng/lib", - "-lespeak-ng", - ], - "//conditions:default": ["-lespeak-ng"], - }), + hdrs = glob(["include/**/*.h"]), + includes = [ + "include", + "include/espeak-ng", + ], + deps = [":espeak_ng_lib"], visibility = ["//visibility:public"], ) + +cc_import( + name = "espeak_ng_lib", + static_library = "lib/libespeak-ng.a", + visibility = ["//visibility:private"], +) From d62263e5b4832b0a3d6aff271248ca74765e773a Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 2 Mar 2026 17:17:56 +0100 Subject: [PATCH 11/11] Build espeak from source --- Dockerfile.redhat | 27 +++------------------------ Dockerfile.ubuntu | 29 +++-------------------------- third_party/espeak_ng/BUILD | 16 ++++++---------- 3 files changed, 12 insertions(+), 60 deletions(-) diff --git a/Dockerfile.redhat b/Dockerfile.redhat index da9885a92d..3686d98493 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -120,7 +120,7 @@ RUN cd /tmp/espeak_build/espeak-ng-src && \ autoheader && \ autoconf && \ automake --add-missing --copy && \ - ./configure --prefix=/opt/espeak-ng \ + ./configure --prefix=/usr/local \ --disable-shared \ --enable-static \ --disable-mbrola \ @@ -161,12 +161,6 @@ RUN dnf install -y -d6 \ libicu-devel && \ dnf clean all -# Copy espeak-ng built from sources -COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng -ENV PATH="/opt/espeak-ng/bin:${PATH}" -ENV 
LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" -ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" - WORKDIR / ARG INSTALL_DRIVER_VERSION="24.52.32224" @@ -293,17 +287,6 @@ RUN ln -s /usr/lib64 /usr/lib/x86_64-linux-gnu COPY external /ovms/external/ COPY third_party /ovms/third_party -# Provide espeak-ng headers and static library inside workspace for Bazel -RUN mkdir -p /ovms/third_party/espeak_ng/include /ovms/third_party/espeak_ng/lib && \ - cp -r /opt/espeak-ng/include/* /ovms/third_party/espeak_ng/include/ && \ - mkdir -p /ovms/third_party/espeak_ng/include/espeak-ng && \ - if [ -d /opt/espeak-ng/include/espeak-ng ]; then \ - cp -r /opt/espeak-ng/include/espeak-ng/* /ovms/third_party/espeak_ng/include/espeak-ng/; \ - else \ - cp -v /opt/espeak-ng/include/*.h /ovms/third_party/espeak_ng/include/espeak-ng/; \ - fi && \ - cp -v /opt/espeak-ng/lib/libespeak-ng.a /ovms/third_party/espeak_ng/lib/ - # This path is required for namespace to setup Python dependencies for testing the binding COPY src/BUILD /ovms/src/BUILD COPY src/python/binding/BUILD /ovms/src/python/binding/BUILD @@ -452,12 +435,8 @@ LABEL base-image=${RELEASE_BASE_IMAGE} ENV PYTHONPATH=/ovms/lib/python:/ovms/python_deps WORKDIR / - -# Copy espeak-ng built from sources -COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng -ENV PATH="/opt/espeak-ng/bin:${PATH}" -ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" -ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" +COPY --from=base_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data +ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data SHELL ["/bin/bash", "-o", "pipefail", "-c"] COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index 021d77f4ff..33f6cbc4e0 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -88,7 +88,6 @@ RUN ./install_opencv.sh ####### End of OpenCV # Build espeak-ng from sources -FROM base_build as espeak_build ARG 
ESPEAK_NG_VERSION=1.51.1 WORKDIR /tmp/espeak_build @@ -108,7 +107,7 @@ RUN cd /tmp/espeak_build/espeak-ng-src && \ autoheader && \ autoconf && \ automake --add-missing --copy && \ - ./configure --prefix=/opt/espeak-ng \ + ./configure --prefix=/usr/local \ --disable-shared \ --enable-static \ --disable-mbrola \ @@ -132,12 +131,6 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi ENV PIP_BREAK_SYSTEM_PACKAGES=1 -# Copy espeak-ng built from sources -COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng -ENV PATH="/opt/espeak-ng/bin:${PATH}" -ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" -ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" - RUN apt-get update && apt-get install --no-install-recommends -y \ libgflags-dev \ bc \ @@ -301,18 +294,6 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/openvino/runtime/lib/intel64/: # FROM BASE BUILD COPY --from=base_build /opt/opencv /opt/opencv/ COPY third_party /ovms/third_party/ - -# Provide espeak-ng headers and static library inside workspace for Bazel -RUN mkdir -p /ovms/third_party/espeak_ng/include /ovms/third_party/espeak_ng/lib && \ - cp -r /opt/espeak-ng/include/* /ovms/third_party/espeak_ng/include/ && \ - mkdir -p /ovms/third_party/espeak_ng/include/espeak-ng && \ - if [ -d /opt/espeak-ng/include/espeak-ng ]; then \ - cp -r /opt/espeak-ng/include/espeak-ng/* /ovms/third_party/espeak_ng/include/espeak-ng/; \ - else \ - cp -v /opt/espeak-ng/include/*.h /ovms/third_party/espeak_ng/include/espeak-ng/; \ - fi && \ - cp -v /opt/espeak-ng/lib/libespeak-ng.a /ovms/third_party/espeak_ng/lib/ - # Mediapipe COPY BUILD.bazel /ovms/ COPY *\.bzl /ovms/ @@ -461,12 +442,8 @@ SHELL ["/bin/bash", "-c"] WORKDIR / COPY release_files/drivers /drivers - -# Copy espeak-ng built from sources -COPY --from=espeak_build /opt/espeak-ng /opt/espeak-ng -ENV PATH="/opt/espeak-ng/bin:${PATH}" -ENV LD_LIBRARY_PATH="/opt/espeak-ng/lib:${LD_LIBRARY_PATH}" 
-ENV ESPEAK_DATA_PATH="/opt/espeak-ng/share/espeak-ng-data" +COPY --from=base_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data +ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data SHELL ["/bin/bash", "-o", "pipefail", "-c"] ARG INSTALL_DRIVER_VERSION="24.39.31294" diff --git a/third_party/espeak_ng/BUILD b/third_party/espeak_ng/BUILD index 71f736f09c..31f51b73da 100644 --- a/third_party/espeak_ng/BUILD +++ b/third_party/espeak_ng/BUILD @@ -8,17 +8,13 @@ config_setting( cc_library( name = "espeak_ng", - hdrs = glob(["include/**/*.h"]), + linkopts = [ + "-L/usr/local/lib", + "-lespeak-ng", + ], includes = [ - "include", - "include/espeak-ng", + "/usr/local/include", + "/usr/local/include/espeak-ng", ], - deps = [":espeak_ng_lib"], visibility = ["//visibility:public"], ) - -cc_import( - name = "espeak_ng_lib", - static_library = "lib/libespeak-ng.a", - visibility = ["//visibility:private"], -)