diff --git a/Dockerfile.redhat b/Dockerfile.redhat index bc574eaaf2..3686d98493 100644 --- a/Dockerfile.redhat +++ b/Dockerfile.redhat @@ -100,6 +100,37 @@ WORKDIR /ovms/third_party/opencv RUN if [ "$VERBOSE_LOGS" == "ON" ] ; then export VERBOSE=1 ; fi && ./install_opencv.sh ####### End of OpenCV +# Build espeak-ng from sources +FROM base_build as espeak_build + +ARG ESPEAK_NG_VERSION=1.51.1 +WORKDIR /tmp/espeak_build + +RUN dnf install -y libtool automake autoconf pkgconfig && \ + dnf clean all + +RUN cd /tmp/espeak_build && \ + git clone --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \ + ls -lah /tmp/espeak_build/ + +RUN cd /tmp/espeak_build/espeak-ng-src && \ + touch AUTHORS NEWS && \ + libtoolize --force --copy && \ + aclocal && \ + autoheader && \ + autoconf && \ + automake --add-missing --copy && \ + ./configure --prefix=/usr/local \ + --disable-shared \ + --enable-static \ + --disable-mbrola \ + --disable-klatt \ + --without-audio && \ + make -j$(nproc) && \ + make install + +RUN rm -rf /tmp/espeak_build + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # FROM base_build as build ARG BASE_IMAGE @@ -404,6 +435,9 @@ LABEL base-image=${RELEASE_BASE_IMAGE} ENV PYTHONPATH=/ovms/lib/python:/ovms/python_deps WORKDIR / +COPY --from=espeak_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data +ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data + SHELL ["/bin/bash", "-o", "pipefail", "-c"] COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh # hadolint ignore=DL3003,DL3041,SC2164,SC1091 diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index 15e47daf20..33f6cbc4e0 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -87,6 +87,37 @@ WORKDIR /ovms/third_party/opencv RUN ./install_opencv.sh ####### End of OpenCV +# Build espeak-ng from sources + +ARG ESPEAK_NG_VERSION=1.51.1 +WORKDIR /tmp/espeak_build + +RUN apt-get update && apt-get install -y --no-install-recommends \ 
+ libtool automake autoconf pkg-config && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN cd /tmp/espeak_build && \ + git clone --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \ + ls -lah /tmp/espeak_build/ + +RUN cd /tmp/espeak_build/espeak-ng-src && \ + touch AUTHORS NEWS && \ + libtoolize --force --copy && \ + aclocal && \ + autoheader && \ + autoconf && \ + automake --add-missing --copy && \ + ./configure --prefix=/usr/local \ + --disable-shared \ + --enable-static \ + --disable-mbrola \ + --disable-klatt \ + --without-audio && \ + make -j$(nproc) && \ + make install + +RUN rm -rf /tmp/espeak_build + ################### BASE BUILD ########################## FROM base_build as build ARG BASE_IMAGE @@ -99,6 +130,7 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \ apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \ apt-get clean && rm -rf /var/lib/apt/lists/* ; fi ENV PIP_BREAK_SYSTEM_PACKAGES=1 + RUN apt-get update && apt-get install --no-install-recommends -y \ libgflags-dev \ bc \ @@ -262,7 +294,6 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/openvino/runtime/lib/intel64/: # FROM BASE BUILD COPY --from=base_build /opt/opencv /opt/opencv/ COPY third_party /ovms/third_party/ - # Mediapipe COPY BUILD.bazel /ovms/ COPY *\.bzl /ovms/ @@ -392,6 +423,9 @@ RUN if [ -f /ovms_release/lib/libovms_shared.so ] ; then mv /ovms_release/lib/li # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # FROM $BASE_IMAGE as release +ARG http_proxy +ARG https_proxy +ARG no_proxy ARG INSTALL_RPMS_FROM_URL= ARG INSTALL_DRIVER_VERSION="24.26.30049" ARG GPU=0 @@ -408,6 +442,9 @@ SHELL ["/bin/bash", "-c"] WORKDIR / COPY release_files/drivers /drivers +COPY --from=base_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data +ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data + SHELL ["/bin/bash", "-o", 
"pipefail", "-c"] ARG INSTALL_DRIVER_VERSION="24.39.31294" COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh diff --git a/demos/audio/export_kokoro.py b/demos/audio/export_kokoro.py new file mode 100644 index 0000000000..8ca3ed89f6 --- /dev/null +++ b/demos/audio/export_kokoro.py @@ -0,0 +1,141 @@ +# +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +import torch +import json +import time +from pathlib import Path +from kokoro.model import KModel +from kokoro import KPipeline +import openvino as ov +import shutil + +MAX_SEQ_LENGTH = 500 + + +class KokoroTTSPipeline: + def __init__(self): + model_id = "hexgrad/Kokoro-82M" + self.pipeline = KPipeline(lang_code="a", repo_id=model_id) + + def __call__(self, text: str, voice: str = "af_heart"): + with torch.no_grad(): + generator = self.pipeline(text, voice=voice) + result = next(generator) + return result.audio + + +class OVKModel(KModel): + def __init__(self, model_dir: Path, device: str, plugin_config: dict = {}, repo_id: str = "hexgrad/Kokoro-82M"): + torch.nn.Module.__init__(self) + + core = ov.Core() + + self.repo_id = repo_id + with (model_dir / "config.json").open("r", encoding="utf-8") as f: + config = json.load(f) + self.vocab = config["vocab"] + print("Starting to compile OpenVINO model on device:", device) + + start = time.time() + self.model = core.compile_model(model_dir / "openvino_model.xml", device.upper(), config=plugin_config) + print(f"Model compiled successfully in {time.time() - start:.2f}s.") + self.context_length = config["plbert"]["max_position_embeddings"] + + @property + def device(self): + return torch.device("cpu") + + def forward_with_tokens(self, input_ids: torch.LongTensor, ref_s: torch.FloatTensor, speed: float = 1) -> tuple[torch.FloatTensor, torch.LongTensor]: + text_len = input_ids.shape[-1] + + if text_len < MAX_SEQ_LENGTH: + # 0 in this model context is acting as BOS/EOS/PAD. + # Since 0 causes artifacts, we might consider space (16) or period (4). 
+ padding_value = 16 + input_ids = torch.nn.functional.pad(input_ids, (0, MAX_SEQ_LENGTH - text_len), value=padding_value) + + start = time.time() + print("Running inference on OpenVINO model...") + outputs = self.model([input_ids, ref_s, torch.tensor(speed)]) + print(f"Inference completed in {time.time() - start:.2f}s.") + + audio = torch.from_numpy(outputs[0]) + pred_dur = torch.from_numpy(outputs[1]) + + if text_len < MAX_SEQ_LENGTH: + pred_dur = pred_dur[:text_len] + # Approximate audio trimming based on duration ratio + total_dur = outputs[1].sum() + valid_dur = pred_dur.sum() + if total_dur > 0: + audio_keep = int(audio.shape[-1] * (valid_dur / total_dur)) + audio = audio[:audio_keep] + + return audio, pred_dur + + @staticmethod + def download_and_convert(model_dir: Path, repo_id: str, ttsPipeline: KokoroTTSPipeline): + import openvino as ov + from huggingface_hub import hf_hub_download + import gc + + if not (model_dir / "openvino_model.xml").exists(): + print(f"Converting Kokoro model to OpenVINO format at {model_dir}...") + model = ttsPipeline.pipeline.model + model.forward = model.forward_with_tokens + input_ids = torch.randint(1, 100, (48,)).numpy() + input_ids = torch.LongTensor([[0, *input_ids, 0]]) + style = torch.randn(1, 256) + speed = torch.randint(1, 10, (1,), dtype=torch.float32) + + ov_model = ov.convert_model(model, example_input=(input_ids, style, speed), input=[ + ov.PartialShape("[1, 2..]"), ov.PartialShape([1, -1])]) + ov.save_model(ov_model, model_dir / "openvino_model.xml") + hf_hub_download(repo_id=repo_id, filename="config.json", local_dir=model_dir) + else: + print(f"OpenVINO model already exists at {model_dir}, skipping conversion.") + + gc.collect() + + @staticmethod + def convert_to_static(input_model_dir: Path, output_model_dir: Path): + import openvino as ov + + print(f"Converting OpenVINO model to static shapes at {input_model_dir}...") + core = ov.Core() + model = core.read_model(input_model_dir / "openvino_model.xml") + 
static_shape = {"input_ids": [1, MAX_SEQ_LENGTH], "ref_s": [1, 256], "speed": [1], } + model.reshape(static_shape) + print("Reshaped model inputs:", model.inputs) + ov.save_model(model, output_model_dir / "openvino_model.xml") + print("Conversion to static shapes completed.") + # Copy config file + shutil.copy(input_model_dir / "config.json", output_model_dir / "config.json") + + +if __name__ == "__main__": + + model_id = "hexgrad/Kokoro-82M-v1.1-zh" + + # Download model from Hugging Face and convert to OpenVINO format. + pipeline = KokoroTTSPipeline() + + # Convert and save the Kokoro model to OpenVINO format + OVKModel.download_and_convert(Path("./kokoro_openvino_model_zh"), repo_id=model_id, ttsPipeline=pipeline) + + # To run inference on NPU, model must have static input shapes + OVKModel.convert_to_static(Path("./kokoro_openvino_model_zh"), Path("./kokoro_static_openvino_model_zh")) + # # Execution on NPU require config file + # config = { + # "NPU": { + # "NPU_USE_NPUW": "YES", + # "NPUW_DEVICES": "NPU,CPU", + # "NPUW_KOKORO": "YES", + # } + # } + + # # NPUW_CACHE_DIR can be used to avoid compilation on every run + # config["NPU"]["NPUW_CACHE_DIR"] = "./npu_cache_kokoro" \ No newline at end of file diff --git a/demos/audio/tts_test_strings.py b/demos/audio/tts_test_strings.py new file mode 100644 index 0000000000..79b1194a3e --- /dev/null +++ b/demos/audio/tts_test_strings.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +"""Send a battery of tricky TTS test strings to an OpenAI-compatible +speech endpoint, one by one, and save each result as a numbered WAV file. + +Usage: + python tts_test_strings.py --endpoint http://localhost:8000/v3 \ + --model kokoro \ + [--voice None] \ + [--output-dir tts_output] +""" + +import argparse +import os +import sys +import time + +from openai import OpenAI + +TEST_STRINGS = [ + 'Dr. A. B. Carter Jr. met Sen. O\'Neill at 5 p.m., Wed., in Washington, D.C.', + 'Mr. Smith, Ph.D., arrived on Fri. at 6:30 a.m.; Mrs. 
Jones left at noon.', + 'We meet on 01/02/2025 at 05:30 IST; is that India or Israel time?', + 'The deadline is 2025\u201102\u201101 23:59 UTC\u221205:00 (EST).', + 'He finished 1st; she was 22nd\u2014barely.', + 'Prices: $1,234.56 vs \u20ac1.234,56; also \u00a512 345 (thin space).', + 'Add \u00be cup, then \u00bd tsp; total \u2248 1\u00bc cups.', + 'Chapter XLIV starts on page ix; version v2.0.0 follows v1.12.9.', + 'Dose: 5 mg vs 5 \u03bcg\u2014don\'t confuse micrograms with milligrams.', + 'Avogadro\'s number is 6.022e23; \u03c0 \u2248 3.14159; \u221a2 \u2248 1.4142.', + 'Temperature dropped to \u221210 \u00b0C (14 \u00b0F) with 90% RH.', + 'Visit https://example.com/a/b?x=1&y=2#frag or email ops+alerts@example.org.', + 'Open C:\\Program Files\\Project\\config.yaml or /usr/local/bin/run.sh.', + '.NET, Node.js, C#, C++17, and Rust\'s crate\u2011names\u2011with\u2011hyphens.', + '"WYSIWYG," "GIF" (hard or soft g?), "SQL" (sequel or S\u2011Q\u2011L?).', + 'I will present the present to the lead singer who stepped on the lead.', + 'They desert the desert; the dove dove; he wound the wound.', + 'Please record the record before the minute is up in a minute.', + 'She sells seashells by the seashore; truly Irish wristwatch.', + 'Unique New York, toy boat, red leather yellow leather.', + 'A na\u00efve co\u00f6perative fa\u00e7ade in S\u00e3o Paulo; \u0141\u00f3d\u017a and Krak\u00f3w in Poland.', + 'Pi\u00f1ata, jalape\u00f1o, cr\u00e8me br\u00fbl\u00e9e, bouillabaisse, d\u00e9j\u00e0 vu.', + '\U0001f44d\U0001f3fb is a thumbs\u2011up with light skin tone; \U0001f9d1\u200d\U0001f4bb writes code; \U0001f468\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466 is a family; \U0001f1f5\U0001f1f1 is a flag.', + 'Faces: \U0001f642\U0001f609\U0001f610\U0001f611\U0001f636; hearts: \u2764\ufe0f\U0001f9e1\U0001f49b\U0001f49a\U0001f499; mixed: \U0001f937\u200d\u2642\ufe0f\U0001f926\u200d\u2640\ufe0f.', + 'Latin "A" vs Cyrillic "\u0410"; Greek "\u03c1" vs Latin "p"; micro "\u00b5" vs 
Greek "\u03bc".', + '\u05e9\u05dc\u05d5\u05dd and \u0645\u0631\u062d\u0628\u064b\u0627 appear with left\u2011to\u2011right text in one line.', + 'Prosody markers: \u02c8primary, \u02ccsecondary, and length \u02d0 are tricky for tokenizers.', + 'Arrows for intonation: \u2197 rising, \u2198 falling, \u2193 drop.', + 'He said, "She replied, \'no\u2014never\u2026\'," then left\u2014silently.', + 'Parentheticals (like this\u2014really!) and em\u2011dashes\u2014here\u2014confuse prosody.', + 'Let f(x)=x^2; then d/dx x^2=2x; \u2202/\u2202x is the operator.', + 'Inline code x += 1; and TeX E=mc^2 should be read clearly.', + 'N,N\u2011Diethyl\u2011meta\u2011toluamide (DEET) differs from p\u2011xylene and m\u2011cresol.', + 'The RFC 7231/HTTP\u2011semantics "GET" vs "HEAD" distinction matters.', + 'Read "macOS" vs "Mac OS", "iOS", "SQL", "URL", and "S3" correctly.', +] + + +def main(): + parser = argparse.ArgumentParser( + description="Send TTS test strings to an OpenAI-compatible speech endpoint." + ) + parser.add_argument( + "--endpoint", required=True, + help="Base URL of the API (e.g. http://localhost:8000/v3)" + ) + parser.add_argument( + "--model", required=True, + help="Model name to use for speech generation" + ) + parser.add_argument( + "--voice", default=None, + help="Voice name (default: voice1)" + ) + parser.add_argument( + "--output-dir", default="tts_output", + help="Directory to save output WAV files (default: tts_output)" + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + client = OpenAI(base_url=args.endpoint, api_key="unused") + + total = len(TEST_STRINGS) + print(f"Sending {total} test strings to {args.endpoint} (model={args.model}, voice={args.voice})") + print(f"Output directory: {args.output_dir}\n") + + succeeded = 0 + failed = 0 + total_size_kb = 0.0 + t_start = time.time() + + for idx, text in enumerate(TEST_STRINGS, start=1): + preview = text[:80] + ("..." 
if len(text) > 80 else "") + print(f"[{idx:2d}/{total}] {preview}") + + out_path = os.path.join(args.output_dir, f"{idx:02d}.wav") + t0 = time.time() + try: + response = client.audio.speech.create( + model=args.model, + voice=args.voice, + input=text, + ) + response.write_to_file(out_path) + elapsed = time.time() - t0 + size_kb = os.path.getsize(out_path) / 1024 + total_size_kb += size_kb + succeeded += 1 + print(f" -> {out_path} ({size_kb:.1f} KB, {elapsed:.2f}s)") + except Exception as exc: + elapsed = time.time() - t0 + failed += 1 + print(f" !! FAILED after {elapsed:.2f}s: {exc}", file=sys.stderr) + + total_elapsed = time.time() - t_start + print(f"\n{'='*60}") + print(f"Summary: {succeeded} succeeded, {failed} failed out of {total}") + print(f"Total time: {total_elapsed:.2f}s (avg {total_elapsed/total:.2f}s per string)") + print(f"Total audio size: {total_size_kb:.1f} KB") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() diff --git a/src/BUILD b/src/BUILD index 71321ca7ee..0318099727 100644 --- a/src/BUILD +++ b/src/BUILD @@ -563,6 +563,7 @@ ovms_cc_library( "//src/image_gen:image_gen_calculator", "//src/audio/speech_to_text:s2t_calculator", "//src/audio/text_to_speech:t2s_calculator", + "//src/audio/kokoro:kokoro_calculator", "//src/audio:audio_utils", "//src/image_gen:imagegen_init", "//src/llm:openai_completions_api_handler", diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp index 77b38e70df..59668be23f 100644 --- a/src/audio/audio_utils.cpp +++ b/src/audio/audio_utils.cpp @@ -22,6 +22,7 @@ #include "src/logging.hpp" #include #include +#include #include #include #pragma warning(push) @@ -188,3 +189,33 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); } + +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, 
size_t speechSize, const float* waveformPtr) { + enum : unsigned int { + OUTPUT_PREPARATION, + TIMER_END + }; + Timer timer; + timer.start(OUTPUT_PREPARATION); + + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + format.channels = 1; + format.sampleRate = 24000; // Kokoro native sample rate + format.bitsPerSample = 32; + drwav wav; + + auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr); + if (status == DRWAV_FALSE) { + throw std::runtime_error("Failed to initialize WAV writer"); + } + drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, speechSize, waveformPtr); + if (framesWritten != speechSize) { + throw std::runtime_error("Failed to write all frames"); + } + drwav_uninit(&wav); + timer.stop(OUTPUT_PREPARATION); + auto outputPreparationTime = (timer.elapsed(OUTPUT_PREPARATION)) / 1000; + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime); +} diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp index cbeea8b457..0928d03f3d 100644 --- a/src/audio/audio_utils.hpp +++ b/src/audio/audio_utils.hpp @@ -25,3 +25,4 @@ bool isWavBuffer(const std::string buf); std::vector readWav(const std::string_view& wavData); std::vector readMp3(const std::string_view& mp3Data); void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr); +void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr); diff --git a/src/audio/kokoro/BUILD b/src/audio/kokoro/BUILD new file mode 100644 index 0000000000..d7d3b64b1a --- /dev/null +++ b/src/audio/kokoro/BUILD @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") +load("//:common_settings.bzl", "ovms_cc_library") + +ovms_cc_library( + name = "kokoro_servable", + hdrs = ["kokoro_servable.hpp"], + deps= ["//third_party:openvino", + "//src:libovms_ovinferrequestsqueue", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "kokoro_calculator", + srcs = ["kokoro_calculator.cc"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//src:httppayload", + "//src:libovmslogging", + "kokoro_calculator_cc_proto", + "//src/port:dr_audio", + "//src/port:rapidjson_stringbuffer", + "//src/port:rapidjson_writer", + ":kokoro_servable", + "//third_party:genai", + "//src/audio:audio_utils", + "//src:executingstreamidguard", + "//src:model_metric_reporter", + "//third_party/espeak_ng:espeak_ng", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "kokoro_calculator_proto", + srcs = ["kokoro_calculator.proto"], + visibility = ["//visibility:private"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_options_proto", + "@mediapipe//mediapipe/framework:calculator_proto", + ], +) diff --git a/src/audio/kokoro/kokoro_calculator.cc b/src/audio/kokoro/kokoro_calculator.cc new file mode 100644 index 0000000000..728e0f88b4 --- /dev/null +++ b/src/audio/kokoro/kokoro_calculator.cc @@ -0,0 +1,433 @@ +//***************************************************************************** +// Copyright 2025 Intel 
Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/canonical_errors.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "src/audio/audio_utils.hpp" +#include "src/http_payload.hpp" +#include "src/logging.hpp" +#include "src/port/dr_audio.hpp" + +#include "../../model_metric_reporter.hpp" +#include "../../executingstreamidguard.hpp" + +#pragma warning(push) +#pragma warning(disable : 6001 4324 6385 6386) +#include "absl/strings/str_cat.h" +#pragma warning(pop) + +#include + +#include "kokoro_servable.hpp" + +#ifdef _WIN32 +#include +#include +#endif + +using namespace ovms; + +namespace { + +#ifndef espeakPHONEMES_IPA +#define espeakPHONEMES_IPA 0x02 +#endif +#ifndef espeakPHONEMES_NO_STRESS +#define espeakPHONEMES_NO_STRESS 0x08 +#endif + +std::string retone(const std::string& p) { + std::string result = p; + + auto replaceAll = [](std::string& s, const std::string& from, const std::string& to) { + size_t pos = 0; + while ((pos = s.find(from, pos)) != std::string::npos) { + s.replace(pos, from.size(), to); + pos += to.size(); + } + }; + + 
// Tone mark replacements + replaceAll(result, "˧˩˧", "↓"); // third tone + replaceAll(result, "˧˥", "↗"); // second tone + replaceAll(result, "˥˩", "↘"); // fourth tone + replaceAll(result, "˥", "→"); // first tone + + // Unicode character replacements (UTF-8 encoded) + replaceAll(result, "\xCA\x97\xCC\x89", "ɨ"); // chr(635)+chr(809) + replaceAll(result, "\xCA\x91\xCC\x89", "ɨ"); // chr(633)+chr(809) + + // Verify chr(809) removed + if (result.find("\xCC\x89") != std::string::npos) { + SPDLOG_WARN("Combining diacritic (chr 809) still present: {}", result); + } + + return result; +} + +std::string getEspeakVoice(const std::string& isoLanguageCode) { + // ISO 639-1 codes with optional region codes + if (isoLanguageCode == "en-us") { + return "en-us"; // American English (default for 'en') + } else if (isoLanguageCode == "en-gb") { + return "en"; // British English + } else if (isoLanguageCode == "en") { + return "en-us"; // Default to American English when only 'en' specified + } else if (isoLanguageCode == "es") { + return "es"; + } else if (isoLanguageCode == "fr") { + return "fr"; + } else if (isoLanguageCode == "hi") { + return "hi"; + } else if (isoLanguageCode == "it") { + return "it"; + } else if (isoLanguageCode == "ja") { + return "ja"; + } else if (isoLanguageCode == "pt-br") { + return "pt"; // Brazilian Portuguese + } else if (isoLanguageCode == "zh" || isoLanguageCode == "zh-cn") { + return "cmn-latn-pinyin"; // Mandarin Chinese + } + return ""; // Unsupported +} + +bool isSupportedLanguage(const std::string& isoLanguageCode) { + // Only accept ISO 639-1 codes and regional variants + return !getEspeakVoice(isoLanguageCode).empty(); +} + +void espeakPhonemizeAll(const std::string& textUtf8, std::string& outIpa, const std::string& language = "en", bool noStress = true) { + outIpa.clear(); + auto& espeak = ovms::EspeakInstance::instance(); + if (!espeak.isReady()) { + SPDLOG_ERROR("eSpeak not initialized"); + return; + } + + std::lock_guard 
guard(espeak.mutex()); + + // Get the eSpeak voice name from the ISO language code + // Kokoro supports 9 languages: American English, British English, Spanish, French, Hindi, Italian, Japanese, Brazilian Portuguese, Mandarin Chinese + std::string voiceName = getEspeakVoice(language); + if (voiceName.empty()) { + // This should not happen if validation was done, but fallback just in case + SPDLOG_ERROR("Invalid language code '{}' passed to espeakPhonemizeAll", language); + voiceName = "en-us"; + } + if (espeak_SetVoiceByName(voiceName.c_str()) != EE_OK) { + SPDLOG_ERROR("Failed to set eSpeak voice '{}'", voiceName); + if (voiceName != "en-us" && espeak_SetVoiceByName("en-us") == EE_OK) { + voiceName = "en-us"; + } else { + return; + } + } + + const int mode = espeakPHONEMES_IPA | (noStress ? espeakPHONEMES_NO_STRESS : 0); + const void* pos = static_cast(textUtf8.c_str()); + const char* endPtr = static_cast(pos) + textUtf8.size(); + std::string rawIpa; + + while (pos && static_cast(pos) < endPtr) { + const char* ipaChunk = espeak_TextToPhonemes(&pos, espeakCHARS_UTF8, mode); + if (ipaChunk && *ipaChunk) { + if (!rawIpa.empty()) { + rawIpa.push_back(' '); + } + rawIpa.append(ipaChunk); + } + } + + // Strip combining diacriticals (U+0300..U+036F) and collapse spaces + std::string cleaned; + cleaned.reserve(rawIpa.size()); + for (size_t i = 0; i < rawIpa.size(); ++i) { + unsigned char c = static_cast(rawIpa[i]); + if (i + 1 < rawIpa.size()) { + unsigned char next = static_cast(rawIpa[i + 1]); + if ((c == 0xCC && next >= 0x80) || (c == 0xCD && next <= 0xAF)) { + i++; + continue; + } + } + cleaned.push_back(c); + } + + outIpa.reserve(cleaned.size()); + bool lastSpace = false; + for (char c : cleaned) { + if (std::isspace(static_cast(c))) { + if (!lastSpace) { + outIpa.push_back(' '); + lastSpace = true; + } + } else { + outIpa.push_back(c); + lastSpace = false; + } + } + + if (!outIpa.empty() && std::isspace(static_cast(outIpa.back()))) { + outIpa.pop_back(); + } + + 
SPDLOG_DEBUG("IPA phonemes: '{}' (length: {})", outIpa, outIpa.size()); +} + + +size_t utf8CharLen(unsigned char lead) { + if (lead < 0x80) + return 1; + if ((lead >> 5) == 0x6) + return 2; + if ((lead >> 4) == 0xE) + return 3; + if ((lead >> 3) == 0x1E) + return 4; + return 1; +} + +void tokenize(const std::string& textUtf8, + std::vector& tokenIds, + const ovms::VocabIndex& ix, + const std::string& language = "en") { + tokenIds.clear(); + // Reserve estimated capacity to avoid reallocations + tokenIds.reserve(textUtf8.size() / 2); + + size_t pos = 0; + const size_t n = textUtf8.size(); + size_t unknownCount = 0; + + while (pos < n) { + size_t maxTry = std::min(ix.max_token_bytes, n - pos); + int foundId = -1; + size_t foundLen = 0; + + for (size_t len = maxTry; len > 0; --len) { + auto it = ix.by_token.find(std::string(textUtf8.data() + pos, len)); + if (it != ix.by_token.end()) { + foundId = it->second; + foundLen = len; + break; + } + } + + if (foundId >= 0) { + tokenIds.push_back(foundId); + pos += foundLen; + } else { + const unsigned char lead = static_cast(textUtf8[pos]); + const size_t adv = utf8CharLen(lead); + std::string unknownBytes(textUtf8.data() + pos, std::min(adv, n - pos)); + unknownCount++; + SPDLOG_DEBUG("Tokenizer [lang={}]: unknown phoneme at pos {}: '{}' (skipping)", + language, pos, unknownBytes); + pos += std::min(adv, n - pos); + } + } + if (unknownCount > 0) { + SPDLOG_WARN("Tokenize [lang={}]: {} unknown phonemes found. Produced {} token ids. 
" + "Consider updating vocabulary for better {} speech quality.", + language, unknownCount, tokenIds.size(), language); + } else { + SPDLOG_DEBUG("Tokenize [lang={}]: produced {} ids without unknown phonemes", language, tokenIds.size()); + } +} +} // namespace + +namespace mediapipe { + +const std::string KOKORO_SESSION_SIDE_PACKET_TAG = "KOKORO_NODE_RESOURCES"; + +class KokoroCalculator : public CalculatorBase { + static const std::string INPUT_TAG_NAME; + static const std::string OUTPUT_TAG_NAME; + std::string defaultLanguage; // Language configured in graph pbtxt + +public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + cc->Inputs().Tag(INPUT_TAG_NAME).Set(); + cc->InputSidePackets().Tag(KOKORO_SESSION_SIDE_PACKET_TAG).Set(); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); + return absl::OkStatus(); + } + + absl::Status Close(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Close", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Open", cc->NodeName()); + + // Read language from graph configuration + const auto& options = cc->Options(); + this->defaultLanguage = options.has_language() ? options.language() : "en"; + + // Normalize language code to lowercase + std::transform(this->defaultLanguage.begin(), this->defaultLanguage.end(), this->defaultLanguage.begin(), ::tolower); + + // Validate language is supported + if (!isSupportedLanguage(this->defaultLanguage)) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid language in graph config: '", this->defaultLanguage, "'. ", + "Supported ISO 639-1 language codes: en, es, fr, hi, it, ja, pt-br, zh. 
", + "Regional variants: en-us, en-gb, pt-br, zh-cn" + )); + } + + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, + "KokoroCalculator [Node: {}] configured for language: {}", + cc->NodeName(), this->defaultLanguage); + + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process start", cc->NodeName()); + try { + KokoroServableMap servablesMap = cc->InputSidePackets() + .Tag(KOKORO_SESSION_SIDE_PACKET_TAG) + .Get(); + auto servableIt = servablesMap.find(cc->NodeName()); + RET_CHECK(servableIt != servablesMap.end()) + << "Could not find initialized Kokoro node named: " << cc->NodeName(); + auto servable = servableIt->second; + + const auto& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + auto it = payload.parsedJson->FindMember("input"); + RET_CHECK(it != payload.parsedJson->MemberEnd()) << "Missing 'input' in request"; + RET_CHECK(it->value.IsString()) << "'input' must be a string"; + const std::string text = it->value.GetString(); + + // Read optional "voice" parameter (OpenAI TTS API) + std::string voiceName; + auto voiceIt = payload.parsedJson->FindMember("voice"); + if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { + voiceName = voiceIt->value.GetString(); + } + + // Language is configured in the graph pbtxt, not from request + // Use the defaultLanguage set during Open() + const std::string language = this->defaultLanguage; + SPDLOG_DEBUG("Using configured language: {}", language); + + // Text -> IPA phonemization + std::string phonemes; + + // Use eSpeak for all languages + espeakPhonemizeAll(text, phonemes, language, /*noStress=*/false); + if(language == "zh" || language == "zh-cn"){ + phonemes = retone(phonemes); + } + + SPDLOG_DEBUG("Input text: '{}' (language: {}), IPA phonemes ({} chars): '{}'", text, language, phonemes.size(), phonemes); + + // Preserve trailing punctuation from original text (eSpeak strips it) + 
// if (!text.empty()) { + // char last = text.back(); + // if (last == '.' || last == '!' || last == '?' || last == ';' || last == ':' || last == ',') { + // phonemes.push_back(last); + // } + // } + SPDLOG_DEBUG("After E2M mapping ({} chars): '{}'", phonemes.size(), phonemes); + // IPA -> Kokoro token IDs + const auto& vocabIx = servable->getVocabIndex(); + std::vector tokenIds; + tokenize(phonemes, tokenIds, vocabIx, language); + + // Wrap with PAD token (id=0) at both ends — matches official + // forward_with_tokens: input_ids = [[0, *tokens, 0]] + tokenIds.insert(tokenIds.begin(), 0); + tokenIds.push_back(0); + + // Voice embedding — select slice from voice pack based on content token count + size_t numContentTokens = tokenIds.size() >= 2 ? tokenIds.size() - 2 : 0; // exclude BOS pad + EOS + const float* voiceSlice = servable->getVoiceSlice(voiceName, numContentTokens); + RET_CHECK(voiceSlice != nullptr) << "No voice pack loaded (place .bin files in /voices/)"; + + auto inputIdsTensor = ov::Tensor{ov::element::i64, ov::Shape{1, tokenIds.size()}}; + auto refS = ov::Tensor{ov::element::f32, ov::Shape{1, KokoroServable::STYLE_DIM}}; + auto speed = ov::Tensor{ov::element::f32, ov::Shape{1}}; + + *reinterpret_cast(speed.data()) = 1.0f; + std::copy(tokenIds.data(), tokenIds.data() + tokenIds.size(), + reinterpret_cast(inputIdsTensor.data())); + std::copy(voiceSlice, voiceSlice + KokoroServable::STYLE_DIM, + reinterpret_cast(refS.data())); + + // Inference + ModelMetricReporter unused(nullptr, nullptr, "unused", 1); + auto executingStreamIdGuard = + std::make_unique(servable->getInferRequestsQueue(), unused); + ov::InferRequest& inferRequest = executingStreamIdGuard->getInferRequest(); + + inferRequest.set_tensor("input_ids", inputIdsTensor); + inferRequest.set_tensor("103", refS); + inferRequest.set_tensor("speed", speed); + inferRequest.start_async(); + inferRequest.wait(); + + // Collect audio output + auto out = 
inferRequest.get_tensor(inferRequest.get_compiled_model().outputs()[0]); + RET_CHECK(out.get_shape().size() == 1); + RET_CHECK(out.get_element_type() == ov::element::f32); + const size_t samples = out.get_shape()[0]; + const float* data = out.data(); + + SPDLOG_DEBUG("Model output: {} audio samples ({:.2f}s at 24kHz)", + samples, static_cast(samples) / 24000.0f); + + void* wavDataPtr = nullptr; + size_t wavSize = 0; + prepareAudioOutputKokoro(&wavDataPtr, wavSize, samples, data); + + auto output = std::make_unique(reinterpret_cast(wavDataPtr), wavSize); + drwav_free(wavDataPtr, NULL); + + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + } catch (const std::exception& e) { + SPDLOG_ERROR("KokoroCalculator [Node: {}] Process failed: {}", cc->NodeName(), e.what()); + return absl::InvalidArgumentError(e.what()); + } catch (...) { + SPDLOG_ERROR("KokoroCalculator [Node: {}] Process failed: unknown error", cc->NodeName()); + return absl::InvalidArgumentError("Kokoro processing failed"); + } + SPDLOG_LOGGER_DEBUG(kokoro_calculator_logger, "KokoroCalculator [Node: {}] Process end", cc->NodeName()); + return absl::OkStatus(); + } +}; + +const std::string KokoroCalculator::INPUT_TAG_NAME{"HTTP_REQUEST_PAYLOAD"}; +const std::string KokoroCalculator::OUTPUT_TAG_NAME{"HTTP_RESPONSE_PAYLOAD"}; + +REGISTER_CALCULATOR(KokoroCalculator); + +} // namespace mediapipe diff --git a/src/audio/kokoro/kokoro_calculator.proto b/src/audio/kokoro/kokoro_calculator.proto new file mode 100644 index 0000000000..8ec0f43341 --- /dev/null +++ b/src/audio/kokoro/kokoro_calculator.proto @@ -0,0 +1,34 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +syntax = "proto2"; +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + + +message KokoroCalculatorOptions { + extend mediapipe.CalculatorOptions { + // https://github.com/google/mediapipe/issues/634 have to be unique in app + // no rule to obtain this + optional KokoroCalculatorOptions ext = 116423799; + } + + required string models_path = 1; + optional string target_device = 2; + optional string plugin_config = 3; + optional string language = 4; // ISO 639-1 language code (en, es, fr, hi, it, ja, pt-br, zh) +} diff --git a/src/audio/kokoro/kokoro_servable.hpp b/src/audio/kokoro/kokoro_servable.hpp new file mode 100644 index 0000000000..9a81f8f527 --- /dev/null +++ b/src/audio/kokoro/kokoro_servable.hpp @@ -0,0 +1,298 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_graph.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "openvino/runtime/core.hpp" +#include "../../ovinferrequestsqueue.hpp" + +#include +#include + +#include "src/audio/kokoro/kokoro_calculator.pb.h" +#include "src/logging.hpp" + +namespace ovms { + +struct VocabIndex { + std::unordered_map by_token; + size_t max_token_bytes = 1; +}; + +struct VoicePack { + std::vector data; // flat [numEntries * STYLE_DIM] + size_t numEntries = 0; +}; + +class EspeakInstance { +public: + static EspeakInstance& instance() { + static EspeakInstance inst; + return inst; + } + + bool isReady() const { return ready_; } + std::mutex& mutex() { return mutex_; } + +private: + EspeakInstance() { + ready_ = tryInit(); + if (!ready_) { + SPDLOG_ERROR("eSpeak-NG initialization failed (data path or voice not found)"); + } else { + SPDLOG_INFO("eSpeak-NG initialized successfully"); + } + } + + ~EspeakInstance() { + if (ready_) { + espeak_Terminate(); + } + } + + EspeakInstance(const EspeakInstance&) = delete; + EspeakInstance& operator=(const EspeakInstance&) = delete; + + bool tryInit() { + auto try_path = [](const char* path) -> bool { + int sr = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, + 0, path, + espeakINITIALIZE_DONT_EXIT); + if (sr <= 0) + return false; + // Try to initialize with Kokoro's supported language voices + // Kokoro supports: en-us (American English), en (British English), es (Spanish), fr (French), hi (Hindi), it (Italian), ja (Japanese), pt (Brazilian Portuguese), cmn (Mandarin Chinese) + if (espeak_SetVoiceByName("en-us") != EE_OK && + 
espeak_SetVoiceByName("en") != EE_OK && + espeak_SetVoiceByName("es") != EE_OK && + espeak_SetVoiceByName("fr") != EE_OK && + espeak_SetVoiceByName("hi") != EE_OK && + espeak_SetVoiceByName("it") != EE_OK && + espeak_SetVoiceByName("ja") != EE_OK && + espeak_SetVoiceByName("pt") != EE_OK && + espeak_SetVoiceByName("cmn") != EE_OK) { + return false; + } + return true; + }; + + if (try_path(nullptr)) + return true; + + static const char* ngPaths[] = { + "/usr/share/espeak-ng-data", + "/opt/homebrew/share/espeak-ng-data", + "/usr/local/share/espeak-ng-data", + "espeak-ng-data", + nullptr}; + for (int i = 0; ngPaths[i]; ++i) + if (try_path(ngPaths[i])) + return true; + + static const char* esPaths[] = { + "/usr/share/espeak-data", + "/usr/local/share/espeak-data", + "espeak-data", + nullptr}; + for (int i = 0; esPaths[i]; ++i) + if (try_path(esPaths[i])) + return true; + + return false; + } + + bool ready_ = false; + std::mutex mutex_; +}; + +struct KokoroServable { + static constexpr size_t STYLE_DIM = 256; + + std::filesystem::path parsedModelsPath; + std::shared_ptr model; + ov::CompiledModel compiledModel; + std::unique_ptr inferRequestsQueue; + VocabIndex vocabIndex; + std::unordered_map voicePacks; + std::string defaultVoiceName; + + KokoroServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) { + EspeakInstance::instance(); + + auto fsModelsPath = std::filesystem::path(modelDir); + if (fsModelsPath.is_relative()) { + parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); + } else { + parsedModelsPath = fsModelsPath; + } + + vocabIndex = loadVocabFromConfig(parsedModelsPath); + loadVoicePacks(parsedModelsPath); + + ov::AnyMap properties = { + // Use ACCURACY execution mode to avoid fast-math approximation errors + // that accumulate in the deep decoder network and cause energy fade. 
+ ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY), + }; + //properties["INFERENCE_PRECISION_HINT"] = "f32"; + ov::Core core; + auto m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties); + compiledModel = core.compile_model(m_model, targetDevice, properties); + uint32_t numberOfParallelInferRequests = 1; + try { + numberOfParallelInferRequests = compiledModel.get_property(ov::optimal_number_of_infer_requests); + } catch (const ov::Exception& ex) { + SPDLOG_WARN("Failed to query OPTIMAL_NUMBER_OF_INFER_REQUESTS with error {}. Using 1 nireq.", ex.what()); + numberOfParallelInferRequests = 1u; + } + inferRequestsQueue = std::make_unique(compiledModel, numberOfParallelInferRequests); + } + + OVInferRequestsQueue& getInferRequestsQueue() { + return *inferRequestsQueue; + } + + const VocabIndex& getVocabIndex() const { + return vocabIndex; + } + + // Returns pointer to 256 floats for the given voice and token count. + // voiceName: requested voice (e.g. "af_alloy"). Falls back to default voice if not found. + // numContentTokens: number of token IDs excluding BOS/EOS padding. 
+ const float* getVoiceSlice(const std::string& voiceName, size_t numContentTokens) const { + auto it = voicePacks.find(voiceName); + if (it == voicePacks.end()) { + it = voicePacks.find(defaultVoiceName); + if (it == voicePacks.end()) { + return nullptr; + } + } + const auto& pack = it->second; + size_t idx = std::min(numContentTokens, pack.numEntries - 1); + return pack.data.data() + (idx * STYLE_DIM); + } + + bool hasVoice(const std::string& voiceName) const { + return voicePacks.count(voiceName) > 0; + } + + const std::string& getDefaultVoiceName() const { + return defaultVoiceName; + } + +private: + static VocabIndex loadVocabFromConfig(const std::filesystem::path& modelDir) { + VocabIndex ix; + auto configPath = modelDir / "config.json"; + std::ifstream ifs(configPath); + if (!ifs.is_open()) { + SPDLOG_ERROR("Failed to open Kokoro config: {}", configPath.string()); + return ix; + } + + std::stringstream buffer; + buffer << ifs.rdbuf(); + std::string jsonStr = buffer.str(); + + rapidjson::Document doc; + doc.Parse(jsonStr.c_str()); + if (doc.HasParseError()) { + SPDLOG_ERROR("Failed to parse Kokoro config JSON: {}", configPath.string()); + return ix; + } + + if (!doc.HasMember("vocab") || !doc["vocab"].IsObject()) { + SPDLOG_ERROR("Kokoro config missing 'vocab' object: {}", configPath.string()); + return ix; + } + + const auto& vocab = doc["vocab"]; + ix.by_token.reserve(vocab.MemberCount()); + for (auto it = vocab.MemberBegin(); it != vocab.MemberEnd(); ++it) { + if (!it->name.IsString() || !it->value.IsInt()) + continue; + std::string token = it->name.GetString(); + int id = it->value.GetInt(); + ix.by_token.emplace(token, id); + ix.max_token_bytes = std::max(ix.max_token_bytes, token.size()); + } + + SPDLOG_INFO("Loaded Kokoro vocabulary: {} tokens, max_token_bytes={}", + ix.by_token.size(), ix.max_token_bytes); + return ix; + } + + void loadVoicePacks(const std::filesystem::path& modelDir) { + auto voicesDir = modelDir / "voices"; + if 
(!std::filesystem::exists(voicesDir) || !std::filesystem::is_directory(voicesDir)) { + SPDLOG_WARN("No voices directory found at: {}", voicesDir.string()); + return; + } + + for (const auto& entry : std::filesystem::directory_iterator(voicesDir)) { + if (!entry.is_regular_file() || entry.path().extension() != ".bin") + continue; + + std::string name = entry.path().stem().string(); + auto fileSize = std::filesystem::file_size(entry.path()); + if (fileSize == 0 || fileSize % (STYLE_DIM * sizeof(float)) != 0) { + SPDLOG_ERROR("Voice file {} has invalid size {} (must be multiple of {})", + entry.path().string(), fileSize, STYLE_DIM * sizeof(float)); + continue; + } + + VoicePack pack; + pack.numEntries = fileSize / (STYLE_DIM * sizeof(float)); + pack.data.resize(pack.numEntries * STYLE_DIM); + + std::ifstream ifs(entry.path(), std::ios::binary); + if (!ifs.read(reinterpret_cast(pack.data.data()), fileSize)) { + SPDLOG_ERROR("Failed to read voice file: {}", entry.path().string()); + continue; + } + + SPDLOG_INFO("Loaded voice pack '{}': {} entries x {} dims from {}", + name, pack.numEntries, STYLE_DIM, entry.path().string()); + + if (defaultVoiceName.empty()) { + defaultVoiceName = name; + } + voicePacks.emplace(name, std::move(pack)); + } + + SPDLOG_INFO("Loaded {} voice pack(s), default: '{}'", voicePacks.size(), defaultVoiceName); + } +}; + +using KokoroServableMap = std::unordered_map>; +} // namespace ovms diff --git a/src/logging.cpp b/src/logging.cpp index e89fce9a07..9d058d82dc 100644 --- a/src/logging.cpp +++ b/src/logging.cpp @@ -35,6 +35,7 @@ std::shared_ptr llm_executor_logger = std::make_shared llm_calculator_logger = std::make_shared("llm_calculator"); std::shared_ptr s2t_calculator_logger = std::make_shared("s2t_calculator"); std::shared_ptr t2s_calculator_logger = std::make_shared("t2s_calculator"); +std::shared_ptr kokoro_calculator_logger = std::make_shared("kokoro_calculator"); std::shared_ptr embeddings_calculator_logger = 
std::make_shared("embeddings_calculator"); std::shared_ptr rerank_calculator_logger = std::make_shared("rerank_calculator"); #endif @@ -78,6 +79,7 @@ static void register_loggers(const std::string& log_level, std::vectorset_pattern(default_pattern); s2t_calculator_logger->set_pattern(default_pattern); t2s_calculator_logger->set_pattern(default_pattern); + kokoro_calculator_logger->set_pattern(default_pattern); rerank_calculator_logger->set_pattern(default_pattern); embeddings_calculator_logger->set_pattern(default_pattern); #endif @@ -98,6 +100,7 @@ static void register_loggers(const std::string& log_level, std::vectorsinks().push_back(sink); s2t_calculator_logger->sinks().push_back(sink); t2s_calculator_logger->sinks().push_back(sink); + kokoro_calculator_logger->sinks().push_back(sink); rerank_calculator_logger->sinks().push_back(sink); embeddings_calculator_logger->sinks().push_back(sink); #endif @@ -119,6 +122,7 @@ static void register_loggers(const std::string& log_level, std::vector llm_executor_logger; extern std::shared_ptr llm_calculator_logger; extern std::shared_ptr s2t_calculator_logger; extern std::shared_ptr t2s_calculator_logger; +extern std::shared_ptr kokoro_calculator_logger; extern std::shared_ptr embeddings_calculator_logger; extern std::shared_ptr rerank_calculator_logger; #endif diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index 9047765e75..e1436b5891 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -63,6 +63,7 @@ const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalcula const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"}; const std::string MediapipeGraphDefinition::STT_NODE_CALCULATOR_NAME{"S2tCalculator"}; const std::string MediapipeGraphDefinition::TTS_NODE_CALCULATOR_NAME{"T2sCalculator"}; +const std::string 
MediapipeGraphDefinition::KOKORO_NODE_CALCULATOR_NAME{"KokoroCalculator"}; const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"}; const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"}; @@ -625,6 +626,28 @@ Status MediapipeGraphDefinition::initializeNodes() { return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID; } } + if (endsWith(config.node(i).calculator(), KOKORO_NODE_CALCULATOR_NAME)) { + auto& kokoroServableMap = this->sidePacketMaps.kokoroServableMap; + ResourcesCleaningGuard kokoroServablesCleaningGuard(kokoroServableMap); + if (!config.node(i).node_options().size()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node missing options in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_OPTIONS; + } + if (config.node(i).name().empty()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name is missing in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_NAME; + } + std::string nodeName = config.node(i).name(); + if (kokoroServableMap.find(nodeName) != kokoroServableMap.end()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Kokoro node name: {} already used in graph: {}. 
", nodeName, this->name); + return StatusCode::LLM_NODE_NAME_ALREADY_EXISTS; + } + mediapipe::KokoroCalculatorOptions nodeOptions; + config.node(i).node_options(0).UnpackTo(&nodeOptions); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath()); + kokoroServableMap.insert(std::pair>(nodeName, std::move(servable))); + kokoroServablesCleaningGuard.disableCleaning(); + } } return StatusCode::OK; } diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp index 14c9e0679f..1067ca7d42 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.hpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.hpp @@ -48,6 +48,7 @@ #include "../rerank/rerank_servable.hpp" #include "../audio/speech_to_text/s2t_servable.hpp" #include "../audio/text_to_speech/t2s_servable.hpp" +#include "../audio/kokoro/kokoro_servable.hpp" namespace ovms { class MediapipeGraphDefinitionUnloadGuard; @@ -66,6 +67,7 @@ using GenAiServableMap = std::unordered_map>; using SttServableMap = std::unordered_map>; using TtsServableMap = std::unordered_map>; +using KokoroServableMap = std::unordered_map>; using EmbeddingsServableMap = std::unordered_map>; using ImageGenerationPipelinesMap = std::unordered_map>; @@ -77,6 +79,7 @@ struct GraphSidePackets { RerankServableMap rerankServableMap; SttServableMap sttServableMap; TtsServableMap ttsServableMap; + KokoroServableMap kokoroServableMap; void clear() { pythonNodeResourcesMap.clear(); genAiServableMap.clear(); @@ -85,6 +88,7 @@ struct GraphSidePackets { rerankServableMap.clear(); sttServableMap.clear(); ttsServableMap.clear(); + kokoroServableMap.clear(); } bool empty() { return (pythonNodeResourcesMap.empty() && @@ -93,7 +97,8 @@ struct GraphSidePackets { embeddingsServableMap.empty() && rerankServableMap.empty() && sttServableMap.empty() && - ttsServableMap.empty()); + ttsServableMap.empty() && + kokoroServableMap.empty()); } 
}; @@ -136,6 +141,7 @@ class MediapipeGraphDefinition { static const std::string RERANK_NODE_CALCULATOR_NAME; static const std::string STT_NODE_CALCULATOR_NAME; static const std::string TTS_NODE_CALCULATOR_NAME; + static const std::string KOKORO_NODE_CALCULATOR_NAME; Status waitForLoaded(std::unique_ptr& unloadGuard, const uint32_t waitForLoadedTimeoutMicroseconds = WAIT_FOR_LOADED_DEFAULT_TIMEOUT_MICROSECONDS); // Pipelines are not versioned and any available definition has constant version equal 1. diff --git a/src/mediapipe_internal/mediapipegraphexecutor.cpp b/src/mediapipe_internal/mediapipegraphexecutor.cpp index 93b53fdf8e..b2016ac3aa 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.cpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.cpp @@ -49,6 +49,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( const RerankServableMap& rerankServableMap, const SttServableMap& sttServableMap, const TtsServableMap& ttsServableMap, + const KokoroServableMap& kokoroServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : name(name), @@ -58,7 +59,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( outputTypes(std::move(outputTypes)), inputNames(std::move(inputNames)), outputNames(std::move(outputNames)), - sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap}), + sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, sttServableMap, ttsServableMap, kokoroServableMap}), pythonBackend(pythonBackend), currentStreamTimestamp(STARTING_TIMESTAMP), mediapipeServableMetricReporter(mediapipeServableMetricReporter) {} @@ -92,6 +93,7 @@ const std::string MediapipeGraphExecutor::EMBEDDINGS_SESSION_SIDE_PACKET_TAG = " const std::string MediapipeGraphExecutor::RERANK_SESSION_SIDE_PACKET_TAG = "rerank_servable"; const std::string MediapipeGraphExecutor::STT_SESSION_SIDE_PACKET_TAG = 
"s2t_servable"; const std::string MediapipeGraphExecutor::TTS_SESSION_SIDE_PACKET_TAG = "t2s_servable"; +const std::string MediapipeGraphExecutor::KOKORO_SESSION_SIDE_PACKET_TAG = "kokoro_servable"; const ::mediapipe::Timestamp MediapipeGraphExecutor::STARTING_TIMESTAMP = ::mediapipe::Timestamp(0); } // namespace ovms diff --git a/src/mediapipe_internal/mediapipegraphexecutor.hpp b/src/mediapipe_internal/mediapipegraphexecutor.hpp index c165469395..af2e8d08e6 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.hpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.hpp @@ -95,6 +95,7 @@ class MediapipeGraphExecutor { static const std::string RERANK_SESSION_SIDE_PACKET_TAG; static const std::string STT_SESSION_SIDE_PACKET_TAG; static const std::string TTS_SESSION_SIDE_PACKET_TAG; + static const std::string KOKORO_SESSION_SIDE_PACKET_TAG; static const ::mediapipe::Timestamp STARTING_TIMESTAMP; MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -107,6 +108,7 @@ class MediapipeGraphExecutor { const RerankServableMap& rerankServableMap, const SttServableMap& sttServableMap, const TtsServableMap& ttsServableMap, + const KokoroServableMap& kokoroServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter); MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -157,6 +159,7 @@ class MediapipeGraphExecutor { inputSidePackets[RERANK_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.rerankServableMap).At(STARTING_TIMESTAMP); inputSidePackets[STT_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.sttServableMap).At(STARTING_TIMESTAMP); inputSidePackets[TTS_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.ttsServableMap).At(STARTING_TIMESTAMP); + inputSidePackets[KOKORO_SESSION_SIDE_PACKET_TAG] = 
mediapipe::MakePacket(this->sidePacketMaps.kokoroServableMap).At(STARTING_TIMESTAMP); MP_RETURN_ON_FAIL(graph.StartRun(inputSidePackets), std::string("start MediaPipe graph: ") + this->name, StatusCode::MEDIAPIPE_GRAPH_START_ERROR); diff --git a/third_party/espeak_ng/BUILD b/third_party/espeak_ng/BUILD new file mode 100644 index 0000000000..31f51b73da --- /dev/null +++ b/third_party/espeak_ng/BUILD @@ -0,0 +1,20 @@ + +# third_party/espeak_ng/BUILD + +config_setting( + name = "is_macos", + values = {"cpu": "darwin"}, +) + +cc_library( + name = "espeak_ng", + linkopts = [ + "-L/usr/local/lib", + "-lespeak-ng", + ], + includes = [ + "/usr/local/include", + "/usr/local/include/espeak-ng", + ], + visibility = ["//visibility:public"], +)