Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions Dockerfile.redhat
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,37 @@ WORKDIR /ovms/third_party/opencv
RUN if [ "$VERBOSE_LOGS" == "ON" ] ; then export VERBOSE=1 ; fi && ./install_opencv.sh
####### End of OpenCV

# Build espeak-ng from sources (static library + phoneme data used by TTS).
# Done in a dedicated stage so the autotools toolchain never reaches later stages.
FROM base_build as espeak_build

ARG ESPEAK_NG_VERSION=1.51.1
WORKDIR /tmp/espeak_build

# Autotools toolchain is only needed inside this throwaway stage.
# hadolint ignore=DL3041
RUN dnf install -y libtool automake autoconf pkgconfig && \
    dnf clean all

# Shallow-clone the pinned release tag (WORKDIR is already /tmp/espeak_build).
RUN git clone --depth 1 --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \
    ls -lah /tmp/espeak_build/

# Bootstrap autotools (AUTHORS/NEWS exist in release tarballs but not in a git
# checkout, so create empty ones for automake), then build a static, audio-less
# espeak-ng and install it under /usr/local. Sources are removed in the SAME
# layer — a later "RUN rm -rf" would not shrink the already-committed layer.
# hadolint ignore=DL3003
RUN cd /tmp/espeak_build/espeak-ng-src && \
    touch AUTHORS NEWS && \
    libtoolize --force --copy && \
    aclocal && \
    autoheader && \
    autoconf && \
    automake --add-missing --copy && \
    ./configure --prefix=/usr/local \
    --disable-shared \
    --enable-static \
    --disable-mbrola \
    --disable-klatt \
    --without-audio && \
    make -j$(nproc) && \
    make install && \
    cd / && rm -rf /tmp/espeak_build

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
FROM base_build as build
ARG BASE_IMAGE
Expand Down Expand Up @@ -404,6 +435,9 @@ LABEL base-image=${RELEASE_BASE_IMAGE}
ENV PYTHONPATH=/ovms/lib/python:/ovms/python_deps

WORKDIR /
# espeak-ng was built and installed in the dedicated `espeak_build` stage
# (FROM base_build as espeak_build), so the data must be copied from there —
# base_build itself never receives /usr/local/share/espeak-ng-data.
COPY --from=espeak_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data
ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data

SHELL ["/bin/bash", "-o", "pipefail", "-c"]
COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh
# hadolint ignore=DL3003,DL3041,SC2164,SC1091
Expand Down
39 changes: 38 additions & 1 deletion Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,37 @@ WORKDIR /ovms/third_party/opencv
RUN ./install_opencv.sh
####### End of OpenCV

# Build espeak-ng from sources (static library + phoneme data used by TTS).
# NOTE: built directly inside base_build here (no separate stage), so the
# release stage's "COPY --from=base_build .../espeak-ng-data" works as-is.

ARG ESPEAK_NG_VERSION=1.51.1
WORKDIR /tmp/espeak_build

# Autotools toolchain needed only for this build; apt lists removed in-layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libtool automake autoconf pkg-config && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Shallow-clone the pinned release tag (WORKDIR is already /tmp/espeak_build).
RUN git clone --depth 1 --branch ${ESPEAK_NG_VERSION} https://github.com/espeak-ng/espeak-ng.git espeak-ng-src 2>&1 && \
    ls -lah /tmp/espeak_build/

# Bootstrap autotools (AUTHORS/NEWS exist in release tarballs but not in a git
# checkout, so create empty ones for automake), then build a static, audio-less
# espeak-ng and install it under /usr/local. Sources are removed in the SAME
# layer — a later "RUN rm -rf" would not shrink the already-committed layer.
# hadolint ignore=DL3003
RUN cd /tmp/espeak_build/espeak-ng-src && \
    touch AUTHORS NEWS && \
    libtoolize --force --copy && \
    aclocal && \
    autoheader && \
    autoconf && \
    automake --add-missing --copy && \
    ./configure --prefix=/usr/local \
    --disable-shared \
    --enable-static \
    --disable-mbrola \
    --disable-klatt \
    --without-audio && \
    make -j$(nproc) && \
    make install && \
    cd / && rm -rf /tmp/espeak_build

################### BASE BUILD ##########################
FROM base_build as build
ARG BASE_IMAGE
Expand All @@ -99,6 +130,7 @@ RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \
apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \
apt-get clean && rm -rf /var/lib/apt/lists/* ; fi
ENV PIP_BREAK_SYSTEM_PACKAGES=1

RUN apt-get update && apt-get install --no-install-recommends -y \
libgflags-dev \
bc \
Expand Down Expand Up @@ -262,7 +294,6 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/openvino/runtime/lib/intel64/:
# FROM BASE BUILD
COPY --from=base_build /opt/opencv /opt/opencv/
COPY third_party /ovms/third_party/

# Mediapipe
COPY BUILD.bazel /ovms/
COPY *\.bzl /ovms/
Expand Down Expand Up @@ -392,6 +423,9 @@ RUN if [ -f /ovms_release/lib/libovms_shared.so ] ; then mv /ovms_release/lib/li
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

FROM $BASE_IMAGE as release
ARG http_proxy
ARG https_proxy
ARG no_proxy
ARG INSTALL_RPMS_FROM_URL=
ARG INSTALL_DRIVER_VERSION="24.26.30049"
ARG GPU=0
Expand All @@ -408,6 +442,9 @@ SHELL ["/bin/bash", "-c"]
WORKDIR /

COPY release_files/drivers /drivers
COPY --from=base_build /usr/local/share/espeak-ng-data /usr/local/share/espeak-ng-data
ENV ESPEAK_DATA_PATH=/usr/local/share/espeak-ng-data

SHELL ["/bin/bash", "-o", "pipefail", "-c"]
ARG INSTALL_DRIVER_VERSION="24.39.31294"
COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh
Expand Down
141 changes: 141 additions & 0 deletions demos/audio/export_kokoro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#
# Copyright (C) 2026 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

import torch
import json
import time
from pathlib import Path
from kokoro.model import KModel
from kokoro import KPipeline
import openvino as ov
import shutil

# Static sequence length the exported model is padded/reshaped to (tokens per
# request); must match the static-shape conversion in OVKModel.convert_to_static.
MAX_SEQ_LENGTH = 500


class KokoroTTSPipeline:
    """Thin wrapper around kokoro.KPipeline for single-utterance TTS.

    The repo and language were previously hard-coded; they are now defaulted
    parameters, so existing ``KokoroTTSPipeline()`` callers are unaffected.
    """

    def __init__(self, repo_id: str = "hexgrad/Kokoro-82M", lang_code: str = "a"):
        # Defaults reproduce the original hard-coded English Kokoro-82M setup.
        self.pipeline = KPipeline(lang_code=lang_code, repo_id=repo_id)

    def __call__(self, text: str, voice: str = "af_heart"):
        """Synthesize `text` and return the audio of the FIRST generated chunk only."""
        with torch.no_grad():
            generator = self.pipeline(text, voice=voice)
            result = next(generator)
        return result.audio


class OVKModel(KModel):
    """Kokoro acoustic model backed by a compiled OpenVINO network.

    Replaces KModel's torch inference with an OpenVINO CompiledModel loaded
    from ``model_dir/openvino_model.xml``; vocab and context length are read
    from the exported config.json.
    """

    def __init__(self, model_dir: Path, device: str, plugin_config: dict = None,
                 repo_id: str = "hexgrad/Kokoro-82M-v1.1-zh"):
        # Deliberately skip KModel.__init__ (it would download/load torch weights).
        torch.nn.Module.__init__(self)

        core = ov.Core()

        # Fix: previously read the module-level global `model_id`, which raised
        # NameError whenever this class was imported from another module.
        self.repo_id = repo_id
        with (model_dir / "config.json").open("r", encoding="utf-8") as f:
            config = json.load(f)
        self.vocab = config["vocab"]
        print("Starting to compile OpenVINO model on device:", device)

        start = time.time()
        # `plugin_config or {}` replaces the shared mutable default argument.
        self.model = core.compile_model(model_dir / "openvino_model.xml", device.upper(),
                                        config=plugin_config or {})
        print(f"Model compiled successfully in {time.time() - start:.2f}s.")
        self.context_length = config["plbert"]["max_position_embeddings"]

    @property
    def device(self):
        # Data is exchanged with the pipeline as CPU torch tensors regardless
        # of which OpenVINO device executes the network.
        return torch.device("cpu")

    def forward_with_tokens(self, input_ids: torch.LongTensor, ref_s: torch.FloatTensor, speed: float = 1) -> tuple[torch.FloatTensor, torch.LongTensor]:
        """Run OpenVINO inference; pads input to MAX_SEQ_LENGTH, trims output.

        Returns a tuple ``(audio, pred_dur)`` of torch tensors.
        """
        text_len = input_ids.shape[-1]

        if text_len < MAX_SEQ_LENGTH:
            # 0 in this model context is acting as BOS/EOS/PAD.
            # Since 0 causes artifacts, we might consider space (16) or period (4).
            padding_value = 16
            input_ids = torch.nn.functional.pad(input_ids, (0, MAX_SEQ_LENGTH - text_len), value=padding_value)
        # NOTE(review): text_len > MAX_SEQ_LENGTH is not handled; a static-shape
        # model would reject such input — confirm upstream chunking prevents it.

        start = time.time()
        print("Running inference on OpenVINO model...")
        outputs = self.model([input_ids, ref_s, torch.tensor(speed)])
        print(f"Inference completed in {time.time() - start:.2f}s.")

        audio = torch.from_numpy(outputs[0])
        pred_dur = torch.from_numpy(outputs[1])

        if text_len < MAX_SEQ_LENGTH:
            pred_dur = pred_dur[:text_len]
            # Approximate audio trimming based on duration ratio
            total_dur = outputs[1].sum()
            valid_dur = pred_dur.sum()
            if total_dur > 0:
                # assumes audio is 1-D (samples,) — TODO confirm exporter output rank
                audio_keep = int(audio.shape[-1] * (valid_dur / total_dur))
                audio = audio[:audio_keep]

        return audio, pred_dur

    @staticmethod
    def download_and_convert(model_dir: Path, repo_id: str, ttsPipeline: KokoroTTSPipeline):
        """Trace the pipeline's torch model to OpenVINO IR (no-op if IR exists)."""
        from huggingface_hub import hf_hub_download
        import gc

        if not (model_dir / "openvino_model.xml").exists():
            print(f"Converting Kokoro model to OpenVINO format at {model_dir}...")
            model = ttsPipeline.pipeline.model
            model.forward = model.forward_with_tokens
            # Example input: 48 random tokens wrapped in BOS/EOS (0), a 256-dim
            # style vector and a scalar speed, matching forward_with_tokens.
            input_ids = torch.randint(1, 100, (48,)).numpy()
            input_ids = torch.LongTensor([[0, *input_ids, 0]])
            style = torch.randn(1, 256)
            speed = torch.randint(1, 10, (1,), dtype=torch.float32)

            ov_model = ov.convert_model(model, example_input=(input_ids, style, speed), input=[
                ov.PartialShape("[1, 2..]"), ov.PartialShape([1, -1])])
            ov.save_model(ov_model, model_dir / "openvino_model.xml")
            # Fix: use the `repo_id` argument — previously this read the
            # module-level global `model_id`, silently ignoring the parameter.
            hf_hub_download(repo_id=repo_id, filename="config.json", local_dir=model_dir)
        else:
            print(f"OpenVINO model already exists at {model_dir}, skipping conversion.")

        gc.collect()

    @staticmethod
    def convert_to_static(input_model_dir: Path, output_model_dir: Path):
        """Reshape the dynamic IR to static shapes (required for NPU) and save it."""
        print(f"Converting OpenVINO model to static shapes at {input_model_dir}...")
        # Ensure the destination exists before ov.save_model / shutil.copy.
        output_model_dir.mkdir(parents=True, exist_ok=True)
        core = ov.Core()
        model = core.read_model(input_model_dir / "openvino_model.xml")
        static_shape = {"input_ids": [1, MAX_SEQ_LENGTH], "ref_s": [1, 256], "speed": [1], }
        model.reshape(static_shape)
        print("Reshaped model inputs:", model.inputs)
        ov.save_model(model, output_model_dir / "openvino_model.xml")
        print("Conversion to static shapes completed.")
        # Copy config file so the serving side can read vocab/context length.
        shutil.copy(input_model_dir / "config.json", output_model_dir / "config.json")


if __name__ == "__main__":

    # Hugging Face repo of the Chinese Kokoro variant to export.
    model_id = "hexgrad/Kokoro-82M-v1.1-zh"

    # Download model from Hugging Face and convert to OpenVINO format.
    # NOTE(review): KokoroTTSPipeline internally loads "hexgrad/Kokoro-82M",
    # not model_id above — confirm which repo the export is meant to trace.
    pipeline = KokoroTTSPipeline()

    # Convert and save the Kokoro model to OpenVINO format
    OVKModel.download_and_convert(Path("./kokoro_openvino_model_zh"), repo_id=model_id, ttsPipeline=pipeline)

    # To run inference on NPU, model must have static input shapes
    OVKModel.convert_to_static(Path("./kokoro_openvino_model_zh"), Path("./kokoro_static_openvino_model_zh"))
    # # Execution on NPU requires a plugin config like the one below:
    # config = {
    #     "NPU": {
    #         "NPU_USE_NPUW": "YES",
    #         "NPUW_DEVICES": "NPU,CPU",
    #         "NPUW_KOKORO": "YES",
    #     }
    # }

    # # NPUW_CACHE_DIR can be used to avoid compilation on every run
    # config["NPU"]["NPUW_CACHE_DIR"] = "./npu_cache_kokoro"
125 changes: 125 additions & 0 deletions demos/audio/tts_test_strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""Send a battery of tricky TTS test strings to an OpenAI-compatible
speech endpoint, one by one, and save each result as a numbered WAV file.

Usage:
    python tts_test_strings.py --endpoint http://localhost:8000/v3 \
        --model kokoro \
        [--voice VOICE] \
        [--output-dir tts_output]

If --voice is omitted, None is sent and the server's default voice is used.
"""

import argparse
import os
import sys
import time

from openai import OpenAI

# Battery of tricky TTS inputs: abbreviations, dates/times/zones, ordinals,
# currencies, fractions, Roman numerals, dosage units, numbers in scientific
# notation, URLs/emails/paths, initialisms, heteronyms, tongue twisters,
# diacritics, emoji/ZWJ sequences, confusable scripts, RTL text, IPA/prosody
# marks, nested quotes, math/code, chemical nomenclature. Several strings use
# explicit \uXXXX escapes so the exact codepoints are unambiguous in review.
TEST_STRINGS = [
    'Dr. A. B. Carter Jr. met Sen. O\'Neill at 5 p.m., Wed., in Washington, D.C.',
    'Mr. Smith, Ph.D., arrived on Fri. at 6:30 a.m.; Mrs. Jones left at noon.',
    'We meet on 01/02/2025 at 05:30 IST; is that India or Israel time?',
    'The deadline is 2025\u201102\u201101 23:59 UTC\u221205:00 (EST).',
    'He finished 1st; she was 22nd\u2014barely.',
    'Prices: $1,234.56 vs \u20ac1.234,56; also \u00a512 345 (thin space).',
    'Add \u00be cup, then \u00bd tsp; total \u2248 1\u00bc cups.',
    'Chapter XLIV starts on page ix; version v2.0.0 follows v1.12.9.',
    'Dose: 5 mg vs 5 \u03bcg\u2014don\'t confuse micrograms with milligrams.',
    'Avogadro\'s number is 6.022e23; \u03c0 \u2248 3.14159; \u221a2 \u2248 1.4142.',
    'Temperature dropped to \u221210 \u00b0C (14 \u00b0F) with 90% RH.',
    'Visit https://example.com/a/b?x=1&y=2#frag or email ops+alerts@example.org.',
    'Open C:\\Program Files\\Project\\config.yaml or /usr/local/bin/run.sh.',
    '.NET, Node.js, C#, C++17, and Rust\'s crate\u2011names\u2011with\u2011hyphens.',
    '"WYSIWYG," "GIF" (hard or soft g?), "SQL" (sequel or S\u2011Q\u2011L?).',
    'I will present the present to the lead singer who stepped on the lead.',
    'They desert the desert; the dove dove; he wound the wound.',
    'Please record the record before the minute is up in a minute.',
    'She sells seashells by the seashore; truly Irish wristwatch.',
    'Unique New York, toy boat, red leather yellow leather.',
    'A na\u00efve co\u00f6perative fa\u00e7ade in S\u00e3o Paulo; \u0141\u00f3d\u017a and Krak\u00f3w in Poland.',
    'Pi\u00f1ata, jalape\u00f1o, cr\u00e8me br\u00fbl\u00e9e, bouillabaisse, d\u00e9j\u00e0 vu.',
    '\U0001f44d\U0001f3fb is a thumbs\u2011up with light skin tone; \U0001f9d1\u200d\U0001f4bb writes code; \U0001f468\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466 is a family; \U0001f1f5\U0001f1f1 is a flag.',
    'Faces: \U0001f642\U0001f609\U0001f610\U0001f611\U0001f636; hearts: \u2764\ufe0f\U0001f9e1\U0001f49b\U0001f49a\U0001f499; mixed: \U0001f937\u200d\u2642\ufe0f\U0001f926\u200d\u2640\ufe0f.',
    'Latin "A" vs Cyrillic "\u0410"; Greek "\u03c1" vs Latin "p"; micro "\u00b5" vs Greek "\u03bc".',
    '\u05e9\u05dc\u05d5\u05dd and \u0645\u0631\u062d\u0628\u064b\u0627 appear with left\u2011to\u2011right text in one line.',
    'Prosody markers: \u02c8primary, \u02ccsecondary, and length \u02d0 are tricky for tokenizers.',
    'Arrows for intonation: \u2197 rising, \u2198 falling, \u2193 drop.',
    'He said, "She replied, \'no\u2014never\u2026\'," then left\u2014silently.',
    'Parentheticals (like this\u2014really!) and em\u2011dashes\u2014here\u2014confuse prosody.',
    'Let f(x)=x^2; then d/dx x^2=2x; \u2202/\u2202x is the operator.',
    'Inline code x += 1; and TeX E=mc^2 should be read clearly.',
    'N,N\u2011Diethyl\u2011meta\u2011toluamide (DEET) differs from p\u2011xylene and m\u2011cresol.',
    'The RFC 7231/HTTP\u2011semantics "GET" vs "HEAD" distinction matters.',
    'Read "macOS" vs "Mac OS", "iOS", "SQL", "URL", and "S3" correctly.',
]


def main():
    """Parse CLI args, synthesize every test string, and save numbered WAVs.

    Each string is sent individually; failures are reported but do not stop
    the run. Exits with status 1 if any request failed (CI-friendly).
    """
    parser = argparse.ArgumentParser(
        description="Send TTS test strings to an OpenAI-compatible speech endpoint."
    )
    parser.add_argument(
        "--endpoint", required=True,
        help="Base URL of the API (e.g. http://localhost:8000/v3)"
    )
    parser.add_argument(
        "--model", required=True,
        help="Model name to use for speech generation"
    )
    parser.add_argument(
        # Fix: help text previously claimed "default: voice1" while the actual
        # default is None (server-side default voice).
        "--voice", default=None,
        help="Voice name (default: None, i.e. use the server's default voice)"
    )
    parser.add_argument(
        "--output-dir", default="tts_output",
        help="Directory to save output WAV files (default: tts_output)"
    )
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # The endpoint performs no auth, but the client requires some api_key value.
    client = OpenAI(base_url=args.endpoint, api_key="unused")

    total = len(TEST_STRINGS)
    print(f"Sending {total} test strings to {args.endpoint} (model={args.model}, voice={args.voice})")
    print(f"Output directory: {args.output_dir}\n")

    succeeded = 0
    failed = 0
    total_size_kb = 0.0
    t_start = time.time()

    for idx, text in enumerate(TEST_STRINGS, start=1):
        preview = text[:80] + ("..." if len(text) > 80 else "")
        print(f"[{idx:2d}/{total}] {preview}")

        out_path = os.path.join(args.output_dir, f"{idx:02d}.wav")
        t0 = time.time()
        try:
            response = client.audio.speech.create(
                model=args.model,
                voice=args.voice,
                input=text,
            )
            response.write_to_file(out_path)
            elapsed = time.time() - t0
            size_kb = os.path.getsize(out_path) / 1024
            total_size_kb += size_kb
            succeeded += 1
            print(f"  -> {out_path} ({size_kb:.1f} KB, {elapsed:.2f}s)")
        except Exception as exc:
            # Best-effort: log and continue with the remaining strings.
            elapsed = time.time() - t0
            failed += 1
            print(f"  !! FAILED after {elapsed:.2f}s: {exc}", file=sys.stderr)

    total_elapsed = time.time() - t_start
    print(f"\n{'='*60}")
    print(f"Summary: {succeeded} succeeded, {failed} failed out of {total}")
    # max(total, 1) guards the average if TEST_STRINGS is ever emptied.
    print(f"Total time: {total_elapsed:.2f}s (avg {total_elapsed/max(total, 1):.2f}s per string)")
    print(f"Total audio size: {total_size_kb:.1f} KB")
    print(f"{'='*60}")

    # Fix: previously the script always exited 0, even if every request failed.
    if failed:
        sys.exit(1)


# Entry point when executed as a script (no side effects on import).
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ ovms_cc_library(
"//src/image_gen:image_gen_calculator",
"//src/audio/speech_to_text:s2t_calculator",
"//src/audio/text_to_speech:t2s_calculator",
"//src/audio/kokoro:kokoro_calculator",
"//src/audio:audio_utils",
"//src/image_gen:imagegen_init",
"//src/llm:openai_completions_api_handler",
Expand Down
Loading