From 3a2f63cc3706ec11b4e54d9e047030a2d71aca9d Mon Sep 17 00:00:00 2001 From: chenchaonan <2301835860@qq.com> Date: Mon, 30 Mar 2026 14:49:18 +0800 Subject: [PATCH] Resolve the conflict --- README.md | 2 +- README_ZH.md | 2 +- __init__.py | 2 +- fish_speech_src/.gitignore | 4 ++-- fish_speech_src/docs/ar/finetune.md | 4 ++-- fish_speech_src/docs/ar/inference.md | 6 +++--- fish_speech_src/docs/en/finetune.md | 4 ++-- fish_speech_src/docs/en/inference.md | 6 +++--- fish_speech_src/docs/ja/finetune.md | 4 ++-- fish_speech_src/docs/ja/inference.md | 6 +++--- fish_speech_src/docs/ko/finetune.md | 4 ++-- fish_speech_src/docs/ko/inference.md | 6 +++--- fish_speech_src/docs/pt/finetune.md | 4 ++-- fish_speech_src/docs/pt/inference.md | 6 +++--- fish_speech_src/docs/zh/finetune.md | 4 ++-- fish_speech_src/docs/zh/inference.md | 6 +++--- .../fish_speech/configs/lora/r_8_alpha_16.yaml | 4 ---- .../callbacks/__init__.py | 0 .../callbacks/grad_norm.py | 0 .../configs/base.yaml | 2 +- .../fish_speech_s2/configs/lora/r_8_alpha_16.yaml | 4 ++++ .../configs/modded_dac_vq.yaml | 10 +++++----- .../configs/text2semantic_finetune.yaml | 14 +++++++------- .../content_sequence.py | 2 +- .../conversation.py | 4 ++-- .../datasets/concat_repeat.py | 0 .../datasets/protos/text-data.proto | 0 .../datasets/protos/text_data_pb2.py | 0 .../datasets/protos/text_data_stream.py | 0 .../datasets/semantic.py | 14 +++++++------- .../datasets/vqgan.py | 2 +- .../{fish_speech => fish_speech_s2}/i18n/README.md | 4 ++-- .../i18n/__init__.py | 0 .../{fish_speech => fish_speech_s2}/i18n/core.py | 0 .../i18n/locale/en_US.json | 0 .../i18n/locale/es_ES.json | 0 .../i18n/locale/ja_JP.json | 0 .../i18n/locale/ko_KR.json | 0 .../i18n/locale/pt_BR.json | 0 .../i18n/locale/zh_CN.json | 0 .../{fish_speech => fish_speech_s2}/i18n/scan.py | 0 .../inference_engine/__init__.py | 14 +++++++------- .../inference_engine/reference_loader.py | 6 +++--- .../inference_engine/utils.py | 0 .../inference_engine/vq_manager.py | 2 +- .../models/dac/__init__.py | 0 .../models/dac/inference.py | 2 +- .../models/dac/modded_dac.py | 2 +- .../models/dac/rvq.py | 0 .../models/text2semantic/__init__.py | 0 .../models/text2semantic/inference.py | 8 ++++---- .../models/text2semantic/lit_module.py | 4 ++-- .../models/text2semantic/llama.py | 4 ++-- .../models/text2semantic/lora.py | 0 .../{fish_speech => fish_speech_s2}/scheduler.py | 0 .../text/__init__.py | 0 .../{fish_speech => fish_speech_s2}/text/clean.py | 0 .../{fish_speech => fish_speech_s2}/tokenizer.py | 0 .../{fish_speech => fish_speech_s2}/train.py | 2 +- .../utils/__init__.py | 0 .../utils/braceexpand.py | 0 .../utils/context.py | 0 .../{fish_speech => fish_speech_s2}/utils/file.py | 0 .../utils/instantiators.py | 0 .../utils/logger.py | 0 .../utils/logging_utils.py | 2 +- .../utils/rich_utils.py | 2 +- .../utils/schema.py | 2 +- .../utils/spectrogram.py | 0 .../{fish_speech => fish_speech_s2}/utils/utils.py | 0 fish_speech_src/tools/api_client.py | 4 ++-- fish_speech_src/tools/llama/build_dataset.py | 6 +++--- fish_speech_src/tools/llama/eval_in_context.py | 4 ++-- fish_speech_src/tools/llama/merge_lora.py | 6 +++--- fish_speech_src/tools/llama/quantize.py | 4 ++-- fish_speech_src/tools/run_webui.py | 8 ++++---- fish_speech_src/tools/server/api_utils.py | 4 ++-- fish_speech_src/tools/server/inference.py | 4 ++-- fish_speech_src/tools/server/model_manager.py | 8 ++++---- fish_speech_src/tools/server/views.py | 2 +- fish_speech_src/tools/vqgan/create_train_split.py | 2 +- fish_speech_src/tools/vqgan/extract_vq.py | 4 ++-- fish_speech_src/tools/webui/__init__.py | 2 +- fish_speech_src/tools/webui/inference.py | 4 ++-- fish_speech_src/tools/webui/variables.py | 2 +- nodes/loader.py | 14 +++++++------- nodes/model_cache.py | 4 ++-- nodes/multi_speaker_node.py | 4 ++-- nodes/multi_speaker_split_node.py | 4 ++-- nodes/tts_node.py | 2 +- nodes/voice_clone_node.py | 2 +- 91 files changed, 137 insertions(+), 137 deletions(-) delete mode 100644 fish_speech_src/fish_speech/configs/lora/r_8_alpha_16.yaml rename fish_speech_src/{fish_speech => fish_speech_s2}/callbacks/__init__.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/callbacks/grad_norm.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/configs/base.yaml (97%) create mode 100644 fish_speech_src/fish_speech_s2/configs/lora/r_8_alpha_16.yaml rename fish_speech_src/{fish_speech => fish_speech_s2}/configs/modded_dac_vq.yaml (76%) rename fish_speech_src/{fish_speech => fish_speech_s2}/configs/text2semantic_finetune.yaml (74%) rename fish_speech_src/{fish_speech => fish_speech_s2}/content_sequence.py (99%) rename fish_speech_src/{fish_speech => fish_speech_s2}/conversation.py (97%) rename fish_speech_src/{fish_speech => fish_speech_s2}/datasets/concat_repeat.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/datasets/protos/text-data.proto (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/datasets/protos/text_data_pb2.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/datasets/protos/text_data_stream.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/datasets/semantic.py (97%) rename fish_speech_src/{fish_speech => fish_speech_s2}/datasets/vqgan.py (98%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/README.md (95%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/__init__.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/core.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/locale/en_US.json (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/locale/es_ES.json (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/locale/ja_JP.json (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/locale/ko_KR.json (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/locale/pt_BR.json (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/locale/zh_CN.json (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/i18n/scan.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/inference_engine/__init__.py (93%) rename fish_speech_src/{fish_speech => fish_speech_s2}/inference_engine/reference_loader.py (98%) rename fish_speech_src/{fish_speech => fish_speech_s2}/inference_engine/utils.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/inference_engine/vq_manager.py (97%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/dac/__init__.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/dac/inference.py (98%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/dac/modded_dac.py (99%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/dac/rvq.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/text2semantic/__init__.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/text2semantic/inference.py (99%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/text2semantic/lit_module.py (98%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/text2semantic/llama.py (99%) rename fish_speech_src/{fish_speech => fish_speech_s2}/models/text2semantic/lora.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/scheduler.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/text/__init__.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/text/clean.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/tokenizer.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/train.py (99%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/__init__.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/braceexpand.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/context.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/file.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/instantiators.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/logger.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/logging_utils.py (96%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/rich_utils.py (98%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/schema.py (98%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/spectrogram.py (100%) rename fish_speech_src/{fish_speech => fish_speech_s2}/utils/utils.py (100%) diff --git a/README.md b/README.md index e7ec832..8c69f37 100644 --- a/README.md +++ b/README.md @@ -412,7 +412,7 @@ If you see `RuntimeError: Failed to create AudioDecoder ... MockDecoder() takes ### Conflicting `fish_speech` Package? -If you see `ImportError: cannot import name 'AUDIO_EXTENSIONS' from 'fish_speech.utils.file'` pointing to another custom node's directory (e.g. `comfyui-mixlab-nodes`), a different node has its own `fish_speech` folder that conflicts with ours via `sys.path`. Disable the conflicting node or remove it. Do **not** pip-install `fish_speech` — it is bundled inside this node. +If you see `ImportError: cannot import name 'AUDIO_EXTENSIONS' from 'fish_speech_s2.utils.file'` pointing to another custom node's directory (e.g. `comfyui-mixlab-nodes`), a different node has its own `fish_speech` folder that conflicts with ours via `sys.path`. Disable the conflicting node or remove it. Do **not** pip-install `fish_speech` — it is bundled inside this node. ### Out of Memory? diff --git a/README_ZH.md b/README_ZH.md index 6f6632f..45d2266 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -406,7 +406,7 @@ pip install "descript-audiotools>=0.7.2" --no-deps ### `fish_speech` 包冲突? -如果看到 `ImportError: cannot import name 'AUDIO_EXTENSIONS' from 'fish_speech.utils.file'` 且路径指向其他自定义节点目录(如 `comfyui-mixlab-nodes`),说明另一个节点有自己的 `fish_speech` 文件夹,通过 `sys.path` 与本节点冲突。请禁用或删除冲突节点。**不要** 通过 pip 安装 `fish_speech` — 它已内置于本节点中。 +如果看到 `ImportError: cannot import name 'AUDIO_EXTENSIONS' from 'fish_speech_s2.utils.file'` 且路径指向其他自定义节点目录(如 `comfyui-mixlab-nodes`),说明另一个节点有自己的 `fish_speech` 文件夹,通过 `sys.path` 与本节点冲突。请禁用或删除冲突节点。**不要** 通过 pip 安装 `fish_speech` — 它已内置于本节点中。 ### 显存不足? diff --git a/__init__.py b/__init__.py index fba8cc1..19d4f0b 100644 --- a/__init__.py +++ b/__init__.py @@ -167,7 +167,7 @@ def _ensure_fish_source() -> bool: sys.path.insert(0, fish_src_str) try: - import fish_speech.models # noqa: F401 + import fish_speech_s2.models # noqa: F401 return True except ImportError as e: logger.error(f"fish_speech not importable from {_FISH_SRC}: {e}") diff --git a/fish_speech_src/.gitignore b/fish_speech_src/.gitignore index 91c3d5f..3e1feef 100644 --- a/fish_speech_src/.gitignore +++ b/fish_speech_src/.gitignore @@ -58,7 +58,7 @@ venv.bak/ # Project Dependencies # -------------------- .pdm-python -/fish_speech.egg-info +/fish_speech_s2.egg-info # Data and Model Files # -------------------- @@ -86,7 +86,7 @@ filelists/ *.pkl *.pickle *.lab -/fish_speech/text/cmudict_cache.pickle +/fish_speech_s2/text/cmudict_cache.pickle # Cache and Temporary Files # -------------------------- diff --git a/fish_speech_src/docs/ar/finetune.md b/fish_speech_src/docs/ar/finetune.md index 929397e..bc33027 100644 --- a/fish_speech_src/docs/ar/finetune.md +++ b/fish_speech_src/docs/ar/finetune.md @@ -95,13 +95,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope أخيرًا، يمكنك بدء الضبط الدقيق عن طريق تشغيل الأمر التالي: ```bash -python fish_speech/train.py --config-name text2semantic_finetune \ +python fish_speech_s2/train.py --config-name text2semantic_finetune \ project=$project \ +lora@model.model.lora_config=r_8_alpha_16 ``` !!! note "ملاحظة" - يمكنك تعديل معلمات التدريب مثل `batch_size`، `gradient_accumulation_steps`، وما إلى ذلك لتناسب ذاكرة وحدة معالجة الرسومات الخاصة بك عن طريق تعديل `fish_speech/configs/text2semantic_finetune.yaml`. + يمكنك تعديل معلمات التدريب مثل `batch_size`، `gradient_accumulation_steps`، وما إلى ذلك لتناسب ذاكرة وحدة معالجة الرسومات الخاصة بك عن طريق تعديل `fish_speech_s2/configs/text2semantic_finetune.yaml`. !!! note "ملاحظة" لمستخدمي Windows، يمكنك استخدام `trainer.strategy.process_group_backend=gloo` لتجنب مشكلات `nccl`. diff --git a/fish_speech_src/docs/ar/inference.md b/fish_speech_src/docs/ar/inference.md index 293144d..0957aff 100644 --- a/fish_speech_src/docs/ar/inference.md +++ b/fish_speech_src/docs/ar/inference.md @@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro ### 1. الحصول على رموز VQ من الصوت المرجعي ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "test.wav" \ --checkpoint-path "checkpoints/s2-pro/codec.pth" ``` @@ -28,7 +28,7 @@ python fish_speech/models/dac/inference.py \ ### 2. توليد الرموز الدلالية (Semantic tokens) من النص: ```bash -python fish_speech/models/text2semantic/inference.py \ +python fish_speech_s2/models/text2semantic/inference.py \ --text "النص الذي تريد تحويله" \ --prompt-text "النص المرجعي الخاص بك" \ --prompt-tokens "fake.npy" \ @@ -47,7 +47,7 @@ python fish_speech/models/text2semantic/inference.py \ ### 3. توليد الصوت من الرموز الدلالية: ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "codes_0.npy" \ ``` diff --git a/fish_speech_src/docs/en/finetune.md b/fish_speech_src/docs/en/finetune.md index 96954bb..63b4644 100644 --- a/fish_speech_src/docs/en/finetune.md +++ b/fish_speech_src/docs/en/finetune.md @@ -98,13 +98,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope Finally, you can start the fine-tuning by running the following command: ```bash -python fish_speech/train.py --config-name text2semantic_finetune \ +python fish_speech_s2/train.py --config-name text2semantic_finetune \ project=$project \ +lora@model.model.lora_config=r_8_alpha_16 ``` !!! note - You can modify the training parameters such as `batch_size`, `gradient_accumulation_steps`, etc. to fit your GPU memory by modifying `fish_speech/configs/text2semantic_finetune.yaml`. + You can modify the training parameters such as `batch_size`, `gradient_accumulation_steps`, etc. to fit your GPU memory by modifying `fish_speech_s2/configs/text2semantic_finetune.yaml`. !!! note For Windows users, you can use `trainer.strategy.process_group_backend=gloo` to avoid `nccl` issues. diff --git a/fish_speech_src/docs/en/inference.md b/fish_speech_src/docs/en/inference.md index 64a312d..98607e6 100644 --- a/fish_speech_src/docs/en/inference.md +++ b/fish_speech_src/docs/en/inference.md @@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro ### 1. Get VQ tokens from reference audio ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "test.wav" \ --checkpoint-path "checkpoints/s2-pro/codec.pth" ``` @@ -28,7 +28,7 @@ You should get a `fake.npy` and a `fake.wav`. ### 2. Generate Semantic tokens from text: ```bash -python fish_speech/models/text2semantic/inference.py \ +python fish_speech_s2/models/text2semantic/inference.py \ --text "The text you want to convert" \ --prompt-text "Your reference text" \ --prompt-tokens "fake.npy" \ @@ -47,7 +47,7 @@ This command will create a `codes_N` file in the working directory, where N is a ### 3. Generate vocals from semantic tokens: ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "codes_0.npy" \ ``` diff --git a/fish_speech_src/docs/ja/finetune.md b/fish_speech_src/docs/ja/finetune.md index 5419831..45f1231 100644 --- a/fish_speech_src/docs/ja/finetune.md +++ b/fish_speech_src/docs/ja/finetune.md @@ -94,13 +94,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope 最後に、次のコマンドを実行してファインチューニングを開始できます。 ```bash -python fish_speech/train.py --config-name text2semantic_finetune \ +python fish_speech_s2/train.py --config-name text2semantic_finetune \ project=$project \ +lora@model.model.lora_config=r_8_alpha_16 ``` !!! note - `fish_speech/configs/text2semantic_finetune.yaml` を変更することで、`batch_size` や `gradient_accumulation_steps` などのトレーニングパラメータをGPUメモリに合わせて変更できます。 + `fish_speech_s2/configs/text2semantic_finetune.yaml` を変更することで、`batch_size` や `gradient_accumulation_steps` などのトレーニングパラメータをGPUメモリに合わせて変更できます。 !!! note Windows ユーザーの場合、`trainer.strategy.process_group_backend=gloo` を使用して `nccl` の問題を回避できます。 diff --git a/fish_speech_src/docs/ja/inference.md b/fish_speech_src/docs/ja/inference.md index 69ac9ee..94b9e5d 100644 --- a/fish_speech_src/docs/ja/inference.md +++ b/fish_speech_src/docs/ja/inference.md @@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro ### 1. リファレンスオーディオから VQ トークンを取得する ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "test.wav" \ --checkpoint-path "checkpoints/s2-pro/codec.pth" ``` @@ -28,7 +28,7 @@ python fish_speech/models/dac/inference.py \ ### 2. テキストから Semantic トークンを生成する: ```bash -python fish_speech/models/text2semantic/inference.py \ +python fish_speech_s2/models/text2semantic/inference.py \ --text "変換したいテキスト" \ --prompt-text "リファレンステキスト" \ --prompt-tokens "fake.npy" \ @@ -47,7 +47,7 @@ python fish_speech/models/text2semantic/inference.py \ ### 3. セマンティックトークンから音声を生成する: ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "codes_0.npy" \ ``` diff --git a/fish_speech_src/docs/ko/finetune.md b/fish_speech_src/docs/ko/finetune.md index 07b8dbd..f0c4695 100644 --- a/fish_speech_src/docs/ko/finetune.md +++ b/fish_speech_src/docs/ko/finetune.md @@ -95,13 +95,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope 마지막으로, 다음 명령을 실행하여 미세 조정을 시작할 수 있습니다. ```bash -python fish_speech/train.py --config-name text2semantic_finetune \ +python fish_speech_s2/train.py --config-name text2semantic_finetune \ project=$project \ +lora@model.model.lora_config=r_8_alpha_16 ``` !!! note - `fish_speech/configs/text2semantic_finetune.yaml` 파일을 수정하여 `batch_size`, `gradient_accumulation_steps` 등 훈련 매개변수를 GPU 메모리에 맞게 조정할 수 있습니다. + `fish_speech_s2/configs/text2semantic_finetune.yaml` 파일을 수정하여 `batch_size`, `gradient_accumulation_steps` 등 훈련 매개변수를 GPU 메모리에 맞게 조정할 수 있습니다. !!! note Windows 사용자의 경우, `trainer.strategy.process_group_backend=gloo`를 사용하여 `nccl` 관련 문제를 피할 수 있습니다. diff --git a/fish_speech_src/docs/ko/inference.md b/fish_speech_src/docs/ko/inference.md index b7afd78..3bb41ae 100644 --- a/fish_speech_src/docs/ko/inference.md +++ b/fish_speech_src/docs/ko/inference.md @@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro ### 1. 참조 오디오에서 VQ 토큰 가져오기 ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "test.wav" \ --checkpoint-path "checkpoints/s2-pro/codec.pth" ``` @@ -28,7 +28,7 @@ python fish_speech/models/dac/inference.py \ ### 2. 텍스트에서 Semantic 토큰 생성: ```bash -python fish_speech/models/text2semantic/inference.py \ +python fish_speech_s2/models/text2semantic/inference.py \ --text "변환하려는 텍스트" \ --prompt-text "참조 텍스트" \ --prompt-tokens "fake.npy" \ @@ -47,7 +47,7 @@ python fish_speech/models/text2semantic/inference.py \ ### 3. 시맨틱 토큰에서 음성 생성: ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "codes_0.npy" \ ``` diff --git a/fish_speech_src/docs/pt/finetune.md b/fish_speech_src/docs/pt/finetune.md index 6333634..0ba4756 100644 --- a/fish_speech_src/docs/pt/finetune.md +++ b/fish_speech_src/docs/pt/finetune.md @@ -95,13 +95,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope Finalmente, você pode iniciar o ajuste fino executando o seguinte comando: ```bash -python fish_speech/train.py --config-name text2semantic_finetune \ +python fish_speech_s2/train.py --config-name text2semantic_finetune \ project=$project \ +lora@model.model.lora_config=r_8_alpha_16 ``` !!! note - Você pode modificar os parâmetros de treinamento, como `batch_size`, `gradient_accumulation_steps`, etc., para se adequar à memória da sua GPU, modificando `fish_speech/configs/text2semantic_finetune.yaml`. + Você pode modificar os parâmetros de treinamento, como `batch_size`, `gradient_accumulation_steps`, etc., para se adequar à memória da sua GPU, modificando `fish_speech_s2/configs/text2semantic_finetune.yaml`. !!! note Para usuários do Windows, você pode usar `trainer.strategy.process_group_backend=gloo` para evitar problemas com `nccl`. diff --git a/fish_speech_src/docs/pt/inference.md b/fish_speech_src/docs/pt/inference.md index 580e08e..56ba3f5 100644 --- a/fish_speech_src/docs/pt/inference.md +++ b/fish_speech_src/docs/pt/inference.md @@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro ### 1. Obter tokens VQ do áudio de referência ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "test.wav" \ --checkpoint-path "checkpoints/s2-pro/codec.pth" ``` @@ -28,7 +28,7 @@ Você deve obter um `fake.npy` e um `fake.wav`. ### 2. Gerar tokens Semânticos a partir do texto: ```bash -python fish_speech/models/text2semantic/inference.py \ +python fish_speech_s2/models/text2semantic/inference.py \ --text "O texto que você deseja converter" \ --prompt-text "Seu texto de referência" \ --prompt-tokens "fake.npy" \ @@ -47,7 +47,7 @@ Este comando criará um arquivo `codes_N` no diretório de trabalho, onde N é u ### 3. Gerar vocais a partir de tokens semânticos: ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "codes_0.npy" \ ``` diff --git a/fish_speech_src/docs/zh/finetune.md b/fish_speech_src/docs/zh/finetune.md index 04ece3f..3617284 100644 --- a/fish_speech_src/docs/zh/finetune.md +++ b/fish_speech_src/docs/zh/finetune.md @@ -96,13 +96,13 @@ huggingface-cli download fishaudio/s2-pro --local-dir checkpoints/s2-pro 最后, 你可以运行以下命令来启动微调: ```bash -python fish_speech/train.py --config-name text2semantic_finetune \ +python fish_speech_s2/train.py --config-name text2semantic_finetune \ project=$project \ +lora@model.model.lora_config=r_8_alpha_16 ``` !!! note - 你可以通过修改 `fish_speech/configs/text2semantic_finetune.yaml` 来修改训练参数如 `batch_size`, `gradient_accumulation_steps` 等, 来适应你的显存. + 你可以通过修改 `fish_speech_s2/configs/text2semantic_finetune.yaml` 来修改训练参数如 `batch_size`, `gradient_accumulation_steps` 等, 来适应你的显存. !!! note 对于 Windows 用户, 你可以使用 `trainer.strategy.process_group_backend=gloo` 来避免 `nccl` 的问题. diff --git a/fish_speech_src/docs/zh/inference.md b/fish_speech_src/docs/zh/inference.md index 959cb98..78a770c 100644 --- a/fish_speech_src/docs/zh/inference.md +++ b/fish_speech_src/docs/zh/inference.md @@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro ### 1. 从参考音频获取 VQ tokens ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "test.wav" \ --checkpoint-path "checkpoints/s2-pro/codec.pth" ``` @@ -28,7 +28,7 @@ python fish_speech/models/dac/inference.py \ ### 2. 从文本生成 Semantic tokens: ```bash -python fish_speech/models/text2semantic/inference.py \ +python fish_speech_s2/models/text2semantic/inference.py \ --text "您想要转换的文本" \ --prompt-text "您的参考文本" \ --prompt-tokens "fake.npy" \ @@ -47,7 +47,7 @@ python fish_speech/models/text2semantic/inference.py \ ### 3. 从语义令牌生成声音: ```bash -python fish_speech/models/dac/inference.py \ +python fish_speech_s2/models/dac/inference.py \ -i "codes_0.npy" \ ``` diff --git a/fish_speech_src/fish_speech/configs/lora/r_8_alpha_16.yaml b/fish_speech_src/fish_speech/configs/lora/r_8_alpha_16.yaml deleted file mode 100644 index aecc4d9..0000000 --- a/fish_speech_src/fish_speech/configs/lora/r_8_alpha_16.yaml +++ /dev/null @@ -1,4 +0,0 @@ -_target_: fish_speech.models.text2semantic.lora.LoraConfig -r: 8 -lora_alpha: 16 -lora_dropout: 0.01 diff --git a/fish_speech_src/fish_speech/callbacks/__init__.py b/fish_speech_src/fish_speech_s2/callbacks/__init__.py similarity index 100% rename from fish_speech_src/fish_speech/callbacks/__init__.py rename to fish_speech_src/fish_speech_s2/callbacks/__init__.py diff --git a/fish_speech_src/fish_speech/callbacks/grad_norm.py b/fish_speech_src/fish_speech_s2/callbacks/grad_norm.py similarity index 100% rename from fish_speech_src/fish_speech/callbacks/grad_norm.py rename to fish_speech_src/fish_speech_s2/callbacks/grad_norm.py diff --git a/fish_speech_src/fish_speech/configs/base.yaml b/fish_speech_src/fish_speech_s2/configs/base.yaml similarity index 97% rename from fish_speech_src/fish_speech/configs/base.yaml rename to fish_speech_src/fish_speech_s2/configs/base.yaml index 99e6dab..f5fcbaa 100644 --- a/fish_speech_src/fish_speech/configs/base.yaml +++ b/fish_speech_src/fish_speech_s2/configs/base.yaml @@ -53,7 +53,7 @@ callbacks: log_momentum: false grad_norm_monitor: - _target_: fish_speech.callbacks.GradNormMonitor + _target_: fish_speech_s2.callbacks.GradNormMonitor norm_type: 2 logging_interval: step diff --git a/fish_speech_src/fish_speech_s2/configs/lora/r_8_alpha_16.yaml b/fish_speech_src/fish_speech_s2/configs/lora/r_8_alpha_16.yaml new file mode 100644 index 0000000..28d3ec1 --- /dev/null +++ b/fish_speech_src/fish_speech_s2/configs/lora/r_8_alpha_16.yaml @@ -0,0 +1,4 @@ +_target_: fish_speech_s2.models.text2semantic.lora.LoraConfig +r: 8 +lora_alpha: 16 +lora_dropout: 0.01 diff --git a/fish_speech_src/fish_speech/configs/modded_dac_vq.yaml b/fish_speech_src/fish_speech_s2/configs/modded_dac_vq.yaml similarity index 76% rename from fish_speech_src/fish_speech/configs/modded_dac_vq.yaml rename to fish_speech_src/fish_speech_s2/configs/modded_dac_vq.yaml index 18089ed..9ca3405 100644 --- a/fish_speech_src/fish_speech/configs/modded_dac_vq.yaml +++ b/fish_speech_src/fish_speech_s2/configs/modded_dac_vq.yaml @@ -1,4 +1,4 @@ -_target_: fish_speech.models.dac.modded_dac.DAC +_target_: fish_speech_s2.models.dac.modded_dac.DAC # Model setup sample_rate: 44100 encoder_dim: 64 @@ -8,7 +8,7 @@ decoder_rates: [8, 8, 4, 2] encoder_transformer_layers: [0, 0, 0, 4] decoder_transformer_layers: [4, 0, 0, 0] transformer_general_config: - _target_: fish_speech.models.dac.modded_dac.ModelArgs + _target_: fish_speech_s2.models.dac.modded_dac.ModelArgs _partial_: true block_size: 8192 n_local_heads: -1 @@ -20,7 +20,7 @@ transformer_general_config: channels_first: true # Quantization quantizer: - _target_: fish_speech.models.dac.rvq.DownsampleResidualVectorQuantize + _target_: fish_speech_s2.models.dac.rvq.DownsampleResidualVectorQuantize input_dim: 1024 n_codebooks: 9 codebook_size: 1024 @@ -28,12 +28,12 @@ quantizer: quantizer_dropout: 0.5 downsample_factor: [2, 2] post_module: &transformer_module - _target_: fish_speech.models.dac.modded_dac.WindowLimitedTransformer + _target_: fish_speech_s2.models.dac.modded_dac.WindowLimitedTransformer causal: true window_size: 128 # empirically this does not seem to matter input_dim: 1024 config: &transformer_config - _target_: fish_speech.models.dac.modded_dac.ModelArgs + _target_: fish_speech_s2.models.dac.modded_dac.ModelArgs block_size: 2048 n_layer: 8 n_head: 16 diff --git a/fish_speech_src/fish_speech/configs/text2semantic_finetune.yaml b/fish_speech_src/fish_speech_s2/configs/text2semantic_finetune.yaml similarity index 74% rename from fish_speech_src/fish_speech/configs/text2semantic_finetune.yaml rename to fish_speech_src/fish_speech_s2/configs/text2semantic_finetune.yaml index 00f6905..5895fbd 100644 --- a/fish_speech_src/fish_speech/configs/text2semantic_finetune.yaml +++ b/fish_speech_src/fish_speech_s2/configs/text2semantic_finetune.yaml @@ -21,12 +21,12 @@ trainer: # Dataset Configuration tokenizer: - _target_: fish_speech.tokenizer.FishTokenizer + _target_: fish_speech_s2.tokenizer.FishTokenizer model_path: ${pretrained_ckpt_path}/tokenizer.tiktoken # Dataset Configuration train_dataset: - _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset + _target_: fish_speech_s2.datasets.semantic.AutoTextSemanticInstructionIterableDataset proto_files: - data/protos tokenizer: ${tokenizer} @@ -36,7 +36,7 @@ train_dataset: interactive_prob: 0.7 val_dataset: - _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset + _target_: fish_speech_s2.datasets.semantic.AutoTextSemanticInstructionIterableDataset proto_files: - data/protos tokenizer: ${tokenizer} @@ -46,7 +46,7 @@ val_dataset: interactive_prob: 0.7 data: - _target_: fish_speech.datasets.semantic.SemanticDataModule + _target_: fish_speech_s2.datasets.semantic.SemanticDataModule train_dataset: ${train_dataset} val_dataset: ${val_dataset} num_workers: 4 @@ -56,9 +56,9 @@ data: # Model Configuration model: - _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic + _target_: fish_speech_s2.models.text2semantic.lit_module.TextToSemantic model: - _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained + _target_: fish_speech_s2.models.text2semantic.llama.BaseTransformer.from_pretrained path: ${pretrained_ckpt_path} load_weights: true max_length: ${max_length} @@ -76,7 +76,7 @@ model: _target_: torch.optim.lr_scheduler.LambdaLR _partial_: true lr_lambda: - _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda + _target_: fish_speech_s2.scheduler.get_constant_schedule_with_warmup_lr_lambda _partial_: true num_warmup_steps: 10 diff --git a/fish_speech_src/fish_speech/content_sequence.py b/fish_speech_src/fish_speech_s2/content_sequence.py similarity index 99% rename from fish_speech_src/fish_speech/content_sequence.py rename to fish_speech_src/fish_speech_s2/content_sequence.py index c29f8c0..99bf780 100644 --- a/fish_speech_src/fish_speech/content_sequence.py +++ b/fish_speech_src/fish_speech_s2/content_sequence.py @@ -4,7 +4,7 @@ import numpy as np import torch -from fish_speech.tokenizer import ( +from fish_speech_s2.tokenizer import ( IM_END_TOKEN, MODALITY_TOKENS, FishTokenizer, diff --git a/fish_speech_src/fish_speech/conversation.py b/fish_speech_src/fish_speech_s2/conversation.py similarity index 97% rename from fish_speech_src/fish_speech/conversation.py rename to fish_speech_src/fish_speech_s2/conversation.py index d0fa5c2..94979c1 100644 --- a/fish_speech_src/fish_speech/conversation.py +++ b/fish_speech_src/fish_speech_s2/conversation.py @@ -5,7 +5,7 @@ import torch from transformers import PreTrainedTokenizerFast -from fish_speech.content_sequence import ( +from fish_speech_s2.content_sequence import ( AudioPart, BasePart, ContentSequence, @@ -13,7 +13,7 @@ TextPart, VQPart, ) -from fish_speech.tokenizer import IM_END_TOKEN, IM_START_TOKEN, MODALITY_TOKENS +from fish_speech_s2.tokenizer import IM_END_TOKEN, IM_START_TOKEN, MODALITY_TOKENS @dataclass(kw_only=True) diff --git a/fish_speech_src/fish_speech/datasets/concat_repeat.py b/fish_speech_src/fish_speech_s2/datasets/concat_repeat.py similarity index 100% rename from fish_speech_src/fish_speech/datasets/concat_repeat.py rename to fish_speech_src/fish_speech_s2/datasets/concat_repeat.py diff --git a/fish_speech_src/fish_speech/datasets/protos/text-data.proto b/fish_speech_src/fish_speech_s2/datasets/protos/text-data.proto similarity index 100% rename from fish_speech_src/fish_speech/datasets/protos/text-data.proto rename to fish_speech_src/fish_speech_s2/datasets/protos/text-data.proto diff --git a/fish_speech_src/fish_speech/datasets/protos/text_data_pb2.py b/fish_speech_src/fish_speech_s2/datasets/protos/text_data_pb2.py similarity index 100% rename from fish_speech_src/fish_speech/datasets/protos/text_data_pb2.py rename to fish_speech_src/fish_speech_s2/datasets/protos/text_data_pb2.py diff --git a/fish_speech_src/fish_speech/datasets/protos/text_data_stream.py b/fish_speech_src/fish_speech_s2/datasets/protos/text_data_stream.py similarity index 100% rename from fish_speech_src/fish_speech/datasets/protos/text_data_stream.py rename to fish_speech_src/fish_speech_s2/datasets/protos/text_data_stream.py diff --git a/fish_speech_src/fish_speech/datasets/semantic.py b/fish_speech_src/fish_speech_s2/datasets/semantic.py similarity index 97% rename from fish_speech_src/fish_speech/datasets/semantic.py rename to fish_speech_src/fish_speech_s2/datasets/semantic.py index 8999e26..747078c 100644 --- a/fish_speech_src/fish_speech/datasets/semantic.py +++ b/fish_speech_src/fish_speech_s2/datasets/semantic.py @@ -15,16 +15,16 @@ from torch.distributed import get_rank, get_world_size, is_initialized from torch.utils.data import DataLoader, Dataset, IterableDataset, get_worker_info -from fish_speech.content_sequence import ContentSequence, TextPart, VQPart +from fish_speech_s2.content_sequence import ContentSequence, TextPart, VQPart CODEBOOK_PAD_TOKEN_ID = 0 -from fish_speech.datasets.protos.text_data_pb2 import SampledData -from fish_speech.datasets.protos.text_data_stream import read_pb_stream -from fish_speech.text.clean import clean_text -from fish_speech.tokenizer import FishTokenizer -from fish_speech.utils import RankedLogger -from fish_speech.utils.braceexpand import braceexpand +from fish_speech_s2.datasets.protos.text_data_pb2 import SampledData +from fish_speech_s2.datasets.protos.text_data_stream import read_pb_stream +from fish_speech_s2.text.clean import clean_text +from fish_speech_s2.tokenizer import FishTokenizer +from fish_speech_s2.utils import RankedLogger +from fish_speech_s2.utils.braceexpand import braceexpand log = RankedLogger(__name__, rank_zero_only=True) diff --git a/fish_speech_src/fish_speech/datasets/vqgan.py b/fish_speech_src/fish_speech_s2/datasets/vqgan.py similarity index 98% rename from fish_speech_src/fish_speech/datasets/vqgan.py rename to fish_speech_src/fish_speech_s2/datasets/vqgan.py index a45583d..0d91abf 100644 --- a/fish_speech_src/fish_speech/datasets/vqgan.py +++ b/fish_speech_src/fish_speech_s2/datasets/vqgan.py @@ -8,7 +8,7 @@ from lightning import LightningDataModule from torch.utils.data import DataLoader, Dataset -from fish_speech.utils import RankedLogger +from fish_speech_s2.utils import RankedLogger logger = RankedLogger(__name__, rank_zero_only=False) diff --git a/fish_speech_src/fish_speech/i18n/README.md b/fish_speech_src/fish_speech_s2/i18n/README.md similarity index 95% rename from fish_speech_src/fish_speech/i18n/README.md rename to fish_speech_src/fish_speech_s2/i18n/README.md index 700902b..d8d78a0 100644 --- a/fish_speech_src/fish_speech/i18n/README.md +++ b/fish_speech_src/fish_speech_s2/i18n/README.md @@ -2,7 +2,7 @@ The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below: -### fish_speech/i18n/core.py +### fish_speech_s2/i18n/core.py **Related code from RVC:** [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py) @@ -13,7 +13,7 @@ add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion- **Initial author:** [@L4Ph](https://github.com/L4Ph) -### fish_speech/i18n/scan.py +### fish_speech_s2/i18n/scan.py **Related code from RVC:** [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py) diff --git a/fish_speech_src/fish_speech/i18n/__init__.py b/fish_speech_src/fish_speech_s2/i18n/__init__.py similarity index 100% rename from fish_speech_src/fish_speech/i18n/__init__.py rename to fish_speech_src/fish_speech_s2/i18n/__init__.py diff --git a/fish_speech_src/fish_speech/i18n/core.py b/fish_speech_src/fish_speech_s2/i18n/core.py similarity index 100% rename from fish_speech_src/fish_speech/i18n/core.py rename to fish_speech_src/fish_speech_s2/i18n/core.py diff --git a/fish_speech_src/fish_speech/i18n/locale/en_US.json b/fish_speech_src/fish_speech_s2/i18n/locale/en_US.json similarity index 100% rename from fish_speech_src/fish_speech/i18n/locale/en_US.json rename to fish_speech_src/fish_speech_s2/i18n/locale/en_US.json diff --git a/fish_speech_src/fish_speech/i18n/locale/es_ES.json b/fish_speech_src/fish_speech_s2/i18n/locale/es_ES.json similarity index 100% rename from fish_speech_src/fish_speech/i18n/locale/es_ES.json rename to fish_speech_src/fish_speech_s2/i18n/locale/es_ES.json diff --git a/fish_speech_src/fish_speech/i18n/locale/ja_JP.json b/fish_speech_src/fish_speech_s2/i18n/locale/ja_JP.json similarity index 100% rename from fish_speech_src/fish_speech/i18n/locale/ja_JP.json rename to fish_speech_src/fish_speech_s2/i18n/locale/ja_JP.json diff --git a/fish_speech_src/fish_speech/i18n/locale/ko_KR.json b/fish_speech_src/fish_speech_s2/i18n/locale/ko_KR.json similarity index 100% rename from fish_speech_src/fish_speech/i18n/locale/ko_KR.json rename to fish_speech_src/fish_speech_s2/i18n/locale/ko_KR.json diff --git a/fish_speech_src/fish_speech/i18n/locale/pt_BR.json b/fish_speech_src/fish_speech_s2/i18n/locale/pt_BR.json similarity index 100% rename from fish_speech_src/fish_speech/i18n/locale/pt_BR.json rename to fish_speech_src/fish_speech_s2/i18n/locale/pt_BR.json diff --git a/fish_speech_src/fish_speech/i18n/locale/zh_CN.json b/fish_speech_src/fish_speech_s2/i18n/locale/zh_CN.json similarity index 100% rename from fish_speech_src/fish_speech/i18n/locale/zh_CN.json rename to fish_speech_src/fish_speech_s2/i18n/locale/zh_CN.json diff --git a/fish_speech_src/fish_speech/i18n/scan.py b/fish_speech_src/fish_speech_s2/i18n/scan.py similarity index 100% rename from fish_speech_src/fish_speech/i18n/scan.py rename to fish_speech_src/fish_speech_s2/i18n/scan.py diff --git a/fish_speech_src/fish_speech/inference_engine/__init__.py b/fish_speech_src/fish_speech_s2/inference_engine/__init__.py similarity index 93% rename from fish_speech_src/fish_speech/inference_engine/__init__.py rename to fish_speech_src/fish_speech_s2/inference_engine/__init__.py index ff34fc5..117a581 100644 --- a/fish_speech_src/fish_speech/inference_engine/__init__.py +++ b/fish_speech_src/fish_speech_s2/inference_engine/__init__.py @@ -6,17 +6,17 @@ import torch from loguru import logger -from fish_speech.inference_engine.reference_loader import ReferenceLoader -from fish_speech.inference_engine.utils import InferenceResult, wav_chunk_header -from fish_speech.inference_engine.vq_manager import VQManager -from fish_speech.models.dac.modded_dac import DAC -from fish_speech.models.text2semantic.inference import ( +from fish_speech_s2.inference_engine.reference_loader import ReferenceLoader +from fish_speech_s2.inference_engine.utils import InferenceResult, wav_chunk_header +from fish_speech_s2.inference_engine.vq_manager import VQManager +from fish_speech_s2.models.dac.modded_dac import DAC +from fish_speech_s2.models.text2semantic.inference import ( GenerateRequest, GenerateResponse, WrappedGenerateResponse, ) -from fish_speech.utils import autocast_exclude_mps, set_seed -from fish_speech.utils.schema import ServeTTSRequest +from fish_speech_s2.utils import autocast_exclude_mps, set_seed +from fish_speech_s2.utils.schema import ServeTTSRequest class TTSInferenceEngine(ReferenceLoader, VQManager): diff --git a/fish_speech_src/fish_speech/inference_engine/reference_loader.py b/fish_speech_src/fish_speech_s2/inference_engine/reference_loader.py similarity index 98% rename from fish_speech_src/fish_speech/inference_engine/reference_loader.py rename to fish_speech_src/fish_speech_s2/inference_engine/reference_loader.py index d44e4d3..70e08c4 100644 --- a/fish_speech_src/fish_speech/inference_engine/reference_loader.py +++ b/fish_speech_src/fish_speech_s2/inference_engine/reference_loader.py @@ -7,14 +7,14 @@ import torchaudio from loguru import logger -from fish_speech.models.dac.modded_dac import DAC -from fish_speech.utils.file import ( +from fish_speech_s2.models.dac.modded_dac import DAC +from fish_speech_s2.utils.file import ( AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text, ) -from fish_speech.utils.schema import ServeReferenceAudio +from fish_speech_s2.utils.schema import ServeReferenceAudio class ReferenceLoader: diff --git a/fish_speech_src/fish_speech/inference_engine/utils.py b/fish_speech_src/fish_speech_s2/inference_engine/utils.py similarity index 100% rename from fish_speech_src/fish_speech/inference_engine/utils.py rename to fish_speech_src/fish_speech_s2/inference_engine/utils.py diff --git a/fish_speech_src/fish_speech/inference_engine/vq_manager.py b/fish_speech_src/fish_speech_s2/inference_engine/vq_manager.py similarity index 97% rename from fish_speech_src/fish_speech/inference_engine/vq_manager.py rename to fish_speech_src/fish_speech_s2/inference_engine/vq_manager.py index 4481580..58db326 100644 --- a/fish_speech_src/fish_speech/inference_engine/vq_manager.py +++ b/fish_speech_src/fish_speech_s2/inference_engine/vq_manager.py @@ -3,7 +3,7 @@ import torch from loguru import logger -from fish_speech.models.dac.modded_dac import DAC +from fish_speech_s2.models.dac.modded_dac import DAC class VQManager: diff --git a/fish_speech_src/fish_speech/models/dac/__init__.py b/fish_speech_src/fish_speech_s2/models/dac/__init__.py similarity index 100% rename from fish_speech_src/fish_speech/models/dac/__init__.py rename to fish_speech_src/fish_speech_s2/models/dac/__init__.py diff --git a/fish_speech_src/fish_speech/models/dac/inference.py b/fish_speech_src/fish_speech_s2/models/dac/inference.py similarity index 98% rename from fish_speech_src/fish_speech/models/dac/inference.py rename to fish_speech_src/fish_speech_s2/models/dac/inference.py index 236312b..478c1fb 100644 --- a/fish_speech_src/fish_speech/models/dac/inference.py +++ b/fish_speech_src/fish_speech_s2/models/dac/inference.py @@ -14,7 +14,7 @@ pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) -from fish_speech.utils.file import AUDIO_EXTENSIONS +from fish_speech_s2.utils.file import AUDIO_EXTENSIONS # register eval resolver (guard against duplicate registration when other # nodes or multiple imports have already registered it) diff --git a/fish_speech_src/fish_speech/models/dac/modded_dac.py b/fish_speech_src/fish_speech_s2/models/dac/modded_dac.py similarity index 99% rename from fish_speech_src/fish_speech/models/dac/modded_dac.py rename to fish_speech_src/fish_speech_s2/models/dac/modded_dac.py index eb14b47..d5b5b59 100644 --- a/fish_speech_src/fish_speech/models/dac/modded_dac.py +++ b/fish_speech_src/fish_speech_s2/models/dac/modded_dac.py @@ -1000,7 +1000,7 @@ def forward( from omegaconf import OmegaConf # 配置路径 - config_path = "fish_speech/configs/modded_dac_vq.yaml" + config_path = "fish_speech_s2/configs/modded_dac_vq.yaml" checkpoint_path = "checkpoints/s2-pro/codec.pth" codes_path = "./output/codes_0.npy" # 你的 codes 文件路径 output_path = "reconstructed_from_codes.wav" diff --git a/fish_speech_src/fish_speech/models/dac/rvq.py b/fish_speech_src/fish_speech_s2/models/dac/rvq.py similarity index 100% rename from fish_speech_src/fish_speech/models/dac/rvq.py rename to fish_speech_src/fish_speech_s2/models/dac/rvq.py diff --git a/fish_speech_src/fish_speech/models/text2semantic/__init__.py b/fish_speech_src/fish_speech_s2/models/text2semantic/__init__.py similarity index 100% rename from fish_speech_src/fish_speech/models/text2semantic/__init__.py rename to fish_speech_src/fish_speech_s2/models/text2semantic/__init__.py diff --git a/fish_speech_src/fish_speech/models/text2semantic/inference.py b/fish_speech_src/fish_speech_s2/models/text2semantic/inference.py similarity index 99% rename from fish_speech_src/fish_speech/models/text2semantic/inference.py rename to fish_speech_src/fish_speech_s2/models/text2semantic/inference.py index a9b5d28..462cdd6 100644 --- a/fish_speech_src/fish_speech/models/text2semantic/inference.py +++ b/fish_speech_src/fish_speech_s2/models/text2semantic/inference.py @@ -16,12 +16,12 @@ from loguru import logger from tqdm import tqdm -from fish_speech.content_sequence import ( +from fish_speech_s2.content_sequence import ( TextPart, VQPart, ) -from fish_speech.conversation import Conversation, Message -from fish_speech.tokenizer import IM_END_TOKEN +from fish_speech_s2.conversation import Conversation, Message +from fish_speech_s2.tokenizer import IM_END_TOKEN os.environ["TOKENIZERS_PARALLELISM"] = "false" torch._inductor.config.coordinate_descent_tuning = True @@ -33,7 +33,7 @@ from torch.nn.attention import SDPBackend, sdpa_kernel -from fish_speech.models.text2semantic.llama import ( +from fish_speech_s2.models.text2semantic.llama import ( BaseTransformer, DualARTransformer, NaiveTransformer, diff --git a/fish_speech_src/fish_speech/models/text2semantic/lit_module.py b/fish_speech_src/fish_speech_s2/models/text2semantic/lit_module.py similarity index 98% rename from fish_speech_src/fish_speech/models/text2semantic/lit_module.py rename to fish_speech_src/fish_speech_s2/models/text2semantic/lit_module.py index a13030c..212af76 100644 --- a/fish_speech_src/fish_speech/models/text2semantic/lit_module.py +++ b/fish_speech_src/fish_speech_s2/models/text2semantic/lit_module.py @@ -5,10 +5,10 @@ import torch.nn.functional as F from lightning.pytorch.utilities.types import OptimizerLRScheduler -import fish_speech.utils as utils +import fish_speech_s2.utils as utils CODEBOOK_PAD_TOKEN_ID = 0 -from fish_speech.models.text2semantic.llama import NaiveTransformer +from fish_speech_s2.models.text2semantic.llama import NaiveTransformer log = utils.RankedLogger(__name__, rank_zero_only=True) diff --git a/fish_speech_src/fish_speech/models/text2semantic/llama.py b/fish_speech_src/fish_speech_s2/models/text2semantic/llama.py similarity index 99% rename from fish_speech_src/fish_speech/models/text2semantic/llama.py rename to fish_speech_src/fish_speech_s2/models/text2semantic/llama.py index 4583bdd..d586d33 100644 --- a/fish_speech_src/fish_speech/models/text2semantic/llama.py +++ b/fish_speech_src/fish_speech_s2/models/text2semantic/llama.py @@ -15,7 +15,7 @@ from torch.nn.attention import SDPBackend, sdpa_kernel from torch.utils.checkpoint import checkpoint -from fish_speech.models.text2semantic.lora import LoraConfig, setup_lora +from fish_speech_s2.models.text2semantic.lora import LoraConfig, setup_lora # --------------------------------------------------------------------------- @@ -702,7 +702,7 @@ def from_pretrained( Actual CUDA-side quantization occurs on the first forward pass. """ # Import wrapper locally to avoid circular dependency or global import issues - from fish_speech.tokenizer import FishTokenizer + from fish_speech_s2.tokenizer import FishTokenizer config = BaseModelArgs.from_pretrained(str(path)) if max_length is not None: diff --git a/fish_speech_src/fish_speech/models/text2semantic/lora.py b/fish_speech_src/fish_speech_s2/models/text2semantic/lora.py similarity index 100% rename from fish_speech_src/fish_speech/models/text2semantic/lora.py rename to fish_speech_src/fish_speech_s2/models/text2semantic/lora.py diff --git a/fish_speech_src/fish_speech/scheduler.py b/fish_speech_src/fish_speech_s2/scheduler.py similarity index 100% rename from fish_speech_src/fish_speech/scheduler.py rename to fish_speech_src/fish_speech_s2/scheduler.py diff --git a/fish_speech_src/fish_speech/text/__init__.py b/fish_speech_src/fish_speech_s2/text/__init__.py similarity index 100% rename from fish_speech_src/fish_speech/text/__init__.py rename to fish_speech_src/fish_speech_s2/text/__init__.py diff --git a/fish_speech_src/fish_speech/text/clean.py b/fish_speech_src/fish_speech_s2/text/clean.py similarity index 100% rename from fish_speech_src/fish_speech/text/clean.py rename to fish_speech_src/fish_speech_s2/text/clean.py diff --git a/fish_speech_src/fish_speech/tokenizer.py b/fish_speech_src/fish_speech_s2/tokenizer.py similarity index 100% rename from fish_speech_src/fish_speech/tokenizer.py rename to fish_speech_src/fish_speech_s2/tokenizer.py diff --git a/fish_speech_src/fish_speech/train.py b/fish_speech_src/fish_speech_s2/train.py similarity index 99% rename from fish_speech_src/fish_speech/train.py rename to fish_speech_src/fish_speech_s2/train.py index e693f3a..4d84f23 100644 --- a/fish_speech_src/fish_speech/train.py +++ b/fish_speech_src/fish_speech_s2/train.py @@ -27,7 +27,7 @@ # register eval resolver OmegaConf.register_new_resolver("eval", eval) -import fish_speech.utils as utils +import fish_speech_s2.utils as utils log = utils.RankedLogger(__name__, rank_zero_only=True) diff --git a/fish_speech_src/fish_speech/utils/__init__.py b/fish_speech_src/fish_speech_s2/utils/__init__.py similarity index 100% rename from fish_speech_src/fish_speech/utils/__init__.py rename to fish_speech_src/fish_speech_s2/utils/__init__.py diff --git a/fish_speech_src/fish_speech/utils/braceexpand.py b/fish_speech_src/fish_speech_s2/utils/braceexpand.py similarity index 100% rename from fish_speech_src/fish_speech/utils/braceexpand.py rename to fish_speech_src/fish_speech_s2/utils/braceexpand.py diff --git a/fish_speech_src/fish_speech/utils/context.py b/fish_speech_src/fish_speech_s2/utils/context.py similarity index 100% rename from fish_speech_src/fish_speech/utils/context.py rename to fish_speech_src/fish_speech_s2/utils/context.py diff --git a/fish_speech_src/fish_speech/utils/file.py b/fish_speech_src/fish_speech_s2/utils/file.py similarity index 100% rename from fish_speech_src/fish_speech/utils/file.py rename to fish_speech_src/fish_speech_s2/utils/file.py diff --git a/fish_speech_src/fish_speech/utils/instantiators.py b/fish_speech_src/fish_speech_s2/utils/instantiators.py similarity index 100% rename from fish_speech_src/fish_speech/utils/instantiators.py rename to fish_speech_src/fish_speech_s2/utils/instantiators.py diff --git a/fish_speech_src/fish_speech/utils/logger.py b/fish_speech_src/fish_speech_s2/utils/logger.py similarity index 100% rename from fish_speech_src/fish_speech/utils/logger.py rename to fish_speech_src/fish_speech_s2/utils/logger.py diff --git a/fish_speech_src/fish_speech/utils/logging_utils.py b/fish_speech_src/fish_speech_s2/utils/logging_utils.py similarity index 96% rename from fish_speech_src/fish_speech/utils/logging_utils.py rename to fish_speech_src/fish_speech_s2/utils/logging_utils.py index 117c8d4..48d2f00 100644 --- a/fish_speech_src/fish_speech/utils/logging_utils.py +++ b/fish_speech_src/fish_speech_s2/utils/logging_utils.py @@ -5,7 +5,7 @@ def rank_zero_only(fn): return fn -from fish_speech.utils import logger as log +from fish_speech_s2.utils import logger as log @rank_zero_only diff --git a/fish_speech_src/fish_speech/utils/rich_utils.py b/fish_speech_src/fish_speech_s2/utils/rich_utils.py similarity index 98% rename from fish_speech_src/fish_speech/utils/rich_utils.py rename to fish_speech_src/fish_speech_s2/utils/rich_utils.py index ed77672..d3e7b5d 100644 --- a/fish_speech_src/fish_speech/utils/rich_utils.py +++ b/fish_speech_src/fish_speech_s2/utils/rich_utils.py @@ -13,7 +13,7 @@ def rank_zero_only(fn): from omegaconf import DictConfig, OmegaConf, open_dict from rich.prompt import Prompt -from fish_speech.utils import logger as log +from fish_speech_s2.utils import logger as log @rank_zero_only diff --git a/fish_speech_src/fish_speech/utils/schema.py b/fish_speech_src/fish_speech_s2/utils/schema.py similarity index 98% rename from fish_speech_src/fish_speech/utils/schema.py rename to fish_speech_src/fish_speech_s2/utils/schema.py index 01d40de..1d7baa0 100644 --- a/fish_speech_src/fish_speech/utils/schema.py +++ b/fish_speech_src/fish_speech_s2/utils/schema.py @@ -9,7 +9,7 @@ from pydantic.functional_validators import SkipValidation from typing_extensions import Annotated -from fish_speech.content_sequence import TextPart, VQPart +from fish_speech_s2.content_sequence import TextPart, VQPart class ServeVQPart(BaseModel): diff --git a/fish_speech_src/fish_speech/utils/spectrogram.py b/fish_speech_src/fish_speech_s2/utils/spectrogram.py similarity index 100% rename from fish_speech_src/fish_speech/utils/spectrogram.py rename to fish_speech_src/fish_speech_s2/utils/spectrogram.py diff --git a/fish_speech_src/fish_speech/utils/utils.py b/fish_speech_src/fish_speech_s2/utils/utils.py similarity index 100% rename from fish_speech_src/fish_speech/utils/utils.py rename to fish_speech_src/fish_speech_s2/utils/utils.py diff --git a/fish_speech_src/tools/api_client.py b/fish_speech_src/tools/api_client.py index 13d87b3..ea1c95d 100644 --- a/fish_speech_src/tools/api_client.py +++ b/fish_speech_src/tools/api_client.py @@ -9,8 +9,8 @@ from pydub import AudioSegment from pydub.playback import play -from fish_speech.utils.file import audio_to_bytes, read_ref_text -from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest +from fish_speech_s2.utils.file import audio_to_bytes, read_ref_text +from fish_speech_s2.utils.schema import ServeReferenceAudio, ServeTTSRequest def parse_args(): diff --git a/fish_speech_src/tools/llama/build_dataset.py b/fish_speech_src/tools/llama/build_dataset.py index 20e2219..6c68d40 100644 --- a/fish_speech_src/tools/llama/build_dataset.py +++ b/fish_speech_src/tools/llama/build_dataset.py @@ -11,9 +11,9 @@ from loguru import logger from tqdm import tqdm -from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData -from fish_speech.datasets.protos.text_data_stream import pack_pb_stream -from fish_speech.utils.file import load_filelist +from fish_speech_s2.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData +from fish_speech_s2.datasets.protos.text_data_stream import pack_pb_stream +from fish_speech_s2.utils.file import load_filelist # To avoid CPU overload os.environ["MKL_NUM_THREADS"] = "1" diff --git a/fish_speech_src/tools/llama/eval_in_context.py b/fish_speech_src/tools/llama/eval_in_context.py index 41d6397..c6c08fd 100644 --- a/fish_speech_src/tools/llama/eval_in_context.py +++ b/fish_speech_src/tools/llama/eval_in_context.py @@ -9,8 +9,8 @@ from torch.utils.data import DataLoader -from fish_speech.datasets.semantic import AutoAugTextDataset, TextDataCollator -from fish_speech.models.text2semantic.inference import load_model +from fish_speech_s2.datasets.semantic import AutoAugTextDataset, TextDataCollator +from fish_speech_s2.models.text2semantic.inference import load_model def smooth( diff --git a/fish_speech_src/tools/llama/merge_lora.py b/fish_speech_src/tools/llama/merge_lora.py index 1080ff5..7ffda35 100644 --- a/fish_speech_src/tools/llama/merge_lora.py +++ b/fish_speech_src/tools/llama/merge_lora.py @@ -9,8 +9,8 @@ from hydra.utils import instantiate from loguru import logger -from fish_speech.models.text2semantic.llama import BaseTransformer -from fish_speech.models.text2semantic.lora import get_merged_state_dict +from fish_speech_s2.models.text2semantic.llama import BaseTransformer +from fish_speech_s2.models.text2semantic.lora import get_merged_state_dict @click.command() @@ -24,7 +24,7 @@ def merge(lora_config, base_weight, lora_weight, output): f"Merging {base_weight} and {lora_weight} into {output} with {lora_config}" ) - with initialize(version_base="1.3", config_path="../../fish_speech/configs/lora"): + with initialize(version_base="1.3", config_path="../../fish_speech_s2/configs/lora"): cfg = compose(config_name=lora_config) lora_config = instantiate(cfg) diff --git a/fish_speech_src/tools/llama/quantize.py b/fish_speech_src/tools/llama/quantize.py index c606c53..e2703f7 100644 --- a/fish_speech_src/tools/llama/quantize.py +++ b/fish_speech_src/tools/llama/quantize.py @@ -13,8 +13,8 @@ import torch.nn as nn import torch.nn.functional as F -from fish_speech.models.text2semantic.inference import load_model -from fish_speech.models.text2semantic.llama import find_multiple +from fish_speech_s2.models.text2semantic.inference import load_model +from fish_speech_s2.models.text2semantic.llama import find_multiple ##### Quantization Primitives ###### diff --git a/fish_speech_src/tools/run_webui.py b/fish_speech_src/tools/run_webui.py index 37130a0..6660361 100644 --- a/fish_speech_src/tools/run_webui.py +++ b/fish_speech_src/tools/run_webui.py @@ -8,10 +8,10 @@ pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) -from fish_speech.inference_engine import TTSInferenceEngine -from fish_speech.models.dac.inference import load_model as load_decoder_model -from fish_speech.models.text2semantic.inference import launch_thread_safe_queue -from fish_speech.utils.schema import ServeTTSRequest +from fish_speech_s2.inference_engine import TTSInferenceEngine +from fish_speech_s2.models.dac.inference import load_model as load_decoder_model +from fish_speech_s2.models.text2semantic.inference import launch_thread_safe_queue +from fish_speech_s2.utils.schema import ServeTTSRequest from tools.webui import build_app from tools.webui.inference import get_inference_wrapper diff --git a/fish_speech_src/tools/server/api_utils.py b/fish_speech_src/tools/server/api_utils.py index 4fa9526..26932f1 100644 --- a/fish_speech_src/tools/server/api_utils.py +++ b/fish_speech_src/tools/server/api_utils.py @@ -13,8 +13,8 @@ from loguru import logger from pydantic import BaseModel -from fish_speech.inference_engine import TTSInferenceEngine -from fish_speech.utils.schema import ServeTTSRequest +from fish_speech_s2.inference_engine import TTSInferenceEngine +from fish_speech_s2.utils.schema import ServeTTSRequest from tools.server.inference import inference_wrapper as inference diff --git a/fish_speech_src/tools/server/inference.py b/fish_speech_src/tools/server/inference.py index 060e24b..90bba3a 100644 --- a/fish_speech_src/tools/server/inference.py +++ b/fish_speech_src/tools/server/inference.py @@ -3,8 +3,8 @@ import numpy as np from kui.asgi import HTTPException -from fish_speech.inference_engine import TTSInferenceEngine -from fish_speech.utils.schema import ServeTTSRequest +from fish_speech_s2.inference_engine import TTSInferenceEngine +from fish_speech_s2.utils.schema import ServeTTSRequest AMPLITUDE = 32768 # Needs an explaination diff --git a/fish_speech_src/tools/server/model_manager.py b/fish_speech_src/tools/server/model_manager.py index 4ec4bdf..685757e 100644 --- a/fish_speech_src/tools/server/model_manager.py +++ b/fish_speech_src/tools/server/model_manager.py @@ -1,10 +1,10 @@ import torch from loguru import logger -from fish_speech.inference_engine import TTSInferenceEngine -from fish_speech.models.dac.inference import load_model as load_decoder_model -from fish_speech.models.text2semantic.inference import launch_thread_safe_queue -from fish_speech.utils.schema import ServeTTSRequest +from fish_speech_s2.inference_engine import TTSInferenceEngine +from fish_speech_s2.models.dac.inference import load_model as load_decoder_model +from fish_speech_s2.models.text2semantic.inference import launch_thread_safe_queue +from fish_speech_s2.utils.schema import ServeTTSRequest from tools.server.inference import inference_wrapper as inference diff --git a/fish_speech_src/tools/server/views.py b/fish_speech_src/tools/server/views.py index 5e47304..a745b66 100644 --- a/fish_speech_src/tools/server/views.py +++ b/fish_speech_src/tools/server/views.py @@ -24,7 +24,7 @@ from loguru import logger from typing_extensions import Annotated -from fish_speech.utils.schema import ( +from fish_speech_s2.utils.schema import ( AddReferenceRequest, AddReferenceResponse, DeleteReferenceResponse, diff --git a/fish_speech_src/tools/vqgan/create_train_split.py b/fish_speech_src/tools/vqgan/create_train_split.py index 977afdf..747503f 100644 --- a/fish_speech_src/tools/vqgan/create_train_split.py +++ b/fish_speech_src/tools/vqgan/create_train_split.py @@ -7,7 +7,7 @@ from pydub import AudioSegment from tqdm import tqdm -from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist +from fish_speech_s2.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist @click.command() diff --git a/fish_speech_src/tools/vqgan/extract_vq.py b/fish_speech_src/tools/vqgan/extract_vq.py index d50748c..70cc389 100644 --- a/fish_speech_src/tools/vqgan/extract_vq.py +++ b/fish_speech_src/tools/vqgan/extract_vq.py @@ -16,7 +16,7 @@ from loguru import logger from omegaconf import OmegaConf -from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist +from fish_speech_s2.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist # register eval resolver OmegaConf.register_new_resolver("eval", eval) @@ -50,7 +50,7 @@ def get_model( checkpoint_path: str = "checkpoints/openaudio-s1-mini/codec.pth", device: str | torch.device = "cuda", ): - with initialize(version_base="1.3", config_path="../../fish_speech/configs"): + with initialize(version_base="1.3", config_path="../../fish_speech_s2/configs"): cfg = compose(config_name=config_name) model = instantiate(cfg) diff --git a/fish_speech_src/tools/webui/__init__.py b/fish_speech_src/tools/webui/__init__.py index e9b9a02..52c1cd4 100644 --- a/fish_speech_src/tools/webui/__init__.py +++ b/fish_speech_src/tools/webui/__init__.py @@ -2,7 +2,7 @@ import gradio as gr -from fish_speech.i18n import i18n +from fish_speech_s2.i18n import i18n from tools.webui.variables import HEADER_MD, TEXTBOX_PLACEHOLDER diff --git a/fish_speech_src/tools/webui/inference.py b/fish_speech_src/tools/webui/inference.py index e6cd1d7..6617487 100644 --- a/fish_speech_src/tools/webui/inference.py +++ b/fish_speech_src/tools/webui/inference.py @@ -2,8 +2,8 @@ from functools import partial from typing import Any, Callable -from fish_speech.i18n import i18n -from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest +from fish_speech_s2.i18n import i18n +from fish_speech_s2.utils.schema import ServeReferenceAudio, ServeTTSRequest def inference_wrapper( diff --git a/fish_speech_src/tools/webui/variables.py b/fish_speech_src/tools/webui/variables.py index d32cc94..c72a680 100644 --- a/fish_speech_src/tools/webui/variables.py +++ b/fish_speech_src/tools/webui/variables.py @@ -1,4 +1,4 @@ -from fish_speech.i18n import i18n +from fish_speech_s2.i18n import i18n HEADER_MD = f"""# Fish Speech diff --git a/nodes/loader.py b/nodes/loader.py index fe23ed2..4e030ea 100644 --- a/nodes/loader.py +++ b/nodes/loader.py @@ -375,9 +375,9 @@ def load_engine( ) attention = "sdpa" try: - from fish_speech.models.dac.inference import load_model as load_decoder_model - from fish_speech.models.text2semantic.inference import launch_thread_safe_queue - from fish_speech.inference_engine import TTSInferenceEngine + from fish_speech_s2.models.dac.inference import load_model as load_decoder_model + from fish_speech_s2.models.text2semantic.inference import launch_thread_safe_queue + from fish_speech_s2.inference_engine import TTSInferenceEngine except ImportError as e: raise ImportError( f"fish_speech package not found: {e}\n" @@ -465,7 +465,7 @@ def _make_attention_forward(attention: str): if attention == "sdpa": def _forward(self, x, freqs_cis, mask, input_pos=None): - from fish_speech.models.text2semantic.llama import apply_rotary_emb + from fish_speech_s2.models.text2semantic.llama import apply_rotary_emb import torch.nn.functional as F bsz, seqlen, _ = x.shape q_size = self.n_head * self.head_dim @@ -497,7 +497,7 @@ def _forward(self, x, freqs_cis, mask, input_pos=None): if attention == "flash_attention": def _forward(self, x, freqs_cis, mask, input_pos=None): - from fish_speech.models.text2semantic.llama import apply_rotary_emb + from fish_speech_s2.models.text2semantic.llama import apply_rotary_emb from torch.nn.attention import SDPBackend, sdpa_kernel import torch.nn.functional as F bsz, seqlen, _ = x.shape @@ -540,7 +540,7 @@ def _forward(self, x, freqs_cis, mask, input_pos=None): ) def _forward(self, x, freqs_cis, mask, input_pos=None): - from fish_speech.models.text2semantic.llama import apply_rotary_emb + from fish_speech_s2.models.text2semantic.llama import apply_rotary_emb from sageattention import sageattn import torch.nn.functional as F bsz, seqlen, _ = x.shape @@ -586,7 +586,7 @@ def _patch_attention_class(attention: str): return None, None try: - from fish_speech.models.text2semantic.llama import Attention + from fish_speech_s2.models.text2semantic.llama import Attention except ImportError as e: logger.warning(f"Cannot patch Attention class: {e}") return None, None diff --git a/nodes/model_cache.py b/nodes/model_cache.py index 610f16b..b9fd06e 100644 --- a/nodes/model_cache.py +++ b/nodes/model_cache.py @@ -85,7 +85,7 @@ def offload_engine_to_cpu() -> None: # our offload message as soon as it finishes that job. We use a long timeout # to cover the worst case (long generation cancelled mid-way). try: - from fish_speech.models.text2semantic.inference import GenerateRequest + from fish_speech_s2.models.text2semantic.inference import GenerateRequest offload_response: queue.Queue = queue.Queue() engine.llama_queue.put( @@ -152,7 +152,7 @@ def resume_engine_to_cuda(device: str = "cuda") -> None: # --- Ask the LLaMA worker thread to move back to device --- try: - from fish_speech.models.text2semantic.inference import GenerateRequest + from fish_speech_s2.models.text2semantic.inference import GenerateRequest response_queue: queue.Queue = queue.Queue() engine.llama_queue.put( diff --git a/nodes/multi_speaker_node.py b/nodes/multi_speaker_node.py index 4cc6e16..57c8f8d 100644 --- a/nodes/multi_speaker_node.py +++ b/nodes/multi_speaker_node.py @@ -343,7 +343,7 @@ def execute( engine = _get_engine(model_path, device, precision, attention, compile_model, keep_model_loaded, offload_to_cpu) - from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest + from fish_speech_s2.utils.schema import ServeReferenceAudio, ServeTTSRequest # num_speakers is a dict from DynamicCombo: # {"num_speakers": "3", "speaker_1_audio": ..., "speaker_1_ref_text": ..., ...} @@ -565,7 +565,7 @@ def generate( engine = _get_engine(model_path, device, precision, attention, compile_model, keep_model_loaded, offload_to_cpu) - from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest + from fish_speech_s2.utils.schema import ServeReferenceAudio, ServeTTSRequest # Build per-speaker reference map (0-based index) references = {} diff --git a/nodes/multi_speaker_split_node.py b/nodes/multi_speaker_split_node.py index 619f31d..b21f2d5 100644 --- a/nodes/multi_speaker_split_node.py +++ b/nodes/multi_speaker_split_node.py @@ -206,7 +206,7 @@ def execute( engine = _get_engine(model_path, device, precision, attention, compile_model, keep_model_loaded, offload_to_cpu) - from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest + from fish_speech_s2.utils.schema import ServeReferenceAudio, ServeTTSRequest n = int(num_speakers["num_speakers"]) @@ -399,7 +399,7 @@ def generate( engine = _get_engine(model_path, device, precision, attention, compile_model, keep_model_loaded, offload_to_cpu) - from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest + from fish_speech_s2.utils.schema import ServeReferenceAudio, ServeTTSRequest references = {} missing = [] diff --git a/nodes/tts_node.py b/nodes/tts_node.py index 8532bc7..9f4223c 100644 --- a/nodes/tts_node.py +++ b/nodes/tts_node.py @@ -211,7 +211,7 @@ def generate( engine = self._get_engine(model_path, device, precision, attention, compile_model, keep_model_loaded, offload_to_cpu) - from fish_speech.utils.schema import ServeTTSRequest + from fish_speech_s2.utils.schema import ServeTTSRequest pbar = ProgressBar(3) if _PBAR else None diff --git a/nodes/voice_clone_node.py b/nodes/voice_clone_node.py index ddd5013..8c2a140 100644 --- a/nodes/voice_clone_node.py +++ b/nodes/voice_clone_node.py @@ -175,7 +175,7 @@ def generate( engine = self._get_engine(model_path, device, precision, attention, compile_model, keep_model_loaded, offload_to_cpu) - from fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest + from fish_speech_s2.utils.schema import ServeReferenceAudio, ServeTTSRequest pbar = ProgressBar(4) if _PBAR else None