siliconflow · ccndcn · Mar 30, 2026
diff --git a/README.md b/README.md
@@ -412,7 +412,7 @@ If you see `RuntimeError: Failed to create AudioDecoder ... MockDecoder() takes
 
 ### Conflicting `fish_speech` Package?
 
-If you see `ImportError: cannot import name 'AUDIO_EXTENSIONS' from 'fish_speech.utils.file'` pointing to another custom node's directory (e.g. `comfyui-mixlab-nodes`), a different node has its own `fish_speech` folder that conflicts with ours via `sys.path`. Disable the conflicting node or remove it. Do **not** pip-install `fish_speech` — it is bundled inside this node.
+If you see `ImportError: cannot import name 'AUDIO_EXTENSIONS' from 'fish_speech_s2.utils.file'` (or from `'fish_speech.utils.file'` on versions of this node from before the bundled package was renamed) pointing to another custom node's directory (e.g. `comfyui-mixlab-nodes`), a different node has its own folder with the same package name that shadows ours via `sys.path`. Disable the conflicting node or remove it. Do **not** pip-install `fish_speech` — the code is bundled inside this node as `fish_speech_s2`.
 
 ### Out of Memory?
 

diff --git a/README_ZH.md b/README_ZH.md
@@ -406,7 +406,7 @@ pip install "descript-audiotools>=0.7.2" --no-deps
 
 ### `fish_speech` 包冲突？
 
-如果看到 `ImportError: cannot import name 'AUDIO_EXTENSIONS' from 'fish_speech.utils.file'` 且路径指向其他自定义节点目录（如 `comfyui-mixlab-nodes`），说明另一个节点有自己的 `fish_speech` 文件夹，通过 `sys.path` 与本节点冲突。请禁用或删除冲突节点。**不要** 通过 pip 安装 `fish_speech` — 它已内置于本节点中。
+如果看到 `ImportError: cannot import name 'AUDIO_EXTENSIONS' from 'fish_speech_s2.utils.file'`（在内置包重命名之前的旧版本中为 `'fish_speech.utils.file'`）且路径指向其他自定义节点目录（如 `comfyui-mixlab-nodes`），说明另一个节点有同名的包文件夹，通过 `sys.path` 与本节点冲突。请禁用或删除冲突节点。**不要** 通过 pip 安装 `fish_speech` — 相关代码已以 `fish_speech_s2` 的名称内置于本节点中。
 
 ### 显存不足？
 

diff --git a/__init__.py b/__init__.py
@@ -167,7 +167,7 @@ def _ensure_fish_source() -> bool:
         sys.path.insert(0, fish_src_str)
 
     try:
-        import fish_speech.models  # noqa: F401
+        import fish_speech_s2.models  # noqa: F401
         return True
     except ImportError as e:
         logger.error(f"fish_speech not importable from {_FISH_SRC}: {e}")

diff --git a/fish_speech_src/.gitignore b/fish_speech_src/.gitignore
@@ -58,7 +58,7 @@ venv.bak/
 # Project Dependencies
 # --------------------
 .pdm-python
-/fish_speech.egg-info
+/fish_speech_s2.egg-info
 
 # Data and Model Files
 # --------------------
@@ -86,7 +86,7 @@ filelists/
 *.pkl
 *.pickle
 *.lab
-/fish_speech/text/cmudict_cache.pickle
+/fish_speech_s2/text/cmudict_cache.pickle
 
 # Cache and Temporary Files
 # --------------------------

diff --git a/fish_speech_src/docs/ar/finetune.md b/fish_speech_src/docs/ar/finetune.md
@@ -95,13 +95,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope
 أخيرًا، يمكنك بدء الضبط الدقيق عن طريق تشغيل الأمر التالي:
 
 ```bash
-python fish_speech/train.py --config-name text2semantic_finetune \
+python fish_speech_s2/train.py --config-name text2semantic_finetune \
     project=$project \
     +lora@model.model.lora_config=r_8_alpha_16
 ```
 
 !!! note "ملاحظة"
-    يمكنك تعديل معلمات التدريب مثل `batch_size`، `gradient_accumulation_steps`، وما إلى ذلك لتناسب ذاكرة وحدة معالجة الرسومات الخاصة بك عن طريق تعديل `fish_speech/configs/text2semantic_finetune.yaml`.
+    يمكنك تعديل معلمات التدريب مثل `batch_size`، `gradient_accumulation_steps`، وما إلى ذلك لتناسب ذاكرة وحدة معالجة الرسومات الخاصة بك عن طريق تعديل `fish_speech_s2/configs/text2semantic_finetune.yaml`.
 
 !!! note "ملاحظة"
     لمستخدمي Windows، يمكنك استخدام `trainer.strategy.process_group_backend=gloo` لتجنب مشكلات `nccl`.

diff --git a/fish_speech_src/docs/ar/inference.md b/fish_speech_src/docs/ar/inference.md
@@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ### 1. الحصول على رموز VQ من الصوت المرجعي
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "test.wav" \
     --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
@@ -28,7 +28,7 @@ python fish_speech/models/dac/inference.py \
 ### 2. توليد الرموز الدلالية (Semantic tokens) من النص:
 
 ```bash
-python fish_speech/models/text2semantic/inference.py \
+python fish_speech_s2/models/text2semantic/inference.py \
     --text "النص الذي تريد تحويله" \
     --prompt-text "النص المرجعي الخاص بك" \
     --prompt-tokens "fake.npy" \
@@ -47,7 +47,7 @@ python fish_speech/models/text2semantic/inference.py \
 ### 3. توليد الصوت من الرموز الدلالية:
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "codes_0.npy" \
 ```
 

diff --git a/fish_speech_src/docs/en/finetune.md b/fish_speech_src/docs/en/finetune.md
@@ -98,13 +98,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope
 Finally, you can start the fine-tuning by running the following command:
 
 ```bash
-python fish_speech/train.py --config-name text2semantic_finetune \
+python fish_speech_s2/train.py --config-name text2semantic_finetune \
     project=$project \
     +lora@model.model.lora_config=r_8_alpha_16
 ```
 
 !!! note
-    You can modify the training parameters such as `batch_size`, `gradient_accumulation_steps`, etc. to fit your GPU memory by modifying `fish_speech/configs/text2semantic_finetune.yaml`.
+    You can modify the training parameters such as `batch_size`, `gradient_accumulation_steps`, etc. to fit your GPU memory by modifying `fish_speech_s2/configs/text2semantic_finetune.yaml`.
 
 !!! note
     For Windows users, you can use `trainer.strategy.process_group_backend=gloo` to avoid `nccl` issues.

diff --git a/fish_speech_src/docs/en/inference.md b/fish_speech_src/docs/en/inference.md
@@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ### 1. Get VQ tokens from reference audio
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "test.wav" \
     --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
@@ -28,7 +28,7 @@ You should get a `fake.npy` and a `fake.wav`.
 ### 2. Generate Semantic tokens from text:
 
 ```bash
-python fish_speech/models/text2semantic/inference.py \
+python fish_speech_s2/models/text2semantic/inference.py \
     --text "The text you want to convert" \
     --prompt-text "Your reference text" \
     --prompt-tokens "fake.npy" \
@@ -47,7 +47,7 @@ This command will create a `codes_N` file in the working directory, where N is a
 ### 3. Generate vocals from semantic tokens:
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "codes_0.npy" \
 ```
 

diff --git a/fish_speech_src/docs/ja/finetune.md b/fish_speech_src/docs/ja/finetune.md
@@ -94,13 +94,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope
 最後に、次のコマンドを実行してファインチューニングを開始できます。
 
 ```bash
-python fish_speech/train.py --config-name text2semantic_finetune \
+python fish_speech_s2/train.py --config-name text2semantic_finetune \
     project=$project \
     +lora@model.model.lora_config=r_8_alpha_16
 ```
 
 !!! note
-    `fish_speech/configs/text2semantic_finetune.yaml` を変更することで、`batch_size` や `gradient_accumulation_steps` などのトレーニングパラメータをGPUメモリに合わせて変更できます。
+    `fish_speech_s2/configs/text2semantic_finetune.yaml` を変更することで、`batch_size` や `gradient_accumulation_steps` などのトレーニングパラメータをGPUメモリに合わせて変更できます。
 
 !!! note
     Windows ユーザーの場合、`trainer.strategy.process_group_backend=gloo` を使用して `nccl` の問題を回避できます。

diff --git a/fish_speech_src/docs/ja/inference.md b/fish_speech_src/docs/ja/inference.md
@@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ### 1. リファレンスオーディオから VQ トークンを取得する
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "test.wav" \
     --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
@@ -28,7 +28,7 @@ python fish_speech/models/dac/inference.py \
 ### 2. テキストから Semantic トークンを生成する：
 
 ```bash
-python fish_speech/models/text2semantic/inference.py \
+python fish_speech_s2/models/text2semantic/inference.py \
     --text "変換したいテキスト" \
     --prompt-text "リファレンステキスト" \
     --prompt-tokens "fake.npy" \
@@ -47,7 +47,7 @@ python fish_speech/models/text2semantic/inference.py \
 ### 3. セマンティックトークンから音声を生成する：
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "codes_0.npy" \
 ```
 

diff --git a/fish_speech_src/docs/ko/finetune.md b/fish_speech_src/docs/ko/finetune.md
@@ -95,13 +95,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope
 마지막으로, 다음 명령을 실행하여 미세 조정을 시작할 수 있습니다.
 
 ```bash
-python fish_speech/train.py --config-name text2semantic_finetune \
+python fish_speech_s2/train.py --config-name text2semantic_finetune \
     project=$project \
     +lora@model.model.lora_config=r_8_alpha_16
 ```
 
 !!! note
-    `fish_speech/configs/text2semantic_finetune.yaml` 파일을 수정하여 `batch_size`, `gradient_accumulation_steps` 등 훈련 매개변수를 GPU 메모리에 맞게 조정할 수 있습니다.
+    `fish_speech_s2/configs/text2semantic_finetune.yaml` 파일을 수정하여 `batch_size`, `gradient_accumulation_steps` 등 훈련 매개변수를 GPU 메모리에 맞게 조정할 수 있습니다.
 
 !!! note
     Windows 사용자의 경우, `trainer.strategy.process_group_backend=gloo`를 사용하여 `nccl` 관련 문제를 피할 수 있습니다.

diff --git a/fish_speech_src/docs/ko/inference.md b/fish_speech_src/docs/ko/inference.md
@@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ### 1. 참조 오디오에서 VQ 토큰 가져오기
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "test.wav" \
     --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
@@ -28,7 +28,7 @@ python fish_speech/models/dac/inference.py \
 ### 2. 텍스트에서 Semantic 토큰 생성:
 
 ```bash
-python fish_speech/models/text2semantic/inference.py \
+python fish_speech_s2/models/text2semantic/inference.py \
     --text "변환하려는 텍스트" \
     --prompt-text "참조 텍스트" \
     --prompt-tokens "fake.npy" \
@@ -47,7 +47,7 @@ python fish_speech/models/text2semantic/inference.py \
 ### 3. 시맨틱 토큰에서 음성 생성:
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "codes_0.npy" \
 ```
 

diff --git a/fish_speech_src/docs/pt/finetune.md b/fish_speech_src/docs/pt/finetune.md
@@ -95,13 +95,13 @@ huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/ope
 Finalmente, você pode iniciar o ajuste fino executando o seguinte comando:
 
 ```bash
-python fish_speech/train.py --config-name text2semantic_finetune \
+python fish_speech_s2/train.py --config-name text2semantic_finetune \
     project=$project \
     +lora@model.model.lora_config=r_8_alpha_16
 ```
 
 !!! note
-    Você pode modificar os parâmetros de treinamento, como `batch_size`, `gradient_accumulation_steps`, etc., para se adequar à memória da sua GPU, modificando `fish_speech/configs/text2semantic_finetune.yaml`.
+    Você pode modificar os parâmetros de treinamento, como `batch_size`, `gradient_accumulation_steps`, etc., para se adequar à memória da sua GPU, modificando `fish_speech_s2/configs/text2semantic_finetune.yaml`.
 
 !!! note
     Para usuários do Windows, você pode usar `trainer.strategy.process_group_backend=gloo` para evitar problemas com `nccl`.

diff --git a/fish_speech_src/docs/pt/inference.md b/fish_speech_src/docs/pt/inference.md
@@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ### 1. Obter tokens VQ do áudio de referência
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "test.wav" \
     --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
@@ -28,7 +28,7 @@ Você deve obter um `fake.npy` e um `fake.wav`.
 ### 2. Gerar tokens Semânticos a partir do texto:
 
 ```bash
-python fish_speech/models/text2semantic/inference.py \
+python fish_speech_s2/models/text2semantic/inference.py \
     --text "O texto que você deseja converter" \
     --prompt-text "Seu texto de referência" \
     --prompt-tokens "fake.npy" \
@@ -47,7 +47,7 @@ Este comando criará um arquivo `codes_N` no diretório de trabalho, onde N é u
 ### 3. Gerar vocais a partir de tokens semânticos:
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "codes_0.npy" \
 ```
 

diff --git a/fish_speech_src/docs/zh/finetune.md b/fish_speech_src/docs/zh/finetune.md
@@ -96,13 +96,13 @@ huggingface-cli download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 最后, 你可以运行以下命令来启动微调:
 
 ```bash
-python fish_speech/train.py --config-name text2semantic_finetune \
+python fish_speech_s2/train.py --config-name text2semantic_finetune \
     project=$project \
     +lora@model.model.lora_config=r_8_alpha_16
 ```
 
 !!! note
-    你可以通过修改 `fish_speech/configs/text2semantic_finetune.yaml` 来修改训练参数如 `batch_size`, `gradient_accumulation_steps` 等, 来适应你的显存.
+    你可以通过修改 `fish_speech_s2/configs/text2semantic_finetune.yaml` 来修改训练参数如 `batch_size`, `gradient_accumulation_steps` 等, 来适应你的显存.
 
 !!! note
     对于 Windows 用户, 你可以使用 `trainer.strategy.process_group_backend=gloo` 来避免 `nccl` 的问题.

diff --git a/fish_speech_src/docs/zh/inference.md b/fish_speech_src/docs/zh/inference.md
@@ -18,7 +18,7 @@ hf download fishaudio/s2-pro --local-dir checkpoints/s2-pro
 ### 1. 从参考音频获取 VQ tokens
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "test.wav" \
     --checkpoint-path "checkpoints/s2-pro/codec.pth"
 ```
@@ -28,7 +28,7 @@ python fish_speech/models/dac/inference.py \
 ### 2. 从文本生成 Semantic tokens：
 
 ```bash
-python fish_speech/models/text2semantic/inference.py \
+python fish_speech_s2/models/text2semantic/inference.py \
     --text "您想要转换的文本" \
     --prompt-text "您的参考文本" \
     --prompt-tokens "fake.npy" \
@@ -47,7 +47,7 @@ python fish_speech/models/text2semantic/inference.py \
 ### 3. 从语义令牌生成声音：
 
 ```bash
-python fish_speech/models/dac/inference.py \
+python fish_speech_s2/models/dac/inference.py \
     -i "codes_0.npy" \
 ```
 

diff --git a/fish_speech_src/fish_speech/configs/lora/r_8_alpha_16.yaml b/fish_speech_src/fish_speech/configs/lora/r_8_alpha_16.yaml
diff --git a/...ech_src/fish_speech/callbacks/__init__.py → ..._src/fish_speech_s2/callbacks/__init__.py b/...ech_src/fish_speech/callbacks/__init__.py → ..._src/fish_speech_s2/callbacks/__init__.py
diff --git a/...ch_src/fish_speech/callbacks/grad_norm.py → ...src/fish_speech_s2/callbacks/grad_norm.py b/...ch_src/fish_speech/callbacks/grad_norm.py → ...src/fish_speech_s2/callbacks/grad_norm.py
diff --git a/..._speech_src/fish_speech/configs/base.yaml → ...eech_src/fish_speech_s2/configs/base.yaml b/..._speech_src/fish_speech/configs/base.yaml → ...eech_src/fish_speech_s2/configs/base.yaml
@@ -53,7 +53,7 @@ callbacks:
     log_momentum: false
 
   grad_norm_monitor:
-    _target_: fish_speech.callbacks.GradNormMonitor
+    _target_: fish_speech_s2.callbacks.GradNormMonitor
     norm_type: 2
     logging_interval: step
 

diff --git a/fish_speech_src/fish_speech_s2/configs/lora/r_8_alpha_16.yaml b/fish_speech_src/fish_speech_s2/configs/lora/r_8_alpha_16.yaml
@@ -0,0 +1,4 @@
+_target_: fish_speech_s2.models.text2semantic.lora.LoraConfig
+r: 8
+lora_alpha: 16
+lora_dropout: 0.01
diff --git a/...rc/fish_speech/configs/modded_dac_vq.yaml → ...fish_speech_s2/configs/modded_dac_vq.yaml b/...rc/fish_speech/configs/modded_dac_vq.yaml → ...fish_speech_s2/configs/modded_dac_vq.yaml
@@ -1,4 +1,4 @@
-_target_: fish_speech.models.dac.modded_dac.DAC
+_target_: fish_speech_s2.models.dac.modded_dac.DAC
 # Model setup
 sample_rate: 44100
 encoder_dim: 64
@@ -8,7 +8,7 @@ decoder_rates: [8, 8, 4, 2]
 encoder_transformer_layers: [0, 0, 0, 4]
 decoder_transformer_layers: [4, 0, 0, 0]
 transformer_general_config:
-  _target_: fish_speech.models.dac.modded_dac.ModelArgs
+  _target_: fish_speech_s2.models.dac.modded_dac.ModelArgs
   _partial_: true
   block_size: 8192
   n_local_heads: -1
@@ -20,20 +20,20 @@ transformer_general_config:
   channels_first: true
 # Quantization
 quantizer:
-  _target_: fish_speech.models.dac.rvq.DownsampleResidualVectorQuantize
+  _target_: fish_speech_s2.models.dac.rvq.DownsampleResidualVectorQuantize
   input_dim: 1024
   n_codebooks: 9
   codebook_size: 1024
   codebook_dim: 8
   quantizer_dropout: 0.5
   downsample_factor: [2, 2]
   post_module: &transformer_module
-    _target_: fish_speech.models.dac.modded_dac.WindowLimitedTransformer
+    _target_: fish_speech_s2.models.dac.modded_dac.WindowLimitedTransformer
     causal: true
     window_size: 128  # empirically this does not seem to matter
     input_dim: 1024
     config: &transformer_config
-      _target_: fish_speech.models.dac.modded_dac.ModelArgs
+      _target_: fish_speech_s2.models.dac.modded_dac.ModelArgs
       block_size: 2048
       n_layer: 8
       n_head: 16