From 8c5d821d55070a45b307d11531c65ed6592fddd4 Mon Sep 17 00:00:00 2001
From: Mr-Neutr0n <64578610+Mr-Neutr0n@users.noreply.github.com>
Date: Sat, 7 Feb 2026 02:33:28 +0530
Subject: [PATCH] Fix channel dimension mismatch in DiscriminatorSTFT

The input channels for the second convolution layer should match
the output channels of the first convolution (self.filters), not
the scaled value.

The first convolution outputs self.filters channels, so the second
convolution must expect that many input channels. The previous value,
min(filters_scale * self.filters, max_filters), caused a dimension
mismatch when filters_scale > 1.

Fixes #93
---
 encodec/msstftd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encodec/msstftd.py b/encodec/msstftd.py
index a1d3242..d916cde 100644
--- a/encodec/msstftd.py
+++ b/encodec/msstftd.py
@@ -67,7 +67,7 @@ def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1,
         self.convs.append(
             NormConv2d(spec_channels, self.filters, kernel_size=kernel_size, padding=get_2d_padding(kernel_size))
         )
-        in_chs = min(filters_scale * self.filters, max_filters)
+        in_chs = self.filters
         for i, dilation in enumerate(dilations):
             out_chs = min((filters_scale ** (i + 1)) * self.filters, max_filters)
             self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=kernel_size, stride=stride,