From 8c5d821d55070a45b307d11531c65ed6592fddd4 Mon Sep 17 00:00:00 2001 From: Mr-Neutr0n <64578610+Mr-Neutr0n@users.noreply.github.com> Date: Sat, 7 Feb 2026 02:33:28 +0530 Subject: [PATCH] Fix channel dimension mismatch in DiscriminatorSTFT The input channels for the second convolution layer should match the output channels of the first convolution (self.filters), not the scaled value. The first convolution outputs self.filters channels, so the second convolution should expect that many input channels, not min(filters_scale * self.filters, max_filters), which caused a dimension mismatch whenever that value differed from self.filters (e.g. when filters_scale > 1). Fixes #93 --- encodec/msstftd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encodec/msstftd.py b/encodec/msstftd.py index a1d3242..d916cde 100644 --- a/encodec/msstftd.py +++ b/encodec/msstftd.py @@ -67,7 +67,7 @@ def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1, self.convs.append( NormConv2d(spec_channels, self.filters, kernel_size=kernel_size, padding=get_2d_padding(kernel_size)) ) - in_chs = min(filters_scale * self.filters, max_filters) + in_chs = self.filters for i, dilation in enumerate(dilations): out_chs = min((filters_scale ** (i + 1)) * self.filters, max_filters) self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=kernel_size, stride=stride,