# From 018d8b074ea286f32229fd331e5aa4083061bf96 Mon Sep 17 00:00:00 2001
# From: Muhammad Saad Habib <116092271+Saadidream@users.noreply.github.com>
# Date: Sun, 20 Apr 2025 22:58:39 +0500
# Subject: [PATCH 1/4] Create style_transfer.py
# ---
#  encodec/modules/style_transfer.py | 149 ++++++++++++++++++++++++++++++
#  1 file changed, 149 insertions(+)
#  create mode 100644 encodec/modules/style_transfer.py
#
# diff --git a/encodec/modules/style_transfer.py b/encodec/modules/style_transfer.py
# new file mode 100644
# index 0000000..ffcae6d
# --- /dev/null
# +++ b/encodec/modules/style_transfer.py
# @@ -0,0 +1,149 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple


def _ensure_batched(x: torch.Tensor) -> Tuple[torch.Tensor, bool]:
    """Give a ``[channels, time]`` tensor a leading batch axis.

    Returns the (possibly reshaped) tensor and a flag telling the caller
    whether a batch axis was added, so it can be removed from the result.
    Tensors of any other rank are passed through untouched; the layers that
    consume them report rank errors exactly as before.
    """
    if x.dim() == 2:
        return x.unsqueeze(0), True
    return x, False


class StyleEncoder(nn.Module):
    """Encoder network for extracting a style vector from audio.

    Three strided 1-D convolutions are followed by multi-head self-attention
    over the time axis; the attended frames are averaged over time and
    projected to ``style_dim``.

    Args:
        input_channels: Number of channels in the input waveform.
        style_dim: Size of the returned style vector.
    """
    def __init__(self, input_channels: int = 1, style_dim: int = 256):
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv1d(input_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
        )

        # batch_first is left at its default (False), hence the permute in forward().
        self.attention = nn.MultiheadAttention(256, num_heads=8)
        self.style_projection = nn.Linear(256, style_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` into a style vector.

        Args:
            x: Waveform of shape ``[batch, channels, time]``, or
                ``[channels, time]`` for a single unbatched clip (the layout
                ``torchaudio.load`` produces).

        Returns:
            ``[batch, style_dim]``, or ``[style_dim]`` for unbatched input.
        """
        x, unbatched = _ensure_batched(x)
        features = self.conv_layers(x)

        # Reshape for attention
        features = features.permute(2, 0, 1)  # [time, batch, channels]
        features, _ = self.attention(features, features, features)

        # Global average pooling over time
        style = features.mean(dim=0)  # [batch, channels]
        style = self.style_projection(style)
        return style.squeeze(0) if unbatched else style


class ContentEncoder(nn.Module):
    """Encoder network for extracting a content vector from audio.

    Three strided 1-D convolutions, each followed by instance normalisation,
    are averaged over time and projected to ``content_dim``.

    NOTE(review): the average over time collapses the whole clip to one
    vector, so no temporal structure of the content survives this encoder.

    Args:
        input_channels: Number of channels in the input waveform.
        content_dim: Size of the returned content vector.
    """
    def __init__(self, input_channels: int = 1, content_dim: int = 256):
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv1d(input_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.InstanceNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.InstanceNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.InstanceNorm1d(256),
            nn.ReLU(),
        )

        self.content_projection = nn.Linear(256, content_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` into a content vector.

        Args:
            x: Waveform of shape ``[batch, channels, time]``, or
                ``[channels, time]`` for a single unbatched clip.

        Returns:
            ``[batch, content_dim]``, or ``[content_dim]`` for unbatched input.
        """
        x, unbatched = _ensure_batched(x)
        features = self.conv_layers(x)
        # Global average pooling over time
        content = features.mean(dim=2)  # [batch, channels]
        content = self.content_projection(content)
        return content.squeeze(0) if unbatched else content


class AudioDecoder(nn.Module):
    """Decoder network for generating audio from content and style vectors.

    The two vectors are concatenated, passed through a two-layer MLP, given a
    time axis of length 1 and upsampled by four transposed convolutions
    ending in ``tanh``.

    NOTE(review): the time axis grows 1 -> 2 -> 4 -> 8 -> 15, so the output
    always has 15 time steps regardless of the length of the audio that was
    encoded. Producing audio as long as the content clip requires keeping a
    time axis in the content features, which is an architecture change.

    Args:
        content_dim: Size of the content vector.
        style_dim: Size of the style vector.
        output_channels: Number of channels in the generated waveform.
    """
    def __init__(self, content_dim: int = 256, style_dim: int = 256, output_channels: int = 1):
        super().__init__()
        self.fusion = nn.Sequential(
            nn.Linear(content_dim + style_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
        )

        self.deconv_layers = nn.Sequential(
            nn.ConvTranspose1d(1024, 512, kernel_size=4, stride=2, padding=1),
            nn.InstanceNorm1d(512),
            nn.ReLU(),
            nn.ConvTranspose1d(512, 256, kernel_size=4, stride=2, padding=1),
            nn.InstanceNorm1d(256),
            nn.ReLU(),
            nn.ConvTranspose1d(256, 128, kernel_size=4, stride=2, padding=1),
            nn.InstanceNorm1d(128),
            nn.ReLU(),
            nn.ConvTranspose1d(128, output_channels, kernel_size=7, stride=2, padding=3),
            nn.Tanh(),
        )

    def forward(self, content: torch.Tensor, style: torch.Tensor) -> torch.Tensor:
        """Decode a waveform from ``content`` and ``style``.

        Args:
            content: ``[batch, content_dim]``, or ``[content_dim]`` unbatched.
            style: ``[batch, style_dim]``, or ``[style_dim]`` unbatched.

        Returns:
            ``[batch, output_channels, 15]`` with values in ``[-1, 1]``, or
            ``[output_channels, 15]`` for unbatched input.
        """
        unbatched = content.dim() == 1
        if unbatched:
            content = content.unsqueeze(0)
            style = style.unsqueeze(0)

        # Combine content and style
        combined = torch.cat([content, style], dim=1)
        features = self.fusion(combined)

        # Reshape for deconvolution
        features = features.unsqueeze(2)  # Add time dimension
        audio = self.deconv_layers(features)
        return audio.squeeze(0) if unbatched else audio


class NeuralAudioStyleTransfer(nn.Module):
    """Complete neural audio style transfer model.

    Bundles a :class:`ContentEncoder`, a :class:`StyleEncoder` and an
    :class:`AudioDecoder` that emits ``input_channels`` channels.

    Args:
        input_channels: Number of channels of the input and generated audio.
        content_dim: Size of the content vector.
        style_dim: Size of the style vector.
    """
    def __init__(self, input_channels: int = 1, content_dim: int = 256, style_dim: int = 256):
        super().__init__()
        self.content_encoder = ContentEncoder(input_channels, content_dim)
        self.style_encoder = StyleEncoder(input_channels, style_dim)
        self.decoder = AudioDecoder(content_dim, style_dim, input_channels)

    def encode_content(self, x: torch.Tensor) -> torch.Tensor:
        """Extract content features from input audio."""
        return self.content_encoder(x)

    def encode_style(self, x: torch.Tensor) -> torch.Tensor:
        """Extract style features from reference audio."""
        return self.style_encoder(x)

    def forward(self, content_audio: torch.Tensor, style_audio: torch.Tensor) -> torch.Tensor:
        """Transfer style from style_audio to content_audio.

        Both inputs are ``[batch, channels, time]`` or both are unbatched
        ``[channels, time]``; the two clips may differ in length. The result
        is batched or unbatched to match the inputs.
        """
        content_features = self.encode_content(content_audio)
        style_features = self.encode_style(style_audio)
        return self.decoder(content_features, style_features)

    def compute_style_loss(self, generated: torch.Tensor, style: torch.Tensor) -> torch.Tensor:
        """Return the MSE between the style vectors of ``generated`` and ``style``.

        Both vectors come from this model's own style encoder and neither is
        detached, so gradients flow through both.
        """
        gen_features = self.style_encoder(generated)
        style_features = self.style_encoder(style)
        return F.mse_loss(gen_features, style_features)

    def compute_content_loss(self, generated: torch.Tensor, content: torch.Tensor) -> torch.Tensor:
        """Return the MSE between the content vectors of ``generated`` and ``content``.

        Both vectors come from this model's own content encoder and neither
        is detached, so gradients flow through both.
        """
        gen_features = self.content_encoder(generated)
        content_features = self.content_encoder(content)
        return F.mse_loss(gen_features, content_features)


# From d52b9c36fc09cd79708db3ba69e9fa68ed05e77a Mon Sep 17 00:00:00 2001
# From: Muhammad Saad Habib <116092271+Saadidream@users.noreply.github.com>
# Date: Sun, 20 Apr 2025 23:00:04 +0500
# Subject: [PATCH 2/4] Create train_style_transfer.py
# ---
#  encodec/train_style_transfer.py | 144 ++++++++++++++++++++++++++++++++
#  1 file changed, 144 insertions(+)
#  create mode 100644 encodec/train_style_transfer.py
#
# diff --git a/encodec/train_style_transfer.py
# b/encodec/train_style_transfer.py
# new file mode 100644
# index 0000000..eccd387
# --- /dev/null
# +++ b/encodec/train_style_transfer.py
# @@ -0,0 +1,144 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
import argparse
from pathlib import Path
from typing import List, Optional, Tuple
import numpy as np

from .modules.style_transfer import NeuralAudioStyleTransfer


class AudioStyleDataset(Dataset):
    """Dataset pairing each content clip with a randomly chosen style clip.

    Args:
        content_dir: Directory whose ``*.wav`` files are the content clips.
        style_dir: Directory whose ``*.wav`` files are the style references.
        sample_rate: Clips with a different rate are resampled to this rate.
        segment_length: When given, every clip is randomly cropped or
            zero-padded on the right to exactly this many samples so that the
            default ``DataLoader`` collation can stack them. ``None`` returns
            clips at their full length.
        mono: When true, clips with more than one channel are averaged down
            to a single channel.

    Raises:
        ValueError: If either directory contains no ``*.wav`` file.
    """
    def __init__(
        self,
        content_dir: Path,
        style_dir: Path,
        sample_rate: int = 16000,
        segment_length: Optional[int] = None,
        mono: bool = False,
    ):
        # Sorted so that an index refers to the same file on every run.
        self.content_files: List[Path] = sorted(content_dir.glob("*.wav"))
        self.style_files: List[Path] = sorted(style_dir.glob("*.wav"))
        if not self.content_files:
            raise ValueError(f"No .wav files found in content directory: {content_dir}")
        if not self.style_files:
            raise ValueError(f"No .wav files found in style directory: {style_dir}")
        self.sample_rate = sample_rate
        self.segment_length = segment_length
        self.mono = mono

    def __len__(self) -> int:
        return len(self.content_files)

    def _load_clip(self, path: Path) -> torch.Tensor:
        """Load ``path`` and apply resampling, downmixing and length fitting."""
        audio, sr = torchaudio.load(path)
        if sr != self.sample_rate:
            audio = torchaudio.transforms.Resample(sr, self.sample_rate)(audio)
        if self.mono and audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)
        if self.segment_length is not None:
            audio = self._fit_length(audio, self.segment_length)
        return audio

    @staticmethod
    def _fit_length(audio: torch.Tensor, length: int) -> torch.Tensor:
        """Randomly crop or right-pad ``audio`` with zeros to ``length`` samples."""
        available = audio.shape[-1]
        if available > length:
            start = torch.randint(available - length + 1, (1,)).item()
            return audio[..., start:start + length]
        if available < length:
            return nn.functional.pad(audio, (0, length - available))
        return audio

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(content_audio, style_audio)`` for content clip ``idx``."""
        content_audio = self._load_clip(self.content_files[idx])

        # Pick the style clip with torch's RNG, which DataLoader seeds separately
        # for each worker process.
        style_idx = torch.randint(len(self.style_files), (1,)).item()
        style_audio = self._load_clip(self.style_files[style_idx])

        return content_audio, style_audio


def train(args):
    """Train a :class:`NeuralAudioStyleTransfer` model and write checkpoints.

    Args:
        args: Namespace with ``content_dir``, ``style_dir``, ``sample_rate``,
            ``batch_size``, ``num_workers``, ``epochs``, ``learning_rate``,
            ``style_weight``, ``content_weight``, ``reconstruction_weight``,
            ``log_interval``, ``save_interval`` and ``checkpoint_dir``.
            ``segment_length`` is optional and defaults to ``None``.

    Raises:
        ValueError: If the generated audio and the content audio differ in
            shape, because the reconstruction loss compares them element-wise.
    """
    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = NeuralAudioStyleTransfer().to(device)

    # Initialize optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # Create dataset and dataloader. The model above is built with its default
    # single input channel, so clips are downmixed to mono.
    dataset = AudioStyleDataset(
        content_dir=Path(args.content_dir),
        style_dir=Path(args.style_dir),
        sample_rate=args.sample_rate,
        segment_length=getattr(args, "segment_length", None),
        mono=True,
    )
    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers
    )

    reconstruction_criterion = nn.L1Loss()

    # Created here as well so that train() works when called without main().
    checkpoint_dir = Path(args.checkpoint_dir)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Training loop
    for epoch in range(args.epochs):
        model.train()
        total_loss = 0

        for batch_idx, (content_audio, style_audio) in enumerate(dataloader):
            content_audio = content_audio.to(device)
            style_audio = style_audio.to(device)

            # Forward pass
            generated_audio = model(content_audio, style_audio)

            # Fail with an actionable message instead of a broadcasting error
            # from inside the loss.
            if generated_audio.shape != content_audio.shape:
                raise ValueError(
                    "Reconstruction loss needs generated and content audio of equal shape, "
                    f"got {tuple(generated_audio.shape)} and {tuple(content_audio.shape)}"
                )

            # Compute losses
            style_loss = model.compute_style_loss(generated_audio, style_audio)
            content_loss = model.compute_content_loss(generated_audio, content_audio)
            reconstruction_loss = reconstruction_criterion(generated_audio, content_audio)

            # Total loss
            loss = (
                args.style_weight * style_loss +
                args.content_weight * content_loss +
                args.reconstruction_weight * reconstruction_loss
            )

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if batch_idx % args.log_interval == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch} completed. Average Loss: {avg_loss:.4f}")

        # Save checkpoint
        if (epoch + 1) % args.save_interval == 0:
            checkpoint_path = checkpoint_dir / f"model_epoch_{epoch+1}.pt"
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
            }, checkpoint_path)


def main():
    """Parse the command line and run :func:`train`."""
    parser = argparse.ArgumentParser(description="Train Neural Audio Style Transfer model")

    # Data parameters
    parser.add_argument("--content-dir", type=str, required=True, help="Directory containing content audio files")
    parser.add_argument("--style-dir", type=str, required=True, help="Directory containing style audio files")
    parser.add_argument("--sample-rate", type=int, default=16000, help="Audio sample rate")
    parser.add_argument("--segment-length", type=int, default=None,
                        help="Crop or pad every clip to this many samples (default: use full clips)")

    # Training parameters
    parser.add_argument("--batch-size", type=int, default=32, help="Batch size")
    parser.add_argument("--epochs", type=int, default=100, help="Number of epochs")
    parser.add_argument("--learning-rate", type=float, default=0.001, help="Learning rate")
    parser.add_argument("--num-workers", type=int, default=4, help="Number of dataloader workers")

    # Loss weights
    parser.add_argument("--style-weight", type=float, default=1.0, help="Weight for style loss")
    parser.add_argument("--content-weight", type=float, default=1.0, help="Weight for content loss")
    parser.add_argument("--reconstruction-weight", type=float, default=1.0, help="Weight for reconstruction loss")

    # Logging and saving
    parser.add_argument("--log-interval", type=int, default=10, help="Logging interval in batches")
    parser.add_argument("--save-interval", type=int, default=5, help="Checkpoint saving interval in epochs")
    parser.add_argument("--checkpoint-dir", type=str, default="checkpoints", help="Directory to save checkpoints")

    args = parser.parse_args()

    # Create checkpoint directory
    Path(args.checkpoint_dir).mkdir(parents=True, exist_ok=True)

    train(args)


if __name__ == "__main__":
    main()
# main()   <- last line of encodec/train_style_transfer.py (body of its __main__ guard)
#
# From 4f31b0a3906786fc6a5271a4ed5d11742a089d43 Mon Sep 17 00:00:00 2001
# From: Muhammad Saad Habib <116092271+Saadidream@users.noreply.github.com>
# Date: Sun, 20 Apr 2025 23:00:54 +0500
# Subject: [PATCH 3/4] Create apply_style_transfer.py
# ---
#  encodec/apply_style_transfer.py | 93 +++++++++++++++++++++++++++++++++
#  1 file changed, 93 insertions(+)
#  create mode 100644 encodec/apply_style_transfer.py
#
# diff --git a/encodec/apply_style_transfer.py b/encodec/apply_style_transfer.py
# new file mode 100644
# index 0000000..2470976
# --- /dev/null
# +++ b/encodec/apply_style_transfer.py
# @@ -0,0 +1,93 @@

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torchaudio
import argparse
from pathlib import Path
from typing import Optional

from .modules.style_transfer import NeuralAudioStyleTransfer


def _load_audio(path: Path, sample_rate: int, mono: bool) -> torch.Tensor:
    """Load ``path`` as ``[channels, time]``, resampled and optionally downmixed."""
    audio, sr = torchaudio.load(path)
    if sr != sample_rate:
        audio = torchaudio.transforms.Resample(sr, sample_rate)(audio)
    if mono and audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)
    return audio


def apply_style_transfer(
    model: NeuralAudioStyleTransfer,
    content_path: Path,
    style_path: Path,
    output_path: Path,
    sample_rate: int = 16000,
    device: Optional[str] = None,
    mono: bool = True
) -> None:
    """Apply style transfer to an audio file.

    The model is moved to ``device`` and put in eval mode; both changes remain
    in effect on the caller's model object after the call.

    Args:
        model: Trained style transfer model
        content_path: Path to content audio file
        style_path: Path to style reference audio file
        output_path: Path to save the output audio (16-bit signed PCM)
        sample_rate: Target sample rate
        device: Device to run inference on; CUDA when available, else CPU,
            if left as ``None``
        mono: Average multi-channel files down to one channel, which is what
            a model built with the default ``input_channels=1`` expects
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # torchaudio returns [channels, time]; the model works on batches, so add
    # a leading batch axis of size 1.
    content_audio = _load_audio(content_path, sample_rate, mono).unsqueeze(0).to(device)
    style_audio = _load_audio(style_path, sample_rate, mono).unsqueeze(0).to(device)

    # The inputs were just moved to `device`; the model has to live there too.
    model = model.to(device)

    # Apply style transfer
    model.eval()
    with torch.no_grad():
        generated_audio = model(content_audio, style_audio)

    # Save output; torchaudio.save takes [channels, time], so drop the batch axis.
    torchaudio.save(
        output_path,
        generated_audio.squeeze(0).cpu(),
        sample_rate,
        encoding='PCM_S',
        bits_per_sample=16
    )


def main():
    """Parse the command line, load a checkpoint and run :func:`apply_style_transfer`."""
    parser = argparse.ArgumentParser(description="Apply Neural Audio Style Transfer")

    parser.add_argument("--model-path", type=str, required=True, help="Path to trained model checkpoint")
    parser.add_argument("--content-path", type=str, required=True, help="Path to content audio file")
    parser.add_argument("--style-path", type=str, required=True, help="Path to style reference audio file")
    parser.add_argument("--output-path", type=str, required=True, help="Path to save output audio")
    parser.add_argument("--sample-rate", type=int, default=16000, help="Target sample rate")
    parser.add_argument("--device", type=str, default=None, help="Device to run inference on")

    args = parser.parse_args()

    # Resolve the device once so the checkpoint, the model and the audio agree.
    device = args.device or ("cuda" if torch.cuda.is_available() else "cpu")

    # Load model
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(args.model_path, map_location=device)
    model = NeuralAudioStyleTransfer()
    model.load_state_dict(checkpoint['model_state_dict'])
    model = model.to(device)

    # Apply style transfer
    apply_style_transfer(
        model=model,
        content_path=Path(args.content_path),
        style_path=Path(args.style_path),
        output_path=Path(args.output_path),
        sample_rate=args.sample_rate,
        device=device
    )


if __name__ == "__main__":
    main()


# From 83de7b623313fa1a3ce17f7479ef923c868e538c Mon Sep 17 00:00:00 2001
# From: Muhammad Saad Habib <116092271+Saadidream@users.noreply.github.com>
# Date: Sun, 20 Apr 2025 23:05:40 +0500
# Subject: [PATCH 4/4] Update README.md
# ---
#  README.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
#  1 file changed, 77 insertions(+)
#
# diff --git a/README.md b/README.md
# index 05e90ee..3acc2bf 100644
# --- a/README.md
# +++ b/README.md
# @@ -186,6 +186,83 @@ Note that the 48 kHz model processes the audio by chunks of 1 seconds, with an o and renormalizes the audio to have unit scale. For this model, the output of `model.encode(wav)` would a list (for each frame of 1 second) of a tuple `(codes, scale)` with `scale` a scalar tensor.
#
# +## Audio Style Transfer
# +
# +EnCodec now includes a neural audio style transfer module that enables transferring audio styles while maintaining content structure. This feature allows you to transform the style of one audio while preserving the content of another.
+ +### Features +- Transfer voice styles between different speakers +- Maintain content while changing audio characteristics +- Configurable number of audio channels via `input_channels` (the provided scripts build the default mono model) +- GPU acceleration for faster processing +- Compatible with existing EnCodec compression pipeline + +### Usage + +#### Training +```bash +python -m encodec.train_style_transfer \ + --content-dir /path/to/content/audio \ + --style-dir /path/to/style/audio \ + --checkpoint-dir checkpoints +``` + +#### Inference +```bash +python -m encodec.apply_style_transfer \ + --model-path checkpoints/model_epoch_100.pt \ + --content-path input.wav \ + --style-path style.wav \ + --output-path output.wav +``` + +### Example +```python +import torch +import torchaudio + +from encodec.modules.style_transfer import NeuralAudioStyleTransfer +from encodec.utils import convert_audio + +# Settings the model was trained with (defaults of `encodec.train_style_transfer`) +sample_rate = 16000 +channels = 1 + +# Load model (training checkpoints store the weights under `model_state_dict`) +model = NeuralAudioStyleTransfer(input_channels=channels) +checkpoint = torch.load("checkpoints/model_epoch_100.pt", map_location="cpu") +model.load_state_dict(checkpoint["model_state_dict"]) +model.eval() + +# Load content and style audio +content_audio, content_sr = torchaudio.load("content.wav") +style_audio, style_sr = torchaudio.load("style.wav") + +# Convert each clip from its own sample rate to the training sample rate and channel count +content_audio = convert_audio(content_audio, content_sr, sample_rate, channels) +style_audio = convert_audio(style_audio, style_sr, sample_rate, channels) + +# Apply style transfer (the model expects a leading batch dimension) +with torch.no_grad(): + output_audio = model(content_audio.unsqueeze(0), style_audio.unsqueeze(0)) + +# Save result (drop the batch dimension again) +torchaudio.save("output.wav", output_audio.squeeze(0), sample_rate) +``` + +### Parameters +- `--content-dir`: Directory containing content audio files +- `--style-dir`: Directory containing style reference audio files +- `--sample-rate`: Target sample rate (default: 16000) +- `--batch-size`: Training batch size (default: 32) +- `--epochs`: Number of training epochs (default: 100) +- `--learning-rate`: Learning rate (default: 0.001) + +### Integration with Compression +The style transfer module can be used in conjunction with EnCodec's compression pipeline: + +```python +# First apply style transfer +output_audio = 
model(content_audio, style_audio) + +# Then compress the result +model = EncodecModel.encodec_model_24khz() +model.set_target_bandwidth(6.0) +compressed = model.encode(output_audio) +``` + ## Installation for development This will install the dependencies and a `encodec` in developer mode (changes to the files