diff --git a/CMakeLists.txt b/CMakeLists.txt index dfb23d5..c73eada 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ include(FetchContent) FetchContent_Declare( signalsmith-linear GIT_REPOSITORY https://github.com/Signalsmith-Audio/linear.git - GIT_TAG 0.2.3 + GIT_TAG 0.3.0 GIT_SHALLOW ON ) FetchContent_MakeAvailable(signalsmith-linear) diff --git a/cmd/Makefile b/cmd/Makefile index 6887a65..9f0803f 100644 --- a/cmd/Makefile +++ b/cmd/Makefile @@ -1,5 +1,15 @@ all: out/stretch +DEV_FLAGS := --semitones=4 --time=0.667 --asymmetry=0.5 + +dev: out/stretch + ./out/stretch inputs/dev.wav out/dev-2048.wav --process-chunk=2048 $(DEV_FLAGS) + ./out/stretch inputs/dev.wav out/dev-512.wav --process-chunk=512 $(DEV_FLAGS) + ./out/stretch inputs/dev.wav out/dev-100.wav --process-chunk=100 $(DEV_FLAGS) + ./out/stretch inputs/dev.wav out/dev-2048-sc.wav --process-chunk=2048 --split-computation $(DEV_FLAGS) + ./out/stretch inputs/dev.wav out/dev-512-sc.wav --process-chunk=512 --split-computation $(DEV_FLAGS) + ./out/stretch inputs/dev.wav out/dev-100-sc.wav --process-chunk=100 --split-computation $(DEV_FLAGS) + out/stretch: main.cpp ../signalsmith-stretch.h util/*.h util/*.hxx mkdir -p out g++ -std=c++11 -O3 -g \ @@ -27,13 +37,6 @@ examples: out/stretch TEST_WAV ?= "inputs/voice.wav" -dev: out/stretch - out/stretch --time=0.8 --semitones=10 $(TEST_WAV) out/shift.wav - out/stretch --time=0.8 --semitones=10 --formant-comp $(TEST_WAV) out/shift-fc.wav - out/stretch --time=0.8 --semitones=10 --formant-comp --formant=3 $(TEST_WAV) out/shift-fc-f3.wav - out/stretch --time=0.8 --semitones=10 --formant-comp --formant=3 --formant-base=500 $(TEST_WAV) out/shift-fc-f3-fb500.wav - out/stretch --time=0.8 --semitones=10 --formant-comp --formant=2 --formant-base=100 $(TEST_WAV) out/shift-fc-f2-fb100.wav - clean: rm -rf out diff --git a/cmd/main.cpp b/cmd/main.cpp index 0ff29ea..a875212 100644 --- a/cmd/main.cpp +++ b/cmd/main.cpp @@ -7,6 +7,9 @@ using SignalsmithStretch = signalsmith::stretch::SignalsmithStretch; #include "./util/simple-args.h" #include "./util/wav.h" +#include "./util/stopwatch.h" + +#include "plot/plot.h" int main(int argc, char* argv[]) { SimpleArgs args(argc, argv); @@ -19,13 +22,15 @@ int main(int argc, char* argv[]) { std::string inputWav = args.arg("input.wav", "16-bit WAV file"); std::string outputWav = args.arg("output.wav", "output WAV file"); + double time = args.flag("time", "time-stretch factor", 1); double semitones = args.flag("semitones", "pitch-shift amount", 0); double formants = args.flag("formant", "formant-shift amount (semitones)", 0); bool formantComp = args.hasFlag("formant-comp", "formant compensation"); double formantBase = args.flag("formant-base", "formant base frequency (Hz, 0=auto)", 100); double tonality = args.flag("tonality", "tonality limit (Hz)", 8000); - double time = args.flag("time", "time-stretch factor", 1); + double asymmetry = args.flag("asymmetry", "asymmetrical STFT analysis (0-1)", 0); bool splitComputation = args.hasFlag("split-computation", "distributes the computation more evenly (but higher latency)"); + int processChunkSize = args.flag("process-chunk", "process chunk size in samples", -1); args.errorExit(); // exits on error, or with `--help` std::cout << inputWav << " -> " << outputWav << "\n"; @@ -42,7 +47,7 @@ int main(int argc, char* argv[]) { outWav.resize(outputLength); SignalsmithStretch stretch; - stretch.presetDefault(int(inWav.channels), inWav.sampleRate, splitComputation); + stretch.configure(int(inWav.channels), inWav.sampleRate*0.12, inWav.sampleRate*0.03, splitComputation, asymmetry); stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate); stretch.setFormantSemitones(formants, formantComp); stretch.setFormantBase(formantBase/inWav.sampleRate); @@ -56,30 +61,62 @@ int main(int argc, char* argv[]) { // First, an "output seek", where we provide a chunk of input. // This is suitable for starting playback of a sample at a given playback rate. auto seekLength = stretch.outputSeekLength(1/time); + signalsmith::Stopwatch stopwatch; stretch.outputSeek(inWav, seekLength); + double seekTime = stopwatch.seconds(stopwatch.lap()); // At this point, the next output samples we get will correspond to the beginning of the audio file. // We're going to process until *just* before the end of the audio file (so we can get a tidier end using `.flush()`. - int outputIndex = outputLength - stretch.intervalSamples(); + int outputMainBlockLength = outputLength - stretch.intervalSamples(); + // And this is how much input we'll need for that + int inputMainBlockLength = outputMainBlockLength/time; - // Stretch's internal output position is slightly ahead of the output samples we get - int outputPos = outputIndex + stretch.outputLatency(); - // Time-map: where do we want the input position to be at that moment? - int inputPos = std::round(outputPos/time); - // And therefore which input samples do we need to be supplying? - int inputIndex = inputPos + stretch.inputLatency(); - - // In this particular case, our `inputPos` will be at the end of the file - // and `inputIndex` will be beyond the end, so we pad with 0s to have enough input - inWav.resize(inputIndex); + // This zero-pads the input, since we'll go past the end of it + inWav.resize(inputMainBlockLength + seekLength); - // OK, go for it + // Main block of processing inWav.offset = seekLength; - stretch.process(inWav, inputIndex - seekLength, outWav, outputIndex); + if (processChunkSize <= 0) { + stretch.process(inWav, inputMainBlockLength, outWav, outputMainBlockLength); + } else { + // Plot computation time for each chunk + signalsmith::plot::Plot2D timePlot(500, 200); + timePlot.x.major(0); + timePlot.y.major(0); + timePlot.y.minor(0.01*processChunkSize/inWav.sampleRate, "1%"); + timePlot.y.minor(0.02*processChunkSize/inWav.sampleRate, "2%"); + auto &timeLine = timePlot.line(); + auto &timeLineSeek = timePlot.line().fillToY(0); + timeLine.add(outWav.offset, 0); // output seek + timeLineSeek.add(0, 0); + timeLineSeek.add(0, seekTime); + timeLineSeek.add(inWav.offset, seekTime); + timeLineSeek.add(inWav.offset, 0); + + float residue = 0.f; + while (outWav.offset < size_t(outputMainBlockLength)) { + int outputSamples = std::min(processChunkSize, outputMainBlockLength - outWav.offset); + float inputPrecise = outputSamples/time + residue; + int inputSamples = std::round(inputPrecise); + residue = inputPrecise - inputSamples; + + stopwatch.startLap(); + stretch.process(inWav, inputSamples, outWav, outputSamples); + double time = stopwatch.seconds(stopwatch.lap()); + timeLine.add(outWav.offset, time); + timeLine.add(outWav.offset + outputSamples, time); + + inWav.offset += inputSamples; + outWav.offset += outputSamples; + } + + timeLine.add(outWav.offset, 0); + timePlot.write(outputWav + ".svg"); + } // And as promised, get the last bits using `.flush()`, which does some extra stuff to avoid introducing clicks. - outWav.offset = outputIndex; - stretch.flush(outWav, outputLength - outputIndex); + outWav.offset = outputMainBlockLength; + stretch.flush(outWav, outputLength - outputMainBlockLength); outWav.offset = 0; if (!outWav.write(outputWav).warn()) args.errorExit("failed to write WAV"); diff --git a/signalsmith-stretch.h b/signalsmith-stretch.h index e05406c..8bdbe89 100644 --- a/signalsmith-stretch.h +++ b/signalsmith-stretch.h @@ -40,18 +40,24 @@ struct SignalsmithStretch { // The difference between the internal position (centre of a block) and the input samples you're supplying int inputLatency() const { - return int(stft.analysisLatency()); + return configuredInputLatency; } int outputLatency() const { - return int(stft.synthesisLatency() + _splitComputation*stft.defaultInterval()); + return configuredOutputLatency; } void reset() { stft.reset(0.1); stashedInput = stft.input; stashedOutput = stft.output; + + if (restoreConfig.pending) { + stft.setInterval(restoreConfig.interval, stft.kaiser, restoreConfig.asymmetry); + restoreConfig = {}; + } prevInputOffset = -1; + assumePreviousBlockZero = true; channelBands.assign(channelBands.size(), Band()); silenceCounter = 0; didSeek = false; @@ -68,14 +74,22 @@ struct SignalsmithStretch { } // Manual setup - void configure(int nChannels, int blockSamples, int intervalSamples, bool splitComputation=false) { + void configure(int nChannels, int blockSamples, int intervalSamples, bool splitComputation=false, Sample asymmetry=0) { _splitComputation = splitComputation; channels = nChannels; + asymmetry *= 1 - 2.0*intervalSamples/blockSamples; // maximum asymmetry gives latency of two intervals stft.configure(channels, channels, blockSamples, intervalSamples + 1); - stft.setInterval(intervalSamples, stft.kaiser); + + restoreConfig = {}; + restoreConfig.interval = intervalSamples; + restoreConfig.asymmetry = asymmetry; + + stft.setInterval(intervalSamples, stft.kaiser, asymmetry); stft.reset(0.1); stashedInput = stft.input; stashedOutput = stft.output; + configuredInputLatency = int(stft.analysisLatency()); + configuredOutputLatency = int(stft.synthesisLatency() + _splitComputation*stft.defaultInterval()); bands = int(stft.bands()); channelBands.assign(bands*channels, Band()); @@ -170,18 +184,48 @@ struct SignalsmithStretch { // Moves the input position *and* pre-calculates some output, so that the next samples returned from `.process()` are aligned to the beginning of the sample. // The time-stretch rate is inferred from `inputLength`, so use `.outputSeekLength()` to get a correct value for that. template - void outputSeek(Inputs &&inputs, int inputLength) { + void outputSeek(Inputs &&inputs, int inputLength, Sample firstBlockAsymmetry=0.5) { + restoreConfig.pending = true; + restoreConfig.interval = stft.defaultInterval(); + + Sample playbackRate = std::max(inputLength - configuredInputLatency, 0)/Sample(configuredOutputLatency); + + // Place the next (restored-window) block some time in the future + Sample nextBlockOutputStart = stft.defaultInterval()*firstBlockAsymmetry; + Sample nextBlockOutputPos = nextBlockOutputStart + stft.synthesisOffset(); + // The initial block starts some time before time 0 + Sample firstBlockOutputStart = nextBlockOutputStart - stft.defaultInterval(); + // Set the initial block's window so it's centred on 0 + // Use that as the input latency + size_t windowOffset = -firstBlockOutputStart; + size_t windowEnd = int(nextBlockOutputPos); // first block ends at centre of next block + stft.analysisOffset(windowOffset); + stft.synthesisOffset(windowOffset); + + // Sine window, warped as two linear segments + for (size_t i = 0; i < stft.blockSamples(); ++i) { + Sample r = i + Sample(0.5); + if (i < windowOffset) { + r = r/windowOffset/2; + } else if (r < windowEnd) { + r = (1 + (r - windowOffset)/(windowEnd - windowOffset))/2; + } else { + r = 1; + } + stft.analysisWindow()[i] = stft.synthesisWindow()[i] = (1 - std::cos(r*Sample(2*M_PI)))/2; + } + // TODO: add fade-out parameter to avoid clicks, instead of doing a full reset - reset(); - // Assume we've been handed enough surplus input to produce `outputLatency()` samples of pre-roll - int surplusInput = std::max(inputLength - inputLatency(), 0); - Sample playbackRate = surplusInput/Sample(outputLatency()); + stft.reset(0.01); + clearPreviousBlock(); + auto seekSamples = int(stft.analysisLatency()); // Move the input position to the start of the sound - int seekSamples = inputLength - surplusInput; seek(inputs, seekSamples, playbackRate); - - tmpPreRollBuffer.resize(outputLatency()*channels); + + // Enough output to reach the start of the sound + auto preRollLength = int(outputLatency()); + tmpPreRollBuffer.resize(preRollLength*channels); struct BufferOutput { Sample *samples; int length; @@ -189,21 +233,24 @@ struct SignalsmithStretch { Sample * operator[](int c) { return samples + c*length; } - } preRollOutput{tmpPreRollBuffer.data(), outputLatency()}; - + } preRollOutput{tmpPreRollBuffer.data(), preRollLength}; + // Use the surplus input to produce pre-roll output OffsetIO offsetInput{inputs, seekSamples}; - process(offsetInput, surplusInput, preRollOutput, preRollOutput.length); - + int preRollInputSamples = std::max(inputLength - seekSamples, 0); + process(offsetInput, preRollInputSamples, preRollOutput, preRollLength); + // put the thing down, flip it and reverse it for (auto &v : tmpPreRollBuffer) v = -v; for (int c = 0; c < channels; ++c) { - std::reverse(preRollOutput[c], preRollOutput[c] + preRollOutput.length); - stft.addOutput(c, preRollOutput.length, preRollOutput[c]); + std::reverse(preRollOutput[c], preRollOutput[c] + preRollLength); + if (_splitComputation) stashedOutput.swap(stft.output); + stft.addOutput(c, preRollLength, preRollOutput[c]); + if (_splitComputation) stashedOutput.swap(stft.output); } } int outputSeekLength(Sample playbackRate) const { - return inputLatency() + playbackRate*outputLatency(); + return configuredInputLatency + playbackRate*configuredOutputLatency; } template @@ -213,7 +260,6 @@ struct SignalsmithStretch { #endif int prevCopiedInput = 0; auto copyInput = [&](int toIndex){ - int length = std::min(int(stft.blockSamples() + stft.defaultInterval()), toIndex - prevCopiedInput); tmpProcessBuffer.resize(length); int offset = toIndex - length; @@ -301,6 +347,7 @@ struct SignalsmithStretch { if (blockProcess.newSpectrum) { // make sure the previous input is the correct distance in the past (give or take 1 sample) blockProcess.reanalysePrev = didSeek || std::abs(inputInterval - int(stft.defaultInterval())) > 1; + if (assumePreviousBlockZero) blockProcess.reanalysePrev = false; if (blockProcess.reanalysePrev) blockProcess.steps += stft.analyseSteps() + 1; // analyse a new input @@ -314,8 +361,13 @@ struct SignalsmithStretch { updateProcessSpectrumSteps(); blockProcess.steps += processSpectrumSteps; - + blockProcess.steps += stft.synthesiseSteps() + 1; + + if (restoreConfig.pending > 0) { + blockProcess.resetInterval = true; + blockProcess.steps += 1 + channels; // STFT window reset then adjusting prevInput/output + } } size_t processToStep = newBlock ? blockProcess.steps : 0; @@ -398,6 +450,46 @@ struct SignalsmithStretch { stft.synthesiseStep(step); continue; } + step -= stft.synthesiseSteps(); + + if (blockProcess.resetInterval) { + if (step-- == 0) { + int prevOffsetA = stft.analysisOffset(), prevOffsetS = stft.synthesisOffset(); + stft.setInterval(restoreConfig.interval, stft.kaiser, restoreConfig.asymmetry); + restoreConfig.pending = false; + + restoreConfig.diffOffsetA = int(stft.analysisOffset()) - prevOffsetA; + restoreConfig.diffOffsetS = int(stft.synthesisOffset()) - prevOffsetS; + continue; + } else if (step < size_t(channels)) { + int channel = int(step); + auto bins = bandsForChannel(channel); + if (restoreConfig.diffOffsetA) { // adjust prevInput + Complex rot = std::polar(Sample(1), bandToFreq(0)*restoreConfig.diffOffsetA*Sample(2*M_PI)); + Sample freqStep = bandToFreq(1) - bandToFreq(0); + Complex rotStep = std::polar(Sample(1), freqStep*restoreConfig.diffOffsetA*Sample(2*M_PI)); + + for (int b = 0; b < bands; ++b) { + auto &bin = bins[b]; + bin.prevInput = _impl::mul(bin.prevInput, rot); + rot = _impl::mul(rot, rotStep); + } + } + if (restoreConfig.diffOffsetS) { // adjust output + Complex rot = std::polar(Sample(1), bandToFreq(0)*restoreConfig.diffOffsetS*Sample(2*M_PI)); + Sample freqStep = bandToFreq(1) - bandToFreq(0); + Complex rotStep = std::polar(Sample(1), freqStep*restoreConfig.diffOffsetS*Sample(2*M_PI)); + + for (int b = 0; b < bands; ++b) { + auto &bin = bins[b]; + bin.output = _impl::mul(bin.output, rot); + rot = _impl::mul(rot, rotStep); + } + } + continue; + } + step -= channels; + } } #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP(); @@ -454,13 +546,8 @@ struct SignalsmithStretch { } } stft.reset(0.1f); - // Reset the phase-vocoder stuff, so the next block gets a fresh start - for (int c = 0; c < channels; ++c) { - auto channelBands = bandsForChannel(c); - for (int b = 0; b < bands; ++b) { - channelBands[b].prevInput = channelBands[b].output = 0; - } - } + + clearPreviousBlock(); } // Process a complete audio buffer all in one go @@ -481,12 +568,15 @@ struct SignalsmithStretch { outputSeek(inputs, seekLength); - int outputIndex = outputSamples - seekLength/playbackRate; OffsetIO offsetInput{inputs, seekLength}; - process(offsetInput, inputSamples - seekLength, outputs, outputIndex); + int inputMainBlock = inputSamples - seekLength; + int outputMainBlock = inputMainBlock/playbackRate; + // Ordinary process calls, as far as the input goes + process(offsetInput, inputMainBlock, outputs, outputMainBlock); - OffsetIO offsetOutput{outputs, outputIndex}; - flush(offsetOutput, outputSamples - outputIndex, playbackRate); + OffsetIO offsetOutput{outputs, outputMainBlock}; + // We've run out of input - this gets the last chunk of output (cheaply) + flush(offsetOutput, outputSamples - outputMainBlock, playbackRate); return true; } @@ -502,6 +592,9 @@ struct SignalsmithStretch { bool mappedFrequencies = false; bool processFormants = false; Sample timeFactor; + + // If our previous block had an unusual offset/shape, reset and adjust + bool resetInterval = false; } blockProcess; using Complex = std::complex; @@ -520,14 +613,30 @@ struct SignalsmithStretch { STFT stft; typename STFT::Input stashedInput; typename STFT::Output stashedOutput; + int configuredInputLatency = 0, configuredOutputLatency = 0; std::vector tmpProcessBuffer, tmpPreRollBuffer; + + struct { + bool pending = false; + Sample asymmetry = 0; + int interval = 0; + int diffOffsetA = 0, diffOffsetS = 0; + } restoreConfig; int channels = 0, bands = 0; int prevInputOffset = -1; bool didSeek = false; Sample seekTimeFactor = 1; + bool assumePreviousBlockZero = false; + void clearPreviousBlock() { + assumePreviousBlockZero = true; + for (auto &b : channelBands) { + b.output = b.prevInput = 0; + } + } + Sample bandToFreq(Sample b) const { return stft.binToFreq(b); } @@ -544,6 +653,7 @@ struct SignalsmithStretch { Band * bandsForChannel(int channel) { return channelBands.data() + channel*bands; } + template Complex getBand(int channel, int index) { if (index < 0 || index >= bands) return 0; @@ -641,6 +751,7 @@ struct SignalsmithStretch { if (blockProcess.newSpectrum) { if (step < size_t(channels)) { + if (assumePreviousBlockZero) return; // TODO: remove this from the processing schedule int channel = int(step); auto bins = bandsForChannel(channel); @@ -809,6 +920,7 @@ struct SignalsmithStretch { bin.prevInput = bin.input; } } + assumePreviousBlockZero = false; } }