diff --git a/CMakeLists.txt b/CMakeLists.txt
index dfb23d5..c73eada 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ include(FetchContent)
 FetchContent_Declare(
 	signalsmith-linear
 	GIT_REPOSITORY https://github.com/Signalsmith-Audio/linear.git
-	GIT_TAG 0.2.3
+	GIT_TAG 0.3.0
 	GIT_SHALLOW ON
 )
 FetchContent_MakeAvailable(signalsmith-linear)
diff --git a/cmd/Makefile b/cmd/Makefile
index 6887a65..9f0803f 100644
--- a/cmd/Makefile
+++ b/cmd/Makefile
@@ -1,5 +1,15 @@
 all: out/stretch
 
+DEV_FLAGS := --semitones=4 --time=0.667 --asymmetry=0.5
+
+dev: out/stretch
+	./out/stretch inputs/dev.wav out/dev-2048.wav --process-chunk=2048 $(DEV_FLAGS)
+	./out/stretch inputs/dev.wav out/dev-512.wav --process-chunk=512 $(DEV_FLAGS)
+	./out/stretch inputs/dev.wav out/dev-100.wav --process-chunk=100 $(DEV_FLAGS)
+	./out/stretch inputs/dev.wav out/dev-2048-sc.wav --process-chunk=2048 --split-computation $(DEV_FLAGS)
+	./out/stretch inputs/dev.wav out/dev-512-sc.wav --process-chunk=512 --split-computation $(DEV_FLAGS)
+	./out/stretch inputs/dev.wav out/dev-100-sc.wav --process-chunk=100 --split-computation $(DEV_FLAGS)
+
 out/stretch: main.cpp ../signalsmith-stretch.h util/*.h util/*.hxx
 	mkdir -p out
 	g++ -std=c++11 -O3 -g \
@@ -27,13 +37,6 @@ examples: out/stretch
 
 TEST_WAV ?= "inputs/voice.wav"
 
-dev: out/stretch
-	out/stretch --time=0.8 --semitones=10 $(TEST_WAV) out/shift.wav
-	out/stretch --time=0.8 --semitones=10 --formant-comp $(TEST_WAV) out/shift-fc.wav
-	out/stretch --time=0.8 --semitones=10 --formant-comp --formant=3 $(TEST_WAV) out/shift-fc-f3.wav
-	out/stretch --time=0.8 --semitones=10 --formant-comp --formant=3 --formant-base=500 $(TEST_WAV) out/shift-fc-f3-fb500.wav
-	out/stretch --time=0.8 --semitones=10 --formant-comp --formant=2 --formant-base=100 $(TEST_WAV) out/shift-fc-f2-fb100.wav
-
 clean:
 	rm -rf out
 
diff --git a/cmd/main.cpp b/cmd/main.cpp
index 0ff29ea..a875212 100644
--- a/cmd/main.cpp
+++ b/cmd/main.cpp
@@ -7,6 +7,9 @@ using SignalsmithStretch = signalsmith::stretch::SignalsmithStretch<float>;
 
 #include "./util/simple-args.h"
 #include "./util/wav.h"
+#include "./util/stopwatch.h"
+
+#include "plot/plot.h"
 
 int main(int argc, char* argv[]) {
 	SimpleArgs args(argc, argv);
@@ -19,13 +22,15 @@ int main(int argc, char* argv[]) {
 
 	std::string inputWav = args.arg<std::string>("input.wav", "16-bit WAV file");
 	std::string outputWav = args.arg<std::string>("output.wav", "output WAV file");
+	double time = args.flag<double>("time", "time-stretch factor", 1);
 	double semitones = args.flag<double>("semitones", "pitch-shift amount", 0);
 	double formants = args.flag<double>("formant", "formant-shift amount (semitones)", 0);
 	bool formantComp = args.hasFlag("formant-comp", "formant compensation");
 	double formantBase = args.flag<double>("formant-base", "formant base frequency (Hz, 0=auto)", 100);
 	double tonality = args.flag<double>("tonality", "tonality limit (Hz)", 8000);
-	double time = args.flag<double>("time", "time-stretch factor", 1);
+	double asymmetry = args.flag<double>("asymmetry", "asymmetrical STFT analysis (0-1)", 0);
 	bool splitComputation = args.hasFlag("split-computation", "distributes the computation more evenly (but higher latency)");
+	int processChunkSize = args.flag<int>("process-chunk", "process chunk size in samples", -1);
 	args.errorExit(); // exits on error, or with `--help`
 
 	std::cout << inputWav << " -> " << outputWav << "\n";
@@ -42,7 +47,7 @@ int main(int argc, char* argv[]) {
 	outWav.resize(outputLength);
 
 	SignalsmithStretch stretch;
-	stretch.presetDefault(int(inWav.channels), inWav.sampleRate, splitComputation);
+	stretch.configure(int(inWav.channels), inWav.sampleRate*0.12, inWav.sampleRate*0.03, splitComputation, asymmetry);
 	stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate);
 	stretch.setFormantSemitones(formants, formantComp);
 	stretch.setFormantBase(formantBase/inWav.sampleRate);
@@ -56,30 +61,62 @@ int main(int argc, char* argv[]) {
 	// First, an "output seek", where we provide a chunk of input.
 	// This is suitable for starting playback of a sample at a given playback rate.
 	auto seekLength = stretch.outputSeekLength(1/time);
+	signalsmith::Stopwatch stopwatch;
 	stretch.outputSeek(inWav, seekLength);
+	double seekTime = stopwatch.seconds(stopwatch.lap());
 	// At this point, the next output samples we get will correspond to the beginning of the audio file.
 
 	// We're going to process until *just* before the end of the audio file (so we can get a tidier end using `.flush()`.
-	int outputIndex = outputLength - stretch.intervalSamples();
+	int outputMainBlockLength = outputLength - stretch.intervalSamples();
+	// And this is how much input we'll need for that
+	int inputMainBlockLength = outputMainBlockLength/time;
 
-	// Stretch's internal output position is slightly ahead of the output samples we get
-	int outputPos = outputIndex + stretch.outputLatency();
-	// Time-map: where do we want the input position to be at that moment?
-	int inputPos = std::round(outputPos/time);
-	// And therefore which input samples do we need to be supplying?
-	int inputIndex = inputPos + stretch.inputLatency();
-	
-	// In this particular case, our `inputPos` will be at the end of the file
-	// and `inputIndex` will be beyond the end, so we pad with 0s to have enough input
-	inWav.resize(inputIndex);
+	// This zero-pads the input, since we'll go past the end of it
+	inWav.resize(inputMainBlockLength + seekLength);
 
-	// OK, go for it
+	// Main block of processing
 	inWav.offset = seekLength;
-	stretch.process(inWav, inputIndex - seekLength, outWav, outputIndex);
+	if (processChunkSize <= 0) {
+		stretch.process(inWav, inputMainBlockLength, outWav, outputMainBlockLength);
+	} else {
+		// Plot computation time for each chunk
+		signalsmith::plot::Plot2D timePlot(500, 200);
+		timePlot.x.major(0);
+		timePlot.y.major(0);
+		timePlot.y.minor(0.01*processChunkSize/inWav.sampleRate, "1%");
+		timePlot.y.minor(0.02*processChunkSize/inWav.sampleRate, "2%");
+		auto &timeLine = timePlot.line();
+		auto &timeLineSeek = timePlot.line().fillToY(0);
+		timeLine.add(outWav.offset, 0); // output seek
+		timeLineSeek.add(0, 0);
+		timeLineSeek.add(0, seekTime);
+		timeLineSeek.add(inWav.offset, seekTime);
+		timeLineSeek.add(inWav.offset, 0);
+	
+		float residue = 0.f; // fractional input sample carried over between chunks
+		while (outWav.offset < size_t(outputMainBlockLength)) {
+			int outputSamples = std::min<int>(processChunkSize, outputMainBlockLength - outWav.offset);
+			float inputPrecise = outputSamples/time + residue;
+			int inputSamples = std::round(inputPrecise);
+			residue = inputPrecise - inputSamples;
+			inputSamples = std::min<int>(inputSamples, inputMainBlockLength + seekLength - int(inWav.offset)); // rounding may overshoot the zero-padded input
+			stopwatch.startLap();
+			stretch.process(inWav, inputSamples, outWav, outputSamples);
+			double chunkSeconds = stopwatch.seconds(stopwatch.lap()); // named so it doesn't shadow the `time` stretch factor
+			timeLine.add(outWav.offset, chunkSeconds);
+			timeLine.add(outWav.offset + outputSamples, chunkSeconds);
+			
+			inWav.offset += inputSamples;
+			outWav.offset += outputSamples;
+		}
+		
+		timeLine.add(outWav.offset, 0);
+		timePlot.write(outputWav + ".svg");
+	}
 	
 	// And as promised, get the last bits using `.flush()`, which does some extra stuff to avoid introducing clicks.
-	outWav.offset = outputIndex;
-	stretch.flush(outWav, outputLength - outputIndex);
+	outWav.offset = outputMainBlockLength;
+	stretch.flush(outWav, outputLength - outputMainBlockLength);
 	outWav.offset = 0;
 
 	if (!outWav.write(outputWav).warn()) args.errorExit("failed to write WAV");
diff --git a/signalsmith-stretch.h b/signalsmith-stretch.h
index e05406c..8bdbe89 100644
--- a/signalsmith-stretch.h
+++ b/signalsmith-stretch.h
@@ -40,18 +40,24 @@ struct SignalsmithStretch {
 		
 	// The difference between the internal position (centre of a block) and the input samples you're supplying
 	int inputLatency() const {
-		return int(stft.analysisLatency());
+		return configuredInputLatency; // cached by configure(), so it stays fixed while outputSeek() temporarily changes the STFT offsets
 	}
 	int outputLatency() const {
-		return int(stft.synthesisLatency() + _splitComputation*stft.defaultInterval());
+		return configuredOutputLatency; // cached by configure(): synthesis latency, plus one interval when split computation is on
 	}
 	
 	void reset() {
 		stft.reset(0.1);
 		stashedInput = stft.input;
 		stashedOutput = stft.output;
+
+		if (restoreConfig.pending) { // an outputSeek() left its temporary window in place
+			stft.setInterval(restoreConfig.interval, stft.kaiser, restoreConfig.asymmetry);
+			restoreConfig.pending = false; // keep interval/asymmetry: a later outputSeek() restores from them
+		}
 		
 		prevInputOffset = -1;
+		assumePreviousBlockZero = true;
 		channelBands.assign(channelBands.size(), Band());
 		silenceCounter = 0;
 		didSeek = false;
@@ -68,14 +74,22 @@ struct SignalsmithStretch {
 	}
 
 	// Manual setup
-	void configure(int nChannels, int blockSamples, int intervalSamples, bool splitComputation=false) {
+	void configure(int nChannels, int blockSamples, int intervalSamples, bool splitComputation=false, Sample asymmetry=0) {
 		_splitComputation = splitComputation;
 		channels = nChannels;
+		asymmetry *= 1 - 2.0*intervalSamples/blockSamples; // maximum asymmetry gives latency of two intervals
 		stft.configure(channels, channels, blockSamples, intervalSamples + 1);
-		stft.setInterval(intervalSamples, stft.kaiser);
+
+		restoreConfig = {};
+		restoreConfig.interval = intervalSamples;
+		restoreConfig.asymmetry = asymmetry;
+
+		stft.setInterval(intervalSamples, stft.kaiser, asymmetry);
 		stft.reset(0.1);
 		stashedInput = stft.input;
 		stashedOutput = stft.output;
+		configuredInputLatency = int(stft.analysisLatency());
+		configuredOutputLatency = int(stft.synthesisLatency() + _splitComputation*stft.defaultInterval());
 
 		bands = int(stft.bands());
 		channelBands.assign(bands*channels, Band());
@@ -170,18 +184,48 @@ struct SignalsmithStretch {
 	// Moves the input position *and* pre-calculates some output, so that the next samples returned from `.process()` are aligned to the beginning of the sample.
 	// The time-stretch rate is inferred from `inputLength`, so use `.outputSeekLength()` to get a correct value for that.
 	template<class Inputs>
-	void outputSeek(Inputs &&inputs, int inputLength) {
+	void outputSeek(Inputs &&inputs, int inputLength, Sample firstBlockAsymmetry=0.5) {
+		restoreConfig.pending = true;
+		restoreConfig.interval = stft.defaultInterval();
+
+		Sample playbackRate = std::max<int>(inputLength - configuredInputLatency, 0)/Sample(configuredOutputLatency);
+		
+		// Place the next (restored-window) block some time in the future
+		Sample nextBlockOutputStart = stft.defaultInterval()*firstBlockAsymmetry;
+		Sample nextBlockOutputPos = nextBlockOutputStart + stft.synthesisOffset();
+		// The initial block starts some time before time 0
+		Sample firstBlockOutputStart = nextBlockOutputStart - stft.defaultInterval();
+		// Set the initial block's window so it's centred on 0
+		// Use that as the input latency
+		size_t windowOffset = -firstBlockOutputStart;
+		size_t windowEnd = int(nextBlockOutputPos); // first block ends at centre of next block
+		stft.analysisOffset(windowOffset);
+		stft.synthesisOffset(windowOffset);
+
+		// Raised-cosine (Hann-shaped) window, with its phase `r` warped as two linear segments
+		for (size_t i = 0; i < stft.blockSamples(); ++i) {
+			Sample r = i + Sample(0.5);
+			if (i < windowOffset) {
+				r = r/windowOffset/2; // rising half: r runs 0 -> 0.5 across [0, windowOffset)
+			} else if (r < windowEnd) {
+				r = (1 + (r - windowOffset)/(windowEnd - windowOffset))/2; // falling half: r runs 0.5 -> 1 across [windowOffset, windowEnd)
+			} else {
+				r = 1; // beyond the window: the expression below evaluates to (approximately) zero
+			}
+			stft.analysisWindow()[i] = stft.synthesisWindow()[i] = (1 - std::cos(r*Sample(2*M_PI)))/2;
+		}
+
 		// TODO: add fade-out parameter to avoid clicks, instead of doing a full reset
-		reset();
-		// Assume we've been handed enough surplus input to produce `outputLatency()` samples of pre-roll
-		int surplusInput = std::max<int>(inputLength - inputLatency(), 0);
-		Sample playbackRate = surplusInput/Sample(outputLatency());
+		stft.reset(0.01);
+		clearPreviousBlock();
 
+		auto seekSamples = int(stft.analysisLatency());
 		// Move the input position to the start of the sound
-		int seekSamples = inputLength - surplusInput;
 		seek(inputs, seekSamples, playbackRate);
-		
-		tmpPreRollBuffer.resize(outputLatency()*channels);
+
+		// Enough output to reach the start of the sound
+		auto preRollLength = int(outputLatency());
+		tmpPreRollBuffer.resize(preRollLength*channels);
 		struct BufferOutput {
 			Sample *samples;
 			int length;
@@ -189,21 +233,24 @@ struct SignalsmithStretch {
 			Sample * operator[](int c) {
 				return samples + c*length;
 			}
-		} preRollOutput{tmpPreRollBuffer.data(), outputLatency()};
-		
+		} preRollOutput{tmpPreRollBuffer.data(), preRollLength};
+
 		// Use the surplus input to produce pre-roll output
 		OffsetIO<Inputs> offsetInput{inputs, seekSamples};
-		process(offsetInput, surplusInput, preRollOutput, preRollOutput.length);
-		
+		int preRollInputSamples = std::max<int>(inputLength - seekSamples, 0);
+		process(offsetInput, preRollInputSamples, preRollOutput, preRollLength);
+
 		// put the thing down, flip it and reverse it
 		for (auto &v : tmpPreRollBuffer) v = -v;
 		for (int c = 0; c < channels; ++c) {
-			std::reverse(preRollOutput[c], preRollOutput[c] + preRollOutput.length);
-			stft.addOutput(c, preRollOutput.length, preRollOutput[c]);
+			std::reverse(preRollOutput[c], preRollOutput[c] + preRollLength);
+			if (_splitComputation) stashedOutput.swap(stft.output);
+			stft.addOutput(c, preRollLength, preRollOutput[c]);
+			if (_splitComputation) stashedOutput.swap(stft.output);
 		}
 	}
 	int outputSeekLength(Sample playbackRate) const {
-		return inputLatency() + playbackRate*outputLatency();
+		return configuredInputLatency + playbackRate*configuredOutputLatency; // input length to hand to outputSeek(); converted to int on return
 	}
 
 	template<class Inputs, class Outputs>
@@ -213,7 +260,6 @@ struct SignalsmithStretch {
 #endif
 		int prevCopiedInput = 0;
 		auto copyInput = [&](int toIndex){
-
 			int length = std::min<int>(int(stft.blockSamples() + stft.defaultInterval()), toIndex - prevCopiedInput);
 			tmpProcessBuffer.resize(length);
 			int offset = toIndex - length;
@@ -301,6 +347,7 @@ struct SignalsmithStretch {
 				if (blockProcess.newSpectrum) {
 					// make sure the previous input is the correct distance in the past (give or take 1 sample)
 					blockProcess.reanalysePrev = didSeek || std::abs(inputInterval - int(stft.defaultInterval())) > 1;
+					if (assumePreviousBlockZero) blockProcess.reanalysePrev = false;
 					if (blockProcess.reanalysePrev) blockProcess.steps += stft.analyseSteps() + 1;
 
 					// analyse a new input
@@ -314,8 +361,13 @@ struct SignalsmithStretch {
 
 				updateProcessSpectrumSteps();
 				blockProcess.steps += processSpectrumSteps;
-
+				
 				blockProcess.steps += stft.synthesiseSteps() + 1;
+
+				if (restoreConfig.pending) { // an outputSeek() window is waiting to be replaced
+					blockProcess.resetInterval = true;
+					blockProcess.steps += 1 + channels; // STFT window reset then adjusting prevInput/output
+				}
 			}
 			
 			size_t processToStep = newBlock ? blockProcess.steps : 0;
@@ -398,6 +450,46 @@ struct SignalsmithStretch {
 					stft.synthesiseStep(step);
 					continue;
 				}
+				step -= stft.synthesiseSteps();
+				
+				if (blockProcess.resetInterval) { // extra steps scheduled while restoreConfig.pending was set
+					if (step-- == 0) { // first extra step: restore the configured interval/window
+						int prevOffsetA = stft.analysisOffset(), prevOffsetS = stft.synthesisOffset();
+						stft.setInterval(restoreConfig.interval, stft.kaiser, restoreConfig.asymmetry);
+						restoreConfig.pending = false;
+						// Record how far each offset moved, for the per-channel steps that follow
+						restoreConfig.diffOffsetA = int(stft.analysisOffset()) - prevOffsetA;
+						restoreConfig.diffOffsetS = int(stft.synthesisOffset()) - prevOffsetS;
+						continue;
+					} else if (step < size_t(channels)) { // then one step per channel
+						int channel = int(step);
+						auto bins = bandsForChannel(channel);
+						if (restoreConfig.diffOffsetA) { // adjust prevInput
+							Complex rot = std::polar(Sample(1), bandToFreq(0)*restoreConfig.diffOffsetA*Sample(2*M_PI)); // unit-magnitude rotation for band 0
+							Sample freqStep = bandToFreq(1) - bandToFreq(0);
+							Complex rotStep = std::polar(Sample(1), freqStep*restoreConfig.diffOffsetA*Sample(2*M_PI)); // additional rotation per band
+							// Phase angle grows linearly with band frequency (assumes _impl::mul is a complex multiply - TODO confirm)
+							for (int b = 0; b < bands; ++b) {
+								auto &bin = bins[b];
+								bin.prevInput = _impl::mul(bin.prevInput, rot);
+								rot = _impl::mul(rot, rotStep);
+							}
+						}
+						if (restoreConfig.diffOffsetS) { // adjust output
+							Complex rot = std::polar(Sample(1), bandToFreq(0)*restoreConfig.diffOffsetS*Sample(2*M_PI)); // unit-magnitude rotation for band 0
+							Sample freqStep = bandToFreq(1) - bandToFreq(0);
+							Complex rotStep = std::polar(Sample(1), freqStep*restoreConfig.diffOffsetS*Sample(2*M_PI)); // additional rotation per band
+							// Same per-band rotation as above, applied to the stored output using the synthesis offset change
+							for (int b = 0; b < bands; ++b) {
+								auto &bin = bins[b];
+								bin.output = _impl::mul(bin.output, rot);
+								rot = _impl::mul(rot, rotStep);
+							}
+						}
+						continue;
+					}
+					step -= channels; // move past the per-channel steps
+				}
 			}
 #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP
 			SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP();
@@ -454,13 +546,8 @@ struct SignalsmithStretch {
 			}
 		}
 		stft.reset(0.1f);
-		// Reset the phase-vocoder stuff, so the next block gets a fresh start
-		for (int c = 0; c < channels; ++c) {
-			auto channelBands = bandsForChannel(c);
-			for (int b = 0; b < bands; ++b) {
-				channelBands[b].prevInput = channelBands[b].output = 0;
-			}
-		}
+
+		clearPreviousBlock();
 	}
 
 	// Process a complete audio buffer all in one go
@@ -481,12 +568,15 @@ struct SignalsmithStretch {
 
 		outputSeek(inputs, seekLength);
 
-		int outputIndex = outputSamples - seekLength/playbackRate;
 		OffsetIO<Inputs> offsetInput{inputs, seekLength};
-		process(offsetInput, inputSamples - seekLength, outputs, outputIndex);
+		int inputMainBlock = inputSamples - seekLength;
+		int outputMainBlock = inputMainBlock/playbackRate;
+		// Ordinary process calls, as far as the input goes
+		process(offsetInput, inputMainBlock, outputs, outputMainBlock);
 		
-		OffsetIO<Outputs> offsetOutput{outputs, outputIndex};
-		flush(offsetOutput, outputSamples - outputIndex, playbackRate);
+		OffsetIO<Outputs> offsetOutput{outputs, outputMainBlock};
+		// We've run out of input - this gets the last chunk of output (cheaply)
+		flush(offsetOutput, outputSamples - outputMainBlock, playbackRate);
 		return true;
 	}
 
@@ -502,6 +592,9 @@ struct SignalsmithStretch {
 		bool mappedFrequencies = false;
 		bool processFormants = false;
 		Sample timeFactor;
+		
+		// If our previous block had an unusual offset/shape, reset and adjust
+		bool resetInterval = false;
 	} blockProcess;
 
 	using Complex = std::complex<Sample>;
@@ -520,14 +613,30 @@ struct SignalsmithStretch {
 	STFT stft;
 	typename STFT::Input stashedInput;
 	typename STFT::Output stashedOutput;
+	int configuredInputLatency = 0, configuredOutputLatency = 0;
 	
 	std::vector<Sample> tmpProcessBuffer, tmpPreRollBuffer;
+	
+	struct { // what's needed to put the STFT back after outputSeek() overrides its window/offsets
+		bool pending = false; // set by outputSeek(); cleared once the configured window has been restored
+		Sample asymmetry = 0; // asymmetry passed to stft.setInterval() on restore
+		int interval = 0; // interval passed to stft.setInterval() on restore
+		int diffOffsetA = 0, diffOffsetS = 0; // change in analysis/synthesis offset measured across the restore
+	} restoreConfig;
 
 	int channels = 0, bands = 0;
 	int prevInputOffset = -1;
 	bool didSeek = false;
 	Sample seekTimeFactor = 1;
 
+	bool assumePreviousBlockZero = false; // set when the stored previous-block spectra have been zeroed
+	void clearPreviousBlock() { // wipes each band's stored prevInput/output and raises the flag above
+		for (Band &band : channelBands) {
+			band.prevInput = band.output = 0;
+		}
+		assumePreviousBlockZero = true;
+	}
+
 	Sample bandToFreq(Sample b) const {
 		return stft.binToFreq(b);
 	}
@@ -544,6 +653,7 @@ struct SignalsmithStretch {
 	Band * bandsForChannel(int channel) {
 		return channelBands.data() + channel*bands;
 	}
+
 	template<Complex Band::*member>
 	Complex getBand(int channel, int index) {
 		if (index < 0 || index >= bands) return 0;
@@ -641,6 +751,7 @@ struct SignalsmithStretch {
 
 		if (blockProcess.newSpectrum) {
 			if (step < size_t(channels)) {
+				if (assumePreviousBlockZero) return; // TODO: remove this from the processing schedule
 				int channel = int(step);
 				auto bins = bandsForChannel(channel);
 
@@ -809,6 +920,7 @@ struct SignalsmithStretch {
 					bin.prevInput = bin.input;
 				}
 			}
+			assumePreviousBlockZero = false;
 		}
 	}