@@ -134,6 +134,40 @@ class Pipeline {
134134 cudaStream_t stream = 0
135135 );
136136
137+ /* *
138+ * Compress one device-resident input into a caller-allocated device buffer.
139+ *
     * d_output is supplied by the caller; getMaxCompressedOutputSize() is
     * documented as an upper bound suitable for allocating it before this call.
     * NOTE(review): behavior when output_capacity is smaller than the compressed
     * size is not visible from this declaration — confirm against the implementation.
     *
140+ * @param d_input Device pointer to raw input.
141+ * @param input_size Input size in bytes.
142+ * @param d_output Caller-allocated device buffer for compressed bytes.
143+ * @param output_capacity Capacity of d_output in bytes.
144+ * @param output_size Receives the compressed size in bytes.
     *                        NOTE(review): presumably a host pointer — TODO confirm.
145+ * @param stream CUDA stream for all GPU operations (defaults to stream 0).
146+ */
147+ void compressInto (
148+ const void * d_input,
149+ size_t input_size,
150+ void * d_output,
151+ size_t output_capacity,
152+ size_t * output_size,
153+ cudaStream_t stream = 0
154+ );
155+
156+ /* *
157+ * Compress multi-source input into a caller-allocated device buffer.
158+ *
159+ * Output format matches compress(const std::vector<InputSpec>&):
160+ * single output buffer for single-source pipelines; concat format for
161+ * multi-source pipelines.
     *
     * @param inputs          Input specifications. NOTE(review): presumably one
     *                        InputSpec per source stage in any order, as documented
     *                        for compress(const std::vector<InputSpec>&) — confirm.
     * @param d_output        Caller-allocated device buffer for compressed bytes.
     * @param output_capacity Capacity of d_output in bytes; getMaxCompressedOutputSize()
     *                        is documented as an upper bound for sizing it.
     * @param output_size     Receives the compressed size in bytes.
     *                        NOTE(review): presumably a host pointer — TODO confirm.
     * @param stream          CUDA stream argument (defaults to stream 0).
162+ */
163+ void compressInto (
164+ const std::vector<InputSpec>& inputs,
165+ void * d_output,
166+ size_t output_capacity,
167+ size_t * output_size,
168+ cudaStream_t stream = 0
169+ );
170+
137171 /* *
138172 * Compress (multi-source). One InputSpec per source stage; order does not matter.
139173 * *d_output is pool-owned — do NOT cudaFree.
@@ -177,6 +211,45 @@ class Pipeline {
177211 cudaStream_t stream = 0
178212 );
179213
214+ /* *
215+ * Decompress into a caller-allocated device buffer (single-source only).
216+ *
     * d_output can be sized from getMaxDecompressedOutputSize() (documented as an
     * upper bound) or, for FZM files, getDecompressedOutputSizeFromFile()
     * (documented as exact).
     * NOTE(review): behavior when output_capacity is too small, or when called on
     * a multi-source pipeline, is not visible from this declaration — confirm.
     *
217+ * @param d_input nullptr to read from live forward DAG buffers, or
218+ * a device pointer to external compressed bytes.
219+ * @param input_size Byte size of d_input (ignored when d_input is nullptr).
220+ * @param d_output Caller-allocated device buffer for decompressed bytes.
221+ * @param output_capacity Capacity of d_output in bytes.
222+ * @param output_size Receives the exact decompressed size in bytes.
     *                        NOTE(review): presumably a host pointer — TODO confirm.
223+ * @param stream CUDA stream for all GPU operations (defaults to stream 0).
224+ */
225+ void decompressInto (
226+ const void * d_input,
227+ size_t input_size,
228+ void * d_output,
229+ size_t output_capacity,
230+ size_t * output_size,
231+ cudaStream_t stream = 0
232+ );
233+
234+ /* *
235+ * Decompress multi-source data into caller-allocated output buffers.
236+ *
     * d_outputs and output_capacities are parallel vectors: entry i of
     * output_capacities is the capacity of entry i of d_outputs.
     * getMaxDecompressedOutputSizes() is documented as returning per-source upper
     * bounds in the same source order as this function.
     * NOTE(review): presumably both vectors must have one entry per source;
     * behavior on a length mismatch or an undersized buffer is not visible here — confirm.
     *
237+ * @param d_input nullptr to read from live forward DAG buffers,
238+ * or a device pointer to external compressed bytes.
239+ * @param input_size Byte size of d_input (ignored when d_input is nullptr).
240+ * @param d_outputs One device pointer per source output.
241+ * @param output_capacities Capacity (bytes) for each output pointer.
242+ * @param stream CUDA stream for all GPU operations (defaults to stream 0).
243+ * @return Exact decompressed size (bytes) for each source output.
244+ */
245+ std::vector<size_t > decompressMultiInto (
246+ const void * d_input,
247+ size_t input_size,
248+ const std::vector<void *>& d_outputs,
249+ const std::vector<size_t >& output_capacities,
250+ cudaStream_t stream = 0
251+ );
252+
180253 /* *
181254 * Decompress (multi-source). Returns one {device_ptr, size} pair per source,
182255 * in the same order as forward source discovery. Ownership follows
@@ -188,6 +261,32 @@ class Pipeline {
188261 cudaStream_t stream = 0
189262 );
190263
264+ /* *
265+ * Maximum compressed output size for the current finalized pipeline.
266+ *
267+ * Returned value is an upper bound suitable for caller allocation before
268+ * compressInto(...). For multi-source pipelines this corresponds to the
269+ * single concat output format returned by compress().
     *
     * NOTE(review): behavior when the pipeline has not been finalized is not
     * visible from this declaration — confirm.
     *
     * @return Upper bound on the compressed output size.
     *         NOTE(review): presumably in bytes, like output_capacity — confirm.
270+ */
271+ size_t getMaxCompressedOutputSize () const ;
272+
273+ /* *
274+ * Maximum decompressed output size (single-source pipelines only).
275+ *
276+ * Value is derived from the most recent compress() input size when available,
277+ * otherwise from finalize-time size hints. Returns an upper bound suitable
278+ * for caller allocation before decompressInto().
     *
     * For multi-source pipelines see getMaxDecompressedOutputSizes().
     * NOTE(review): what this returns or does on a multi-source pipeline is not
     * visible from this declaration — confirm.
     *
     * @return Upper bound on the decompressed output size.
     *         NOTE(review): presumably in bytes — confirm.
279+ */
280+ size_t getMaxDecompressedOutputSize () const ;
281+
282+ /* *
283+ * Maximum decompressed output size per source (multi-source aware).
284+ *
285+ * Order matches decompressMulti()/decompressMultiInto() source order.
286+ * Values are upper bounds suitable for caller allocation.
     *
     * @return One upper-bound size per source, in the order described above.
     *         NOTE(review): presumably in bytes, matching output_capacities of
     *         decompressMultiInto() — confirm.
287+ */
288+ std::vector<size_t > getMaxDecompressedOutputSizes () const ;
289+
191290 /* * Free non-persistent buffers and reset execution state for re-use.
       *
       * @param stream CUDA stream argument (defaults to stream 0).
       *               NOTE(review): presumably the stream on which the frees are
       *               ordered — confirm against the implementation. */
192291 void reset (cudaStream_t stream = 0 );
193292
@@ -270,6 +369,12 @@ class Pipeline {
270369 /* * Parse the FZM header from a file without decompressing the payload.
       *
       * Static: does not require a Pipeline instance.
       *
       * @param filename Path of the FZM file to read.
       * @return The parsed FZMFileHeader.
       * NOTE(review): behavior for a missing or malformed file is not visible
       * from this declaration — confirm. */
271370 static FZMFileHeader readHeader (const std::string& filename);
272371
372+ /* * Exact decompressed output size from an FZM file (single-source convenience).
     *
     * Static: does not require a Pipeline instance.
     *
     * @param filename Path of the FZM file to inspect.
     * @return Exact decompressed output size. NOTE(review): presumably in bytes and
     *         obtained from the file header, like getDecompressedOutputSizesFromFile() — confirm.
     * NOTE(review): behavior for a multi-source file is not visible here — confirm. */
373+ static size_t getDecompressedOutputSizeFromFile (const std::string& filename);
374+
375+ /* * Exact decompressed output sizes (one per source) from an FZM file header.
     *
     * Static: does not require a Pipeline instance.
     *
     * @param filename Path of the FZM file to inspect.
     * @return One exact size per source. NOTE(review): presumably in bytes and in
     *         the same source order as decompressMulti()/decompressMultiInto() — confirm. */
376+ static std::vector<size_t > getDecompressedOutputSizesFromFile (const std::string& filename);
377+
273378 /* * Build the FZM header from current pipeline state. Requires a prior compress().
       *
       * @return The FZMFileHeader built from the current pipeline state.
       * NOTE(review): behavior when no compress() has run yet is not visible from
       * this declaration — confirm. */
274379 FZMFileHeader buildHeader () const ;
275380
@@ -332,6 +437,14 @@ class Pipeline {
332437 std::vector<Stage*> getSourceStages () const ; // Returns Stage pointers; NOTE(review): presumably the pipeline's source (input) stages, non-owning — confirm.
333438 std::vector<Stage*> getSinkStages () const ; // Returns Stage pointers; NOTE(review): presumably the pipeline's sink (output) stages, non-owning — confirm.
334439
440+ std::vector<std::pair<void *, size_t >> decompressMultiImpl ( // NOTE(review): presumably the shared implementation behind decompressMulti() and decompressMultiInto() — confirm.
441+ const void * d_input, // Compressed input; presumably nullptr means "read from live forward DAG buffers" as in the public decompress docs — confirm.
442+ size_t input_size, // Presumably the byte size of d_input, as in the public declarations — confirm.
443+ cudaStream_t stream, // CUDA stream; no default here, unlike the public declarations visible in this header.
444+ const std::vector<void *>* caller_outputs, // Passed by pointer, so nullable; presumably nullptr selects pool-owned outputs — TODO confirm.
445+ const std::vector<size_t >* caller_capacities // Presumably capacities in bytes paired one-to-one with caller_outputs — TODO confirm.
446+ ); // Returns {pointer, size} pairs; presumably one per source, as documented for decompressMulti() — confirm.
447+
335448 // ── Inverse DAG helpers ───────────────────────────────────────────────────
336449
337450 /* * Compact description of one forward stage used by buildInverseDAG(). */
0 commit comments