szcompressor
diff --git a/‎examples/caller_allocated_output.cpp‎
Lines changed: 117 additions & 0 deletions b/‎examples/caller_allocated_output.cpp‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎include/pipeline/compressor.h‎
Lines changed: 113 additions & 0 deletions b/‎include/pipeline/compressor.h‎
Lines changed: 113 additions & 0 deletions
diff --git a/‎src/pipeline/compressor.cpp‎
Lines changed: 89 additions & 0 deletions b/‎src/pipeline/compressor.cpp‎
Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,117 @@
+/**
+ * Simple caller-allocated output example:
+ *
+ *   - compressInto():    writes compressed bytes into a user-owned device buffer
+ *   - decompressInto():  writes decompressed bytes into a user-owned device buffer
+ *
+ * Build:
+ *   cmake -S . -B build -DBUILD_EXAMPLES=ON
+ *   cmake --build build -j --target caller_allocated_output
+ *
+ * Run:
+ *   ./build/bin/caller_allocated_output
+ */
+
+#include "cuda_check.h"
+#include "fzgpumodules.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+using namespace fz;
+
+// Builds n samples of a smooth synthetic signal: a sine term scaled by 50
+// plus a cosine term scaled by 20, both driven by the sample position.
+static std::vector<float> make_smooth_data(size_t n) {
+    std::vector<float> samples(n);
+    size_t position = 0;
+    for (float& sample : samples) {
+        const float x = static_cast<float>(position++);
+        sample = std::sin(x * 0.01f) * 50.0f + std::cos(x * 0.003f) * 20.0f;
+    }
+    return samples;
+}
+
+/**
+ * Round-trips synthetic data through compressInto()/decompressInto() using
+ * caller-owned device buffers, then prints the sizes and the max absolute
+ * reconstruction error.
+ *
+ * @return 0 on success, 1 if the decompressed size differs from the input size.
+ */
+int main() {
+    constexpr size_t N = 1 << 16;
+    constexpr float EB = 1e-2f;
+    const size_t input_bytes = N * sizeof(float);
+
+    auto h_input = make_smooth_data(N);
+
+    float* d_input = nullptr;
+    FZ_CUDA_CHECK(cudaMalloc(&d_input, input_bytes));
+    FZ_CUDA_CHECK(cudaMemcpy(d_input, h_input.data(), input_bytes, cudaMemcpyHostToDevice));
+
+    Pipeline pipeline(input_bytes, MemoryStrategy::MINIMAL);
+    auto* lorenzo = pipeline.addStage<LorenzoStage<float, uint16_t>>();
+    lorenzo->setErrorBound(EB);
+    lorenzo->setQuantRadius(512);
+    pipeline.finalize();
+
+    // Ask the pipeline for a max compressed output size before allocating.
+    const size_t compressed_capacity = pipeline.getMaxCompressedOutputSize();
+
+    // User-owned compressed output buffer.
+    void* d_compressed_user = nullptr;
+    FZ_CUDA_CHECK(cudaMalloc(&d_compressed_user, compressed_capacity));
+
+    size_t compressed_size = 0;
+    pipeline.compressInto(
+        d_input,
+        input_bytes,
+        d_compressed_user,
+        compressed_capacity,
+        &compressed_size,
+        0);
+    FZ_CUDA_CHECK(cudaDeviceSynchronize());
+
+    // Ask the pipeline for a safe upper bound before allocating output.
+    const size_t decompressed_capacity = pipeline.getMaxDecompressedOutputSize();
+
+    // User-owned decompressed output buffer.
+    void* d_decompressed_user = nullptr;
+    FZ_CUDA_CHECK(cudaMalloc(&d_decompressed_user, decompressed_capacity));
+
+    size_t decompressed_size = 0;
+    pipeline.decompressInto(
+        d_compressed_user,
+        compressed_size,
+        d_decompressed_user,
+        decompressed_capacity,
+        &decompressed_size,
+        0);
+    FZ_CUDA_CHECK(cudaDeviceSynchronize());
+
+    if (decompressed_size != input_bytes) {
+        std::cerr << "Unexpected decompressed size: " << decompressed_size
+                  << " (expected " << input_bytes << ")\n";
+        // Already on the failure path: release buffers best-effort so the
+        // size mismatch stays the reported problem.
+        cudaFree(d_decompressed_user);
+        cudaFree(d_compressed_user);
+        cudaFree(d_input);
+        return 1;
+    }
+
+    std::vector<float> h_recon(N);
+    FZ_CUDA_CHECK(cudaMemcpy(
+        h_recon.data(), d_decompressed_user, input_bytes, cudaMemcpyDeviceToHost));
+
+    float max_abs_error = 0.0f;
+    for (size_t i = 0; i < N; ++i) {
+        max_abs_error = std::max(max_abs_error, std::abs(h_recon[i] - h_input[i]));
+    }
+
+    std::cout << std::fixed << std::setprecision(3);
+    std::cout << "Caller-allocated output API example\n";
+    std::cout << "  input bytes:        " << input_bytes << "\n";
+    std::cout << "  compressed size:    " << compressed_size << "\n";
+    std::cout << "  compressed cap:     " << compressed_capacity << "\n";
+    std::cout << "  decompressed cap:   " << decompressed_capacity << "\n";
+    std::cout << "  decompressed size:  " << decompressed_size << "\n";
+    std::cout << "  max abs error:      " << max_abs_error << "\n";
+
+    // Check the frees like every other CUDA call here, so an error reported
+    // by cudaFree is not silently dropped on the success path.
+    FZ_CUDA_CHECK(cudaFree(d_decompressed_user));
+    FZ_CUDA_CHECK(cudaFree(d_compressed_user));
+    FZ_CUDA_CHECK(cudaFree(d_input));
+    return 0;
+}
@@ -134,6 +134,40 @@ class Pipeline {
         cudaStream_t stream = 0
     );
 
+    /**
+     * Compress into a caller-allocated device buffer.
+     *
+     * getMaxCompressedOutputSize() is documented as returning an upper bound
+     * suitable for sizing d_output before this call.
+     *
+     * @param d_input          Device pointer to raw input.
+     * @param input_size       Input size in bytes.
+     * @param d_output         Caller-allocated device buffer for compressed bytes.
+     * @param output_capacity  Capacity of d_output in bytes.
+     * @param output_size      Receives the compressed size in bytes.
+     * @param stream           CUDA stream for all GPU operations.
+     */
+    void compressInto(
+        const void* d_input,
+        size_t      input_size,
+        void*       d_output,
+        size_t      output_capacity,
+        size_t*     output_size,
+        cudaStream_t stream = 0
+    );
+
+    /**
+     * Compress multi-source input into a caller-allocated device buffer.
+     *
+     * Output format matches compress(const std::vector<InputSpec>&):
+     * single output buffer for single-source pipelines; concat format for
+     * multi-source pipelines.
+     *
+     * @param inputs           Inputs to compress. NOTE(review): presumably one
+     *                         InputSpec per source stage, as for compress() — confirm.
+     * @param d_output         Caller-allocated device buffer for compressed bytes.
+     * @param output_capacity  Capacity of d_output in bytes.
+     * @param output_size      Receives the compressed size in bytes.
+     * @param stream           CUDA stream for all GPU operations.
+     *
+     * NOTE(review): the last four parameter descriptions are carried over from
+     * the single-input overload — confirm they hold here.
+     */
+    void compressInto(
+        const std::vector<InputSpec>& inputs,
+        void*       d_output,
+        size_t      output_capacity,
+        size_t*     output_size,
+        cudaStream_t stream = 0
+    );
+
     /**
      * Compress (multi-source). One InputSpec per source stage; order does not matter.
      * *d_output is pool-owned — do NOT cudaFree.
@@ -177,6 +211,45 @@ class Pipeline {
         cudaStream_t stream = 0
     );
 
+    /**
+     * Decompress into a caller-allocated device buffer (single-source only).
+     *
+     * getMaxDecompressedOutputSize() is documented as returning an upper bound
+     * suitable for sizing d_output before this call.
+     *
+     * @param d_input          nullptr to read from live forward DAG buffers, or
+     *                         a device pointer to external compressed bytes.
+     * @param input_size       Byte size of d_input (ignored when d_input is nullptr).
+     * @param d_output         Caller-allocated device buffer for decompressed bytes.
+     * @param output_capacity  Capacity of d_output in bytes.
+     * @param output_size      Receives the exact decompressed size in bytes.
+     * @param stream           CUDA stream for all GPU operations.
+     */
+    void decompressInto(
+        const void* d_input,
+        size_t      input_size,
+        void*       d_output,
+        size_t      output_capacity,
+        size_t*     output_size,
+        cudaStream_t stream = 0
+    );
+
+    /**
+     * Decompress multi-source data into caller-allocated output buffers.
+     *
+     * getMaxDecompressedOutputSizes() is documented as returning per-source
+     * upper bounds in the same source order, suitable for sizing each output.
+     *
+     * @param d_input             nullptr to read from live forward DAG buffers,
+     *                            or a device pointer to external compressed bytes.
+     * @param input_size          Byte size of d_input (ignored when d_input is nullptr).
+     * @param d_outputs           One device pointer per source output.
+     * @param output_capacities   Capacity (bytes) for each output pointer.
+     *                            NOTE(review): presumably must have the same
+     *                            length as d_outputs — confirm.
+     * @param stream              CUDA stream for all GPU operations.
+     * @return Exact decompressed size (bytes) for each source output.
+     */
+    std::vector<size_t> decompressMultiInto(
+        const void*                 d_input,
+        size_t                      input_size,
+        const std::vector<void*>&   d_outputs,
+        const std::vector<size_t>&  output_capacities,
+        cudaStream_t                stream = 0
+    );
+
     /**
      * Decompress (multi-source). Returns one {device_ptr, size} pair per source,
      * in the same order as forward source discovery. Ownership follows
@@ -188,6 +261,32 @@ class Pipeline {
         cudaStream_t stream    = 0
     );
 
+    /**
+     * Maximum compressed output size for the current finalized pipeline.
+     *
+     * Returned value is an upper bound suitable for caller allocation before
+     * compressInto(...). For multi-source pipelines this corresponds to the
+     * single concat output format returned by compress().
+     *
+     * @throws std::runtime_error if finalize() has not been called, the DAG is
+     *         not initialized, or the pipeline has no detected output buffers.
+     */
+    size_t getMaxCompressedOutputSize() const;
+
+    /**
+     * Maximum decompressed output size (single-source pipelines only).
+     *
+     * Value is derived from the most recent compress() input size when available,
+     * otherwise from finalize-time size hints. Returns an upper bound suitable
+     * for caller allocation before decompressInto().
+     *
+     * @throws std::runtime_error if finalize() has not been called, or if
+     *         getMaxDecompressedOutputSizes() does not report exactly one source.
+     */
+    size_t getMaxDecompressedOutputSize() const;
+
+    /**
+     * Maximum decompressed output size per source (multi-source aware).
+     *
+     * Order matches decompressMulti()/decompressMultiInto() source order.
+     * Values are upper bounds suitable for caller allocation.
+     *
+     * An entry is 0 when no size is known for that source: no prior compress()
+     * size, no per-source or constructor size hint, and no DAG input buffer
+     * size larger than the 1-byte placeholder. Non-zero entries are rounded up
+     * to the input alignment when that alignment is greater than 1.
+     *
+     * @throws std::runtime_error if finalize() has not been called.
+     */
+    std::vector<size_t> getMaxDecompressedOutputSizes() const;
+
     /** Free non-persistent buffers and reset execution state for re-use. */
     void reset(cudaStream_t stream = 0);
 
@@ -270,6 +369,12 @@ class Pipeline {
     /** Parse the FZM header from a file without decompressing the payload. */
     static FZMFileHeader readHeader(const std::string& filename);
 
+    /**
+     * Exact decompressed output size from an FZM file (single-source convenience).
+     *
+     * NOTE(review): behavior for a file that describes more than one source is
+     * not visible from this declaration — confirm against the implementation.
+     */
+    static size_t getDecompressedOutputSizeFromFile(const std::string& filename);
+
+    /**
+     * Exact decompressed output sizes (one per source) from an FZM file header.
+     *
+     * NOTE(review): element order presumably matches decompressMulti() source
+     * order — confirm against the implementation.
+     */
+    static std::vector<size_t> getDecompressedOutputSizesFromFile(const std::string& filename);
+
     /** Build the FZM header from current pipeline state. Requires a prior compress(). */
     FZMFileHeader buildHeader() const;
 
@@ -332,6 +437,14 @@ class Pipeline {
     std::vector<Stage*> getSourceStages() const;
     std::vector<Stage*> getSinkStages() const;
 
+    std::vector<std::pair<void*, size_t>> decompressMultiImpl(  // NOTE(review): presumably the shared implementation behind decompressMulti()/decompressMultiInto() — confirm
+        const void*                 d_input,
+        size_t                      input_size,
+        cudaStream_t                stream,
+        const std::vector<void*>*   caller_outputs,     // passed by pointer, so possibly optional (nullptr) — TODO confirm
+        const std::vector<size_t>*  caller_capacities   // presumably parallel to caller_outputs — TODO confirm
+    );
+
     // ── Inverse DAG helpers ───────────────────────────────────────────────────
 
     /** Compact description of one forward stage used by buildInverseDAG(). */
 
@@ -584,6 +584,95 @@ size_t Pipeline::getPoolThreshold() const {
     return mem_pool_ ? mem_pool_->getConfiguredSize() : 0;
 }
 
+/**
+ * Upper bound, in bytes, on the compressed output of the finalized pipeline.
+ *
+ * Without concat this is the DAG-reported size of the first output buffer.
+ * With concat it is a 16-byte-aligned header (one uint32_t plus one uint64_t
+ * per output buffer) followed by every output buffer's DAG-reported size,
+ * each rounded up to a multiple of 16.
+ *
+ * @return Upper bound in bytes.
+ * @throws std::runtime_error if finalize() has not been called, the DAG is
+ *         not initialized, or no output buffers were detected.
+ */
+size_t Pipeline::getMaxCompressedOutputSize() const {
+    if (!is_finalized_) {
+        throw std::runtime_error(
+            "getMaxCompressedOutputSize() requires finalize() to be called first");
+    }
+    if (!dag_) {
+        throw std::runtime_error("getMaxCompressedOutputSize(): DAG is not initialized");
+    }
+    if (output_buffer_ids_.empty()) {
+        throw std::runtime_error(
+            "getMaxCompressedOutputSize(): pipeline has no detected output buffers");
+    }
+
+    if (!needs_concat_) {
+        return dag_->getBufferSize(output_buffer_ids_[0]);
+    }
+
+    // Mirror concat format capacity computation using buffer capacities rather
+    // than run-time actual sizes.
+    // The mask must be built at size_t width: `~15u` is a 32-bit unsigned int
+    // that zero-extends to 0x00000000FFFFFFF0 and would drop the high bits of
+    // any size of 4 GiB or more, shrinking the reported bound.
+    auto align16_local = [](size_t x) -> size_t {
+        constexpr size_t kAlignMask = static_cast<size_t>(15);
+        return (x + kAlignMask) & ~kAlignMask;
+    };
+
+    const size_t n = output_buffer_ids_.size();
+    size_t total = align16_local(sizeof(uint32_t) + n * sizeof(uint64_t));
+    for (int buf_id : output_buffer_ids_) {
+        total += align16_local(dag_->getBufferSize(buf_id));
+    }
+    return total;
+}
+
+/**
+ * Per-source upper bounds, in bytes, on decompressed output.
+ *
+ * For each input node the first available value wins, in this order: the
+ * source size recorded by the most recent compress(), the per-source hint,
+ * the constructor hint, then the DAG input buffer size when it is larger than
+ * the 1-byte placeholder. A non-zero result is rounded up to the input
+ * alignment; 0 means no size information was available.
+ *
+ * @throws std::runtime_error if finalize() has not been called.
+ */
+std::vector<size_t> Pipeline::getMaxDecompressedOutputSizes() const {
+    if (!is_finalized_) {
+        throw std::runtime_error(
+            "getMaxDecompressedOutputSizes() requires finalize() to be called first");
+    }
+
+    const size_t source_count = input_nodes_.size();
+    std::vector<size_t> bounds;
+    bounds.reserve(source_count);
+
+    for (size_t src = 0; src < source_count; ++src) {
+        size_t bound = 0;
+
+        if (src < source_input_sizes_.size() && source_input_sizes_[src] > 0) {
+            // Best information: what the last compress() actually consumed.
+            bound = source_input_sizes_[src];
+        } else {
+            Stage* source_stage = input_nodes_[src]->stage;
+            auto hint = per_source_hints_.find(source_stage);
+            if (hint != per_source_hints_.end() && hint->second > 0) {
+                // Per-source hint.
+                bound = hint->second;
+            } else if (input_size_hint_ > 0) {
+                // Constructor hint (single-source common case).
+                bound = input_size_hint_;
+            } else if (src < input_buffer_ids_.size() && dag_) {
+                // Last resort: the DAG input buffer, unless it is only the
+                // 1-byte placeholder used when no hint is available.
+                const size_t dag_bytes = dag_->getBufferSize(input_buffer_ids_[src]);
+                if (dag_bytes > 1) bound = dag_bytes;
+            }
+        }
+
+        // The value is an allocation size, so keep the alignment rounding.
+        if (bound > 0 && input_alignment_bytes_ > 1) {
+            bound = ((bound + input_alignment_bytes_ - 1) / input_alignment_bytes_)
+                  * input_alignment_bytes_;
+        }
+
+        bounds.push_back(bound);
+    }
+
+    return bounds;
+}
+
+/**
+ * Single-source wrapper around getMaxDecompressedOutputSizes().
+ *
+ * @return The only entry reported by getMaxDecompressedOutputSizes().
+ * @throws std::runtime_error if that call does not report exactly one entry
+ *         (or throws itself).
+ */
+size_t Pipeline::getMaxDecompressedOutputSize() const {
+    const std::vector<size_t> per_source = getMaxDecompressedOutputSizes();
+    if (per_source.size() == 1) {
+        return per_source.front();
+    }
+
+    throw std::runtime_error(
+        "getMaxDecompressedOutputSize() is single-source only; use getMaxDecompressedOutputSizes() for " +
+        std::to_string(per_source.size()) + " source outputs");
+}
+
 size_t Pipeline::getCurrentMemoryUsage() const {
     return dag_ ? dag_->getCurrentMemoryUsage() : 0;
 }