diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..0388516 --- /dev/null +++ b/.clang-format @@ -0,0 +1,90 @@ +--- +Language: Cpp +BasedOnStyle: Microsoft +Standard: Latest + +IndentWidth: 4 +TabWidth: 4 +UseTab: Never +ColumnLimit: 120 + +BreakBeforeBraces: Allman +BraceWrapping: + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: MultiLine + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + AfterExternBlock: true + BeforeCatch: true + BeforeElse: true + BeforeLambdaBody: true + BeforeWhile: true + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true + +NamespaceIndentation: All +FixNamespaceComments: true +CompactNamespaces: false + +AccessModifierOffset: -4 +IndentAccessModifiers: false + +PointerAlignment: Left +ReferenceAlignment: Left +DerivePointerAlignment: false + +AlignAfterOpenBracket: AlwaysBreak +BinPackArguments: false +BinPackParameters: false +AllowAllArgumentsOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +PenaltyBreakBeforeFirstCallParameter: 0 + +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +IndentWrappedFunctionNames: false + +AllowShortFunctionsOnASingleLine: Empty +AllowShortBlocksOnASingleLine: Never +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: false + +AlwaysBreakTemplateDeclarations: Yes +BreakConstructorInitializers: BeforeComma +PackConstructorInitializers: Never + +Cpp11BracedListStyle: true +SpaceBeforeCpp11BracedList: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceInEmptyParentheses: false +SpaceBeforeParens: ControlStatements +SpaceAfterTemplateKeyword: false + +SortIncludes: CaseSensitive +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<.*>$' + Priority: 1 + - Regex: 
'^"(blosc2|nlohmann|nvcomp|cuda|cuda_runtime).*' + Priority: 2 + - Regex: '^".*"$' + Priority: 3 + +ReflowComments: true +AlignTrailingComments: true +KeepEmptyLinesAtTheStartOfBlocks: false +MaxEmptyLinesToKeep: 2 + +DeriveLineEnding: true +InsertNewlineAtEOF: true +... \ No newline at end of file diff --git a/.clang-format-ignore b/.clang-format-ignore new file mode 100644 index 0000000..c3d6b98 --- /dev/null +++ b/.clang-format-ignore @@ -0,0 +1 @@ +thirdparty/** \ No newline at end of file diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..3c1deb3 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,42 @@ +Checks: > + -*, + readability-identifier-naming + +CheckOptions: + - key: readability-identifier-naming.FunctionCase + value: lower_case + - key: readability-identifier-naming.MethodCase + value: lower_case + - key: readability-identifier-naming.VariableCase + value: lower_case + + # Allow local temporaries like _compressor + - key: readability-identifier-naming.LocalVariableCase + value: lower_case + - key: readability-identifier-naming.LocalVariableIgnoredRegexp + value: '^_[a-z0-9_]+$' + + # Public members: no prefix + - key: readability-identifier-naming.PublicMemberCase + value: lower_case + + # Protected members: m_ prefix + - key: readability-identifier-naming.ProtectedMemberCase + value: lower_case + - key: readability-identifier-naming.ProtectedMemberPrefix + value: m_ + + # Private members: m_ prefix + - key: readability-identifier-naming.PrivateMemberCase + value: lower_case + - key: readability-identifier-naming.PrivateMemberPrefix + value: m_ + + - key: readability-identifier-naming.ClassCase + value: lower_case + - key: readability-identifier-naming.StructCase + value: lower_case + - key: readability-identifier-naming.EnumCase + value: lower_case + - key: readability-identifier-naming.EnumConstantCase + value: lower_case \ No newline at end of file diff --git a/.gitignore b/.gitignore index d053b09..a2dbeb4 100644 --- a/.gitignore 
+++ b/.gitignore @@ -19,6 +19,8 @@ benchmark/images/*.tga # Release artifacts release/ +.idea/ + # Wheel Artifacts wheels/ wheelhouse/ diff --git a/.gitmodules b/.gitmodules index f87a610..ae3034d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -25,3 +25,6 @@ [submodule "thirdparty/pybind11_json"] path = thirdparty/pybind11_json url = https://github.com/pybind/pybind11_json +[submodule "thirdparty/spdlog"] + path = thirdparty/spdlog + url = https://github.com/gabime/spdlog diff --git a/CMakeLists.txt b/CMakeLists.txt index f2f033a..85de98f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,37 +1,42 @@ -cmake_minimum_required (VERSION 3.19) +cmake_minimum_required(VERSION 3.19) set(VCPKG_LIBRARY_LINKAGE static) +if (POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif () + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -set (CMAKE_CXX_STANDARD 20) -project (CompressedImageBuild) +set(CMAKE_CXX_STANDARD 20) +project(CompressedImageBuild) # If we are compiling as the main project we automatically turn on all the build options. 
# This can be circumvented by passing "-DCOMPRESSED_DETERMINE_MAIN_PROJECT=OFF" set(MAIN_PROJECT OFF) -option ( - COMPRESSED_DETERMINE_MAIN_PROJECT - "Whether to automatically determine if we are building this module as main project" - ON +option( + COMPRESSED_DETERMINE_MAIN_PROJECT + "Whether to automatically determine if we are building this module as main project" + ON ) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR AND COMPRESSED_DETERMINE_MAIN_PROJECT) - message("Compiling compressed-image as main project") - set(MAIN_PROJECT ON) -else() - set(MAIN_PROJECT OFF) -endif() + message("Compiling compressed-image as main project") + set(MAIN_PROJECT ON) +else () + set(MAIN_PROJECT OFF) +endif () if (MAIN_PROJECT) - set(COMPRESSED_IMAGE_USE_VCPKG ON) - set(COMPRESSED_IMAGE_BUILD_TESTS ON) - set(COMPRESSED_IMAGE_BUILD_EXAMPLES ON) - set(COMPRESSED_IMAGE_BUILD_PYTHON ON) - set(COMPRESSED_IMAGE_BUILD_DOCS ON) - set(COMPRESSED_IMAGE_BUILD_BENCHMARKS ON) - set(COMPRESSED_IMAGE_EXTENDED_WARNINGS ON) -endif() + set(COMPRESSED_IMAGE_USE_VCPKG ON) + set(COMPRESSED_IMAGE_BUILD_TESTS ON) + set(COMPRESSED_IMAGE_BUILD_EXAMPLES ON) + set(COMPRESSED_IMAGE_BUILD_PYTHON ON) + set(COMPRESSED_IMAGE_BUILD_DOCS ON) + set(COMPRESSED_IMAGE_BUILD_BENCHMARKS ON) + set(COMPRESSED_IMAGE_EXTENDED_WARNINGS ON) +endif () option(COMPRESSED_IMAGE_USE_VCPKG "Whether to use the submodule version of vcpkg to resolve the dependencies instead of system libraries." 
OFF) option(COMPRESSED_IMAGE_EXTENDED_WARNINGS "Whether to compile with extended warnings (-Wextra, -Werror etc.)" OFF) +set(COMPRESSED_IMAGE_CUDA_VERSION 12 CACHE STRING "CUDA Runtime/Toolkit version") option(COMPRESSED_IMAGE_BUILD_TESTS OFF) option(COMPRESSED_IMAGE_BUILD_EXAMPLES OFF) option(COMPRESSED_IMAGE_BUILD_DOCS OFF) @@ -44,70 +49,83 @@ option(_COMPRESSED_IMAGE_SANITIZE_FLAGS "Internal CI flag for enabling sanitizer # Add thirdparty libraries # -------------------------------------------------------------------------- +find_package(CUDAToolkit ${COMPRESSED_IMAGE_CUDA_VERSION} REQUIRED) + # Add c-blosc2 set(DEACTIVATE_ZLIB ON) set(BUILD_TESTS OFF) set(BUILD_FUZZERS OFF) set(BUILD_BENCHMARKS OFF) set(BUILD_EXAMPLES OFF) -add_subdirectory (thirdparty/c-blosc2) +add_subdirectory(thirdparty/c-blosc2) # Add target for blosc2 headers add_library(blosc2_include INTERFACE) -target_include_directories(blosc2_include SYSTEM INTERFACE thirdparty/c-blosc2/include) +target_include_directories(blosc2_include SYSTEM INTERFACE + $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/thirdparty/c-blosc2/include> + $<INSTALL_INTERFACE:include> +) + # JSON module for parsing/storing metadata add_subdirectory(thirdparty/json) +# spdlog for logging +set(SPDLOG_USE_STD_FORMAT ON) +add_subdirectory(thirdparty/spdlog) + +# Pull nvcomp (headers + dll/so target) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/FetchNvcomp.cmake) + # Include the local vcpkg toolchain, if requested, otherwise it is up to the user # to provide a valid OpenImageIO library (that can be found via find_package) if (COMPRESSED_IMAGE_USE_VCPKG) - include("${PROJECT_SOURCE_DIR}/thirdparty/vcpkg/scripts/buildsystems/vcpkg.cmake") -endif() + include("${CMAKE_CURRENT_LIST_DIR}/thirdparty/vcpkg/scripts/buildsystems/vcpkg.cmake") +endif () find_package(OpenImageIO CONFIG QUIET) if (OpenImageIO_FOUND) - message(STATUS "Found OpenImageIO") - set(COMPRESSED_IMAGE_HAVE_OIIO TRUE) -else() - message(WARNING "OpenImageIO not found, some features will not be available") - set(COMPRESSED_IMAGE_HAVE_OIIO TRUE) -endif() + message(STATUS "Found
OpenImageIO") + set(COMPRESSED_IMAGE_HAVE_OIIO TRUE) +else () + message(WARNING "OpenImageIO not found, some features will not be available") + set(COMPRESSED_IMAGE_HAVE_OIIO FALSE) +endif () # Projects # -------------------------------------------------------------------------- add_subdirectory(compressed_image) if (COMPRESSED_IMAGE_BUILD_TESTS) - add_library(doctest INTERFACE) - target_include_directories(doctest SYSTEM INTERFACE thirdparty/doctest/doctest) + add_library(doctest INTERFACE) + target_include_directories(doctest SYSTEM INTERFACE thirdparty/doctest/doctest) - add_subdirectory(test) -endif() + add_subdirectory(test) +endif () if (COMPRESSED_IMAGE_BUILD_EXAMPLES) - add_subdirectory(examples/read_from_file) - add_subdirectory(examples/read_with_postprocess) - add_subdirectory(examples/lazy_channels) - add_subdirectory(examples/modifying_image) -endif() + add_subdirectory(examples/read_from_file) + add_subdirectory(examples/read_with_postprocess) + add_subdirectory(examples/lazy_channels) + add_subdirectory(examples/modifying_image) +endif () if (COMPRESSED_IMAGE_BUILD_BENCHMARKS) - set(BENCHMARK_ENABLE_INSTALL OFF) - set(BENCHMARK_INSTALL_DOCS OFF) - set(BENCHMARK_ENABLE_TESTING OFF) - set(BENCHMARK_ENABLE_GTEST_TESTS OFF) - add_subdirectory(thirdparty/benchmark) - add_subdirectory(benchmark) -endif() + set(BENCHMARK_ENABLE_INSTALL OFF) + set(BENCHMARK_INSTALL_DOCS OFF) + set(BENCHMARK_ENABLE_TESTING OFF) + set(BENCHMARK_ENABLE_GTEST_TESTS OFF) + add_subdirectory(thirdparty/benchmark) + add_subdirectory(benchmark) +endif () if (COMPRESSED_IMAGE_BUILD_DOCS) - add_subdirectory(docs) -endif() + add_subdirectory(docs) +endif () if (COMPRESSED_IMAGE_BUILD_PYTHON) - add_subdirectory(thirdparty/pybind11) - add_subdirectory(thirdparty/pybind11_image_util) - add_subdirectory(thirdparty/pybind11_json) - add_subdirectory(python) -endif() \ No newline at end of file + add_subdirectory(thirdparty/pybind11) + add_subdirectory(thirdparty/pybind11_image_util) + 
add_subdirectory(thirdparty/pybind11_json) + add_subdirectory(python) +endif () \ No newline at end of file diff --git a/cmake/FetchNvcomp.cmake b/cmake/FetchNvcomp.cmake new file mode 100644 index 0000000..0afa027 --- /dev/null +++ b/cmake/FetchNvcomp.cmake @@ -0,0 +1,64 @@ +# FetchNvcomp.cmake +# Fetch NVCOMP headers and dynamic libraries for runtime loading. +# Provides namespaced target: +# compressed::nvcomp_headers + +include(FetchContent) + +############################################################## +# Fetch dynamically from NVIDIA Redistributables +############################################################## + +if (WIN32) + set(NVCOMP_URL "https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/windows-x86_64/nvcomp-windows-x86_64-5.0.0.6_cuda11-archive.zip") + set(NVCOMP_SHA256 "5C2E1EE55398F47D28806EB7C53ACA33B9E22D6D5B3ACEC86BBC4253C7E6D1D3") +elseif (UNIX AND NOT APPLE) + set(NVCOMP_URL "https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/linux-x86_64/nvcomp-linux-x86_64-5.0.0.6_cuda11-archive.tar.xz") + set(NVCOMP_SHA256 "64F5F7CC622F36006C503EE5A3F9D730B5C6CC49E4FAB0FC0507C1272D5EFA7B") +else () + message(FATAL_ERROR "Unsupported platform for NVCOMP") +endif () + +FetchContent_Declare(_nvcomp_src + URL ${NVCOMP_URL} + URL_HASH SHA256=${NVCOMP_SHA256} +) +FetchContent_MakeAvailable(_nvcomp_src) +set(NVCOMP_ROOT ${_nvcomp_src_SOURCE_DIR}) + +############################################################## +# Locate Runtime Binaries +############################################################## + +if (WIN32) + file(GLOB FOUND_BINARIES "${NVCOMP_ROOT}/bin/*.dll") +else () + file(GLOB FOUND_BINARIES "${NVCOMP_ROOT}/lib/libnvcomp.so*") +endif () + +set(NVCOMP_RUNTIME_BINARIES "${FOUND_BINARIES}" CACHE INTERNAL "nvcomp runtime binaries") + +############################################################## +# Set up Compile-Time Header Targets +############################################################## + 
+add_library(compressed_nvcomp_headers INTERFACE) + +target_include_directories(compressed_nvcomp_headers INTERFACE + $<BUILD_INTERFACE:${NVCOMP_ROOT}/include> + $<INSTALL_INTERFACE:include> +) + +add_library(compressed::nvcomp_headers ALIAS compressed_nvcomp_headers) + +############################################################## +# Install Rules (For Deployment / Packaging) +############################################################## + +install(DIRECTORY ${NVCOMP_ROOT}/include/ DESTINATION include) + +if (WIN32) + install(FILES ${NVCOMP_RUNTIME_BINARIES} DESTINATION bin) +else () + install(FILES ${NVCOMP_RUNTIME_BINARIES} DESTINATION lib) +endif () \ No newline at end of file diff --git a/compressed_image/CMakeLists.txt b/compressed_image/CMakeLists.txt index 02b814d..688917a 100644 --- a/compressed_image/CMakeLists.txt +++ b/compressed_image/CMakeLists.txt @@ -1,51 +1,56 @@ project(CompressedImage) add_library(compressed_image INTERFACE) -target_include_directories(compressed_image INTERFACE "include") -target_link_libraries(compressed_image INTERFACE - blosc2_static - blosc2_include - nlohmann_json - OpenImageIO::OpenImageIO +target_include_directories(compressed_image INTERFACE + include + ${CUDAToolkit_INCLUDE_DIRS} +) +target_link_libraries(compressed_image INTERFACE + blosc2_static + blosc2_include + nlohmann_json + $<$<BOOL:${COMPRESSED_IMAGE_HAVE_OIIO}>:OpenImageIO::OpenImageIO> + compressed::nvcomp_headers + spdlog::spdlog_header_only ) if (MSVC) - target_compile_options(compressed_image INTERFACE /utf-8 /MP /DNOMINMAX) -endif() + target_compile_options(compressed_image INTERFACE /utf-8 /MP /DNOMINMAX) +endif () if (COMPRESSED_IMAGE_HAVE_OIIO) - target_compile_definitions(compressed_image INTERFACE COMPRESSED_IMAGE_OIIO_AVAILABLE) -endif() + target_compile_definitions(compressed_image INTERFACE COMPRESSED_IMAGE_OIIO_AVAILABLE) +endif () # Crank up warning levels on both MSVC, Clang and GCC if (COMPRESSED_IMAGE_EXTENDED_WARNINGS) - if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") - target_compile_options( - compressed_image - INTERFACE - -Wall - -Werror -
-Wextra - ) - elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - target_compile_options( - compressed_image - INTERFACE - /W4 - /WX - /w44062 - /w44464 - /w45264 - ) - endif() -endif() + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") + target_compile_options( + compressed_image + INTERFACE + -Wall + -Werror + -Wextra + ) + elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + target_compile_options( + compressed_image + INTERFACE + /W4 + /WX + /w44062 + /w44464 + /w45264 + ) + endif () +endif () # Enable sanitizers unless on macOS (not supported) or Windows (github runners run out of memory). # These are for our CI runs only and should be ignored by users. if (_COMPRESSED_IMAGE_SANITIZE_FLAGS AND NOT APPLE) - if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") - target_compile_options(compressed_image INTERFACE -fsanitize=address,leak,undefined) - target_link_options(compressed_image INTERFACE -fsanitize=address,leak,undefined) - endif() -endif() \ No newline at end of file + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") + target_compile_options(compressed_image INTERFACE -fsanitize=address,leak,undefined) + target_link_options(compressed_image INTERFACE -fsanitize=address,leak,undefined) + endif () +endif () \ No newline at end of file diff --git a/compressed_image/include/compressed/blosc2/lazyschunk.h b/compressed_image/include/compressed/blosc2/lazyschunk.h index f92eda7..6b44e57 100644 --- a/compressed_image/include/compressed/blosc2/lazyschunk.h +++ b/compressed_image/include/compressed/blosc2/lazyschunk.h @@ -10,365 +10,519 @@ #include "compressed/util.h" #include "wrapper.h" #include "schunk_mixin.h" +#include "compressed/cuda/compression.h" #include "compressed/detail/scoped_timer.h" -namespace NAMESPACE_COMPRESSED_IMAGE +namespace +NAMESPACE_COMPRESSED_IMAGE { - - namespace blosc2 - { - - namespace detail - { - - /// Wrapper representing a lazy chunk holding either an initialized (and compressed) chunk - /// in the form of a byte array or just a single T representing a 
lazy state - template - struct lazy_chunk - { - std::variant, T> value; - size_t num_elements = 0; - - size_t byte_size() const noexcept - { - return num_elements * sizeof(T); - } - - bool is_lazy() const noexcept - { - return std::holds_alternative(this->value); - } - }; - - } // detail - - - template - struct lazy_schunk final : public detail::schunk_mixin> - { - using detail::schunk_mixin>::chunk_bytes; - - lazy_schunk() = default; - lazy_schunk(lazy_schunk&& other) noexcept - { - this->m_Chunks = std::move(other.m_Chunks); - this->m_ChunkSize = other.m_ChunkSize; - this->m_BlockSize = other.m_BlockSize; - } - lazy_schunk& operator=(lazy_schunk&& other) noexcept - { - if (this != &other) - { - this->m_Chunks = std::move(other.m_Chunks); - this->m_ChunkSize = other.m_ChunkSize; - this->m_BlockSize = other.m_BlockSize; - } - return *this; - } - lazy_schunk(const lazy_schunk& other) = default; - lazy_schunk& operator=(const lazy_schunk& other) = default; - - - /// Initialize a lazy super-chunk from the given value, has a near-zero - /// cost with the chunks only being initialized on read/modify. - /// - /// \param value The initial value to fill. - /// \param num_elements The size to initialize the data with. - /// \param block_size The requested chunk size. It is up to the caller to ensure - /// this is appropriately sized - /// \param chunk_size The requested chunk size. It is up to the caller to ensure - /// this is appropriately sized (i.e. by using util::align_chunk_to_scanlines) - lazy_schunk(T value, size_t num_elements, size_t block_size, size_t chunk_size) - { - util::validate_chunk_size(chunk_size, "lazy_schunk"); - this->m_BlockSize = block_size; - this->m_ChunkSize = chunk_size; - - size_t num_bytes = num_elements * sizeof(T); - - // Calculate all 'full' chunks and the final remainder (if any). 
- size_t num_full_chunks = num_bytes / this->m_ChunkSize; - size_t remainder_bytes = num_bytes - (this->m_ChunkSize * num_full_chunks); - - // Initialize lazy chunks with the provided value of T - for ([[maybe_unused]] auto idx : std::views::iota(size_t{ 0 }, num_full_chunks)) - { - detail::lazy_chunk chunk = { value, this->m_ChunkSize / sizeof(T) }; - this->m_Chunks.push_back(std::move(chunk)); - } - if (remainder_bytes > 0) - { - detail::lazy_chunk chunk = { value, remainder_bytes / sizeof(T) }; - this->m_Chunks.push_back(std::move(chunk)); - } - } - - size_t chunk_bytes(size_t index) const override - { - if (index > this->m_Chunks.size() - 1) - { - throw std::out_of_range( - std::format("Cannot access index {} in lazy-schunk. Total amount of chunks is {}", index, this->m_Chunks.size()) - ); - } - - return this->m_Chunks[index].num_elements * sizeof(T); - } - - /// convert the lazy schunk into a super-chunk, generating any - /// not yet initialized lazy chunks in the process. This should - /// be done once all the data is computed to minimize the overhead. - schunk_ptr to_schunk() override - { - _COMPRESSED_PROFILE_FUNCTION(); - // Initialize the chunks, either appending the byte array directly to the schunk - // or compressing the lazy chunk. - blosc2::schunk_ptr schunk = create_default_schunk(); - - // Allocate and compress the lazy buff. Since this only needs to happen once - // as all lazy values are the same we can just use the same compressed buffer for all. 
- util::default_init_vector lazy_compressed_data; - if (this->has_lazy_chunk()) - { - auto lazy_buff = std::vector(this->chunk_elements(), this->lazy_chunk_value()); - lazy_compressed_data.resize(blosc2::min_compressed_size(this->m_ChunkSize)); - - auto context = blosc2::create_compression_context( - schunk, - std::thread::hardware_concurrency(), - enums::codec::lz4, - 9, - this->m_BlockSize - ); - blosc2::compress(context, std::span(lazy_buff), std::span(lazy_compressed_data)); - } - - // Iterate all the chunks, if lazy add the compressed lazy buffer, else add the compressed data. - for (auto& chunk : this->m_Chunks) - { - if (std::holds_alternative>(chunk.value)) - { - auto& data = std::get>(chunk.value); - blosc2_schunk_append_chunk( - schunk.get(), - reinterpret_cast(data.data()), - true // copy - ); - } - else - { - assert(lazy_compressed_data.size() >= BLOSC2_MAX_OVERHEAD); - // we already initialized the buffer to the lazychunk value above - blosc2_schunk_append_chunk( - schunk.get(), - reinterpret_cast(lazy_compressed_data.data()), - true // copy - ); - } - } - - return schunk; - } - - /// Generate an uncompressed vector from the chunks, using the decompression context - /// to perform the decompression. - std::vector to_uncompressed(blosc2::context_ptr& decompression_ctx) const override - { - std::vector uncompressed(this->size(), this->lazy_chunk_value()); - - size_t offset = 0; // element offset - for (const auto& chunk : this->m_Chunks) - { - if (std::holds_alternative>(chunk.value)) - { - auto subspan = std::span(uncompressed.data() + offset, chunk.num_elements); - blosc2::decompress(decompression_ctx, subspan, std::get>(chunk.value)); - } - // Since we already initialized the uncompressed data to the lazy chunks' value we don't need - // to do any filling here. 
- offset += chunk.num_elements; - } - - return uncompressed; - } - - std::vector chunk(blosc2::context_ptr& decompression_ctx, size_t index) const override - { - return this->chunk(decompression_ctx.get(), index); - } - - std::vector chunk(blosc2::context_raw_ptr decompression_ctx, size_t index) const override - { - if (index > this->m_Chunks.size() - 1) - { - throw std::out_of_range( - std::format("Cannot access index {} in lazy-schunk. Total amount of chunks is {}", index, this->m_Chunks.size()) - ); - } - - if (std::holds_alternative>(this->m_Chunks[index].value)) - { - std::vector uncompressed(this->chunk_elements(index), 0); - this->chunk(decompression_ctx, std::span(uncompressed), index); - return uncompressed; - } - return std::vector(this->chunk_elements(index), std::get(this->m_Chunks[index].value)); - } - - void chunk(blosc2::context_ptr& decompression_ctx, std::span buffer, size_t index) const override - { - this->chunk(decompression_ctx.get(), buffer, index); - } - - void chunk(blosc2::context_raw_ptr decompression_ctx, std::span buffer, size_t index) const override - { - if (index > this->m_Chunks.size() - 1) - { - throw std::out_of_range( - std::format("Cannot access index {} in lazy-schunk. 
Total amount of chunks is {}", index, this->m_Chunks.size()) - ); - } - - // Either decompress from the compressed data or fill with the lazy chunks value - if (std::holds_alternative>(this->m_Chunks.at(index).value)) - { - auto& compressed = std::get>(this->m_Chunks.at(index).value); - blosc2::decompress( - decompression_ctx, - buffer, - std::span(compressed) - ); - } - else - { - std::fill( - std::execution::par_unseq, - buffer.begin(), - buffer.end(), - std::get(this->m_Chunks[index].value) - ); - } - } - - void set_chunk(std::vector compressed, size_t index) override - { - this->validate_chunk_index(index); - this->m_Chunks[index].value = std::move(compressed); - this->validate_chunk_sizes(); - } - - void set_chunk(std::span compressed, size_t index) override - { - this->validate_chunk_index(index); - this->m_Chunks[index].value = std::vector(compressed.begin(), compressed.end()); - this->validate_chunk_sizes(); - } - - void set_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, size_t index) override - { - this->validate_chunk_index(index); - - util::default_init_vector compression_buffer(blosc2::min_compressed_size(this->m_ChunkSize)); - std::span compression_span(compression_buffer); - - auto csize = blosc2::compress(compression_ctx, uncompressed, compression_span); - - // copy over a new vector containing all the elements from the compression span. 
- this->m_Chunks[index].value = std::vector(compression_span.begin(), compression_span.begin() + csize); - this->m_Chunks[index].num_elements = uncompressed.size(); - this->validate_chunk_sizes(); - } - - void append_chunk(std::vector compressed) override - { - auto num_elements = blosc2::chunk_num_elements(compressed); - auto chunk = detail::lazy_chunk{ .value = std::move(compressed), .num_elements = num_elements }; - this->m_Chunks.push_back(std::move(chunk)); - this->validate_chunk_sizes(); - } - - void append_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed) override - { - util::default_init_vector compression_buffer(blosc2::min_compressed_size(this->chunk_bytes())); - std::span compression_span(compression_buffer); - this->append_chunk(compression_ctx, uncompressed, compression_span); - this->validate_chunk_sizes(); - }; - - void append_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, std::span compression_buff) override - { - auto csize = blosc2::compress(compression_ctx, uncompressed, compression_buff); - auto chunk = detail::lazy_chunk{ - .value = std::vector(compression_buff.begin(), compression_buff.begin() + csize), - .num_elements = uncompressed.size() - }; - this->m_Chunks.push_back(std::move(chunk)); - this->validate_chunk_sizes(); - } - - /// Retrieve the total compressed size of the lazy-schunk. - /// Lazy chunks will count as the size of T. - size_t csize() const noexcept override - { - size_t _csize = 0; - for (const auto& chunk : this->m_Chunks) - { - if (std::holds_alternative(chunk.value)) - { - _csize += sizeof(T); - } - else - { - _csize += std::get>(chunk.value).size(); - } - } - return _csize; - } - - // The total uncompressed size of the lazy-schunk in elements. 
- size_t size() const noexcept override - { - size_t _size = 0; - for (const auto& chunk : this->m_Chunks) - { - _size += chunk.num_elements; - } - return _size; - } - - private: - - /// Check whether this->m_Chunks contain any still-lazy chunks. - bool has_lazy_chunk() const noexcept - { - for (const auto& chunk : this->m_Chunks) - { - if (std::holds_alternative(chunk.value)) - { - return true; - } - } - return false; - } - - /// Get the value of the first encountered lazy chunk, since we only create lazy chunks with a single value - /// this is a valid way of accessing this value. if no lazy chunk exists we simply return T{} - T lazy_chunk_value() const noexcept - { - for (const auto& chunk : this->m_Chunks) - { - if (std::holds_alternative(chunk.value)) - { - return std::get(chunk.value); - } - } - return T{}; - - } - - }; - - } // blosc2 - -} // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file + namespace detail + { + /// Wrapper representing a lazy chunk holding either an initialized (and compressed) chunk + /// in the form of a byte array or just a single T representing a lazy state + template + struct lazy_chunk + { + std::variant<_storage_type, T> value; + size_t num_elements = 0; + + lazy_chunk(std::variant<_storage_type, T> v, size_t n) noexcept + : value(std::move(v)), num_elements(n) + { + } + + size_t byte_size() const noexcept + { + return num_elements * sizeof(T); + } + + bool is_lazy() const noexcept + { + return std::holds_alternative(this->value); + } + }; + + + template + struct lazy_schunk final : + public detail::schunk_mixin< + T, /* element type */ + detail::lazy_chunk>, /* gpu storage type */ + detail::lazy_chunk /* cpu storage type */ + > + { + /// Bring the gpu_container and cpu_container using declarations into this struct + using detail::schunk_mixin< + T, + detail::lazy_chunk>, + detail::lazy_chunk + >::gpu_container; + using detail::schunk_mixin< + T, + detail::lazy_chunk>, + detail::lazy_chunk + >::cpu_container; + + using 
detail::schunk_mixin< + T, + detail::lazy_chunk>, + detail::lazy_chunk + >::chunk; + using detail::schunk_mixin< + T, + detail::lazy_chunk>, + detail::lazy_chunk + >::chunk_bytes; + using detail::schunk_mixin< + T, + detail::lazy_chunk>, + detail::lazy_chunk + >::is_gpu_chunk; + using detail::schunk_mixin< + T, + detail::lazy_chunk>, + detail::lazy_chunk + >::to_uncompressed; + + lazy_schunk() = default; + + lazy_schunk(lazy_schunk&& other) noexcept + { + this->m_chunks = std::move(other.m_chunks); + this->m_chunk_size = other.m_chunk_size; + this->m_block_size = other.m_block_size; + } + + lazy_schunk& operator=(lazy_schunk&& other) noexcept + { + if (this != &other) + { + this->m_chunks = std::move(other.m_chunks); + this->m_chunk_size = other.m_chunk_size; + this->m_block_size = other.m_block_size; + } + return *this; + } + + lazy_schunk(const lazy_schunk& other) = default; + lazy_schunk& operator=(const lazy_schunk& other) = default; + + + /// Initialize a lazy super-chunk from the given value, has a near-zero + /// cost with the chunks only being initialized on read/modify. + /// + /// \param value The initial value to fill. + /// \param num_elements The size to initialize the data with. + /// \param block_size The requested chunk size. It is up to the caller to ensure + /// this is appropriately sized + /// \param chunk_size The requested chunk size. It is up to the caller to ensure + /// this is appropriately sized (i.e. by using util::align_chunk_to_scanlines) + lazy_schunk(const T value, const size_t num_elements, const size_t block_size, const size_t chunk_size) + { + util::validate_chunk_size(chunk_size, "lazy_schunk"); + this->m_block_size = block_size; + this->m_chunk_size = chunk_size; + + size_t num_bytes = num_elements * sizeof(T); + + // Calculate all 'full' chunks and the final remainder (if any). 
+ size_t num_full_chunks = num_bytes / this->m_chunk_size; + size_t remainder_bytes = num_bytes - (this->m_chunk_size * num_full_chunks); + + // Initialize lazy chunks with the provided value of T + for ([[maybe_unused]] auto idx : std::views::iota(size_t{0}, num_full_chunks)) + { + detail::lazy_chunk chunk = {value, this->m_chunk_size / sizeof(T)}; + this->m_chunks.push_back(std::move(chunk)); + } + if (remainder_bytes > 0) + { + detail::lazy_chunk> chunk = {value, remainder_bytes / sizeof(T)}; + this->m_chunks.push_back(std::move(chunk)); + } + } + + size_t chunk_bytes(size_t index) const override + { + if (index >= this->m_chunks.size()) + { + throw std::out_of_range( + std::format( + "Cannot access index {} in lazy-schunk. Total amount of chunks is {}", + index, + this->m_chunks.size() + ) + ); + } + + return std::visit( + [&](const auto& chunk) + { + return chunk.num_elements * sizeof(T); + }, + this->m_chunks[index] + ); + } + + /// Generate an uncompressed vector from the chunks, using the decompression context + /// to perform the decompression. + std::vector to_uncompressed( + cpu_compression_context& cpu_ctx, + gpu_compression_context gpu_ctx + ) const override + { + std::vector uncompressed(this->size(), this->lazy_chunk_value()); + + size_t offset = 0; // element offset + for (const auto& chunk : this->m_chunks) + { + if (std::holds_alternative(chunk)) + { + const auto& _chunk_val = std::get(chunk); + + // Since we already initialized the uncompressed data to the lazy chunks' value we don't need + // to do any filling here.
+ if (_chunk_val.is_lazy()) + { + offset += _chunk_val.num_elements; + continue; + } + + auto subspan = std::span(uncompressed.data() + offset, _chunk_val.num_elements); + + auto compressor = cuda::make_compressor(gpu_ctx.ctx.codec); + std::visit( + [&](auto& _compressor) + { + _compressor.decompress(std::get>(_chunk_val.value), subspan); + }, + compressor + ); + + offset += _chunk_val.num_elements; + } + else + { + const auto& _chunk_val = std::get(chunk); + + // Since we already initialized the uncompressed data to the lazy chunks' value we don't need + // to do any filling here. + if (_chunk_val.is_lazy()) + { + offset += _chunk_val.num_elements; + continue; + } + + auto subspan = std::span(uncompressed.data() + offset, _chunk_val.num_elements); + blosc2::decompress( + cpu_ctx.decompression_ctx, + subspan, + std::get(_chunk_val.value) + ); + offset += _chunk_val.num_elements; + } + } + + return uncompressed; + } + + std::vector chunk(blosc2::context_raw_ptr decompression_ctx, size_t index) const override + { + if (index >= this->m_chunks.size()) + { + throw std::out_of_range( + std::format( + "Cannot access index {} in lazy-schunk. Total amount of chunks is {}", + index, + this->m_chunks.size() + ) + ); + } + + const auto& chunk_val = std::get(this->m_chunks.at(index)); + + if (std::holds_alternative(chunk_val.value)) + { + std::vector uncompressed(this->chunk_elements(index), 0); + this->chunk(decompression_ctx, std::span(uncompressed), index); + return uncompressed; + } + return std::vector(this->chunk_elements(index), std::get(chunk_val.value)); + } + + void chunk(blosc2::context_raw_ptr decompression_ctx, std::span buffer, size_t index) const override + { + this->validate_chunk_index(index); + if (this->is_gpu_chunk(index)) + { + throw std::runtime_error( + "Invalid function overload called for lazy_schunk::chunk. The given chunk is not a cpu" + " chunk but a gpu chunk."
+ ); + } + + // Either decompress from the compressed data or fill with the lazy chunks value + if (const auto& chunk_val = std::get(this->m_chunks.at(index)); std::holds_alternative< + cpu_chunk>(chunk_val.value)) + { + const auto& compressed = std::get(chunk_val.value); + blosc2::decompress( + decompression_ctx, + buffer, + std::span(compressed) + ); + } + else + { + std::fill( + std::execution::par_unseq, + buffer.begin(), + buffer.end(), + std::get(chunk_val.value) + ); + } + } + + void chunk(std::span buffer, size_t index) const override + { + this->validate_chunk_index(index); + if (!this->is_gpu_chunk(index)) + { + throw std::runtime_error( + "Invalid function overload called for lazy_schunk::chunk. The given chunk is not a gpu" + " chunk but a cpu chunk." + ); + } + + // Either decompress from the compressed data or fill with the lazy chunks value + if (const auto& chunk_val = std::get(this->m_chunks.at(index)); std::holds_alternative< + gpu_chunk>(chunk_val.value)) + { + const auto& chunk_container = std::get>(chunk_val.value); + auto compressor = cuda::make_compressor(chunk_container); + std::visit( + [&](auto& _compressor) + { + _compressor.decompress(chunk_container, std::span(buffer)); + }, + compressor + ); + } + else + { + std::fill( + std::execution::par_unseq, + buffer.begin(), + buffer.end(), + std::get(chunk_val.value) + ); + } + } + + void set_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, size_t index) override + { + this->validate_chunk_index(index); + + auto compressed = blosc2::compress_to_chunk(compression_ctx, uncompressed); + + auto chunk = detail::lazy_chunk{ + std::move(compressed), + uncompressed.size() + }; + this->m_chunks[index] = std::move(chunk); + this->validate_chunk_sizes(); + } + + void set_chunk(cuda::nvcomp_context compression_ctx, std::span uncompressed, size_t index) override + { + this->validate_chunk_index(index); + + auto compressor = cuda::make_compressor(compression_ctx.codec); + 
cuda::compressed_chunk _chunk{}; + std::visit( + [&](auto& _compressor) + { + _chunk = _compressor.compress(uncompressed, compression_ctx); + }, + compressor + ); + auto chunk = detail::lazy_chunk>{ + std::move(_chunk), + uncompressed.size() + }; + this->m_chunks[index] = std::move(chunk); + this->validate_chunk_sizes(); + } + + void append_chunk(cuda::nvcomp_context compression_ctx, std::span uncompressed) override + { + auto compressor = cuda::make_compressor(compression_ctx.codec); + cuda::compressed_chunk _chunk{}; + std::visit( + [&](auto& _compressor) -> void + { + _chunk = _compressor.compress(uncompressed, compression_ctx); + }, + compressor + ); + + auto chunk = detail::lazy_chunk>{ + std::move(_chunk), + uncompressed.size() + }; + this->m_chunks.push_back(std::move(chunk)); + this->validate_chunk_sizes(); + } + + void append_chunk(blosc2::context_ptr& compression_ctx, + std::span uncompressed, + std::span compression_buff) override + { + auto csize = blosc2::compress(compression_ctx, uncompressed, compression_buff); + auto chunk = detail::lazy_chunk{ + cpu_chunk(compression_buff.begin(), compression_buff.begin() + csize), + uncompressed.size() + }; + this->m_chunks.push_back(std::move(chunk)); + this->validate_chunk_sizes(); + } + + void append_chunk(compression_context_var compression_ctx, std::span uncompressed) override + { + if (std::holds_alternative(compression_ctx)) + { + auto compressed = blosc2::compress_to_chunk( + std::get(compression_ctx).compression_ctx, + uncompressed + ); + auto chunk = detail::lazy_chunk{ + cpu_chunk(compressed.begin(), compressed.end()), + uncompressed.size() + }; + this->m_chunks.push_back(std::move(chunk)); + } + else + { + auto compressor = cuda::make_compressor( + std::get(compression_ctx).ctx.codec + ); + cuda::compressed_chunk gpu_chunk{}; + std::visit( + [&](auto& _compressor) + { + gpu_chunk = _compressor.compress( + uncompressed, + std::get(compression_ctx).ctx + ); + }, + compressor + ); + + auto chunk = 
detail::lazy_chunk>{ + std::move(gpu_chunk), + uncompressed.size() + }; + this->m_chunks.push_back(std::move(chunk)); + } + this->validate_chunk_sizes(); + }; + + /// Retrieve the total compressed size of the lazy-schunk. + /// Lazy chunks will count as the size of T. + size_t csize() const noexcept override + { + size_t _csize = 0; + size_t idx = 0; + for (const auto& chunk : this->m_chunks) + { + if (this->is_gpu_chunk(idx)) + { + const auto& _chunk = std::get(chunk); + if (std::holds_alternative(_chunk.value)) + { + _csize += sizeof(T); + } + else + { + _csize += std::get>(_chunk.value).size(); + } + } + else + { + const auto& _chunk = std::get(chunk); + if (std::holds_alternative(_chunk.value)) + { + _csize += sizeof(T); + } + else + { + _csize += std::get(_chunk.value).size(); + } + } + ++idx; + } + return _csize; + } + + // The total uncompressed size of the lazy-schunk in elements. + size_t size() const noexcept override + { + size_t _size = 0; + for (const auto& chunk : this->m_chunks) + { + std::visit( + [&](const auto& _chunk) -> void + { + _size += _chunk.num_elements; + }, + chunk + ); + } + return _size; + } + + private: + /// Check whether this->m_chunks contain any still-lazy chunks. + bool has_lazy_chunk() const noexcept + { + for (const auto& chunk : this->m_chunks) + { + if (std::holds_alternative(chunk.value)) + { + return true; + } + } + return false; + } + + /// Get the value of the first encountered lazy chunk, since we only create lazy chunks with a single value + /// this is a valid way of accessing this value. 
if no lazy chunk exists we simply return T{} + T lazy_chunk_value() const noexcept + { + for (const auto& chunk : this->m_chunks) + { + T value = {}; + + std::visit( + [&](const auto& _chunk) + { + if (_chunk.is_lazy()) + { + value = std::get(_chunk.value); + } + }, + chunk + ); + + if (value != T{}) + { + return value; + } + } + + return {}; + } + }; + } // detail +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/blosc2/schunk.h b/compressed_image/include/compressed/blosc2/schunk.h index c090527..fcdc730 100644 --- a/compressed_image/include/compressed/blosc2/schunk.h +++ b/compressed_image/include/compressed/blosc2/schunk.h @@ -12,269 +12,358 @@ #include "wrapper.h" #include "schunk_mixin.h" -namespace NAMESPACE_COMPRESSED_IMAGE -{ +#include "compressed/cuda/compression.h" - namespace blosc2 - { - - template - struct schunk final: public detail::schunk_mixin> - { - using detail::schunk_mixin>::chunk_bytes; - - schunk() = default; - - schunk(schunk&& other) noexcept - { - this->m_Chunks = std::move(other.m_Chunks); - this->m_ChunkSize = other.m_ChunkSize; - this->m_BlockSize = other.m_BlockSize; - } - schunk& operator=(schunk&& other) noexcept - { - if (this != &other) - { - this->m_Chunks = std::move(other.m_Chunks); - this->m_ChunkSize = other.m_ChunkSize; - this->m_BlockSize = other.m_BlockSize; - } - return *this; - } - schunk(const schunk& other) = default; - schunk& operator=(const schunk& other) = default; - - - /// Initialize an empty schunk with just a schunk size. The data can then later - /// be filled with append_chunk for example. - schunk(size_t block_size, size_t chunk_size) - { - util::validate_chunk_size(chunk_size, "schunk"); - this->m_ChunkSize = chunk_size; - this->m_BlockSize = block_size; - } - - /// Initialize a super-chunk from the given vector, compressing it - /// - /// \param data The data to store - /// \param block_size The requested block size. 
It is up to the caller to ensure - /// this is appropriately sized - /// \param chunk_size The requested chunk size. It is up to the caller to ensure - /// this is appropriately sized (i.e. by using util::align_chunk_to_scanlines) - /// \param compression_ctx The compression context to be used for compressing the data. - schunk(std::span data, size_t block_size, size_t chunk_size, blosc2::context_ptr& compression_ctx) - { - util::validate_chunk_size(chunk_size, "schunk"); - this->m_BlockSize = block_size; - this->m_ChunkSize = chunk_size; - - // Compression buffer we will continuously overwrite in our compression, the chunk data is then copied out - // of this on initialization. - util::default_init_vector compression_buffer(blosc2::min_compressed_size(chunk_size)); - auto compression_span = std::span(compression_buffer); - - size_t num_elements = data.size(); - size_t num_bytes = num_elements * sizeof(T); - - // Calculate all 'full' chunks and the final remainder (if any). - size_t num_full_chunks = num_bytes / this->chunk_bytes(); - size_t remainder_bytes = num_bytes - (this->chunk_bytes() * num_full_chunks); - - size_t data_offset = 0; - // Initialize the chunks by compressing them. - for ([[maybe_unused]] auto idx : std::views::iota(size_t{ 0 }, num_full_chunks)) - { - auto subspan = std::span(data.data() + data_offset, this->chunk_elements()); - auto csize = blosc2::compress(compression_ctx, subspan, compression_span); - - // copy over a new vector containing all the elements from the compression span. - this->m_Chunks.push_back(std::vector(compression_span.begin(), compression_span.begin() + csize)); - - data_offset += this->chunk_elements(); - } - if (remainder_bytes > 0) - { - auto subspan = std::span(data.data() + data_offset, data.size() - data_offset); - auto csize = blosc2::compress(compression_ctx, subspan, compression_span); - - // copy over a new vector containing all the elements from the compression span. 
- this->m_Chunks.push_back(std::vector(compression_span.begin(), compression_span.begin() + csize)); - - // no need to move over the data_offset. - } - } - - schunk_ptr to_schunk() override - { - _COMPRESSED_PROFILE_FUNCTION(); - blosc2::schunk_ptr schunk = create_default_schunk(); - for (auto& chunk : this->m_Chunks) - { - blosc2_schunk_append_chunk( - schunk.get(), - reinterpret_cast(chunk.data()), - true // copy, blosc2 will internally at some point do this anyways. - ); - } - - return schunk; - } - - std::vector to_uncompressed(blosc2::context_ptr& decompression_ctx) const override - { - _COMPRESSED_PROFILE_FUNCTION(); - auto num_elems = this->size(); - std::vector data(num_elems); - - size_t data_offset = 0; - for (auto idx : std::views::iota(size_t{ 0 }, this->m_Chunks.size())) - { - size_t chunk_elems = this->chunk_elements(idx); - - auto subspan = std::span(data.data() + data_offset, chunk_elems); - this->chunk(decompression_ctx, subspan, idx); - - data_offset += chunk_elems; - } - - return data; - } - - std::vector chunk(blosc2::context_ptr& decompression_ctx, size_t index) const override - { - return this->chunk(decompression_ctx.get(), index); - } - - std::vector chunk(blosc2::context_raw_ptr decompression_ctx, size_t index) const override - { - this->validate_chunk_index(index); - - std::vector decompressed(this->chunk_elements(index)); - auto chunk_span = std::span(this->m_Chunks[index].begin(), this->m_Chunks[index].end()); - blosc2::decompress(decompression_ctx, std::span(decompressed), chunk_span); - - return std::move(decompressed); - } - - void chunk(blosc2::context_ptr& decompression_ctx, std::span buffer, size_t index) const override - { - this->chunk(decompression_ctx.get(), buffer, index); - } - - void chunk(blosc2::context_raw_ptr decompression_ctx, std::span buffer, size_t index) const override - { - this->validate_chunk_index(index); - - if (buffer.size() < this->chunk_elements(index)) - { - throw std::invalid_argument( - std::format( - 
"Unable to decompress chunk at idx {} into buffer as the buffer needs to at least have the size {:L}." - " Instead got {:L}", index, this->chunk_elements(index), buffer.size() - ) - ); - } - - auto chunk_span = std::span(this->m_Chunks[index].begin(), this->m_Chunks[index].end()); - blosc2::decompress(decompression_ctx, std::span(buffer), chunk_span); - } - - void set_chunk(std::vector compressed, size_t index) override - { - this->validate_chunk_index(index); - this->m_Chunks[index] = std::move(compressed); - this->validate_chunk_sizes(); - } - - void set_chunk(std::span compressed, size_t index) override - { - this->validate_chunk_index(index); - this->m_Chunks[index] = std::vector(compressed.begin(), compressed.end()); - this->validate_chunk_sizes(); - } - - void set_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, size_t index) override - { - this->validate_chunk_index(index); - - util::default_init_vector compression_buffer(blosc2::min_compressed_size(this->chunk_bytes())); - std::span compression_span(compression_buffer); - - auto csize = blosc2::compress(compression_ctx, uncompressed, compression_span); - - // copy over a new vector containing all the elements from the compression span. - this->m_Chunks[index] = std::vector(compression_span.begin(), compression_span.begin() + csize); - this->validate_chunk_sizes(); - } - - /// Append to the schunk with the uncompressed data (compressing it). - /// - /// \param compressed the compressed chunk - void append_chunk(std::vector compressed) override - { - this->m_Chunks.push_back(std::move(compressed)); - this->validate_chunk_sizes(); - }; - - /// Append to the schunk with the uncompressed data (compressing it). - /// - /// \param compression_ctx the compression context to use for compression. 
- /// \param uncompressed the uncompressed chunk - void append_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed) override - { - util::default_init_vector compression_buffer(blosc2::min_compressed_size(this->chunk_bytes())); - std::span compression_span(compression_buffer); - this->append_chunk(compression_ctx, uncompressed, compression_span); - this->validate_chunk_sizes(); - }; - - void append_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, std::span compression_buff) override - { - if (compression_buff.size() < blosc2::min_compressed_size(this->chunk_bytes())) - { - throw std::runtime_error( - std::format( - "Error while appending chunk to super-chunk. Expected compression buffer to be at least" - " {:L} bytes but instead we got {:L} bytes", blosc2::min_compressed_size(this->chunk_bytes()), - compression_buff.size() - ) - ); - } - auto csize = blosc2::compress(compression_ctx, uncompressed, compression_buff); - assert(csize <= compression_buff.size()); - // copy over a new vector containing all the elements from the compression span. 
- this->m_Chunks.push_back(std::vector(compression_buff.begin(), compression_buff.begin() + csize)); - this->validate_chunk_sizes(); - } - - size_t chunk_bytes(size_t index) const override - { - return blosc2::chunk_num_elements(this->m_Chunks[index]) * sizeof(T); - } - - /// The total compressed size of the schunk - virtual size_t csize() const noexcept override - { - size_t _size = 0; - for (const auto& chunk : this->m_Chunks) - { - _size += chunk.size(); - } - return _size; - }; - - size_t size() const noexcept override - { - size_t _size = 0; - for (const auto& chunk : this->m_Chunks) - { - _size += blosc2::chunk_num_elements(chunk); - } - return _size; - }; - - }; - - } // blosc2 - -} // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace detail + { + template + struct schunk final : + public detail::schunk_mixin + { + using detail::schunk_mixin::gpu_container; + using detail::schunk_mixin::cpu_container; + + using detail::schunk_mixin::chunk_bytes; + using detail::schunk_mixin::chunk; + using detail::schunk_mixin::is_gpu_chunk; + using detail::schunk_mixin::to_uncompressed; + + schunk() = default; + + schunk(schunk&& other) noexcept + { + this->m_chunks = std::move(other.m_chunks); + this->m_chunk_size = other.m_chunk_size; + this->m_block_size = other.m_block_size; + } + + schunk& operator=(schunk&& other) noexcept + { + if (this != &other) + { + this->m_chunks = std::move(other.m_chunks); + this->m_chunk_size = other.m_chunk_size; + this->m_block_size = other.m_block_size; + } + return *this; + } + + schunk(const schunk& other) = default; + schunk& operator=(const schunk& other) = default; + + + /// Initialize an empty schunk with just a schunk size. The data can then later + /// be filled with append_chunk for example. 
+ schunk(size_t block_size, size_t chunk_size) + { + util::validate_chunk_size(chunk_size, "schunk"); + this->m_chunk_size = chunk_size; + this->m_block_size = block_size; + } + + /// Initialize a super-chunk from the given vector, compressing it + /// + /// \param data The data to store + /// \param block_size The requested block size. It is up to the caller to ensure + /// this is appropriately sized + /// \param chunk_size The requested chunk size. It is up to the caller to ensure + /// this is appropriately sized (i.e., by using util::align_chunk_to_scanlines) + /// \param compression_ctx The compression context to be used for compressing the data. Depending on which + /// type this is, this will initialize the data using gpu/cpu compression internally. + schunk(std::span data, + size_t block_size, + size_t chunk_size, + compression_context_var compression_ctx) + { + util::validate_chunk_size(chunk_size, "schunk"); + this->m_block_size = block_size; + this->m_chunk_size = chunk_size; + + const size_t num_elements = data.size(); + const size_t num_bytes = num_elements * sizeof(T); + + // Calculate all 'full' chunks and the final remainder (if any). + const size_t num_full_chunks = num_bytes / this->chunk_bytes(); + const size_t remainder_bytes = num_bytes - (this->chunk_bytes() * num_full_chunks); + + // When compressing using gpu compression, we don't allocate a scratch buffer on the cpu as we internally + // use a memory-pool on the gpu that we reuse between compressions, making allocations quite cheap. 
+ if (std::holds_alternative(compression_ctx)) + { + size_t data_offset = 0; + + for ([[maybe_unused]] auto idx : std::views::iota(size_t{0}, num_full_chunks)) + { + auto subspan = std::span(data.data() + data_offset, this->chunk_elements()); + this->append_chunk(std::get(compression_ctx).ctx, subspan); + + data_offset += this->chunk_elements(); + } + if (remainder_bytes > 0) + { + auto subspan = std::span(data.data() + data_offset, data.size() - data_offset); + + this->append_chunk(std::get(compression_ctx).ctx, subspan); + // no need to move over the data_offset. + } + } + else + { + // Compression buffer we will continuously overwrite in our compression, the chunk data is then copied out + // of this on initialization. + util::default_init_vector compression_buffer(blosc2::min_compressed_size(chunk_size)); + auto compression_span = std::span(compression_buffer); + + size_t data_offset = 0; + // Initialize the chunks by compressing them. + for ([[maybe_unused]] auto idx : std::views::iota(size_t{0}, num_full_chunks)) + { + auto subspan = std::span(data.data() + data_offset, this->chunk_elements()); + auto csize = blosc2::compress( + std::get(compression_ctx).compression_ctx.get(), + subspan, + compression_span + ); + + // copy over a new vector containing all the elements from the compression span. + this->m_chunks.push_back( + util::default_init_vector( + compression_span.begin(), + compression_span.begin() + csize + ) + ); + + data_offset += this->chunk_elements(); + } + if (remainder_bytes > 0) + { + auto subspan = std::span(data.data() + data_offset, data.size() - data_offset); + auto csize = blosc2::compress( + std::get(compression_ctx).compression_ctx.get(), + subspan, + compression_span + ); + + // copy over a new vector containing all the elements from the compression span. + this->m_chunks.push_back( + util::default_init_vector( + compression_span.begin(), + compression_span.begin() + csize + ) + ); + + // no need to move over the data_offset. 
+ } + } + } + + + void chunk(std::span buffer, size_t index) const override + { + this->validate_chunk_index(index); + if (!this->is_gpu_chunk(index)) + { + throw std::runtime_error( + "Invalid function overload called for schunk::chunk. The given chunk is not a gpu" + " chunk but a cpu chunk." + ); + } + const auto& chunk_data = std::get(this->m_chunks.at(index)); + auto compressor = cuda::make_compressor(chunk_data); + std::visit( + [&](auto& _compressor) + { + _compressor.decompress(chunk_data, std::span(buffer)); + }, + compressor + ); + } + + void chunk(blosc2::context_raw_ptr decompression_ctx, std::span buffer, size_t index) const override + { + this->validate_chunk_index(index); + if (this->is_gpu_chunk(index)) + { + throw std::runtime_error( + "Invalid function overload called for schunk::chunk. The given chunk is not a cpu" + " chunk but a gpu chunk." + ); + } + + if (buffer.size() < this->chunk_elements(index)) + { + throw std::invalid_argument( + std::format( + "Unable to decompress chunk at idx {} into buffer as the buffer needs to at least have the size {:L}." + " Instead got {:L}", + index, + this->chunk_elements(index), + buffer.size() + ) + ); + } + + const auto& chunk_data = std::get(this->m_chunks.at(index)); + auto chunk_span = std::span(chunk_data.begin(), chunk_data.end()); + blosc2::decompress(decompression_ctx, std::span(buffer), chunk_span); + } + + void set_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, size_t index) override + { + this->validate_chunk_index(index); + + auto compressed = blosc2::compress_to_chunk(compression_ctx, uncompressed); + + // copy over a new vector containing all the elements from the compression span. 
+ this->m_chunks[index] = std::move(compressed); + this->validate_chunk_sizes(); + } + + void set_chunk(cuda::nvcomp_context compression_ctx, std::span uncompressed, size_t index) override + { + this->validate_chunk_index(index); + + auto compressor = cuda::make_compressor(compression_ctx.codec); + cuda::compressed_chunk _chunk{}; + std::visit( + [&](auto& _compressor) + { + _chunk = _compressor.compress(uncompressed, compression_ctx); + }, + compressor + ); + + this->m_chunks[index] = std::move(_chunk); + this->validate_chunk_sizes(); + } + + void append_chunk(blosc2::context_ptr& compression_ctx, + std::span uncompressed, + std::span compression_buff) override + { + if (compression_buff.size() < blosc2::min_compressed_size(this->chunk_bytes())) + { + throw std::runtime_error( + std::format( + "Error while appending chunk to super-chunk. Expected compression buffer to be at least" + " {:L} bytes but instead we got {:L} bytes", + blosc2::min_compressed_size(this->chunk_bytes()), + compression_buff.size() + ) + ); + } + auto csize = blosc2::compress(compression_ctx, uncompressed, compression_buff); + assert(csize <= compression_buff.size()); + // copy over a new vector containing all the elements from the compression span. 
+ this->m_chunks.push_back(cpu_chunk(compression_buff.begin(), compression_buff.begin() + csize)); + this->validate_chunk_sizes(); + } + + void append_chunk(cuda::nvcomp_context compression_ctx, std::span uncompressed) override + { + auto compressor = cuda::make_compressor(compression_ctx.codec); + + cuda::compressed_chunk _chunk{}; + std::visit( + [&](auto& _compressor) + { + _chunk = _compressor.compress(uncompressed, compression_ctx); + }, + compressor + ); + + this->m_chunks.push_back(std::move(_chunk)); + this->validate_chunk_sizes(); + } + + void append_chunk(compression_context_var compression_ctx, std::span uncompressed) override + { + if (std::holds_alternative(compression_ctx)) + { + auto compressed = blosc2::compress_to_chunk( + std::get(compression_ctx).compression_ctx, + uncompressed + ); + this->m_chunks.push_back(std::move(compressed)); + } + else + { + auto compressor = cuda::make_compressor( + std::get(compression_ctx).ctx.codec + ); + cuda::compressed_chunk _chunk{}; + std::visit( + [&](auto& _compressor) + { + _chunk = _compressor.compress( + uncompressed, + std::get(compression_ctx).ctx + ); + }, + compressor + ); + + this->m_chunks.push_back(std::move(_chunk)); + } + this->validate_chunk_sizes(); + }; + + size_t chunk_bytes(size_t index) const override + { + if (is_gpu_chunk(index)) + { + const auto& _chunk = std::get(this->m_chunks.at(index)); + return _chunk.byte_size(); + } + const auto& _chunk = std::get(this->m_chunks.at(index)); + return blosc2::chunk_num_elements(_chunk) * sizeof(T); + } + + /// The total compressed size of the schunk + size_t csize() const noexcept override + { + size_t _size = 0; + size_t index = 0; + for ([[maybe_unused]] const auto& chunk : this->m_chunks) + { + if (is_gpu_chunk(index)) + { + const auto& _chunk = std::get(this->m_chunks.at(index)); + _size += _chunk.csize(); + } + else + { + const auto& _chunk = std::get(this->m_chunks.at(index)); + _size += _chunk.size(); + } + ++index; + } + return _size; + }; + + 
[[nodiscard]] size_t size() const noexcept override + { + size_t _size = 0; + size_t index = 0; + for ([[maybe_unused]] const auto& chunk : this->m_chunks) + { + if (is_gpu_chunk(index)) + { + const auto& _chunk = std::get(this->m_chunks.at(index)); + _size += _chunk.size(); + } + else + { + const auto& _chunk = std::get(this->m_chunks.at(index)); + _size += blosc2::chunk_num_elements(_chunk); + } + ++index; + } + return _size; + }; + }; + } // detail +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/blosc2/schunk_mixin.h b/compressed_image/include/compressed/blosc2/schunk_mixin.h index b72e512..563089c 100644 --- a/compressed_image/include/compressed/blosc2/schunk_mixin.h +++ b/compressed_image/include/compressed/blosc2/schunk_mixin.h @@ -7,264 +7,431 @@ #include "compressed/macros.h" #include "wrapper.h" #include "compressed/constants.h" +#include "compressed/context.h" +#include "compressed/cuda/compressors/base.h" -namespace NAMESPACE_COMPRESSED_IMAGE + +namespace +NAMESPACE_COMPRESSED_IMAGE { - namespace blosc2 - { - - namespace detail - { - - /// Opaque mixin around a blosc2 super-chunk with the intention of not using a `blosc2_schunk` - /// itself but instead of using it directly the chunks should be stored individually. - /// Subclassed by either a `schunk` or a `lazy_schunk` depending on the needs of the - /// consumer. - template - struct schunk_mixin - { - virtual ~schunk_mixin() = default; - - /// convert the struct into a blosc2 schunk. - virtual blosc2::schunk_ptr to_schunk() = 0; - - /// Generate an uncompressed vector from all of the chunks. - /// - /// \param decompression_ctx the decompression context pr. - /// - /// \returns a contiguous vector representing the uncompressed schunk. - virtual std::vector to_uncompressed(blosc2::context_ptr& decompression_ctx) const = 0; - - /// Retrieve the uncompressed chunk at `index`. 
- /// - /// \param decompression_ctx the decompression context ptr - /// \param index the index of the chunk within the schunk. - /// - /// \throws std::out_of_range if the index is not valid - virtual std::vector chunk(blosc2::context_ptr& decompression_ctx, size_t index) const = 0; - - /// Retrieve the uncompressed chunk at `index`. - /// - /// \param decompression_ctx the decompression context ptr - /// \param index the index of the chunk within the schunk. - /// - /// \throws std::out_of_range if the index is not valid - virtual std::vector chunk(blosc2::context_raw_ptr decompression_cx, size_t index) const = 0; - - /// Retrieve the uncompressed chunk at `index`. - /// - /// \param decompression_ctx the decompression context ptr - /// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size. - /// \param index the index of the chunk within the schunk. - /// - /// \throws std::out_of_range if the index is not valid - virtual void chunk(blosc2::context_ptr& decompression_ctx, std::span buffer, size_t index) const = 0; - - /// Retrieve the uncompressed chunk at `index`. - /// - /// \param decompression_ctx the decompression context ptr - /// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size. - /// \param index the index of the chunk within the schunk. - /// - /// \throws std::out_of_range if the index is not valid - virtual void chunk(blosc2::context_raw_ptr decompression_ctx, std::span buffer, size_t index) const = 0; - - /// Set the chunk at `index` to the compressed data. - /// - /// \param compressed the compressed chunk - /// \param index the index of the chunk within the schunk. - /// - /// \throws std::out_of_range if the index is not valid - virtual void set_chunk(std::vector compressed, size_t index) = 0; - - /// Set the chunk at `index` to the compressed data. - /// - /// \param compressed the compressed chunk - /// \param index the index of the chunk within the schunk. 
- /// - /// \throws std::out_of_range if the index is not valid - virtual void set_chunk(std::span compressed, size_t index) = 0; - - /// Set the chunk at `index` to the uncompressed data (compressing it). - /// - /// \param compression_ctx the compression context to use for compression. - /// \param uncompressed the uncompressed chunk - /// \param index the index of the chunk within the schunk. - /// - /// \throws std::out_of_range if the index is not valid - virtual void set_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, size_t index) = 0; - - /// Append to the schunk with the uncompressed data (compressing it). - /// - /// \param compressed the compressed chunk - virtual void append_chunk(std::vector compressed) = 0; - - /// Append to the schunk with the uncompressed data (compressing it). - /// - /// \param compression_ctx the compression context to use for compression. - /// \param uncompressed the uncompressed chunk - virtual void append_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed) = 0; - - /// Append to the schunk with the uncompressed data (compressing it). - /// - /// \param compression_ctx the compression context to use for compression. - /// \param uncompressed the uncompressed chunk - /// \param compression_buff the compression buffer to use for temporary storage. - virtual void append_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, std::span compression_buff) = 0; - - /// Retrieve the number of elements (uncompressed) that the schunk stores. - /// - /// \throws std::runtime_error if the chunk_bytes / sizeof(T) is not cleanly divisble - size_t chunk_elements() const - { - auto _size = this->chunk_bytes(); - if (_size % sizeof(T) != 0) - { - throw std::runtime_error( - std::format( - "Internal Error: The chunk byte size is not cleanly divisible by the sizeof T." 
- " Chunk size is {:L} while sizeof(T) is {}", _size, sizeof(T) - ) - ); - } - return _size / sizeof(T); - }; - - /// Retrieve the number of elements (uncompressed) that the schunk stores at a given chunk. - /// In all cases except for chunk_elements(num_chunks() - 1) this will return chunk_elements. - /// - /// \throws std::out_of_range if the index is not valid in the super-chunk. - /// \throws std::runtime_error if the chunk_bytes / sizeof(T) is not cleanly divisble - size_t chunk_elements(size_t index) const - { - auto _size = this->chunk_bytes(index); - if (_size % sizeof(T) != 0) - { - throw std::runtime_error( - std::format( - "Internal Error: The chunk byte size is not cleanly divisible by the sizeof T." - " Chunk size is {:L} while sizeof(T) is {}", _size, sizeof(T) - ) - ); - } - return _size / sizeof(T); - }; - - /// Retrieve the number of bytes stored by the super-chunk per-chunk. This will be equivalent - /// to the number of uncompressed bytes stored by each chunk up to num_chunks() - 1. - /// The last chunk may be smaller (but not bigger) in size than this value. - size_t chunk_bytes() const - { - return this->m_ChunkSize; - }; - - /// Retrieve the number of bytes stored by the chunk at index `index`. This will be equivalent to - /// chunk_bytes unless it is the last chunk in which case it may be smaller. - /// - /// \throws std::out_of_range if the index is not valid in the super-chunk. - virtual size_t chunk_bytes(size_t index) const = 0; - - /// The number of chunks in the super-chunk - size_t num_chunks() const noexcept - { - return m_Chunks.size(); - } - - /// The total compressed size of the schunk in bytes - virtual size_t csize() const noexcept = 0; - - /// The total uncompressed size of the schunk in elements - virtual size_t size() const noexcept = 0; - - /// The total number of bytes stored in the schunk when uncompressed. 
- /// equivalent to size() * sizeof(T) - size_t byte_size() const noexcept - { - return size() * sizeof(T); - } - - size_t max_chunk_size() - { - return m_ChunkSize; - } - - size_t max_block_size() - { - return m_BlockSize; - } - - protected: - std::vector m_Chunks{}; - /// The maximum size a chunk is constrained to, in bytes. This will dictate the size of all chunks from - /// 0 - (this->m_Chunks.size() - 1). The last chunk may be any other size smaller than or equal to this value. - size_t m_ChunkSize = s_default_chunksize; - size_t m_BlockSize = s_default_blocksize; - - /// Validate the chunk index throwing a std::out_of_range if the index is not valid. - void validate_chunk_index(size_t index) const - { - if (index > m_Chunks.size() - 1) - { - throw std::out_of_range( - std::format("Cannot access index {} in schunk. Total amount of chunks is {}", index, m_Chunks.size()) - ); - } - } - - /// Validate all the chunk sizes currently held by the super-chunk. This function - /// ensures that the chunks - void validate_chunk_sizes() const - { - // Check that all chunks barring the last one are equal to m_ChunkSize - for (auto i : std::views::iota(size_t{ 0 }, this->num_chunks() - 1)) - { - if (this->chunk_bytes(i) != this->chunk_bytes()) - { - throw std::invalid_argument( - std::format( - "Error while validating chunk sizes; Expected all chunks to have a size equivalent to {:L} (m_ChunkSize)." - " However, chunk {} instead has a chunk size of {:L}. Having a size different from the rest of the chunks" - " is only supported for the last chunk (blosc2 limitation). Please ensure that all chunks are equally sized" - " when modifying the super-chunk (excluding the last one).", - this->chunk_bytes(), i, this->chunk_bytes(i) - ) - ); - } - } - - // Check that the last chunk is not larger than the rest. 
- if (this->chunk_bytes(this->num_chunks() - 1) > this->chunk_bytes()) - { - throw std::runtime_error( - std::format( - "Error while validating chunk sizes; Expected the last chunk to be at most {:L} bytes," - " instead got {:L} bytes.", - this->chunk_bytes(), this->chunk_bytes(this->num_chunks() - 1) - ) - ); - } - } - - /// Get the buffer size for T for the given byte size. Checks that the buffer - /// can be divided cleanly by sizeof(T). - size_t get_T_buffer_size(size_t byte_size) const - { - if (byte_size % sizeof(T) != 0) - { - throw std::runtime_error( - std::format( - "Cannot get buffer size for type T of size {} because it is not evenly divisible for buffer size {:L}", - sizeof(T), - byte_size - ) - ); - } - return byte_size / sizeof(T); - } - }; - - } // detail - - } // blosc2 - -} // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file + namespace detail + { + /// \brief The default storage class for a gpu compressed chunk. + /// + /// \note this chunk may not live on the gpu, this just indicates it was generated + /// on the gpu. + template + using gpu_chunk = cuda::compressed_chunk; + + /// \brief The default storage class for a cpu compressed chunk. + using cpu_chunk = util::default_init_vector; + + /// Mixin for representing a blosc2-style super-chunk for both cpu and gpu chunks. 
+ /// + /// \tparam _gpu_container_type The type for a gpu compressed chunk + /// \tparam _cpu_container_type The type for a cpu compressed chunk + template , typename _cpu_container_type = cpu_chunk> + struct schunk_mixin + { + using gpu_container = _gpu_container_type; + using cpu_container = _cpu_container_type; + + virtual ~schunk_mixin() = default; + + /// Checks whether the chunk at `index` is a gpu/cpu chunk + /// + /// \parm index The chunk index + /// + /// \throws std::runtime_error if the chunk index is not valid + bool is_gpu_chunk(size_t index) const + { + if (index > m_chunks.size() - 1) + { + throw std::runtime_error( + std::format( + "Invalid chunk index {}, can at most index up to {}", + index, + m_chunks.size() - 1 + ) + ); + } + + return std::holds_alternative<_gpu_container_type>(m_chunks.at(index)); + }; + + /// Generate an uncompressed vector from all of the chunks. + /// + /// \param cpu_ctx the decompression context for all cpu based chunks. + /// \param gpu_ctx the decompression context for all gpu based chunks. + /// + /// \returns a contiguous vector representing the uncompressed schunk. 
+ virtual std::vector to_uncompressed( + cpu_compression_context& cpu_ctx, + [[maybe_unused]] gpu_compression_context gpu_ctx + ) const + { + _COMPRESSED_PROFILE_FUNCTION(); + auto num_elems = this->size(); + std::vector data(num_elems); + + size_t data_offset = 0; + for (auto idx : std::views::iota(size_t{0}, this->m_chunks.size())) + { + size_t chunk_elems = this->chunk_elements(idx); + + auto subspan = std::span(data.data() + data_offset, chunk_elems); + + if (this->is_gpu_chunk(idx)) + { + this->chunk(subspan, idx); + } + else + { + if (!cpu_ctx.decompression_ctx || !cpu_ctx.compression_ctx) + { + throw std::invalid_argument( + std::format( + "Chunk {}: valid cpu decompression and compression contexts must be provided" + " for cpu chunks", + idx + ) + ); + } + + this->chunk(cpu_ctx.decompression_ctx.get(), subspan, idx); + } + + data_offset += chunk_elems; + } + + return data; + }; + + /// Generate an uncompressed vector from all of the chunks. + /// + /// This overload may only be called if the schunk contains no gpu chunks. + /// + /// \param context the decompression context for the chunks + /// + /// \throws std::runtime_error if the schunk contains one or more gpu chunks. + /// + /// \returns a contiguous vector representing the uncompressed schunk. + std::vector to_uncompressed(cpu_compression_context& context) const + { + for (size_t i = 0; i < this->num_chunks(); ++i) + { + if (is_gpu_chunk(i)) + { + throw std::runtime_error( + std::format( + "Invalid overload of 'to_uncompressed' called. This overload may only be called if" + " there are no GPU chunks. However, at least chunk {} is a gpu chunk. Please pass" + " an explicit GPU decompressor.", + i + ) + ); + } + } + return this->to_uncompressed(context, gpu_compression_context{cuda::nvcomp_context{}}); + } + + /// Generate an uncompressed vector from all of the chunks. + /// + /// This overload may only be called if the schunk contains no cpu chunks. 
+ /// + /// \param context the decompression context for the chunks + /// + /// \throws std::runtime_error if the schunk contains one or more gpu chunks. + /// + /// \returns a contiguous vector representing the uncompressed schunk. + std::vector to_uncompressed(gpu_compression_context context) const + { + for (size_t i = 0; i < this->num_chunks(); ++i) + { + if (!is_gpu_chunk(i)) + { + throw std::runtime_error( + std::format( + "Invalid overload of 'to_uncompressed' called. This overload may only be called if" + " there are no CPU chunks. However, at least chunk {} is a cpu chunk. Please pass" + " an explicit CPU decompressor.", + i + ) + ); + } + } + auto _cpu_context = cpu_compression_context{}; + return this->to_uncompressed(_cpu_context, context); + } + + /// Retrieve the uncompressed chunk at `index`. + /// + /// \param context the decompression context + /// \param index the index of the chunk within the schunk. + /// + /// \throws std::out_of_range if the index is not valid + virtual std::vector chunk(cpu_compression_context& context, const size_t index) const + { + return this->chunk(context.decompression_ctx.get(), index); + }; + + /// Retrieve the uncompressed gpu chunk at `index`. + /// + /// \param context the decompression context + /// \param index the index of the chunk within the schunk. + /// + /// \throws std::out_of_range if the index is not valid + virtual std::vector chunk(const cuda::nvcomp_context context, const size_t index) const + { + std::vector buffer(this->size()); + this->chunk(context, index); + return buffer; + }; + + /// Retrieve the uncompressed chunk at `index`. + /// + /// \param decompression_ctx the decompression context ptr + /// \param index the index of the chunk within the schunk. 
+ /// + /// \throws std::out_of_range if the index is not valid + virtual std::vector chunk(blosc2::context_raw_ptr decompression_ctx, size_t index) const + { + std::vector buffer(this->chunk_elements(index)); + this->chunk(decompression_ctx, std::span(buffer), index); + return buffer; + }; + + /// Retrieve the uncompressed chunk at `index`. + /// + /// \param decompression_ctx the decompression context ptr + /// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size. + /// \param index the index of the chunk within the schunk. + /// + /// \throws std::out_of_range if the index is not valid + virtual void chunk(blosc2::context_ptr& decompression_ctx, std::span buffer, size_t index) const + { + this->chunk(decompression_ctx.get(), buffer, index); + }; + + /// Retrieve the uncompressed gpu chunk at `index`. + /// + /// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size. + /// \param index the index of the chunk within the schunk. + /// + /// \throws std::out_of_range if the index is not valid + virtual void chunk(std::span buffer, size_t index) const = 0; + + /// Retrieve the uncompressed chunk at `index`. + /// + /// \param decompression_ctx the decompression context ptr + /// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size. + /// \param index the index of the chunk within the schunk. + /// + /// \throws std::out_of_range if the index is not valid + virtual void chunk(blosc2::context_raw_ptr decompression_ctx, std::span buffer, size_t index) const = 0; + + /// Set the chunk at `index` to the uncompressed data (compressing it). + /// + /// \param compression_ctx the compression context to use for compression. + /// \param uncompressed the uncompressed chunk + /// \param index the index of the chunk within the schunk. 
+ /// + /// \throws std::out_of_range if the index is not valid + virtual void set_chunk(blosc2::context_ptr& compression_ctx, std::span uncompressed, size_t index) = 0; + + /// Set the chunk at `index` to the uncompressed data (compressing it). + /// + /// \param compression_ctx the compression context to use for compression. + /// \param uncompressed the uncompressed chunk + /// \param index the index of the chunk within the schunk. + /// + /// \throws std::out_of_range if the index is not valid + virtual void set_chunk(cuda::nvcomp_context compression_ctx, std::span uncompressed, size_t index) = 0; + + + /// Append to the schunk with the uncompressed data (compressing it). + /// + /// \param compression_ctx the compression context to use for compression. + /// \param uncompressed the uncompressed chunk + virtual void append_chunk(cuda::nvcomp_context compression_ctx, std::span uncompressed) = 0; + + /// Append to the schunk with the uncompressed data (compressing it). + /// + /// \param compression_ctx the compression context to use for compression. + /// \param uncompressed the uncompressed chunk + /// \param compression_buff the compression buffer to use for temporary storage. + virtual void append_chunk(blosc2::context_ptr& compression_ctx, + std::span uncompressed, + std::span compression_buff) = 0; + + /// Append to the schunk with the uncompressed data (compressing it). + /// + /// \param compression_ctx the compression context to use for compression. + /// \param uncompressed the uncompressed chunk + virtual void append_chunk(compression_context_var compression_ctx, + std::span uncompressed) = 0; + + + /// Retrieve the number of elements (uncompressed) that the schunk stores. 
+ /// + /// \throws std::runtime_error if the chunk_bytes / sizeof(T) is not cleanly divisble + size_t chunk_elements() const + { + auto _size = this->chunk_bytes(); + if (_size % sizeof(T) != 0) + { + throw std::runtime_error( + std::format( + "Internal Error: The chunk byte size is not cleanly divisible by the sizeof T." + " Chunk size is {:L} while sizeof(T) is {}", + _size, + sizeof(T) + ) + ); + } + return _size / sizeof(T); + }; + + /// Retrieve the number of elements (uncompressed) that the schunk stores at a given chunk. + /// In all cases except for chunk_elements(num_chunks() - 1) this will return chunk_elements. + /// + /// \throws std::out_of_range if the index is not valid in the super-chunk. + /// \throws std::runtime_error if the chunk_bytes / sizeof(T) is not cleanly divisble + size_t chunk_elements(size_t index) const + { + auto _size = this->chunk_bytes(index); + if (_size % sizeof(T) != 0) + { + throw std::runtime_error( + std::format( + "Internal Error: The chunk byte size is not cleanly divisible by the sizeof T." + " Chunk size is {:L} while sizeof(T) is {}", + _size, + sizeof(T) + ) + ); + } + return _size / sizeof(T); + }; + + /// Retrieve the number of bytes stored by the super-chunk per-chunk. This will be equivalent + /// to the number of uncompressed bytes stored by each chunk up to num_chunks() - 1. + /// The last chunk may be smaller (but not bigger) in size than this value. + size_t chunk_bytes() const + { + return this->m_chunk_size; + }; + + /// Retrieve the number of bytes stored by the chunk at index `index`. This will be equivalent to + /// chunk_bytes unless it is the last chunk in which case it may be smaller. + /// + /// \throws std::out_of_range if the index is not valid in the super-chunk. 
+ virtual size_t chunk_bytes(size_t index) const = 0; + + /// The number of chunks in the super-chunk + size_t num_chunks() const noexcept + { + return m_chunks.size(); + } + + /// The total compressed size of the schunk in bytes + virtual size_t csize() const noexcept = 0; + + /// The total uncompressed size of the schunk in elements + virtual size_t size() const noexcept = 0; + + /// The total number of bytes stored in the schunk when uncompressed. + /// equivalent to size() * sizeof(T) + size_t byte_size() const noexcept + { + return size() * sizeof(T); + } + + size_t max_chunk_size() const noexcept + { + return m_chunk_size; + } + + size_t max_block_size() const noexcept + { + return m_block_size; + } + + protected: + std::vector> m_chunks{}; + /// The maximum size a chunk is constrained to, in bytes. This will dictate the size of all chunks from + /// 0 - (this->m_chunks.size() - 1). The last chunk may be any other size smaller than or equal to this value. + size_t m_chunk_size = s_default_chunksize; + size_t m_block_size = s_default_blocksize; + + /// Validate the chunk index throwing a std::out_of_range if the index is not valid. + void validate_chunk_index(size_t index) const + { + if (index > m_chunks.size() - 1) + { + throw std::out_of_range( + std::format( + "Cannot access index {} in schunk. Total amount of chunks is {}", + index, + m_chunks.size() + ) + ); + } + } + + /// Validate all the chunk sizes currently held by the super-chunk. This function + /// ensures that the chunks + void validate_chunk_sizes() const + { + // Check that all chunks barring the last one are equal to m_chunk_size + for (auto i : std::views::iota(size_t{0}, this->num_chunks() - 1)) + { + if (this->chunk_bytes(i) != this->chunk_bytes()) + { + throw std::invalid_argument( + std::format( + "Error while validating chunk sizes; Expected all chunks to have a size equivalent to {:L} (m_chunk_size)." + " However, chunk {} instead has a chunk size of {:L}. 
Having a size different from the rest of the chunks" + " is only supported for the last chunk (blosc2 limitation). Please ensure that all chunks are equally sized" + " when modifying the super-chunk (excluding the last one).", + this->chunk_bytes(), + i, + this->chunk_bytes(i) + ) + ); + } + } + + // Check that the last chunk is not larger than the rest. + if (this->chunk_bytes(this->num_chunks() - 1) > this->chunk_bytes()) + { + throw std::runtime_error( + std::format( + "Error while validating chunk sizes; Expected the last chunk to be at most {:L} bytes," + " instead got {:L} bytes.", + this->chunk_bytes(), + this->chunk_bytes(this->num_chunks() - 1) + ) + ); + } + } + + /// Get the buffer size for T for the given byte size. Checks that the buffer + /// can be divided cleanly by sizeof(T). + size_t get_T_buffer_size(size_t byte_size) const + { + if (byte_size % sizeof(T) != 0) + { + throw std::runtime_error( + std::format( + "Cannot get buffer size for type T of size {} because it is not evenly divisible for buffer size {:L}", + sizeof(T), + byte_size + ) + ); + } + return byte_size / sizeof(T); + } + }; + } // detail +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/blosc2/typedefs.h b/compressed_image/include/compressed/blosc2/typedefs.h index 89f5f24..2059d5c 100644 --- a/compressed_image/include/compressed/blosc2/typedefs.h +++ b/compressed_image/include/compressed/blosc2/typedefs.h @@ -8,15 +8,10 @@ namespace NAMESPACE_COMPRESSED_IMAGE { - - namespace blosc2 - { - template - using schunk_var_ptr = std::shared_ptr, blosc2::lazy_schunk>>; - template - using schunk_var = std::variant, blosc2::lazy_schunk>; - - } // blosc2 + template + using schunk_var_ptr = std::shared_ptr, detail::lazy_schunk>>; + template + using schunk_var = std::variant, detail::lazy_schunk>; } // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file diff --git a/compressed_image/include/compressed/blosc2/wrapper.h 
b/compressed_image/include/compressed/blosc2/wrapper.h index d8982ab..3099fe3 100644 --- a/compressed_image/include/compressed/blosc2/wrapper.h +++ b/compressed_image/include/compressed/blosc2/wrapper.h @@ -6,468 +6,536 @@ #include "compressed/enums.h" #include "compressed/blosc2/util.h" #include "compressed/detail/scoped_timer.h" +#include "compressed/detail/scratch_buffer_pool.h" #include "blosc2.h" -#include "blosc2/blosc2-common.h" -#include "blosc2/blosc2-stdio.h" #include "blosc2/filters-registry.h" #include -namespace NAMESPACE_COMPRESSED_IMAGE +namespace +NAMESPACE_COMPRESSED_IMAGE { - - namespace blosc2 - { - - namespace detail - { - static const inline bool g_filters_registered = false; - - /// Initialize filters in c-blosc2. Since we don't have an explicit entry point this needs to be checked on every call to compress and decompress. - /// May be a no-op if detail::g_filters_registered is true. - inline void init_filters() - { - if (!detail::g_filters_registered) - { - register_filters(); - } - } - - } - - // Custom deleter for blosc2 structs for use in a smart pointer - template - struct deleter {}; - - template <> - struct deleter - { - void operator()(blosc2_schunk* schunk) - { - blosc2_schunk_free(schunk); - } - }; - - template <> - struct deleter - { - void operator()(blosc2_context* context) - { - blosc2_free_ctx(context); - } - }; - - /// Typedef the blosc2 primitives into both smart pointers and as raw ptrs - typedef std::unique_ptr> schunk_ptr; - typedef blosc2_schunk* schunk_raw_ptr; - typedef void* chunk_raw_ptr; - typedef std::unique_ptr> context_ptr; - typedef blosc2_context* context_raw_ptr; - - /// Maps a codec enum into its blosc2 representation. 
- /// - /// \param compcode the compression codec to get - /// - /// \returns The mapped enum as uint8_t since blosc expects it that way - inline uint8_t codec_to_blosc2(enums::codec compcode) - { - if (compcode == enums::codec::blosclz) - { - return static_cast(BLOSC_BLOSCLZ); - } - else if (compcode == enums::codec::lz4) - { - return static_cast(BLOSC_LZ4); - } - else if (compcode == enums::codec::lz4hc) - { - return static_cast(BLOSC_LZ4HC); - } - else if (compcode == enums::codec::zstd) - { - return static_cast(BLOSC_ZSTD); - } - return BLOSC_BLOSCLZ; - } - - /// Maps a blosc2 compression codec into an enum representation - /// - /// \param compcode the compression codec to get - /// - /// \returns The mapped enum - inline enums::codec blosc2_to_codec(uint8_t compcode) - { - if (compcode == BLOSC_BLOSCLZ) - { - return enums::codec::blosclz; - } - else if (compcode == BLOSC_LZ4) - { - return enums::codec::lz4; - } - else if (compcode == BLOSC_LZ4HC) - { - return enums::codec::lz4hc; - } - else if (compcode == BLOSC_ZSTD) - { - return enums::codec::zstd; - } - return enums::codec::blosclz; - } - - /// Compress the `data` into `chunk` using the provided `context`. - /// - /// This function applies Blosc2 compression to the input `data` and stores the compressed - /// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the - /// corresponding Blosc2 error code. - /// - /// \tparam T The data type of the input buffer. - /// \param context A raw pointer to the Blosc2 compression context. - /// \param data The input data to be compressed, provided as a `std::span`. - /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span`. - /// \returns The compressed byte size of the chunk. This size includes a header with metadata, - /// which Blosc2 internally uses. - /// \throws std::runtime_error if compression fails, with the Blosc2 error code. 
- template - size_t compress(context_raw_ptr context, std::span data, std::span chunk) - { - _COMPRESSED_PROFILE_FUNCTION(); - detail::init_filters(); - const auto cbytes = blosc2_compress_ctx( - context, - static_cast(data.data()), - static_cast(data.size() * sizeof(T)), - static_cast(chunk.data()), - static_cast(chunk.size()) - ); - if (cbytes < 0) - { - throw std::runtime_error(std::format("Unable to compress context using Blosc2 with error code {}", cbytes)); - } - - return cbytes; - } - - /// Compress the `data` into `chunk` using the provided `context`. - /// - /// This function applies Blosc2 compression to the input `data` and stores the compressed - /// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the - /// corresponding Blosc2 error code. - /// - /// \tparam T The data type of the input buffer. - /// \param context A raw pointer to the Blosc2 compression context. - /// \param data The input data to be compressed, provided as a `std::span`. - /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span`. - /// \returns The compressed byte size of the chunk. This size includes a header with metadata, - /// which Blosc2 internally uses. - /// \throws std::runtime_error if compression fails, with the Blosc2 error code. - template - size_t compress(context_raw_ptr context, std::span data, std::span chunk) - { - _COMPRESSED_PROFILE_FUNCTION(); - detail::init_filters(); - const auto cbytes = blosc2_compress_ctx( - context, - static_cast(data.data()), - static_cast(data.size() * sizeof(T)), - static_cast(chunk.data()), - static_cast(chunk.size()) - ); - if (cbytes < 0) - { - throw std::runtime_error(std::format("Unable to compress context using Blosc2 with error code {}", cbytes)); - } - - return cbytes; - } - - /// Compress the `data` into `chunk` using the provided `context`. 
- /// - /// This function applies Blosc2 compression to the input `data` and stores the compressed - /// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the - /// corresponding Blosc2 error code. - /// - /// \tparam T The data type of the input buffer. - /// \param context A unique pointer to the Blosc2 compression context. - /// \param data The input data to be compressed, provided as a `std::span`. - /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span`. - /// \returns The compressed byte size of the chunk. This size includes a header with metadata, - /// which Blosc2 internally uses. - /// \throws std::runtime_error if compression fails, with the Blosc2 error code. - template - size_t compress(context_ptr& context, std::span data, std::span chunk) - { - return compress(context.get(), data, chunk); - } - - /// Compress the `data` into `chunk` using the provided `context`. - /// - /// This function applies Blosc2 compression to the input `data` and stores the compressed - /// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the - /// corresponding Blosc2 error code. - /// - /// \tparam T The data type of the input buffer. - /// \param context A unique pointer to the Blosc2 compression context. - /// \param data The input data to be compressed, provided as a `std::span`. - /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span`. - /// \returns The compressed byte size of the chunk. This size includes a header with metadata, - /// which Blosc2 internally uses. - /// \throws std::runtime_error if compression fails, with the Blosc2 error code. - template - size_t compress(context_ptr& context, std::span data, std::span chunk) - { - return compress(context.get(), data, chunk); - } - - /// Decompress a Blosc2 `chunk` into `buffer` using the provided `context`. 
- /// - /// This function reverses the Blosc2 compression, restoring the original uncompressed data. - /// If decompression fails, it throws a `std::runtime_error` with the corresponding error code. - /// - /// \tparam T The data type of the decompressed output. - /// \param context A raw pointer to the Blosc2 decompression context. - /// \param buffer The output buffer where decompressed data will be stored, provided as a `std::span`. - /// \param chunk The compressed input data to be decompressed, provided as a `std::span`. - /// \returns The decompressed byte size of the buffer. - /// \throws std::runtime_error if decompression fails, with the Blosc2 error code. - template - size_t decompress(context_raw_ptr context, std::span buffer, std::span chunk) - { - _COMPRESSED_PROFILE_FUNCTION(); - detail::init_filters(); - if (buffer.size() * sizeof(T) > std::numeric_limits::max()) - { - throw std::out_of_range(std::format("Blosc2 chunk size may not exceed numeric limit of int32_t, got {:L} which would exceed that", buffer.size() * sizeof(T))); - } - - int decompressed_size = blosc2_decompress_ctx( - context, - static_cast(chunk.data()), - std::numeric_limits::max(), - buffer.data(), - static_cast(buffer.size() * sizeof(T)) - ); - - if (decompressed_size < 0) - { - throw std::runtime_error(std::format("Error code {} while decompressing blosc2 chunk", decompressed_size)); - } - return decompressed_size; - } - - - /// Decompress a Blosc2 `chunk` into `buffer` using the provided `context`. - /// - /// This function reverses the Blosc2 compression, restoring the original uncompressed data. - /// If decompression fails, it throws a `std::runtime_error` with the corresponding error code. - /// - /// \tparam T The data type of the decompressed output. - /// \param context A unique pointer to the Blosc2 decompression context. - /// \param buffer The output buffer where decompressed data will be stored, provided as a `std::span`. 
- /// \param chunk The compressed input data to be decompressed, provided as a `std::span`. - /// \returns The decompressed byte size of the buffer. - /// \throws std::runtime_error if decompression fails, with the Blosc2 error code. - template - size_t decompress(context_ptr& context, std::span buffer, std::span chunk) - { - return decompress(context.get(), buffer, chunk); - } - - /// Append the chunk into the super-chunk. The chunk in this case does not need to be refitted as its actual - /// size since c-blosc will read the size from its header bytes. - inline size_t append_chunk(schunk_ptr& schunk, std::span chunk) - { - detail::init_filters(); - // We don't expose the copy parameter as internally in c-blosc if the chunk was compressed at all (i.e. compressed size < - // uncompressed size) the chunk gets realloc'd anyways effectively copying it. - auto nchunks = blosc2_schunk_append_chunk( - schunk.get(), - reinterpret_cast(chunk.data()), - true // copy - ); - - if (nchunks < 0) - { - throw std::runtime_error(std::format("Unable to append chunk into super-chunk with the following blosc2 error code {}", nchunks)); - } - - return nchunks; - } - - /// Create a default schunk with BLOSC2_CPARAMS_DEFAULTS and BLOSC2_DPARAMS_DEFAULTS - inline blosc2::schunk_ptr create_default_schunk() - { - detail::init_filters(); - auto cparams = BLOSC2_CPARAMS_DEFAULTS; - auto dparams = BLOSC2_DPARAMS_DEFAULTS; - blosc2_storage storage = BLOSC2_STORAGE_DEFAULTS; - storage.cparams = &cparams; - storage.dparams = &dparams; - return blosc2::schunk_ptr(blosc2_schunk_new(&storage)); - } - - /// Create blosc2 compression parameters for the given input. 
- template - blosc2_cparams create_blosc2_cparams(schunk_ptr& schunk, size_t nthreads, enums::codec codec, uint8_t compression_level, size_t block_size) - { - if (nthreads > std::numeric_limits::max()) - { - throw std::out_of_range(std::format("Number of threads may not exceed {}, got {:L}", std::numeric_limits::max(), nthreads)); - } - nthreads = std::max(nthreads, static_cast(1)); - - assert(std::numeric_limits::max() > block_size); - - detail::init_filters(); - blosc2_cparams cparams = BLOSC2_CPARAMS_DEFAULTS; - cparams.blocksize = static_cast(block_size);; - cparams.typesize = sizeof(T); - cparams.splitmode = BLOSC_AUTO_SPLIT; - cparams.clevel = compression_level; - cparams.nthreads = static_cast(nthreads); - cparams.schunk = schunk.get(); - cparams.compcode = codec_to_blosc2(codec); - - return cparams; - } - - /// Create blosc2 compression parameters for the given input. - template - blosc2_cparams create_blosc2_cparams(size_t nthreads, enums::codec codec, uint8_t compression_level, size_t block_size) - { - if (nthreads > std::numeric_limits::max()) - { - throw std::out_of_range(std::format("Number of threads may not exceed {}, got {:L}", std::numeric_limits::max(), nthreads)); - } - nthreads = std::max(nthreads, static_cast(1)); - - assert(std::numeric_limits::max() > block_size); - - detail::init_filters(); - blosc2_cparams cparams = BLOSC2_CPARAMS_DEFAULTS; - cparams.blocksize = static_cast(block_size); - cparams.typesize = sizeof(T); - cparams.splitmode = BLOSC_AUTO_SPLIT; - cparams.clevel = compression_level; - cparams.nthreads = static_cast(nthreads); - cparams.compcode = codec_to_blosc2(codec); - - return cparams; - } - - /// Create a blosc2 compression context with the given number of threads. 
- template - blosc2::context_ptr create_compression_context(schunk_ptr& schunk, size_t nthreads, enums::codec codec, uint8_t compression_level, size_t block_size) - { - _COMPRESSED_PROFILE_FUNCTION(); - detail::init_filters(); - auto cparams = create_blosc2_cparams(schunk, nthreads, codec, compression_level, block_size); - return blosc2::context_ptr(blosc2_create_cctx(cparams)); - } - - template - blosc2::context_ptr create_compression_context(size_t nthreads, enums::codec codec, uint8_t compression_level, size_t block_size) - { - _COMPRESSED_PROFILE_FUNCTION(); - detail::init_filters(); - auto cparams = create_blosc2_cparams(nthreads, codec, compression_level, block_size); - return blosc2::context_ptr(blosc2_create_cctx(cparams)); - } - - /// Create a blosc2 decompression context with the given number of threads. - inline blosc2::context_ptr create_decompression_context(schunk_ptr& schunk, size_t nthreads) - { - _COMPRESSED_PROFILE_FUNCTION(); - if (nthreads > std::numeric_limits::max()) - { - throw std::out_of_range(std::format("Number of threads may not exceed {}, got {:L}", std::numeric_limits::max(), nthreads)); - } - nthreads = std::min(nthreads, static_cast(1)); - - detail::init_filters(); - auto dparams = BLOSC2_DPARAMS_DEFAULTS; - dparams.schunk = schunk.get(); - dparams.nthreads = static_cast(nthreads); - - return blosc2::context_ptr(blosc2_create_dctx(dparams)); - } - - /// Create a blosc2 decompression context with the given number of threads. 
- inline blosc2::context_ptr create_decompression_context(size_t nthreads) - { - _COMPRESSED_PROFILE_FUNCTION(); - if (nthreads > std::numeric_limits::max()) - { - throw std::out_of_range(std::format("Number of threads may not exceed {}, got {:L}", std::numeric_limits::max(), nthreads)); - } - nthreads = std::min(nthreads, static_cast(1)); - - detail::init_filters(); - auto dparams = BLOSC2_DPARAMS_DEFAULTS; - dparams.nthreads = static_cast(nthreads); - - return blosc2::context_ptr(blosc2_create_dctx(dparams)); - } - - /// Get the minimum size needed to store the compressed data. - template - constexpr size_t min_compressed_size() - { - return ChunkSize + BLOSC2_MAX_OVERHEAD; - } - - /// Get the minimum size needed to store the compressed data. - inline constexpr size_t min_compressed_size(size_t chunk_size) - { - return chunk_size + BLOSC2_MAX_OVERHEAD; - } - - /// Get the minimum size needed to store the decompressed data. - template - constexpr size_t min_decompressed_size() - { - return ChunkSize; - } - - /// Get the minimum size needed to store the decompressed data. - inline constexpr size_t min_decompressed_size(size_t chunk_size) - { - return chunk_size; - } - - /// Get the number of elements of the uncompressed chunk. - /// - /// \tparam T the type to check against - /// \param chunk the compressed chunk to query - /// - /// \throws std::runtime_error if we encounter a blosc2 error. 
- template - size_t chunk_num_elements(const std::vector& chunk) - { - int32_t nbytes{}; - int32_t cbytes{}; - int32_t blocksize{}; - auto res = blosc2_cbuffer_sizes( - static_cast(chunk.data()), - &nbytes, - &cbytes, - &blocksize - ); - if (res < 0) - { - throw std::runtime_error(std::format("Unable to find buffer sizes due to blosc2 error: {}", map_error_code(res))); - } - - assert(nbytes > 0); - assert(nbytes % sizeof(T) == 0); - - return static_cast(nbytes) / sizeof(T); - } - - } // namespace blosc2 - - -} // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file + namespace blosc2 + { + namespace detail + { + static const inline bool g_filters_registered = false; + + /// Initialize filters in c-blosc2. Since we don't have an explicit entry point this needs to be checked on every call to compress and decompress. + /// May be a no-op if detail::g_filters_registered is true. + inline void init_filters() + { + if (!detail::g_filters_registered) + { + register_filters(); + } + } + } + + // Custom deleter for blosc2 structs for use in a smart pointer + template + struct deleter + { + }; + + template <> + struct deleter + { + void operator()(blosc2_schunk* schunk) + { + blosc2_schunk_free(schunk); + } + }; + + template <> + struct deleter + { + void operator()(blosc2_context* context) + { + blosc2_free_ctx(context); + } + }; + + /// Typedef the blosc2 primitives into both smart pointers and as raw ptrs + typedef std::unique_ptr> schunk_ptr; + typedef blosc2_schunk* schunk_raw_ptr; + typedef void* chunk_raw_ptr; + typedef std::unique_ptr> context_ptr; + typedef blosc2_context* context_raw_ptr; + + /// Maps a codec enum into its blosc2 representation. 
+ /// + /// \param compcode the compression codec to get + /// + /// \returns The mapped enum as uint8_t since blosc expects it that way + inline uint8_t codec_to_blosc2(enums::codec compcode) + { + if (compcode == enums::codec::blosclz) + { + return static_cast(BLOSC_BLOSCLZ); + } + else if (compcode == enums::codec::lz4) + { + return static_cast(BLOSC_LZ4); + } + else if (compcode == enums::codec::lz4hc) + { + return static_cast(BLOSC_LZ4HC); + } + else if (compcode == enums::codec::zstd) + { + return static_cast(BLOSC_ZSTD); + } + return BLOSC_BLOSCLZ; + } + + /// Maps a blosc2 compression codec into an enum representation + /// + /// \param compcode the compression codec to get + /// + /// \returns The mapped enum + inline enums::codec blosc2_to_codec(uint8_t compcode) + { + if (compcode == BLOSC_BLOSCLZ) + { + return enums::codec::blosclz; + } + else if (compcode == BLOSC_LZ4) + { + return enums::codec::lz4; + } + else if (compcode == BLOSC_LZ4HC) + { + return enums::codec::lz4hc; + } + else if (compcode == BLOSC_ZSTD) + { + return enums::codec::zstd; + } + return enums::codec::blosclz; + } + + + /// Get the minimum size needed to store the compressed data. + template + constexpr size_t min_compressed_size() + { + return ChunkSize + BLOSC2_MAX_OVERHEAD; + } + + /// Get the minimum size needed to store the compressed data. + inline constexpr size_t min_compressed_size(size_t chunk_size) + { + return chunk_size + BLOSC2_MAX_OVERHEAD; + } + + /// Get the minimum size needed to store the decompressed data. + template + constexpr size_t min_decompressed_size() + { + return ChunkSize; + } + + /// Get the minimum size needed to store the decompressed data. + inline constexpr size_t min_decompressed_size(size_t chunk_size) + { + return chunk_size; + } + + + /// Compress the `data` into `chunk` using the provided `context`. + /// + /// This function applies Blosc2 compression to the input `data` and stores the compressed + /// result in `chunk`. 
If compression fails, it throws a `std::runtime_error` with the + /// corresponding Blosc2 error code. + /// + /// \tparam T The data type of the input buffer. + /// \param context A raw pointer to the Blosc2 compression context. + /// \param data The input data to be compressed, provided as a `std::span`. + /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span`. + /// \returns The compressed byte size of the chunk. This size includes a header with metadata, + /// which Blosc2 internally uses. + /// \throws std::runtime_error if compression fails, with the Blosc2 error code. + template + size_t compress(context_raw_ptr context, std::span data, std::span chunk) + { + _COMPRESSED_PROFILE_FUNCTION(); + detail::init_filters(); + const auto cbytes = blosc2_compress_ctx( + context, + static_cast(data.data()), + static_cast(data.size() * sizeof(T)), + static_cast(chunk.data()), + static_cast(chunk.size()) + ); + if (cbytes < 0) + { + throw std::runtime_error( + std::format("Unable to compress context using Blosc2 with error code {}", cbytes) + ); + } + + return cbytes; + } + + /// Compress the `data` into `chunk` using the provided `context`. + /// + /// This function applies Blosc2 compression to the input `data` and stores the compressed + /// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the + /// corresponding Blosc2 error code. + /// + /// \tparam T The data type of the input buffer. + /// \param context A raw pointer to the Blosc2 compression context. + /// \param data The input data to be compressed, provided as a `std::span`. + /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span`. + /// \returns The compressed byte size of the chunk. This size includes a header with metadata, + /// which Blosc2 internally uses. + /// \throws std::runtime_error if compression fails, with the Blosc2 error code. 
+ template + size_t compress(context_raw_ptr context, std::span data, std::span chunk) + { + _COMPRESSED_PROFILE_FUNCTION(); + detail::init_filters(); + const auto cbytes = blosc2_compress_ctx( + context, + static_cast(data.data()), + static_cast(data.size() * sizeof(T)), + static_cast(chunk.data()), + static_cast(chunk.size()) + ); + if (cbytes < 0) + { + throw std::runtime_error( + std::format("Unable to compress context using Blosc2 with error code {}", cbytes) + ); + } + + return cbytes; + } + + /// Compress the `data` into `chunk` using the provided `context`. + /// + /// This function applies Blosc2 compression to the input `data` and stores the compressed + /// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the + /// corresponding Blosc2 error code. + /// + /// \tparam T The data type of the input buffer. + /// \param context A unique pointer to the Blosc2 compression context. + /// \param data The input data to be compressed, provided as a `std::span`. + /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span`. + /// \returns The compressed byte size of the chunk. This size includes a header with metadata, + /// which Blosc2 internally uses. + /// \throws std::runtime_error if compression fails, with the Blosc2 error code. + template + size_t compress(context_ptr& context, std::span data, std::span chunk) + { + return compress(context.get(), data, chunk); + } + + /// Compress the `data` into `chunk` using the provided `context`. + /// + /// This function applies Blosc2 compression to the input `data` and stores the compressed + /// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the + /// corresponding Blosc2 error code. + /// + /// \tparam T The data type of the input buffer. + /// \param context A unique pointer to the Blosc2 compression context. + /// \param data The input data to be compressed, provided as a `std::span`. 
+ /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span`. + /// \returns The compressed byte size of the chunk. This size includes a header with metadata, + /// which Blosc2 internally uses. + /// \throws std::runtime_error if compression fails, with the Blosc2 error code. + template + size_t compress(context_ptr& context, std::span data, std::span chunk) + { + return compress(context.get(), data, chunk); + } + + template + util::default_init_vector compress_to_chunk(context_raw_ptr context, std::span data) + { + _COMPRESSED_PROFILE_FUNCTION(); + + const auto required_size = min_compressed_size(data.size_bytes()); + + if (auto pool = NAMESPACE_COMPRESSED_IMAGE::detail::scratch_pool_registry::current()) + { + auto lease = pool->acquire(required_size); + auto scratch = lease.span(); + const auto csize = compress(context, data, scratch); + return util::default_init_vector(scratch.begin(), scratch.begin() + csize); + } + + util::default_init_vector scratch(required_size); + const auto csize = compress(context, data, std::span(scratch)); + return util::default_init_vector(scratch.begin(), scratch.begin() + csize); + } + + template + util::default_init_vector compress_to_chunk(context_raw_ptr context, std::span data) + { + _COMPRESSED_PROFILE_FUNCTION(); + + const auto required_size = min_compressed_size(data.size_bytes()); + + if (auto pool = NAMESPACE_COMPRESSED_IMAGE::detail::scratch_pool_registry::current()) + { + auto lease = pool->acquire(required_size); + auto scratch = lease.span(); + const auto csize = compress(context, data, scratch); + return util::default_init_vector(scratch.begin(), scratch.begin() + csize); + } + + util::default_init_vector scratch(required_size); + const auto csize = compress(context, data, std::span(scratch)); + return util::default_init_vector(scratch.begin(), scratch.begin() + csize); + } + + template + util::default_init_vector compress_to_chunk(context_ptr& context, std::span data) + { + 
return compress_to_chunk(context.get(), data); + } + + template + util::default_init_vector compress_to_chunk(context_ptr& context, std::span data) + { + return compress_to_chunk(context.get(), data); + } + + /// Decompress a Blosc2 `chunk` into `buffer` using the provided `context`. + /// + /// This function reverses the Blosc2 compression, restoring the original uncompressed data. + /// If decompression fails, it throws a `std::runtime_error` with the corresponding error code. + /// + /// \tparam T The data type of the decompressed output. + /// \param context A raw pointer to the Blosc2 decompression context. + /// \param buffer The output buffer where decompressed data will be stored, provided as a `std::span`. + /// \param chunk The compressed input data to be decompressed, provided as a `std::span`. + /// \returns The decompressed byte size of the buffer. + /// \throws std::runtime_error if decompression fails, with the Blosc2 error code. + template + size_t decompress(context_raw_ptr context, std::span buffer, std::span chunk) + { + _COMPRESSED_PROFILE_FUNCTION(); + detail::init_filters(); + if (buffer.size() * sizeof(T) > std::numeric_limits::max()) + { + throw std::out_of_range( + std::format( + "Blosc2 chunk size may not exceed numeric limit of int32_t, got {:L} which would exceed that", + buffer.size() * sizeof(T) + ) + ); + } + + int decompressed_size = blosc2_decompress_ctx( + context, + static_cast(chunk.data()), + std::numeric_limits::max(), + buffer.data(), + static_cast(buffer.size() * sizeof(T)) + ); + + if (decompressed_size < 0) + { + throw std::runtime_error( + std::format("Error code {} while decompressing blosc2 chunk", decompressed_size) + ); + } + return decompressed_size; + } + + + /// Decompress a Blosc2 `chunk` into `buffer` using the provided `context`. + /// + /// This function reverses the Blosc2 compression, restoring the original uncompressed data. 
+ /// If decompression fails, it throws a `std::runtime_error` with the corresponding error code. + /// + /// \tparam T The data type of the decompressed output. + /// \param context A unique pointer to the Blosc2 decompression context. + /// \param buffer The output buffer where decompressed data will be stored, provided as a `std::span`. + /// \param chunk The compressed input data to be decompressed, provided as a `std::span`. + /// \returns The decompressed byte size of the buffer. + /// \throws std::runtime_error if decompression fails, with the Blosc2 error code. + template + size_t decompress(context_ptr& context, std::span buffer, std::span chunk) + { + return decompress(context.get(), buffer, chunk); + } + + /// Create blosc2 compression parameters for the given input. + template + blosc2_cparams create_blosc2_cparams(schunk_ptr& schunk, + size_t nthreads, + enums::codec codec, + uint8_t compression_level, + size_t block_size) + { + if (nthreads > std::numeric_limits::max()) + { + throw std::out_of_range( + std::format( + "Number of threads may not exceed {}, got {:L}", + std::numeric_limits::max(), + nthreads + ) + ); + } + nthreads = std::max(nthreads, static_cast(1)); + + assert(std::numeric_limits::max() > block_size); + + detail::init_filters(); + blosc2_cparams cparams = BLOSC2_CPARAMS_DEFAULTS; + cparams.blocksize = static_cast(block_size);; + cparams.typesize = sizeof(T); + cparams.splitmode = BLOSC_AUTO_SPLIT; + cparams.clevel = compression_level; + cparams.nthreads = static_cast(nthreads); + cparams.schunk = schunk.get(); + cparams.compcode = codec_to_blosc2(codec); + + return cparams; + } + + /// Create blosc2 compression parameters for the given input. 
+ template + blosc2_cparams create_blosc2_cparams(size_t nthreads, + enums::codec codec, + uint8_t compression_level, + size_t block_size) + { + if (nthreads > std::numeric_limits::max()) + { + throw std::out_of_range( + std::format( + "Number of threads may not exceed {}, got {:L}", + std::numeric_limits::max(), + nthreads + ) + ); + } + nthreads = std::max(nthreads, static_cast(1)); + + assert(std::numeric_limits::max() > block_size); + + detail::init_filters(); + blosc2_cparams cparams = BLOSC2_CPARAMS_DEFAULTS; + cparams.blocksize = static_cast(block_size); + cparams.typesize = sizeof(T); + cparams.splitmode = BLOSC_AUTO_SPLIT; + cparams.clevel = compression_level; + cparams.nthreads = static_cast(nthreads); + cparams.compcode = codec_to_blosc2(codec); + + return cparams; + } + + /// Create a blosc2 compression context with the given number of threads. + template + blosc2::context_ptr create_compression_context(schunk_ptr& schunk, + size_t nthreads, + enums::codec codec, + uint8_t compression_level, + size_t block_size) + { + _COMPRESSED_PROFILE_FUNCTION(); + detail::init_filters(); + auto cparams = create_blosc2_cparams(schunk, nthreads, codec, compression_level, block_size); + return blosc2::context_ptr(blosc2_create_cctx(cparams)); + } + + template + blosc2::context_ptr create_compression_context(size_t nthreads, + enums::codec codec, + uint8_t compression_level, + size_t block_size) + { + _COMPRESSED_PROFILE_FUNCTION(); + detail::init_filters(); + auto cparams = create_blosc2_cparams(nthreads, codec, compression_level, block_size); + return blosc2::context_ptr(blosc2_create_cctx(cparams)); + } + + /// Create a blosc2 decompression context with the given number of threads. 
+ inline blosc2::context_ptr create_decompression_context(schunk_ptr& schunk, size_t nthreads) + { + _COMPRESSED_PROFILE_FUNCTION(); + if (nthreads > std::numeric_limits::max()) + { + throw std::out_of_range( + std::format( + "Number of threads may not exceed {}, got {:L}", + std::numeric_limits::max(), + nthreads + ) + ); + } + nthreads = std::min(nthreads, static_cast(1)); + + detail::init_filters(); + auto dparams = BLOSC2_DPARAMS_DEFAULTS; + dparams.schunk = schunk.get(); + dparams.nthreads = static_cast(nthreads); + + return blosc2::context_ptr(blosc2_create_dctx(dparams)); + } + + /// Create a blosc2 decompression context with the given number of threads. + inline blosc2::context_ptr create_decompression_context(size_t nthreads) + { + _COMPRESSED_PROFILE_FUNCTION(); + if (nthreads > std::numeric_limits::max()) + { + throw std::out_of_range( + std::format( + "Number of threads may not exceed {}, got {:L}", + std::numeric_limits::max(), + nthreads + ) + ); + } + nthreads = std::min(nthreads, static_cast(1)); + + detail::init_filters(); + auto dparams = BLOSC2_DPARAMS_DEFAULTS; + dparams.nthreads = static_cast(nthreads); + + return blosc2::context_ptr(blosc2_create_dctx(dparams)); + } + + /// Get the number of elements of the uncompressed chunk. + /// + /// \tparam T the type to check against + /// \param chunk the compressed chunk to query + /// + /// \throws std::runtime_error if we encounter a blosc2 error. 
+ template + size_t chunk_num_elements(const std::span chunk) + { + int32_t nbytes{}; + int32_t cbytes{}; + int32_t blocksize{}; + auto res = blosc2_cbuffer_sizes( + static_cast(chunk.data()), + &nbytes, + &cbytes, + &blocksize + ); + if (res < 0) + { + throw std::runtime_error( + std::format("Unable to find buffer sizes due to blosc2 error: {}", map_error_code(res)) + ); + } + + assert(nbytes > 0); + assert(nbytes % sizeof(T) == 0); + + return static_cast(nbytes) / sizeof(T); + } + } // namespace blosc2 +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/channel.h b/compressed_image/include/compressed/channel.h index b5a4f41..2d26084 100644 --- a/compressed_image/include/compressed/channel.h +++ b/compressed_image/include/compressed/channel.h @@ -6,581 +6,787 @@ #include #include #include -#include #include "blosc2.h" #include "nlohmann/json.hpp" #include "macros.h" -#include "enums.h" #include "fwd.h" +#include "enums.h" #include "blosc2/wrapper.h" #include "blosc2/typedefs.h" #include "blosc2/schunk.h" #include "blosc2/lazyschunk.h" #include "constants.h" +#include "context.h" +#include "logger.h" #include "util.h" -#include "json_alias.h" #include "detail/scoped_timer.h" -#include "iterators/iterator.h" +#include "detail/scratch_buffer_pool.h" +#include "iterators/channel.h" -namespace NAMESPACE_COMPRESSED_IMAGE +namespace +NAMESPACE_COMPRESSED_IMAGE { - - template - struct channel : public std::ranges::view_interface> - { - using value_type = T; - using iterator = channel_iterator; - using const_iterator = channel_iterator; - - channel(channel&& other) - { - m_Schunk = std::move(other.m_Schunk); - m_Codec = other.m_Codec; - m_Nthreads = other.m_Nthreads; - m_CompressionContext = std::move(other.m_CompressionContext); - m_DecompressionContext = std::move(other.m_DecompressionContext); - m_CompressionLevel = other.m_CompressionLevel; - m_Width = other.m_Width; - m_Height = other.m_Height; - }; - channel& operator=(channel&& other) - 
{ - if (this != &other) - { - m_Schunk = std::move(other.m_Schunk); - m_Codec = other.m_Codec; - m_Nthreads = other.m_Nthreads; - m_CompressionContext = std::move(other.m_CompressionContext); - m_DecompressionContext = std::move(other.m_DecompressionContext); - m_CompressionLevel = other.m_CompressionLevel; - m_Width = other.m_Width; - m_Height = other.m_Height; - } - return *this; - }; - channel(const channel&) = delete; - channel& operator=(const channel&) = delete; - - - /// Default ctor, ensures the schunk and compression/decompression contexts are always initialized - /// into valid states. This will not generate a valid channel however and the ctor taking data or the static - /// functions `zeros` and `full` are preferred. - channel() - { - m_Schunk = std::make_shared>(blosc2::lazy_schunk(0, 1, s_default_blocksize, s_default_chunksize)); - m_CompressionContext = blosc2::create_compression_context( - std::thread::hardware_concurrency() / 2, - enums::codec::lz4, - 9, - s_default_blocksize - ); - m_DecompressionContext = blosc2::create_decompression_context(std::thread::hardware_concurrency() / 2); - }; - - /// Initialize the channel with the given data. - /// - /// \param data The span of input data to be compressed. - /// \param width The width of the image channel. - /// \param height The height of the image channel. - /// \param compression_codec The compression codec to be used (default is lz4). - /// \param compression_level The compression level (default is 5). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number although this may not increase performance - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
- /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - channel( - const std::span data, - size_t width, - size_t height, - enums::codec compression_codec = enums::codec::lz4, - uint8_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - m_Width = width; - m_Height = height; - m_Codec = compression_codec; - m_CompressionLevel = util::ensure_compression_level(compression_level); - if (data.size() != width * height) - { - throw std::runtime_error( - std::format( - "Invalid channel data passed. Expected its size to match up to width * height ({} * {}) which would be {:L}." \ - " Instead received {:L}", - width, height, width * height, data.size() - ) - ); - } - - // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this. - assert(chunk_size < std::numeric_limits::max()); - assert(block_size < chunk_size); - - m_CompressionContext = blosc2::create_compression_context(std::thread::hardware_concurrency() / 2, m_Codec, m_CompressionLevel, block_size); - m_DecompressionContext = blosc2::create_decompression_context(std::thread::hardware_concurrency() / 2); - - // Align the chunks to the scanlines, makes our lifes a lot easier on read/write. - auto chunk_size_aligned = util::align_chunk_to_scanlines_bytes(m_Width, chunk_size); - m_Schunk = std::make_shared>(blosc2::schunk(data, block_size, chunk_size_aligned, m_CompressionContext)); - } - - - /// Initialize the channel with the given data. - /// - /// \param schunk The initialized super-chunk. - /// \param width The width of the image channel. - /// \param height The height of the image channel. - /// \param compression_codec The compression codec to be used. 
- /// \param compression_level The compression level (default is 5). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number although this may not increase performance - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. - /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - channel( - blosc2::schunk_var schunk, - size_t width, - size_t height, - enums::codec compression_codec = enums::codec::lz4, - uint8_t compression_level = 9 - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - m_Codec = compression_codec; - m_CompressionLevel = util::ensure_compression_level(compression_level); - - if (std::holds_alternative>(schunk)) - { - if (std::get>(schunk).size() != width * height) - { - throw std::invalid_argument( - std::format( - "Invalid schunk passed to compressed::channel constructor. Expected a size of {:L} but instead got {:L}", - width * height, - std::get>(schunk).size() - ) - ); - } - } - else if (std::holds_alternative>(schunk)) - { - if (std::get>(schunk).size() != width * height) - { - throw std::invalid_argument( - std::format( - "Invalid schunk passed to compressed::channel constructor. 
Expected a size of {:L} but instead got {:L}", - width * height, - std::get>(schunk).size() - ) - ); - } - } - - m_Schunk = std::make_shared>(std::move(schunk)); - m_Width = width; - m_Height = height; - - // Store the compression and decompression contexts, retrieving the block size from the underlying schunk - // wrapper - std::visit([&](auto& schunk) - { - m_CompressionContext = blosc2::create_compression_context(std::thread::hardware_concurrency() / 2, m_Codec, m_CompressionLevel, schunk.max_block_size()); - m_DecompressionContext = blosc2::create_decompression_context(std::thread::hardware_concurrency() / 2); - }, *m_Schunk); - - } - - - /// Create a channel filled with zeros. - /// - /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer - /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way - /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading). - /// - /// \param width The width of the image channel. - /// \param height The height of the image channel. - /// \param compression_codec The compression codec to be used. - /// \param compression_level The compression level (default is 9). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. - /// \param chunk_size The size of each individual chunk, defaults to 4MB. Should be no larger than the expected image size - /// for optimal performance and must be a multiple of sizeof(T). - /// \return A channel instance with all values initialized to zero. 
- static channel zeros( - size_t width, - size_t height, - enums::codec compression_codec = enums::codec::lz4, - uint8_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - return channel::full(width, height, static_cast(0), compression_codec, compression_level, block_size, chunk_size); - } - - /// Create a zero-initialized channel with the same shape and compression parameters as another channel. - /// - /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer - /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way - /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading). - /// - /// \param other The reference channel from which to copy shape and compression settings. - /// \return A new channel instance with the same dimensions and compression settings as \p other, filled with zeros. - static channel zeros_like(const channel& other) - { - return channel::zeros( - other.width(), - other.height(), - other.compression(), - other.compression_level(), - other.block_size(), - other.chunk_size() - ); - } - - /// Create a channel filled with a specific value. - /// - /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer - /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way - /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading). - /// - /// \param width The width of the image channel. - /// \param height The height of the image channel. - /// \param fill_value The value to fill the channel with. - /// \param compression_codec The compression codec to be used. - /// \param compression_level The compression level (default is 9). 
- /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB. - /// \param chunk_size The size of each individual chunk, defaults to 4MB. Should be no larger than the expected image size - /// for optimal performance and must be a multiple of sizeof(T). - /// \return A channel instance with all values initialized to \p fill_value. - static channel full( - size_t width, - size_t height, - T fill_value, - enums::codec compression_codec = enums::codec::lz4, - uint8_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - const size_t chunk_size_aligned = util::align_chunk_to_scanlines_bytes(width, chunk_size); - const size_t num_elements = width * height; - - auto schunk = blosc2::lazy_schunk(fill_value, num_elements, block_size, chunk_size_aligned); - return channel(std::move(schunk), width, height, compression_codec, compression_level); - } - - - /// Create a channel filled with a specific value and the same shape and compression settings as another channel. - /// - /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer - /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way - /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading). - /// - /// \param other The reference channel from which to copy shape and compression settings. - /// \param fill_value The value to fill the channel with. - /// \return A new channel instance filled with \p fill_value and the same dimensions and compression settings as \p other. 
- static channel full_like(const channel& other, T fill_value) - { - return channel::full( - other.width(), - other.height(), - fill_value, - other.compression(), - other.compression_level(), - other.block_size(), - other.chunk_size() - ); - } - - /// Returns an iterator pointing to the beginning of the compressed data. - /// - /// \return An iterator to the beginning of the compressed data. - iterator begin() - { - return iterator(m_Schunk, m_CompressionContext.get(), m_DecompressionContext.get(), 0, m_Width, m_Height); - } - - /// Returns an iterator pointing to the end of the compressed data. - /// - /// \return An iterator to the end of the compressed data. - iterator end() - { - if (m_Schunk) - { - return std::visit([&](auto& schunk) - { - return iterator(m_Schunk, m_CompressionContext.get(), m_DecompressionContext.get(), schunk.num_chunks(), m_Width, m_Height); - }, *m_Schunk); - } - throw std::runtime_error("Internal Error: Unable to create end iterator as m_Schunk is uninitialized."); - } - - /// Retrieve a view to the compression context. In most cases users will not have to modify this. - /// - /// \return A pointer to the compression context. - blosc2::context_raw_ptr compression_context() { return m_CompressionContext.get(); } - - /// Retrieve a view to the decompression context. In most cases users will not have to modify this. - /// - /// \return A pointer to the decompression context. - blosc2::context_raw_ptr decompression_context() { return m_DecompressionContext.get(); } - - /// Update the number of threads used internally by c-blosc2 for compression and decompression. - /// - /// \param nthreads The number of threads to use for compression and decompression. 
- /// \param block_size The block size to compress to - void update_nthreads(size_t nthreads, size_t block_size = s_default_blocksize) - { - m_CompressionContext = blosc2::create_compression_context(nthreads, m_Codec, m_CompressionLevel, block_size); - m_DecompressionContext = blosc2::create_decompression_context(nthreads); - m_Nthreads = nthreads; - } - - /// The channel width. - /// - /// \return The width of the channel. - size_t width() const noexcept { return m_Width; } - - /// The channel height. - /// - /// \return The height of the channel. - size_t height() const noexcept { return m_Height; } - - /// Retrieve the compression codec used. - /// - /// \return The compression codec. - enums::codec compression() const noexcept { return m_Codec; } - - /// Retrieve the compression level used. - /// - /// \return The compression level (typically from 1-9). - uint8_t compression_level() const noexcept - { - return m_CompressionLevel; - } - - /// Retrieve the compressed data size. - /// - /// \return The size of the compressed data in bytes. - size_t compressed_bytes() const - { - if (!m_Schunk) - { - throw std::runtime_error("Channel instance is not properly initialized, unable to get decompressed data"); - } - - if (std::holds_alternative>(*m_Schunk)) - { - return std::get>(*m_Schunk).csize(); - } - else if (std::holds_alternative>(*m_Schunk)) - { - return std::get>(*m_Schunk).csize(); - } - return {}; - } - - /// Retrieve the uncompressed data size. - /// - /// \return The size of the uncompressed data in elements. - size_t uncompressed_size() const - { - if (!m_Schunk) - { - throw std::runtime_error("Channel instance is not properly initialized, unable to get decompressed data"); - } - - if (std::holds_alternative>(*m_Schunk)) - { - return std::get>(*m_Schunk).size(); - } - else if (std::holds_alternative>(*m_Schunk)) - { - return std::get>(*m_Schunk).size(); - } - return {}; - } - - /// Retrieve the total number of chunks the channel stores. 
- /// - /// \return The number of chunks. - size_t num_chunks() const - { - assert(m_Schunk != nullptr); - - if (std::holds_alternative>(*m_Schunk)) - { - return std::get>(*m_Schunk).num_chunks(); - } - else if (std::holds_alternative>(*m_Schunk)) - { - return std::get>(*m_Schunk).num_chunks(); - } - return {}; - } - - /// \brief Retrieve the block size (in bytes) of the channel - /// - /// The internal blosc2 implementation reserves changing this value on compression so it may be possible - /// that this is not the value you initially set. - /// - /// \return The block size (in bytes). - size_t block_size() const - { - assert(m_Schunk != nullptr); - return std::visit([&](auto& schunk) - { - return schunk.max_block_size(); - }, *m_Schunk); - } - - /// \brief Retrieve the chunk size (in bytes) of the channel - /// - /// This will be all of the chunk sizes except for the last chunk. The last chunk may be smaller so to accurately - /// capture it you should use the override with a size_t - /// - /// \return The chunk size (in bytes). - size_t chunk_size() const noexcept - { - assert(m_Schunk != nullptr); - return std::visit([&](auto& schunk) - { - return schunk.chunk_bytes(); - }, *m_Schunk); - } - - size_t chunk_elems() const - { - auto chunk_size = this->chunk_size(); - assert(chunk_size % sizeof(T) == 0); - return chunk_size / sizeof(T); - } - - /// \brief Retrieve the chunk size (in bytes) of the channel at the given chunk index. - /// - /// \return The chunk size (in bytes) at index `chunk_index`. 
- /// - /// \throws std::out_of_range if the chunk index is invalid - size_t chunk_size(size_t chunk_index) const - { - assert(m_Schunk != nullptr); - return std::visit([&](auto& schunk) - { - return schunk.chunk_bytes(chunk_index); - }, *m_Schunk); - } - - size_t chunk_elems(size_t chunk_index) const - { - auto chunk_size = this->chunk_size(chunk_index); - assert(chunk_size % sizeof(T) == 0); - return chunk_size / sizeof(T); - } - - - /// Retrieves and decompresses a chunk of data into the provided buffer. - /// - /// This function retrieves the chunk at the given index from the internal `schunk`, - /// decompresses it using the current decompression context, and stores the result in `buffer`. - /// - /// \param buffer A span representing the destination buffer to store the decompressed data. - /// Must be large enough to hold one chunk of decompressed data. - /// \param chunk_idx The index of the chunk to retrieve. - /// - /// \throws std::runtime_error if the internal `schunk` pointer is not initialized. - void get_chunk(std::span buffer, size_t chunk_idx) const - { - if (!m_Schunk) - { - throw std::runtime_error("Internal Error: Channel instance is not properly initialized, unable to get decompressed data"); - } - - return std::visit([&](const auto& schunk) - { - // We cheat a little bit here by creating this compression ctx on the fly, unfortunately this is - // necessary as blosc2 will actually modify the ctx on decompression. - auto decomp_ctx = blosc2::create_decompression_context(m_Nthreads); - return schunk.chunk(decomp_ctx, buffer, chunk_idx); - }, *m_Schunk); - } - - /// Compresses and sets a chunk of data from the provided buffer at the specified index. - /// - /// This function compresses the data in the provided buffer using the current compression - /// context and writes it into the internal `schunk` at the given index. - /// - /// \param buffer A span representing the source data to be compressed and stored. 
- /// \param chunk_idx The index of the chunk to overwrite or set with the compressed data. - /// - /// \throws std::runtime_error if the internal `schunk` pointer is not initialized. - void set_chunk(std::span buffer, size_t chunk_idx) - { - if (!m_Schunk) - { - throw std::runtime_error("Internal Error: Channel instance is not properly initialized, unable to set data"); - } - - return std::visit([&](auto& schunk) - { - return schunk.set_chunk(m_CompressionContext, buffer, chunk_idx); - }, *m_Schunk); - } - - /// Get the decompressed data as a vector. - /// - /// \throws std::runtime_error if the internal `schunk` pointer is not initialized. - /// - /// \return A vector containing the decompressed data. - std::vector get_decompressed() const - { - if (!m_Schunk) - { - throw std::runtime_error("Internal Error: Channel instance is not properly initialized, unable to get decompressed data"); - } - return std::visit([&](const auto& schunk) - { - // We cheat a little bit here by creating this compression ctx on the fly, unfortunately this is - // necessary as blosc2 will actually modify the ctx on decompression. - auto decomp_ctx = blosc2::create_decompression_context(m_Nthreads); - return schunk.to_uncompressed(decomp_ctx); - }, *m_Schunk); - } - - /// Equality operators, compares pointers to check for equality - bool operator==(const channel& other) const noexcept - { - return this == &other; - } - - private: - /// The storage for the internal data, stored contiguously in a compressed data format - blosc2::schunk_var_ptr m_Schunk = nullptr; - enums::codec m_Codec = enums::codec::lz4; - - size_t m_Nthreads = std::thread::hardware_concurrency() / 2; - - /// We store a compression and decompression context here to allow us to reuse them rather than having - /// to reinitialize them on launch. May be nullptr; - blosc2::context_ptr m_CompressionContext = nullptr; - blosc2::context_ptr m_DecompressionContext = nullptr; - - /// Compression level. 
- uint8_t m_CompressionLevel = 9; - - /// The width and height of the channel. - size_t m_Width = 1; - size_t m_Height = 1; - }; - -} // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file + template + struct channel : public std::ranges::view_interface> + { + using value_type = T; + using iterator = channel_iterator; + using const_iterator = channel_iterator; + + channel(channel&& other) noexcept + { + m_schunk = std::move(other.m_schunk); + m_scratch_pool = std::move(other.m_scratch_pool); + m_codec = other.m_codec; + m_compression_level = other.m_compression_level; + m_num_threads = other.m_num_threads; + m_width = other.m_width; + m_height = other.m_height; + }; + + channel& operator=(channel&& other) noexcept + { + if (this != &other) + { + m_schunk = std::move(other.m_schunk); + m_scratch_pool = std::move(other.m_scratch_pool); + m_codec = other.m_codec; + m_compression_level = other.m_compression_level; + m_num_threads = other.m_num_threads; + m_width = other.m_width; + m_height = other.m_height; + } + return *this; + }; + channel(const channel&) = delete; + channel& operator=(const channel&) = delete; + + + /// Default ctor, ensures the schunk and the scratch pool are always initialized + /// into valid states. This will not generate a valid channel however, and the ctor taking data or the static + /// functions `zeros` and `full` are preferred. + channel() + { + m_scratch_pool = detail::scratch_pool_registry::get_or_create_for_channel(); + m_schunk = std::make_shared>( + detail::lazy_schunk(0, 1, s_default_blocksize, s_default_chunksize) + ); + }; + + /// Initialize the channel with the given data. + /// + /// \param data The span of input data to be compressed. + /// \param width The width of the image channel. + /// \param height The height of the image channel. + /// \param compression_codec The compression codec to be used (default is lz4). + /// \param compression_level The compression level (default is 9).
+ /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number although this may not increase performance + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. + /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance, but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). + /// \param gpu_device The GPU device to use for compression/decompression. This only has an effect if the codec + /// chosen is one of the gpu_* codecs. If not specified, the best default device will be used. + /// To find out which devices are available, we provide the utility functions + /// `NAMESPACE_COMPRESSED_IMAGE::cuda::device_names()` and `NAMESPACE_COMPRESSED_IMAGE::cuda::devices()`. + /// The logical index into the arrays returned by those functions is the index that is passed + /// here. + channel( + const std::span data, + const size_t width, + const size_t height, + const enums::codec compression_codec = enums::codec::lz4, + const uint8_t compression_level = 9, + const size_t block_size = s_default_blocksize, + const size_t chunk_size = s_default_chunksize, + std::optional gpu_device = std::nullopt + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + m_scratch_pool = detail::scratch_pool_registry::get_or_create_for_channel(); + m_width = width; + m_height = height; + m_codec = compression_codec; + m_num_threads = std::thread::hardware_concurrency() / 2; + m_compression_level = util::ensure_compression_level(compression_level); + if (data.size() != width * height) + { + throw std::runtime_error( + std::format( + "Invalid channel data passed.
Expected its size to match up to width * height ({} * {}) which would be {:L}." + " Instead received {:L}", + width, + height, + width * height, + data.size() + ) + ); + } + + if (enums::is_gpu_codec(m_codec) && !cuda::is_available()) + { + m_codec = enums::s_gpu_codec_fallback.at(m_codec); + get_logger()->warn( + "Unable to use the provided gpu codec '{}' as no cuda device is available." + " Falling back to cpu codec '{}'.", + enums::to_string(compression_codec), + enums::to_string(m_codec) + ); + } + + if (enums::is_gpu_codec(m_codec)) + { + // Ensure the gpu index passed is valid (it indexes into cuda::devices(), so it must be < size()). We treat this as a failure instead of falling back to some + // other value as this indicates the user passed an invalid device. + if (cuda::is_available() && gpu_device && gpu_device.value() >= cuda::devices().size()) + { + throw std::invalid_argument( + std::format( + "Invalid GPU device index passed to compressed::channel constructor. Expected a value in the range [0, {:L}) but instead got {:L}", + cuda::devices().size(), + gpu_device.value() + ) + ); + } + } + else + { + // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this. + assert(chunk_size < std::numeric_limits::max()); + assert(block_size <= chunk_size); + } + + // Align the chunks to the scanlines, makes our life a lot easier on read / write. + auto chunk_size_aligned = util::align_chunk_to_scanlines_bytes(m_width, chunk_size); + + auto gpu_device_index = enums::is_gpu_codec(m_codec) ? cuda::current_device() : 0; + auto compression_ctx = this->create_compression_context( + m_codec, + m_num_threads, + m_compression_level, + block_size, + gpu_device_index + ); + + m_schunk = std::make_shared>( + detail::schunk(data, block_size, chunk_size_aligned, std::move(compression_ctx)) + ); + } + + + /// Initialize the channel with the given data. + /// + /// \param schunk The initialized super-chunk. + /// \param width The width of the image channel. + /// \param height The height of the image channel.
+ /// \param compression_codec The compression codec to be used. + /// \param compression_level The compression level (default is 9). + channel( + schunk_var schunk, + const size_t width, + const size_t height, + const enums::codec compression_codec = enums::codec::lz4, + const uint8_t compression_level = 9 + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + m_scratch_pool = detail::scratch_pool_registry::get_or_create_for_channel(); + m_codec = compression_codec; + m_compression_level = util::ensure_compression_level(compression_level); + + if (std::holds_alternative>(schunk)) + { + if (std::get>(schunk).size() != width * height) + { + throw std::invalid_argument( + std::format( + "Invalid schunk passed to compressed::channel constructor. Expected a size of {:L} but instead got {:L}", + width * height, + std::get>(schunk).size() + ) + ); + } + } + else if (std::holds_alternative>(schunk)) + { + if (std::get>(schunk).size() != width * height) + { + throw std::invalid_argument( + std::format( + "Invalid schunk passed to compressed::channel constructor. Expected a size of {:L} but instead got {:L}", + width * height, + std::get>(schunk).size() + ) + ); + } + } + + m_schunk = std::make_shared>(std::move(schunk)); + m_num_threads = std::thread::hardware_concurrency() / 2; + m_width = width; + m_height = height; + } + + + /// Create a channel filled with zeros. + /// + /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer + /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way + /// when wanting to generate an empty channel only filling out some parts (e.g. sparse cryptomatte loading). + /// + /// \param width The width of the image channel. + /// \param height The height of the image channel. + /// \param compression_codec The compression codec to be used. + /// \param compression_level The compression level (default is 9).
+ /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. + /// \param chunk_size The size of each individual chunk, defaults to 4MB. Should be no larger than the expected image size + /// for optimal performance and must be a multiple of sizeof(T). + /// \return A channel instance with all values initialized to zero. + static channel zeros( + const size_t width, + const size_t height, + const enums::codec compression_codec = enums::codec::lz4, + const uint8_t compression_level = 9, + const size_t block_size = s_default_blocksize, + const size_t chunk_size = s_default_chunksize + ) + { + return channel::full( + width, + height, + static_cast(0), + compression_codec, + compression_level, + block_size, + chunk_size + ); + } + + /// Create a zero-initialized channel with the same shape and compression parameters as another channel. + /// + /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer + /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way + /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading). + /// + /// \param other The reference channel from which to copy shape and compression settings. + /// \return A new channel instance with the same dimensions and compression settings as \p other, filled with zeros. + static channel zeros_like(const channel& other) + { + return channel::zeros( + other.width(), + other.height(), + other.compression(), + other.compression_level(), + other.block_size(), + other.chunk_size() + ); + } + + /// Create a channel filled with a specific value. + /// + /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer + /// if set with something like `set_chunk`. 
This is especially memory efficient and should be the preferred way + /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading). + /// + /// \param width The width of the image channel. + /// \param height The height of the image channel. + /// \param fill_value The value to fill the channel with. + /// \param compression_codec The compression codec to be used. + /// \param compression_level The compression level (default is 9). + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB. + /// \param chunk_size The size of each individual chunk, defaults to 4MB. Should be no larger than the expected image size + /// for optimal performance and must be a multiple of sizeof(T). + /// \return A channel instance with all values initialized to \p fill_value. + static channel full( + const size_t width, + const size_t height, + const T fill_value, + const enums::codec compression_codec = enums::codec::lz4, + const uint8_t compression_level = 9, + const size_t block_size = s_default_blocksize, + const size_t chunk_size = s_default_chunksize + ) + { + const size_t chunk_size_aligned = util::align_chunk_to_scanlines_bytes(width, chunk_size); + const size_t num_elements = width * height; + + auto schunk = detail::lazy_schunk(fill_value, num_elements, block_size, chunk_size_aligned); + return channel(std::move(schunk), width, height, compression_codec, compression_level); + } + + + /// Create a channel filled with a specific value and the same shape and compression settings as another channel. + /// + /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer + /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way + /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading). 
+ /// + /// \param other The reference channel from which to copy shape and compression settings. + /// \param fill_value The value to fill the channel with. + /// \return A new channel instance filled with \p fill_value and the same dimensions and compression settings as \p other. + static channel full_like(const channel& other, T fill_value) + { + return channel::full( + other.width(), + other.height(), + fill_value, + other.compression(), + other.compression_level(), + other.block_size(), + other.chunk_size() + ); + } + + /// Returns an iterator pointing to the beginning of the decompressed channel chunks. + /// + /// The iterator does not allocate on construction. Its internal buffers and compression/decompression + /// context are initialized lazily when the first chunk is dereferenced. + iterator begin() + { + if (!m_schunk) + { + throw std::runtime_error( + "Internal Error: Unable to create begin iterator as m_schunk is uninitialized." + ); + } + + return iterator( + m_schunk, + 0, + this->num_chunks(), + m_width, + m_height, + m_codec, + m_compression_level, + m_num_threads, + this->block_size(), + this->chunk_size() + ); + } + + /// Returns an iterator pointing past the last decompressed channel chunk. + iterator end() + { + if (!m_schunk) + { + throw std::runtime_error("Internal Error: Unable to create end iterator as m_schunk is uninitialized."); + } + + return iterator( + m_schunk, + this->num_chunks(), + this->num_chunks(), + m_width, + m_height, + m_codec, + m_compression_level, + m_num_threads, + this->block_size(), + this->chunk_size() + ); + } + + /// Update the number of threads used internally by c-blosc2 for compression and decompression. Only valid for + /// CPU compression/decompression + /// + /// \param nthreads The number of threads to use for compression and decompression. + void update_nthreads(size_t nthreads) + { + m_num_threads = nthreads; + } + + /// The channel width. + /// + /// \return The width of the channel. 
+ size_t width() const noexcept + { + return m_width; + } + + /// The channel height. + /// + /// \return The height of the channel. + size_t height() const noexcept + { + return m_height; + } + + /// Retrieve the compression codec used. + /// + /// \return The compression codec. + enums::codec compression() const noexcept + { + return m_codec; + } + + /// Retrieve the compression level used. + /// + /// \return The compression level (typically from 1-9). + uint8_t compression_level() const noexcept + { + return m_compression_level; + } + + /// Retrieve the compressed data size. Throws std::runtime_error if the internal `schunk` pointer is not initialized. + /// + /// \return The size of the compressed data in bytes. + size_t compressed_bytes() const + { + if (!m_schunk) + { + throw std::runtime_error( + "Channel instance is not properly initialized, unable to get compressed size" + ); + } + + if (std::holds_alternative>(*m_schunk)) + { + return std::get>(*m_schunk).csize(); + } + else if (std::holds_alternative>(*m_schunk)) + { + return std::get>(*m_schunk).csize(); + } + return {}; + } + + /// Retrieve the uncompressed data size. Throws std::runtime_error if the internal `schunk` pointer is not initialized. + /// + /// \return The size of the uncompressed data in elements. + size_t uncompressed_size() const + { + if (!m_schunk) + { + throw std::runtime_error( + "Channel instance is not properly initialized, unable to get uncompressed size" + ); + } + + if (std::holds_alternative>(*m_schunk)) + { + return std::get>(*m_schunk).size(); + } + else if (std::holds_alternative>(*m_schunk)) + { + return std::get>(*m_schunk).size(); + } + return {}; + } + + /// Retrieve the total number of chunks the channel stores. + /// + /// \return The number of chunks.
+ size_t num_chunks() const + { + assert(m_schunk != nullptr); + + return std::visit( + [](const auto& schunk_) + { + return schunk_.num_chunks(); + }, + *m_schunk + ); + } + + /// \brief Retrieve the block size (in bytes) of the channel + /// + /// The internal blosc2 implementation reserves changing this value on compression so it may be possible + /// that this is not the value you initially set. + /// + /// \return The block size (in bytes). + size_t block_size() const + { + assert(m_schunk != nullptr); + return std::visit( + [&](auto& schunk) + { + return schunk.max_block_size(); + }, + *m_schunk + ); + } + + /// \brief Retrieve the chunk size (in bytes) of the channel + /// + /// This will be all of the chunk sizes except for the last chunk. The last chunk may be smaller so to accurately + /// capture it you should use the override with a size_t + /// + /// \return The chunk size (in bytes). + size_t chunk_size() const noexcept + { + assert(m_schunk != nullptr); + return std::visit( + [&](auto& schunk) + { + return schunk.chunk_bytes(); + }, + *m_schunk + ); + } + + size_t chunk_elems() const + { + auto chunk_size = this->chunk_size(); + assert(chunk_size % sizeof(T) == 0); + return chunk_size / sizeof(T); + } + + /// \brief Retrieve the chunk size (in bytes) of the channel at the given chunk index. + /// + /// \return The chunk size (in bytes) at index `chunk_index`. + /// + /// \throws std::out_of_range if the chunk index is invalid + size_t chunk_size(size_t chunk_index) const + { + assert(m_schunk != nullptr); + return std::visit( + [&](auto& schunk) + { + return schunk.chunk_bytes(chunk_index); + }, + *m_schunk + ); + } + + size_t chunk_elems(size_t chunk_index) const + { + auto chunk_size = this->chunk_size(chunk_index); + assert(chunk_size % sizeof(T) == 0); + return chunk_size / sizeof(T); + } + + + /// Retrieves and decompresses a chunk of data into the provided buffer. 
+ /// + /// This function retrieves the chunk at the given index from the internal `schunk`, + /// decompresses it (cpu codecs use a decompression context created for this call), and stores the result in `buffer`. + /// + /// \param buffer A span representing the destination buffer to store the decompressed data. + /// Must be large enough to hold one chunk of decompressed data. + /// \param chunk_idx The index of the chunk to retrieve. + /// + /// \throws std::runtime_error if the internal `schunk` pointer is not initialized. + void get_chunk(std::span buffer, size_t chunk_idx) const + { + if (!m_schunk) + { + throw std::runtime_error( + "Internal Error: Channel instance is not properly initialized, unable to get decompressed data" + ); + } + + std::visit( + [&](const auto& schunk) + { + if (enums::is_gpu_codec(m_codec)) + { + schunk.chunk(buffer, chunk_idx); + } + else + { + auto compression_context = this->create_compression_context( + m_codec, + m_num_threads, + m_compression_level, + this->block_size(), + 0 + ); + + schunk.chunk( + std::get(compression_context).decompression_ctx.get(), + buffer, + chunk_idx + ); + } + }, + *m_schunk + ); + } + + /// Compresses and sets a chunk of data from the provided buffer at the specified index. + /// + /// This function compresses the data in the provided buffer using the current compression + /// context and writes it into the internal `schunk` at the given index. + /// + /// \param buffer A span representing the source data to be compressed and stored. + /// \param chunk_idx The index of the chunk to overwrite or set with the compressed data. + /// + /// \throws std::runtime_error if the internal `schunk` pointer is not initialized.
+ void set_chunk(std::span buffer, size_t chunk_idx) + { + if (!m_schunk) + { + throw std::runtime_error( + "Internal Error: Channel instance is not properly initialized, unable to set data" + ); + } + + std::visit( + [&](auto& schunk) + { + if (buffer.size() != schunk.chunk_elements(chunk_idx)) + { + throw std::invalid_argument( + std::format( + "Invalid chunk passed to `set_chunk`. Expected this to contain exactly {} elements." + " Instead it holds {}. This is likely due to having not correctly checked the number" + " of elements.", + schunk.chunk_elements(chunk_idx), + buffer.size() + ) + ); + } + + if (enums::is_gpu_codec(m_codec)) + { + auto compression_context = this->create_compression_context( + m_codec, + m_num_threads, + m_compression_level, + this->block_size(), + cuda::current_device() + ); + + schunk.set_chunk( + std::get(compression_context).ctx, + buffer, + chunk_idx + ); + } + else + { + auto compression_context = this->create_compression_context( + m_codec, + m_num_threads, + m_compression_level, + this->block_size(), + 0 + ); + + schunk.set_chunk( + std::get(compression_context).compression_ctx, + buffer, + chunk_idx + ); + } + }, + *m_schunk + ); + } + + /// Get the decompressed data as a vector. + /// + /// \throws std::runtime_error if the internal `schunk` pointer is not initialized. + /// + /// \return A vector containing the decompressed data. 
+ std::vector get_decompressed() const + { + if (!m_schunk) + { + throw std::runtime_error( + "Internal Error: Channel instance is not properly initialized, unable to get decompressed data" + ); + } + + + return std::visit( + [&](const auto& schunk) + { + if (enums::is_gpu_codec(m_codec)) + { + auto compression_context = this->create_compression_context( + m_codec, + m_num_threads, + m_compression_level, + this->block_size(), + cuda::current_device() + ); + return schunk.to_uncompressed(std::get(compression_context)); + } + auto compression_context = this->create_compression_context( + m_codec, + m_num_threads, + m_compression_level, + this->block_size(), + 0 + ); + return schunk.to_uncompressed(std::get(compression_context)); + }, + *m_schunk + ); + } + + /// Equality operators, compares pointers to check for equality + bool operator==(const channel& other) const noexcept + { + return this == &other; + } + + /// \brief Create a compression context for the given codec. + /// + /// This will initialize either a gpu or cpu compressor/decompressor, returning it. This is primarily + /// for internal API usage. + /// + /// \param codec The compression codec, the type of context to initialize is inferred from this. + /// \param num_threads The compression/decompression threads. Only used when the codec is cpu-based + /// \param compression_level The compression level. Only used when the codec is cpu-based + /// \param block_size The block size for the compressed data. + /// \param gpu_device The GPU device to use for compression/decompression. 
Only used when the codec is gpu-based + static compression_context_var create_compression_context( + const enums::codec codec, + const size_t num_threads, + const size_t compression_level, + const size_t block_size, + const int gpu_device + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + if (enums::is_gpu_codec(codec)) + { + return gpu_compression_context{ + .ctx = cuda::make_compression_context(codec, gpu_device, block_size) + + }; + } + else + { + return cpu_compression_context{ + .compression_ctx = blosc2::create_compression_context( + num_threads, + codec, + static_cast(compression_level), + block_size + ), + .decompression_ctx = blosc2::create_decompression_context(num_threads), + .nthreads = num_threads + }; + } + } + + private + : + friend struct image; + + /// The storage for the internal data, stored contiguously in a compressed data format + schunk_var_ptr m_schunk = nullptr; + /// Keeps the globally discoverable scratch pool alive for as long as this channel exists. + std::shared_ptr m_scratch_pool = nullptr; + /// The compression codec in use. + enums::codec m_codec = enums::codec::lz4; + /// Compression level. + uint8_t m_compression_level = 9; + /// The number of threads used for cpu compression/decompression (blosc2 only). + size_t m_num_threads = std::thread::hardware_concurrency() / 2; + + /// The width and height of the channel. + size_t m_width = 1; + size_t m_height = 1; + }; +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/constants.h b/compressed_image/include/compressed/constants.h index 6f548ed..afcf899 100644 --- a/compressed_image/include/compressed/constants.h +++ b/compressed_image/include/compressed/constants.h @@ -2,12 +2,11 @@ #include "macros.h" - -namespace NAMESPACE_COMPRESSED_IMAGE +namespace +NAMESPACE_COMPRESSED_IMAGE { - /// Default chunk size for blosc2 super-chunks. 
This equates to 4MB or one 2048*2048 channel - constexpr static inline std::size_t s_default_chunksize = 4'194'304; - /// Default block size for blosc2 chunks. This equates to 16 scanlines in that same 2048*2048 channel. - constexpr static inline std::size_t s_default_blocksize = 32'768; - -} // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file + /// Default chunk size for blosc2 super-chunks. This equates to 4MB or one 2048*2048 channel + constexpr static inline size_t s_default_chunksize = 4'194'304; + /// Default block size for blosc2 chunks. This equates to 16 scanlines in that same 2048*2048 channel. + constexpr static inline size_t s_default_blocksize = 32'768; +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/context.h b/compressed_image/include/compressed/context.h new file mode 100644 index 0000000..9ae1176 --- /dev/null +++ b/compressed_image/include/compressed/context.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "blosc2/wrapper.h" +#include "compressed/macros.h" +#include "cuda/compressors/base.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + struct cpu_compression_context + { + blosc2::context_ptr compression_ctx = nullptr; + blosc2::context_ptr decompression_ctx = nullptr; + + size_t nthreads{}; + }; + + struct gpu_compression_context + { + cuda::nvcomp_context ctx{}; + }; + + using compression_context_var = std::variant; +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/compression.h b/compressed_image/include/compressed/cuda/compression.h new file mode 100644 index 0000000..4be5f58 --- /dev/null +++ b/compressed_image/include/compressed/cuda/compression.h @@ -0,0 +1,83 @@ +#pragma once + +#include "compressed/enums.h" + +#include "compressed/cuda/compressors/lz4.h" +#include "compressed/cuda/compressors/snappy.h" +#include "compressed/cuda/compressors/zstd.h" +#include "compressed/cuda/compressors/deflate.h" +#include "compressed/cuda/compressors/gdeflate.h" 
+#include "compressed/cuda/compressors/cascaded.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + template + using compressor_var = std::variant< + lz4_compressor, + snappy_compressor, + zstd_compressor, + deflate_compressor, + gdeflate_compressor, + cascaded_compressor>; + + + template + compressor_var make_compressor(NAMESPACE_COMPRESSED_IMAGE::enums::codec codec) + { + switch (codec) + { + case NAMESPACE_COMPRESSED_IMAGE::enums::codec::lz4_gpu: + return lz4_compressor{}; + case NAMESPACE_COMPRESSED_IMAGE::enums::codec::snappy_gpu: + return snappy_compressor{}; + case NAMESPACE_COMPRESSED_IMAGE::enums::codec::zstd_gpu: + return zstd_compressor{}; + case NAMESPACE_COMPRESSED_IMAGE::enums::codec::deflate_gpu: + return deflate_compressor{}; + case NAMESPACE_COMPRESSED_IMAGE::enums::codec::gdeflate_gpu: + return gdeflate_compressor{}; + case NAMESPACE_COMPRESSED_IMAGE::enums::codec::cascaded_gpu: + return cascaded_compressor{}; + default: + throw std::invalid_argument( + std::format("Unknown or unsupported gpu codec: {}", static_cast(codec)) + ); + } + } + + template + compressor_var make_compressor(const cuda::compressed_chunk& chunk) + { + return make_compressor(chunk.context.codec); + } + + + template + nvcomp_context make_compression_context( + const NAMESPACE_COMPRESSED_IMAGE::enums::codec codec, + const int gpu_device, + const size_t block_size + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + auto compressor = make_compressor(codec); + + return std::visit( + [&](auto&& compressor_raw) + { + return nvcomp_context{ + .comp_options = compressor_raw.default_compression_opts(), + .decomp_options = compressor_raw.default_decompression_opts(), + .block_size = block_size, + .codec = codec, + .gpu_device = gpu_device + }; + }, + compressor + ); + } + } // namespace cuda +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/compressors/base.h b/compressed_image/include/compressed/cuda/compressors/base.h new file mode 100644 
index 0000000..ecc028c --- /dev/null +++ b/compressed_image/include/compressed/cuda/compressors/base.h @@ -0,0 +1,701 @@ +/* +Entry point for the various compressors of nvcomp +*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "compressed/macros.h" +#include "compressed/constants.h" +#include "compressed/enums.h" +#include "compressed/util.h" + +#include "compressed/cuda/memory.h" +#include "compressed/cuda/enums.h" +#include "compressed/cuda/compressors/util.h" +#include "compressed/cuda/cuda_hook.h" +#include "compressed/cuda/gpu.h" + + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + /// \brief compression options for the various nvcomp compressors. + using compression_options = std::variant< + nvcompBatchedLZ4CompressOpts_t, + nvcompBatchedCascadedCompressOpts_t, + nvcompBatchedDeflateCompressOpts_t, + nvcompBatchedGdeflateCompressOpts_t, + nvcompBatchedGzipCompressOpts_t, + nvcompBatchedSnappyCompressOpts_t, + nvcompBatchedZstdCompressOpts_t + >; + + /// \brief decompression options for the various nvcomp compressors. + using decompression_options = std::variant< + nvcompBatchedLZ4DecompressOpts_t, + nvcompBatchedCascadedDecompressOpts_t, + nvcompBatchedDeflateDecompressOpts_t, + nvcompBatchedGdeflateDecompressOpts_t, + nvcompBatchedGzipDecompressOpts_t, + nvcompBatchedSnappyDecompressOpts_t, + nvcompBatchedZstdDecompressOpts_t + >; + + + /// \brief Compression context for nvcomp/cuda based compression and decompression. + /// + /// Similar in scope to a `blosc2_context` struct but instead holds gpu-specific information. + struct nvcomp_context + { + /// The options to use for compression + compression_options comp_options{}; + + /// The options to use for decompression + decompression_options decomp_options{}; + + /// The block size used for compression. 
All blocks will have this size except for the last one which may + /// be smaller. The input is split into this many blocks. If the requested block size exceeds what the + /// compressor allows, it is internally reduced. A typical recommended value is 65536 (2^16). + size_t block_size = s_default_blocksize; + + /// The compression codec to be used. Must be one of the GPU codecs to be valid. + NAMESPACE_COMPRESSED_IMAGE::enums::codec codec{}; + + /// The GPU device to use for compression/decompression. + int gpu_device = 0; + }; + + + /// A single compressed chunk holding a collection of blocks inside it. Similar to a blosc2 chunk but instead + /// of being stored as a single chunk + /// + /// .. note:: + /// The block sizes of the chunk will always be equal to `max_block_size` except for the last block which + /// may be smaller. + template + struct compressed_chunk + { + /// \brief The compressed data, stored on device as a std::vector. + cuda_device_buffer_async compressed_data; + + /// \brief The compressed size of each block in bytes. + std::vector compressed_block_sizes{}; + + /// \brief The uncompressed block sizes of all the blocks inside. Expressed as bytes. + std::vector uncompressed_block_sizes{}; + + /// \brief The compression context used for compression/decompression. Once set this may not be modified. 
+ nvcomp_context context{}; + + compressed_chunk() = default; + + compressed_chunk( + cuda_device_buffer_async _compressed_data, + std::vector _comp_sizes, + std::vector _uncomp_sizes, + nvcomp_context ctx) + : compressed_data(std::move(_compressed_data)), + compressed_block_sizes(std::move(_comp_sizes)), + uncompressed_block_sizes(std::move(_uncomp_sizes)), + context(std::move(ctx)) + { + } + + compressed_chunk(const compressed_chunk&) = delete; + compressed_chunk& operator=(const compressed_chunk&) = delete; + compressed_chunk(compressed_chunk&&) noexcept = default; + compressed_chunk& operator=(compressed_chunk&&) noexcept = default; + + [[nodiscard]] size_t csize() const + { + return this->compressed_data.bytes(); + } + + [[nodiscard]] size_t size() const + { + return this->byte_size() / sizeof(T); + } + + [[nodiscard]] size_t byte_size() const + { + return std::accumulate(uncompressed_block_sizes.begin(), uncompressed_block_sizes.end(), size_t{0}); + } + + [[nodiscard]] inline size_t max_block_size() const + { + if (uncompressed_block_sizes.empty()) + return 0; + + return uncompressed_block_sizes.at(0); + } + }; + + + namespace detail + { + /// \brief base cuda-based compressor base that provides utility functions for the various compressor implementations + /// + /// Note: we use blosc2 terminology for a lot of these calls i.e. what nvcomp may call 'chunks' we call blocks + /// as each unit we are compressing is already a chunk. This gives us the same 3D data structure where + /// we cascade from channel -> chunks -> blocks. 
Unlike with blosc2 the blocks here are transparent to us + /// but we hide them from the call site + /// + /// When re-implementing this for different compression procedures, one needs to implement: + /// + /// - codec <-- The codec associated with this compressor + /// - default_compression_opts <-- Default compression options + /// - default_decompression_opts <-- Default decompression options + /// - get_temp_bytes <-- The number of scratch bytes required for compression/decompression + /// - block_max_compressed_size <-- The max block size of a compressed block, given the compression opts + /// - max_block_size <-- The overall max (uncompressed) size a block may have. + /// - compression_impl <-- The implementation of the compression procedure + /// - decompression_impl <-- The implementation of the decompression procedure + template + struct compressor + { + virtual ~compressor() = default; + + /// \brief Compress a CPU buffer into a compressed chunk using CUDA. + /// + /// This function transfers the input data to the specified CUDA device, splits it + /// into fixed-size blocks, compresses each block asynchronously, and copies the + /// compressed results back to host memory. The compression is performed using + /// the algorithm configured by \p context. + /// + /// \param data The input data buffer to compress (host memory). + /// \param context The compression/decompression context used for the generated blocks. + /// + /// \return A \c compressed_chunk containing the compressed blocks, their sizes, + /// and codec metadata describing the compression algorithm used. + /// + /// \throws std::bad_variant_access If the provided \p options are not valid for + /// the current compression algorithm. + /// \throws std::runtime_error If any CUDA memory allocation, copy, or kernel + /// execution fails. + /// + /// \note Compression uses asynchronous CUDA operations with per-thread streams and + /// memory pooling. 
Data is synchronized before returning, but overlapping + /// work on other streams may proceed concurrently. + compressed_chunk compress(std::span data, nvcomp_context context) const + { + _COMPRESSED_PROFILE_FUNCTION(); + + device_guard guard(context.gpu_device); + cuda_api::instance().set_mem_pool_size(context.gpu_device); + + context.block_size = this->fit_block_size(context.block_size); + + // ################################################################################## + // Set up device uncompressed memory + // ################################################################################## + const size_t num_blocks = (data.size() * sizeof(T) + context.block_size - 1) / context.block_size; + auto block_sizes = this->generate_block_sizes( + data.size() * sizeof(T), + context.block_size, + num_blocks + ); + + auto device_uncompressed_data = make_device_buffer_async(data.size()); + cuda_api::instance().memcpy_async( + device_uncompressed_data.get_raw(), + static_cast(data.data()), + device_uncompressed_data.bytes(), + cudaMemcpyHostToDevice + ); + + auto device_block_pointers = compressor::generate_device_block_pointers( + device_uncompressed_data, + context.block_size, + num_blocks + ); + auto device_block_sizes = cuda_device_buffer_async::from_host(block_sizes); + + // ################################################################################## + // Set up device temporary compressed memory (Flat Buffer Strategy) + // ################################################################################## + auto max_block_compressed_size = this->block_max_compressed_size( + context.block_size, + context.comp_options + ); + + auto temp_compressed_buffer = make_device_buffer_async( + num_blocks * max_block_compressed_size + ); + std::byte* base_device_ptr = temp_compressed_buffer.get(); + + std::vector host_compressed_ptrs(num_blocks); + for (size_t i = 0; i < num_blocks; ++i) + { + host_compressed_ptrs[i] = static_cast(base_device_ptr + (i * 
max_block_compressed_size)); + } + + auto device_compressed_ptrs = cuda_device_buffer_async::from_host(host_compressed_ptrs); + + // ################################################################################## + // Set up scratch buffer + // ################################################################################## + auto device_temp_compression_buffer = this->generate_temp_buffer( + context.block_size, + num_blocks, + context.comp_options + ); + + // ################################################################################## + // Call the compression routine + // ################################################################################## + auto device_compressed_bytes = make_device_buffer_async(num_blocks); + auto device_statuses = make_device_buffer_async(num_blocks); + + this->compression_impl( + context.block_size, + num_blocks, + device_block_pointers, + device_block_sizes, + device_temp_compression_buffer, + device_compressed_ptrs, + device_compressed_bytes, + device_statuses, + context.comp_options + ); + + // ################################################################################## + // Copy sizes back to host & allocate flat fitted GPU buffer + // ################################################################################## + auto compressed_bytes_pinned = make_host_mem(num_blocks); + + device_compressed_bytes.to_host( + std::span(compressed_bytes_pinned.get(), num_blocks) + ); + + cuda_api::instance().stream_synchronize(cudaStreamPerThread); + + this->validate_per_block_statuses(device_statuses); + + // Compute tracking offsets and total required memory size (Prefix Sum) + size_t total_fitted_bytes = 0; + std::vector host_block_offsets(num_blocks); + std::vector host_compressed_sizes(num_blocks); + + for (size_t i = 0; i < num_blocks; ++i) + { + size_t actual_size = compressed_bytes_pinned.get()[i]; + host_compressed_sizes[i] = actual_size; + host_block_offsets[i] = total_fitted_bytes; + total_fitted_bytes += 
actual_size; + } + + // Allocate ONE single, tightly fitted GPU buffer for output + auto fitted_device_buffer = make_device_buffer_async(total_fitted_bytes); + std::byte* dest_base_ptr = fitted_device_buffer.get(); + + // Parallel device-to-device streaming memcpys + for (size_t i = 0; i < num_blocks; ++i) + { + const size_t actual_size = host_compressed_sizes[i]; + if (actual_size == 0) continue; + + void* src_ptr = static_cast(base_device_ptr + (i * max_block_compressed_size)); + void* dst_ptr = static_cast(dest_base_ptr + host_block_offsets[i]); + + cuda_api::instance().memcpy_async( + dst_ptr, + src_ptr, + actual_size, + cudaMemcpyDeviceToDevice + ); + } + + // ################################################################################## + // Finalize async work + // ################################################################################## + cuda_api::instance().stream_synchronize(cudaStreamPerThread); + + return compressed_chunk{ + std::move(fitted_device_buffer), + std::move(host_compressed_sizes), + std::move(block_sizes), + std::move(context) + }; + }; + + + /// \brief Decompress a compressed_chunk directly into a preallocated CPU buffer. + /// + /// This function decompresses all blocks in \p chunk and writes the output + /// sequentially into \p output. The caller must ensure that \p output is + /// large enough to hold the full decompressed data. + /// + /// \param chunk The compressed data (blocks + sizes). + /// \param output The preallocated span of memory where the uncompressed data + /// will be written. + /// + /// \throws std::runtime_error on CUDA or nvCOMP failure. 
+ void decompress(const compressed_chunk& chunk, std::span output) const + { + _COMPRESSED_PROFILE_FUNCTION(); + + nvcomp_context context = chunk.context; + + device_guard guard(context.gpu_device); + cuda_api::instance().set_mem_pool_size(context.gpu_device); + + const size_t num_blocks = chunk.compressed_block_sizes.size(); + + // ################################################################################## + // Slice flat buffer pointers back out on the host via prefix tracking + // ################################################################################## + std::vector host_compressed_ptrs(num_blocks); + size_t running_offset = 0; + const std::byte* base_compressed_ptr = chunk.compressed_data.get(); + + for (size_t i = 0; i < num_blocks; ++i) + { + host_compressed_ptrs[i] = static_cast(base_compressed_ptr + running_offset); + running_offset += chunk.compressed_block_sizes[i]; + } + + auto device_compressed_ptrs = cuda_device_buffer_async::from_host(host_compressed_ptrs); + auto device_compressed_bytes = cuda_device_buffer_async::from_host( + chunk.compressed_block_sizes + ); + + // ################################################################################## + // Allocate single contiguous device buffer for all output + // ################################################################################## + auto device_output = make_device_buffer_async(output.size()); + + std::vector host_uncompressed_ptrs(num_blocks); + size_t offset_bytes = 0; + for (size_t i = 0; i < num_blocks; ++i) + { + host_uncompressed_ptrs[i] = reinterpret_cast(device_output.get() + (offset_bytes / sizeof + (T))); + offset_bytes += context.block_size; + } + + auto device_uncompressed_ptrs = cuda_device_buffer_async::from_host(host_uncompressed_ptrs); + std::vector _tmp_block_sizes = chunk.uncompressed_block_sizes; + auto device_uncompressed_bytes = cuda_device_buffer_async::from_host(_tmp_block_sizes); + + // 
################################################################################## + // Allocate scratch buffer for decompression + // ################################################################################## + auto device_temp = this->generate_temp_buffer( + context.block_size, + num_blocks, + context.decomp_options + ); + auto device_statuses = make_device_buffer_async(num_blocks); + + // ################################################################################## + // Call algorithm-specific device decompression + // ################################################################################## + decompression_impl( + num_blocks, + device_compressed_ptrs, + device_compressed_bytes, + device_temp, + device_uncompressed_ptrs, + device_uncompressed_bytes, + device_statuses, + context.decomp_options + ); + + // ################################################################################## + // Copy result back to host + // ################################################################################## + cuda_api::instance().memcpy_async( + static_cast(output.data()), + device_output.get_raw(), + output.size() * sizeof(T), + cudaMemcpyDeviceToHost + ); + + cuda_api::instance().stream_synchronize(cudaStreamPerThread); + + compressor::validate_per_block_statuses(device_statuses); + } + + /// \brief The codec associated with the compressor. + [[nodiscard]] virtual NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept + = + 0; + + [[nodiscard]] virtual compression_options default_compression_opts() const noexcept + = + 0; + [[nodiscard]] virtual decompression_options default_decompression_opts() const noexcept + = + 0; + + /// \brief The max block size allowed for a given compressor. Implementation defined. + [[nodiscard]] virtual size_t max_block_size() const noexcept + = + 0; + + /// \brief Fits the given `block_size` to be <= what the compressor allows. 
+ /// + /// Additionally, ensures the block size aligns to T such that we can go back and forth from + /// std::byte <-> T cleanly. + [[nodiscard]] size_t fit_block_size(size_t block_size) const noexcept + { + auto fitted = std::min(block_size, this->max_block_size()); + fitted -= fitted % sizeof(T); + return fitted; + }; + + private + : + /// ################################################################################## + /// Pure virtual function, dependent on compressor. + /// ################################################################################## + + /// \brief Retrieve the number of temporary device bytes needed for the compression/decompression procedure + /// + /// \param block_size The block size of one of the sub-streams + /// \param num_blocks The total number of blocks + /// \param options The compression/decompression options for which to get the number of temporary + /// bytes + virtual size_t get_temp_bytes( + size_t block_size, + size_t num_blocks, + std::variant options + ) const + = + 0; + + /// \brief Retrieve the maximum size required for compressing a single block for the given options + /// + /// \param block_size The size of a single block + /// \param options The compression options which are used for compression. + virtual size_t block_max_compressed_size(size_t block_size, compression_options& options) const + = + 0; + + + /// \brief Call the underlying compression implementation of the set of blocks. + /// + /// All memory allocation needs to happen before this point, as this assumes all of these buffers have + /// been allocated and filled + /// + /// \param block_size The overall block size, all blocks except for the last should have + /// this size. 
+ /// \param num_blocks The overall number of blocks + /// \param uncompressed_block_ptrs The pointers to the start of each uncompressed block + /// \param uncompressed_block_sizes The size of each uncompressed block + /// \param scratch_space The scratch bytes used by the compressor during compression. + /// \param compressed_block_ptrs To be filled out by the implementation, the pointers to the start + /// of each (preallocated) compressed block + /// \param compressed_block_sizes To be filled out by the implementation, the sizes of the compressed + /// blocks. + /// \param block_statuses The statuses per compressed block, these live on the GPU and must be + /// copied back for introspection. + /// \param options The compression options to use, must be valid for the current + /// compressor. + virtual void compression_impl( + size_t block_size, + size_t num_blocks, + const cuda_device_buffer_async& uncompressed_block_ptrs, + const cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& compressed_block_ptrs, + cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const compression_options& options + ) const + = + 0; + + /// \brief Low-level device decompression implementation. + /// + /// This function should be implemented by the derived class for a specific + /// compression algorithm (e.g., lz, zstd). It operates entirely on device + /// memory and writes decompressed data into `uncompressed_block_ptrs`. + /// + /// \param num_blocks The overall number of blocks + /// \param compressed_block_ptrs Device buffer containing pointers to compressed blocks. + /// \param compressed_block_sizes Device buffer containing sizes of compressed blocks. + /// \param scratch_space Temporary device buffer allocated for decompression. 
+ /// \param uncompressed_block_ptrs The pointers to the start of each uncompressed block, filled out by + /// this function + /// \param uncompressed_block_sizes The size of each uncompressed block, filled out by this function. + /// \param block_statuses The statuses per compressed block, these live on the GPU and must be + /// copied back for introspection. + /// \param options Algorithm-specific decompression options. + virtual void decompression_impl( + size_t num_blocks, + const cuda_device_buffer_async& compressed_block_ptrs, + const cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& uncompressed_block_ptrs, + cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const decompression_options& options + ) const + = + 0; + + private + : + /// ################################################################################## + /// Generic functions across all compressors + /// ################################################################################## + + + /// \brief Validate all the errors per-block collected during compression/decompression and throw these + /// as aggregated exception + /// + /// This function NEEDS to be called after synchronization of the stream and the compression/decompression + /// operations as otherwise these statuses are not yet guaranteed to be valid. + /// + /// \param device_statuses The device-memory held nvcompStatus_t vector. 
+ void validate_per_block_statuses(cuda_device_buffer_async& device_statuses) const + { + _COMPRESSED_PROFILE_FUNCTION(); + auto status_buffer = NAMESPACE_COMPRESSED_IMAGE::cuda::make_host_mem( + device_statuses.size + ); + + cuda_api::instance().memcpy_async( + status_buffer.get(), + device_statuses.get(), + device_statuses.bytes(), + cudaMemcpyDeviceToHost + ); + cuda_api::instance().stream_synchronize(cudaStreamPerThread); + + + std::vector error_messages; + for (size_t i = 0; i < device_statuses.size; ++i) + { + if (status_buffer.get()[i] != nvcompStatus_t::nvcompSuccess) + { + error_messages.emplace_back( + std::format( + "block {} failed with nvcomp status: '{}'", + i, + cuda::util::status_t_to_string(status_buffer.get()[i]) + ) + ); + } + } + + if (!error_messages.empty()) + { + std::string joined_errors; + joined_errors.reserve(error_messages.size() * 64); + + for (size_t j = 0; j < error_messages.size(); ++j) + { + joined_errors += error_messages[j]; + if (j + 1 < error_messages.size()) + { + joined_errors += '\n'; + } + } + + throw std::runtime_error( + std::format( + "compression/decompression failed for {} out of {} blocks:\n{}", + error_messages.size(), + device_statuses.size, + joined_errors + ) + ); + } + } + + + /// \brief Generates the temporary buffer the compressor uses internally. 
+ /// + /// \param block_size The block size to use for compression + /// \param num_blocks The number of blocks to compress + /// \param options The compression/decompression options that will be used + cuda_device_buffer_async generate_temp_buffer( + const size_t block_size, + const size_t num_blocks, + std::variant options + ) const + { + _COMPRESSED_PROFILE_FUNCTION(); + auto size = this->get_temp_bytes(block_size, num_blocks, options); + return make_device_buffer_async(size); + } + + /// \brief Generate a buffer (vector) containing pointers into the individual blocks + /// + /// These pointers index into the device memory from `device_buffer` + /// + /// \param device_uncompressed_data The device memory buffer which holds the uncompressed data + /// \param block_size The size of a single block + /// \param num_blocks The number of blocks `device_buffer` stores + static cuda_device_buffer_async generate_device_block_pointers( + const cuda_device_buffer_async& device_uncompressed_data, + const size_t block_size, + const size_t num_blocks + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + std::vector ptrs(num_blocks); + + auto device_base_ptr = static_cast(device_uncompressed_data.get_raw()); + for (size_t i = 0; i < num_blocks; ++i) + { + ptrs[i] = device_base_ptr + block_size * i; + } + + // Now that we have this memory on the host, we memcpy it over + auto device_buffer = make_device_buffer_async(num_blocks); + cuda_api::instance().memcpy_async( + device_buffer.get_raw(), + ptrs.data(), + device_buffer.bytes(), + cudaMemcpyHostToDevice + ); + + return std::move(device_buffer); + } + + /// \brief Compute a vector of all the block sizes of the uncompressed data. + /// + /// All elements will be == to `block_size` except the last element which will be the mod. + /// + /// \param num_bytes The total uncompressed bytes + /// \param block_size The block size, already fitted to fit within + /// \param num_blocks The number of blocks to generate. 
+ std::vector generate_block_sizes( + const size_t num_bytes, + const size_t block_size, + const size_t num_blocks + ) const + { + _COMPRESSED_PROFILE_FUNCTION(); + std::vector out(num_blocks, block_size); + if (!out.empty()) + { + out[out.size() - 1] = num_bytes - (block_size * (num_blocks - 1)); + } + return out; + } + }; + } + } // namespace cuda +} // namespace NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/compressors/cascaded.h b/compressed_image/include/compressed/cuda/compressors/cascaded.h new file mode 100644 index 0000000..3e02f9c --- /dev/null +++ b/compressed_image/include/compressed/cuda/compressors/cascaded.h @@ -0,0 +1,197 @@ +#pragma once + +#include "compressed/macros.h" + +#include "compressed/cuda/nvcomp_hook.h" +#include "compressed/cuda/compressors/base.h" +#include "compressed/cuda/compressors/util.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE::cuda +{ + template + struct cascaded_compressor final : public detail::compressor + { + [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override + { + return NAMESPACE_COMPRESSED_IMAGE::enums::codec::cascaded_gpu; + }; + + [[nodiscard]] compression_options default_compression_opts() const noexcept override + { + auto opts = nvcompBatchedCascadedCompressDefaultOpts; + opts.type = util::to_nvcomp_type(); + if (opts.type == nvcompType_t::NVCOMP_TYPE_FLOAT16) + { + opts.type = nvcompType_t::NVCOMP_TYPE_SHORT; + } + return opts; + }; + + [[nodiscard]] decompression_options default_decompression_opts() const noexcept override + { + return nvcompBatchedCascadedDecompressDefaultOpts; + }; + + [[nodiscard]] size_t max_block_size() const noexcept override + { + return nvcompCascadedCompressionMaxAllowedChunkSize; + }; + + private: + size_t get_temp_bytes( + size_t block_size, + size_t num_blocks, + std::variant options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + if (std::holds_alternative(options)) + { + size_t temp_bytes{}; + const 
auto status = nvcomp_api::instance().CascadedCompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "cascaded: Unable to retrieve the scratch bytes required for compression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().CascadedDecompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "cascaded: Unable to retrieve the scratch bytes required for decompression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + + size_t block_max_compressed_size(size_t block_size, compression_options& options) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + size_t max_bytes = 0; + const auto status = nvcomp_api::instance().CascadedCompressGetMaxOutputChunkSize( + block_size, + std::get(options), + &max_bytes + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "cascaded: Unable to retrieve the maximum compressed size for a block due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return max_bytes; + } + + + void compression_impl( + size_t block_size, + size_t num_blocks, + const cuda_device_buffer_async& uncompressed_block_ptrs, + const cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& compressed_block_ptrs, + cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const compression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + const auto 
status = nvcomp_api::instance().CascadedCompressAsync( + uncompressed_block_ptrs.get(), + uncompressed_block_sizes.get(), + block_size, + num_blocks, + scratch_space.get_raw(), + scratch_space.bytes(), + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "cascaded: nvcompBatchedCascadedCompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + }; + + void decompression_impl( + size_t num_blocks, + const cuda_device_buffer_async& compressed_block_ptrs, + const cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& uncompressed_block_ptrs, + cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const decompression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + auto block_sizes_out = cuda::make_device_buffer_async(num_blocks); + + const auto status = nvcomp_api::instance().CascadedDecompressAsync( + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + uncompressed_block_sizes.get(), + block_sizes_out.get(), + num_blocks, + scratch_space.get_raw(), + scratch_space.size, + uncompressed_block_ptrs.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "cascaded: nvcompBatchedCascadedDecompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + } + }; +} // namespace NAMESPACE_COMPRESSED_IMAGE::cuda diff --git a/compressed_image/include/compressed/cuda/compressors/deflate.h b/compressed_image/include/compressed/cuda/compressors/deflate.h new file mode 100644 index 0000000..bac2c4c --- /dev/null +++ 
b/compressed_image/include/compressed/cuda/compressors/deflate.h @@ -0,0 +1,195 @@ +#pragma once + +#include "compressed/macros.h" + +#include "compressed/cuda/nvcomp_hook.h" +#include "compressed/cuda/compressors/base.h" +#include "compressed/cuda/compressors/util.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + template + struct deflate_compressor final : public detail::compressor + { + [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override + { + return NAMESPACE_COMPRESSED_IMAGE::enums::codec::deflate_gpu; + }; + + [[nodiscard]] compression_options default_compression_opts() const noexcept override + { + // This defaults to a low compression, high throughput mode. + return nvcompBatchedDeflateCompressDefaultOpts; + }; + + [[nodiscard]] decompression_options default_decompression_opts() const noexcept override + { + return nvcompBatchedDeflateDecompressDefaultOpts; + }; + + [[nodiscard]] size_t max_block_size() const noexcept override + { + return nvcompDeflateCompressionMaxAllowedChunkSize; + }; + + private: + size_t get_temp_bytes( + size_t block_size, + size_t num_blocks, + std::variant options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + if (std::holds_alternative(options)) + { + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().DeflateCompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "deflate: Unable to retrieve the scratch bytes required for compression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().DeflateDecompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != 
nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "deflate: Unable to retrieve the scratch bytes required for decompression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + + size_t block_max_compressed_size(size_t block_size, compression_options& options) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + size_t max_bytes = 0; + const auto status = nvcomp_api::instance().DeflateCompressGetMaxOutputChunkSize( + block_size, + std::get(options), + &max_bytes + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "deflate: Unable to retrieve the maximum compressed size for a block due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return max_bytes; + } + + + void compression_impl( + size_t block_size, + size_t num_blocks, + const cuda_device_buffer_async& uncompressed_block_ptrs, + const cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& compressed_block_ptrs, + cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const compression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + const auto status = nvcomp_api::instance().DeflateCompressAsync( + uncompressed_block_ptrs.get(), + uncompressed_block_sizes.get(), + block_size, + num_blocks, + scratch_space.get_raw(), + scratch_space.bytes(), + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "deflate: nvcompBatchedDeflateCompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + }; + + void decompression_impl( + size_t num_blocks, + const cuda_device_buffer_async& compressed_block_ptrs, + 
const cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& uncompressed_block_ptrs, + cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const decompression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + auto block_sizes_out = cuda::make_device_buffer_async(num_blocks); + + const auto status = nvcomp_api::instance().DeflateDecompressAsync( + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + uncompressed_block_sizes.get(), + block_sizes_out.get(), + num_blocks, + scratch_space.get_raw(), + scratch_space.size, + uncompressed_block_ptrs.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "deflate: nvcompBatchedDeflateDecompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + } + }; + } // namespace cuda +} // namespace NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/compressors/gdeflate.h b/compressed_image/include/compressed/cuda/compressors/gdeflate.h new file mode 100644 index 0000000..59579e6 --- /dev/null +++ b/compressed_image/include/compressed/cuda/compressors/gdeflate.h @@ -0,0 +1,192 @@ +#pragma once + +#include "compressed/macros.h" + +#include "compressed/cuda/nvcomp_hook.h" +#include "compressed/cuda/compressors/base.h" +#include "compressed/cuda/compressors/util.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE::cuda +{ + template + struct gdeflate_compressor final : public detail::compressor + { + [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override + { + return NAMESPACE_COMPRESSED_IMAGE::enums::codec::gdeflate_gpu; + }; + + [[nodiscard]] compression_options default_compression_opts() const noexcept override + { + // This defaults to a low compression, high 
throughput mode. + return nvcompBatchedGdeflateCompressDefaultOpts; + }; + + [[nodiscard]] decompression_options default_decompression_opts() const noexcept override + { + return nvcompBatchedGdeflateDecompressDefaultOpts; + }; + + [[nodiscard]] size_t max_block_size() const noexcept override + { + return nvcompGdeflateCompressionMaxAllowedChunkSize; + }; + + private: + size_t get_temp_bytes( + size_t block_size, + size_t num_blocks, + std::variant options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + if (std::holds_alternative(options)) + { + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().GdeflateCompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "gdeflate: Unable to retrieve the scratch bytes required for compression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().GdeflateDecompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "gdeflate: Unable to retrieve the scratch bytes required for decompression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + + size_t block_max_compressed_size(size_t block_size, compression_options& options) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + size_t max_bytes = 0; + const auto status = nvcomp_api::instance().GdeflateCompressGetMaxOutputChunkSize( + block_size, + std::get(options), + &max_bytes + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "gdeflate: Unable to retrieve the maximum compressed size for a block 
due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return max_bytes; + } + + + void compression_impl( + size_t block_size, + size_t num_blocks, + const cuda_device_buffer_async& uncompressed_block_ptrs, + const cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& compressed_block_ptrs, + cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const compression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + const auto status = nvcomp_api::instance().GdeflateCompressAsync( + uncompressed_block_ptrs.get(), + uncompressed_block_sizes.get(), + block_size, + num_blocks, + scratch_space.get_raw(), + scratch_space.bytes(), + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "gdeflate: nvcompBatchedGdeflateCompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + }; + + void decompression_impl( + size_t num_blocks, + const cuda_device_buffer_async& compressed_block_ptrs, + const cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& uncompressed_block_ptrs, + cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const decompression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + auto block_sizes_out = cuda::make_device_buffer_async(num_blocks); + + const auto status = nvcomp_api::instance().GdeflateDecompressAsync( + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + uncompressed_block_sizes.get(), + block_sizes_out.get(), + num_blocks, + scratch_space.get_raw(), + scratch_space.size, + uncompressed_block_ptrs.get(), + 
std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "gdeflate: nvcompBatchedGdeflateDecompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + } + }; +} // namespace NAMESPACE_COMPRESSED_IMAGE::cuda diff --git a/compressed_image/include/compressed/cuda/compressors/lz4.h b/compressed_image/include/compressed/cuda/compressors/lz4.h new file mode 100644 index 0000000..eef6e97 --- /dev/null +++ b/compressed_image/include/compressed/cuda/compressors/lz4.h @@ -0,0 +1,198 @@ +#pragma once + +#include "compressed/macros.h" + +#include "compressed/cuda/compressors/base.h" +#include "compressed/cuda/compressors/util.h" +#include "compressed/cuda/nvcomp_hook.h" + + +namespace +NAMESPACE_COMPRESSED_IMAGE::cuda +{ + template + struct lz4_compressor : public detail::compressor + { + [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override + { + return NAMESPACE_COMPRESSED_IMAGE::enums::codec::lz4_gpu; + }; + + [[nodiscard]] compression_options default_compression_opts() const noexcept override + { + auto opts = nvcompBatchedLZ4CompressDefaultOpts; + opts.data_type = util::to_nvcomp_type(); + if (opts.data_type == nvcompType_t::NVCOMP_TYPE_FLOAT16) + { + opts.data_type = nvcompType_t::NVCOMP_TYPE_SHORT; + } + return opts; + }; + + [[nodiscard]] decompression_options default_decompression_opts() const noexcept override + { + return nvcompBatchedLZ4DecompressDefaultOpts; + }; + + [[nodiscard]] size_t max_block_size() const noexcept override + { + return nvcompLZ4CompressionMaxAllowedChunkSize; + }; + + private: + size_t get_temp_bytes( + size_t block_size, + size_t num_blocks, + std::variant options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + if (std::holds_alternative(options)) + { + size_t temp_bytes{}; + const auto status = 
nvcomp_api::instance().LZ4CompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "lz4: Unable to retrieve the scratch bytes required for compression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().LZ4DecompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "lz4: Unable to retrieve the scratch bytes required for decompression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + + size_t block_max_compressed_size(size_t block_size, compression_options& options) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + size_t max_bytes = 0; + const auto status = nvcomp_api::instance().LZ4CompressGetMaxOutputChunkSize( + block_size, + std::get(options), + &max_bytes + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "lz4: Unable to retrieve the maximum compressed size for a block due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return max_bytes; + } + + + void compression_impl( + size_t block_size, + size_t num_blocks, + const cuda_device_buffer_async& uncompressed_block_ptrs, + const cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& compressed_block_ptrs, + cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const compression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + const auto status = 
nvcomp_api::instance().LZ4CompressAsync( + uncompressed_block_ptrs.get(), + uncompressed_block_sizes.get(), + block_size, + num_blocks, + scratch_space.get_raw(), + scratch_space.bytes(), + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "lz4: nvcompBatchedLZ4CompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + }; + + void decompression_impl( + size_t num_blocks, + const cuda_device_buffer_async& compressed_block_ptrs, + const cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& uncompressed_block_ptrs, + cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const decompression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + auto block_sizes_out = cuda::make_device_buffer_async(num_blocks); + + const auto status = nvcomp_api::instance().LZ4DecompressAsync( + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + uncompressed_block_sizes.get(), + block_sizes_out.get(), + num_blocks, + scratch_space.get_raw(), + scratch_space.size, + uncompressed_block_ptrs.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "lz4: nvcompBatchedLZ4DecompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + } + }; +} // namespace NAMESPACE_COMPRESSED_IMAGE::cuda diff --git a/compressed_image/include/compressed/cuda/compressors/snappy.h b/compressed_image/include/compressed/cuda/compressors/snappy.h new file mode 100644 index 0000000..62986e5 --- /dev/null +++ b/compressed_image/include/compressed/cuda/compressors/snappy.h @@ -0,0 
+1,194 @@ +#pragma once + +#include "compressed/macros.h" + +#include "compressed/cuda/nvcomp_hook.h" +#include "compressed/cuda/compressors/base.h" +#include "compressed/cuda/compressors/util.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + template + struct snappy_compressor final : public detail::compressor + { + [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override + { + return NAMESPACE_COMPRESSED_IMAGE::enums::codec::snappy_gpu; + }; + + [[nodiscard]] compression_options default_compression_opts() const noexcept override + { + return nvcompBatchedSnappyCompressDefaultOpts; + }; + + [[nodiscard]] decompression_options default_decompression_opts() const noexcept override + { + return nvcompBatchedSnappyDecompressDefaultOpts; + }; + + [[nodiscard]] size_t max_block_size() const noexcept override + { + return nvcompSnappyCompressionMaxAllowedChunkSize; + }; + + private: + size_t get_temp_bytes( + size_t block_size, + size_t num_blocks, + std::variant options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + if (std::holds_alternative(options)) + { + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().SnappyCompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "snappy: Unable to retrieve the scratch bytes required for compression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().SnappyDecompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "snappy: Unable to retrieve the scratch bytes required for decompression due to nvcomp 
error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + + size_t block_max_compressed_size(size_t block_size, compression_options& options) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + size_t max_bytes = 0; + const auto status = nvcomp_api::instance().SnappyCompressGetMaxOutputChunkSize( + block_size, + std::get(options), + &max_bytes + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "snappy: Unable to retrieve the maximum compressed size for a block due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return max_bytes; + } + + + void compression_impl( + size_t block_size, + size_t num_blocks, + const cuda_device_buffer_async& uncompressed_block_ptrs, + const cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& compressed_block_ptrs, + cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const compression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + const auto status = nvcomp_api::instance().SnappyCompressAsync( + uncompressed_block_ptrs.get(), + uncompressed_block_sizes.get(), + block_size, + num_blocks, + scratch_space.get_raw(), + scratch_space.bytes(), + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "snappy: nvcompBatchedSnappyCompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + }; + + void decompression_impl( + size_t num_blocks, + const cuda_device_buffer_async& compressed_block_ptrs, + const cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& uncompressed_block_ptrs, + 
cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const decompression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + auto block_sizes_out = cuda::make_device_buffer_async(num_blocks); + + const auto status = nvcomp_api::instance().SnappyDecompressAsync( + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + uncompressed_block_sizes.get(), + block_sizes_out.get(), + num_blocks, + scratch_space.get_raw(), + scratch_space.size, + uncompressed_block_ptrs.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "snappy: nvcompBatchedSnappyDecompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + } + }; + } // namespace cuda +} // namespace NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/compressors/util.h b/compressed_image/include/compressed/cuda/compressors/util.h new file mode 100644 index 0000000..e32ce25 --- /dev/null +++ b/compressed_image/include/compressed/cuda/compressors/util.h @@ -0,0 +1,90 @@ +#pragma once + +#include "compressed/macros.h" + +#include +#ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE +#include +#endif + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + namespace util + { + template + constexpr nvcompType_t to_nvcomp_type() + { + if constexpr (std::is_same_v || std::is_same_v) + return NVCOMP_TYPE_CHAR; + else if constexpr (std::is_same_v || std::is_same_v) + return NVCOMP_TYPE_UCHAR; + else if constexpr (std::is_same_v || std::is_same_v) + return NVCOMP_TYPE_SHORT; + else if constexpr (std::is_same_v || std::is_same_v) + return NVCOMP_TYPE_USHORT; + else if constexpr (std::is_same_v || std::is_same_v) + return NVCOMP_TYPE_INT; + else if constexpr (std::is_same_v || std::is_same_v) + return NVCOMP_TYPE_UINT; + else if constexpr (std::is_same_v || 
std::is_same_v) + return NVCOMP_TYPE_LONGLONG; + else if constexpr (std::is_same_v || std::is_same_v) + return NVCOMP_TYPE_ULONGLONG; +#ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE + else if constexpr (std::is_same_v) + return NVCOMP_TYPE_FLOAT16; +#endif + else if constexpr (std::is_same_v) + return NVCOMP_TYPE_UINT; // fallback: map float -> uint + else if constexpr (std::is_same_v) + return NVCOMP_TYPE_ULONGLONG; // fallback: map double -> ulonglong + else + return NVCOMP_TYPE_BITS; // fallback default + } + + + /// \brief Convert a nvcompStatus_t object into a human-readable string for printing. + /// \param status The status to convert + /// \return A human-readable string explaining the error. + constexpr inline std::string_view status_t_to_string(const nvcompStatus_t& status) noexcept + { + switch (status) + { + case nvcompStatus_t::nvcompSuccess: + return "success"; + case nvcompStatus_t::nvcompErrorInvalidValue: + return "invalid value"; + case nvcompStatus_t::nvcompErrorNotSupported: + return "not supported"; + case nvcompStatus_t::nvcompErrorCannotDecompress: + return "cannot decompress"; + case nvcompStatus_t::nvcompErrorBadChecksum: + return "bad checksum"; + case nvcompStatus_t::nvcompErrorCannotVerifyChecksums: + return "cannot verify checksums"; + case nvcompStatus_t::nvcompErrorOutputBufferTooSmall: + return "output buffer too small"; + case nvcompStatus_t::nvcompErrorWrongHeaderLength: + return "wrong header length"; + case nvcompStatus_t::nvcompErrorAlignment: + return "alignment error"; + case nvcompStatus_t::nvcompErrorChunkSizeTooLarge: + return "chunk size too large"; + case nvcompStatus_t::nvcompErrorCannotCompress: + return "cannot compress"; + case nvcompStatus_t::nvcompErrorWrongInputLength: + return "wrong input length"; + case nvcompStatus_t::nvcompErrorCudaError: + return "CUDA error"; + case nvcompStatus_t::nvcompErrorInternal: + return "internal error"; + default: + return "unknown error"; + } + } + } // namespace util + } // namespace cuda 
+} // namespace NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/compressors/zstd.h b/compressed_image/include/compressed/cuda/compressors/zstd.h new file mode 100644 index 0000000..d3fb885 --- /dev/null +++ b/compressed_image/include/compressed/cuda/compressors/zstd.h @@ -0,0 +1,194 @@ +#pragma once + +#include "compressed/macros.h" + +#include "compressed/cuda/nvcomp_hook.h" +#include "compressed/cuda/compressors/base.h" +#include "compressed/cuda/compressors/util.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + template + struct zstd_compressor final : public detail::compressor + { + [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override + { + return NAMESPACE_COMPRESSED_IMAGE::enums::codec::zstd_gpu; + }; + + [[nodiscard]] compression_options default_compression_opts() const noexcept override + { + return nvcompBatchedZstdCompressDefaultOpts; + }; + + [[nodiscard]] decompression_options default_decompression_opts() const noexcept override + { + return nvcompBatchedZstdDecompressDefaultOpts; + }; + + [[nodiscard]] size_t max_block_size() const noexcept override + { + return nvcompZstdCompressionMaxAllowedChunkSize; + }; + + private: + size_t get_temp_bytes( + size_t block_size, + size_t num_blocks, + std::variant options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + if (std::holds_alternative(options)) + { + size_t temp_bytes{}; + const auto status = nvcomp_api::instance().ZstdCompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "zstd: Unable to retrieve the scratch bytes required for compression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + size_t temp_bytes{}; + const auto status = 
nvcomp_api::instance().ZstdDecompressGetTempSizeAsync( + num_blocks, + block_size, + std::get(std::get(options)), + &temp_bytes, + block_size * num_blocks + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "zstd: Unable to retrieve the scratch bytes required for decompression due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return temp_bytes; + } + + + size_t block_max_compressed_size(size_t block_size, compression_options& options) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + size_t max_bytes = 0; + const auto status = nvcomp_api::instance().ZstdCompressGetMaxOutputChunkSize( + block_size, + std::get(options), + &max_bytes + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "zstd: Unable to retrieve the maximum compressed size for a block due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + + return max_bytes; + } + + + void compression_impl( + size_t block_size, + size_t num_blocks, + const cuda_device_buffer_async& uncompressed_block_ptrs, + const cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& compressed_block_ptrs, + cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const compression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + const auto status = nvcomp_api::instance().ZstdCompressAsync( + uncompressed_block_ptrs.get(), + uncompressed_block_sizes.get(), + block_size, + num_blocks, + scratch_space.get_raw(), + scratch_space.bytes(), + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "zstd: nvcompBatchedZstdCompressAsync failed to launch due to nvcomp error: '{}'", + 
util::status_t_to_string(status) + ) + ); + } + }; + + void decompression_impl( + size_t num_blocks, + const cuda_device_buffer_async& compressed_block_ptrs, + const cuda_device_buffer_async& compressed_block_sizes, + cuda_device_buffer_async& scratch_space, + cuda_device_buffer_async& uncompressed_block_ptrs, + cuda_device_buffer_async& uncompressed_block_sizes, + cuda_device_buffer_async& block_statuses, + const decompression_options& options + ) const override + { + _COMPRESSED_PROFILE_FUNCTION(); + auto block_sizes_out = cuda::make_device_buffer_async(num_blocks); + + const auto status = nvcomp_api::instance().ZstdDecompressAsync( + compressed_block_ptrs.get(), + compressed_block_sizes.get(), + uncompressed_block_sizes.get(), + block_sizes_out.get(), + num_blocks, + scratch_space.get_raw(), + scratch_space.size, + uncompressed_block_ptrs.get(), + std::get(options), + block_statuses.get(), + cudaStreamPerThread + ); + + if (status != nvcompStatus_t::nvcompSuccess) + { + throw std::runtime_error( + std::format( + "zstd: nvcompBatchedZstdDecompressAsync failed to launch due to nvcomp error: '{}'", + util::status_t_to_string(status) + ) + ); + } + } + }; + } // namespace cuda +} // namespace NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/cuda_hook.h b/compressed_image/include/compressed/cuda/cuda_hook.h new file mode 100644 index 0000000..1888e78 --- /dev/null +++ b/compressed_image/include/compressed/cuda/cuda_hook.h @@ -0,0 +1,371 @@ +/* +Dynamic function hook for cuda that loads the library at runtime and hooks various functions such as +cudaMalloc, cudaFree, etc. + +Unfortunately, it doesn't seem as though there's an open source library so we do the minimal hooking here. + +Note: This header should only ever be included on a machine that also has the cuda libraries! 
+*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "compressed/logger.h" +#include "compressed/macros.h" +#include "compressed/cuda/proc_util.h" + + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + /// \brief Singleton class for dynamically loading CUDA functions at runtime. + /// + /// This allows calling CUDA functions like cudaMalloc/cudaFree + /// without linking against CUDA at compile time. + /// + /// Usage: + /// + /// compressed::cuda::cuda_api::instance().malloc(ptr, size); + /// compressed::cuda::cuda_api::instance().free(ptr); + /// \note The CUDA runtime library is loaded on the first call to instance(); use available() to check whether a library handle was obtained. + class cuda_api + { + public: + /// Access the singleton instance + static cuda_api& instance() + { + static cuda_api inst; + return inst; + } + + // --- Runtime queries --- + bool available() const noexcept { return handle_ != nullptr; } + int device_count() const; + int current_device() const; + void set_device(int device); + bool has_device() const; + cudaDeviceProp device_properties(int device) const; + int device_attribute(cudaDeviceAttr attr, int device) const; + + // --- Memory management --- + void malloc(void*& ptr, size_t size) const; + void malloc_host(void*& ptr, size_t size) const; + void malloc_async(void*& ptr, size_t size, cudaStream_t stream = cudaStreamPerThread); + void free(void* ptr) const; + void free_host(void* ptr) const; + void free_async(void* ptr, cudaStream_t stream = cudaStreamPerThread); + + // --- Page-locking (Pinning) --- + void host_register(void* ptr, size_t size, unsigned int flags = cudaHostRegisterDefault) const; + void host_unregister(void* ptr) const; + + // --- Data transfer --- + void memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind); + void memcpy_async( + void* dst, + const void* src, + size_t count, + cudaMemcpyKind kind, + cudaStream_t stream = cudaStreamPerThread + ); + + // --- Streams & Pools --- + void 
stream_synchronize(cudaStream_t stream) const; + void set_mem_pool_size(int device, uint64_t threshold = std::numeric_limits::max()); + + // Non-copyable + cuda_api(const cuda_api&) = delete; + cuda_api& operator=(const cuda_api&) = delete; + cuda_api(cuda_api&&) = delete; + cuda_api& operator=(cuda_api&&) = delete; + + private: + // Private constructor + cuda_api(); + + template + static void cuda_call(Func func, std::string_view func_name, Args&&... args); + + // CUDA library handle + proc::library_handle handle_ = nullptr; + + // --- Function pointer typedefs --- + using cuda_malloc_t = cudaError_t(*)(void**, size_t); + using cuda_malloc_async_t = cudaError_t(*)(void**, size_t, cudaStream_t); + using cuda_free_t = decltype(&cudaFree); + using cuda_free_async_t = decltype(&cudaFreeAsync); + using cuda_malloc_host_t = cudaError_t(*)(void**, size_t); + using cuda_free_host_t = decltype(&cudaFreeHost); + + using cuda_host_register_t = decltype(&cudaHostRegister); + using cuda_host_unregister_t = decltype(&cudaHostUnregister);; + + using cuda_memcpy_t = decltype(&cudaMemcpy); + using cuda_memcpy_async_t = decltype(&cudaMemcpyAsync); + using cuda_stream_sync_t = decltype(&cudaStreamSynchronize); + + using cuda_get_mempool_t = decltype(&cudaDeviceGetDefaultMemPool); + using cuda_set_mempool_t = decltype(&cudaMemPoolSetAttribute); + + using cuda_set_device_t = decltype(&cudaSetDevice); + using cuda_get_device_count_t = decltype(&cudaGetDeviceCount); + using cuda_get_props_t = decltype(&cudaGetDeviceProperties); + using cuda_get_device_t = decltype(&cudaGetDevice); + using cuda_get_attr_t = decltype(&cudaDeviceGetAttribute); + + using cuda_get_error_str_t = decltype(&cudaGetErrorString); + + // --- Function pointers --- + cuda_malloc_t malloc_fn_ = nullptr; + cuda_malloc_host_t malloc_host_fn_ = nullptr; + cuda_malloc_async_t malloc_async_fn_ = nullptr; + cuda_free_t free_fn_ = nullptr; + cuda_free_host_t free_host_fn_ = nullptr; + cuda_free_async_t free_async_fn_ = 
nullptr; + + cuda_host_register_t host_register_fn_ = nullptr; + cuda_host_unregister_t host_unregister_fn_ = nullptr; + + cuda_memcpy_t memcpy_fn_ = nullptr; + cuda_memcpy_async_t memcpy_async_fn_ = nullptr; + cuda_stream_sync_t stream_sync_fn_ = nullptr; + + cuda_get_mempool_t get_mempool_fn_ = nullptr; + cuda_set_mempool_t set_mempool_fn_ = nullptr; + + cuda_set_device_t set_device_fn_ = nullptr; + cuda_get_device_count_t get_count_fn_ = nullptr; + cuda_get_props_t get_props_fn_ = nullptr; + cuda_get_device_t get_device_fn_ = nullptr; + cuda_get_attr_t get_attr_fn_ = nullptr; + + cuda_get_error_str_t get_error_str_fn_ = nullptr; + }; + + // =========================================================== + // Implementation + // =========================================================== + + inline int cuda_api::device_count() const + { + _COMPRESSED_PROFILE_FUNCTION(); + int count = 0; + cuda_call(get_count_fn_, "cudaGetDeviceCount", &count); + return count; + } + + inline int cuda_api::current_device() const + { + _COMPRESSED_PROFILE_FUNCTION(); + int dev = -1; + cuda_call(get_device_fn_, "cudaGetDevice", &dev); + return dev; + } + + inline void cuda_api::set_device(int device) + { + _COMPRESSED_PROFILE_FUNCTION(); + cuda_call(set_device_fn_, "cudaSetDevice", device); + } + + inline bool cuda_api::has_device() const + { + _COMPRESSED_PROFILE_FUNCTION(); + if (!available()) return false; + + try + { + return device_count() > 0; + } + catch (const std::exception& e) + { + NAMESPACE_COMPRESSED_IMAGE::get_logger()->warn( + std::format( + "Unhandled exception while trying to retrieve the cuda device count: {}", + e.what() + ) + ); + } + + return false; + } + + inline cudaDeviceProp cuda_api::device_properties(int device) const + { + _COMPRESSED_PROFILE_FUNCTION(); + cudaDeviceProp prop{}; + cuda_call(get_props_fn_, "cudaGetDeviceProperties", &prop, device); + return prop; + } + + inline int cuda_api::device_attribute(cudaDeviceAttr attr, int device) const + { + 
_COMPRESSED_PROFILE_FUNCTION(); + int value = 0; + cuda_call(get_attr_fn_, "cudaDeviceGetAttribute", &value, attr, device); + return value; + } + + inline void cuda_api::malloc(void*& ptr, size_t size) const + { + cuda_call(malloc_fn_, "cudaMalloc", &ptr, size); + } + + inline void cuda_api::malloc_host(void*& ptr, size_t size) const + { + _COMPRESSED_PROFILE_FUNCTION(); + cuda_call(malloc_host_fn_, "cudaMallocHost", &ptr, size); + } + + inline void cuda_api::malloc_async(void*& ptr, size_t size, cudaStream_t stream) + { + cuda_call(malloc_async_fn_, "cudaMallocAsync", &ptr, size, stream); + } + + inline void cuda_api::free(void* ptr) const + { + cuda_call(free_fn_, "cudaFree", ptr); + } + + inline void cuda_api::free_host(void* ptr) const + { + _COMPRESSED_PROFILE_FUNCTION(); + cuda_call(free_host_fn_, "cudaFreeHost", ptr); + } + + inline void cuda_api::free_async(void* ptr, cudaStream_t stream) + { + cuda_call(free_async_fn_, "cudaFreeAsync", ptr, stream); + } + + inline void cuda_api::host_register(void* ptr, size_t size, unsigned int flags) const + { + _COMPRESSED_PROFILE_FUNCTION(); + cuda_call(host_register_fn_, "cudaHostRegister", ptr, size, flags); + } + + inline void cuda_api::host_unregister(void* ptr) const + { + _COMPRESSED_PROFILE_FUNCTION(); + cuda_call(host_unregister_fn_, "cudaHostUnregister", ptr); + } + + inline void cuda_api::memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) + { + cuda_call(memcpy_fn_, "cudaMemcpy", dst, src, count, kind); + } + + inline void cuda_api::memcpy_async( + void* dst, + const void* src, + size_t count, + cudaMemcpyKind kind, + cudaStream_t stream) + { + cuda_call(memcpy_async_fn_, "cudaMemcpyAsync", dst, src, count, kind, stream); + } + + inline void cuda_api::stream_synchronize(cudaStream_t stream) const + { + _COMPRESSED_PROFILE_FUNCTION(); + cuda_call(stream_sync_fn_, "cudaStreamSynchronize", stream); + } + + inline void cuda_api::set_mem_pool_size(int device, uint64_t threshold) + { + 
_COMPRESSED_PROFILE_FUNCTION(); + cudaMemPool_t mempool{}; + cuda_call(get_mempool_fn_, "cudaDeviceGetDefaultMemPool", &mempool, device); + cuda_call( + set_mempool_fn_, + "cudaMemPoolSetAttribute", + mempool, + cudaMemPoolAttrReleaseThreshold, + &threshold + ); + } + + // --- private helpers --- + template + void cuda_api::cuda_call(Func func, std::string_view func_name, Args&&... args) + { + if (!func) + { + throw std::runtime_error( + std::format( + "CUDA function '{}' is unavailable (library or entrypoint not loaded).", + func_name + ) + ); + } + + const cudaError_t err = func(std::forward(args)...); + if (err != cudaSuccess) + { + auto& inst = instance(); + const char* raw_msg = inst.get_error_str_fn_ ? inst.get_error_str_fn_(err) : nullptr; + std::string invalid_msg = std::format( + "unknown error or missing driver string table. Cuda code {}", + static_cast(err) + ); + + std::string_view msg = raw_msg ? std::string_view(raw_msg) : invalid_msg; + + throw std::runtime_error(std::format("{} failed: {}", func_name, msg)); + } + } + + inline cuda_api::cuda_api() + { +#if defined(_WIN32) + // Targets the 64-bit Runtime API for CUDA 12 + const std::string cuda_name = "cudart64_12.dll"; +#elif defined(__linux__) + const std::string cuda_name = "libcudart.so"; +#else + const std::string cuda_name; +#endif + + handle_ = proc::load_library(cuda_name); + if (!handle_) return; + +#define LOAD(fn, member) member = proc::get_symbol(handle_, #fn, cuda_name) + + LOAD(cudaMalloc, malloc_fn_); + LOAD(cudaMallocHost, malloc_host_fn_); + LOAD(cudaMallocAsync, malloc_async_fn_); + LOAD(cudaFree, free_fn_); + LOAD(cudaFreeHost, free_host_fn_); + LOAD(cudaFreeAsync, free_async_fn_); + + LOAD(cudaHostRegister, host_register_fn_); + LOAD(cudaHostUnregister, host_unregister_fn_); + + LOAD(cudaMemcpy, memcpy_fn_); + LOAD(cudaMemcpyAsync, memcpy_async_fn_); + LOAD(cudaStreamSynchronize, stream_sync_fn_); + + LOAD(cudaDeviceGetDefaultMemPool, get_mempool_fn_); + 
LOAD(cudaMemPoolSetAttribute, set_mempool_fn_); + + LOAD(cudaSetDevice, set_device_fn_); + LOAD(cudaGetDeviceCount, get_count_fn_); + LOAD(cudaGetDeviceProperties, get_props_fn_); + LOAD(cudaGetDevice, get_device_fn_); + LOAD(cudaDeviceGetAttribute, get_attr_fn_); + + LOAD(cudaGetErrorString, get_error_str_fn_); + +#undef LOAD + } + } // namespace cuda +} // namespace NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/enums.h b/compressed_image/include/compressed/cuda/enums.h new file mode 100644 index 0000000..78db4ce --- /dev/null +++ b/compressed_image/include/compressed/cuda/enums.h @@ -0,0 +1,25 @@ +#pragma once + +#include "compressed/macros.h" + + +namespace NAMESPACE_COMPRESSED_IMAGE +{ + + namespace cuda + { + + namespace enums + { + /// \brief the storage location of a given compressing data buffer. + enum class storage_location + { + device, ///< Data is stored on the device (gpu) and only pulled back to the cpu when accessing + host ///< Data is stored on the host (cpu) incurring an additional cost for copying back and forth memory. 
+ }; + + } // namespace enums + + } // namespace cuda + +} // namespace NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file diff --git a/compressed_image/include/compressed/cuda/exceptions.h b/compressed_image/include/compressed/cuda/exceptions.h new file mode 100644 index 0000000..1ef54c4 --- /dev/null +++ b/compressed_image/include/compressed/cuda/exceptions.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +#include "compressed/macros.h" + + +namespace NAMESPACE_COMPRESSED_IMAGE +{ + + namespace cuda + { + + /// \brief Exception thrown when a CUDA library cannot be loaded + class library_not_found : public std::runtime_error + { + public: + explicit library_not_found(std::string_view msg) + : std::runtime_error(std::string(msg)) + { + } + }; + + /// \brief Exception thrown when a CUDA function cannot be found in the library + class symbol_not_found : public std::runtime_error + { + public: + explicit symbol_not_found(std::string_view msg) + : std::runtime_error(std::string(msg)) + { + } + }; + + } // namespace cuda + +} // namespace NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file diff --git a/compressed_image/include/compressed/cuda/gpu.h b/compressed_image/include/compressed/cuda/gpu.h new file mode 100644 index 0000000..7996e4c --- /dev/null +++ b/compressed_image/include/compressed/cuda/gpu.h @@ -0,0 +1,96 @@ +#pragma once + +#include "compressed/macros.h" + +#include "compressed/cuda/cuda_hook.h" +#include "compressed/cuda/nvcomp_hook.h" + + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + /// \brief Check if CUDA runtime is available and at least one device exists. + inline bool is_available() + { + return cuda_api::instance().available() && nvcomp_api::instance().available() && cuda_api::instance(). + has_device(); + } + + /// \brief Get the number of available CUDA devices. + /// \return Number of devices detected by the CUDA runtime. 
+ inline int device_count() + { + return cuda_api::instance().device_count(); + } + + /// \brief Get the index of the currently active CUDA device. + /// \return Device index (0-based). + inline int current_device() + { + return cuda_api::instance().current_device(); + } + + /// \brief Set the active CUDA device for the calling thread. + /// \param device The index of the device to make current. + inline void set_device(const int device) + { + cuda_api::instance().set_device(device); + } + + /// \brief Retrieve full device property structures for all CUDA devices. + /// \return A vector of \c cudaDeviceProp, one for each device. + inline std::vector devices() + { + std::vector properties; + for (int i = 0; i < device_count(); ++i) + { + properties.push_back(cuda_api::instance().device_properties(i)); + } + return properties; + } + + /// \brief Get the names of all available CUDA devices. + /// \return A vector of device name strings. + inline std::vector device_names() + { + std::vector names; + for (int i = 0; i < device_count(); ++i) + { + names.emplace_back(cuda_api::instance().device_properties(i).name); + } + return names; + } + + /// \brief RAII guard to temporarily switch CUDA devices. + /// + /// Saves the currently active device on construction, switches to the given + /// device, and restores the previous device on destruction. + struct device_guard + { + /// \brief Construct a guard and switch to the given device. + /// \param new_device The device index to switch to. + explicit device_guard(int new_device) + { + prev_device_ = current_device(); + set_device(new_device); + } + + /// \brief Destructor restores the previous device. + ~device_guard() + { + try { set_device(prev_device_); } + catch (...) + { + } + } + + device_guard(const device_guard&) = delete; + device_guard& operator=(const device_guard&) = delete; + + private: + int prev_device_{-1}; ///< Previously active device index. 
+ }; + } // namespace cuda +} // namespace NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/memory.h b/compressed_image/include/compressed/cuda/memory.h new file mode 100644 index 0000000..c03c90f --- /dev/null +++ b/compressed_image/include/compressed/cuda/memory.h @@ -0,0 +1,324 @@ +/* +Wrapper around cuda memory allocation/deallocation using std::unique_ptr to manage freeing memory appropriately +again instead of having to do this by hand + +This header file includes the following structs: + +scoped_host_pinner + A RAII struct for pinning a buffer to gpu memory for quicker gpu <-> cpu memory transfers. This should be used only + for staging buffers (like a chunk buffer). Doing this for big buffers often causes performance degradation so use + with care. + +cuda_device_buffer + A RAII-managed GPU buffer allocated using CUDAs non-async memory functions. + +cuda_device_buffer_async + A RAII-managed GPU buffer allocated using CUDAs async memory functions. +*/ +#pragma once + +#include + +#include + +#include "compressed/macros.h" +#include "compressed/util.h" +#include "compressed/cuda/cuda_hook.h" + + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace cuda + { + namespace detail + { + struct device_deleter + { + void operator()(void* ptr) const noexcept + { + if (ptr) + { + try + { + cuda_api::instance().free(ptr); + } + catch (...) + { + // suppress exceptions in destructors + } + } + } + }; + + struct device_deleter_async + { + // Must be the same stream used for construction, use the factory functions to ensure this holds + cudaStream_t stream = cudaStreamPerThread; + + void operator()(void* ptr) const noexcept + { + if (ptr) + { + try + { + cuda_api::instance().free_async(ptr, stream); + } + catch (...) + { + // suppress exceptions in destructors + } + } + } + }; + + struct host_deleter + { + void operator()(void* ptr) const noexcept + { + if (ptr) + { + try + { + cuda_api::instance().free_host(ptr); + } + catch (...) 
+ { + // suppress exceptions in destructors + } + } + } + }; + } // namespace detail + + + /// \brief A RAII wrapper for registering and deregistering host memory. + /// + /// Automatically pins the cpu-memory for gpu-operations. The `scoped_host_pinner` holds a thin view over the + /// registered memory, meaning it is not valid for the `scoped_host_pinner` to exceed the lifespan of the held + /// memory. + struct scoped_host_pinner + { + void* ptr = nullptr; + size_t bytes = 0; + + scoped_host_pinner(void* p, const size_t b, const unsigned int flags = cudaHostRegisterDefault) + : ptr(p), bytes(b) + { + if (ptr && bytes > 0) + { + cuda_api::instance().host_register(ptr, bytes, flags); + } + } + + ~scoped_host_pinner() noexcept + { + if (ptr && bytes > 0) + { + try + { + cuda_api::instance().host_unregister(ptr); + } + catch (...) + { + // Suppress exceptions inside destructors during unwinding + } + } + } + + // The scoped_host_pinner is move-only to ensure we don't deregister the same memory multiple times. + scoped_host_pinner(const scoped_host_pinner&) = delete; + scoped_host_pinner& operator=(const scoped_host_pinner&) = delete; + + scoped_host_pinner(scoped_host_pinner&& other) noexcept + : ptr(std::exchange(other.ptr, nullptr)), bytes(std::exchange(other.bytes, 0)) + { + } + + scoped_host_pinner& operator=(scoped_host_pinner&& other) noexcept + { + if (this != &other) + { + if (ptr && bytes > 0) + { + try { cuda_api::instance().host_unregister(ptr); } + catch (...) 
+ { + } + } + ptr = std::exchange(other.ptr, nullptr); + bytes = std::exchange(other.bytes, 0); + } + return *this; + } + }; + + // ------------------------------------------------------------------------- + // Smart pointer aliases (void*, untyped) + // ------------------------------------------------------------------------- + using cuda_device_mem = std::unique_ptr; + using cuda_device_mem_async = std::unique_ptr; + using cuda_host_mem = std::unique_ptr; + + // ------------------------------------------------------------------------- + // Allocation helpers (typed) + // ------------------------------------------------------------------------- + template + using cuda_device_ptr = std::unique_ptr; + + /// \brief RAII wrapper around a gpu memory buffer allocated using synchronous APIs. + template + struct cuda_device_buffer + { + /// \brief the underlying raw device ptr. + cuda_device_ptr data = nullptr; + /// \brief the number of elements in the device buffer (expressed as a multiple of T) + size_t size{}; + + T* get() noexcept { return this->data.get(); } + const T* get() const noexcept { return this->data.get(); } + void* get_raw() noexcept { return static_cast(this->get()); } + [[nodiscard]] const void* get_raw() const noexcept { return static_cast(this->get()); } + + [[nodiscard]] size_t bytes() const noexcept { return this->size * sizeof(T); } + }; + + template + using cuda_device_ptr_async = std::unique_ptr; + + /// \brief RAII wrapper around a gpu memory buffer allocated using synchronous APIs. + template + struct cuda_device_buffer_async + { + /// \brief the underlying raw device ptr. + cuda_device_ptr_async data = nullptr; + /// \brief the number of elements in the device buffer (expressed as a multiple of T) + size_t size{}; + + /// \brief Generate a device buffer (using asynchronous memory ops) from a host buffer copying the data. 
+ /// + /// \param buffer The buffer to use as a size reference and to generate the device pointer from + static cuda_device_buffer_async from_host(std::span buffer) + { + _COMPRESSED_PROFILE_FUNCTION(); + void* raw = nullptr; + + cuda_api::instance().malloc_async( + raw, + buffer.size() * sizeof(T), + cudaStreamPerThread + ); + + auto gpu_buffer = cuda_device_buffer_async{ + cuda_device_ptr_async( + static_cast(raw), + detail::device_deleter_async{cudaStreamPerThread} + ), + buffer.size() + }; + + cuda_api::instance().memcpy_async( + static_cast(gpu_buffer.data.get()), + buffer.data(), + gpu_buffer.bytes(), + cudaMemcpyHostToDevice + ); + + return gpu_buffer; + } + + static cuda_device_buffer_async from_host(std::vector& buffer) + { + return cuda_device_buffer_async::from_host(std::span(buffer.begin(), buffer.end())); + } + + /// \brief memcpy the gpu buffer into `buffer`. + /// + /// \throws std::invalid_argument if the size of `buffer` does not match the size of the gpu buffer. + void to_host(std::span buffer) + { + _COMPRESSED_PROFILE_FUNCTION(); + if (buffer.size() != this->size) + { + throw std::invalid_argument( + std::format( + "Cuda: Invalid buffer passed to `to_host` function. Expected exactly {} elements but instead" + " got {}.", + this->size, + buffer.size() + ) + ); + } + + cuda_api::instance().memcpy_async( + static_cast(buffer.data()), + this->get_raw(), + this->size * sizeof(T), + cudaMemcpyDeviceToHost + ); + } + + /// \brief allocate and memcpy the compressed data back to the host. 
+ NAMESPACE_COMPRESSED_IMAGE::util::default_init_vector to_host() + { + util::default_init_vector buffer(this->size); + this->to_host(std::span(buffer.begin(), buffer.end())); + return buffer; + } + + T* get() noexcept { return this->data.get(); } + const T* get() const noexcept { return this->data.get(); } + void* get_raw() noexcept { return static_cast(this->get()); } + [[nodiscard]] const void* get_raw() const noexcept { return static_cast(this->get()); } + + [[nodiscard]] size_t bytes() const noexcept { return this->size * sizeof(T); } + }; + + template + using cuda_host_ptr = std::unique_ptr; + + // ------------------------------------------------------------------------- + // Factory functions, use these whenever possible! + // ------------------------------------------------------------------------- + template + inline cuda_device_ptr make_device_mem(size_t count) + { + void* raw = nullptr; + cuda_api::instance().malloc(raw, count * sizeof(T)); + return cuda_device_ptr(static_cast(raw)); + } + + template + inline cuda_device_buffer make_device_buffer(size_t count) + { + _COMPRESSED_PROFILE_FUNCTION(); + auto managed_ptr = make_device_mem(count); + return cuda_device_buffer{std::move(managed_ptr), count}; + } + + template + inline cuda_device_ptr_async make_device_mem_async(size_t count, cudaStream_t stream = cudaStreamPerThread) + { + void* raw = nullptr; + cuda_api::instance().malloc_async(raw, count * sizeof(T), stream); + return cuda_device_ptr_async(static_cast(raw), detail::device_deleter_async{stream}); + } + + template + inline cuda_device_buffer_async make_device_buffer_async(size_t count, + cudaStream_t stream = cudaStreamPerThread) + { + _COMPRESSED_PROFILE_FUNCTION(); + auto managed_ptr = make_device_mem_async(count, stream); + return cuda_device_buffer_async{std::move(managed_ptr), count}; + } + + template + inline cuda_host_ptr make_host_mem(size_t count) + { + void* raw = nullptr; + cuda_api::instance().malloc_host(raw, count * sizeof(T)); + 
return cuda_host_ptr(static_cast(raw)); + } + } +} // namespace NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/cuda/nvcomp_hook.h b/compressed_image/include/compressed/cuda/nvcomp_hook.h new file mode 100644 index 0000000..7dbdfa0 --- /dev/null +++ b/compressed_image/include/compressed/cuda/nvcomp_hook.h @@ -0,0 +1,576 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "compressed/macros.h" +#include "compressed/cuda/proc_util.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE::cuda +{ + /// \brief Singleton class for dynamically loading nvcomp functions and constants at runtime. + class nvcomp_api + { + public: + static nvcomp_api& instance() + { + static nvcomp_api inst; + return inst; + } + + bool available() const noexcept { return handle_ != nullptr; } + + // --- LZ4 API --- + nvcompStatus_t LZ4CompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedLZ4CompressOpts_t o, + size_t* t, + size_t m) const { return lz4_comp_temp_fn_(n, s, o, t, m); } + + nvcompStatus_t LZ4DecompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedLZ4DecompressOpts_t o, + size_t* t, + size_t m) const { return lz4_decomp_temp_fn_(n, s, o, t, m); } + + nvcompStatus_t LZ4CompressGetMaxOutputChunkSize(size_t s, nvcompBatchedLZ4CompressOpts_t o, size_t* m) const + { + return lz4_max_out_fn_(s, o, m); + } + + nvcompStatus_t LZ4CompressAsync(const void* const* u, + const size_t* us, + size_t ms, + size_t b, + void* tp, + size_t tb, + void* const* cp, + size_t* cs, + nvcompBatchedLZ4CompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return lz4_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream); + } + + nvcompStatus_t LZ4DecompressAsync(const void* const* cp, + const size_t* cs, + const size_t* us, + size_t* ac, + size_t b, + void* tp, + size_t tb, + void* const* up, + nvcompBatchedLZ4DecompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return 
lz4_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream); + } + + // --- Cascaded API --- + nvcompStatus_t CascadedCompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedCascadedCompressOpts_t o, + size_t* t, + size_t m) const + { + return cascaded_comp_temp_fn_(n, s, o, t, m); + } + + nvcompStatus_t CascadedDecompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedCascadedDecompressOpts_t o, + size_t* t, + size_t m) const + { + return cascaded_decomp_temp_fn_(n, s, o, t, m); + } + + nvcompStatus_t CascadedCompressGetMaxOutputChunkSize(size_t s, + nvcompBatchedCascadedCompressOpts_t o, + size_t* m) const { return cascaded_max_out_fn_(s, o, m); } + + nvcompStatus_t CascadedCompressAsync(const void* const* u, + const size_t* us, + size_t ms, + size_t b, + void* tp, + size_t tb, + void* const* cp, + size_t* cs, + nvcompBatchedCascadedCompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return cascaded_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream); + } + + nvcompStatus_t CascadedDecompressAsync(const void* const* cp, + const size_t* cs, + const size_t* us, + size_t* ac, + size_t b, + void* tp, + size_t tb, + void* const* up, + nvcompBatchedCascadedDecompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return cascaded_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream); + } + + // --- Deflate API --- + nvcompStatus_t DeflateCompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedDeflateCompressOpts_t o, + size_t* t, + size_t m) const { return deflate_comp_temp_fn_(n, s, o, t, m); } + + nvcompStatus_t DeflateDecompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedDeflateDecompressOpts_t o, + size_t* t, + size_t m) const + { + return deflate_decomp_temp_fn_(n, s, o, t, m); + } + + nvcompStatus_t DeflateCompressGetMaxOutputChunkSize(size_t s, + nvcompBatchedDeflateCompressOpts_t o, + size_t* m) const { return deflate_max_out_fn_(s, o, m); } + + nvcompStatus_t 
DeflateCompressAsync(const void* const* u, + const size_t* us, + size_t ms, + size_t b, + void* tp, + size_t tb, + void* const* cp, + size_t* cs, + nvcompBatchedDeflateCompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return deflate_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream); + } + + nvcompStatus_t DeflateDecompressAsync(const void* const* cp, + const size_t* cs, + const size_t* us, + size_t* ac, + size_t b, + void* tp, + size_t tb, + void* const* up, + nvcompBatchedDeflateDecompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return deflate_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream); + } + + // --- Gdeflate API --- + nvcompStatus_t GdeflateCompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedGdeflateCompressOpts_t o, + size_t* t, + size_t m) const + { + return gdeflate_comp_temp_fn_(n, s, o, t, m); + } + + nvcompStatus_t GdeflateDecompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedGdeflateDecompressOpts_t o, + size_t* t, + size_t m) const + { + return gdeflate_decomp_temp_fn_(n, s, o, t, m); + } + + nvcompStatus_t GdeflateCompressGetMaxOutputChunkSize(size_t s, + nvcompBatchedGdeflateCompressOpts_t o, + size_t* m) const { return gdeflate_max_out_fn_(s, o, m); } + + nvcompStatus_t GdeflateCompressAsync(const void* const* u, + const size_t* us, + size_t ms, + size_t b, + void* tp, + size_t tb, + void* const* cp, + size_t* cs, + nvcompBatchedGdeflateCompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return gdeflate_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream); + } + + nvcompStatus_t GdeflateDecompressAsync(const void* const* cp, + const size_t* cs, + const size_t* us, + size_t* ac, + size_t b, + void* tp, + size_t tb, + void* const* up, + nvcompBatchedGdeflateDecompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return gdeflate_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream); + } + + // --- Snappy 
API --- + nvcompStatus_t SnappyCompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedSnappyCompressOpts_t o, + size_t* t, + size_t m) const { return snappy_comp_temp_fn_(n, s, o, t, m); } + + nvcompStatus_t SnappyDecompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedSnappyDecompressOpts_t o, + size_t* t, + size_t m) const + { + return snappy_decomp_temp_fn_(n, s, o, t, m); + } + + nvcompStatus_t SnappyCompressGetMaxOutputChunkSize(size_t s, + nvcompBatchedSnappyCompressOpts_t o, + size_t* m) const { return snappy_max_out_fn_(s, o, m); } + + nvcompStatus_t SnappyCompressAsync(const void* const* u, + const size_t* us, + size_t ms, + size_t b, + void* tp, + size_t tb, + void* const* cp, + size_t* cs, + nvcompBatchedSnappyCompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return snappy_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream); + } + + nvcompStatus_t SnappyDecompressAsync(const void* const* cp, + const size_t* cs, + const size_t* us, + size_t* ac, + size_t b, + void* tp, + size_t tb, + void* const* up, + nvcompBatchedSnappyDecompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return snappy_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream); + } + + // --- Zstd API --- + nvcompStatus_t ZstdCompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedZstdCompressOpts_t o, + size_t* t, + size_t m) const { return zstd_comp_temp_fn_(n, s, o, t, m); } + + nvcompStatus_t ZstdDecompressGetTempSizeAsync(size_t n, + size_t s, + nvcompBatchedZstdDecompressOpts_t o, + size_t* t, + size_t m) const { return zstd_decomp_temp_fn_(n, s, o, t, m); } + + nvcompStatus_t ZstdCompressGetMaxOutputChunkSize(size_t s, nvcompBatchedZstdCompressOpts_t o, size_t* m) const + { + return zstd_max_out_fn_(s, o, m); + } + + nvcompStatus_t ZstdCompressAsync(const void* const* u, + const size_t* us, + size_t ms, + size_t b, + void* tp, + size_t tb, + void* const* cp, + size_t* cs, + 
nvcompBatchedZstdCompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return zstd_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream); + } + + nvcompStatus_t ZstdDecompressAsync(const void* const* cp, + const size_t* cs, + const size_t* us, + size_t* ac, + size_t b, + void* tp, + size_t tb, + void* const* up, + nvcompBatchedZstdDecompressOpts_t o, + nvcompStatus_t* st, + cudaStream_t stream) const + { + return zstd_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream); + } + + nvcomp_api(const nvcomp_api&) = delete; + nvcomp_api& operator=(const nvcomp_api&) = delete; + + private: + nvcomp_api() + { +#if defined(_WIN32) + const std::string lib_name = "nvcomp64_5.dll"; +#elif defined(__linux__) + const std::string lib_name = "libnvcomp.so"; +#else + return; +#endif + handle_ = proc::load_library(lib_name); + if (!handle_) return; + +#define LOAD_FN(fn, member) member = proc::get_symbol(handle_, #fn, lib_name) +#define LOAD_VAR(type, name, member) member = proc::get_symbol(handle_, #name, lib_name) + + // --- LZ4 --- + LOAD_FN(nvcompBatchedLZ4CompressGetTempSizeAsync, lz4_comp_temp_fn_); + LOAD_FN(nvcompBatchedLZ4DecompressGetTempSizeAsync, lz4_decomp_temp_fn_); + LOAD_FN(nvcompBatchedLZ4CompressGetMaxOutputChunkSize, lz4_max_out_fn_); + LOAD_FN(nvcompBatchedLZ4CompressAsync, lz4_compress_fn_); + LOAD_FN(nvcompBatchedLZ4DecompressAsync, lz4_decompress_fn_); + + // --- Cascaded --- + LOAD_FN(nvcompBatchedCascadedCompressGetTempSizeAsync, cascaded_comp_temp_fn_); + LOAD_FN(nvcompBatchedCascadedDecompressGetTempSizeAsync, cascaded_decomp_temp_fn_); + LOAD_FN(nvcompBatchedCascadedCompressGetMaxOutputChunkSize, cascaded_max_out_fn_); + LOAD_FN(nvcompBatchedCascadedCompressAsync, cascaded_compress_fn_); + LOAD_FN(nvcompBatchedCascadedDecompressAsync, cascaded_decompress_fn_); + + // --- Deflate --- + LOAD_FN(nvcompBatchedDeflateCompressGetTempSizeAsync, deflate_comp_temp_fn_); + LOAD_FN(nvcompBatchedDeflateDecompressGetTempSizeAsync, 
deflate_decomp_temp_fn_); + LOAD_FN(nvcompBatchedDeflateCompressGetMaxOutputChunkSize, deflate_max_out_fn_); + LOAD_FN(nvcompBatchedDeflateCompressAsync, deflate_compress_fn_); + LOAD_FN(nvcompBatchedDeflateDecompressAsync, deflate_decompress_fn_); + + // --- Gdeflate --- + LOAD_FN(nvcompBatchedGdeflateCompressGetTempSizeAsync, gdeflate_comp_temp_fn_); + LOAD_FN(nvcompBatchedGdeflateDecompressGetTempSizeAsync, gdeflate_decomp_temp_fn_); + LOAD_FN(nvcompBatchedGdeflateCompressGetMaxOutputChunkSize, gdeflate_max_out_fn_); + LOAD_FN(nvcompBatchedGdeflateCompressAsync, gdeflate_compress_fn_); + LOAD_FN(nvcompBatchedGdeflateDecompressAsync, gdeflate_decompress_fn_); + + // --- Snappy --- + LOAD_FN(nvcompBatchedSnappyCompressGetTempSizeAsync, snappy_comp_temp_fn_); + LOAD_FN(nvcompBatchedSnappyDecompressGetTempSizeAsync, snappy_decomp_temp_fn_); + LOAD_FN(nvcompBatchedSnappyCompressGetMaxOutputChunkSize, snappy_max_out_fn_); + LOAD_FN(nvcompBatchedSnappyCompressAsync, snappy_compress_fn_); + LOAD_FN(nvcompBatchedSnappyDecompressAsync, snappy_decompress_fn_); + + // --- Zstd --- + LOAD_FN(nvcompBatchedZstdCompressGetTempSizeAsync, zstd_comp_temp_fn_); + LOAD_FN(nvcompBatchedZstdDecompressGetTempSizeAsync, zstd_decomp_temp_fn_); + LOAD_FN(nvcompBatchedZstdCompressGetMaxOutputChunkSize, zstd_max_out_fn_); + LOAD_FN(nvcompBatchedZstdCompressAsync, zstd_compress_fn_); + LOAD_FN(nvcompBatchedZstdDecompressAsync, zstd_decompress_fn_); + +#undef LOAD_FN +#undef LOAD_VAR + } + + proc::library_handle handle_ = nullptr; + + // --- Function Pointer Types --- + + // Type definitions matching exact function layouts + nvcompStatus_t (*lz4_comp_temp_fn_)(size_t, size_t, nvcompBatchedLZ4CompressOpts_t, size_t*, size_t) = nullptr; + nvcompStatus_t (*lz4_decomp_temp_fn_)(size_t, size_t, nvcompBatchedLZ4DecompressOpts_t, size_t*, size_t) = + nullptr; + nvcompStatus_t (*lz4_max_out_fn_)(size_t, nvcompBatchedLZ4CompressOpts_t, size_t*) = nullptr; + nvcompStatus_t (*lz4_compress_fn_)(const 
void* const*, + const size_t*, + size_t, + size_t, + void*, + size_t, + void* const*, + size_t*, + nvcompBatchedLZ4CompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + nvcompStatus_t (*lz4_decompress_fn_)(const void* const*, + const size_t*, + const size_t*, + size_t*, + size_t, + void*, + size_t, + void* const*, + nvcompBatchedLZ4DecompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + + nvcompStatus_t (*cascaded_comp_temp_fn_)(size_t, size_t, nvcompBatchedCascadedCompressOpts_t, size_t*, size_t) = + nullptr; + nvcompStatus_t (*cascaded_decomp_temp_fn_)(size_t, + size_t, + nvcompBatchedCascadedDecompressOpts_t, + size_t*, + size_t) = nullptr; + nvcompStatus_t (*cascaded_max_out_fn_)(size_t, nvcompBatchedCascadedCompressOpts_t, size_t*) = nullptr; + nvcompStatus_t (*cascaded_compress_fn_)(const void* const*, + const size_t*, + size_t, + size_t, + void*, + size_t, + void* const*, + size_t*, + nvcompBatchedCascadedCompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + nvcompStatus_t (*cascaded_decompress_fn_)(const void* const*, + const size_t*, + const size_t*, + size_t*, + size_t, + void*, + size_t, + void* const*, + nvcompBatchedCascadedDecompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + + nvcompStatus_t (*deflate_comp_temp_fn_)(size_t, size_t, nvcompBatchedDeflateCompressOpts_t, size_t*, size_t) = + nullptr; + nvcompStatus_t (*deflate_decomp_temp_fn_)(size_t, size_t, nvcompBatchedDeflateDecompressOpts_t, size_t*, size_t) + = nullptr; + nvcompStatus_t (*deflate_max_out_fn_)(size_t, nvcompBatchedDeflateCompressOpts_t, size_t*) = nullptr; + nvcompStatus_t (*deflate_compress_fn_)(const void* const*, + const size_t*, + size_t, + size_t, + void*, + size_t, + void* const*, + size_t*, + nvcompBatchedDeflateCompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + nvcompStatus_t (*deflate_decompress_fn_)(const void* const*, + const size_t*, + const size_t*, + size_t*, + size_t, + void*, + size_t, + void* const*, + 
nvcompBatchedDeflateDecompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + + nvcompStatus_t (*gdeflate_comp_temp_fn_)(size_t, size_t, nvcompBatchedGdeflateCompressOpts_t, size_t*, size_t) = + nullptr; + nvcompStatus_t (*gdeflate_decomp_temp_fn_)(size_t, + size_t, + nvcompBatchedGdeflateDecompressOpts_t, + size_t*, + size_t) = nullptr; + nvcompStatus_t (*gdeflate_max_out_fn_)(size_t, nvcompBatchedGdeflateCompressOpts_t, size_t*) = nullptr; + nvcompStatus_t (*gdeflate_compress_fn_)(const void* const*, + const size_t*, + size_t, + size_t, + void*, + size_t, + void* const*, + size_t*, + nvcompBatchedGdeflateCompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + nvcompStatus_t (*gdeflate_decompress_fn_)(const void* const*, + const size_t*, + const size_t*, + size_t*, + size_t, + void*, + size_t, + void* const*, + nvcompBatchedGdeflateDecompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + + nvcompStatus_t (*snappy_comp_temp_fn_)(size_t, size_t, nvcompBatchedSnappyCompressOpts_t, size_t*, size_t) = + nullptr; + nvcompStatus_t (*snappy_decomp_temp_fn_)(size_t, size_t, nvcompBatchedSnappyDecompressOpts_t, size_t*, size_t) = + nullptr; + nvcompStatus_t (*snappy_max_out_fn_)(size_t, nvcompBatchedSnappyCompressOpts_t, size_t*) = nullptr; + nvcompStatus_t (*snappy_compress_fn_)(const void* const*, + const size_t*, + size_t, + size_t, + void*, + size_t, + void* const*, + size_t*, + nvcompBatchedSnappyCompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + nvcompStatus_t (*snappy_decompress_fn_)(const void* const*, + const size_t*, + const size_t*, + size_t*, + size_t, + void*, + size_t, + void* const*, + nvcompBatchedSnappyDecompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + + nvcompStatus_t (*zstd_comp_temp_fn_)(size_t, size_t, nvcompBatchedZstdCompressOpts_t, size_t*, size_t) = + nullptr; + nvcompStatus_t (*zstd_decomp_temp_fn_)(size_t, size_t, nvcompBatchedZstdDecompressOpts_t, size_t*, size_t) = + nullptr; + nvcompStatus_t 
(*zstd_max_out_fn_)(size_t, nvcompBatchedZstdCompressOpts_t, size_t*) = nullptr; + nvcompStatus_t (*zstd_compress_fn_)(const void* const*, + const size_t*, + size_t, + size_t, + void*, + size_t, + void* const*, + size_t*, + nvcompBatchedZstdCompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + nvcompStatus_t (*zstd_decompress_fn_)(const void* const*, + const size_t*, + const size_t*, + size_t*, + size_t, + void*, + size_t, + void* const*, + nvcompBatchedZstdDecompressOpts_t, + nvcompStatus_t*, + cudaStream_t) = nullptr; + }; +} diff --git a/compressed_image/include/compressed/cuda/proc_util.h b/compressed_image/include/compressed/cuda/proc_util.h new file mode 100644 index 0000000..5b92a37 --- /dev/null +++ b/compressed_image/include/compressed/cuda/proc_util.h @@ -0,0 +1,127 @@ +/* +Header for various procutils such as finding symbols in a file and raising the appropriate error if it cannot be located. + +Note: This expects windows/linux as it is part of the cuda subfolder which only support windows and linux. It is not + intended to be a generic dll/so module. It is an implementation detail of compressed-image and should not be used + outside of it! +*/ + +#pragma once + +#include +#include + +#if defined(_WIN32) +#include +#else +#include +#endif + +#include "compressed/macros.h" +#include "compressed/cuda/exceptions.h" + + +namespace NAMESPACE_COMPRESSED_IMAGE +{ + + namespace cuda + { + + namespace proc + { + + // Platform-independent handle type +#if defined(_WIN32) + // HMODULE decays down to PVOID which is void* but for ease of use later we alias it directly. + using library_handle_impl_ = HMODULE; +#else + using library_handle_impl_ = void*; +#endif + + /// Custom deleter for unique_ptr freeing/closing a dll/so automatically on destruction. 
+            struct library_deleter
+            {
+                /// Unload the library behind \p handle; a null handle is ignored.
+                void operator()(library_handle_impl_ handle) const noexcept
+                {
+                    if (!handle)
+                    {
+                        return;
+                    }
+                    // The result of the unload call is not checked: a deleter has no way to report it.
+#if defined(_WIN32)
+                    FreeLibrary(handle);
+#else
+                    dlclose(handle);
+#endif
+                }
+            };
+
+            /// Unique-ptr wrapped handle pointer, is automatically freed on destruction.
+            using library_handle = std::unique_ptr, library_deleter>;
+
+            /// \brief loads the library `name` and returns a unique_ptr-managed handle to it
+            ///
+            /// Calls LoadLibraryA on windows and dlopen (RTLD_GLOBAL | RTLD_LAZY) on linux. On any other platform no
+            /// loader call is made, so the function always throws there.
+            ///
+            /// \param name The library name/path, forwarded unchanged to the platform loader
+            ///
+            /// \throws library_not_found if the loader did not return a handle
+            ///
+            /// \return The owning handle, the library is unloaded again once it is destroyed
+            inline library_handle load_library(const std::string& name)
+            {
+                library_handle_impl_ handle = nullptr;
+
+#if defined(_WIN32)
+                handle = LoadLibraryA(name.c_str());
+#elif defined(__linux__)
+                handle = dlopen(name.c_str(), RTLD_GLOBAL | RTLD_LAZY);
+#endif
+
+                // Still null here if the loader failed or if neither branch above was compiled in.
+                if (!handle)
+                {
+                    throw library_not_found(std::format("Failed to load library: {}", name));
+                }
+
+                return library_handle(handle);
+            }
+
+            /// \brief retrieves the symbol `symbol_name` from the given library handle
+            ///
+            /// \param handle The library handle to load the symbol from
+            /// \param symbol_name The symbol name to load
+            /// \param object_name The name of the library handle. may be left empty, only used for error messages.
+            ///
+            /// \tparam func_sig The function pointer type the resolved symbol is cast to
+            ///
+            /// \throws std::invalid_argument if `handle` is empty
+            /// \throws symbol_not_found if the symbol could not be resolved, or when compiled for a platform that is
+            ///                          neither windows nor linux
+            ///
+            /// \return The resolved symbol cast to `func_sig`, never null
+            template
+            func_sig get_symbol(
+                library_handle& handle,
+                const std::string& symbol_name,
+                const std::string& object_name
+            )
+            {
+                if (!handle)
+                {
+                    throw std::invalid_argument(
+                        std::format(
+                            "Internal: passed empty library handle while retrieving symbol of name {} from object {}",
+                            symbol_name,
+                            object_name
+                        )
+                    );
+                }
+
+#if defined(_WIN32)
+                func_sig func_ptr = reinterpret_cast(GetProcAddress(handle.get(), symbol_name.c_str()));
+#elif defined(__linux__)
+                func_sig func_ptr = reinterpret_cast(dlsym(handle.get(), symbol_name.c_str()));
+#else
+                // Declared only so that the shared null check below still compiles; it is never reached here.
+                func_sig func_ptr = nullptr;
+                throw symbol_not_found(
+                    std::format(
+                        "Unable to find symbol {} in library {} as we are on an unsupported platform for CUDA.",
+                        symbol_name,
+                        object_name
+                    )
+                );
+#endif
+
+                if (!func_ptr)
+                {
+                    throw symbol_not_found(
+                        std::format(
+                            "Unable to find symbol {} in library {} while dynamically loading it.",
+                            symbol_name,
+                            object_name
+                        )
+                    );
+                }
+                return func_ptr;
+            }
+
+        } // namespace proc
+
+    } // namespace cuda
+
+
+} // namespace NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
diff --git a/compressed_image/include/compressed/detail/oiio_util.h b/compressed_image/include/compressed/detail/oiio_util.h index b93838c..a472163 100644 --- a/compressed_image/include/compressed/detail/oiio_util.h +++ b/compressed_image/include/compressed/detail/oiio_util.h @@ -17,12 +17,11 @@ #include "scoped_timer.h" #include "compressed/json_alias.h" -namespace NAMESPACE_COMPRESSED_IMAGE +namespace +NAMESPACE_COMPRESSED_IMAGE { - namespace detail { - /// \brief Create a mapping of contiguous begin-end pairs from the passed channel names /// /// Takes the input channel names and constructs a list of (sorted) pairs for the begin and end channel ranges. 
@@ -38,9 +37,9 @@ namespace NAMESPACE_COMPRESSED_IMAGE /// \param channelnames The channelnames to construct pairings for, invalid channelnames throw a std::out_of_range /// /// \return A mapping of begin-end pairs for the channels - inline std::vector>get_contiguous_channels( + inline std::vector> get_contiguous_channels( const std::unique_ptr& input_ptr, - std::vector channelnames + const std::vector channelnames ) { std::unordered_map map_name_to_index; @@ -57,7 +56,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE } // Sort them to ensure we can map them correctly. - std::sort(indices.begin(), indices.end()); + std::ranges::sort(indices); std::vector> result; if (indices.empty()) @@ -83,12 +82,10 @@ namespace NAMESPACE_COMPRESSED_IMAGE } - // Utilities related to OIIO ParamValue (the internal metadata type) helping us convert them into json-able // types namespace param_value { - /// \brief JSON-like types that we can store enum class _JSONType { @@ -97,7 +94,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE _string }; - inline _JSONType to_json_type(OIIO::ParamValue pvalue) + inline _JSONType to_json_type(const OIIO::ParamValue& pvalue) { _COMPRESSED_PROFILE_FUNCTION(); auto type = pvalue.type(); @@ -116,18 +113,20 @@ namespace NAMESPACE_COMPRESSED_IMAGE type == OIIO::TypeDesc::INT32 || type == OIIO::TypeDesc::UINT64 || type == OIIO::TypeDesc::INT64 - ) + ) { return _JSONType::_int; } - else if (type == OIIO::TypeDesc::HALF || type == OIIO::TypeDesc::FLOAT || type == OIIO::TypeDesc::DOUBLE) + else if (type == OIIO::TypeDesc::HALF || type == OIIO::TypeDesc::FLOAT || type == + OIIO::TypeDesc::DOUBLE) { return _JSONType::_float; } throw std::invalid_argument( std::format( - "Unknown json type for param value: {}", pvalue.name().string() + "Unknown json type for param value: {}", + pvalue.name().string() ) ); } @@ -240,10 +239,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE return out; } } - } // detail - - } // NAMESPACE_COMPRESSED_IMAGE -#endif // COMPRESSED_IMAGE_OIIO_AVAILABLE \ No 
newline at end of file +#endif // COMPRESSED_IMAGE_OIIO_AVAILABLE diff --git a/compressed_image/include/compressed/detail/scoped_timer.h b/compressed_image/include/compressed/detail/scoped_timer.h index 2e2feba..7646149 100644 --- a/compressed_image/include/compressed/detail/scoped_timer.h +++ b/compressed_image/include/compressed/detail/scoped_timer.h @@ -22,19 +22,21 @@ #include #ifdef _COMPRESSED_PROFILE -#define _COMPRESSED_PROFILE_SCOPE(name) NAMESPACE_COMPRESSED_IMAGE::detail::InstrumentationTimer timer##__LINE__(name) -#define _COMPRESSED_PROFILE_FUNCTION() NAMESPACE_COMPRESSED_IMAGE::detail::InstrumentationTimer timer##__FUNCTION__##__LINE__(__FUNCTION__) +#define CONCAT_2_IMPL(x, y) x##y +#define CONCAT_2(x, y) CONCAT_2_IMPL(x, y) + +#define _COMPRESSED_PROFILE_SCOPE(name) NAMESPACE_COMPRESSED_IMAGE::detail::InstrumentationTimer CONCAT_2(timer, __LINE__)(name) +#define _COMPRESSED_PROFILE_FUNCTION() NAMESPACE_COMPRESSED_IMAGE::detail::InstrumentationTimer CONCAT_2(timer, __LINE__)(__FUNCTION__) #else #define _COMPRESSED_PROFILE_SCOPE(name) #define _COMPRESSED_PROFILE_FUNCTION() #endif -namespace NAMESPACE_COMPRESSED_IMAGE +namespace +NAMESPACE_COMPRESSED_IMAGE { - namespace detail { - struct ProfileResult { std::string Name; @@ -54,6 +56,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE std::ofstream m_OutputStream; int m_ProfileCount; std::mutex m_lock; + public: Instrumentor() : m_CurrentSession(nullptr), m_ProfileCount(0) @@ -64,7 +67,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE { m_OutputStream.open(filepath); WriteHeader(); - m_CurrentSession = new InstrumentationSession{ name }; + m_CurrentSession = new InstrumentationSession{name}; } void EndSession() @@ -137,19 +140,21 @@ namespace NAMESPACE_COMPRESSED_IMAGE { auto endTimepoint = std::chrono::high_resolution_clock::now(); - long long start = std::chrono::time_point_cast(m_StartTimepoint).time_since_epoch().count(); - long long end = std::chrono::time_point_cast(endTimepoint).time_since_epoch().count(); + long 
long start = std::chrono::time_point_cast(m_StartTimepoint). + time_since_epoch().count(); + long long end = std::chrono::time_point_cast(endTimepoint).time_since_epoch() + .count(); uint32_t threadID = static_cast(std::hash{}(std::this_thread::get_id())); - Instrumentor::Get().WriteProfile({ m_Name, start, end, threadID }); + Instrumentor::Get().WriteProfile({m_Name, start, end, threadID}); m_Stopped = true; } + private: const std::string m_Name{}; std::chrono::time_point m_StartTimepoint; bool m_Stopped = false; }; - } // detail } // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/detail/scratch_buffer_pool.h b/compressed_image/include/compressed/detail/scratch_buffer_pool.h new file mode 100644 index 0000000..9fc9bf8 --- /dev/null +++ b/compressed_image/include/compressed/detail/scratch_buffer_pool.h @@ -0,0 +1,406 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "compressed/macros.h" +#include "compressed/util.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace detail + { + class scratch_buffer_pool; + + /// \brief RAII handle representing a temporary scratch buffer checked out from a scratch buffer pool. + /// + /// The lease owns the temporary byte buffer while compression is using it. When the lease is destroyed, + /// the buffer is returned to the originating pool if the pool is still alive and the buffer satisfies the + /// pool's caching limits. + /// + /// This type is move-only. Moving transfers both the buffer and the responsibility to return it. + class scratch_buffer_lease + { + public: + scratch_buffer_lease() = default; + + /// Construct a lease from a pool and an already allocated buffer. + /// + /// \param pool The pool the buffer should be returned to when the lease is destroyed. + /// \param buffer The byte buffer owned by this lease. + /// \param size The logical size of the scratch buffer to expose via span(). 
+            scratch_buffer_lease(
+                std::shared_ptr pool,
+                util::default_init_vector buffer,
+                const size_t size
+            )
+                : m_pool(std::move(pool))
+                , m_buffer(std::move(buffer))
+                , m_size(size)
+            {
+            }
+
+            /// Take over the pool reference and buffer of \p other, whose logical size is reset to 0.
+            scratch_buffer_lease(scratch_buffer_lease&& other) noexcept
+                : m_pool(std::move(other.m_pool))
+                , m_buffer(std::move(other.m_buffer))
+                , m_size(std::exchange(other.m_size, 0))
+            {
+            }
+
+            /// Release whatever this lease currently holds, then take over the contents of \p other.
+            ///
+            /// Self-assignment is a no-op.
+            scratch_buffer_lease& operator=(scratch_buffer_lease&& other) noexcept
+            {
+                if (this == &other)
+                {
+                    return *this;
+                }
+
+                // Our own buffer has to go back first, otherwise the assignments below would drop it.
+                release();
+
+                m_pool = std::move(other.m_pool);
+                m_buffer = std::move(other.m_buffer);
+                m_size = std::exchange(other.m_size, 0);
+
+                return *this;
+            }
+
+            // Move-only: a buffer must be handed back exactly once.
+            scratch_buffer_lease(const scratch_buffer_lease&) = delete;
+            scratch_buffer_lease& operator=(const scratch_buffer_lease&) = delete;
+
+            /// Hands the held buffer back via release().
+            ~scratch_buffer_lease()
+            {
+                release();
+            }
+
+            /// Mutable view of the leased scratch memory.
+            ///
+            /// The view covers the first size() elements of the held buffer and must not be used once the lease
+            /// has been destroyed or moved from.
+            ///
+            /// \return A mutable span of the logical scratch buffer size.
+            std::span span() noexcept
+            {
+                return std::span(m_buffer.data(), m_size);
+            }
+
+            /// Read-only view of the leased scratch memory.
+            ///
+            /// The view covers the first size() elements of the held buffer and must not be used once the lease
+            /// has been destroyed or moved from.
+            ///
+            /// \return A const span of the logical scratch buffer size.
+            std::span span() const noexcept
+            {
+                return std::span(m_buffer.data(), m_size);
+            }
+
+            /// Logical size of the leased scratch span in bytes.
+            ///
+            /// \return The size this lease was created with, 0 after it has been moved from or released.
+            size_t size() const noexcept
+            {
+                return m_size;
+            }
+
+            /// Check whether the held buffer is at least as large as the logical size.
+            ///
+            /// A lease with a logical size of 0 (default constructed, moved from) therefore also reports true.
+            ///
+            /// \return True if the lease owns a large enough buffer, false otherwise.
+ bool valid() const noexcept + { + return m_buffer.size() >= m_size; + } + + private: + /// Return the currently held buffer to the pool, if any. + void release(); + + std::shared_ptr m_pool{}; + util::default_init_vector m_buffer{}; + size_t m_size = 0; + }; + + + /// \brief Configuration options for scratch buffer pooling. + /// + /// These options control how many returned buffers are cached and how much memory the pool may retain. + /// Buffers that exceed the configured limits are simply released instead of cached. + struct scratch_buffer_pool_options + { + size_t max_cached_buffers = 0; + size_t max_cached_bytes = 1024 * 1024 * 1024; // 1GB + }; + + + /// \brief Thread-safe pool for temporary compression scratch buffers. + /// + /// The pool is used by low-level CPU compression paths to avoid repeatedly allocating temporary output + /// buffers for Blosc2 compression. Buffers are handed out as move-only scratch_buffer_lease objects and are + /// automatically returned to the pool when the lease goes out of scope. + /// + /// The pool itself does not have global ownership. Channels keep a shared reference to the active pool, + /// while the global registry only stores a weak reference. This allows the pool to be globally discoverable + /// while still being destroyed when the last channel / iterator reference disappears. + class scratch_buffer_pool : public std::enable_shared_from_this + { + public: + explicit scratch_buffer_pool(const scratch_buffer_pool_options options = {}) + : m_options(options) + { + if (m_options.max_cached_buffers == 0) + { + m_options.max_cached_buffers = std::max(1, std::thread::hardware_concurrency()); + } + } + + /// Acquire a scratch buffer of at least \p size bytes. + /// + /// This function first attempts to reuse the smallest cached buffer that is large enough. If none is available, + /// a new buffer is allocated. The returned lease keeps the pool alive for the duration of the lease. 
+            ///
+            /// The lease is created from shared_from_this(), so the pool has to be owned by a std::shared_ptr.
+            ///
+            /// \param size The minimum scratch buffer size in bytes.
+            /// \return A move-only lease containing a scratch buffer with logical size \p size.
+            scratch_buffer_lease acquire(const size_t size)
+            {
+                util::default_init_vector buffer{};
+
+                {
+                    std::scoped_lock lock(m_mutex);
+
+                    // Best fit: the first of the smallest cached buffers that can hold `size`.
+                    const auto none = m_available.end();
+                    auto candidate = none;
+                    for (auto it = m_available.begin(); it != none; ++it)
+                    {
+                        if (it->size() < size)
+                        {
+                            continue;
+                        }
+                        if (candidate == none || it->size() < candidate->size())
+                        {
+                            candidate = it;
+                        }
+                    }
+
+                    if (candidate != none)
+                    {
+                        // The buffer leaves the cache, so it no longer counts towards the cached bytes.
+                        m_cached_bytes -= candidate->size();
+                        buffer = std::move(*candidate);
+                        m_available.erase(candidate);
+                    }
+                }
+
+                // Nothing suitable was cached: grow the (still empty) buffer outside of the lock. It ends up in
+                // the pool once the lease hands it back.
+                if (buffer.size() < size)
+                {
+                    buffer.resize(size);
+                }
+
+                return scratch_buffer_lease(shared_from_this(), std::move(buffer), size);
+            }
+
+            /// Drop every buffer that is currently cached and reset the cached byte count.
+            ///
+            /// Buffers held by active leases are untouched; whether they get cached again is decided by the pool
+            /// limits at the time they are handed back.
+            void clear()
+            {
+                std::scoped_lock lock(m_mutex);
+                m_available.clear();
+                m_cached_bytes = 0;
+            }
+
+            /// Total size of the buffers currently stored in the pool.
+            ///
+            /// Buffers checked out by active leases are not counted.
+            ///
+            /// \return The number of cached bytes.
+            size_t cached_bytes() const
+            {
+                std::scoped_lock lock(m_mutex);
+                return m_cached_bytes;
+            }
+
+            /// Retrieve the number of buffers currently cached by the pool.
+            ///
+            /// This only includes buffers currently stored in the pool, not buffers checked out by active leases.
+            ///
+            /// \return The number of cached buffers.
+ size_t cached_buffers() const + { + std::scoped_lock lock(m_mutex); + return m_available.size(); + } + + private: + friend class scratch_buffer_lease; + + /// Return a buffer to the pool if doing so does not exceed the configured cache limits. + /// + /// \param buffer The buffer to return. + void release(util::default_init_vector buffer) + { + if (buffer.empty()) + { + return; + } + + std::scoped_lock lock(m_mutex); + + if (m_available.size() >= m_options.max_cached_buffers) + { + return; + } + + if (m_cached_bytes + buffer.size() > m_options.max_cached_bytes) + { + return; + } + + m_cached_bytes += buffer.size(); + m_available.push_back(std::move(buffer)); + } + + scratch_buffer_pool_options m_options{}; + mutable std::mutex m_mutex{}; + std::vector> m_available{}; + size_t m_cached_bytes = 0; + }; + + inline void scratch_buffer_lease::release() + { + if (!m_pool || m_buffer.empty()) + { + return; + } + + auto pool = std::move(m_pool); + auto buffer = std::move(m_buffer); + m_size = 0; + + pool->release(std::move(buffer)); + } + + /// \brief Weak global registry for the currently active scratch buffer pool. + /// + /// The registry allows low-level compression code to discover the active scratch pool without threading pool + /// references through every compression API. It intentionally stores only a weak reference so that the pool is + /// destroyed once all owning channel / iterator references are gone. + class scratch_pool_registry + { + public: + /// Retrieve the current pool or create a new one for channel-owned use. + /// + /// Channels call this to obtain a shared reference to the globally discoverable pool. The registry keeps only + /// a weak reference; the returned shared pointer is what keeps the pool alive. + /// + /// \return A shared pointer to the active scratch buffer pool. 
+            static std::shared_ptr get_or_create_for_channel()
+            {
+                std::scoped_lock lock(mutex());
+
+                auto pool = pool_ref().lock();
+                if (!pool)
+                {
+                    // No pool is alive (anymore): create one and remember it weakly.
+                    pool = std::make_shared();
+                    pool_ref() = pool;
+                }
+                return pool;
+            }
+
+            /// Look up the pool that is currently alive, without creating one.
+            ///
+            /// \return The active pool, or nullptr if the weak reference has expired or was never set.
+            static std::shared_ptr current()
+            {
+                std::scoped_lock lock(mutex());
+                return pool_ref().lock();
+            }
+
+            /// Drop the cached buffers of the active pool; does nothing if no pool is alive.
+            ///
+            /// The pool object itself is left in place.
+            static void clear()
+            {
+                const auto pool = current();
+                if (pool)
+                {
+                    pool->clear();
+                }
+            }
+
+            /// Number of bytes cached in the active pool.
+            ///
+            /// \return The active pool's cached byte count, or 0 if no pool exists.
+            static size_t cached_bytes()
+            {
+                const auto pool = current();
+                return pool ? pool->cached_bytes() : 0;
+            }
+
+            /// Number of buffers cached in the active pool.
+            ///
+            /// \return The active pool's cached buffer count, or 0 if no pool exists.
+            static size_t cached_buffers()
+            {
+                const auto pool = current();
+                return pool ? pool->cached_buffers() : 0;
+            }
+
+        private:
+            /// Mutex guarding the weak pool reference, created on first use.
+            ///
+            /// \return The function-local static mutex.
+            static std::mutex& mutex()
+            {
+                static std::mutex registry_mutex{};
+                return registry_mutex;
+            }
+
+            /// Retrieve the weak reference to the currently active pool.
+            ///
+            /// \return A process-local weak pointer to the active pool.
+ static std::weak_ptr& pool_ref() + { + static std::weak_ptr value{}; + return value; + } + }; + } // namespace detail + + /// \brief Clear cached scratch buffers from the active global scratch pool. + /// + /// This releases memory currently cached by the pool without invalidating any live channels or active compression + /// operations. Buffers checked out by active leases are not affected and may be returned to the pool later. + inline void clear_scratch_pool() + { + detail::scratch_pool_registry::clear(); + } + + /// \brief Retrieve the number of bytes currently cached by the active global scratch pool. + /// + /// This value does not include buffers that are currently checked out by active compression operations. + /// + /// \return The number of bytes cached by the active pool, or 0 if no pool is alive. + inline size_t scratch_pool_cached_bytes() + { + return detail::scratch_pool_registry::cached_bytes(); + } + + /// \brief Retrieve the number of buffers currently cached by the active global scratch pool. + /// + /// This value does not include buffers that are currently checked out by active compression operations. + /// + /// \return The number of buffers cached by the active pool, or 0 if no pool is alive. + inline size_t scratch_pool_cached_buffers() + { + return detail::scratch_pool_registry::cached_buffers(); + } +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/enums.h b/compressed_image/include/compressed/enums.h index bdd9dd1..4cd76d6 100644 --- a/compressed_image/include/compressed/enums.h +++ b/compressed_image/include/compressed/enums.h @@ -1,5 +1,7 @@ #pragma once +#include + #include "macros.h" #ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE @@ -7,70 +9,144 @@ #include #endif -namespace NAMESPACE_COMPRESSED_IMAGE +namespace +NAMESPACE_COMPRESSED_IMAGE { - namespace enums - { - /// Enum representing available compression codecs. 
- /// - /// These codecs are inherited from `blosc2` and define different compression algorithms - /// that can be used when storing or transmitting compressed images. - enum class codec - { - blosclz, ///< Lightweight, fast compression optimized for high-speed decompression. - lz4, ///< Extremely fast compression and decompression with moderate compression ratio. - lz4hc, ///< High-compression variant of LZ4 with slower compression but similar fast decompression. - zstd, ///< Zstandard compression providing high compression ratios with decent speed. - }; + namespace enums + { + /// Enum representing available compression codecs. + /// + /// These codecs are inherited from `blosc2`/`nvcomp` and define different compression algorithms + /// that can be used when storing compressed images. Any gpu codecs rely on nvidia gpus to function + /// but will fall back gracefully to a cpu equivalent should there be no nvidia gpu or missing cuda + /// libraries. + enum class codec + { + blosclz, ///< Lightweight, fast compression optimized for high-speed decompression. + lz4, ///< Extremely fast compression and decompression with moderate compression ratio. + lz4hc, ///< High-compression variant of LZ4 with slower compression but similar fast decompression. + zstd, ///< Zstandard compression providing high compression ratios with decent speed. + lz4_gpu, ///< (cuda) gpu variant of lz4 compression, faster throughput compared to regular lz4 + snappy_gpu, ///< (cuda) gpu variant of snappy, a fast compression codec with moderate throughput + zstd_gpu, ///< (cuda) gpu variant of zstd, faster throughput compared to regular zstd + deflate_gpu, ///< (cuda) gpu variant of deflate, faster throughput compared to regular deflate + gdeflate_gpu, ///< (cuda) a bit-swizzled variant of deflate, optimized for gpu performance. 
+ cascaded_gpu + ///< (cuda) proprietary compression scheme built up by several simple compression schemes like rle, bitpacking and delta + }; + [[nodiscard]] constexpr std::string_view to_string(const codec value) + { + switch (value) + { + case codec::blosclz: return "blosclz"; + case codec::lz4: return "lz4"; + case codec::lz4hc: return "lz4hc"; + case codec::zstd: return "zstd"; + case codec::lz4_gpu: return "lz4_gpu"; + case codec::snappy_gpu: return "snappy_gpu"; + case codec::zstd_gpu: return "zstd_gpu"; + case codec::deflate_gpu: return "deflate_gpu"; + case codec::gdeflate_gpu: return "gdeflate_gpu"; + case codec::cascaded_gpu: return "cascaded_gpu"; + } -#ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE + return "unknown"; + } - /// Get a OpenImageIO TypeDesc based on the given template parameter returning OIIO::TypeDesc::Unknown - /// if the image coordinate is not part of the valid template specializations for photoshop buffers - template - constexpr OIIO::TypeDesc get_type_desc() - { - if constexpr (std::is_same_v) - { - return OIIO::TypeDesc::UINT8; - } - else if constexpr (std::is_same_v) - { - return OIIO::TypeDesc::INT8; - } - else if constexpr (std::is_same_v) - { - return OIIO::TypeDesc::UINT16; - } - else if constexpr (std::is_same_v) - { - return OIIO::TypeDesc::INT16; - } - else if constexpr (std::is_same_v) - { - return OIIO::TypeDesc::UINT32; - } - else if constexpr (std::is_same_v) - { - return OIIO::TypeDesc::INT32; - } - else if constexpr (std::is_same_v) - { - return OIIO::TypeDesc::FLOAT; - } - else if constexpr (std::is_same_v) - { - return OIIO::TypeDesc::HALF; - } - else - { - return OIIO::TypeDesc::UNKNOWN; - } - } + [[nodiscard]] inline bool is_gpu_codec(const codec codec) + { + if (codec == codec::blosclz || codec == codec::lz4 || codec == codec::lz4hc || codec == codec::zstd) + { + return false; + } + return true; + } -#endif // COMPRESSED_IMAGE_OIIO_AVAILABLE + /// \brief map for the cpu codec fallbacks if no nvidia gpu is detected. 
+ /// + /// These are constant and do not change. + static const std::map s_gpu_codec_fallback = { + {codec::lz4_gpu, codec::lz4}, + {codec::snappy_gpu, codec::lz4}, + {codec::zstd_gpu, codec::zstd}, + {codec::deflate_gpu, codec::zstd}, + {codec::gdeflate_gpu, codec::zstd}, + {codec::cascaded_gpu, codec::lz4} + }; + + namespace detail + { + /// \brief enum representing the different underlying compression/decompression wrappers we use for cpu/gpu + enum class compression_library + { + c_blosc2, + nvcomp + }; + + /// \brief mapping of compression codecs to their respective underlying libraries. + /// + /// Used internally to dispatch the calls. + static const std::map s_library_mapping = { + {codec::blosclz, compression_library::c_blosc2}, + {codec::lz4, compression_library::c_blosc2}, + {codec::lz4hc, compression_library::c_blosc2}, + {codec::zstd, compression_library::c_blosc2}, + {codec::lz4_gpu, compression_library::nvcomp}, + {codec::snappy_gpu, compression_library::nvcomp}, + {codec::zstd_gpu, compression_library::nvcomp}, + {codec::deflate_gpu, compression_library::nvcomp}, + {codec::gdeflate_gpu, compression_library::nvcomp}, + {codec::cascaded_gpu, compression_library::nvcomp} + }; + } // namespace detail - } // namespace enums +#ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE + + /// Get a OpenImageIO TypeDesc based on the given template parameter returning OIIO::TypeDesc::Unknown + /// if the image coordinate is not part of the valid template specializations for photoshop buffers + template + constexpr OIIO::TypeDesc get_type_desc() + { + if constexpr (std::is_same_v) + { + return OIIO::TypeDesc::UINT8; + } + else if constexpr (std::is_same_v) + { + return OIIO::TypeDesc::INT8; + } + else if constexpr (std::is_same_v) + { + return OIIO::TypeDesc::UINT16; + } + else if constexpr (std::is_same_v) + { + return OIIO::TypeDesc::INT16; + } + else if constexpr (std::is_same_v) + { + return OIIO::TypeDesc::UINT32; + } + else if constexpr (std::is_same_v) + { + return 
OIIO::TypeDesc::INT32; + } + else if constexpr (std::is_same_v) + { + return OIIO::TypeDesc::FLOAT; + } + else if constexpr (std::is_same_v) + { + return OIIO::TypeDesc::HALF; + } + else + { + return OIIO::TypeDesc::UNKNOWN; + } + } + +#endif // COMPRESSED_IMAGE_OIIO_AVAILABLE + } // namespace enums } // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/gpu_util.h b/compressed_image/include/compressed/gpu_util.h new file mode 100644 index 0000000..df3b30b --- /dev/null +++ b/compressed_image/include/compressed/gpu_util.h @@ -0,0 +1,21 @@ +#pragma once + +#include "compressed/macros.h" +#include "compressed/cuda/cuda_hook.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE::gpu +{ + /// \brief Report whether the cuda api singleton can be obtained. + /// + /// \return true if cuda::cuda_api::instance() returns without throwing, false if it throws anything. + [[nodiscard]] inline bool is_available() noexcept + { + try + { + // Only success or failure of the call matters, so the returned reference is discarded. + static_cast<void>(cuda::cuda_api::instance()); + return true; + } + catch (...) + { + return false; + } + } +} // namespace NAMESPACE_COMPRESSED_IMAGE::gpu diff --git a/compressed_image/include/compressed/image.h b/compressed_image/include/compressed/image.h index 6bf0832..6940566 100644 --- a/compressed_image/include/compressed/image.h +++ b/compressed_image/include/compressed/image.h @@ -18,7 +18,6 @@ #endif #include "macros.h" -#include "fwd.h" #include "blosc2/wrapper.h" #include "blosc2/schunk.h" #include "blosc2/lazyschunk.h" @@ -29,1884 +28,1942 @@ #include "detail/oiio_util.h" #include "detail/scoped_timer.h" -#include "iterators/iterator.h" - -namespace NAMESPACE_COMPRESSED_IMAGE +namespace +NAMESPACE_COMPRESSED_IMAGE { - - /// Compressed Image representation with easy access to different channels. 
Internally functions very similar to an NDArray - /// with the important distinction that the number of dimensions is fixed to be 3-Dimensional (width, height, channels). - /// They are laid out in scanline order with each channel being its own distinct object which may have any size. - /// - /// The image is stored in a non-resizable fashion so whatever the resolution was going into it, is what the image will be. - /// To rescale or refit the image a new `image` has to be constructed. - /// - /// The data is compressed in memory and we store it as part of a blosc2 super-chunk which is essentially a 3d array of - /// super-chunk -> chunk -> block. Where having the block size fit into L1 cache and the Chunk size into L3 cache is desirable - /// as each block can be handled by a single cpu core while the chunk fits well within shared L3 memory. - template - struct image : public std::ranges::view_interface> - { - using value_type = T; - - image() = default; - image(image&&) = default; - image& operator=(image&&) = default; - image(const image&) = delete; - image& operator=(const image&) = delete; - ~image() = default; - - - /// Constructs an image object with the specified channels, dimensions, and optional compression parameters. - /// - /// This constructor creates an image from a given set of channels. The channel names can optionally be specified. - /// The image is then compressed using the provided codec and compression level. - /// - /// Example: - /// \code{.cpp} - /// std::vector> channels = ...; - /// compressed::image my_image(channels, 1920, 1080, {"r", "g", "b"}, codec::lz4, 5); - /// \endcode - /// - /// \param channels A vector of spans containing the image channels (each channel is a 2D array of pixel data). - /// on construction these will be compressed thus the data can be safely freed after this function. - /// \param width The width of the image in pixels. - /// \param height The height of the image in pixels. 
- /// \param channel_names (Optional) A list of channel names, must match the number of channels provided. - /// If omitted or incorrect, channel names are ignored. - /// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`. 
- /// - /// The image is stored in a non-resizable fashion so whatever the resolution was going into it, is what the image will be. - /// To rescale or refit the image a new `image` has to be constructed. - /// - /// The data is compressed in memory and we store it as part of a blosc2 super-chunk which is essentially a 3d array of - /// super-chunk -> chunk -> block. Where having the block size fit into L1 cache and the Chunk size into L3 cache is desirable - /// as each block can be handled by a single cpu core while the chunk fits well within shared L3 memory. - template - struct image : public std::ranges::view_interface> - { - using value_type = T; - - image() = default; - image(image&&) = default; - image& operator=(image&&) = default; - image(const image&) = delete; - image& operator=(const image&) = delete; - ~image() = default; - - - /// Constructs an image object with the specified channels, dimensions, and optional compression parameters. - /// - /// This constructor creates an image from a given set of channels. The channel names can optionally be specified. - /// The image is then compressed using the provided codec and compression level. - /// - /// Example: - /// \code{.cpp} - /// std::vector> channels = ...; - /// compressed::image my_image(channels, 1920, 1080, {"r", "g", "b"}, codec::lz4, 5); - /// \endcode - /// - /// \param channels A vector of spans containing the image channels (each channel is a 2D array of pixel data). - /// on construction these will be compressed thus the data can be safely freed after this function. - /// \param width The width of the image in pixels. - /// \param height The height of the image in pixels. - /// \param channel_names (Optional) A list of channel names, must match the number of channels provided. - /// If omitted or incorrect, channel names are ignored. - /// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`. 
- /// \param compression_level (Optional) The compression level, default is `9`. - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number. - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. - /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - /// \throws std::runtime_error if a channel fails to be inserted. - image( - std::vector> channels, - size_t width, - size_t height, - std::vector channel_names = {}, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - m_Width = width; - m_Height = height; - m_ChannelNames = channel_names; - auto comp_level_adjusted = util::ensure_compression_level(compression_level); - - // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this. - assert(chunk_size < std::numeric_limits::max()); - assert(block_size < chunk_size); - if (channel_names.size() != channels.size() && channel_names.size() != 0) - { - std::cout << std::format( - "Invalid channelnames passed to image constructor, required them to match the number of" \ - " channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names", - channels.size(), channel_names.size()) << std::endl; - - m_ChannelNames = {}; - } - - // Iterate all channels and start creating channels for it. - size_t channel_idx = 0; - for (const auto& _channel : channels) - { - try - { - // Generate the channel and append it. 
- m_Channels.push_back(compressed::channel( - _channel, - width, - height, - compression_codec, - comp_level_adjusted, - block_size, - chunk_size - )); - } - catch (const std::exception& e) - { - if (m_ChannelNames.size() > 0) - { - throw std::runtime_error( - std::format( - "Failed to insert channel '{}' at position {}. Full error: \n{}", - m_ChannelNames[channel_idx], - channel_idx, - e.what() - ) - ); - } - else - { - throw std::runtime_error( - std::format( - "Failed to insert channel at position {}. Full error: \n{}", - channel_idx, - e.what() - ) - ); - } - } - ++channel_idx; - } - } - - - /// Constructs an image object with the specified channels, dimensions, and optional compression parameters. - /// - /// This constructor creates an image from a given set of channels. The channel names can optionally be specified. - /// The image is then compressed using the provided codec and compression level. - /// - /// Example: - /// \code{.cpp} - /// std::vector> channels = ...; - /// compressed::image my_image(channels, 1920, 1080, {"r", "g", "b"}, codec::lz4, 5); - /// \endcode - /// - /// \param channels A vector of vectors containing the image channels (each channel is a 2D array of pixel data). - /// \param width The width of the image in pixels. - /// \param height The height of the image in pixels. - /// \param channel_names (Optional) A list of channel names, must match the number of channels provided. - /// If omitted or incorrect, channel names are ignored. - /// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`. - /// \param compression_level (Optional) The compression level, default is `9`. - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number. 
- /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. - /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - /// \throws std::runtime_error if a channel fails to be inserted. - image( - std::vector> channels, - size_t width, - size_t height, - std::vector channel_names = {}, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - m_Width = width; - m_Height = height; - m_ChannelNames = channel_names; - auto comp_level_adjusted = util::ensure_compression_level(compression_level); - - // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this. - assert(chunk_size < std::numeric_limits::max()); - assert(block_size < chunk_size); - if (channel_names.size() != channels.size() && channel_names.size() != 0) - { - std::cout << std::format( - "Invalid channelnames passed to image constructor, required them to match the number of" \ - " channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names", - channels.size(), channel_names.size()) << std::endl; - - m_ChannelNames = {}; - } - - // Iterate all channels and start creating channels for it. - size_t channel_idx = 0; - for (const auto& _channel : channels) - { - try - { - // Generate the channel and append it. 
- m_Channels.push_back(compressed::channel( - std::span(_channel.begin(), _channel.end()), - width, - height, - compression_codec, - comp_level_adjusted, - block_size, - chunk_size - )); - } - catch (const std::exception& e) - { - if (m_ChannelNames.size() > 0) - { - throw std::runtime_error( - std::format( - "Failed to insert channel '{}' at position {}. Full error: \n{}", - m_ChannelNames[channel_idx], - channel_idx, - e.what() - ) - ); - } - else - { - throw std::runtime_error( - std::format( - "Failed to insert channel at position {}. Full error: \n{}", - channel_idx, - e.what() - ) - ); - } - } - ++channel_idx; - } - } - - - /// Constructs an image object with the specified channels and dimensions, optionally passing channelnames. - /// - /// This constructor creates an image from a given set of channels. The channel names can optionally be specified. - /// The passed channels should already be compressed::channel instances. - /// - /// - /// \param channels A vector of compressed::channel instances to initialize the image with. - /// \param width The width of the image in pixels. - /// \param height The height of the image in pixels. - /// \param channel_names (Optional) A list of channel names, must match the number of channels provided. - /// If omitted or incorrect, channel names are ignored. - image( - std::vector> channels, - size_t width, - size_t height, - std::vector channel_names = {} - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - m_Width = width; - m_Height = height; - m_ChannelNames = channel_names; - - if (channel_names.size() != channels.size() && channel_names.size() != 0) - { - std::cout << std::format( - "Invalid channelnames passed to image constructor, required them to match the number of" \ - " channels in the channels parameter.Expected {} items but instead got {} names. 
Ignoring channel names", - channels.size(), channel_names.size()) << std::endl; - - m_ChannelNames = {}; - } - - size_t counter = 0; - for (auto& channel : channels) - { - if (channel.width() != width) - { - throw std::invalid_argument( - std::format( - "Invalid channel passed to compressed::image constructor at index {}. It's width does not" - " equal {} but instead is {}", - counter, width, channel.width() - ) - ); - } - if (channel.height() != height) - { - throw std::invalid_argument( - std::format( - "Invalid channel passed to compressed::image constructor at index {}. It's height does not" - " equal {} but instead is {}", - counter, height, channel.height() - ) - ); - } - - ++counter; - } - m_Channels = std::move(channels); - } + /// Compressed Image representation with easy access to different channels. Internally functions very similar to an NDArray + /// with the important distinction that the number of dimensions is fixed to be 3-Dimensional (width, height, channels). + /// They are laid out in scanline order with each channel being its own distinct object which may have any size. + /// + /// The image is stored in a non-resizable fashion so whatever the resolution was going into it, is what the image will be. + /// To rescale or refit the image a new `image` has to be constructed. + /// + /// The data is compressed in memory and we store it as part of a blosc2 super-chunk which is essentially a 3d array of + /// super-chunk -> chunk -> block. Where having the block size fit into L1 cache and the Chunk size into L3 cache is desirable + /// as each block can be handled by a single cpu core while the chunk fits well within shared L3 memory. 
+ template + struct image : public std::ranges::view_interface> + { + using value_type = T; + + image() = default; + image(image&&) = default; + image& operator=(image&&) = default; + image(const image&) = delete; + image& operator=(const image&) = delete; + ~image() = default; + + + /// Constructs an image object with the specified channels, dimensions, and optional compression parameters. + /// + /// This constructor creates an image from a given set of channels. The channel names can optionally be specified. + /// The image is then compressed using the provided codec and compression level. + /// + /// Example: + /// \code{.cpp} + /// std::vector> channels = ...; + /// compressed::image my_image(channels, 1920, 1080, {"r", "g", "b"}, codec::lz4, 5); + /// \endcode + /// + /// \param channels A vector of spans containing the image channels (each channel is a 2D array of pixel data). + /// on construction these will be compressed thus the data can be safely freed after this function. + /// \param width The width of the image in pixels. + /// \param height The height of the image in pixels. + /// \param channel_names (Optional) A list of channel names, must match the number of channels provided. + /// If omitted or incorrect, channel names are ignored. + /// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`. + /// \param compression_level (Optional) The compression level, default is `9`. + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number. + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
+ /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). + /// \throws std::runtime_error if a channel fails to be inserted. + image( + std::vector> channels, + size_t width, + size_t height, + const std::vector& channel_names = {}, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + m_Width = width; + m_Height = height; + m_ChannelNames = channel_names; + auto comp_level_adjusted = util::ensure_compression_level(compression_level); + + // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this. + assert(chunk_size < std::numeric_limits::max()); + assert(block_size < chunk_size); + if (channel_names.size() != channels.size() && channel_names.size() != 0) + { + std::cout << std::format( + "Invalid channelnames passed to image constructor, required them to match the number of" + " channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names", + channels.size(), + channel_names.size() + ) << std::endl; + + m_ChannelNames = {}; + } + + // Iterate all channels and start creating channels for it. + size_t channel_idx = 0; + for (const auto& _channel : channels) + { + try + { + // Generate the channel and append it. + m_Channels.push_back( + compressed::channel( + _channel, + width, + height, + compression_codec, + comp_level_adjusted, + block_size, + chunk_size + ) + ); + } + catch (const std::exception& e) + { + if (m_ChannelNames.size() > 0) + { + throw std::runtime_error( + std::format( + "Failed to insert channel '{}' at position {}. 
Full error: \n{}", + m_ChannelNames[channel_idx], + channel_idx, + e.what() + ) + ); + } + else + { + throw std::runtime_error( + std::format( + "Failed to insert channel at position {}. Full error: \n{}", + channel_idx, + e.what() + ) + ); + } + } + ++channel_idx; + } + } + + + /// Constructs an image object with the specified channels, dimensions, and optional compression parameters. + /// + /// This constructor creates an image from a given set of channels. The channel names can optionally be specified. + /// The image is then compressed using the provided codec and compression level. + /// + /// Example: + /// \code{.cpp} + /// std::vector> channels = ...; + /// compressed::image my_image(channels, 1920, 1080, {"r", "g", "b"}, codec::lz4, 5); + /// \endcode + /// + /// \param channels A vector of vectors containing the image channels (each channel is a 2D array of pixel data). + /// \param width The width of the image in pixels. + /// \param height The height of the image in pixels. + /// \param channel_names (Optional) A list of channel names, must match the number of channels provided. + /// If omitted or incorrect, channel names are ignored. + /// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`. + /// \param compression_level (Optional) The compression level, default is `9`. + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number. + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. + /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). 
+ /// \throws std::runtime_error if a channel fails to be inserted. + image( + std::vector> channels, + size_t width, + size_t height, + std::vector channel_names = {}, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + m_Width = width; + m_Height = height; + m_ChannelNames = channel_names; + auto comp_level_adjusted = util::ensure_compression_level(compression_level); + + // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this. + assert(chunk_size < std::numeric_limits::max()); + assert(block_size < chunk_size); + if (channel_names.size() != channels.size() && channel_names.size() != 0) + { + std::cout << std::format( + "Invalid channelnames passed to image constructor, required them to match the number of" + " channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names", + channels.size(), + channel_names.size() + ) << std::endl; + + m_ChannelNames = {}; + } + + // Iterate all channels and start creating channels for it. + size_t channel_idx = 0; + for (const auto& _channel : channels) + { + try + { + // Generate the channel and append it. + m_Channels.push_back( + compressed::channel( + std::span(_channel.begin(), _channel.end()), + width, + height, + compression_codec, + comp_level_adjusted, + block_size, + chunk_size + ) + ); + } + catch (const std::exception& e) + { + if (m_ChannelNames.size() > 0) + { + throw std::runtime_error( + std::format( + "Failed to insert channel '{}' at position {}. Full error: \n{}", + m_ChannelNames[channel_idx], + channel_idx, + e.what() + ) + ); + } + else + { + throw std::runtime_error( + std::format( + "Failed to insert channel at position {}. 
Full error: \n{}", + channel_idx, + e.what() + ) + ); + } + } + ++channel_idx; + } + } + + + /// Constructs an image object with the specified channels and dimensions, optionally passing channelnames. + /// + /// This constructor creates an image from a given set of channels. The channel names can optionally be specified. + /// The passed channels should already be compressed::channel instances. + /// + /// + /// \param channels A vector of compressed::channel instances to initialize the image with. + /// \param width The width of the image in pixels. + /// \param height The height of the image in pixels. + /// \param channel_names (Optional) A list of channel names, must match the number of channels provided. + /// If omitted or incorrect, channel names are ignored. + image( + std::vector> channels, + size_t width, + size_t height, + std::vector channel_names = {} + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + m_Width = width; + m_Height = height; + m_ChannelNames = channel_names; + + if (channel_names.size() != channels.size() && channel_names.size() != 0) + { + std::cout << std::format( + "Invalid channelnames passed to image constructor, required them to match the number of" + " channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names", + channels.size(), + channel_names.size() + ) << std::endl; + + m_ChannelNames = {}; + } + + size_t counter = 0; + for (auto& channel : channels) + { + if (channel.width() != width) + { + throw std::invalid_argument( + std::format( + "Invalid channel passed to compressed::image constructor at index {}. It's width does not" + " equal {} but instead is {}", + counter, + width, + channel.width() + ) + ); + } + if (channel.height() != height) + { + throw std::invalid_argument( + std::format( + "Invalid channel passed to compressed::image constructor at index {}. 
It's height does not" + " equal {} but instead is {}", + counter, + height, + channel.height() + ) + ); + } + + ++counter; + } + m_Channels = std::move(channels); + } #ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE - /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. - /// - /// Requires CompressedImage to have been compiled with OpenImageIO support. - /// - /// This function reads an image file in chunks and compresses it on the fly leading to much - /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image - /// that is well compressible this can easily achieve a compression ratio of 5-10x. - /// - /// The type does not have to match that of the underlying image as OpenImageIO will take - /// care of converting the files into the specified format. It is perfectly valid to read - /// a floating point image as e.g. uint16_t etc. - /// - /// Example: - /// \code{.cpp} - /// std::filesystem::path filepath = "image.exr"; - /// auto img = compressed::image::read(filepath, 0, compressed::enums::codec::lz4, 5); - /// \endcode - /// - /// \param filepath The file path of the image to read. - /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. - /// \param compression_codec The compression codec to use (default: LZ4). - /// \param compression_level The compression level (default: 9). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number. - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
- /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - /// \return A compressed image instance. - static image read( - std::filesystem::path filepath, - int subimage = 0, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - // Initialize the OIIO primitives - auto input_ptr = OIIO::ImageInput::open(filepath); - if (!input_ptr) - { - throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string())); - } - - // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent. - auto res = input_ptr->seek_subimage(subimage, 0); - if (!res) - { - throw std::invalid_argument( - std::format( - "File '{}' does not have a subimage {}, cannot seek to it", filepath.string(), subimage - ) - ); - } - const OIIO::ImageSpec& spec = input_ptr->spec(); - - return image::read( - std::move(input_ptr), - spec.channelnames, - subimage, - compression_codec, - compression_level, - block_size, - chunk_size - ); - } - - /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. - /// - /// Requires CompressedImage to have been compiled with OpenImageIO support. - /// - /// This function reads an image file in chunks and compresses it on the fly leading to much - /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image - /// that is well compressible this can easily achieve a compression ratio of 5-10x. - /// - /// The type does not have to match that of the underlying image as OpenImageIO will take - /// care of converting the files into the specified format. It is perfectly valid to read - /// a floating point image as e.g. uint16_t etc. 
- /// - /// This overload allows you to specify a custom invocable function which is executed after a chunk has been read - /// and before it is compressed. If you have some common operations like color management or a filter which you - /// wish to apply this would go in here. - /// Specifying these right away in the read is much more efficient than iterating over the image again later and - /// applying these. - /// - /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block - /// of data (that is part of an image) as well as the channel index we are currently operating over. - /// - /// Example: - /// \code{.cpp} - /// std::filesystem::path filepath = "image.exr"; - /// - /// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2). - /// - /// auto postprocess = [](size_t channel_idx, std::span chunk) - /// { - /// if (channel_idx > 2) - /// { - /// return; - /// } - /// - /// std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value) - /// { - /// value += 1; - /// } - /// }; - /// - /// auto img = compressed::image::read( - /// filepath, - /// std::forward(postprocess), - /// 0, // subimage - /// compressed::enums::codec::lz4, // compression_code - /// 5 // compression_level - /// ); - /// \endcode - /// - /// \param filepath The file path of the image to read. - /// \param postprocess A postprocessing function to run after read but before re-compression. This function should - /// take a `size_t` and a `std::span` where the `size_t` is the channel index we are currently - /// iterating over (e.g. 3 for the alpha channel) and the `std::span` is a chunk within that - /// channel, where this chunk is and what coordinates it represents is not passed along. - /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. 
- /// \param compression_codec The compression codec to use (default: LZ4). - /// \param compression_level The compression level (default: 9). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number. - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. - /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - /// \return A compressed image instance. - template - requires std::invocable, size_t, std::span> - static image read( - std::filesystem::path filepath, - PostProcess&& postprocess, - int subimage = 0, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - // Initialize the OIIO primitives - auto input_ptr = OIIO::ImageInput::open(filepath); - if (!input_ptr) - { - throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string())); - } - - // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent. 
- auto res = input_ptr->seek_subimage(subimage, 0); - if (!res) - { - throw std::invalid_argument( - std::format( - "File '{}' does not have a subimage {}, cannot seek to it", filepath.string(), subimage - ) - ); - } - const OIIO::ImageSpec& spec = input_ptr->spec(); - - return image::read( - std::move(input_ptr), - std::forward(postprocess), - spec.channelnames, - subimage, - compression_codec, - compression_level, - block_size, - chunk_size - ); - } - - /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. - /// - /// Requires CompressedImage to have been compiled with OpenImageIO support. - /// - /// This function reads an image file in chunks and compresses it on the fly leading to much - /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image - /// that is well compressible this can easily achieve a compression ratio of 5-10x. - /// - /// The type does not have to match that of the underlying image as OpenImageIO will take - /// care of converting the files into the specified format. It is perfectly valid to read - /// a floating point image as e.g. uint16_t etc. - /// - /// This overload allows you to only extract the channels specified which is useful if you have e.g. - /// a multilayer file but only wish to extract the RGBA components. - /// - /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput. - /// - /// Example: - /// \code{.cpp} - /// std::filesystem::path filepath = "image.exr"; - /// - /// auto input_ptr = OIIO::ImageInput::open(filepath); - /// if (!input_ptr) - /// { - /// throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string())); - /// } - /// - /// auto img = compressed::image::read(input_ptr, {0, 1, 2, 3}); - /// \endcode - /// - /// \param input_ptr The opened OIIO input pointer. - /// \param channelindices The channels you wish to extract. These may be specified in any order. 
We throw a - /// std::out_of_range if one of the passed channels does not exist. It is perfectly valid - /// to e.g. call this with {3, 1, 2} when the underlying channel structure may be - /// RGBA. Sorting these back into their underlying channel structure is done on read. - /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. - /// \param compression_codec The compression codec to use (default: LZ4). - /// \param compression_level The compression level (default: 9). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number. - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. - /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - /// \return A compressed image instance. - static image read( - std::unique_ptr input_ptr, - std::vector channelindices, - int subimage = 0, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - std::vector channelnames{}; - - // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent. 
- auto res = input_ptr->seek_subimage(subimage, 0); - if (!res) - { - throw std::invalid_argument( - std::format( - "File does not have a subimage {}, cannot seek to it", subimage - ) - ); - } - const auto& spec = input_ptr->spec(); - - for (int i : channelindices) - { - channelnames.push_back(spec.channelnames.at(i)); - } - - return image::read( - std::move(input_ptr), - std::move(channelnames), - subimage, - compression_codec, - compression_level, - block_size, - chunk_size - ); - } - - /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. - /// - /// Requires CompressedImage to have been compiled with OpenImageIO support. - /// - /// This function reads an image file in chunks and compresses it on the fly leading to much - /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image - /// that is well compressible this can easily achieve a compression ratio of 5-10x. - /// - /// The type does not have to match that of the underlying image as OpenImageIO will take - /// care of converting the files into the specified format. It is perfectly valid to read - /// a floating point image as e.g. uint16_t etc. - /// - /// This overload allows you to only extract the channels specified which is useful if you have e.g. - /// a multilayer file but only wish to extract the RGBA components. - /// - /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput. - /// - /// This function allows you to specify a custom invocable function which is executed after a chunk has been read - /// and before it is compressed. If you have some common operations like color management or a filter which you - /// wish to apply this would go in here. - /// Specifying these right away in the read is much more efficient than iterating over the image again later and - /// applying these. 
- /// - /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block - /// of data (that is part of an image) as well as the channel index we are currently operating over. - /// - /// Example: - /// \code{.cpp} - /// std::filesystem::path filepath = "image.exr"; - /// - /// auto input_ptr = OIIO::ImageInput::open(filepath); - /// if (!input_ptr) - /// { - /// throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string())); - /// } - /// - /// auto postprocess = [](size_t channel_idx, std::span chunk) - /// { - /// if (channel_idx > 2) - /// { - /// return; - /// } - /// - /// std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value) - /// { - /// value += 1; - /// } - /// }; - /// - /// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2). - /// auto img = compressed::image::read( - /// std::move(input_ptr), - /// std::forward(postprocess), - /// { 0, 1, 2, 3}, // only read the RGBA channels - /// 0, // subimage - /// compressed::enums::codec::lz4, - /// 5 - /// ); - /// \endcode - /// - /// \param input_ptr The opened OIIO input pointer. - /// \param postprocess A postprocessing function to run after read but before re-compression. This function should - /// take a `size_t` and a `std::span` where the `size_t` is the channel index we are currently - /// iterating over (e.g. 3 for the alpha channel) and the `std::span` is a chunk within that - /// channel, where this chunk is and what coordinates it represents is not passed along. - /// \param channelindices The channels you wish to extract. These may be specified in any order. We throw a - /// std::out_of_range if one of the passed channels does not exist. It is perfectly valid - /// to e.g. call this with {3, 1, 2} when the underlying channel structure may be - /// RGBA. Sorting these back into their underlying channel structure is done on read. 
- /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. - /// \param compression_codec The compression codec to use (default: LZ4). - /// \param compression_level The compression level (default: 9). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number. - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. - /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - /// \return A compressed image instance. - template - requires std::invocable, size_t, std::span> - static image read( - std::unique_ptr input_ptr, - PostProcess&& postprocess, - std::vector channelindices, - int subimage = 0, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - std::vector channelnames{}; - - // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent. 
- auto res = input_ptr->seek_subimage(subimage, 0); - if (!res) - { - throw std::invalid_argument( - std::format( - "File does not have a subimage {}, cannot seek to it", subimage - ) - ); - } - const auto& spec = input_ptr->spec(); - - for (int i : channelindices) - { - channelnames.push_back(spec.channelnames.at(i)); - } - - return image::read( - std::move(input_ptr), - std::forward(postprocess), - subimage, - std::move(channelnames), - compression_codec, - compression_level, - block_size, - chunk_size - ); - } - - - /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. - /// - /// Requires CompressedImage to have been compiled with OpenImageIO support. - /// - /// This function reads an image file in chunks and compresses it on the fly leading to much - /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image - /// that is well compressible this can easily achieve a compression ratio of 5-10x. - /// - /// The type does not have to match that of the underlying image as OpenImageIO will take - /// care of converting the files into the specified format. It is perfectly valid to read - /// a floating point image as e.g. uint16_t etc. - /// - /// This overload allows you to only extract the channels specified which is useful if you have e.g. - /// a multilayer file but only wish to extract the RGBA components. - /// - /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput. - /// - /// Example: - /// \code{.cpp} - /// std::filesystem::path filepath = "image.exr"; - /// - /// auto input_ptr = OIIO::ImageInput::open(filepath); - /// if (!input_ptr) - /// { - /// throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string())); - /// } - /// - /// auto img = compressed::image::read(std::move(input_ptr), {"R", "G", "B", "A"}); - /// \endcode - /// - /// \param input_ptr The opened OIIO input pointer. 
- /// \param channelnames The channels you wish to extract. These may be specified in any order. We throw a - /// std::out_of_range if one of the passed channels does not exist. It is perfectly valid - /// to e.g. call this with {"G", "R", "A"} when the underlying channel structure may be - /// RGBA. Sorting these back into their underlying channel structure is done on read. - /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. - /// \param compression_codec The compression codec to use (default: LZ4). - /// \param compression_level The compression level (default: 9). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number. - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. - /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - /// \return A compressed image instance. - static image read( - std::unique_ptr input_ptr, - std::vector channelnames, - int subimage = 0, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - return image::read_impl( - std::move(input_ptr), - std::move(channelnames), - std::nullopt, - subimage, - compression_codec, - compression_level, - block_size, - chunk_size - ); - } - - - /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. - /// - /// Requires CompressedImage to have been compiled with OpenImageIO support. 
- /// - /// This function reads an image file in chunks and compresses it on the fly leading to much - /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image - /// that is well compressible this can easily achieve a compression ratio of 5-10x. - /// - /// The type does not have to match that of the underlying image as OpenImageIO will take - /// care of converting the files into the specified format. It is perfectly valid to read - /// a floating point image as e.g. uint16_t etc. - /// - /// This overload allows you to only extract the channels specified which is useful if you have e.g. - /// a multilayer file but only wish to extract the RGBA components. - /// - /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput. - /// - /// This function allows you to specify a custom invocable function which is executed after a chunk has been read - /// and before it is compressed. If you have some common operations like color management or a filter which you - /// wish to apply this would go in here. - /// Specifying these right away in the read is much more efficient than iterating over the image again later and - /// applying these. - /// - /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block - /// of data (that is part of an image) as well as the channel index we are currently operating over. 
- /// - /// - /// Example: - /// \code{.cpp} - /// std::filesystem::path filepath = "image.exr"; - /// - /// auto input_ptr = OIIO::ImageInput::open(filepath); - /// if (!input_ptr) - /// { - /// throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string())); - /// } - /// - /// auto postprocess = [](size_t channel_idx, std::span chunk) - /// { - /// if (channel_idx > 2) - /// { - /// return; - /// } - /// - /// std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value) - /// { - /// value += 1; - /// } - /// }; - /// - /// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2). - /// auto img = compressed::image::read( - /// std::move(input_ptr), - /// std::forward(postprocess), - /// { 0, 1, 2, 3}, // only read the RGBA channels - /// 0, // subimage - /// compressed::enums::codec::lz4, - /// 5 - /// ); - /// \endcode - /// - /// \param input_ptr The opened OIIO input pointer. - /// \param postprocess A postprocessing function to run after read but before re-compression. This function should - /// take a `size_t` and a `std::span` where the `size_t` is the channel index we are currently - /// iterating over (e.g. 3 for the alpha channel) and the `std::span` is a chunk within that - /// channel, where this chunk is and what coordinates it represents is not passed along. - /// \param channelnames The channels you wish to extract. These may be specified in any order. We throw a - /// std::out_of_range if one of the passed channels does not exist. It is perfectly valid - /// to e.g. call this with {"G", "R", "A"} when the underlying channel structure may be - /// RGBA. Sorting these back into their underlying channel structure is done on read. - /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. - /// \param compression_codec The compression codec to use (default: LZ4). 
- /// \param compression_level The compression level (default: 9). - /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to - /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle - /// larger blocks feel free to up this number. - /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. - /// This should be tweaked to be no larger than the size of the usual images you are expecting - /// to compress for optimal performance but this could be upped which might give better compression - /// ratios. Must be a multiple of sizeof(T). - /// \return A compressed image instance. - template - requires std::invocable, size_t, std::span> - static image read( - std::unique_ptr input_ptr, - PostProcess&& postprocess, - std::vector channelnames, - int subimage = 0, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - return image::read_impl( - std::move(input_ptr), - std::move(channelnames), - std::forward(postprocess), - subimage, - compression_codec, - compression_level, - block_size, - chunk_size - ); - } - - - /// \brief Read the metadata from the openimageio pointer into a json representation - /// \param input_ptr The input file to query - /// \return The metadata encoded as json. This does not recursively parse jsons! - static json_ordered read_oiio_metadata(const OIIO::ImageSpec& spec) - { - return detail::param_value::to_json(spec.extra_attribs); - } - - /// \brief Read the metadata from the file into a json representation - /// \param input_ptr The input file to query - /// - /// \throws std::invalid_argument if the file does not exist on disk. - /// - /// \return The metadata encoded as json. This does not recursively parse jsons! 
- static json_ordered read_oiio_metadata(std::filesystem::path filepath) - { - // Initialize the OIIO primitives - auto input_ptr = OIIO::ImageInput::open(filepath); - if (!input_ptr) - { - throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string())); - } - - return detail::param_value::to_json(input_ptr->spec().extra_attribs); - } + /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. + /// + /// Requires CompressedImage to have been compiled with OpenImageIO support. + /// + /// This function reads an image file in chunks and compresses it on the fly leading to much + /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image + /// that is well compressible this can easily achieve a compression ratio of 5-10x. + /// + /// The type does not have to match that of the underlying image as OpenImageIO will take + /// care of converting the files into the specified format. It is perfectly valid to read + /// a floating point image as e.g. uint16_t etc. + /// + /// Example: + /// \code{.cpp} + /// std::filesystem::path filepath = "image.exr"; + /// auto img = compressed::image::read(filepath, 0, compressed::enums::codec::lz4, 5); + /// \endcode + /// + /// \param filepath The file path of the image to read. + /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. + /// \param compression_codec The compression codec to use (default: LZ4). + /// \param compression_level The compression level (default: 9). + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number. + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
+ /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). + /// \return A compressed image instance. + static image read( + std::filesystem::path filepath, + int subimage = 0, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + // Initialize the OIIO primitives + auto input_ptr = OIIO::ImageInput::open(filepath); + if (!input_ptr) + { + throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string())); + } + + // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent. + auto res = input_ptr->seek_subimage(subimage, 0); + if (!res) + { + throw std::invalid_argument( + std::format( + "File '{}' does not have a subimage {}, cannot seek to it", + filepath.string(), + subimage + ) + ); + } + const OIIO::ImageSpec& spec = input_ptr->spec(); + + return image::read( + std::move(input_ptr), + spec.channelnames, + subimage, + compression_codec, + compression_level, + block_size, + chunk_size + ); + } + + /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. + /// + /// Requires CompressedImage to have been compiled with OpenImageIO support. + /// + /// This function reads an image file in chunks and compresses it on the fly leading to much + /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image + /// that is well compressible this can easily achieve a compression ratio of 5-10x. + /// + /// The type does not have to match that of the underlying image as OpenImageIO will take + /// care of converting the files into the specified format. It is perfectly valid to read + /// a floating point image as e.g. uint16_t etc. 
+ /// + /// This overload allows you to specify a custom invocable function which is executed after a chunk has been read + /// and before it is compressed. If you have some common operations like color management or a filter which you + /// wish to apply this would go in here. + /// Specifying these right away in the read is much more efficient than iterating over the image again later and + /// applying these. + /// + /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block + /// of data (that is part of an image) as well as the channel index we are currently operating over. + /// + /// Example: + /// \code{.cpp} + /// std::filesystem::path filepath = "image.exr"; + /// + /// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2). + /// + /// auto postprocess = [](size_t channel_idx, std::span chunk) + /// { + /// if (channel_idx > 2) + /// { + /// return; + /// } + /// + /// std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value) + /// { + /// value += 1; + /// } + /// }; + /// + /// auto img = compressed::image::read( + /// filepath, + /// std::forward(postprocess), + /// 0, // subimage + /// compressed::enums::codec::lz4, // compression_code + /// 5 // compression_level + /// ); + /// \endcode + /// + /// \param filepath The file path of the image to read. + /// \param postprocess A postprocessing function to run after read but before re-compression. This function should + /// take a `size_t` and a `std::span` where the `size_t` is the channel index we are currently + /// iterating over (e.g. 3 for the alpha channel) and the `std::span` is a chunk within that + /// channel, where this chunk is and what coordinates it represents is not passed along. + /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. 
+ /// \param compression_codec The compression codec to use (default: LZ4). + /// \param compression_level The compression level (default: 9). + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number. + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. + /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). + /// \return A compressed image instance. + template + requires std::invocable, size_t, std::span> + static image read( + std::filesystem::path filepath, + PostProcess&& postprocess, + int subimage = 0, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + // Initialize the OIIO primitives + auto input_ptr = OIIO::ImageInput::open(filepath); + if (!input_ptr) + { + throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string())); + } + + // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent. 
+ auto res = input_ptr->seek_subimage(subimage, 0); + if (!res) + { + throw std::invalid_argument( + std::format( + "File '{}' does not have a subimage {}, cannot seek to it", + filepath.string(), + subimage + ) + ); + } + const OIIO::ImageSpec& spec = input_ptr->spec(); + + return image::read( + std::move(input_ptr), + std::forward(postprocess), + spec.channelnames, + subimage, + compression_codec, + compression_level, + block_size, + chunk_size + ); + } + + /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. + /// + /// Requires CompressedImage to have been compiled with OpenImageIO support. + /// + /// This function reads an image file in chunks and compresses it on the fly leading to much + /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image + /// that is well compressible this can easily achieve a compression ratio of 5-10x. + /// + /// The type does not have to match that of the underlying image as OpenImageIO will take + /// care of converting the files into the specified format. It is perfectly valid to read + /// a floating point image as e.g. uint16_t etc. + /// + /// This overload allows you to only extract the channels specified which is useful if you have e.g. + /// a multilayer file but only wish to extract the RGBA components. + /// + /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput. + /// + /// Example: + /// \code{.cpp} + /// std::filesystem::path filepath = "image.exr"; + /// + /// auto input_ptr = OIIO::ImageInput::open(filepath); + /// if (!input_ptr) + /// { + /// throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string())); + /// } + /// + /// auto img = compressed::image::read(input_ptr, {0, 1, 2, 3}); + /// \endcode + /// + /// \param input_ptr The opened OIIO input pointer. + /// \param channelindices The channels you wish to extract. 
These may be specified in any order. We throw a + /// std::out_of_range if one of the passed channels does not exist. It is perfectly valid + /// to e.g. call this with {3, 1, 2} when the underlying channel structure may be + /// RGBA. Sorting these back into their underlying channel structure is done on read. + /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. + /// \param compression_codec The compression codec to use (default: LZ4). + /// \param compression_level The compression level (default: 9). + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number. + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. + /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). + /// \return A compressed image instance. + static image read( + std::unique_ptr input_ptr, + std::vector channelindices, + int subimage = 0, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + std::vector channelnames{}; + + // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent. 
+ auto res = input_ptr->seek_subimage(subimage, 0); + if (!res) + { + throw std::invalid_argument( + std::format( + "File does not have a subimage {}, cannot seek to it", + subimage + ) + ); + } + const auto& spec = input_ptr->spec(); + + for (int i : channelindices) + { + channelnames.push_back(spec.channelnames.at(i)); + } + + return image::read( + std::move(input_ptr), + std::move(channelnames), + subimage, + compression_codec, + compression_level, + block_size, + chunk_size + ); + } + + /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. + /// + /// Requires CompressedImage to have been compiled with OpenImageIO support. + /// + /// This function reads an image file in chunks and compresses it on the fly leading to much + /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image + /// that is well compressible this can easily achieve a compression ratio of 5-10x. + /// + /// The type does not have to match that of the underlying image as OpenImageIO will take + /// care of converting the files into the specified format. It is perfectly valid to read + /// a floating point image as e.g. uint16_t etc. + /// + /// This overload allows you to only extract the channels specified which is useful if you have e.g. + /// a multilayer file but only wish to extract the RGBA components. + /// + /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput. + /// + /// This function allows you to specify a custom invocable function which is executed after a chunk has been read + /// and before it is compressed. If you have some common operations like color management or a filter which you + /// wish to apply this would go in here. + /// Specifying these right away in the read is much more efficient than iterating over the image again later and + /// applying these. 
+ /// + /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block + /// of data (that is part of an image) as well as the channel index we are currently operating over. + /// + /// Example: + /// \code{.cpp} + /// std::filesystem::path filepath = "image.exr"; + /// + /// auto input_ptr = OIIO::ImageInput::open(filepath); + /// if (!input_ptr) + /// { + /// throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string())); + /// } + /// + /// auto postprocess = [](size_t channel_idx, std::span chunk) + /// { + /// if (channel_idx > 2) + /// { + /// return; + /// } + /// + /// std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value) + /// { + /// value += 1; + /// } + /// }; + /// + /// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2). + /// auto img = compressed::image::read( + /// std::move(input_ptr), + /// std::forward(postprocess), + /// { 0, 1, 2, 3}, // only read the RGBA channels + /// 0, // subimage + /// compressed::enums::codec::lz4, + /// 5 + /// ); + /// \endcode + /// + /// \param input_ptr The opened OIIO input pointer. + /// \param postprocess A postprocessing function to run after read but before re-compression. This function should + /// take a `size_t` and a `std::span` where the `size_t` is the channel index we are currently + /// iterating over (e.g. 3 for the alpha channel) and the `std::span` is a chunk within that + /// channel, where this chunk is and what coordinates it represents is not passed along. + /// \param channelindices The channels you wish to extract. These may be specified in any order. We throw a + /// std::out_of_range if one of the passed channels does not exist. It is perfectly valid + /// to e.g. call this with {3, 1, 2} when the underlying channel structure may be + /// RGBA. Sorting these back into their underlying channel structure is done on read. 
+ /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. + /// \param compression_codec The compression codec to use (default: LZ4). + /// \param compression_level The compression level (default: 9). + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number. + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. + /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). + /// \return A compressed image instance. + template + requires std::invocable, size_t, std::span> + static image read( + std::unique_ptr input_ptr, + PostProcess&& postprocess, + std::vector channelindices, + int subimage = 0, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + std::vector channelnames{}; + + // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent. 
+ auto res = input_ptr->seek_subimage(subimage, 0); + if (!res) + { + throw std::invalid_argument( + std::format( + "File does not have a subimage {}, cannot seek to it", + subimage + ) + ); + } + const auto& spec = input_ptr->spec(); + + for (int i : channelindices) + { + channelnames.push_back(spec.channelnames.at(i)); + } + + return image::read( + std::move(input_ptr), + std::forward(postprocess), + subimage, + std::move(channelnames), + compression_codec, + compression_level, + block_size, + chunk_size + ); + } + + + /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. + /// + /// Requires CompressedImage to have been compiled with OpenImageIO support. + /// + /// This function reads an image file in chunks and compresses it on the fly leading to much + /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image + /// that is well compressible this can easily achieve a compression ratio of 5-10x. + /// + /// The type does not have to match that of the underlying image as OpenImageIO will take + /// care of converting the files into the specified format. It is perfectly valid to read + /// a floating point image as e.g. uint16_t etc. + /// + /// This overload allows you to only extract the channels specified which is useful if you have e.g. + /// a multilayer file but only wish to extract the RGBA components. + /// + /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput. + /// + /// Example: + /// \code{.cpp} + /// std::filesystem::path filepath = "image.exr"; + /// + /// auto input_ptr = OIIO::ImageInput::open(filepath); + /// if (!input_ptr) + /// { + /// throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string())); + /// } + /// + /// auto img = compressed::image::read(std::move(input_ptr), {"R", "G", "B", "A"}); + /// \endcode + /// + /// \param input_ptr The opened OIIO input pointer. 
+ /// \param channelnames The channels you wish to extract. These may be specified in any order. We throw a + /// std::out_of_range if one of the passed channels does not exist. It is perfectly valid + /// to e.g. call this with {"G", "R", "A"} when the underlying channel structure may be + /// RGBA. Sorting these back into their underlying channel structure is done on read. + /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. + /// \param compression_codec The compression codec to use (default: LZ4). + /// \param compression_level The compression level (default: 9). + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number. + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. + /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). + /// \return A compressed image instance. + static image read( + std::unique_ptr input_ptr, + std::vector channelnames, + int subimage = 0, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + return image::read_impl( + std::move(input_ptr), + std::move(channelnames), + std::nullopt, + subimage, + compression_codec, + compression_level, + block_size, + chunk_size + ); + } + + + /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading. + /// + /// Requires CompressedImage to have been compiled with OpenImageIO support. 
+ /// + /// This function reads an image file in chunks and compresses it on the fly leading to much + /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image + /// that is well compressible this can easily achieve a compression ratio of 5-10x. + /// + /// The type does not have to match that of the underlying image as OpenImageIO will take + /// care of converting the files into the specified format. It is perfectly valid to read + /// a floating point image as e.g. uint16_t etc. + /// + /// This overload allows you to only extract the channels specified which is useful if you have e.g. + /// a multilayer file but only wish to extract the RGBA components. + /// + /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput. + /// + /// This function allows you to specify a custom invocable function which is executed after a chunk has been read + /// and before it is compressed. If you have some common operations like color management or a filter which you + /// wish to apply this would go in here. + /// Specifying these right away in the read is much more efficient than iterating over the image again later and + /// applying these. + /// + /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block + /// of data (that is part of an image) as well as the channel index we are currently operating over. 
+ /// + /// + /// Example: + /// \code{.cpp} + /// std::filesystem::path filepath = "image.exr"; + /// + /// auto input_ptr = OIIO::ImageInput::open(filepath); + /// if (!input_ptr) + /// { + /// throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string())); + /// } + /// + /// auto postprocess = [](size_t channel_idx, std::span chunk) + /// { + /// if (channel_idx > 2) + /// { + /// return; + /// } + /// + /// std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value) + /// { + /// value += 1; + /// } + /// }; + /// + /// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2). + /// auto img = compressed::image::read( + /// std::move(input_ptr), + /// std::forward(postprocess), + /// { 0, 1, 2, 3}, // only read the RGBA channels + /// 0, // subimage + /// compressed::enums::codec::lz4, + /// 5 + /// ); + /// \endcode + /// + /// \param input_ptr The opened OIIO input pointer. + /// \param postprocess A postprocessing function to run after read but before re-compression. This function should + /// take a `size_t` and a `std::span` where the `size_t` is the channel index we are currently + /// iterating over (e.g. 3 for the alpha channel) and the `std::span` is a chunk within that + /// channel, where this chunk is and what coordinates it represents is not passed along. + /// \param channelnames The channels you wish to extract. These may be specified in any order. We throw a + /// std::out_of_range if one of the passed channels does not exist. It is perfectly valid + /// to e.g. call this with {"G", "R", "A"} when the underlying channel structure may be + /// RGBA. Sorting these back into their underlying channel structure is done on read. + /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images. + /// \param compression_codec The compression codec to use (default: LZ4). 
+ /// \param compression_level The compression level (default: 9). + /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to + /// comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle + /// larger blocks feel free to up this number. + /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. + /// This should be tweaked to be no larger than the size of the usual images you are expecting + /// to compress for optimal performance but this could be upped which might give better compression + /// ratios. Must be a multiple of sizeof(T). + /// \return A compressed image instance. + template + requires std::invocable, size_t, std::span> + static image read( + std::unique_ptr input_ptr, + PostProcess&& postprocess, + std::vector channelnames, + int subimage = 0, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + return image::read_impl( + std::move(input_ptr), + std::move(channelnames), + std::forward(postprocess), + subimage, + compression_codec, + compression_level, + block_size, + chunk_size + ); + } + + + /// \brief Read the metadata from the openimageio pointer into a json representation + /// \return The metadata encoded as json. This does not recursively parse jsons! + static json_ordered read_oiio_metadata(const OIIO::ImageSpec& spec) + { + return detail::param_value::to_json(spec.extra_attribs); + } + + /// \brief Read the metadata from the file into a json representation + /// + /// \throws std::invalid_argument if the file does not exist on disk. + /// + /// \return The metadata encoded as json. This does not recursively parse jsons! 
+ static json_ordered read_oiio_metadata(std::filesystem::path filepath) + { + // Initialize the OIIO primitives + auto input_ptr = OIIO::ImageInput::open(filepath); + if (!input_ptr) + { + throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string())); + } + + return detail::param_value::to_json(input_ptr->spec().extra_attribs); + } #endif // COMPRESSED_IMAGE_OIIO_AVAILABLE - /// Adds a compressed channel to the image. - /// - /// This method moves the provided channel into the image's internal storage, adding it to the list of channels. - /// - /// Example: - /// \code{.cpp} - /// compressed::channel channel = ...; - /// my_image.add_channel(std::move(channel)); - /// \endcode - /// - /// \param _channel The channel to be added to the image. - /// \param name (Optional) Channel name of the channel to be inserted. If no channel names are set this argument is ignored. - void add_channel(compressed::channel _channel, std::optional name = std::nullopt) - { - if (_channel.width() != this->width()) - { - throw std::invalid_argument( - std::format( - "Cannot add channel '{}' to the image as its width does not match that of the image." - " Expected {:L} pixels but instead got {:L} pixels", - name.value_or(""), - this->width(), _channel.width() - ) - ); - } - if (_channel.height() != this->height()) - { - throw std::invalid_argument( - std::format( - "Cannot add channel '{}' to the image as its height does not match that of the image." - " Expected {:L} pixels but instead got {:L} pixels", - name.value_or(""), - this->height(), _channel.height() - ) - ); - } - - if (name.has_value() && m_ChannelNames.size() == m_Channels.size()) - { - m_ChannelNames.push_back(name.value()); - } - else if (m_ChannelNames.size() > 0) - { - m_ChannelNames.push_back(name.value_or("")); - } - - m_Channels.push_back(std::move(_channel)); - } - - /// Adds a channel to the image. 
- /// - /// This method moves the provided channel into the image's internal storage, compressing it and adding it to the list of channels. - /// - /// Example: - /// \code{.cpp} - /// std::span channel = ...; - /// my_image.add_channel(channel, 1920, 1080, "red")); - /// \endcode - /// - /// \param data The channel to be added to the image. - /// \param width The width of the channel - /// \param height The height of the channel - /// \param name (Optional) Channel name of the channel to be inserted. If no channel names are set this argument is ignored. - /// \param compression_codec (Optional) Compression codec to apply to the channel, every channel is allowed to have a different one. - /// \param compression_level (Optional) Compression level, defaults to 5. - void add_channel( - std::span data, - size_t width, - size_t height, - std::optional name = std::nullopt, - enums::codec compression_codec = enums::codec::lz4, - uint8_t compression_level = 5 - ) - { - if (width != this->width()) - { - throw std::invalid_argument( - std::format( - "Cannot add channel '{}' to the image as its width does not match that of the image." - " Expected {:L} pixels but instead got {:L} pixels", - name.value_or(""), - width, this->width() - ) - ); - } - if (height != this->height()) - { - throw std::invalid_argument( - std::format( - "Cannot add channel '{}' to the image as its height does not match that of the image." - " Expected {:L} pixels but instead got {:L} pixels", - name.value_or(""), - height, this->height() - ) - ); - } - - if (name.has_value() && m_ChannelNames.size() == m_Channels.size()) - { - m_ChannelNames.push_back(name.value()); - } - else if (m_ChannelNames.size() > 0) - { - m_ChannelNames.push_back(name.value_or("")); - } - - m_Channels.push_back(compressed::channel( - std::span(data.begin(), data.end()), - width, - height, - compression_codec, - compression_level - )); - } - - - /// Remove a channel by its index. 
- /// - /// \param index The index of the channel to remove. - /// \throws std::out_of_range if the index is out of bounds. - void remove_channel(size_t index) - { - // Extract the channel and let it exit the scope to destruct - auto channel = this->extract_channel(index); - } - - /// Remove a channel by its name. - /// - /// \param name The name of the channel to remove. - /// \throws std::out_of_range if the channel name is invalid. - void remove_channel(const std::string_view name) - { - // Extract the channel and let it exit the scope to destruct - auto channel = this->extract_channel(name); - } - - /// Extracts a channel by its index. - /// - /// Remove the channel from the image and gives you full control over the channel. Also erases - /// its channel name. - /// - /// \param index The index of the channel to retrieve. - /// \return The channel object. - /// \throws std::out_of_range if the index is out of bounds. - compressed::channel extract_channel(size_t index) - { - if (index >= m_Channels.size()) - { - throw std::out_of_range("Channel index out of range"); - } - auto ret = std::move(m_Channels[index]); - - m_Channels.erase(m_Channels.begin() + index); - m_ChannelNames.erase(m_ChannelNames.begin() + index); - - return std::move(ret); - } - - /// Extracts a channel by its name. - /// - /// Remove the channel from the image and gives you full control over the channel. Also erases - /// its channel name. - /// - /// \param name The name of the channel to retrieve. - /// \return The channel object. - /// \throws std::out_of_range if the channel name is invalid. - compressed::channel extract_channel(const std::string_view name) - { - size_t index = get_channel_offset(name); - return extract_channel(index); - } - - /// \brief Prints statistical information about the image file structure. - /// - /// This function outputs various details about the compressed image, - /// including dimensions, number of channels, compression ratio, and metadata. 
- /// - /// Example output: - /// - /// Statistics for image buffer: - /// Width: 1024 - /// Height: 768 - /// Channels: 3 - /// Channelnames: [R, G, B] - /// -------------- - /// Compressed Size: 123456 bytes - /// Uncompressed Size: 3145728 bytes - /// Compression ratio: 25.5x - /// Num Chunks: 512 - /// Metadata: - /// { - /// "author": "User", - /// "timestamp": "2024-03-15" - /// } - void print_statistics() - { - size_t compressed_size = 0; - size_t uncompressed_size = 0; - size_t num_chunks = 0; - for (const auto& channel : m_Channels) - { - compressed_size += channel.compressed_bytes(); - uncompressed_size += channel.uncompressed_size(); - num_chunks += channel.num_chunks(); - } - - std::cout << "Statistics for image buffer:" << std::endl; - std::cout << " Width: " << m_Width << std::endl; - std::cout << " Height: " << m_Height << std::endl; - std::cout << " Channels: " << m_Channels.size() << std::endl; - std::cout << " Channelnames: ["; - - for (size_t i = 0; i < m_ChannelNames.size(); ++i) - { - std::cout << m_ChannelNames[i]; - if (i < m_ChannelNames.size() - 1) - { - std::cout << ", "; - } - } - - std::cout << "]" << std::endl; - std::cout << " -------------- " << std::endl; - std::cout << " Compressed Size: " << compressed_size << std::endl; - std::cout << " Uncompressed Size: " << uncompressed_size << std::endl; - std::cout << " Compression ratio: " << static_cast(uncompressed_size) / compressed_size << "x" << std::endl; - std::cout << " Num Chunks: " << num_chunks << std::endl; - std::cout << " Metadata: " << "\n " << m_Metadata.dump(4) << std::endl; - } - - - /// Return the compression ratio over all channels. 
- double compression_ratio() const noexcept - { - size_t total_uncompressed = 1; - size_t total_compressed = 1; - for (const auto& channel : m_Channels) - { - total_compressed += channel.compressed_bytes(); - total_uncompressed += channel.uncompressed_size(); - } - return static_cast(total_uncompressed) / total_compressed; - } - - - // --------------------------------------------------------------------------------------------------------------------- - // Iterators - // --------------------------------------------------------------------------------------------------------------------- - - auto begin() noexcept { return m_Channels.begin(); } - auto begin() const noexcept { return m_Channels.begin(); } - auto end() noexcept { return m_Channels.end(); } - auto end() const noexcept { return m_Channels.end(); } - - - // --------------------------------------------------------------------------------------------------------------------- - // Accessors - // --------------------------------------------------------------------------------------------------------------------- - - /// Retrieves a reference to a channel by its index. - /// - /// \param index The index of the channel to retrieve. - /// \return A reference to the requested channel. - /// \throws std::out_of_range if the index is out of bounds. - compressed::channel& channel(size_t index) - { - if (index >= m_Channels.size()) - { - throw std::out_of_range("Channel index out of range"); - } - return m_Channels[index]; - } - - /// Retrieves a reference to a channel by its name. - /// - /// \param name The name of the channel to retrieve. - /// \return A reference to the requested channel. - /// \throws std::out_of_range if the channel name is invalid. - compressed::channel& channel(const std::string_view name) - { - size_t index = get_channel_offset(name); - return m_Channels[index]; - } - - /// Retrieves references to multiple channels by name and returns them as a tuple. 
- /// - /// Can be used with structured bindings to quickly get the specified channels from an image. - /// These are returned as references (but don't have to be bound as such) - /// - /// Example: - /// - /// \code{.cpp} - /// compressed::image my_image = ...; - /// auto [r, g, b] = my_image.channels("r", "g", "b"); - /// \endcode - /// - /// \tparam Args Variadic template arguments, each convertible to std::string. - /// \param channel_names The names of the channels to retrieve. - /// \return A tuple containing references to the requested channels. - template - requires (std::conjunction_v...>) - auto channels(Args... channel_names) - { - return std::tie(this->channel(std::forward(channel_names))...); - } - - /// Retrieves references to multiple channels by index and returns them as a tuple. - /// - /// Can be used with structured bindings to quickly get the specified channels from an image. - /// These are returned as references (but don't have to be bound as such) - /// - /// Example: - /// - /// \code{.cpp} - /// compressed::image my_image = ...; - /// auto [r, g, b] = my_image.channels(0, 1, 2); - /// \endcode - /// - /// \tparam Args Variadic template arguments, each convertible to size_t. - /// \param channel_indices The indices of the channels to get - /// \return A tuple containing references to the requested channels. - template - requires (std::conjunction_v...>) - auto channels(Args... channel_indices) - { - return std::tie(this->channel(std::forward(channel_indices))...); - } - - /// Retrieves references to multiple channels their indices and returns them in a vector. - /// - /// \param channel_indices A vector of channel indices. - /// \return A vector containing references to the requested channels. - /// \throws std::out_of_range if any channel indec is invalid. 
- std::vector&> channels(std::vector channel_indices) - { - std::vector> result{}; - for (const auto& index : channel_indices) - { - result.append(this->channel(index)); - } - return result; - } - - /// Retrieves references to multiple channels by name and returns them in a vector. - /// - /// \param channel_names A vector of channel names. - /// \return A vector containing references to the requested channels. - /// \throws std::out_of_range if any channel name is invalid. - std::vector&> channels(std::vector channel_names) - { - std::vector> result{}; - for (const auto& name : channel_names) - { - result.append(this->channel(name)); - } - return result; - } - - /// Retrieves references to all of the channels in the image - /// - /// \return A vector containing references to the all the channels. - std::vector>& channels() - { - return m_Channels; - } - - /// Retrieves const references to all of the channels in the image - /// - /// \return A vector containing references to the all the channels. - const std::vector>& channels() const - { - return m_Channels; - } - - /// Decompress all of the channels and return them in planar fashion. - /// - /// Each channel's decompressed data is stored as a separate vector. - /// - /// \return A vector of decompressed channel data, where each inner vector corresponds to a channel. - std::vector> get_decompressed() const - { - std::vector> result{}; - for (const auto& channel : m_Channels) - { - result.push_back(channel.get_decompressed()); - } - return result; - } - - - /// Retrieve the logical index of the given channel. - /// - /// This function searches for the specified channel name in the list of available channels. - /// If the channel is not found, it throws a `std::invalid_argument`. - /// - /// \param channelname The name of the channel to search for. - /// \return The index of the channel if found. - /// \throws std::invalid_argument if the channel is not available. 
- size_t get_channel_offset(const std::string_view channelname) const - { - for (size_t i = 0; i < m_ChannelNames.size(); ++i) - { - if (m_ChannelNames[i] == channelname) - { - return i; - } - } - throw std::invalid_argument(std::format("Unknown channelname '{}' encountered", channelname)); - } - - /// Width of the Image - size_t width() const noexcept - { - return m_Width; - } - - /// Height of the image - size_t height() const noexcept - { - return m_Height; - } - - /// Total number of channels in the image - size_t num_channels() const noexcept - { - return m_Channels.size(); - } - - /// Names of the channels stored on the image, are stored in the same order as the logical indices. So if the channelnames - /// are { "B", "G", "R" } accessing channel "R" would be index 2. - std::vector channelnames() const noexcept - { - return m_ChannelNames; - } - - /// Set the channelnames according to their logical indices, - void channelnames(std::vector _channelnames) - { - if (_channelnames.size() != m_Channels.size()) - { - throw std::invalid_argument(std::format( - "Invalid number of arguments received for setting channelnames. 
Expected vector size to be exactly {} but instead got {}", - m_Channels.size(), - _channelnames.size() - ).c_str() - ); - } - m_ChannelNames = _channelnames; - } - - /// Arbitrary user metadata, not authored or managed by image class, it's up to the caller to handle what goes in and comes out - void metadata(const json_ordered& _metadata) noexcept - { - m_Metadata = _metadata; - } - - /// Arbitrary user metadata, not authored or managed by the image class, it's up to the caller to handle what goes in and comes out - json_ordered& metadata() noexcept - { - return m_Metadata; - } - - /// Arbitrary user metadata, not authored or managed by image class, it's up to the caller to handle what goes in and comes out - const json_ordered& metadata() const noexcept - { - return m_Metadata; - } - - /// Update the number of threads used internally by c-blosc2 for compression and decompression. - /// This is automatically set when iterating through the images with compressed::for_each for example - /// by specifying the compression codec. - void update_nthreads(size_t nthreads) - { - for (auto& channel : m_Channels) - { - channel.update_nthreads(nthreads); - } - } - - /// \brief Get the chunk size used for compression, this is the same across all channels. - /// - /// \throws std::runtime_error If the channels of the image do not all share the same chunk size as this is - /// currently unsupported. - /// - /// \return The chunk size in bytes. - size_t chunk_size() const - { - size_t chunk_size = 0; - for (const auto& channel : m_Channels) - { - if (chunk_size != 0 && channel.chunk_size() != chunk_size) - { - throw std::runtime_error( - "Validation Error: Channels in image do not all have the same chunk size. This is currently" - " unsupported." 
- ); - } - chunk_size = channel.chunk_size(); - } - return chunk_size; - } - - size_t block_size() const - { - size_t block_size = 0; - for (const auto& channel : m_Channels) - { - if (block_size != 0 && channel.block_size() != block_size) - { - throw std::runtime_error( - "Validation Error: Channels in image do not all have the same block size. This is currently" - " unsupported." - ); - } - block_size = channel.block_size(); - } - return block_size; - } - - private: - /// All the channels, each holding their own decompression and compression context. - std::vector> m_Channels{}; - - /// Arbitrary user metadata, not authored or managed by us, it's up to the caller to handle what goes in and comes out - json_ordered m_Metadata{}; - - /// Optional set of channelnames to associate to the channels. If not specified sensible defaults are chosen. For example, - /// if 3 channels are provided we default to { "R", "G", "B" } - std::vector m_ChannelNames{}; - - /// The width of the image file - size_t m_Width = 1; - - /// The height of the image file - size_t m_Height = 1; - - private: - - -// Implementations for the read() functions. -// ----------------------------------------------------------------------------------- -// ----------------------------------------------------------------------------------- + /// Adds a compressed channel to the image. + /// + /// This method moves the provided channel into the image's internal storage, adding it to the list of channels. + /// + /// Example: + /// \code{.cpp} + /// compressed::channel channel = ...; + /// my_image.add_channel(std::move(channel)); + /// \endcode + /// + /// \param _channel The channel to be added to the image. + /// \param name (Optional) Channel name of the channel to be inserted. If no channel names are set this argument is ignored. 
+ void add_channel(compressed::channel _channel, std::optional name = std::nullopt) + { + if (_channel.width() != this->width()) + { + throw std::invalid_argument( + std::format( + "Cannot add channel '{}' to the image as its width does not match that of the image." + " Expected {:L} pixels but instead got {:L} pixels", + name.value_or(""), + this->width(), + _channel.width() + ) + ); + } + if (_channel.height() != this->height()) + { + throw std::invalid_argument( + std::format( + "Cannot add channel '{}' to the image as its height does not match that of the image." + " Expected {:L} pixels but instead got {:L} pixels", + name.value_or(""), + this->height(), + _channel.height() + ) + ); + } + + if (name.has_value() && m_ChannelNames.size() == m_Channels.size()) + { + m_ChannelNames.push_back(name.value()); + } + else if (m_ChannelNames.size() > 0) + { + m_ChannelNames.push_back(name.value_or("")); + } + + m_Channels.push_back(std::move(_channel)); + } + + /// Adds a channel to the image. + /// + /// This method moves the provided channel into the image's internal storage, compressing it and adding it to the list of channels. + /// + /// Example: + /// \code{.cpp} + /// std::span channel = ...; + /// my_image.add_channel(channel, 1920, 1080, "red")); + /// \endcode + /// + /// \param data The channel to be added to the image. + /// \param width The width of the channel + /// \param height The height of the channel + /// \param name (Optional) Channel name of the channel to be inserted. If no channel names are set this argument is ignored. + /// \param compression_codec (Optional) Compression codec to apply to the channel, every channel is allowed to have a different one. + /// \param compression_level (Optional) Compression level, defaults to 5. 
+ void add_channel( + std::span data, + size_t width, + size_t height, + std::optional name = std::nullopt, + enums::codec compression_codec = enums::codec::lz4, + uint8_t compression_level = 5 + ) + { + if (width != this->width()) + { + throw std::invalid_argument( + std::format( + "Cannot add channel '{}' to the image as its width does not match that of the image." + " Expected {:L} pixels but instead got {:L} pixels", + name.value_or(""), + width, + this->width() + ) + ); + } + if (height != this->height()) + { + throw std::invalid_argument( + std::format( + "Cannot add channel '{}' to the image as its height does not match that of the image." + " Expected {:L} pixels but instead got {:L} pixels", + name.value_or(""), + height, + this->height() + ) + ); + } + + if (name.has_value() && m_ChannelNames.size() == m_Channels.size()) + { + m_ChannelNames.push_back(name.value()); + } + else if (m_ChannelNames.size() > 0) + { + m_ChannelNames.push_back(name.value_or("")); + } + + m_Channels.push_back( + compressed::channel( + std::span(data.begin(), data.end()), + width, + height, + compression_codec, + compression_level + ) + ); + } + + + /// Remove a channel by its index. + /// + /// \param index The index of the channel to remove. + /// \throws std::out_of_range if the index is out of bounds. + void remove_channel(size_t index) + { + // Extract the channel and let it exit the scope to destruct + auto channel = this->extract_channel(index); + } + + /// Remove a channel by its name. + /// + /// \param name The name of the channel to remove. + /// \throws std::out_of_range if the channel name is invalid. + void remove_channel(const std::string_view name) + { + // Extract the channel and let it exit the scope to destruct + auto channel = this->extract_channel(name); + } + + /// Extracts a channel by its index. + /// + /// Remove the channel from the image and gives you full control over the channel. Also erases + /// its channel name. 
+ /// + /// \param index The index of the channel to retrieve. + /// \return The channel object. + /// \throws std::out_of_range if the index is out of bounds. + compressed::channel extract_channel(size_t index) + { + if (index >= m_Channels.size()) + { + throw std::out_of_range("Channel index out of range"); + } + auto ret = std::move(m_Channels[index]); + + m_Channels.erase(m_Channels.begin() + index); + m_ChannelNames.erase(m_ChannelNames.begin() + index); + + return std::move(ret); + } + + /// Extracts a channel by its name. + /// + /// Remove the channel from the image and gives you full control over the channel. Also erases + /// its channel name. + /// + /// \param name The name of the channel to retrieve. + /// \return The channel object. + /// \throws std::out_of_range if the channel name is invalid. + compressed::channel extract_channel(const std::string_view name) + { + size_t index = get_channel_offset(name); + return extract_channel(index); + } + + /// \brief Prints statistical information about the image file structure. + /// + /// This function outputs various details about the compressed image, + /// including dimensions, number of channels, compression ratio, and metadata. 
+ /// + /// Example output: + /// + /// Statistics for image buffer: + /// Width: 1024 + /// Height: 768 + /// Channels: 3 + /// Channelnames: [R, G, B] + /// -------------- + /// Compressed Size: 123456 bytes + /// Uncompressed Size: 3145728 bytes + /// Compression ratio: 25.5x + /// Num Chunks: 512 + /// Metadata: + /// { + /// "author": "User", + /// "timestamp": "2024-03-15" + /// } + void print_statistics() + { + size_t compressed_size = 0; + size_t uncompressed_size = 0; + size_t num_chunks = 0; + for (const auto& channel : m_Channels) + { + compressed_size += channel.compressed_bytes(); + uncompressed_size += channel.uncompressed_size(); + num_chunks += channel.num_chunks(); + } + + std::cout << "Statistics for image buffer:" << std::endl; + std::cout << " Width: " << m_Width << std::endl; + std::cout << " Height: " << m_Height << std::endl; + std::cout << " Channels: " << m_Channels.size() << std::endl; + std::cout << " Channelnames: ["; + + for (size_t i = 0; i < m_ChannelNames.size(); ++i) + { + std::cout << m_ChannelNames[i]; + if (i < m_ChannelNames.size() - 1) + { + std::cout << ", "; + } + } + + std::cout << "]" << std::endl; + std::cout << " -------------- " << std::endl; + std::cout << " Compressed Size: " << compressed_size << std::endl; + std::cout << " Uncompressed Size: " << uncompressed_size << std::endl; + std::cout << " Compression ratio: " << static_cast(uncompressed_size) / compressed_size << "x" << + std::endl; + std::cout << " Num Chunks: " << num_chunks << std::endl; + std::cout << " Metadata: " << "\n " << m_Metadata.dump(4) << std::endl; + } + + + /// Return the compression ratio over all channels. 
+ double compression_ratio() const noexcept + { + size_t total_uncompressed = 1; + size_t total_compressed = 1; + for (const auto& channel : m_Channels) + { + total_compressed += channel.compressed_bytes(); + total_uncompressed += channel.uncompressed_size(); + } + return static_cast(total_uncompressed) / total_compressed; + } + + + // --------------------------------------------------------------------------------------------------------------------- + // Iterators + // --------------------------------------------------------------------------------------------------------------------- + + auto begin() noexcept { return m_Channels.begin(); } + auto begin() const noexcept { return m_Channels.begin(); } + auto end() noexcept { return m_Channels.end(); } + auto end() const noexcept { return m_Channels.end(); } + + + // --------------------------------------------------------------------------------------------------------------------- + // Accessors + // --------------------------------------------------------------------------------------------------------------------- + + /// Retrieves a reference to a channel by its index. + /// + /// \param index The index of the channel to retrieve. + /// \return A reference to the requested channel. + /// \throws std::out_of_range if the index is out of bounds. + compressed::channel& channel(size_t index) + { + if (index >= m_Channels.size()) + { + throw std::out_of_range("Channel index out of range"); + } + return m_Channels[index]; + } + + /// Retrieves a reference to a channel by its name. + /// + /// \param name The name of the channel to retrieve. + /// \return A reference to the requested channel. + /// \throws std::out_of_range if the channel name is invalid. + compressed::channel& channel(const std::string_view name) + { + size_t index = get_channel_offset(name); + return m_Channels[index]; + } + + /// Retrieves references to multiple channels by name and returns them as a tuple. 
+ /// + /// Can be used with structured bindings to quickly get the specified channels from an image. + /// These are returned as references (but don't have to be bound as such) + /// + /// Example: + /// + /// \code{.cpp} + /// compressed::image my_image = ...; + /// auto [r, g, b] = my_image.channels("r", "g", "b"); + /// \endcode + /// + /// \tparam Args Variadic template arguments, each convertible to std::string. + /// \param channel_names The names of the channels to retrieve. + /// \return A tuple containing references to the requested channels. + template + requires (std::conjunction_v...>) + auto channels(Args... channel_names) + { + return std::tie(this->channel(std::forward(channel_names))...); + } + + /// Retrieves references to multiple channels by index and returns them as a tuple. + /// + /// Can be used with structured bindings to quickly get the specified channels from an image. + /// These are returned as references (but don't have to be bound as such) + /// + /// Example: + /// + /// \code{.cpp} + /// compressed::image my_image = ...; + /// auto [r, g, b] = my_image.channels(0, 1, 2); + /// \endcode + /// + /// \tparam Args Variadic template arguments, each convertible to size_t. + /// \param channel_indices The indices of the channels to get + /// \return A tuple containing references to the requested channels. + template + requires (std::conjunction_v...>) + auto channels(Args... channel_indices) + { + return std::tie(this->channel(std::forward(channel_indices))...); + } + + /// Retrieves references to multiple channels their indices and returns them in a vector. + /// + /// \param channel_indices A vector of channel indices. + /// \return A vector containing references to the requested channels. + /// \throws std::out_of_range if any channel indec is invalid. 
+ std::vector&> channels(std::vector channel_indices) + { + std::vector> result{}; + for (const auto& index : channel_indices) + { + result.append(this->channel(index)); + } + return result; + } + + /// Retrieves references to multiple channels by name and returns them in a vector. + /// + /// \param channel_names A vector of channel names. + /// \return A vector containing references to the requested channels. + /// \throws std::out_of_range if any channel name is invalid. + std::vector&> channels(std::vector channel_names) + { + std::vector> result{}; + for (const auto& name : channel_names) + { + result.append(this->channel(name)); + } + return result; + } + + /// Retrieves references to all of the channels in the image + /// + /// \return A vector containing references to the all the channels. + std::vector>& channels() + { + return m_Channels; + } + + /// Retrieves const references to all of the channels in the image + /// + /// \return A vector containing references to the all the channels. + const std::vector>& channels() const + { + return m_Channels; + } + + /// Decompress all of the channels and return them in planar fashion. + /// + /// Each channel's decompressed data is stored as a separate vector. + /// + /// \return A vector of decompressed channel data, where each inner vector corresponds to a channel. + std::vector> get_decompressed() const + { + std::vector> result{}; + for (const auto& channel : m_Channels) + { + result.push_back(channel.get_decompressed()); + } + return result; + } + + + /// Retrieve the logical index of the given channel. + /// + /// This function searches for the specified channel name in the list of available channels. + /// If the channel is not found, it throws a `std::invalid_argument`. + /// + /// \param channelname The name of the channel to search for. + /// \return The index of the channel if found. + /// \throws std::invalid_argument if the channel is not available. 
+ size_t get_channel_offset(const std::string_view channelname) const + { + for (size_t i = 0; i < m_ChannelNames.size(); ++i) + { + if (m_ChannelNames[i] == channelname) + { + return i; + } + } + throw std::invalid_argument(std::format("Unknown channelname '{}' encountered", channelname)); + } + + /// Width of the Image + size_t width() const noexcept + { + return m_Width; + } + + /// Height of the image + size_t height() const noexcept + { + return m_Height; + } + + /// Total number of channels in the image + size_t num_channels() const noexcept + { + return m_Channels.size(); + } + + /// Names of the channels stored on the image, are stored in the same order as the logical indices. So if the channelnames + /// are { "B", "G", "R" } accessing channel "R" would be index 2. + std::vector channelnames() const noexcept + { + return m_ChannelNames; + } + + /// Set the channelnames according to their logical indices, + void channelnames(std::vector _channelnames) + { + if (_channelnames.size() != m_Channels.size()) + { + throw std::invalid_argument( + std::format( + "Invalid number of arguments received for setting channelnames. 
Expected vector size to be exactly {} but instead got {}", + m_Channels.size(), + _channelnames.size() + ).c_str() + ); + } + m_ChannelNames = _channelnames; + } + + /// Arbitrary user metadata, not authored or managed by image class, it's up to the caller to handle what goes in and comes out + void metadata(const json_ordered& _metadata) noexcept + { + m_Metadata = _metadata; + } + + /// Arbitrary user metadata, not authored or managed by the image class, it's up to the caller to handle what goes in and comes out + json_ordered& metadata() noexcept + { + return m_Metadata; + } + + /// Arbitrary user metadata, not authored or managed by image class, it's up to the caller to handle what goes in and comes out + const json_ordered& metadata() const noexcept + { + return m_Metadata; + } + + /// Update the number of threads used internally by c-blosc2 for compression and decompression. + /// This is automatically set when iterating through the images with compressed::for_each for example + /// by specifying the compression codec. + void update_nthreads(size_t nthreads) + { + for (auto& channel : m_Channels) + { + channel.update_nthreads(nthreads); + } + } + + /// \brief Get the chunk size used for compression, this is the same across all channels. + /// + /// \throws std::runtime_error If the channels of the image do not all share the same chunk size as this is + /// currently unsupported. + /// + /// \return The chunk size in bytes. + size_t chunk_size() const + { + size_t chunk_size = 0; + for (const auto& channel : m_Channels) + { + if (chunk_size != 0 && channel.chunk_size() != chunk_size) + { + throw std::runtime_error( + "Validation Error: Channels in image do not all have the same chunk size. This is currently" + " unsupported." 
+ ); + } + chunk_size = channel.chunk_size(); + } + return chunk_size; + } + + size_t block_size() const + { + size_t block_size = 0; + for (const auto& channel : m_Channels) + { + if (block_size != 0 && channel.block_size() != block_size) + { + throw std::runtime_error( + "Validation Error: Channels in image do not all have the same block size. This is currently" + " unsupported." + ); + } + block_size = channel.block_size(); + } + return block_size; + } + + private: + struct ring_buffer_slot + { + util::default_init_vector interleaved_buffer; + std::vector> deinterleaved_buffer; + std::vector memory_pinners; + std::future processing_future; + + ring_buffer_slot() = default; + ring_buffer_slot(ring_buffer_slot&&) noexcept = default; + ring_buffer_slot& operator=(ring_buffer_slot&&) noexcept = default; + ring_buffer_slot(const ring_buffer_slot&) = delete; + ring_buffer_slot& operator=(const ring_buffer_slot&) = delete; + }; + + using ring_buffer_t = std::vector; + + private: + /// All the channels, each holding their own decompression and compression context. + std::vector> m_Channels{}; + + /// Arbitrary user metadata, not authored or managed by us, it's up to the caller to handle what goes in and comes out + json_ordered m_Metadata{}; + + /// Optional set of channelnames to associate to the channels. If not specified sensible defaults are chosen. For example, + /// if 3 channels are provided we default to { "R", "G", "B" } + std::vector m_ChannelNames{}; + + /// The width of the image file + size_t m_Width = 1; + + /// The height of the image file + size_t m_Height = 1; + + private: + // Implementations for the read() functions. + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- #ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE - /// \brief Read implementation for all the call to image::read(). 
- /// - /// This function takes care of reading data from the input pointer and propagating it to read_contiguous_channels_impl. - /// - /// \param input_ptr The pointer to read the data from - /// \param channelnames The channels to read from the file, non-existant channels throw std::out_of_range - /// \param postprocess An optional postprocessing step to apply to the chunks before they get compressed. - /// \param compression_codec The compression codec to apply - /// \param compression_level The compression level to compress with - /// \param block_size The block size to apply to the compressed data - /// \param chunk_size The chunk size to apply to the compressed data - /// - /// \returns The decoded image. - template - requires std::invocable, size_t, std::span> || std::is_same_v, std::nullopt_t> - static image read_impl( - std::unique_ptr input_ptr, - std::vector channelnames, - PostProcess&& postprocess, - int subimage, - enums::codec compression_codec = enums::codec::lz4, - size_t compression_level = 9, - size_t block_size = s_default_blocksize, - size_t chunk_size = s_default_chunksize - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - assert(chunk_size % sizeof(T) == 0); - auto comp_level_adjusted = util::ensure_compression_level(compression_level); - - // Seek to the right subimage before getting the spec. - auto res = input_ptr->seek_subimage(subimage, 0); - if (!res) - { - throw std::invalid_argument( - std::format( - "File does not have a subimage {}, cannot seek to it", subimage - ) - ); - } - const OIIO::ImageSpec& spec = input_ptr->spec(); - - // Align the chunk size to the scanlines and tiles (if applicable), this makes our life considerably - // easier and allows us to not deal with partial scanlines. 
- size_t chunk_size_aligned = 0; - if (spec.tile_height != 0) - { - chunk_size_aligned = util::align_chunk_to_tile_bytes(spec.width, spec.tile_height, chunk_size); - } - else - { - chunk_size_aligned = util::align_chunk_to_scanlines_bytes(spec.width, chunk_size); - } - - // Get a std::vector containing a begin-end pair for all contiguous channels in our channelnames. - // So if we pass 'R', 'B' and 'A' in a rgba image we would get the following - // { {0 - 1}, {2 - 4} } - // This allows us to both maximize performance by handling as many channels in one go as we can while also - // minimizing memory footprint by only ever allocating as much as we need for the max amount of contiguous - // channels we can encounter. - std::vector> channels; - auto channel_ranges_contiguous = detail::get_contiguous_channels(input_ptr, channelnames); - size_t max_num_channels = 0; - for (const auto& [chbegin, chend] : channel_ranges_contiguous) - { - if (static_cast(chend) - chbegin > max_num_channels) - { - max_num_channels = static_cast(chend) - chbegin; - } - } - - - // Set up scratch buffers - // ----------------------------------------------------------------------------------- - // ----------------------------------------------------------------------------------- - - // Maximum chunk size we will need to account for (times number of channels). - const size_t max_chunk_size = chunk_size_aligned * max_num_channels; - - // Initialize our swap buffers, these are going to be either discarded after - // or compressed from. - util::default_init_vector interleaved_buffer(max_chunk_size / sizeof(T)); - std::vector> deinterleaved_buffer(max_num_channels); - std::for_each(std::execution::par_unseq, deinterleaved_buffer.begin(), deinterleaved_buffer.end(), [&](auto& buffer) - { - buffer.resize(chunk_size_aligned / sizeof(T)); - }); - - // Buffer to hold a single chunk. 
We will reuse this quite frequently - auto chunk_buffer = util::default_init_vector(blosc2::min_compressed_size(chunk_size_aligned)); - - // Read and compress the channel pairs in chunks - // ----------------------------------------------------------------------------------- - // ----------------------------------------------------------------------------------- - - // This will be the channelnames we will construct the image with. This is to avoid cases where the user - // passes the channel names in a different order than they appear in such as 'A', 'G', 'R'. This should - // still create the channel names as expected in correct order. - std::vector new_channelnames{}; - - // Iterate all the pair and extract them, refitting the buffers as needed. - // This is where the actual work of reading start. - for (auto [chbegin, chend] : channel_ranges_contiguous) - { - // Calculate some preliminary data for computing how many scanlines to extract in one go. - int nchannels = chend - chbegin; - const size_t bytes_per_scanline = static_cast(spec.width) * nchannels * sizeof(T); - - const size_t chunk_size_all = chunk_size_aligned * nchannels; - const size_t scanlines_per_chunk = chunk_size_all / bytes_per_scanline; - - // Refit the swap buffers as `read_contiguous_channels_impl` expects these to be exactly sized. - auto interleaved_fitted = std::span(interleaved_buffer.begin(), chunk_size_all / sizeof(T)); - std::vector> deinterleaved_fitted{}; - for (auto idx : std::views::iota(0, nchannels)) - { - // construct a span from the util::default_init_vector - deinterleaved_fitted.push_back( - std::span(deinterleaved_buffer.at(idx).begin(), deinterleaved_buffer.at(idx).end()) - ); - } - - // Create and initialize the contexts and schunks. These are pretty light weight so we don't need - // to worry about creating them outside of the loop/reusing them. 
- std::vector contexts; - std::vector> schunks; - for ([[maybe_unused]] auto _ : std::views::iota(0, nchannels)) - { - schunks.push_back(blosc2::schunk(block_size, chunk_size_aligned)); - contexts.push_back(blosc2::create_compression_context( - std::thread::hardware_concurrency(), - compression_codec, - comp_level_adjusted, - block_size - )); - } - - // Read the contiguous channel sequence into the contexts and schunks. - if constexpr (std::invocable, size_t, std::span>) - { - if (spec.tile_height != 0) - { - image::read_contiguous_channels_impl( - input_ptr, - subimage, - chbegin, - chend, - interleaved_fitted, - deinterleaved_fitted, - scanlines_per_chunk, - contexts, - schunks, - chunk_buffer, - std::forward(postprocess) - ); - } - else - { - image::read_contiguous_channels_impl( - input_ptr, - subimage, - chbegin, - chend, - interleaved_fitted, - deinterleaved_fitted, - scanlines_per_chunk, - contexts, - schunks, - chunk_buffer, - std::forward(postprocess) - ); - } - } - else - { - if (spec.tile_height != 0) - { - image::read_contiguous_channels_impl( - input_ptr, - subimage, - chbegin, - chend, - interleaved_fitted, - deinterleaved_fitted, - scanlines_per_chunk, - contexts, - schunks, - chunk_buffer, - std::nullopt - ); - } - else - { - image::read_contiguous_channels_impl( - input_ptr, - subimage, - chbegin, - chend, - interleaved_fitted, - deinterleaved_fitted, - scanlines_per_chunk, - contexts, - schunks, - chunk_buffer, - std::nullopt - ); - } - } - - - // Finally create the channels from the schunks - for (const auto channel_idx : std::views::iota(0, nchannels)) - { - _COMPRESSED_PROFILE_SCOPE("generate channels"); - channels.push_back( - compressed::channel( - std::move(schunks[channel_idx]), - spec.width, - spec.height, - compression_codec, - comp_level_adjusted - ) - ); - } - - // Store the correctly mapped channelnames - for (auto channel_idx : std::views::iota(chbegin, chend)) - { - new_channelnames.push_back(spec.channelnames.at(channel_idx)); - } - 
} - - // Construct the image instance. - auto img = compressed::image(std::move(channels), spec.width, spec.height, new_channelnames); - img.metadata(compressed::image::read_oiio_metadata(spec)); - return std::move(img); - } - - - /// \brief Read a contiguous channel sequence from the passed input pointer - /// - /// When reading with OpenImageIO it is a lot more efficient to parse as many channels as possible in one go - /// rather than reading one channel at a time as the ImageInput keeps the data as compressed (in many cases). - /// If we were to read one channel at a time this would significantly slow down our read speeds. - /// - /// Due to us only being able to read contiguous channels at a time this helper function allows us to do that. - /// - /// \param input_ptr The opened OpenImageIO ImageInput. - /// \param chbegin The start channel to read - /// \param chend The end channel to read - /// \param interleaved_buffer The buffer into which we will read the channels (before then interleaving). - /// must be sized to exactly fit nchannels * width * height - /// \param deinterleaved_buffer The buffers to deinterleave into, must be exactly of size nchannels with each - /// sub-buffer being exactly width * height. - /// \param scanlines_per_chunk The number of scanlines that fit into one chunk (exactly). - /// \param contexts The contexts for compression, must be exactly nchannels amount - /// \param schunks The schunks for compression, must be exactly nchannels amount - /// \param chunk_buffer A scratch buffer for compression (from which we copy). - /// - /// \throws std::invalid_argument if any of the above conditions is not met. 
- template - requires std::invocable, size_t, std::span> || std::is_same_v, std::nullopt_t> - static void read_contiguous_channels_impl( - std::unique_ptr& input_ptr, - int subimage, - int chbegin, - int chend, - std::span interleaved_buffer, - std::vector>& deinterleaved_buffer, - size_t scanlines_per_chunk, - std::vector& contexts, - std::vector>& schunks, - util::default_init_vector& chunk_buffer, - PostProcess&& postprocess - ) - { - _COMPRESSED_PROFILE_FUNCTION(); - const int nchannels = chend - chbegin; - assert(input_ptr->current_subimage() == subimage); - const OIIO::ImageSpec& spec = input_ptr->spec(); - const auto typedesc = enums::get_type_desc(); - - // Ensure this function is called with at least 1 channel to read. - if (nchannels < 1) - { - throw std::runtime_error( - std::format( - "read_contiguous_channels_impl: passed number of channels is less than one. This should not happen. Got {}", - nchannels - ) - ); - } - - // Ensure the interleaved buffer is correctly sized. - if (interleaved_buffer.size() != static_cast(nchannels) * spec.width * scanlines_per_chunk) - { - throw std::invalid_argument( - std::format( - "read_contiguous_channels_impl: Received incorrectly sized interleaved buffer, should be exactly" - " {:L} elements large but instead got {:L}.", - static_cast(nchannels) * spec.width * scanlines_per_chunk, - interleaved_buffer.size() - ) - ); - } - // Ensure the deinterleaved buffer, and its subbuffers, are correctly sized. 
- if (deinterleaved_buffer.size() != static_cast(nchannels)) - { - throw std::invalid_argument( - std::format( - "read_contiguous_channels_impl: Received incorrectly sized deinterleaved buffer, should be exactly" - " {:L} elements large but instead got {:L}.", - nchannels, - deinterleaved_buffer.size() - ) - ); - } - for (const auto& buffer : deinterleaved_buffer) - { - if (buffer.size() != spec.width * scanlines_per_chunk) - { - throw std::invalid_argument( - std::format( - "read_contiguous_channels_impl: Received incorrectly sized deinterleaved buffer," - " should be exactly {:L} elements large but instead got {:L}.", - static_cast(nchannels) * spec.width * scanlines_per_chunk, - interleaved_buffer.size() - ) - ); - } - } - // Ensure the contexts and schunks are correctly sized - if (contexts.size() != static_cast(nchannels) || schunks.size() != static_cast(nchannels)) - { - throw std::runtime_error( - std::format( - "read_contiguous_channels_impl: Internal error: Expected the number of passed schunks and contexts" - " to exactly match the number of requested channels. Instead got {} and {} while {} was the expected" - " number.", - schunks.size(), - contexts.size(), - nchannels - ) - ); - } - - // Iterate all scanlines and read as many scanlines as possible in one go, compressing them on the fly - // into all of the super-chunks. 
This works for data windows as well where the y and x may not start at zero - int y = spec.y; - while (y < (spec.height + spec.y)) - { - _COMPRESSED_PROFILE_SCOPE("Read Scanlines/Tiles and compress"); - int scanlines_to_read = static_cast(std::min( - scanlines_per_chunk, static_cast(spec.height + spec.y - y) - )); - - - bool read_successful = false; - // Since the passed `scanlines_per_chunk` is already appropriately aligned to either tiles or scanlines, - // we can safely call either `read_tiles` or `read_scanlines` here making sure we are correctly aligned - if constexpr (read_tiles) - { - read_successful = input_ptr->read_tiles( - subimage, - 0, // miplevel - spec.x, // xbegin - spec.width, // xend - y, // ybegin - y + scanlines_to_read, // yend - 0, // zbegin - 1, // zend - chbegin, - chend, - typedesc, - static_cast(interleaved_buffer.data()) - ); - } - else - { - read_successful = input_ptr->read_scanlines( - subimage, - 0, // miplevel - y, // ybegin - y + scanlines_to_read, // yend - 0, // z - chbegin, - chend, - typedesc, - static_cast(interleaved_buffer.data()) - ); - } - - if (!read_successful) - { - throw std::runtime_error( - std::format( - "OIIO read failure when reading scanlines {}-{} for channels {}-{}: '{}'", - y, y + scanlines_to_read, chbegin, chend, input_ptr->geterror() - ) - ); - } - - // Deinterleave the buffers, in some cases we may be deinterleaving empty space here but that - // is ok as we refit the buffers. Since in most cases the size will only be off by at most one - // scanline. In the case of the last chunk, we may be at worst deinterleaving only one scanline - // with the rest being empty space but that is also ok. - image_algo::deinterleave(std::span(interleaved_buffer), deinterleaved_buffer); - - // Now start compressing the chunks and appending them into the super-chunks. 
- for (auto channel_idx : std::views::iota(0, nchannels)) - { - // How many elements we actually read per buffer - size_t read_elements = static_cast(scanlines_to_read) * spec.width; - auto deinterleaved_fitted = std::span(deinterleaved_buffer[channel_idx].data(), read_elements); - - // Perform the user-passed postprocessing, this may be anything and it's up to the user to decide - // what goes here. - if constexpr (std::invocable, size_t, std::span>) - { - auto absolute_channel_idx = chbegin + channel_idx; - postprocess(absolute_channel_idx, deinterleaved_fitted); - } - - schunks[channel_idx].append_chunk( - contexts[channel_idx], - deinterleaved_fitted, - std::span(chunk_buffer) - ); - } - y += scanlines_to_read; - } - } + /// \brief Read implementation for all the call to image::read(). + /// + /// This function takes care of reading data from the input pointer and propagating it to read_contiguous_channels_impl. + /// + /// \param input_ptr The pointer to read the data from + /// \param channelnames The channels to read from the file, non-existant channels throw std::out_of_range + /// \param postprocess An optional postprocessing step to apply to the chunks before they get compressed. + /// \param compression_codec The compression codec to apply + /// \param compression_level The compression level to compress with + /// \param block_size The block size to apply to the compressed data + /// \param chunk_size The chunk size to apply to the compressed data + /// + /// \returns The decoded image. 
+ template + requires std::invocable, size_t, std::span> || std::is_same_v< + std::remove_cvref_t, std::nullopt_t> + static image read_impl( + std::unique_ptr input_ptr, + std::vector channelnames, + PostProcess&& postprocess, + int subimage, + enums::codec compression_codec = enums::codec::lz4, + size_t compression_level = 9, + size_t block_size = s_default_blocksize, + size_t chunk_size = s_default_chunksize + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + assert(chunk_size % sizeof(T) == 0); + auto comp_level_adjusted = util::ensure_compression_level(compression_level); + + // Seek to the right subimage before getting the spec. + auto res = input_ptr->seek_subimage(subimage, 0); + if (!res) + { + throw std::invalid_argument( + std::format( + "File does not have a subimage {}, cannot seek to it", + subimage + ) + ); + } + const OIIO::ImageSpec& spec = input_ptr->spec(); + + // Align the chunk size to the scanlines and tiles (if applicable), this makes our life considerably + // easier and allows us to not deal with partial scanlines. + size_t chunk_size_aligned = 0; + if (spec.tile_height != 0) + { + chunk_size_aligned = util::align_chunk_to_tile_bytes(spec.width, spec.tile_height, chunk_size); + } + else + { + chunk_size_aligned = util::align_chunk_to_scanlines_bytes(spec.width, chunk_size); + } + + // Get a std::vector containing a begin-end pair for all contiguous channels in our channelnames. + // So if we pass 'R', 'B' and 'A' in a rgba image we would get the following + // { {0 - 1}, {2 - 4} } + // This allows us to both maximize performance by handling as many channels in one go as we can while also + // minimizing memory footprint by only ever allocating as much as we need for the max amount of contiguous + // channels we can encounter. 
+ std::vector> channels; + auto channel_ranges_contiguous = detail::get_contiguous_channels(input_ptr, channelnames); + size_t max_num_channels = 0; + for (const auto& [chbegin, chend] : channel_ranges_contiguous) + { + if (static_cast(chend) - chbegin > max_num_channels) + { + max_num_channels = static_cast(chend) - chbegin; + } + } + + // Set up the Ring Buffer (Double Buffering) + // ----------------------------------------------------------------------------------- + constexpr size_t ring_buffer_size = 2; + const size_t max_chunk_size = chunk_size_aligned * max_num_channels; + ring_buffer_t ring_buffer(ring_buffer_size); + + for (auto& slot : ring_buffer) + { + slot.interleaved_buffer.resize(max_chunk_size / sizeof(T)); + slot.deinterleaved_buffer.resize(max_num_channels); + for (auto& buffer : slot.deinterleaved_buffer) + { + buffer.resize(chunk_size_aligned / sizeof(T)); + } + + if (enums::is_gpu_codec(compression_codec)) + { + slot.memory_pinners.reserve(1 + slot.deinterleaved_buffer.size()); + slot.memory_pinners.emplace_back( + slot.interleaved_buffer.data(), + slot.interleaved_buffer.size() * sizeof(T) + ); + for (auto& buffer : slot.deinterleaved_buffer) + { + slot.memory_pinners.emplace_back(buffer.data(), buffer.size() * sizeof(T)); + } + } + } + + // Read and compress the channel pairs in chunks + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + + // This will be the channelnames we will construct the image with. This is to avoid cases where the user + // passes the channel names in a different order than they appear in such as 'A', 'G', 'R'. This should + // still create the channel names as expected in correct order. + std::vector new_channelnames{}; + + // Initialize a scratch buffer for compression/decompression. 
+ auto scratch_buffer = detail::scratch_pool_registry::get_or_create_for_channel(); + + // Iterate all the pair and extract them, refitting the buffers as needed. + // This is where the actual work of reading start. + for (auto [chbegin, chend] : channel_ranges_contiguous) + { + // Calculate some preliminary data for computing how many scanlines to extract in one go. + int nchannels = chend - chbegin; + const size_t bytes_per_scanline = static_cast(spec.width) * nchannels * sizeof(T); + const size_t chunk_size_all = chunk_size_aligned * nchannels; + const size_t scanlines_per_chunk = chunk_size_all / bytes_per_scanline; + std::vector> schunks; + for ([[maybe_unused]] auto _ : std::views::iota(0, nchannels)) + { + schunks.push_back(detail::schunk(block_size, chunk_size_aligned)); + } + + // Pass the managed ring buffer into our streaming implementation + if constexpr (std::invocable, size_t, std::span>) + { + if (spec.tile_height != 0) + { + image::template read_contiguous_channels_impl( + input_ptr, + subimage, + chbegin, + chend, + compression_codec, + comp_level_adjusted, + block_size, + ring_buffer, + scanlines_per_chunk, + schunks, + std::forward(postprocess) + ); + } + else + { + image::template read_contiguous_channels_impl( + input_ptr, + subimage, + chbegin, + chend, + compression_codec, + comp_level_adjusted, + block_size, + ring_buffer, + scanlines_per_chunk, + schunks, + std::forward(postprocess) + ); + } + } + else + { + if (spec.tile_height != 0) + { + image::template read_contiguous_channels_impl( + input_ptr, + subimage, + chbegin, + chend, + compression_codec, + comp_level_adjusted, + block_size, + ring_buffer, + scanlines_per_chunk, + schunks, + std::nullopt + ); + } + else + { + image::template read_contiguous_channels_impl( + input_ptr, + subimage, + chbegin, + chend, + compression_codec, + comp_level_adjusted, + block_size, + ring_buffer, + scanlines_per_chunk, + schunks, + std::nullopt + ); + } + } + + for (const auto channel_idx : 
std::views::iota(0, nchannels)) + { + _COMPRESSED_PROFILE_SCOPE("generate channels"); + channels.push_back( + compressed::channel( + std::move(schunks[channel_idx]), + spec.width, + spec.height, + compression_codec, + comp_level_adjusted + ) + ); + } + + for (auto channel_idx : std::views::iota(chbegin, chend)) + { + new_channelnames.push_back(spec.channelnames.at(channel_idx)); + } + } + + auto img = compressed::image(std::move(channels), spec.width, spec.height, new_channelnames); + img.metadata(compressed::image::read_oiio_metadata(spec)); + return std::move(img); + } + + + /// \brief Read a contiguous channel sequence from the passed input pointer + /// + /// When reading with OpenImageIO it is a lot more efficient to parse as many channels as possible in one go + /// rather than reading one channel at a time as the ImageInput keeps the data as compressed (in many cases). + /// If we were to read one channel at a time this would significantly slow down our read speeds. + /// + /// Due to us only being able to read contiguous channels at a time this helper function allows us to do that. + /// + /// \param input_ptr The opened OpenImageIO ImageInput. + /// \param chbegin The start channel to read + /// \param chend The end channel to read + /// \param interleaved_buffer The buffer into which we will read the channels (before then interleaving). + /// must be sized to exactly fit nchannels * width * height + /// \param deinterleaved_buffer The buffers to deinterleave into, must be exactly of size nchannels with each + /// sub-buffer being exactly width * height. + /// \param scanlines_per_chunk The number of scanlines that fit into one chunk (exactly). + /// \param contexts The contexts for compression, must be exactly nchannels amount + /// \param schunks The schunks for compression, must be exactly nchannels amount + /// \param chunk_buffer A scratch buffer for compression (from which we copy). 
+ /// + /// \throws std::invalid_argument if any of the above conditions is not met. + template + requires std::invocable, size_t, std::span> || std::is_same_v< + std::remove_cvref_t, std::nullopt_t> + static void read_contiguous_channels_impl( + std::unique_ptr& input_ptr, + const int subimage, + const int chbegin, + const int chend, + const enums::codec compression_codec, + const size_t compression_level, + const size_t block_size, + ring_buffer_t& ring_buffer, + size_t scanlines_per_chunk, + std::vector>& schunks, + PostProcess&& postprocess + ) + { + _COMPRESSED_PROFILE_FUNCTION(); + const int nchannels = chend - chbegin; + assert(input_ptr->current_subimage() == subimage); + const OIIO::ImageSpec& spec = input_ptr->spec(); + const auto typedesc = enums::get_type_desc(); + + // Ensure this function is called with at least 1 channel to read. + if (nchannels < 1) + { + throw std::runtime_error( + std::format( + "read_contiguous_channels_impl: passed number of channels is less than one. This should not happen. Got {}", + nchannels + ) + ); + } + + + // Iterate all scanlines and read as many scanlines as possible in one go, compressing them on the fly + // into all of the super-chunks. This works for data windows as well where the y and x may not start at zero + std::future previous_compute_future; + size_t ring_index = 0; + int y = spec.y; + + while (y < (spec.height + spec.y)) + { + _COMPRESSED_PROFILE_SCOPE("Read Scanlines/Tiles and compress"); + int scanlines_to_read = static_cast(std::min( + scanlines_per_chunk, + static_cast(spec.height + spec.y - y) + )); + + // Select active slot in our ring buffer + auto& slot = ring_buffer[ring_index]; + + // 1. 
Wait if this slot's own previous turn hasn't finished (Safe Guard for small ring buffers) + if (slot.processing_future.valid()) + { + slot.processing_future.get(); + } + + // Slice out exact span dimensions required for the OIIO validation checks and bounds + const size_t chunk_size_all = scanlines_per_chunk * spec.width * nchannels; + auto interleaved_fitted = std::span(slot.interleaved_buffer.data(), chunk_size_all); + + // 2. STAGE 1 (I/O): Synchronously read next file chunk on main thread + bool read_successful = false; + if constexpr (read_tiles) + { + _COMPRESSED_PROFILE_SCOPE("read tiles"); + read_successful = input_ptr->read_tiles( + subimage, + 0, + spec.x, + spec.width, + y, + y + scanlines_to_read, + 0, + 1, + chbegin, + chend, + typedesc, + static_cast(interleaved_fitted.data()) + ); + } + else + { + _COMPRESSED_PROFILE_SCOPE("read scanlines"); + read_successful = input_ptr->read_scanlines( + subimage, + 0, + y, + y + scanlines_to_read, + 0, + chbegin, + chend, + typedesc, + static_cast(interleaved_fitted.data()) + ); + } + + if (!read_successful) + { + throw std::runtime_error( + std::format( + "OIIO read failure when reading scanlines {}-{} for channels {}-{}: '{}'", + y, + y + scanlines_to_read, + chbegin, + chend, + input_ptr->geterror() + ) + ); + } + + // 3. ORDER ENFORCEMENT: Wait for chunk k-1's compression to completely finish + // before spawning chunk k's compute task. This guarantees blocks append to schunks in sequential order. + if (previous_compute_future.valid()) + { + previous_compute_future.get(); + } + + size_t read_elements = static_cast(scanlines_to_read) * spec.width; + + // 4. STAGE 2 (COMPUTE): Delegate processing & compression of the freshly read chunk to a background task. + // Main thread loops back immediately to read chunk k+1 into the alternate buffer slot. 
+ slot.processing_future = std::async( + std::launch::async, + [ + &slot, interleaved_fitted, nchannels, read_elements, compression_codec, compression_level, + block_size, chbegin, + y, scanlines_to_read, spec_width = spec.width, spec_height = spec.height, spec_y = spec.y, + &schunks, &postprocess + ]() + { + // Slice deinterleaved spans for this active task + std::vector> deinterleaved_fitted_views; + deinterleaved_fitted_views.reserve(nchannels); + for (int idx = 0; idx < nchannels; ++idx) + { + deinterleaved_fitted_views.emplace_back( + slot.deinterleaved_buffer[idx].data(), + slot.deinterleaved_buffer[idx].size() + ); + } + + // Compute steps + image_algo::deinterleave(std::span(interleaved_fitted), deinterleaved_fitted_views); + + for (auto channel_idx : std::views::iota(0, nchannels)) + { + auto context = NAMESPACE_COMPRESSED_IMAGE::channel::create_compression_context( + compression_codec, + std::thread::hardware_concurrency(), + compression_level, + block_size, + 0 + ); + + auto channel_span = std::span( + slot.deinterleaved_buffer[channel_idx].data(), + read_elements + ); + + if constexpr (std::invocable, size_t, std::span>) + { + auto absolute_channel_idx = chbegin + channel_idx; + postprocess(absolute_channel_idx, channel_span); + } + + schunks[channel_idx].append_chunk(std::move(context), channel_span); + + // Logging + if (y + scanlines_to_read == (spec_height + spec_y)) + { + std::string_view codec_name = enums::to_string(compression_codec); + std::string backend = enums::is_gpu_codec(compression_codec) ? "cuda" : "blosc2"; + get_logger()->debug( + std::format( + "[channel: {}] {} {}: uncompressed {} bytes; compressed {} bytes; cratio {}", + channel_idx, + backend, + codec_name, + schunks[channel_idx].chunk_bytes(), + schunks[channel_idx].csize(), + static_cast(schunks[channel_idx].chunk_bytes()) / schunks[channel_idx]. 
+ csize() + ) + ); + } + } + } + ); + + // Save our background work task tracking token to previous handle + previous_compute_future = std::move(slot.processing_future); + + // Cycle the Ring Buffer index and step image coordinate offset + ring_index = (ring_index + 1) % ring_buffer.size(); + y += scanlines_to_read; + } + + // 5. Final sync: block until the last chunk's processing pipeline completely winds down + if (previous_compute_future.valid()) + { + previous_compute_future.get(); + } + } #endif // COMPRESSED_IMAGE_OIIO_AVAILABLE - - }; - -} // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file + }; +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/iterators/channel.h b/compressed_image/include/compressed/iterators/channel.h new file mode 100644 index 0000000..35c1613 --- /dev/null +++ b/compressed_image/include/compressed/iterators/channel.h @@ -0,0 +1,444 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "compressed/blosc2/typedefs.h" +#include "compressed/blosc2/wrapper.h" +#include "compressed/containers/chunk_span.h" +#include "compressed/context.h" +#include "compressed/cuda/compression.h" +#include "compressed/detail/scoped_timer.h" +#include "compressed/enums.h" +#include "compressed/macros.h" +#include "compressed/util.h" + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + template + struct fitted_buffer + { + fitted_buffer() = default; + + explicit fitted_buffer(size_t initial_size) + { + m_buffer.resize(initial_size); + m_size = initial_size; + } + + std::span get() + { + return std::span(m_buffer.begin(), m_buffer.begin() + m_size); + } + + std::span get() const + { + return std::span(m_buffer.begin(), m_buffer.begin() + m_size); + } + + void reset() + { + m_size = m_buffer.size(); + m_is_fitted = false; + } + + void ensure_capacity(size_t capacity) + { + if (capacity > m_buffer.size()) + { + m_buffer.resize(capacity); 
+ } + + if (!m_is_fitted) + { + m_size = m_buffer.size(); + } + } + + void refit(size_t new_size) + { + if (new_size > m_buffer.size()) + { + throw std::invalid_argument( + std::format("New size exceeds buffer capacity. Maximum size is {:L}", m_buffer.size()) + ); + } + m_size = new_size; + m_is_fitted = true; + } + + size_t capacity() const noexcept + { + return m_buffer.size(); + } + + private: + util::default_init_vector m_buffer; + bool m_is_fitted = false; + size_t m_size = 0; + }; + + template + struct channel_iterator + { + using storage_type = std::remove_const_t; + using schunk_pointer = schunk_var_ptr; + + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = container::chunk_span; + using pointer = value_type*; + using reference = value_type&; + + channel_iterator() = default; + + channel_iterator( + schunk_pointer schunk, + size_t chunk_index, + size_t num_chunks, + size_t width, + size_t height, + enums::codec codec, + uint8_t compression_level, + size_t num_threads, + size_t block_size, + size_t chunk_size + ) + : m_state( + std::make_shared( + std::move(schunk), + detail::scratch_pool_registry::get_or_create_for_channel(), + chunk_index, + num_chunks, + width, + height, + codec, + compression_level, + num_threads, + block_size, + chunk_size + ) + ) + { + } + + ~channel_iterator() + { + if constexpr (!std::is_const_v) + { + try + { + flush(); + } + catch (...) + { + // Iterators must not throw from destructors. 
+ } + } + } + + value_type operator*() + { + ensure_dereferenceable(); + load_current_chunk(); + + if constexpr (!std::is_const_v) + { + m_state->dirty = true; + } + + return m_state->current_chunk; + } + + channel_iterator& operator++() + { + ensure_state(); + + if constexpr (!std::is_const_v) + { + flush(); + } + + if (m_state->chunk_index < m_state->num_chunks) + { + ++m_state->chunk_index; + } + + m_state->loaded = false; + return *this; + } + + channel_iterator operator++(int) + { + channel_iterator copy = *this; + ++(*this); + return copy; + } + + bool operator==(const channel_iterator& other) const noexcept + { + if (!m_state || !other.m_state) + { + return !m_state && !other.m_state; + } + + return m_state->schunk == other.m_state->schunk && m_state->chunk_index == other.m_state->chunk_index; + } + + bool operator!=(const channel_iterator& other) const noexcept + { + return !(*this == other); + } + + private: + struct state + { + state( + schunk_pointer schunk_, + std::shared_ptr scratch_pool_, + size_t chunk_index_, + size_t num_chunks_, + size_t width_, + size_t height_, + enums::codec codec_, + uint8_t compression_level_, + size_t num_threads_, + size_t block_size_, + size_t chunk_size_ + ) + : schunk(std::move(schunk_)), + scratch_pool(std::move(scratch_pool_)), + chunk_index(chunk_index_), + num_chunks(num_chunks_), + width(width_), + height(height_), + codec(codec_), + compression_level(compression_level_), + num_threads(num_threads_), + block_size(block_size_), + chunk_size(chunk_size_) + { + } + + schunk_pointer schunk = nullptr; + std::shared_ptr scratch_pool = nullptr; + size_t chunk_index = 0; + size_t num_chunks = 0; + size_t width = 0; + size_t height = 0; + enums::codec codec = enums::codec::lz4; + uint8_t compression_level = 9; + size_t num_threads = 1; + size_t block_size = 0; + size_t chunk_size = 0; + + fitted_buffer decompressed_buffer{}; + fitted_buffer compressed_buffer{}; + value_type current_chunk{}; + + std::optional context{}; + 
bool loaded = false; + bool dirty = false; + }; + + std::shared_ptr m_state{}; + + void ensure_state() const + { + if (!m_state || !m_state->schunk) + { + throw std::runtime_error("Invalid channel iterator state."); + } + } + + void ensure_dereferenceable() const + { + ensure_state(); + + if (m_state->chunk_index >= m_state->num_chunks) + { + throw std::out_of_range("Cannot dereference end channel iterator."); + } + } + + void ensure_context() + { + ensure_state(); + + if (m_state->context.has_value()) + { + return; + } + + const int gpu_device = enums::is_gpu_codec(m_state->codec) ? cuda::current_device() : 0; + m_state->context = create_context( + m_state->codec, + m_state->num_threads, + m_state->compression_level, + m_state->block_size, + gpu_device + ); + } + + void load_current_chunk() + { + if (m_state->loaded) + { + return; + } + + ensure_context(); + + const size_t chunk_elems = std::visit( + [&](const auto& schunk) + { + return schunk.chunk_elements(m_state->chunk_index); + }, + *m_state->schunk + ); + const size_t schunk_total = std::visit( + [&](const auto& schunk) + { + return schunk.size(); + }, + *m_state->schunk + ); + + size_t max_chunk_elems = m_state->chunk_size / sizeof(storage_type); + // Optimize for small chunks by allocating at most what is held in total. 
+ max_chunk_elems = std::min(max_chunk_elems, schunk_total); + + m_state->decompressed_buffer.ensure_capacity(max_chunk_elems); + m_state->decompressed_buffer.refit(chunk_elems); + + auto writable_buffer = m_state->decompressed_buffer.get(); + + std::visit( + [&](const auto& schunk) + { + if (enums::is_gpu_codec(m_state->codec)) + { + schunk.chunk(writable_buffer, m_state->chunk_index); + } + else + { + auto& cpu_context = std::get(*m_state->context); + schunk.chunk(cpu_context.decompression_ctx.get(), writable_buffer, m_state->chunk_index); + } + }, + *m_state->schunk + ); + + if constexpr (std::is_const_v) + { + m_state->current_chunk = value_type( + std::span(writable_buffer.data(), writable_buffer.size()), + m_state->width, + m_state->height, + m_state->chunk_index, + m_state->chunk_size / sizeof(storage_type) + ); + } + else + { + m_state->current_chunk = value_type( + writable_buffer, + m_state->width, + m_state->height, + m_state->chunk_index, + m_state->chunk_size / sizeof(storage_type) + ); + } + + m_state->loaded = true; + m_state->dirty = false; + } + + void flush() + { + if constexpr (std::is_const_v) + { + return; + } + else + { + if (!m_state || !m_state->loaded || !m_state->dirty || m_state->chunk_index >= m_state->num_chunks) + { + return; + } + + ensure_context(); + + auto buffer = m_state->decompressed_buffer.get(); + + std::visit( + [&](auto& schunk) + { + if (buffer.size() != schunk.chunk_elements(m_state->chunk_index)) + { + throw std::invalid_argument( + std::format( + "Invalid iterator chunk buffer size. 
Expected {} elements, got {}.", + schunk.chunk_elements(m_state->chunk_index), + buffer.size() + ) + ); + } + + if (enums::is_gpu_codec(m_state->codec)) + { + auto& gpu_context = std::get(*m_state->context); + schunk.set_chunk(gpu_context.ctx, buffer, m_state->chunk_index); + } + else + { + auto& cpu_context = std::get(*m_state->context); + schunk.set_chunk(cpu_context.compression_ctx, buffer, m_state->chunk_index); + } + }, + *m_state->schunk + ); + + m_state->dirty = false; + } + } + + static compression_context_var create_context( + const enums::codec codec, + const size_t num_threads, + const size_t compression_level, + const size_t block_size, + const int gpu_device + ) + { + if (enums::is_gpu_codec(codec)) + { + return gpu_compression_context{ + .ctx = cuda::make_compression_context(codec, gpu_device, block_size) + }; + } + + return cpu_compression_context{ + .compression_ctx = blosc2::create_compression_context( + num_threads, + codec, + static_cast(compression_level), + block_size + ), + .decompression_ctx = blosc2::create_decompression_context(num_threads), + .nthreads = num_threads + }; + } + }; +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/compressed_image/include/compressed/iterators/iterator.h b/compressed_image/include/compressed/iterators/iterator.h deleted file mode 100644 index d436895..0000000 --- a/compressed_image/include/compressed/iterators/iterator.h +++ /dev/null @@ -1,308 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include "compressed/detail/scoped_timer.h" -#include "compressed/macros.h" -#include "compressed/blosc2/wrapper.h" -#include "compressed/containers/chunk_span.h" - -namespace NAMESPACE_COMPRESSED_IMAGE -{ - - // Image iterator, cannot be used in parallel as it iterates the chunks. Dereferencing it gives a span view over the current decompressed - // context. 
- template - struct channel_iterator - { - // Iterator type definitions - using iterator_category = std::forward_iterator_tag; - using difference_type = std::ptrdiff_t; - using value_type = container::chunk_span; - using pointer = value_type*; - using reference = value_type&; - - channel_iterator() = default; - - channel_iterator( - blosc2::schunk_var_ptr schunk, - blosc2::context_raw_ptr compression_context, - blosc2::context_raw_ptr decompression_context, - size_t chunk_index, - size_t width, - size_t height - ) - : m_Schunk(schunk), - m_CompressionContext(compression_context), - m_DecompressionContext(decompression_context), - m_ChunkIndex(chunk_index), - m_Width(width), - m_Height(height) - { - // Check that we are not out of range, throw if we are - std::visit([&](auto& schunk) - { - if (m_ChunkIndex > schunk.num_chunks()) - { - throw std::out_of_range( - std::format( - "chunk_index is out of range for total number of chunks in blosc2_schunk." - " Max chunk number is {} but received {}", - schunk.num_chunks(), m_ChunkIndex - ) - ); - } - }, *m_Schunk); - - // Check that we don't pass zero width or height as e.g. the x() and y() functions of chunk_span require division by these dimensions - if (m_Width == 0 || m_Height == 0) - { - throw std::runtime_error( - std::format( - "passed zero width or height to iterator which is not valid, expected at least 1 pixel in either dimensions. Got [width: {} px, height: {} px]", - m_Width, m_Height - ) - ); - } - } - - ~channel_iterator() - { - _COMPRESSED_PROFILE_FUNCTION(); - // We need to ensure that the last chunk also gets compressed on destruction - // because of e.g. scope exit - if (m_DecompressionBufferWasRefitted) - { - compress_chunk(m_CompressionContext); - // If we iterated through the whole range at this point we'd have a - // chunk index == nchunks but the last chunk was not yet compressed. In this case - // we have to ensure we set the index back to compress again. 
- auto chunk_idx = m_ChunkIndex; - std::visit([&](auto& schunk) - { - if (m_ChunkIndex == schunk.num_chunks()) - { - chunk_idx = chunk_idx - 1; - } - }, *m_Schunk); - update_chunk(chunk_idx); - } - } - - /// Dereference operator: decompress the current chunk and recompress (if necessary) the previously compressed - /// chunk. value_type is a view over the current buffers. Iterator going out of scope while value_type is accessed is UB. - value_type operator*() - { - _COMPRESSED_PROFILE_FUNCTION(); - - // Initialize the data, this allows the base iterator to be copied over - // quite cheaply - if (!m_Initialized) - { - m_CompressionBuffer.resize(blosc2::min_compressed_size(this->chunk_bytes())); - m_CompressionBufferSize = m_CompressionBuffer.size(); - m_DecompressionBuffer.resize(blosc2::min_decompressed_size(this->chunk_bytes())); - m_DecompressionBufferSize = m_DecompressionBuffer.size(); - m_Initialized = true; - } - - if (!this->valid()) - { - throw std::runtime_error("Invalid Iterator struct encountered, cannot dereference item"); - } - - // Compress the previously decompressed chunk if it has been modified. - if (m_DecompressionBufferWasRefitted && m_ChunkIndex != 0) - { - this->compress_chunk(m_CompressionContext); - this->update_chunk(m_ChunkIndex - 1); - } - - // In most cases m_Decompressed.fitted_data should be identical to m_Decompressed.data. However, this is not true - // for the last chunk in the schunk which may not be the same decompressed size. - this->decompress_chunk(m_DecompressionContext); - - if (this->decompression_buffer_byte_size() % sizeof(T) != 0) - { - throw std::runtime_error( - std::format( - "Unable to dereference iterator as the decompressed size is not a multiple of {}." \ - " Got {:L} bytes. 
This is likely an internal decompression error.", - sizeof(T), decompression_buffer_byte_size() - ) - ); - } - - std::span item_span(reinterpret_cast(m_DecompressionBuffer.data()), m_DecompressionBufferSize / sizeof(T)); - return container::chunk_span(item_span, m_Width, m_Height, m_ChunkIndex, this->chunk_bytes()); - } - - // Pre-increment operator: move to the next chunk - channel_iterator& operator++() - { - ++m_ChunkIndex; - std::visit([&](auto& schunk) - { - if (m_ChunkIndex > schunk.num_chunks()) - { - throw std::out_of_range("Iterator: count exceeds number of chunks"); - }; - }, *m_Schunk); - return *this; - } - - channel_iterator& operator++(int) - { - channel_iterator temp = *this; - ++(*this); - return temp; - } - - bool operator==(const channel_iterator& other) const noexcept - { - return m_ChunkIndex == other.m_ChunkIndex && m_Schunk == other.m_Schunk; - } - - bool operator!=(const channel_iterator& other) const noexcept - { - return m_ChunkIndex != other.m_ChunkIndex || m_Schunk != other.m_Schunk; - } - - /// Return the chunk index the iterator is currently at. - size_t chunk_index() const noexcept { return m_ChunkIndex; } - - /// Return the chunk size of all but the last chunk. - size_t chunk_elements() const noexcept - { - return std::visit([&](auto& schunk) -> size_t - { - return schunk.chunk_elements(); - }, *m_Schunk); - } - - /// Return the chunk size of all but the last chunk. - size_t chunk_bytes() const noexcept - { - return std::visit([&](auto& schunk) -> size_t - { - return schunk.chunk_bytes(); - }, *m_Schunk); - } - - private: - - /// Buffers for storing compressed and decompressed data. 
These hold enough data for ChunkSize - /// but may be smaller, thus we keep track of m_CompressionBufferSize and m_DecompressionBufferSize - util::default_init_vector m_CompressionBuffer; - bool m_CompressionBufferWasRefitted = false; - size_t m_CompressionBufferSize = 0; // The fitted size of the container (only holding the compressed size) - - std::vector m_DecompressionBuffer; - bool m_DecompressionBufferWasRefitted = false; - size_t m_DecompressionBufferSize = 0; // The fitted size of the container (only holding the decompressed size) - - /// Pointers to the blosc2 structs. The data is owned by the `channel` struct and we just have a view over it. - blosc2::schunk_var_ptr m_Schunk; - blosc2::context_raw_ptr m_CompressionContext = nullptr; - blosc2::context_raw_ptr m_DecompressionContext = nullptr; - - size_t m_ChunkIndex = 0; - size_t m_Width = 0; - size_t m_Height = 0; - - /// this is set in the dereference operator to only initialize on first access - /// not on setup. - bool m_Initialized = false; - - private: - - size_t compression_buffer_byte_size() const noexcept - { - return m_CompressionBufferSize; - } - - size_t compression_buffer_max_byte_size() const noexcept - { - return m_CompressionBuffer.size(); - } - - size_t decompression_buffer_byte_size() const noexcept - { - return m_DecompressionBufferSize; - } - - size_t decompression_buffer_max_byte_size() const noexcept - { - return m_DecompressionBuffer.size(); - } - - /// Check for validity of this struct. 
- bool valid() const - { - if (!m_Schunk) - { - return false; - } - return std::visit([&](auto& schunk) - { - // Check that the schunk, compression and decompression ptrs are not null - bool ptrs_valid = m_Schunk && m_CompressionContext && m_DecompressionContext; - if (!ptrs_valid) - { - return false; - } - - bool compression_size_valid = m_CompressionBufferSize <= m_CompressionBuffer.size(); - bool decompression_size_valid = m_DecompressionBufferSize <= m_DecompressionBuffer.size(); - - bool idx_valid = m_ChunkIndex < schunk.num_chunks(); - bool compressed_data_valid = compression_buffer_max_byte_size() >= blosc2::min_compressed_size(this->chunk_bytes()); - bool decompressed_data_valid = decompression_buffer_max_byte_size() >= blosc2::min_decompressed_size(this->chunk_bytes()); - - return idx_valid && compressed_data_valid && decompressed_data_valid && compression_size_valid && decompression_size_valid; - }, *m_Schunk); - } - - /// Decompress a chunk using the given context and chunk pointer. Decompressing into the buffer - void decompress_chunk(blosc2::context_raw_ptr decompression_context_ptr) - { - _COMPRESSED_PROFILE_FUNCTION(); - auto buffer_span = std::span(reinterpret_cast(m_DecompressionBuffer.data()), m_DecompressionBufferSize / sizeof(T)); - - // apply the decompression. 
- std::visit([&](auto& schunk) - { - schunk.chunk(decompression_context_ptr, buffer_span, m_ChunkIndex); - m_DecompressionBufferSize = schunk.chunk_bytes(m_ChunkIndex); - m_DecompressionBufferWasRefitted = true; - }, *m_Schunk); - } - - /// Compress a chunk from the decompressed view into the compressed view - void compress_chunk(blosc2::context_raw_ptr compression_context_ptr) - { - _COMPRESSED_PROFILE_FUNCTION(); - std::span fitted = { reinterpret_cast(m_DecompressionBuffer.data()), m_DecompressionBufferSize / sizeof(T) }; - auto compressed_size = blosc2::compress(compression_context_ptr, fitted, m_CompressionBuffer); - - m_CompressionBufferSize = compressed_size; - m_CompressionBufferWasRefitted = true; - } - - /// Update and replace the chunk inside of the superchunk at the given index. - void update_chunk(size_t chunk_index) - { - _COMPRESSED_PROFILE_FUNCTION(); - auto byte_span = std::span(m_CompressionBuffer.data(), this->compression_buffer_byte_size()); - std::visit([&](auto& schunk) - { - schunk.set_chunk(byte_span, chunk_index); - }, *m_Schunk); - } - }; - - -} // NAMESPACE_COMPRESSED_IMAGE \ No newline at end of file diff --git a/compressed_image/include/compressed/logger.h b/compressed_image/include/compressed/logger.h new file mode 100644 index 0000000..fcb0b39 --- /dev/null +++ b/compressed_image/include/compressed/logger.h @@ -0,0 +1,52 @@ +#pragma once + +#include "macros.h" + +#include +#include +#include + + +namespace +NAMESPACE_COMPRESSED_IMAGE +{ + namespace detail + { + inline std::shared_ptr s_logger = nullptr; // inline (not static): one logger shared by every translation unit that includes this header + /// \brief The default logger name used internally if the user does not provide one. + inline std::string s_default_logger_name = "compressed_image"; + } + + + /// \brief Set the logger instance used by the compressed-image api. + /// + /// This function allows consumers of the library to provide their own `spdlog::logger` instance.
+ /// This can be useful to integrate the library’s logging output into an existing logging system, + /// route messages to a file, or change verbosity dynamically. + /// + /// If no logger is set, the library will lazily create a default one that logs to `stdout` at info level. + /// + /// \param logger The `spdlog::logger` instance to use for all library logging. + inline void set_logger(std::shared_ptr logger) + { + detail::s_logger = logger; + } + + /// \brief Retrieve the current logger instance used by the compressed-image api. + /// + /// If no logger has been previously set via `set_logger`, this function will initialize + /// a default logger named `"compressed_image"` that logs to standard output with color support, + /// and at `spdlog::level::info` verbosity. + /// + /// \return A shared pointer to the currently active `spdlog::logger`. + inline std::shared_ptr get_logger() + { + if (!detail::s_logger) + { + // Lazy init with a sensible default + detail::s_logger = spdlog::stdout_color_mt(detail::s_default_logger_name); + detail::s_logger->set_level(spdlog::level::info); + } + return detail::s_logger; + } +} // NAMESPACE_COMPRESSED_IMAGE diff --git a/examples/gpu_compression/CMakeLists.txt b/examples/gpu_compression/CMakeLists.txt new file mode 100644 index 0000000..469f5d7 --- /dev/null +++ b/examples/gpu_compression/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(lazy_channels main.cpp) +target_link_libraries(lazy_channels PRIVATE compressed_image) diff --git a/examples/gpu_compression/main.cpp b/examples/gpu_compression/main.cpp new file mode 100644 index 0000000..4f479fe --- /dev/null +++ b/examples/gpu_compression/main.cpp @@ -0,0 +1,57 @@ + +#include +#include +#include +#include + +#include + + +auto main() -> int +{ + // The compressed_image API provides multiple ways of generating lazy chunks to represent sparse data. This generates + // chunks represented by a single value that take up just a couple of bytes.
This method is especially useful when you + // are planning to fill the channel with sparse data to then pass along to an image or somewhere else. + auto channel_zeros = compressed::channel::zeros(1920, 1080); + auto channel_full = compressed::channel::full(1920, 1080, 65535 /* fill value */); + + // We can also directly mirror another channel, this doesn't have to be a lazy channel! + auto channel_zeros_like = compressed::channel::zeros_like(channel_zeros); + auto channel_full_like = compressed::channel::full_like(channel_full, 24 /* fill value */); + + // When working with these lazy channels one has to slightly rethink how they approach modifying chunks within a + // channel. This is because the usual `set_chunk` method will actually trigger a non-lazy chunk to be generated using + // up more memory and being slower + // + // So instead of: + for ([[maybe_unused]] auto chunk : channel_zeros) + { + // modify the chunk + } + + // One should instead do the following: + + // Generate a vector with uninitialized data since we'll set it directly after. + compressed::util::default_init_vector chunk_buffer(channel_zeros.chunk_size()); + + for (size_t chunk_idx = 0; chunk_idx < channel_zeros.num_chunks(); ++chunk_idx) + { + // Only conditionally modify the chunk, do this to avoid breaking the laziness of chunks unless necessary. + if (true /*some arbitrary condition*/) + { + // Note: we need to ensure this is set to chunk_elems(chunk_idx) as the last chunk of a channel may be smaller + // than the rest of the chunks in the channel, this way we don't have to worry about the chunk size.
+ std::span chunk_span(chunk_buffer.data(), channel_zeros.chunk_elems(chunk_idx)); + + channel_zeros.get_chunk(chunk_span, chunk_idx); + + // modify the data to your heart's content + + channel_zeros.set_chunk(chunk_span, chunk_idx); + } + } + + // While lazy chunks are mentioned as a good way of generating sparse data they are also generally the fastest way to + // initialize a channel you are planning to populate fully as it is very cheap to instantiate and you only pay the + // memory price as you go! +} \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b202d9b..06f38cd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -4,9 +4,9 @@ file(GLOB_RECURSE MY_SOURCES CONFIGURE_DEPENDS "src/*.cpp") enable_testing() add_executable(compressed_image_test ${MY_SOURCES} "main.cpp") -if(MSVC) - target_compile_options(compressed_image_test PRIVATE /MP /utf-8) -endif() +if (MSVC) + target_compile_options(compressed_image_test PRIVATE /MP /utf-8 /bigobj) +endif () target_link_libraries(compressed_image_test PRIVATE compressed_image) target_link_libraries(compressed_image_test PRIVATE doctest) @@ -15,8 +15,18 @@ add_test(test_compressed_image compressed_image_test) # Copy the images/ folder to the build dir to run the tests add_custom_command(TARGET compressed_image_test POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory - ${CMAKE_CURRENT_SOURCE_DIR}/images/ $/images) + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/images/ $/images) add_custom_command(TARGET compressed_image_test POST_BUILD - COMMAND ${CMAKE_COMMAND} -E echo - "Finished copying test files to output directory $/images") \ No newline at end of file + COMMAND ${CMAKE_COMMAND} -E echo + "Finished copying test files to output directory $/images") + +if (NVCOMP_RUNTIME_BINARIES) + add_custom_command( + TARGET compressed_image_test POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${NVCOMP_RUNTIME_BINARIES} + $ + COMMENT
"Syncing nvcomp runtime dependencies next to test executable..." + ) +endif () \ No newline at end of file diff --git a/test/main.cpp b/test/main.cpp index 7c5687a..dee8622 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -2,103 +2,362 @@ #define DOCTEST_CONFIG_IMPLEMENT #include "doctest.h" + #include #include #include #include #include +#define _COMPRESSED_PROFILE 1 +#include "compressed/detail/scoped_timer.h" -/// Create a reporter which prints out failure statistics at the end -struct FailureReporter : public doctest::ConsoleReporter +struct test_log_reporter : public doctest::ConsoleReporter { - FailureReporter(const doctest::ContextOptions& opt) : doctest::ConsoleReporter(opt) {} + const doctest::TestCaseData* tc = nullptr; + std::mutex mutex; + + // Track both test cases and subcases for the final report summary + std::vector> test_durations; + + // Flat raw structure captured during test runs + struct subcase_report + { + std::string name; + double seconds = 0.0; + size_t depth = 0; + }; + + std::vector buffered_subcases; + + // Reconstructed tree structure used for grouped reporting + struct subcase_node + { + std::string name; + double total_seconds = 0.0; + int call_count = 0; + std::vector children; + }; + + test_log_reporter(const doctest::ContextOptions& opt) : doctest::ConsoleReporter(opt) + { + } + + void test_case_start(const doctest::TestCaseData& in) override + { + std::lock_guard lock(mutex); + tc = ∈ + buffered_subcases.clear(); + active_subcase_stack.clear(); + } + + void test_case_reenter(const doctest::TestCaseData& in) override + { + std::lock_guard lock(mutex); + tc = ∈ + active_subcase_stack.clear(); + } + + void test_case_end(const doctest::CurrentTestCaseStats& in) override + { + std::lock_guard lock(mutex); + if (!tc) return; + + // Track the root parent test case + test_durations.push_back({std::string(tc->m_name), in.seconds}); + + constexpr int column_width = 120; + constexpr const char* blue = "\033[36m"; + constexpr const char* 
red = "\033[31m"; + constexpr const char* gold = "\033[33m"; + constexpr const char* reset = "\033[0m"; + constexpr const char* dim_dot = "\033[90m"; + + if (!in.testCaseSuccess) + { + std::string reason = "Assertion failure"; + if (in.failure_flags & doctest::TestCaseFailureReason::Exception) reason = "Unhandled Exception"; + else if (in.failure_flags & doctest::TestCaseFailureReason::Crash) reason = "Crash"; + else if (in.failure_flags & doctest::TestCaseFailureReason::ShouldHaveFailedButDidnt) + reason = "Expected failure missing"; + + std::cout + << gold << "\n===============================================================================\n" + << blue << "[doctest]" << reset << " Failure in test case: " << red << tc->m_name << reset + << " | Reason: " << gold << reason << "\n" + << "===============================================================================\n" << reset + << std::endl; + } + else + { + std::string time_str = format_duration_clean(in.seconds); + std::string time_color = get_duration_color(in.seconds); + std::string right_side = std::format("{}{} ok{}", time_color, time_str, reset); + std::string right_side_plain = std::format("{} ok", time_str); + + size_t max_name_len = column_width - std::string("[doctest] ").length() - right_side_plain.length() - 3; + std::string display_name = tc->m_name; + if (display_name.length() > max_name_len) + { + display_name = truncate_string(display_name, max_name_len); + } + + std::string left_side = std::format("[doctest] {}", display_name); + int fill_dots = column_width - static_cast(left_side.length() + right_side_plain.length()); + if (fill_dots < 3) fill_dots = 3; + + std::cout << left_side << dim_dot << std::string(fill_dots, '.') << reset << right_side << std::endl; + } + + // 1. 
Reconstruct hierarchical tree from raw flat loop history + std::vector root_nodes; + std::vector current_path_names; + + for (const auto& sub : buffered_subcases) + { + if (current_path_names.size() >= sub.depth) + { + current_path_names.resize(sub.depth - 1); + } + current_path_names.push_back(sub.name); + + std::vector* current_level = &root_nodes; + for (size_t d = 0; d < current_path_names.size(); ++d) + { + const std::string& name = current_path_names[d]; + auto it = std::find_if( + current_level->begin(), + current_level->end(), + [&name](const subcase_node& node) { return node.name == name; } + ); + + if (it == current_level->end()) + { + current_level->push_back(subcase_node{name, 0.0, 0, {}}); + it = current_level->end() - 1; + } + + if (d == current_path_names.size() - 1) + { + it->total_seconds += sub.seconds; + it->call_count++; + } + current_level = &it->children; + } + } + + // 2. Extract and recursively add subcase profiles into final summary metrics + collect_subcase_durations(root_nodes, std::string(tc->m_name)); + + // 3. 
Print the collapsed tree down the console pipe using clean ASCII configurations + print_subcase_tree(root_nodes, "", column_width); - void test_case_start(const doctest::TestCaseData& in) override { tc = ∈ } + buffered_subcases.clear(); + tc = nullptr; + } + void subcase_start(const doctest::SubcaseSignature& in) override + { + std::lock_guard lock(mutex); - void test_case_end(const doctest::CurrentTestCaseStats& in) override - { - if (in.failure_flags == doctest::TestCaseFailureReason::Exception || - in.failure_flags == doctest::TestCaseFailureReason::Crash || - in.failure_flags == doctest::TestCaseFailureReason::ShouldHaveFailedButDidnt - ) - { - std::lock_guard lock(mutex); - if (tc) - { - std::string reason = ""; - if (in.failure_flags == doctest::TestCaseFailureReason::Exception) - { - reason = "Exception"; - } - else if (in.failure_flags == doctest::TestCaseFailureReason::Crash) - { - reason = "Crash"; - } - else if (in.failure_flags == doctest::TestCaseFailureReason::ShouldHaveFailedButDidnt) - { - reason = "Expected failure not failing"; - } + size_t idx = buffered_subcases.size(); + buffered_subcases.push_back({in.m_name.c_str(), 0.0, active_subcase_stack.size() + 1}); + active_subcase_stack.push_back({in.m_name.c_str(), std::chrono::high_resolution_clock::now(), idx}); + } - constexpr const char* blue = "\033[36m"; - constexpr const char* red = "\033[31m"; - constexpr const char* gold = "\033[33m"; - constexpr const char* reset = "\033[0m"; - std::cout - << gold << "===============================================================================\n" - << blue << "[doctest]" << reset << " Failure in test case: " << red << std::string(tc->m_name) << reset - << " with reason: " << gold << reason << "\n" - << "===============================================================================\n" << reset - << std::endl; - } - } - else - { - constexpr int column_width = 100; - std::cout << std::format("[doctest] {:.<{}} ok", std::string(tc->m_name), column_width - 
12) << std::endl; - } - } + void subcase_end() override + { + auto end_time = std::chrono::high_resolution_clock::now(); + if (active_subcase_stack.empty()) return; + auto top = active_subcase_stack.back(); + active_subcase_stack.pop_back(); - void report_query(const doctest::QueryData& /*in*/) override {} + std::chrono::duration elapsed = end_time - top.start_time; - void test_run_start() override {} + std::lock_guard lock(mutex); + if (top.report_index < buffered_subcases.size()) + { + buffered_subcases[top.report_index].seconds = elapsed.count(); + } + } - void test_run_end(const doctest::TestRunStats& /*in*/) override {} + void log_assert(const doctest::AssertData& in) override + { + if (!in.m_failed) return; - void test_case_reenter(const doctest::TestCaseData& /*in*/) override {} + std::lock_guard lock(mutex); + constexpr const char* red = "\033[31m"; + constexpr const char* gold = "\033[33m"; + constexpr const char* reset = "\033[0m"; - void test_case_exception(const doctest::TestCaseException& /*in*/) override {} + std::cout << "\n" + << red << " `-- ASSERTION FAILURE:\n" << reset + << " " << gold << "File: " << reset << in.m_file << ":" << in.m_line << "\n" + << " " << gold << "Expr: " << reset << in.m_expr << "\n" + << " " << gold << "Decomp: " << red << in.m_decomp << reset << "\n" + << std::endl; + } - void subcase_start(const doctest::SubcaseSignature& /*in*/) override {} + void test_run_end(const doctest::TestRunStats& /*in*/) override + { + std::lock_guard lock(mutex); + if (test_durations.empty()) return; - void subcase_end() override {} + constexpr const char* gold = "\033[33m"; + constexpr const char* reset = "\033[0m"; - void log_assert([[maybe_unused]] const doctest::AssertData& in) override {} + std::sort( + test_durations.begin(), + test_durations.end(), + [](const auto& a, const auto& b) { return a.second > b.second; } + ); - void log_message(const doctest::MessageData& /*in*/) override {} + std::cout << "\n" << gold << "Slowest test paths & 
subcases (Top 10):" << reset << "\n"; - void test_case_skipped(const doctest::TestCaseData& /*in*/) override {} + size_t display_count = std::min(size_t(10), test_durations.size()); + for (size_t i = 0; i < display_count; ++i) + { + const auto& [name, seconds] = test_durations[i]; + std::string time_str = format_duration_clean(seconds, true); + std::string time_color = get_duration_color(seconds); + std::cout << std::format(" [{}{}{}] {}\n", time_color, time_str, reset, name); + } + std::cout << std::endl; + } + + void report_query(const doctest::QueryData&) override + { + } + + void test_run_start() override + { + } + + void test_case_exception(const doctest::TestCaseException&) override + { + } + + void log_message(const doctest::MessageData&) override + { + } + + void test_case_skipped(const doctest::TestCaseData&) override + { + } private: + struct subcase_timing + { + std::string name; + std::chrono::high_resolution_clock::time_point start_time; + size_t report_index; + }; + + inline static thread_local std::vector active_subcase_stack; + + void collect_subcase_durations(const std::vector& nodes, const std::string& parent_path) + { + for (const auto& node : nodes) + { + std::string current_path = parent_path + " > " + node.name; + std::string report_name = current_path; + + test_durations.push_back({report_name, node.total_seconds}); + + if (!node.children.empty()) + { + collect_subcase_durations(node.children, current_path); + } + } + } + + void print_subcase_tree(const std::vector& nodes, const std::string& prefix, int column_width) + { + constexpr const char* reset = "\033[0m"; + constexpr const char* dim_dot = "\033[90m"; + + for (size_t i = 0; i < nodes.size(); ++i) + { + const auto& node = nodes[i]; + bool is_last = (i == nodes.size() - 1); + std::string branch = is_last ? 
"`-- " : "|-- "; - const doctest::TestCaseData* tc = nullptr; - std::mutex mutex; + std::string display_name = node.name; + + std::string sub_time_str = format_duration_clean(node.total_seconds); + std::string sub_time_color = get_duration_color(node.total_seconds); + std::string right_side = std::format("{}{} ok{}", sub_time_color, sub_time_str, reset); + std::string right_side_plain = std::format("{} ok", sub_time_str); + + size_t left_base_len = 10 + prefix.length() + branch.length(); + size_t space_budget = column_width - left_base_len - right_side_plain.length() - 3; + + if (display_name.length() > space_budget) + { + display_name = truncate_string(display_name, space_budget); + } + + std::string left_side = std::format(" {}{}{}", prefix, branch, display_name); + int fill_dots = column_width - static_cast(left_side.length() + right_side_plain.length()); + if (fill_dots < 3) fill_dots = 3; + + std::cout << left_side << dim_dot << std::string(fill_dots, '.') << reset << right_side << std::endl; + + if (!node.children.empty()) + { + std::string next_prefix = prefix + (is_last ? 
" " : "| "); + print_subcase_tree(node.children, next_prefix, column_width); + } + } + } + + static std::string format_duration_clean(double seconds, bool fixed_width = false) + { + if (fixed_width) + { + if (seconds < 0.001) return std::format("{:>8}", "<1ms"); + if (seconds < 1.0) return std::format("{:>6.1f}ms", seconds * 1000.0); + return std::format("{:>6.2f}s ", seconds); + } + if (seconds < 0.001) return "(<1ms)"; + if (seconds < 1.0) return std::format("({:.1f}ms)", seconds * 1000.0); + return std::format("({:.2f}s)", seconds); + } + + static std::string get_duration_color(const double seconds) + { + if (seconds < 0.010) return "\033[90m"; // Dim Grey (<10ms) + if (seconds < 0.250) return "\033[32m"; // Clean Green (<250ms) + if (seconds < 1.000) return "\033[0m"; // Standard Text (<1s) + if (seconds < 3.000) return "\033[33m"; // Warning Yellow (<3s) + return "\033[1;31m"; // Bold Panic Red (>=3s) + } + + static std::string truncate_string(const std::string& str, size_t max_len) + { + if (str.length() <= max_len) return str; + if (max_len <= 3) return "..."; + return str.substr(0, max_len - 3) + "..."; + } }; -REGISTER_LISTENER("failure", /*priority=*/1, FailureReporter); +REGISTER_LISTENER("test_log", /*priority=*/1, test_log_reporter); int main() { - doctest::Context context; - int res = context.run(); - - if (context.shouldExit()) - { - return res; - } - return res; -} \ No newline at end of file + compressed::detail::Instrumentor::Get().BeginSession("Tests"); + + doctest::Context context; + int res = context.run(); + + if (context.shouldExit()) + { + compressed::detail::Instrumentor::Get().EndSession(); + return res; + } + compressed::detail::Instrumentor::Get().EndSession(); + return res; +} diff --git a/test/src/test_channel.cpp b/test/src/test_channel.cpp index 405bea2..429c525 100644 --- a/test/src/test_channel.cpp +++ b/test/src/test_channel.cpp @@ -8,36 +8,39 @@ #include +#define _COMPRESSED_PROFILE 1 #include +#include #include #include "util.h" 
- // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- -TEST_CASE("Initialize channel from incorrect schunk" - * doctest::no_breaks(true) - * doctest::no_output(true) - * doctest::should_fail(true) +TEST_CASE( + "Initialize channel from incorrect schunk" + * doctest::no_breaks(true) + * doctest::no_output(true) + * doctest::should_fail(true) ) { - auto schunk = compressed::blosc2::schunk(); - auto channel = compressed::channel(std::move(schunk), 1, 1); + auto schunk = compressed::detail::schunk(); + auto channel = compressed::channel(std::move(schunk), 1, 1); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- -TEST_CASE("Initialize channel from incorrect span" - * doctest::no_breaks(true) - * doctest::no_output(true) - * doctest::should_fail(true) +TEST_CASE( + "Initialize channel from incorrect span" + * doctest::no_breaks(true) + * doctest::no_output(true) + * doctest::should_fail(true) ) { - auto vec = std::vector(50); - auto channel = compressed::channel(std::span(vec), 1, 1); + auto vec = std::vector(50); + auto channel = compressed::channel(std::span(vec), 1, 1); } @@ -45,29 +48,29 @@ TEST_CASE("Initialize channel from incorrect span" // ----------------------------------------------------------------------------------- TEST_CASE("Empty channel creation") { - auto vec = std::vector(0); + auto vec = std::vector(0); + + auto channel = compressed::channel(std::span(vec), 0, 0); - auto channel = compressed::channel(std::span(vec), 0, 0); - - CHECK(channel.uncompressed_size() == 0); - CHECK(channel.width() == 0); - CHECK(channel.height() == 0); + CHECK(channel.uncompressed_size() == 0); + CHECK(channel.width() == 0); + CHECK(channel.height() == 0); - auto decompressed = channel.get_decompressed(); - 
CHECK(decompressed.size() == 0); + auto decompressed = channel.get_decompressed(); + CHECK(decompressed.size() == 0); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE("Roundtrip channel creation") { - auto vec = std::vector(50); - std::iota(vec.begin(), vec.end(), 0); + auto vec = std::vector(50); + std::iota(vec.begin(), vec.end(), 0); - auto channel = compressed::channel(std::span(vec), 10, 5); - auto roundtripped = channel.get_decompressed(); + auto channel = compressed::channel(std::span(vec), 10, 5); + auto roundtripped = channel.get_decompressed(); - CHECK(vec == roundtripped); + CHECK(vec == roundtripped); } @@ -75,68 +78,117 @@ TEST_CASE("Roundtrip channel creation") // ----------------------------------------------------------------------------------- TEST_CASE("Roundtrip channel creation larger than chunksize") { - auto vec = std::vector(8192); - std::iota(vec.begin(), vec.end(), 0); - - auto channel = compressed::channel(std::span(vec), 128, 64, compressed::enums::codec::lz4, 9, 128, 4096); - auto roundtripped = channel.get_decompressed(); - - CHECK(vec == roundtripped); + auto vec = std::vector(8192); + std::iota(vec.begin(), vec.end(), 0); + + auto channel = compressed::channel( + std::span(vec), + 128, + 64, + compressed::enums::codec::lz4, + 9, + 128, + 4096 + ); + auto roundtripped = channel.get_decompressed(); + + CHECK(vec == roundtripped); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- -TEST_CASE("Channel get attributes" +TEST_CASE( + "Channel get attributes" ) { - auto vec = std::vector(50); - auto channel = compressed::channel(std::span(vec), 10, 5, compressed::enums::codec::blosclz, 9); - - CHECK(channel.width() == 10); - CHECK(channel.height() == 5); - CHECK(channel.compression() 
== compressed::enums::codec::blosclz); - CHECK(channel.compression_context() != nullptr); - CHECK(channel.decompression_context() != nullptr); - CHECK(channel.uncompressed_size() == 50); - CHECK(channel.num_chunks() == 1); + auto vec = std::vector(50); + auto channel = compressed::channel(std::span(vec), 10, 5, compressed::enums::codec::blosclz, 9); + + CHECK(channel.width() == 10); + CHECK(channel.height() == 5); + CHECK(channel.compression() == compressed::enums::codec::blosclz); + CHECK(channel.uncompressed_size() == 50); + CHECK(channel.num_chunks() == 1); } - // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE("Channel iterate") { - auto vec = std::vector(128, 255); - auto channel = compressed::channel(std::span(vec), 16, 8); - - SUBCASE("Read") - { - for (auto chunk_span : channel) - { - for (auto& pixel : chunk_span) - { - CHECK(pixel == 255); - } - } - } - - SUBCASE("Modify") - { - for (auto chunk_span : channel) - { - for (auto& pixel : chunk_span) - { - pixel = 128; - } - } - - for (auto chunk_span : channel) - { - for (auto& pixel : chunk_span) - { - CHECK(pixel == 128); - } - } - } -} \ No newline at end of file + auto vec = std::vector(128, 255); + auto channel = compressed::channel(std::span(vec), 16, 8); + + SUBCASE("Read") + { + size_t count = 0; + + for (auto chunk_span : channel) + { + for (auto& pixel : chunk_span) + { + CHECK(pixel == 255); + ++count; + } + } + + CHECK(count == vec.size()); + } + + SUBCASE("Modify") + { + for (auto chunk_span : channel) + { + for (auto& pixel : chunk_span) + { + pixel = 128; + } + } + + for (auto chunk_span : channel) + { + for (auto& pixel : chunk_span) + { + CHECK(pixel == 128); + } + } + + auto decompressed = channel.get_decompressed(); + test_util::check_vector_verbose(decompressed, static_cast(128)); + } +} + + +// 
----------------------------------------------------------------------------------- +// ----------------------------------------------------------------------------------- +TEST_CASE("Channel iterate multiple chunks") +{ + auto vec = std::vector(128); + std::iota(vec.begin(), vec.end(), uint16_t{0}); + + auto channel = compressed::channel( + std::span(vec), + 16, + 8, + compressed::enums::codec::lz4, + 9, + 64, + 64 + ); + + size_t count = 0; + for (auto chunk_span : channel) + { + for (auto& pixel : chunk_span) + { + pixel = 42; + ++count; + } + } + + CHECK(count == vec.size()); + + auto decompressed = channel.get_decompressed(); + CHECK(decompressed.size() == vec.size()); + CHECK(std::ranges::all_of(decompressed, [](auto value) { return value == 42; })); +} diff --git a/test/src/test_chunk_span.cpp b/test/src/test_chunk_span.cpp index f5dbffe..72bc633 100644 --- a/test/src/test_chunk_span.cpp +++ b/test/src/test_chunk_span.cpp @@ -6,6 +6,7 @@ #include #include +#define _COMPRESSED_PROFILE 1 #include #include "util.h" @@ -15,14 +16,20 @@ // ----------------------------------------------------------------------------------- TEST_CASE("Get coordinates in base-chunk") { - std::vector data(50); - auto span_container = compressed::container::chunk_span(std::span(data), 10, 5, 0, compressed::s_default_chunksize); + std::vector data(50); + auto span_container = compressed::container::chunk_span( + std::span(data), + 10, + 5, + 0, + compressed::s_default_chunksize + ); - CHECK(span_container.x(9) == 9); - CHECK(span_container.y(5) == 0); + CHECK(span_container.x(9) == 9); + CHECK(span_container.y(5) == 0); - CHECK(span_container.x(15) == 5); - CHECK(span_container.y(15) == 1); + CHECK(span_container.x(15) == 5); + CHECK(span_container.y(15) == 1); } @@ -30,13 +37,13 @@ TEST_CASE("Get coordinates in base-chunk") // ----------------------------------------------------------------------------------- TEST_CASE("Get coordinates in non-base chunk") { - std::vector data(50); - 
auto span_container = compressed::container::chunk_span(std::span(data), 128, 128, 1, 128); + std::vector data(50); + auto span_container = compressed::container::chunk_span(std::span(data), 128, 128, 1, 128); - CHECK(span_container.x(9) == 9); - CHECK(span_container.y(5) == 1); - CHECK(span_container.x(135) == 7); - CHECK(span_container.y(129) == 2); + CHECK(span_container.x(9) == 9); + CHECK(span_container.y(5) == 1); + CHECK(span_container.x(135) == 7); + CHECK(span_container.y(129) == 2); } @@ -44,14 +51,20 @@ TEST_CASE("Get coordinates in non-base chunk") // ----------------------------------------------------------------------------------- TEST_CASE("Iter over chunk") { - std::vector data(50, 5); - auto span_container = compressed::container::chunk_span(std::span(data), 50, 1, 0, compressed::s_default_chunksize); - - size_t count = 0; - for (const auto& pixel : span_container) - { - CHECK(pixel == 5); - ++count; - } - CHECK(count == 50); + std::vector data(50, 5); + auto span_container = compressed::container::chunk_span( + std::span(data), + 50, + 1, + 0, + compressed::s_default_chunksize + ); + + size_t count = 0; + for (const auto& pixel : span_container) + { + CHECK(pixel == 5); + ++count; + } + CHECK(count == 50); } diff --git a/test/src/test_image.cpp b/test/src/test_image.cpp index 0810ab1..fa0298f 100644 --- a/test/src/test_image.cpp +++ b/test/src/test_image.cpp @@ -2,14 +2,12 @@ #include #include -#include #include +#define _COMPRESSED_PROFILE 1 #include #include -#include -#include #include "util.h" @@ -18,67 +16,82 @@ // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file smaller than one chunk") { - std::string name = "uv_grid_2048x2048.jpg"; - auto path = std::filesystem::current_path() / "images" / name; - - auto image = compressed::image::read( - path, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize * 2 - ); - auto 
image_data = image.get_decompressed(); - auto image_ref = test_util::read_oiio(path); - - - test_util::compare_images(image_data, image_ref, name); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + std::string name = "uv_grid_2048x2048.jpg"; + auto path = std::filesystem::current_path() / "images" / name; + + auto image = compressed::image::read( + path, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize * 2 + ); + auto image_data = image.get_decompressed(); + auto image_ref = test_util::read_oiio(path); + + + test_util::compare_images(image_data, image_ref, name); + } + ); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed tiled file and extract channels") { - std::string name = "tiled_cryptomatte.exr"; - auto path = std::filesystem::current_path() / "images" / name; - - auto image = compressed::image::read( - path, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize * 2 - ); - auto image_data = image.get_decompressed(); - auto image_ref = test_util::read_oiio(path); - - test_util::compare_images(image_data, image_ref, name); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + std::string name = "tiled_cryptomatte.exr"; + auto path = std::filesystem::current_path() / "images" / name; + + auto image = compressed::image::read( + path, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize * 2 + ); + auto image_data = image.get_decompressed(); + auto image_ref = test_util::read_oiio(path); + + test_util::compare_images(image_data, image_ref, name); + } + ); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- 
TEST_CASE("Read compressed multipart file and extract channels") { - std::string name = "multipart.0001.exr"; - auto path = std::filesystem::current_path() / "images" / name; - - for (int subimage = 0; subimage < 10; ++subimage) - { - auto image = compressed::image::read( - path, - subimage, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize * 2 - ); - auto image_data = image.get_decompressed(); - auto image_ref = test_util::read_oiio(path, subimage); - - test_util::compare_images(image_data, image_ref, name); - } + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + std::string name = "multipart.0001.exr"; + auto path = std::filesystem::current_path() / "images" / name; + + for (int subimage = 0; subimage < 10; ++subimage) + { + auto image = compressed::image::read( + path, + subimage, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize * 2 + ); + auto image_data = image.get_decompressed(); + auto image_ref = test_util::read_oiio(path, subimage); + + test_util::compare_images(image_data, image_ref, name); + } + } + ); } @@ -86,26 +99,31 @@ TEST_CASE("Read compressed multipart file and extract channels") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file and extract channels") { - std::string name = "uv_grid_2048x2048.jpg"; - auto path = std::filesystem::current_path() / "images" / name; - - auto image = compressed::image::read(path); - - std::vector> decompressed; - for ([[maybe_unused]] auto _: std::views::iota(size_t{ 0 }, image.num_channels())) - { - // Since we keep pulling out the channels the indices change back to zero - auto channel = image.extract_channel(0); - decompressed.push_back(channel.get_decompressed()); - } - auto image_ref = test_util::read_oiio(path); - - // Since we extracted the channels, the number of channels should be zero with the channelnames - // also being 
empty - CHECK(image.num_channels() == 0); - CHECK(image.channelnames() == std::vector{}); - - test_util::compare_images(decompressed, image_ref, name); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + std::string name = "uv_grid_2048x2048.jpg"; + auto path = std::filesystem::current_path() / "images" / name; + + auto image = compressed::image::read(path, 0, codec); + + std::vector> decompressed; + for ([[maybe_unused]] auto _ : std::views::iota(size_t{0}, image.num_channels())) + { + // Since we keep pulling out the channels the indices change back to zero + auto channel = image.extract_channel(0); + decompressed.push_back(channel.get_decompressed()); + } + auto image_ref = test_util::read_oiio(path); + + // Since we extracted the channels, the number of channels should be zero with the channelnames + // also being empty + CHECK(image.num_channels() == 0); + CHECK(image.channelnames() == std::vector{}); + + test_util::compare_images(decompressed, image_ref, name); + } + ); } @@ -113,17 +131,17 @@ TEST_CASE("Read compressed file and extract channels") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file get attributes") { - std::string name = "uv_grid_2048x2048.jpg"; - auto path = std::filesystem::current_path() / "images" / name; + std::string name = "uv_grid_2048x2048.jpg"; + auto path = std::filesystem::current_path() / "images" / name; - auto image = compressed::image::read(path); + auto image = compressed::image::read(path); - CHECK(image.width() == 2048); - CHECK(image.height() == 2048); - CHECK(image.num_channels() == 3); - CHECK(image.channelnames() == std::vector{"R", "G", "B"}); - CHECK(image.metadata().size() > 0); - CHECK(image.chunk_size() == compressed::s_default_chunksize); + CHECK(image.width() == 2048); + CHECK(image.height() == 2048); + CHECK(image.num_channels() == 3); + CHECK(image.channelnames() == std::vector{"R", "G", "B"}); + 
CHECK(image.metadata().size() > 0); + CHECK(image.chunk_size() == compressed::s_default_chunksize); } @@ -131,15 +149,20 @@ TEST_CASE("Read compressed file get attributes") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file exactly than one chunk") { - std::string name = "uv_grid_2048x2048.jpg"; - auto path = std::filesystem::current_path() / "images" / name; + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + std::string name = "uv_grid_2048x2048.jpg"; + auto path = std::filesystem::current_path() / "images" / name; - auto image = compressed::image::read(path); - auto image_data = image.get_decompressed(); - auto image_ref = test_util::read_oiio(path); + auto image = compressed::image::read(path, 0, codec); + auto image_data = image.get_decompressed(); + auto image_ref = test_util::read_oiio(path); - test_util::compare_images(image_data, image_ref, name); + test_util::compare_images(image_data, image_ref, name); + } + ); } @@ -147,22 +170,27 @@ TEST_CASE("Read compressed file exactly than one chunk") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file larger than one chunk") { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - - auto image = compressed::image::read( - path, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - auto image_data = image.get_decompressed(); - auto image_ref = test_util::read_oiio(path); - - - test_util::compare_images(image_data, image_ref, name); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + + auto image = compressed::image::read( + path, + 0, + codec, + 9, + compressed::s_default_blocksize, + 
compressed::s_default_chunksize / 2 + ); + auto image_data = image.get_decompressed(); + auto image_ref = test_util::read_oiio(path); + + + test_util::compare_images(image_data, image_ref, name); + } + ); } @@ -170,25 +198,33 @@ TEST_CASE("Read compressed file larger than one chunk") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file, subset of channel indices") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - { 0, 1, 2, 3 }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{"R", "G", "B", "A"}); - }); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + {0, 1, 2, 3}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + CHECK(image.num_channels() == 4); + CHECK(image.channelnames() == std::vector{"R", "G", "B", "A"}); + } + + ); + } + ); } @@ -196,25 +232,32 @@ TEST_CASE("Read compressed file, subset of channel indices") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file, non contiguous channel indices") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / 
name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - { 0, 2, 3, 11 }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); - }); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + {0, 2, 3, 11}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + CHECK(image.num_channels() == 4); + CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); + } + ); + } + ); } @@ -222,57 +265,66 @@ TEST_CASE("Read compressed file, non contiguous channel indices") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file, non contiguous channel indices, out of order") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - { 11, 0, 2, 3 }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - // Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should - // have the same ordering as the file - CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); - }); 
+ test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + {11, 0, 2, 3}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + // Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should + // have the same ordering as the file + CHECK(image.num_channels() == 4); + CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); + } + ); + } + ); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE( - "Read compressed file, invalid channel index" - * doctest::no_breaks(true) - * doctest::no_output(true) - * doctest::should_fail(true) + "Read compressed file, invalid channel index" + * doctest::no_breaks(true) + * doctest::no_output(true) + * doctest::should_fail(true) ) { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - // this should fail as this file does not have a 64th channel - auto image = compressed::image::read( - std::move(input_ptr), - { 0, 1, 64 }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + // this should 
fail as this file does not have a 64th channel + auto image = compressed::image::read( + std::move(input_ptr), + {0, 1, 64}, + 0, + compressed::enums::codec::lz4, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + } + + ); } @@ -280,25 +332,33 @@ TEST_CASE( // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file, subset of channel names") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - { "R", "G", "B", "A" }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{"R", "G", "B", "A"}); - }); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + {"R", "G", "B", "A"}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + CHECK(image.num_channels() == 4); + CHECK(image.channelnames() == std::vector{"R", "G", "B", "A"}); + } + + ); + } + ); } @@ -306,25 +366,32 @@ TEST_CASE("Read compressed file, subset of channel names") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file, non contiguous channel names") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = 
std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - { "R", "B", "A", "VRayCryptomatte00.R" }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); - }); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + {"R", "B", "A", "VRayCryptomatte00.R"}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + CHECK(image.num_channels() == 4); + CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); + } + ); + } + ); } @@ -332,98 +399,113 @@ TEST_CASE("Read compressed file, non contiguous channel names") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file, non contiguous channel names, out of order") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - { "VRayCryptomatte00.R", "R", "B", "A" }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - // Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should - // have the same ordering as the file - 
CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); - }); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + {"VRayCryptomatte00.R", "R", "B", "A"}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + // Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should + // have the same ordering as the file + CHECK(image.num_channels() == 4); + CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); + } + ); + } + ); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE( - "Read compressed file, invalid channel name" - * doctest::no_breaks(true) - * doctest::no_output(true) - * doctest::should_fail(true) + "Read compressed file, invalid channel name" + * doctest::no_breaks(true) + * doctest::no_output(true) + * doctest::should_fail(true) ) { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - // this should fail as this file does not have a z channel - auto image = compressed::image::read( - std::move(input_ptr), - { "R", "G", "Z" }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = 
"multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + // this should fail as this file does not have a z channel + auto image = compressed::image::read( + std::move(input_ptr), + {"R", "G", "Z"}, + 0, + compressed::enums::codec::lz4, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + } + ); } - // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file with postprocess, subset of channel names") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - []([[maybe_unused]] size_t channel_idx, std::span values) - { - for (auto& value : values) - { - value = static_cast(25); - } - }, - { "R", "G", "B", "A" }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{"R", "G", "B", "A"}); - - // Check that our postprocess worked - auto decompressed = image.get_decompressed(); - for (const auto& channel : decompressed) - { - test_util::check_vector_verbose(channel, static_cast(25)); - } - }); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + []([[maybe_unused]] size_t channel_idx, std::span 
values) + { + for (auto& value : values) + { + value = static_cast(25); + } + }, + {"R", "G", "B", "A"}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + CHECK(image.num_channels() == 4); + CHECK(image.channelnames() == std::vector{"R", "G", "B", "A"}); + + // Check that our postprocess worked + auto decompressed = image.get_decompressed(); + for (const auto& channel : decompressed) + { + test_util::check_vector_verbose(channel, static_cast(25)); + } + } + + ); + } + ); } @@ -431,39 +513,51 @@ TEST_CASE("Read compressed file with postprocess, subset of channel names") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file with postprocess, non contiguous channel names") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - []([[maybe_unused]] size_t channel_idx, std::span values) - { - for (auto& value : values) - { - value = static_cast(25); - } - }, - { "R", "B", "A", "VRayCryptomatte00.R" }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); - - // Check that our postprocess worked - auto decompressed = image.get_decompressed(); - for (const auto& channel : decompressed) - { - test_util::check_vector_verbose(channel, static_cast(25)); - } - }); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = 
OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + []([[maybe_unused]] size_t channel_idx, std::span values) + { + for (auto& value : values) + { + value = static_cast(25); + } + }, + {"R", "B", "A", "VRayCryptomatte00.R"}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + CHECK(image.num_channels() == 4); + CHECK( + image.channelnames() == std::vector{ "R", + "B", + "A", + "VRayCryptomatte00.R"} + ); + + // Check that our postprocess worked + auto decompressed = image.get_decompressed(); + for (const auto& channel : decompressed) + { + test_util::check_vector_verbose(channel, static_cast(25)); + } + } + ); + } + ); } @@ -471,131 +565,144 @@ TEST_CASE("Read compressed file with postprocess, non contiguous channel names") // ----------------------------------------------------------------------------------- TEST_CASE("Read compressed file with postprocess, non contiguous channel names, out of order") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - auto image = compressed::image::read( - std::move(input_ptr), - []([[maybe_unused]] size_t channel_idx, std::span values) - { - for (auto& value : values) - { - value = static_cast(25); - } - }, - { "VRayCryptomatte00.R", "R", "B", "A" }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - - // Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should - // have the same ordering as the file - CHECK(image.num_channels() == 4); - CHECK(image.channelnames() == std::vector{ "R", "B", "A", "VRayCryptomatte00.R"}); - - // Check that our postprocess worked - auto decompressed = image.get_decompressed(); - for (const auto& 
channel : decompressed) - { - test_util::check_vector_verbose(channel, static_cast(25)); - } - }); + test_util::parametrize_codecs( + [&](compressed::enums::codec codec) + { + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + auto image = compressed::image::read( + std::move(input_ptr), + []([[maybe_unused]] size_t channel_idx, std::span values) + { + for (auto& value : values) + { + value = static_cast(25); + } + }, + {"VRayCryptomatte00.R", "R", "B", "A"}, + 0, + codec, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + + // Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should + // have the same ordering as the file + CHECK(image.num_channels() == 4); + CHECK( + image.channelnames() == std::vector{ "R", + "B", + "A", + "VRayCryptomatte00.R"} + ); + + // Check that our postprocess worked + auto decompressed = image.get_decompressed(); + for (const auto& channel : decompressed) + { + test_util::check_vector_verbose(channel, static_cast(25)); + } + } + + ); + } + ); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE( - "Read compressed file with postprocess, invalid channel name" - * doctest::no_breaks(true) - * doctest::no_output(true) - * doctest::should_fail(true) + "Read compressed file with postprocess, invalid channel name" + * doctest::no_breaks(true) + * doctest::no_output(true) + * doctest::should_fail(true) ) { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::string name = "multilayer_2560x1440.exr"; - auto path = std::filesystem::current_path() / "images" / name; - auto input_ptr = OIIO::ImageInput::open(path.string()); - - // this should fail 
as this file does not have a z channel - auto image = compressed::image::read( - std::move(input_ptr), - []([[maybe_unused]] size_t channel_idx, std::span values) - { - for (auto& value : values) - { - value = static_cast(25); - } - }, - { "R", "G", "Z" }, - 0, - compressed::enums::codec::lz4, - 9, - compressed::s_default_blocksize, - compressed::s_default_chunksize / 2 - ); - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::string name = "multilayer_2560x1440.exr"; + auto path = std::filesystem::current_path() / "images" / name; + auto input_ptr = OIIO::ImageInput::open(path.string()); + + // this should fail as this file does not have a z channel + auto image = compressed::image::read( + std::move(input_ptr), + []([[maybe_unused]] size_t channel_idx, std::span values) + { + for (auto& value : values) + { + value = static_cast(25); + } + }, + {"R", "G", "Z"}, + 0, + compressed::enums::codec::lz4, + 9, + compressed::s_default_blocksize, + compressed::s_default_chunksize / 2 + ); + } + ); } - - // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE("Initialize image and iterate parametrized") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - auto channel_r_data = std::vector(128, static_cast(255)); - - auto image = compressed::image( - std::vector>{ channel_r_data}, - 16, - 8 - ); - - SUBCASE("Read") - { - auto& r_ref = image.channel(0); - for (auto chunk : r_ref) - { - for (auto& pixel : chunk) - { - CHECK(pixel == static_cast(255)); - } - } - } - - SUBCASE("Modify") - { - auto& r_ref = image.channel(0); - for (auto chunk : r_ref) - { - for (auto& pixel : chunk) - { - pixel = static_cast(128); - } - } - - auto& r_ref_2 = image.channel(0); - for (auto chunk_ : r_ref_2) - { - for (auto& pixel : chunk_) - { - CHECK(pixel == static_cast(128)); - } - } - } - } - ); - + test_util::parametrize( + 
[&]([[maybe_unused]] T type) + { + auto channel_r_data = std::vector(128, static_cast(255)); + + auto image = compressed::image( + std::vector>{channel_r_data}, + 16, + 8 + ); + + SUBCASE("Read") + { + auto& r_ref = image.channel(0); + for (auto chunk : r_ref) + { + for (auto& pixel : chunk) + { + CHECK(pixel == static_cast(255)); + } + } + } + + SUBCASE("Modify") + { + auto& r_ref = image.channel(0); + for (auto chunk : r_ref) + { + for (auto& pixel : chunk) + { + pixel = static_cast(128); + } + } + + auto& r_ref_2 = image.channel(0); + for (auto chunk_ : r_ref_2) + { + for (auto& pixel : chunk_) + { + CHECK(pixel == static_cast(128)); + } + } + } + } + ); } @@ -603,34 +710,35 @@ TEST_CASE("Initialize image and iterate parametrized") // ----------------------------------------------------------------------------------- TEST_CASE("Zip image channels parametrized") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - auto channel_r_data = std::vector(128, static_cast(255)); - auto channel_g_data = std::vector(128, static_cast(0)); - auto channel_b_data = std::vector(128, static_cast(199)); - - auto image = compressed::image( - std::vector>{ channel_r_data, channel_g_data, channel_b_data }, - 16, - 8 - ); - - auto [r, g, b] = image.channels(0, 1, 2); - CHECK(r == image.channel(0)); - CHECK(g == image.channel(1)); - CHECK(b == image.channel(2)); - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - CHECK(r_pixel == static_cast(255)); - CHECK(g_pixel == static_cast(0)); - CHECK(b_pixel == static_cast(199)); - } - } - } - ); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + auto channel_r_data = std::vector(128, static_cast(255)); + auto channel_g_data = std::vector(128, static_cast(0)); + auto channel_b_data = std::vector(128, static_cast(199)); + + auto image = compressed::image( + std::vector>{channel_r_data, 
channel_g_data, channel_b_data}, + 16, + 8 + ); + + auto [r, g, b] = image.channels(0, 1, 2); + CHECK(r == image.channel(0)); + CHECK(g == image.channel(1)); + CHECK(b == image.channel(2)); + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + CHECK(r_pixel == static_cast(255)); + CHECK(g_pixel == static_cast(0)); + CHECK(b_pixel == static_cast(199)); + } + } + } + ); } @@ -638,39 +746,40 @@ TEST_CASE("Zip image channels parametrized") // ----------------------------------------------------------------------------------- TEST_CASE("Zip image channels equal to chunk size parametrized") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - auto channel_r_data = std::vector(1024, static_cast(255)); - auto channel_g_data = std::vector(1024, static_cast(0)); - auto channel_b_data = std::vector(1024, static_cast(199)); - - auto image = compressed::image( - std::vector>{ channel_r_data, channel_g_data, channel_b_data }, - 64, - 16, - {}, - compressed::enums::codec::lz4, - 9, - 256, - 1024 - ); - - auto [r, g, b] = image.channels(0, 1, 2); - CHECK(r == image.channel(0)); - CHECK(g == image.channel(1)); - CHECK(b == image.channel(2)); - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - CHECK(r_pixel == static_cast(255)); - CHECK(g_pixel == static_cast(0)); - CHECK(b_pixel == static_cast(199)); - } - } - } - ); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + auto channel_r_data = std::vector(1024, static_cast(255)); + auto channel_g_data = std::vector(1024, static_cast(0)); + auto channel_b_data = std::vector(1024, static_cast(199)); + + auto image = compressed::image( + std::vector>{channel_r_data, channel_g_data, channel_b_data}, + 64, + 16, + {}, + compressed::enums::codec::lz4, + 9, + 256, + 
1024 + ); + + auto [r, g, b] = image.channels(0, 1, 2); + CHECK(r == image.channel(0)); + CHECK(g == image.channel(1)); + CHECK(b == image.channel(2)); + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + CHECK(r_pixel == static_cast(255)); + CHECK(g_pixel == static_cast(0)); + CHECK(b_pixel == static_cast(199)); + } + } + } + ); } @@ -678,39 +787,40 @@ TEST_CASE("Zip image channels equal to chunk size parametrized") // ----------------------------------------------------------------------------------- TEST_CASE("Zip image channels larger to chunk size parametrized") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - auto channel_r_data = std::vector(1024, static_cast(255)); - auto channel_g_data = std::vector(1024, static_cast(0)); - auto channel_b_data = std::vector(1024, static_cast(199)); - - auto image = compressed::image( - std::vector>{ channel_r_data, channel_g_data, channel_b_data }, - 64, - 16, - {}, - compressed::enums::codec::lz4, - 9, - 256, - 768 - ); - - auto [r, g, b] = image.channels(0, 1, 2); - CHECK(r == image.channel(0)); - CHECK(g == image.channel(1)); - CHECK(b == image.channel(2)); - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - CHECK(r_pixel == static_cast(255)); - CHECK(g_pixel == static_cast(0)); - CHECK(b_pixel == static_cast(199)); - } - } - } - ); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + auto channel_r_data = std::vector(1024, static_cast(255)); + auto channel_g_data = std::vector(1024, static_cast(0)); + auto channel_b_data = std::vector(1024, static_cast(199)); + + auto image = compressed::image( + std::vector>{channel_r_data, channel_g_data, channel_b_data}, + 64, + 16, + {}, + compressed::enums::codec::lz4, + 9, + 256, + 768 + ); + + auto [r, 
g, b] = image.channels(0, 1, 2); + CHECK(r == image.channel(0)); + CHECK(g == image.channel(1)); + CHECK(b == image.channel(2)); + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + CHECK(r_pixel == static_cast(255)); + CHECK(g_pixel == static_cast(0)); + CHECK(b_pixel == static_cast(199)); + } + } + } + ); } @@ -718,44 +828,45 @@ TEST_CASE("Zip image channels larger to chunk size parametrized") // ----------------------------------------------------------------------------------- TEST_CASE("Zip modify image channels parametrized") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - auto channel_r_data = std::vector(128, static_cast(255)); - auto channel_g_data = std::vector(128, static_cast(0)); - auto channel_b_data = std::vector(128, static_cast(199)); - - auto image = compressed::image( - std::vector>{ channel_r_data, channel_g_data, channel_b_data }, - 16, - 8 - ); - - auto [r, g, b] = image.channels(0, 1, 2); - CHECK(r == image.channel(0)); - CHECK(g == image.channel(1)); - CHECK(b == image.channel(2)); - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - r_pixel = static_cast(12); - g_pixel = static_cast(13); - b_pixel = static_cast(14); - } - } - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - CHECK(r_pixel == static_cast(12)); - CHECK(g_pixel == static_cast(13)); - CHECK(b_pixel == static_cast(14)); - } - } - } - ); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + auto channel_r_data = std::vector(128, static_cast(255)); + auto channel_g_data = std::vector(128, static_cast(0)); + auto channel_b_data = std::vector(128, static_cast(199)); + + auto 
image = compressed::image( + std::vector>{channel_r_data, channel_g_data, channel_b_data}, + 16, + 8 + ); + + auto [r, g, b] = image.channels(0, 1, 2); + CHECK(r == image.channel(0)); + CHECK(g == image.channel(1)); + CHECK(b == image.channel(2)); + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + r_pixel = static_cast(12); + g_pixel = static_cast(13); + b_pixel = static_cast(14); + } + } + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + CHECK(r_pixel == static_cast(12)); + CHECK(g_pixel == static_cast(13)); + CHECK(b_pixel == static_cast(14)); + } + } + } + ); } @@ -763,49 +874,50 @@ TEST_CASE("Zip modify image channels parametrized") // ----------------------------------------------------------------------------------- TEST_CASE("Zip modify image channels equal to chunk size parametrized") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - auto channel_r_data = std::vector(1024, static_cast(255)); - auto channel_g_data = std::vector(1024, static_cast(0)); - auto channel_b_data = std::vector(1024, static_cast(199)); - - auto image = compressed::image( - std::vector>{ channel_r_data, channel_g_data, channel_b_data }, - 64, - 16, - {}, - compressed::enums::codec::lz4, - 9, - 256, - 1024 - ); - - auto [r, g, b] = image.channels(0, 1, 2); - CHECK(r == image.channel(0)); - CHECK(g == image.channel(1)); - CHECK(b == image.channel(2)); - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - r_pixel = static_cast(12); - g_pixel = static_cast(13); - b_pixel = static_cast(14); - } - } - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto 
[r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - CHECK(r_pixel == static_cast(12)); - CHECK(g_pixel == static_cast(13)); - CHECK(b_pixel == static_cast(14)); - } - } - } - ); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + auto channel_r_data = std::vector(1024, static_cast(255)); + auto channel_g_data = std::vector(1024, static_cast(0)); + auto channel_b_data = std::vector(1024, static_cast(199)); + + auto image = compressed::image( + std::vector>{channel_r_data, channel_g_data, channel_b_data}, + 64, + 16, + {}, + compressed::enums::codec::lz4, + 9, + 256, + 1024 + ); + + auto [r, g, b] = image.channels(0, 1, 2); + CHECK(r == image.channel(0)); + CHECK(g == image.channel(1)); + CHECK(b == image.channel(2)); + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + r_pixel = static_cast(12); + g_pixel = static_cast(13); + b_pixel = static_cast(14); + } + } + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + CHECK(r_pixel == static_cast(12)); + CHECK(g_pixel == static_cast(13)); + CHECK(b_pixel == static_cast(14)); + } + } + } + ); } @@ -813,47 +925,48 @@ TEST_CASE("Zip modify image channels equal to chunk size parametrized") // ----------------------------------------------------------------------------------- TEST_CASE("Zip modify image channels larger to chunk size parametrized") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - auto channel_r_data = std::vector(1024, static_cast(255)); - auto channel_g_data = std::vector(1024, static_cast(0)); - auto channel_b_data = std::vector(1024, static_cast(199)); - - auto image = compressed::image( - std::vector>{ channel_r_data, channel_g_data, channel_b_data }, - 64, - 16, - {}, - 
compressed::enums::codec::lz4, - 9, - 256, - 768 - ); - - auto [r, g, b] = image.channels(0, 1, 2); - CHECK(r == image.channel(0)); - CHECK(g == image.channel(1)); - CHECK(b == image.channel(2)); - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - r_pixel = static_cast(12); - g_pixel = static_cast(13); - b_pixel = static_cast(14); - } - } - - for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) - { - for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) - { - CHECK(r_pixel == static_cast(12)); - CHECK(g_pixel == static_cast(13)); - CHECK(b_pixel == static_cast(14)); - } - } - } - ); -} \ No newline at end of file + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + auto channel_r_data = std::vector(1024, static_cast(255)); + auto channel_g_data = std::vector(1024, static_cast(0)); + auto channel_b_data = std::vector(1024, static_cast(199)); + + auto image = compressed::image( + std::vector>{channel_r_data, channel_g_data, channel_b_data}, + 64, + 16, + {}, + compressed::enums::codec::lz4, + 9, + 256, + 768 + ); + + auto [r, g, b] = image.channels(0, 1, 2); + CHECK(r == image.channel(0)); + CHECK(g == image.channel(1)); + CHECK(b == image.channel(2)); + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + r_pixel = static_cast(12); + g_pixel = static_cast(13); + b_pixel = static_cast(14); + } + } + + for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) + { + for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk)) + { + CHECK(r_pixel == static_cast(12)); + CHECK(g_pixel == static_cast(13)); + CHECK(b_pixel == static_cast(14)); + } + } + } + ); +} diff --git a/test/src/test_iterator.cpp 
b/test/src/test_iterator.cpp deleted file mode 100644 index 854a48d..0000000 --- a/test/src/test_iterator.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#include "doctest.h" - -#include -#include -#include -#include -#include - -#include -#include - -#include "util.h" - - -TEST_CASE("Iterator: serial access") -{ - std::string name = "uv_grid_2048x2048.jpg"; - auto path = std::filesystem::current_path() / "images" / name; - auto image = compressed::image::read(path); - - auto& r = image.channel(0); - size_t count = 0; - for (const auto& chunk : r) - { - CHECK(chunk.chunk_index() == count); - ++count; - } -} - - -TEST_CASE("Iterator: iterate out of bounds" - * doctest::no_breaks(true) - * doctest::no_output(true) - * doctest::should_fail(true)) -{ - std::string name = "uv_grid_2048x2048.jpg"; - auto path = std::filesystem::current_path() / "images" / name; - auto image = compressed::image::read(path); - - auto& r = image.channel(0); - auto it = r.begin(); - ++it; - ++it; -} - - - -TEST_CASE("Iterator: comparison") -{ - std::string name = "uv_grid_2048x2048.jpg"; - auto path = std::filesystem::current_path() / "images" / name; - auto image = compressed::image::read( - path, - 0, - compressed::enums::codec::lz4, - 9, - 4096, - 16384 - ); - - auto& r = image.channel(0); - auto it = r.begin(); - auto it_2 = r.begin(); - - CHECK(it == it_2); - ++it; - CHECK(it != it_2); - - // Different image, iterator should not match - auto image_2 = compressed::image::read( - path, - 0, - compressed::enums::codec::lz4, - 9, - 4096, - 16384 - ); - auto& r_2 = image_2.channel(0); - auto it_other = r_2.begin(); - - CHECK(it_other != it); - CHECK(it_other != it_2); -} diff --git a/test/src/test_schunk.cpp b/test/src/test_schunk.cpp index 6384d5d..3929226 100644 --- a/test/src/test_schunk.cpp +++ b/test/src/test_schunk.cpp @@ -5,32 +5,38 @@ #include #include #include -#include #include +#define _COMPRESSED_PROFILE 1 #include #include #include "util.h" +#include "compressed/channel.h" // 
----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE("Schunk: initialize with chunk size") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - compressed::blosc2::schunk super_chunk(128, 4096); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + compressed::detail::schunk super_chunk(128, 4096); - auto ctx = compressed::blosc2::create_decompression_context(std::thread::hardware_concurrency()); + auto ctx = compressed::blosc2::create_decompression_context(std::thread::hardware_concurrency()); - // this schunk is empty so we expect no items - auto decompressed = super_chunk.to_uncompressed(ctx); - CHECK(decompressed.size() == 0); + auto compression_ctx = compressed::cpu_compression_context{ + .compression_ctx = nullptr, + .decompression_ctx = std::move(ctx), + .nthreads = std::thread::hardware_concurrency() + }; - // similarly converting to schunk should work, but be empty - auto raw_schunk = super_chunk.to_schunk(); - }); + // this schunk is empty so we expect no items + auto decompressed = super_chunk.to_uncompressed(compression_ctx); + CHECK(decompressed.size() == 0); + } + ); } @@ -38,38 +44,42 @@ TEST_CASE("Schunk: initialize with chunk size") // ----------------------------------------------------------------------------------- TEST_CASE("Schunk: initialize with data") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::vector data(4096); - std::iota(data.begin(), data.end(), T{ 0 }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::vector data(4096); + std::iota(data.begin(), data.end(), T{0}); - auto ctx = compressed::blosc2::create_compression_context( - std::thread::hardware_concurrency(), - compressed::enums::codec::lz4, - 9, - 128 - ); - compressed::blosc2::schunk super_chunk(std::span(data), 64, 256, ctx); + auto ctx = compressed::channel::create_compression_context( + 
compressed::enums::codec::lz4, + std::thread::hardware_concurrency(), + 9, + 128, + 0 + ); + compressed::detail::schunk super_chunk(std::span(data), 64, 256, std::move(ctx)); - auto decomp_ctx = compressed::blosc2::create_decompression_context(std::thread::hardware_concurrency()); - SUBCASE("Check decompressed") - { - // We expect the same number of elements - auto decompressed = super_chunk.to_uncompressed(decomp_ctx); - CHECK(decompressed.size() == 4096); - CHECK(decompressed == data); - } - SUBCASE("Check blosc2 schunk result") - { - // we also expect the right result converting to schunk - auto raw_schunk = super_chunk.to_schunk(); - CHECK(raw_schunk->nchunks == 4096 * sizeof(T) / 256); - CHECK(raw_schunk->nbytes / sizeof(T) == 4096); - } - SUBCASE("Get chunk") - { - auto chunk = super_chunk.chunk(decomp_ctx, 0); - CHECK(chunk.size() == 256 / sizeof(T)); - } - }); -} \ No newline at end of file + auto decomp_ctx = compressed::channel::create_compression_context( + compressed::enums::codec::lz4, + std::thread::hardware_concurrency(), + 9, + 128, + 0 + ); + SUBCASE("Check decompressed") + { + // We expect the same number of elements + auto decompressed = super_chunk.to_uncompressed( + std::get(decomp_ctx) + ); + CHECK(decompressed.size() == 4096); + CHECK(decompressed == data); + } + SUBCASE("Get chunk") + { + auto chunk = super_chunk.chunk(std::get(decomp_ctx), size_t{0}); + CHECK(chunk.size() == 256 / sizeof(T)); + } + } + ); +} diff --git a/test/src/test_zip.cpp b/test/src/test_zip.cpp index 1083711..fc25d1f 100644 --- a/test/src/test_zip.cpp +++ b/test/src/test_zip.cpp @@ -1,11 +1,8 @@ #include "doctest.h" -#include -#include #include #include -#include - +#define _COMPRESSED_PROFILE 1 #include #include "util.h" @@ -15,63 +12,78 @@ // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip sequenced loops") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::vector data_a(25, 
static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(25, static_cast(75)); - - auto gen = compressed::ranges::zip(data_a, data_b, data_c); - std::for_each(std::execution::seq, gen.begin(), gen.end(), [](auto vals) - { - auto& [a, b, c] = vals; - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - }); - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::vector data_a(25, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector data_c(25, static_cast(75)); + + auto gen = compressed::ranges::zip(data_a, data_b, data_c); + std::for_each( + std::execution::seq, + gen.begin(), + gen.end(), + [](auto vals) + { + auto& [a, b, c] = vals; + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c == static_cast(75)); + } + ); + } + ); } // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip parallel loops") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::vector data_a(25, static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(25, static_cast(75)); - - auto gen = compressed::ranges::zip(data_a, data_b, data_c); - std::for_each(std::execution::par_unseq, gen.begin(), gen.end(), [](auto vals) - { - auto& [a, b, c] = vals; - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - }); - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::vector data_a(25, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector data_c(25, static_cast(75)); + + auto gen = compressed::ranges::zip(data_a, data_b, data_c); + std::for_each( + std::execution::par_unseq, + gen.begin(), + gen.end(), + [](auto vals) + { + auto& [a, b, c] = vals; + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c 
== static_cast(75)); + } + ); + } + ); } - // ----------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip regular for loop") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - std::vector data_a(25, static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(25, static_cast(75)); - - for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c)) - { - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - } - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + std::vector data_a(25, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector data_c(25, static_cast(75)); + + for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c)) + { + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c == static_cast(75)); + } + } + ); } @@ -79,23 +91,30 @@ TEST_CASE("compressed::ranges::zip regular for loop") // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip serial mismatched sizes") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - // We expect to only iterate up until index 25 here - std::vector data_a(30, static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(45, static_cast(75)); - - auto gen = compressed::ranges::zip(data_a, data_b, data_c); - CHECK(gen.size() == 25); - std::for_each(std::execution::seq, gen.begin(), gen.end(), [](auto vals) - { - auto& [a, b, c] = vals; - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - }); - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + // We expect to only iterate up until index 25 here + std::vector data_a(30, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector 
data_c(45, static_cast(75)); + + auto gen = compressed::ranges::zip(data_a, data_b, data_c); + CHECK(gen.size() == 25); + std::for_each( + std::execution::seq, + gen.begin(), + gen.end(), + [](auto vals) + { + auto& [a, b, c] = vals; + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c == static_cast(75)); + } + ); + } + ); } @@ -103,23 +122,30 @@ TEST_CASE("compressed::ranges::zip serial mismatched sizes") // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip parallel mismatched sizes") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - // We expect to only iterate up until index 25 here - std::vector data_a(30, static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(45, static_cast(75)); - - auto gen = compressed::ranges::zip(data_a, data_b, data_c); - CHECK(gen.size() == 25); - std::for_each(std::execution::par_unseq, gen.begin(), gen.end(), [](auto vals) - { - auto& [a, b, c] = vals; - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - }); - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + // We expect to only iterate up until index 25 here + std::vector data_a(30, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector data_c(45, static_cast(75)); + + auto gen = compressed::ranges::zip(data_a, data_b, data_c); + CHECK(gen.size() == 25); + std::for_each( + std::execution::par_unseq, + gen.begin(), + gen.end(), + [](auto vals) + { + auto& [a, b, c] = vals; + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c == static_cast(75)); + } + ); + } + ); } @@ -127,23 +153,25 @@ TEST_CASE("compressed::ranges::zip parallel mismatched sizes") // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip regular for loop mismatched sizes") { - test_util::parametrize([&]([[maybe_unused]] T type) - 
{ - // We expect to only iterate up until index 25 here - std::vector data_a(30, static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(45, static_cast(75)); - - size_t count = 0; - for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c)) - { - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - ++count; - } - CHECK(count == 25); - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + // We expect to only iterate up until index 25 here + std::vector data_a(30, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector data_c(45, static_cast(75)); + + size_t count = 0; + for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c)) + { + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c == static_cast(75)); + ++count; + } + CHECK(count == 25); + } + ); } @@ -151,36 +179,48 @@ TEST_CASE("compressed::ranges::zip regular for loop mismatched sizes") // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip parallel mismatched sizes modify") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - // We expect to only iterate up until index 25 here - std::vector data_a(30, static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(45, static_cast(75)); - - auto gen = compressed::ranges::zip(data_a, data_b, data_c); - CHECK(gen.size() == 25); - std::for_each(std::execution::par_unseq, gen.begin(), gen.end(), [](auto vals) - { - auto& [a, b, c] = vals; - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - - a = 75; - b = 49; - c = 25; - }); - - auto gen_2 = compressed::ranges::zip(data_a, data_b, data_c); - std::for_each(std::execution::par_unseq, gen_2.begin(), gen_2.end(), [](auto vals) - { - auto& [a, b, c] = vals; - CHECK(a == static_cast(75)); - CHECK(b == static_cast(49)); - CHECK(c == 
static_cast(25)); - }); - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + // We expect to only iterate up until index 25 here + std::vector data_a(30, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector data_c(45, static_cast(75)); + + auto gen = compressed::ranges::zip(data_a, data_b, data_c); + CHECK(gen.size() == 25); + std::for_each( + std::execution::par_unseq, + gen.begin(), + gen.end(), + [](auto vals) + { + auto& [a, b, c] = vals; + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c == static_cast(75)); + + a = 75; + b = 49; + c = 25; + } + ); + + auto gen_2 = compressed::ranges::zip(data_a, data_b, data_c); + std::for_each( + std::execution::par_unseq, + gen_2.begin(), + gen_2.end(), + [](auto vals) + { + auto& [a, b, c] = vals; + CHECK(a == static_cast(75)); + CHECK(b == static_cast(49)); + CHECK(c == static_cast(25)); + } + ); + } + ); } @@ -188,36 +228,48 @@ TEST_CASE("compressed::ranges::zip parallel mismatched sizes modify") // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip serial mismatched sizes modify") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - // We expect to only iterate up until index 25 here - std::vector data_a(30, static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(45, static_cast(75)); - - auto gen = compressed::ranges::zip(data_a, data_b, data_c); - CHECK(gen.size() == 25); - std::for_each(std::execution::seq, gen.begin(), gen.end(), [](auto vals) - { - auto& [a, b, c] = vals; - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - - a = 75; - b = 49; - c = 25; - }); - - auto gen_2 = compressed::ranges::zip(data_a, data_b, data_c); - std::for_each(std::execution::seq, gen_2.begin(), gen_2.end(), [](auto vals) - { - auto& [a, b, c] = vals; - CHECK(a == static_cast(75)); - CHECK(b == static_cast(49)); - CHECK(c == 
static_cast(25)); - }); - }); + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + // We expect to only iterate up until index 25 here + std::vector data_a(30, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector data_c(45, static_cast(75)); + + auto gen = compressed::ranges::zip(data_a, data_b, data_c); + CHECK(gen.size() == 25); + std::for_each( + std::execution::seq, + gen.begin(), + gen.end(), + [](auto vals) + { + auto& [a, b, c] = vals; + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c == static_cast(75)); + + a = 75; + b = 49; + c = 25; + } + ); + + auto gen_2 = compressed::ranges::zip(data_a, data_b, data_c); + std::for_each( + std::execution::seq, + gen_2.begin(), + gen_2.end(), + [](auto vals) + { + auto& [a, b, c] = vals; + CHECK(a == static_cast(75)); + CHECK(b == static_cast(49)); + CHECK(c == static_cast(25)); + } + ); + } + ); } @@ -225,47 +277,49 @@ TEST_CASE("compressed::ranges::zip serial mismatched sizes modify") // ----------------------------------------------------------------------------------- TEST_CASE("compressed::ranges::zip regular for loop mismatched sizes modify") { - test_util::parametrize([&]([[maybe_unused]] T type) - { - // We expect to only iterate up until index 25 here - std::vector data_a(30, static_cast(25)); - std::vector data_b(25, static_cast(50)); - std::vector data_c(45, static_cast(75)); - - size_t count = 0; - for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c)) - { - CHECK(a == static_cast(25)); - CHECK(b == static_cast(50)); - CHECK(c == static_cast(75)); - - a = static_cast(75); - b = static_cast(49); - c = static_cast(25); - ++count; - } - CHECK(count == 25); - - for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c)) - { - CHECK(a == static_cast(75)); - CHECK(b == static_cast(49)); - CHECK(c == static_cast(25)); - } - - // The zip should have only touched the first 25 elements with the rest being the same - size_t count_2 = 0; - 
for (const auto& elem : data_a) - { - if (count_2 < 25) - { - CHECK(elem == 75); - } - else - { - CHECK(elem == 25); - } - ++count_2; - } - }); -} \ No newline at end of file + test_util::parametrize( + [&]([[maybe_unused]] T type) + { + // We expect to only iterate up until index 25 here + std::vector data_a(30, static_cast(25)); + std::vector data_b(25, static_cast(50)); + std::vector data_c(45, static_cast(75)); + + size_t count = 0; + for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c)) + { + CHECK(a == static_cast(25)); + CHECK(b == static_cast(50)); + CHECK(c == static_cast(75)); + + a = static_cast(75); + b = static_cast(49); + c = static_cast(25); + ++count; + } + CHECK(count == 25); + + for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c)) + { + CHECK(a == static_cast(75)); + CHECK(b == static_cast(49)); + CHECK(c == static_cast(25)); + } + + // The zip should have only touched the first 25 elements with the rest being the same + size_t count_2 = 0; + for (const auto& elem : data_a) + { + if (count_2 < 25) + { + CHECK(elem == 75); + } + else + { + CHECK(elem == 25); + } + ++count_2; + } + } + ); +} diff --git a/test/src/util.h b/test/src/util.h index 2b34ec7..4331b0e 100644 --- a/test/src/util.h +++ b/test/src/util.h @@ -1,158 +1,234 @@ #pragma once -#include #include #include #include #include #include -#include #include #include +#include +// Explicit stringification support for Imath::half in doctest assertions +namespace doctest +{ + template <> + struct StringMaker + { + static String convert(const Imath_3_1::half& value) + { + // Safely cast to float, which doctest already knows how to print perfectly + return toString(static_cast(value)); + } + }; +} namespace test_util { - - /// Read the image using OpenImageIO (OIIO) and deinterleave all the channels into discrete buffers. 
- /// - /// This function opens an image file using OIIO, reads its pixel data into a single buffer, - /// and then separates the interleaved channel data into individual channel buffers. - /// - /// \tparam T The pixel data type (e.g., uint8_t, float). - /// \param filepath The file path to the image. - /// \return A vector of vectors, where each inner vector represents a deinterleaved channel. - /// \throws std::runtime_error if the image fails to open or read. - template - std::vector> read_oiio(std::filesystem::path filepath, int subimage = 0) - { - auto input_ptr = OIIO::ImageInput::open(filepath.string()); - if (!input_ptr) - { - throw std::runtime_error(std::format("Failed to open image {}", filepath.string())); - } - auto res = input_ptr->seek_subimage(subimage, 0); - if (!res) - { - throw std::runtime_error(std::format("Image {} does not contain subimage {}", filepath.string(), subimage)); - } - const OIIO::ImageSpec& spec = input_ptr->spec(); - std::vector pixels(static_cast(spec.width) * spec.height * spec.nchannels); - std::vector> channels; - for ([[maybe_unused]] auto _ : std::views::iota(0, spec.nchannels)) - { - channels.push_back(std::vector(static_cast(spec.width) * spec.height)); - } - - auto typedesc = compressed::enums::get_type_desc(); - auto ok = input_ptr->read_image(subimage, 0, 0, spec.nchannels, typedesc, static_cast(pixels.data())); - if (!ok) - { - throw std::runtime_error(std::format("Image {} failed to read because: {}", filepath.string(), input_ptr->geterror())); - } - compressed::image_algo::deinterleave(std::span(pixels), channels); - return channels; - } - - - /// Compare two nested vectors (representing two multi-channel images), ensuring their contents are equal. - /// - /// This function checks if two images (stored as `std::vector>`) have the same number of channels, - /// that each channel contains the same number of elements, and that all pixel values match. - /// If any discrepancy is found, a detailed exception is thrown. 
- /// - /// \tparam T The pixel data type (e.g., uint8_t, float). - /// \param a The first image to compare. - /// \param b The second image to compare. - /// \param name A label for the images, used in error messages. - /// \throws std::runtime_error if the images differ in structure or content. - template - void compare_images(std::vector> a, std::vector> b, std::string name) - { - if (a.size() != b.size()) - { - throw std::runtime_error(std::format("{}: Error while comparing images, mismatch in number of channels {} : {}", name, a.size(), b.size())); - } - - for (auto channel_idx : std::views::iota(static_cast(0), a.size())) - { - if (a[channel_idx].size() != b[channel_idx].size()) - { - throw std::runtime_error( - std::format("{}: Error while comparing images, mismatch in number of items while comparing channel {} a: {:L} b: {:L}", - name, - channel_idx, - a[channel_idx].size(), - b[channel_idx].size()) - ); - } - - for (auto i : std::views::iota(static_cast(0), a[channel_idx].size())) - { - if (a[channel_idx][i] != b[channel_idx][i]) - { - throw std::runtime_error( - std::format("{}: Error while comparing images, mismatch at element {} in channel {}. a: {}, b: {}", - name, - i, - channel_idx, - a[channel_idx][i], - b[channel_idx][i]) - ); - } - } - } - } - - - /// Parametrize the given test lambda for the given types. 
- template - void parametrize(Lambda&& lambda) - { - (lambda(Types{}), ...); - } - - namespace detail - { - template - concept ContainerPair = requires(Container1 x, Container2 y) { - { x.size() } -> std::convertible_to; - { y.size() } -> std::convertible_to; - { x[0] } -> std::same_as; - }; - - template - concept ContainerAndValue = requires(Container x, T val) { - { x.size() } -> std::convertible_to; - { x[0] != val } -> std::convertible_to; - }; - - } // detail - - - template - requires detail::ContainerPair - void check_vector_verbose(const Container1& x, const Container2& y) - { - REQUIRE(x.size() == y.size()); - for (size_t i = 0; i < x.size(); ++i) { - if (x[i] != y[i]) { - REQUIRE_MESSAGE(x[i] == y[i], "Failed vector index: " << i); - } - } - } - - template - requires detail::ContainerAndValue - void check_vector_verbose(const Container& x, T y) - { - for (size_t i = 0; i < x.size(); ++i) { - if (x[i] != y) { - REQUIRE_MESSAGE(x[i] == y, "Failed vector index: " << i); - } - } - } - -} \ No newline at end of file + /// Read the image using OpenImageIO (OIIO) and deinterleave all the channels into discrete buffers. + /// + /// This function opens an image file using OIIO, reads its pixel data into a single buffer, + /// and then separates the interleaved channel data into individual channel buffers. + /// + /// \tparam T The pixel data type (e.g., uint8_t, float). + /// \param filepath The file path to the image. + /// \return A vector of vectors, where each inner vector represents a deinterleaved channel. + /// \throws std::runtime_error if the image fails to open or read. 
+ template + std::vector> read_oiio(std::filesystem::path filepath, int subimage = 0) + { + auto input_ptr = OIIO::ImageInput::open(filepath.string()); + if (!input_ptr) + { + throw std::runtime_error(std::format("Failed to open image {}", filepath.string())); + } + auto res = input_ptr->seek_subimage(subimage, 0); + if (!res) + { + throw std::runtime_error(std::format("Image {} does not contain subimage {}", filepath.string(), subimage)); + } + const OIIO::ImageSpec& spec = input_ptr->spec(); + std::vector pixels(static_cast(spec.width) * spec.height * spec.nchannels); + std::vector> channels; + for ([[maybe_unused]] auto _ : std::views::iota(0, spec.nchannels)) + { + channels.push_back(std::vector(static_cast(spec.width) * spec.height)); + } + + auto typedesc = compressed::enums::get_type_desc(); + auto ok = input_ptr->read_image(subimage, 0, 0, spec.nchannels, typedesc, static_cast(pixels.data())); + if (!ok) + { + throw std::runtime_error( + std::format("Image {} failed to read because: {}", filepath.string(), input_ptr->geterror()) + ); + } + compressed::image_algo::deinterleave(std::span(pixels), channels); + return channels; + } + + + /// Compare two nested vectors (representing two multi-channel images), ensuring their contents are equal. + /// + /// This function checks if two images (stored as `std::vector>`) have the same number of channels, + /// that each channel contains the same number of elements, and that all pixel values match. + /// If any discrepancy is found, a detailed exception is thrown. + /// + /// \tparam T The pixel data type (e.g., uint8_t, float). + /// \param a The first image to compare. + /// \param b The second image to compare. + /// \param name A label for the images, used in error messages. + /// \throws std::runtime_error if the images differ in structure or content. 
+ template + void compare_images(std::vector> a, std::vector> b, std::string name) + { + if (a.size() != b.size()) + { + throw std::runtime_error( + std::format( + "{}: Error while comparing images, mismatch in number of channels {} : {}", + name, + a.size(), + b.size() + ) + ); + } + + for (auto channel_idx : std::views::iota(static_cast(0), a.size())) + { + if (a[channel_idx].size() != b[channel_idx].size()) + { + throw std::runtime_error( + std::format( + "{}: Error while comparing images, mismatch in number of items while comparing channel {} a: {:L} b: {:L}", + name, + channel_idx, + a[channel_idx].size(), + b[channel_idx].size() + ) + ); + } + + for (auto i : std::views::iota(static_cast(0), a[channel_idx].size())) + { + if (a[channel_idx][i] != b[channel_idx][i]) + { + throw std::runtime_error( + std::format( + "{}: Error while comparing images, mismatch at element {} in channel {}. a: {}, b: {}", + name, + i, + channel_idx, + a[channel_idx][i], + b[channel_idx][i] + ) + ); + } + } + } + } + + + /// Parametrize the given test lambda for the given types. + template + void parametrize(Lambda&& lambda) + { + ([&]() + { + if constexpr (std::is_same_v) + { + SUBCASE("") + lambda(T{}); + } + else + { + const std::string name = std::format("<{}>", typeid(T).name()); + SUBCASE(name.c_str()) + lambda(T{}); + } + }.template operator()(), ...); + } + + template + void parametrize_codecs(Lambda&& lambda) + { + // 1. Define all possible variants from your enums::codec + constexpr std::array all_codecs = { + compressed::enums::codec::blosclz, + compressed::enums::codec::lz4, + compressed::enums::codec::lz4hc, + compressed::enums::codec::zstd, + compressed::enums::codec::lz4_gpu, + compressed::enums::codec::snappy_gpu, + compressed::enums::codec::zstd_gpu, + compressed::enums::codec::deflate_gpu, + compressed::enums::codec::gdeflate_gpu, + compressed::enums::codec::cascaded_gpu + }; + + for (const auto codec : all_codecs) + { + // 3. 
Skip GPU codecs dynamically if CUDA is not available + if (compressed::enums::is_gpu_codec(codec) && !compressed::cuda::is_available()) + { + continue; + } + + // 4. Capture the string in the local loop scope so it outlives the SUBCASE macro evaluation + std::string codec_name = std::string(compressed::enums::to_string(codec)); + + SUBCASE(codec_name.c_str()) + { + lambda(codec); + } + } + } + + namespace detail + { + template + concept ContainerPair = requires(Container1 x, Container2 y) + { + { x.size() } -> std::convertible_to; + { y.size() } -> std::convertible_to; + { x[0] } -> std::same_as; + }; + + template + concept ContainerAndValue = requires(Container x, T val) + { + { x.size() } -> std::convertible_to; + { x[0] != val } -> std::convertible_to; + }; + } // detail + + + template + requires detail::ContainerPair + void check_vector_verbose(const Container1& x, const Container2& y) + { + REQUIRE(x.size() == y.size()); + for (size_t i = 0; i < x.size(); ++i) + { + if (x[i] != y[i]) + { + REQUIRE_MESSAGE(x[i] == y[i], "Failed vector index: " << i); + } + } + } + + template + requires detail::ContainerAndValue + void check_vector_verbose(const Container& x, T y) + { + for (size_t i = 0; i < x.size(); ++i) + { + if (x[i] != y) + { + REQUIRE_MESSAGE(x[i] == y, "Failed vector index: " << i); + } + } + } +} diff --git a/thirdparty/spdlog b/thirdparty/spdlog new file mode 160000 index 0000000..79524dd --- /dev/null +++ b/thirdparty/spdlog @@ -0,0 +1 @@ +Subproject commit 79524ddd08a4ec981b7fea76afd08ee05f83755d diff --git a/thirdparty/vcpkg b/thirdparty/vcpkg index 5a2324f..120deac 160000 --- a/thirdparty/vcpkg +++ b/thirdparty/vcpkg @@ -1 +1 @@ -Subproject commit 5a2324f6667233aeb903d3117f6fd259a2be6f8b +Subproject commit 120deac3062162151622ca4860575a33844ba10b diff --git a/vcpkg.json b/vcpkg.json index 0f1e71b..d4b713c 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -6,7 +6,7 @@ "overrides": [ { "name": "openimageio", - "version": "2.5.16.0" + "version": "3.0.9.1" } 
] } \ No newline at end of file