diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..0388516
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,90 @@
+---
+Language: Cpp
+BasedOnStyle: Microsoft
+Standard: Latest
+
+IndentWidth: 4
+TabWidth: 4
+UseTab: Never
+ColumnLimit: 120
+
+BreakBeforeBraces: Allman
+BraceWrapping:
+  AfterCaseLabel: true
+  AfterClass: true
+  AfterControlStatement: MultiLine
+  AfterEnum: true
+  AfterFunction: true
+  AfterNamespace: true
+  AfterObjCDeclaration: true
+  AfterStruct: true
+  AfterUnion: true
+  AfterExternBlock: true
+  BeforeCatch: true
+  BeforeElse: true
+  BeforeLambdaBody: true
+  BeforeWhile: true
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+
+NamespaceIndentation: All
+FixNamespaceComments: true
+CompactNamespaces: false
+
+AccessModifierOffset: -4
+IndentAccessModifiers: false
+
+PointerAlignment: Left
+ReferenceAlignment: Left
+DerivePointerAlignment: false
+
+AlignAfterOpenBracket: AlwaysBreak
+BinPackArguments: false
+BinPackParameters: false
+AllowAllArgumentsOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+PenaltyBreakBeforeFirstCallParameter: 0
+
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+IndentWrappedFunctionNames: false
+
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortBlocksOnASingleLine: Never
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortEnumsOnASingleLine: false
+
+AlwaysBreakTemplateDeclarations: Yes
+BreakConstructorInitializers: BeforeComma
+PackConstructorInitializers: Never
+
+Cpp11BracedListStyle: true
+SpaceBeforeCpp11BracedList: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceInEmptyParentheses: false
+SpaceBeforeParens: ControlStatements
+SpaceAfterTemplateKeyword: false
+
+SortIncludes: CaseSensitive
+IncludeBlocks: Regroup
+IncludeCategories:
+  - Regex: '^<.*>$'
+    Priority: 1
+  - Regex: '^"(blosc2|nlohmann|nvcomp|cuda|cuda_runtime).*'
+    Priority: 2
+  - Regex: '^".*"$'
+    Priority: 3
+
+ReflowComments: true
+AlignTrailingComments: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MaxEmptyLinesToKeep: 2
+
+LineEnding: DeriveLF
+InsertNewlineAtEOF: true
+...
\ No newline at end of file
diff --git a/.clang-format-ignore b/.clang-format-ignore
new file mode 100644
index 0000000..c3d6b98
--- /dev/null
+++ b/.clang-format-ignore
@@ -0,0 +1 @@
+thirdparty/**
\ No newline at end of file
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..3c1deb3
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,42 @@
+Checks: >
+  -*,
+  readability-identifier-naming
+
+CheckOptions:
+  - key: readability-identifier-naming.FunctionCase
+    value: lower_case
+  - key: readability-identifier-naming.MethodCase
+    value: lower_case
+  - key: readability-identifier-naming.VariableCase
+    value: lower_case
+
+  # Allow local temporaries like _compressor
+  - key: readability-identifier-naming.LocalVariableCase
+    value: lower_case
+  - key: readability-identifier-naming.LocalVariableIgnoredRegexp
+    value: '^_[a-z0-9_]+$'
+
+  # Public members: no prefix
+  - key: readability-identifier-naming.PublicMemberCase
+    value: lower_case
+
+  # Protected members: m_ prefix
+  - key: readability-identifier-naming.ProtectedMemberCase
+    value: lower_case
+  - key: readability-identifier-naming.ProtectedMemberPrefix
+    value: m_
+
+  # Private members: m_ prefix
+  - key: readability-identifier-naming.PrivateMemberCase
+    value: lower_case
+  - key: readability-identifier-naming.PrivateMemberPrefix
+    value: m_
+
+  - key: readability-identifier-naming.ClassCase
+    value: lower_case
+  - key: readability-identifier-naming.StructCase
+    value: lower_case
+  - key: readability-identifier-naming.EnumCase
+    value: lower_case
+  - key: readability-identifier-naming.EnumConstantCase
+    value: lower_case
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index d053b09..a2dbeb4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,8 @@ benchmark/images/*.tga
 # Release artifacts
 release/
 
+.idea/
+
 # Wheel Artifacts
 wheels/
 wheelhouse/
diff --git a/.gitmodules b/.gitmodules
index f87a610..ae3034d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,3 +25,6 @@
 [submodule "thirdparty/pybind11_json"]
 	path = thirdparty/pybind11_json
 	url = https://github.com/pybind/pybind11_json
+[submodule "thirdparty/spdlog"]
+	path = thirdparty/spdlog
+	url = https://github.com/gabime/spdlog
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f2f033a..85de98f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,37 +1,42 @@
-﻿cmake_minimum_required (VERSION 3.19)
+﻿cmake_minimum_required(VERSION 3.19)
 set(VCPKG_LIBRARY_LINKAGE static)
 
+if (POLICY CMP0135)
+   cmake_policy(SET CMP0135 NEW)
+endif ()
+
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-set (CMAKE_CXX_STANDARD 20)
-project (CompressedImageBuild)
+set(CMAKE_CXX_STANDARD 20)
+project(CompressedImageBuild)
 
 # If we are compiling as the main project we automatically turn on all the build options.
 # This can be circumvented by passing "-DCOMPRESSED_DETERMINE_MAIN_PROJECT=OFF"
 set(MAIN_PROJECT OFF)
-option (
-    COMPRESSED_DETERMINE_MAIN_PROJECT 
-    "Whether to automatically determine if we are building this module as main project" 
-    ON
+option(
+   COMPRESSED_DETERMINE_MAIN_PROJECT
+   "Whether to automatically determine if we are building this module as main project"
+   ON
 )
 if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR AND COMPRESSED_DETERMINE_MAIN_PROJECT)
-    message("Compiling compressed-image as main project")
-    set(MAIN_PROJECT ON)
-else()
-    set(MAIN_PROJECT OFF)
-endif()
+   message("Compiling compressed-image as main project")
+   set(MAIN_PROJECT ON)
+else ()
+   set(MAIN_PROJECT OFF)
+endif ()
 
 if (MAIN_PROJECT)
-    set(COMPRESSED_IMAGE_USE_VCPKG ON)
-    set(COMPRESSED_IMAGE_BUILD_TESTS ON)
-    set(COMPRESSED_IMAGE_BUILD_EXAMPLES ON)
-    set(COMPRESSED_IMAGE_BUILD_PYTHON ON)
-    set(COMPRESSED_IMAGE_BUILD_DOCS ON)
-    set(COMPRESSED_IMAGE_BUILD_BENCHMARKS ON)
-    set(COMPRESSED_IMAGE_EXTENDED_WARNINGS ON)
-endif()
+   set(COMPRESSED_IMAGE_USE_VCPKG ON)
+   set(COMPRESSED_IMAGE_BUILD_TESTS ON)
+   set(COMPRESSED_IMAGE_BUILD_EXAMPLES ON)
+   set(COMPRESSED_IMAGE_BUILD_PYTHON ON)
+   set(COMPRESSED_IMAGE_BUILD_DOCS ON)
+   set(COMPRESSED_IMAGE_BUILD_BENCHMARKS ON)
+   set(COMPRESSED_IMAGE_EXTENDED_WARNINGS ON)
+endif ()
 
 option(COMPRESSED_IMAGE_USE_VCPKG "Whether to use the submodule version of vcpkg to resolve the dependencies instead of system libraries." OFF)
 option(COMPRESSED_IMAGE_EXTENDED_WARNINGS "Whether to compile with extended warnings (-Wextra, -Werror etc.)" OFF)
+set(COMPRESSED_IMAGE_CUDA_VERSION "12" CACHE STRING "CUDA Runtime/Toolkit version") # option() can only hold ON/OFF
 option(COMPRESSED_IMAGE_BUILD_TESTS OFF)
 option(COMPRESSED_IMAGE_BUILD_EXAMPLES OFF)
 option(COMPRESSED_IMAGE_BUILD_DOCS OFF)
@@ -44,70 +49,83 @@ option(_COMPRESSED_IMAGE_SANITIZE_FLAGS "Internal CI flag for enabling sanitizer
 # Add thirdparty libraries
 # --------------------------------------------------------------------------
 
+find_package(CUDAToolkit ${COMPRESSED_IMAGE_CUDA_VERSION} REQUIRED) # version must directly follow the package name
+
 # Add c-blosc2
 set(DEACTIVATE_ZLIB ON)
 set(BUILD_TESTS OFF)
 set(BUILD_FUZZERS OFF)
 set(BUILD_BENCHMARKS OFF)
 set(BUILD_EXAMPLES OFF)
-add_subdirectory (thirdparty/c-blosc2)
+add_subdirectory(thirdparty/c-blosc2)
 
 # Add target for blosc2 headers
 add_library(blosc2_include INTERFACE)
-target_include_directories(blosc2_include SYSTEM INTERFACE thirdparty/c-blosc2/include)
+target_include_directories(blosc2_include SYSTEM INTERFACE
+   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/c-blosc2/include>
+   $<INSTALL_INTERFACE:include>
+)
+
 
 # JSON module for parsing/storing metadata
 add_subdirectory(thirdparty/json)
 
+# spdlog for logging
+set(SPDLOG_USE_STD_FORMAT ON)
+add_subdirectory(thirdparty/spdlog)
+
+# Pull nvcomp (headers + dll/so target)
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/FetchNvcomp.cmake)
+
 # Include the local vcpkg toolchain, if requested, otherwise it is up to the user
 # to provide a valid OpenImageIO library (that can be found via find_package)
 if (COMPRESSED_IMAGE_USE_VCPKG)
-    include("${PROJECT_SOURCE_DIR}/thirdparty/vcpkg/scripts/buildsystems/vcpkg.cmake")
-endif()
+   include("${CMAKE_CURRENT_LIST_DIR}/thirdparty/vcpkg/scripts/buildsystems/vcpkg.cmake")
+endif ()
 
 find_package(OpenImageIO CONFIG QUIET)
 if (OpenImageIO_FOUND)
-    message(STATUS "Found OpenImageIO")
-    set(COMPRESSED_IMAGE_HAVE_OIIO TRUE)
-else()
-    message(WARNING "OpenImageIO not found, some features will not be available")
-    set(COMPRESSED_IMAGE_HAVE_OIIO TRUE)
-endif()
+   message(STATUS "Found OpenImageIO")
+   set(COMPRESSED_IMAGE_HAVE_OIIO TRUE)
+else ()
+   message(WARNING "OpenImageIO not found, some features will not be available")
+   set(COMPRESSED_IMAGE_HAVE_OIIO FALSE)
+endif ()
 
 # Projects
 # --------------------------------------------------------------------------
 add_subdirectory(compressed_image)
 
 if (COMPRESSED_IMAGE_BUILD_TESTS)
-    add_library(doctest INTERFACE)
-    target_include_directories(doctest SYSTEM INTERFACE thirdparty/doctest/doctest)
+   add_library(doctest INTERFACE)
+   target_include_directories(doctest SYSTEM INTERFACE thirdparty/doctest/doctest)
 
-    add_subdirectory(test)
-endif()
+   add_subdirectory(test)
+endif ()
 
 if (COMPRESSED_IMAGE_BUILD_EXAMPLES)
-    add_subdirectory(examples/read_from_file)
-    add_subdirectory(examples/read_with_postprocess)
-    add_subdirectory(examples/lazy_channels)
-    add_subdirectory(examples/modifying_image)
-endif()
+   add_subdirectory(examples/read_from_file)
+   add_subdirectory(examples/read_with_postprocess)
+   add_subdirectory(examples/lazy_channels)
+   add_subdirectory(examples/modifying_image)
+endif ()
 
 if (COMPRESSED_IMAGE_BUILD_BENCHMARKS)
-    set(BENCHMARK_ENABLE_INSTALL OFF)
-    set(BENCHMARK_INSTALL_DOCS OFF)
-    set(BENCHMARK_ENABLE_TESTING OFF)
-    set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
-    add_subdirectory(thirdparty/benchmark)
-    add_subdirectory(benchmark)
-endif()
+   set(BENCHMARK_ENABLE_INSTALL OFF)
+   set(BENCHMARK_INSTALL_DOCS OFF)
+   set(BENCHMARK_ENABLE_TESTING OFF)
+   set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
+   add_subdirectory(thirdparty/benchmark)
+   add_subdirectory(benchmark)
+endif ()
 
 if (COMPRESSED_IMAGE_BUILD_DOCS)
-    add_subdirectory(docs)
-endif()
+   add_subdirectory(docs)
+endif ()
 
 if (COMPRESSED_IMAGE_BUILD_PYTHON)
-    add_subdirectory(thirdparty/pybind11)
-    add_subdirectory(thirdparty/pybind11_image_util)
-    add_subdirectory(thirdparty/pybind11_json)
-    add_subdirectory(python)
-endif()
\ No newline at end of file
+   add_subdirectory(thirdparty/pybind11)
+   add_subdirectory(thirdparty/pybind11_image_util)
+   add_subdirectory(thirdparty/pybind11_json)
+   add_subdirectory(python)
+endif ()
\ No newline at end of file
diff --git a/cmake/FetchNvcomp.cmake b/cmake/FetchNvcomp.cmake
new file mode 100644
index 0000000..0afa027
--- /dev/null
+++ b/cmake/FetchNvcomp.cmake
@@ -0,0 +1,64 @@
+# FetchNvcomp.cmake
+# Fetch NVCOMP headers and dynamic libraries for runtime loading.
+# Provides namespaced target:
+#   compressed::nvcomp_headers
+
+include(FetchContent)
+
+##############################################################
+# Fetch dynamically from NVIDIA Redistributables
+##############################################################
+
+if (WIN32)
+   set(NVCOMP_URL "https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/windows-x86_64/nvcomp-windows-x86_64-5.0.0.6_cuda11-archive.zip")
+   set(NVCOMP_SHA256 "5C2E1EE55398F47D28806EB7C53ACA33B9E22D6D5B3ACEC86BBC4253C7E6D1D3")
+elseif (UNIX AND NOT APPLE)
+   set(NVCOMP_URL "https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/linux-x86_64/nvcomp-linux-x86_64-5.0.0.6_cuda11-archive.tar.xz")
+   set(NVCOMP_SHA256 "64F5F7CC622F36006C503EE5A3F9D730B5C6CC49E4FAB0FC0507C1272D5EFA7B")
+else ()
+   message(FATAL_ERROR "Unsupported platform for NVCOMP")
+endif ()
+
+FetchContent_Declare(_nvcomp_src
+   URL ${NVCOMP_URL}
+   URL_HASH SHA256=${NVCOMP_SHA256}
+)
+FetchContent_MakeAvailable(_nvcomp_src)
+set(NVCOMP_ROOT ${_nvcomp_src_SOURCE_DIR})
+
+##############################################################
+# Locate Runtime Binaries
+##############################################################
+
+if (WIN32)
+   file(GLOB FOUND_BINARIES "${NVCOMP_ROOT}/bin/*.dll")
+else ()
+   file(GLOB FOUND_BINARIES "${NVCOMP_ROOT}/lib/libnvcomp.so*")
+endif ()
+
+set(NVCOMP_RUNTIME_BINARIES "${FOUND_BINARIES}" CACHE INTERNAL "nvcomp runtime binaries")
+
+##############################################################
+# Set up Compile-Time Header Targets
+##############################################################
+
+add_library(compressed_nvcomp_headers INTERFACE)
+
+target_include_directories(compressed_nvcomp_headers INTERFACE
+   $<BUILD_INTERFACE:${NVCOMP_ROOT}/include>
+   $<INSTALL_INTERFACE:include>
+)
+
+add_library(compressed::nvcomp_headers ALIAS compressed_nvcomp_headers)
+
+##############################################################
+# Install Rules (For Deployment / Packaging)
+##############################################################
+
+install(DIRECTORY ${NVCOMP_ROOT}/include/ DESTINATION include)
+
+if (WIN32)
+   install(FILES ${NVCOMP_RUNTIME_BINARIES} DESTINATION bin)
+else ()
+   install(FILES ${NVCOMP_RUNTIME_BINARIES} DESTINATION lib)
+endif ()
\ No newline at end of file
diff --git a/compressed_image/CMakeLists.txt b/compressed_image/CMakeLists.txt
index 02b814d..688917a 100644
--- a/compressed_image/CMakeLists.txt
+++ b/compressed_image/CMakeLists.txt
@@ -1,51 +1,56 @@
 ﻿project(CompressedImage)
 
 add_library(compressed_image INTERFACE)
-target_include_directories(compressed_image INTERFACE "include")
-target_link_libraries(compressed_image INTERFACE  
-	blosc2_static 
-	blosc2_include 
-	nlohmann_json 
-	OpenImageIO::OpenImageIO
+target_include_directories(compressed_image INTERFACE
+   include
+   ${CUDAToolkit_INCLUDE_DIRS}
+)
+target_link_libraries(compressed_image INTERFACE
+   blosc2_static
+   blosc2_include
+   nlohmann_json
+   $<TARGET_NAME_IF_EXISTS:OpenImageIO::OpenImageIO> # optional: the top-level find_package(OpenImageIO) is QUIET
+   compressed::nvcomp_headers
+   spdlog::spdlog_header_only
 )
 
 if (MSVC)
-	target_compile_options(compressed_image INTERFACE /utf-8 /MP /DNOMINMAX)
-endif()
+   target_compile_options(compressed_image INTERFACE /utf-8 /MP /DNOMINMAX)
+endif ()
 
 if (COMPRESSED_IMAGE_HAVE_OIIO)
-	target_compile_definitions(compressed_image INTERFACE COMPRESSED_IMAGE_OIIO_AVAILABLE)
-endif()
+   target_compile_definitions(compressed_image INTERFACE COMPRESSED_IMAGE_OIIO_AVAILABLE)
+endif ()
 
 
 # Crank up warning levels on both MSVC, Clang and GCC
 if (COMPRESSED_IMAGE_EXTENDED_WARNINGS)
-	if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
-	target_compile_options(
-		compressed_image 
-		INTERFACE 
-		-Wall 
-		-Werror 
-		-Wextra
-	)
-	elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-	target_compile_options(
-		compressed_image 
-		INTERFACE 
-		/W4 
-		/WX 
-		/w44062 
-		/w44464 
-		/w45264
-	)
-	endif()
-endif()
+   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+      target_compile_options(
+         compressed_image
+         INTERFACE
+         -Wall
+         -Werror
+         -Wextra
+      )
+   elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+      target_compile_options(
+         compressed_image
+         INTERFACE
+         /W4
+         /WX
+         /w44062
+         /w44464
+         /w45264
+      )
+   endif ()
+endif ()
 
 # Enable sanitizers unless on macOS (not supported) or Windows (github runners run out of memory). 
 # These are for our CI runs only and should be ignored by users.
 if (_COMPRESSED_IMAGE_SANITIZE_FLAGS AND NOT APPLE)
-	if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
-		target_compile_options(compressed_image INTERFACE -fsanitize=address,leak,undefined)
-		target_link_options(compressed_image INTERFACE -fsanitize=address,leak,undefined)
-	endif()
-endif()
\ No newline at end of file
+   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+      target_compile_options(compressed_image INTERFACE -fsanitize=address,leak,undefined)
+      target_link_options(compressed_image INTERFACE -fsanitize=address,leak,undefined)
+   endif ()
+endif ()
\ No newline at end of file
diff --git a/compressed_image/include/compressed/blosc2/lazyschunk.h b/compressed_image/include/compressed/blosc2/lazyschunk.h
index f92eda7..6b44e57 100644
--- a/compressed_image/include/compressed/blosc2/lazyschunk.h
+++ b/compressed_image/include/compressed/blosc2/lazyschunk.h
@@ -10,365 +10,519 @@
 #include "compressed/util.h"
 #include "wrapper.h"
 #include "schunk_mixin.h"
+#include "compressed/cuda/compression.h"
 
 #include "compressed/detail/scoped_timer.h"
 
-namespace NAMESPACE_COMPRESSED_IMAGE
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-
-	namespace blosc2
-	{
-
-		namespace detail
-		{
-
-			/// Wrapper representing a lazy chunk holding either an initialized (and compressed) chunk
-			/// in the form of a byte array or just a single T representing a lazy state
-			template <typename T>
-			struct lazy_chunk
-			{
-				std::variant<std::vector<std::byte>, T> value;
-				size_t num_elements = 0;
-
-				size_t byte_size() const noexcept
-				{
-					return num_elements * sizeof(T);
-				}
-
-				bool is_lazy() const noexcept
-				{
-					return std::holds_alternative<T>(this->value);
-				}
-			};
-
-		} // detail
-
-
-		template <typename T>
-		struct lazy_schunk final : public detail::schunk_mixin<T, detail::lazy_chunk<T>>
-		{
-			using detail::schunk_mixin<T, detail::lazy_chunk<T>>::chunk_bytes;
-
-			lazy_schunk() = default;
-			lazy_schunk(lazy_schunk&& other) noexcept
-			{
-				this->m_Chunks = std::move(other.m_Chunks);
-				this->m_ChunkSize = other.m_ChunkSize;
-				this->m_BlockSize = other.m_BlockSize;
-			}
-			lazy_schunk& operator=(lazy_schunk&& other) noexcept
-			{
-				if (this != &other)
-				{
-					this->m_Chunks = std::move(other.m_Chunks);
-					this->m_ChunkSize = other.m_ChunkSize;
-					this->m_BlockSize = other.m_BlockSize;
-				}
-				return *this;
-			}
-			lazy_schunk(const lazy_schunk& other) = default;
-			lazy_schunk& operator=(const lazy_schunk& other) = default;
-
-
-			/// Initialize a lazy super-chunk from the given value, has a near-zero
-			/// cost with the chunks only being initialized on read/modify.
-			/// 
-			/// \param value The initial value to fill.
-			/// \param num_elements The size to initialize the data with.
-			/// \param block_size The requested chunk size. It is up to the caller to ensure
-			///                   this is appropriately sized
-			/// \param chunk_size The requested chunk size. It is up to the caller to ensure
-			///                   this is appropriately sized (i.e. by using util::align_chunk_to_scanlines)
-			lazy_schunk(T value, size_t num_elements, size_t block_size, size_t chunk_size)
-			{
-				util::validate_chunk_size<T>(chunk_size, "lazy_schunk");
-				this->m_BlockSize = block_size;
-				this->m_ChunkSize = chunk_size;
-
-				size_t num_bytes = num_elements * sizeof(T);
-
-				// Calculate all 'full' chunks and the final remainder (if any).
-				size_t num_full_chunks = num_bytes / this->m_ChunkSize;
-				size_t remainder_bytes = num_bytes - (this->m_ChunkSize * num_full_chunks);
-
-				// Initialize lazy chunks with the provided value of T
-				for ([[maybe_unused]] auto idx : std::views::iota(size_t{ 0 }, num_full_chunks))
-				{
-					detail::lazy_chunk<T> chunk = { value, this->m_ChunkSize / sizeof(T) };
-					this->m_Chunks.push_back(std::move(chunk));
-				}
-				if (remainder_bytes > 0)
-				{
-					detail::lazy_chunk<T> chunk = { value, remainder_bytes / sizeof(T) };
-					this->m_Chunks.push_back(std::move(chunk));
-				}
-			}
-
-			size_t chunk_bytes(size_t index) const override
-			{
-				if (index > this->m_Chunks.size() - 1)
-				{
-					throw std::out_of_range(
-						std::format("Cannot access index {} in lazy-schunk. Total amount of chunks is {}", index, this->m_Chunks.size())
-					);
-				}
-
-				return this->m_Chunks[index].num_elements * sizeof(T);
-			}
-
-			/// convert the lazy schunk into a super-chunk, generating any 
-			/// not yet initialized lazy chunks in the process. This should 
-			/// be done once all the data is computed to minimize the overhead. 
-			schunk_ptr to_schunk() override
-			{
-				_COMPRESSED_PROFILE_FUNCTION();
-				// Initialize the chunks, either appending the byte array directly to the schunk 
-				// or compressing the lazy chunk.
-				blosc2::schunk_ptr schunk = create_default_schunk();
-
-				// Allocate and compress the lazy buff. Since this only needs to happen once
-				// as all lazy values are the same we can just use the same compressed buffer for all.
-				util::default_init_vector<std::byte> lazy_compressed_data;
-				if (this->has_lazy_chunk())
-				{
-					auto lazy_buff = std::vector<T>(this->chunk_elements(), this->lazy_chunk_value());
-					lazy_compressed_data.resize(blosc2::min_compressed_size(this->m_ChunkSize));
-
-					auto context = blosc2::create_compression_context<T>(
-						schunk, 
-						std::thread::hardware_concurrency(), 
-						enums::codec::lz4, 
-						9, 
-						this->m_BlockSize
-					);
-					blosc2::compress(context, std::span<T>(lazy_buff), std::span<std::byte>(lazy_compressed_data));
-				}
-
-				// Iterate all the chunks, if lazy add the compressed lazy buffer, else add the compressed data.
-				for (auto& chunk : this->m_Chunks)
-				{
-					if (std::holds_alternative<std::vector<std::byte>>(chunk.value))
-					{
-						auto& data = std::get<std::vector<std::byte>>(chunk.value);
-						blosc2_schunk_append_chunk(
-							schunk.get(),
-							reinterpret_cast<uint8_t*>(data.data()),
-							true // copy
-						);
-					}
-					else
-					{
-						assert(lazy_compressed_data.size() >= BLOSC2_MAX_OVERHEAD);
-						// we already initialized the buffer to the lazychunk value above
-						blosc2_schunk_append_chunk(
-							schunk.get(),
-							reinterpret_cast<uint8_t*>(lazy_compressed_data.data()),
-							true // copy
-						);
-					}
-				}
-
-				return schunk;
-			}
-
-			/// Generate an uncompressed vector from the chunks, using the decompression context
-			/// to perform the decompression.
-			std::vector<T> to_uncompressed(blosc2::context_ptr& decompression_ctx) const override
-			{
-				std::vector<T> uncompressed(this->size(), this->lazy_chunk_value());
-
-				size_t offset = 0; // element offset
-				for (const auto& chunk : this->m_Chunks)
-				{
-					if (std::holds_alternative<std::vector<std::byte>>(chunk.value))
-					{
-						auto subspan = std::span<T>(uncompressed.data() + offset, chunk.num_elements);
-						blosc2::decompress(decompression_ctx, subspan, std::get<std::vector<std::byte>>(chunk.value));
-					}
-					// Since we already initialized the uncompressed data to the lazy chunks' value we don't need
-					// to do any filling here.
-					offset += chunk.num_elements;
-				}
-
-				return uncompressed;
-			}
-
-			std::vector<T> chunk(blosc2::context_ptr& decompression_ctx, size_t index) const override
-			{
-				return this->chunk(decompression_ctx.get(), index);
-			}
-
-			std::vector<T> chunk(blosc2::context_raw_ptr decompression_ctx, size_t index) const override
-			{
-				if (index > this->m_Chunks.size() - 1)
-				{
-					throw std::out_of_range(
-						std::format("Cannot access index {} in lazy-schunk. Total amount of chunks is {}", index, this->m_Chunks.size())
-					);
-				}
-
-				if (std::holds_alternative<std::vector<std::byte>>(this->m_Chunks[index].value))
-				{
-					std::vector<T> uncompressed(this->chunk_elements(index), 0);
-					this->chunk(decompression_ctx, std::span<T>(uncompressed), index);
-					return uncompressed;
-				}
-				return std::vector<T>(this->chunk_elements(index), std::get<T>(this->m_Chunks[index].value));
-			}
-
-			void chunk(blosc2::context_ptr& decompression_ctx, std::span<T> buffer, size_t index) const override
-			{
-				this->chunk(decompression_ctx.get(), buffer, index);
-			}
-
-			void chunk(blosc2::context_raw_ptr decompression_ctx, std::span<T> buffer, size_t index) const override
-			{
-				if (index > this->m_Chunks.size() - 1)
-				{
-					throw std::out_of_range(
-						std::format("Cannot access index {} in lazy-schunk. Total amount of chunks is {}", index, this->m_Chunks.size())
-					);
-				}
-
-				// Either decompress from the compressed data or fill with the lazy chunks value
-				if (std::holds_alternative<std::vector<std::byte>>(this->m_Chunks.at(index).value))
-				{
-					auto& compressed = std::get<std::vector<std::byte>>(this->m_Chunks.at(index).value);
-					blosc2::decompress(
-						decompression_ctx,
-						buffer,
-						std::span<const std::byte>(compressed)
-					);
-				}
-				else
-				{
-					std::fill(
-						std::execution::par_unseq,
-						buffer.begin(),
-						buffer.end(),
-						std::get<T>(this->m_Chunks[index].value)
-					);
-				}
-			}
-
-			void set_chunk(std::vector<std::byte> compressed, size_t index) override
-			{
-				this->validate_chunk_index(index);
-				this->m_Chunks[index].value = std::move(compressed);
-				this->validate_chunk_sizes();
-			}
-
-			void set_chunk(std::span<const std::byte> compressed, size_t index) override
-			{
-				this->validate_chunk_index(index);
-				this->m_Chunks[index].value = std::vector<std::byte>(compressed.begin(), compressed.end());
-				this->validate_chunk_sizes();
-			}
-
-			void set_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, size_t index) override
-			{
-				this->validate_chunk_index(index);
-
-				util::default_init_vector<std::byte> compression_buffer(blosc2::min_compressed_size(this->m_ChunkSize));
-				std::span<std::byte> compression_span(compression_buffer);
-
-				auto csize = blosc2::compress<T>(compression_ctx, uncompressed, compression_span);
-
-				// copy over a new vector containing all the elements from the compression span.
-				this->m_Chunks[index].value = std::vector<std::byte>(compression_span.begin(), compression_span.begin() + csize);
-				this->m_Chunks[index].num_elements = uncompressed.size();
-				this->validate_chunk_sizes();
-			}
-
-			void append_chunk(std::vector<std::byte> compressed) override
-			{
-				auto num_elements = blosc2::chunk_num_elements<T>(compressed);
-				auto chunk = detail::lazy_chunk<T>{ .value = std::move(compressed), .num_elements = num_elements };
-				this->m_Chunks.push_back(std::move(chunk));
-				this->validate_chunk_sizes();
-			}
-
-			void append_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed) override
-			{
-				util::default_init_vector<std::byte> compression_buffer(blosc2::min_compressed_size(this->chunk_bytes()));
-				std::span<std::byte> compression_span(compression_buffer);
-				this->append_chunk(compression_ctx, uncompressed, compression_span);
-				this->validate_chunk_sizes();
-			};
-
-			void append_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, std::span<std::byte> compression_buff) override
-			{
-				auto csize = blosc2::compress<T>(compression_ctx, uncompressed, compression_buff);
-				auto chunk = detail::lazy_chunk<T>{
-					.value = std::vector<std::byte>(compression_buff.begin(), compression_buff.begin() + csize),
-					.num_elements = uncompressed.size()
-				};
-				this->m_Chunks.push_back(std::move(chunk));
-				this->validate_chunk_sizes();
-			}
-
-			/// Retrieve the total compressed size of the lazy-schunk.
-			/// Lazy chunks will count as the size of T.
-			size_t csize() const noexcept override
-			{
-				size_t _csize = 0;
-				for (const auto& chunk : this->m_Chunks)
-				{
-					if (std::holds_alternative<T>(chunk.value))
-					{
-						_csize += sizeof(T);
-					}
-					else
-					{
-						_csize += std::get<std::vector<std::byte>>(chunk.value).size();
-					}
-				}
-				return _csize;
-			}
-
-			// The total uncompressed size of the lazy-schunk in elements.
-			size_t size() const noexcept override
-			{
-				size_t _size = 0;
-				for (const auto& chunk : this->m_Chunks)
-				{
-					_size += chunk.num_elements;
-				}
-				return _size;
-			}
-
-		private:
-
-			/// Check whether this->m_Chunks contain any still-lazy chunks.
-			bool has_lazy_chunk() const noexcept
-			{
-				for (const auto& chunk : this->m_Chunks)
-				{
-					if (std::holds_alternative<T>(chunk.value))
-					{
-						return true;
-					}
-				}
-				return false;
-			}
-
-			/// Get the value of the first encountered lazy chunk, since we only create lazy chunks with a single value
-			/// this is a valid way of accessing this value. if no lazy chunk exists we simply return T{}
-			T lazy_chunk_value() const noexcept
-			{
-				for (const auto& chunk : this->m_Chunks)
-				{
-					if (std::holds_alternative<T>(chunk.value))
-					{
-						return std::get<T>(chunk.value);
-					}
-				}
-				return T{};
-
-			}
-
-		};
-
-	} // blosc2
-
-} // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
+    namespace detail
+    {
+        /// Wrapper representing one chunk of a lazy super-chunk. `value` holds either an initialized (and
+        /// compressed) chunk stored as `_storage_type`, or just a single T representing a lazy state.
+        template <typename T, typename _storage_type>
+        struct lazy_chunk
+        {
+            std::variant<_storage_type, T> value; ///< Initialized storage, or the single T of a lazy chunk.
+            size_t num_elements = 0;              ///< Element count; byte_size() multiplies it by sizeof(T).
+            /// Store the variant `v` (moved into `value`) and record `n` as `num_elements`.
+            lazy_chunk(std::variant<_storage_type, T> v, size_t n) noexcept
+                : value(std::move(v)), num_elements(n)
+            {
+            }
+            /// Size in bytes of `num_elements` values of type T.
+            size_t byte_size() const noexcept
+            {
+                return num_elements * sizeof(T);
+            }
+            /// True while `value` holds the T alternative, i.e. no `_storage_type` has been assigned yet.
+            bool is_lazy() const noexcept
+            {
+                return std::holds_alternative<T>(this->value);
+            }
+        };
+
+
+        template <typename T>
+        struct lazy_schunk final :
+            public detail::schunk_mixin<
+                T, /* element type */
+                detail::lazy_chunk<T, detail::gpu_chunk<T>>, /* gpu storage type */
+                detail::lazy_chunk<T, detail::cpu_chunk> /* cpu storage type */
+            >
+        {
+            /// Bring the gpu_container and cpu_container using declarations into this struct
+            using detail::schunk_mixin<
+                T,
+                detail::lazy_chunk<T, detail::gpu_chunk<T>>,
+                detail::lazy_chunk<T, detail::cpu_chunk>
+            >::gpu_container;
+            using detail::schunk_mixin<
+                T,
+                detail::lazy_chunk<T, detail::gpu_chunk<T>>,
+                detail::lazy_chunk<T, detail::cpu_chunk>
+            >::cpu_container;
+
+            using detail::schunk_mixin<
+                T,
+                detail::lazy_chunk<T, detail::gpu_chunk<T>>,
+                detail::lazy_chunk<T, detail::cpu_chunk>
+            >::chunk;
+            using detail::schunk_mixin<
+                T,
+                detail::lazy_chunk<T, detail::gpu_chunk<T>>,
+                detail::lazy_chunk<T, detail::cpu_chunk>
+            >::chunk_bytes;
+            using detail::schunk_mixin<
+                T,
+                detail::lazy_chunk<T, detail::gpu_chunk<T>>,
+                detail::lazy_chunk<T, detail::cpu_chunk>
+            >::is_gpu_chunk;
+            using detail::schunk_mixin<
+                T,
+                detail::lazy_chunk<T, detail::gpu_chunk<T>>,
+                detail::lazy_chunk<T, detail::cpu_chunk>
+            >::to_uncompressed;
+
+            lazy_schunk() = default;
+
+            lazy_schunk(lazy_schunk&& other) noexcept // takes over the chunks, copies the size settings
+            {
+                this->m_block_size = other.m_block_size;
+                this->m_chunk_size = other.m_chunk_size;
+                this->m_chunks = std::move(other.m_chunks); // `other.m_chunks` is left moved-from
+            }
+
+            lazy_schunk& operator=(lazy_schunk&& other) noexcept // no-op when assigned to itself
+            {
+                if (&other != this)
+                {
+                    this->m_block_size = other.m_block_size;
+                    this->m_chunk_size = other.m_chunk_size;
+                    this->m_chunks = std::move(other.m_chunks); // `other.m_chunks` is left moved-from
+                }
+                return *this;
+            }
+
+            lazy_schunk(const lazy_schunk& other) = default;
+            lazy_schunk& operator=(const lazy_schunk& other) = default;
+
+
+            /// Initialize a lazy super-chunk from the given value. This has a near-zero cost as nothing
+            /// is compressed here: every chunk only records `value` and its element count.
+            ///
+            /// \param value The initial value to fill.
+            /// \param num_elements The number of elements (not bytes) to initialize the data with.
+            /// \param block_size The requested block size. It is up to the caller to ensure
+            ///                   this is appropriately sized
+            /// \param chunk_size The requested chunk size in bytes. It is up to the caller to ensure
+            ///                   this is appropriately sized (i.e. by using util::align_chunk_to_scanlines)
+            lazy_schunk(const T value, const size_t num_elements, const size_t block_size, const size_t chunk_size)
+            {
+                util::validate_chunk_size<T>(chunk_size, "lazy_schunk");
+                this->m_block_size = block_size;
+                this->m_chunk_size = chunk_size;
+
+                const size_t num_bytes = num_elements * sizeof(T);
+
+                // Calculate all 'full' chunks and the final remainder (if any).
+                const size_t num_full_chunks = num_bytes / this->m_chunk_size;
+                const size_t remainder_bytes = num_bytes - (this->m_chunk_size * num_full_chunks);
+
+                // Every lazy chunk (full or partial) uses the cpu container so all of them dispatch the same way.
+                for ([[maybe_unused]] auto idx : std::views::iota(size_t{0}, num_full_chunks))
+                {
+                    detail::lazy_chunk<T, cpu_chunk> chunk = {value, this->m_chunk_size / sizeof(T)};
+                    this->m_chunks.push_back(std::move(chunk));
+                }
+                if (remainder_bytes > 0)
+                {
+                    detail::lazy_chunk<T, cpu_chunk> chunk = {value, remainder_bytes / sizeof(T)};
+                    this->m_chunks.push_back(std::move(chunk));
+                }
+            }
+
+            /// Uncompressed size in bytes of the chunk at `index`; throws std::out_of_range for an invalid index.
+            size_t chunk_bytes(size_t index) const override
+            {
+                // Compare with `>=`: `index > size() - 1` would wrap around for an empty super-chunk and
+                // then let every index through to the unchecked access below.
+                if (index >= this->m_chunks.size())
+                {
+                    throw std::out_of_range(
+                        std::format(
+                            "Cannot access index {} in lazy-schunk. Total amount of chunks is {}",
+                            index,
+                            this->m_chunks.size()
+                        )
+                    );
+                }
+
+                return std::visit(
+                    [](const auto& chunk) { return chunk.num_elements * sizeof(T); },
+                    this->m_chunks[index]
+                );
+            }
+
+            /// Decompress the whole super-chunk into a single contiguous vector.
+            ///
+            /// Chunks that are still lazy are never decompressed; their elements keep the value the result
+            /// vector is pre-filled with (see lazy_chunk_value()).
+            ///
+            /// \param cpu_ctx Supplies the decompression context handed to blosc2 for cpu chunks.
+            /// \param gpu_ctx Supplies the codec from which the compressor for gpu chunks is created.
+            /// \return A vector of size() elements with all chunks laid out back to back in chunk order.
+            std::vector<T> to_uncompressed(
+                cpu_compression_context& cpu_ctx,
+                gpu_compression_context gpu_ctx
+            ) const override
+            {
+                // Pre-fill with the lazy value so that still-lazy chunks only have to be skipped over below.
+                std::vector<T> result(this->size(), this->lazy_chunk_value());
+
+                size_t element_offset = 0; // start of the current chunk inside `result`
+                for (const auto& entry : this->m_chunks)
+                {
+                    if (std::holds_alternative<gpu_container>(entry))
+                    {
+                        const auto& gpu_entry = std::get<gpu_container>(entry);
+
+                        if (!gpu_entry.is_lazy())
+                        {
+                            auto target = std::span<T>(result.data() + element_offset, gpu_entry.num_elements);
+
+                            auto compressor = cuda::make_compressor<T>(gpu_ctx.ctx.codec);
+                            std::visit(
+                                [&](auto& _compressor)
+                                {
+                                    _compressor.decompress(std::get<gpu_chunk<T>>(gpu_entry.value), target);
+                                },
+                                compressor
+                            );
+                        }
+
+                        // Lazy or not, the chunk occupies num_elements slots of the result.
+                        element_offset += gpu_entry.num_elements;
+                    }
+                    else
+                    {
+                        const auto& cpu_entry = std::get<cpu_container>(entry);
+
+                        if (!cpu_entry.is_lazy())
+                        {
+                            auto target = std::span<T>(result.data() + element_offset, cpu_entry.num_elements);
+                            blosc2::decompress(
+                                cpu_ctx.decompression_ctx,
+                                target,
+                                std::get<detail::cpu_chunk>(cpu_entry.value)
+                            );
+                        }
+
+                        // Lazy or not, the chunk occupies num_elements slots of the result.
+                        element_offset += cpu_entry.num_elements;
+                    }
+                }
+
+                return result;
+            }
+
+            /// Return the uncompressed elements of the cpu chunk at `index` as a newly allocated vector.
+            std::vector<T> chunk(blosc2::context_raw_ptr decompression_ctx, size_t index) const override
+            {
+                // `>=` on purpose: `index > size() - 1` would wrap around when there are no chunks at all.
+                if (index >= this->m_chunks.size())
+                {
+                    throw std::out_of_range(
+                        std::format(
+                            "Cannot access index {} in lazy-schunk. Total amount of chunks is {}",
+                            index,
+                            this->m_chunks.size()
+                        )
+                    );
+                }
+                const auto& chunk_val = std::get<cpu_container>(this->m_chunks.at(index));
+                if (std::holds_alternative<cpu_chunk>(chunk_val.value))
+                {
+                    std::vector<T> uncompressed(this->chunk_elements(index), 0);
+                    this->chunk(decompression_ctx, std::span<T>(uncompressed), index);
+                    return uncompressed;
+                }
+                return std::vector<T>(this->chunk_elements(index), std::get<T>(chunk_val.value)); // still lazy
+            }
+
+            /// Write the uncompressed elements of the cpu chunk at `index` into `buffer`.
+            ///
+            /// Compressed chunks are decompressed with `decompression_ctx`; chunks that are still lazy
+            /// fill the whole of `buffer` with their stored value instead.
+            /// \throws std::runtime_error if the chunk at `index` is a gpu chunk.
+            void chunk(blosc2::context_raw_ptr decompression_ctx, std::span<T> buffer, size_t index) const override
+            {
+                this->validate_chunk_index(index);
+                if (this->is_gpu_chunk(index))
+                {
+                    throw std::runtime_error(
+                        "Invalid function overload called for lazy_schunk::chunk. The given chunk is not a cpu"
+                        " chunk but a gpu chunk."
+                    );
+                }
+
+                const auto& entry = std::get<cpu_container>(this->m_chunks.at(index));
+                if (!std::holds_alternative<cpu_chunk>(entry.value))
+                {
+                    // Still lazy: there is no compressed payload, so replicate the stored value.
+                    std::fill(
+                        std::execution::par_unseq,
+                        buffer.begin(),
+                        buffer.end(),
+                        std::get<T>(entry.value)
+                    );
+                    return;
+                }
+
+                const auto& payload = std::get<cpu_chunk>(entry.value);
+                blosc2::decompress(decompression_ctx, buffer, std::span<const std::byte>(payload));
+            }
+
+            /// Write the uncompressed elements of the gpu chunk at `index` into `buffer`.
+            ///
+            /// Compressed chunks are decompressed through the compressor created for that chunk; chunks
+            /// that are still lazy fill the whole of `buffer` with their stored value instead.
+            /// \throws std::runtime_error if the chunk at `index` is a cpu chunk.
+            void chunk(std::span<T> buffer, size_t index) const override
+            {
+                this->validate_chunk_index(index);
+                if (!this->is_gpu_chunk(index))
+                {
+                    throw std::runtime_error(
+                        "Invalid function overload called for lazy_schunk::chunk. The given chunk is not a gpu"
+                        " chunk but a cpu chunk."
+                    );
+                }
+
+                const auto& entry = std::get<gpu_container>(this->m_chunks.at(index));
+                if (!std::holds_alternative<gpu_chunk<T>>(entry.value))
+                {
+                    // Still lazy: there is no compressed payload, so replicate the stored value.
+                    std::fill(std::execution::par_unseq, buffer.begin(), buffer.end(), std::get<T>(entry.value));
+                    return;
+                }
+
+                // Compressed: the compressor is created from the stored chunk itself.
+                const auto& payload = std::get<gpu_chunk<T>>(entry.value);
+                auto compressor = cuda::make_compressor<T>(payload);
+                std::visit(
+                    [&](auto& _compressor)
+                    {
+                        _compressor.decompress(payload, std::span<T>(buffer));
+                    },
+                    compressor
+                );
+            }
+
+            /// Compress `uncompressed` with blosc2 and store the result as the cpu chunk at `index`,
+            /// replacing whatever the slot held before (lazy or compressed).
+            void set_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, size_t index) override
+            {
+                this->validate_chunk_index(index);
+
+                // The element count is stored alongside the compressed bytes.
+                this->m_chunks[index] = detail::lazy_chunk<T, cpu_chunk>{
+                    blosc2::compress_to_chunk<T>(compression_ctx, uncompressed),
+                    uncompressed.size()
+                };
+                this->validate_chunk_sizes();
+            }
+
+            /// Compress `uncompressed` with the compressor selected by `compression_ctx.codec` and store the
+            /// result as the gpu chunk at `index`, replacing whatever the slot held before.
+            void set_chunk(cuda::nvcomp_context compression_ctx, std::span<T> uncompressed, size_t index) override
+            {
+                this->validate_chunk_index(index);
+                auto codec_compressor = cuda::make_compressor<T>(compression_ctx.codec);
+                cuda::compressed_chunk<T> gpu_payload{};
+                std::visit(
+                    [&](auto& impl)
+                    {
+                        gpu_payload = impl.compress(uncompressed, compression_ctx);
+                    },
+                    codec_compressor
+                );
+                this->m_chunks[index] = detail::lazy_chunk<T, gpu_chunk<T>>{
+                    std::move(gpu_payload),
+                    uncompressed.size()
+                };
+                this->validate_chunk_sizes();
+            }
+
+            /// Compress `uncompressed` with the compressor selected by `compression_ctx.codec` and append
+            /// the result as a new gpu chunk at the end of the super-chunk.
+            void append_chunk(cuda::nvcomp_context compression_ctx, std::span<const T> uncompressed) override
+            {
+                auto codec_compressor = cuda::make_compressor<T>(compression_ctx.codec);
+                cuda::compressed_chunk<T> gpu_payload{};
+                std::visit(
+                    [&](auto& impl) -> void
+                    {
+                        gpu_payload = impl.compress(uncompressed, compression_ctx);
+                    },
+                    codec_compressor
+                );
+                this->m_chunks.push_back(
+                    detail::lazy_chunk<T, gpu_chunk<T>>{std::move(gpu_payload), uncompressed.size()}
+                );
+                this->validate_chunk_sizes();
+            }
+
+            /// Compress `uncompressed` into the scratch `compression_buff`, then append a copy of the bytes written.
+            void append_chunk(blosc2::context_ptr& compression_ctx,
+                              std::span<T> uncompressed,
+                              std::span<std::byte> compression_buff) override
+            {
+                const auto compressed_size = blosc2::compress<T>(compression_ctx, uncompressed, compression_buff);
+                const auto first = compression_buff.begin();
+                this->m_chunks.push_back(
+                    detail::lazy_chunk<T, cpu_chunk>{cpu_chunk(first, first + compressed_size), uncompressed.size()}
+                );
+                this->validate_chunk_sizes();
+            }
+
+            /// Compress `uncompressed` with whichever backend `compression_ctx` holds and append the result.
+            ///
+            /// A cpu_compression_context compresses through blosc2 and appends a cpu chunk; otherwise the codec
+            /// of the gpu_compression_context selects the compressor and a gpu chunk is appended.
+            void append_chunk(compression_context_var compression_ctx, std::span<T> uncompressed) override
+            {
+                if (!std::holds_alternative<cpu_compression_context>(compression_ctx))
+                {
+                    auto& nvcomp_ctx = std::get<gpu_compression_context>(compression_ctx).ctx;
+                    auto codec_compressor = cuda::make_compressor<T>(nvcomp_ctx.codec);
+                    cuda::compressed_chunk<T> gpu_payload{};
+                    std::visit(
+                        [&](auto& impl)
+                        {
+                            gpu_payload = impl.compress(uncompressed, nvcomp_ctx);
+                        },
+                        codec_compressor
+                    );
+
+                    this->m_chunks.push_back(
+                        detail::lazy_chunk<T, detail::gpu_chunk<T>>{std::move(gpu_payload), uncompressed.size()}
+                    );
+                }
+                else
+                {
+                    auto& blosc_ctx = std::get<cpu_compression_context>(compression_ctx).compression_ctx;
+                    auto compressed = blosc2::compress_to_chunk<T>(blosc_ctx, uncompressed);
+
+                    // The compressed bytes are copied into a fresh cpu_chunk owned by the stored entry.
+                    this->m_chunks.push_back(
+                        detail::lazy_chunk<T, cpu_chunk>{
+                            cpu_chunk(compressed.begin(), compressed.end()),
+                            uncompressed.size()
+                        }
+                    );
+                }
+
+                this->validate_chunk_sizes();
+            }
+
+            /// Retrieve the total compressed size of the lazy-schunk.
+            /// Compressed chunks contribute the size() of their stored payload, while chunks that are still
+            /// lazy only hold a single value and therefore count as sizeof(T).
+            size_t csize() const noexcept override
+            {
+                size_t total = 0;
+                for (size_t idx = 0; idx < this->m_chunks.size(); ++idx)
+                {
+                    const auto& entry = this->m_chunks[idx];
+                    if (this->is_gpu_chunk(idx))
+                    {
+                        const auto& gpu_entry = std::get<gpu_container>(entry);
+                        if (!std::holds_alternative<T>(gpu_entry.value))
+                        {
+                            total += std::get<gpu_chunk<T>>(gpu_entry.value).size();
+                        }
+                        else
+                        {
+                            total += sizeof(T);
+                        }
+                    }
+                    else
+                    {
+                        const auto& cpu_entry = std::get<cpu_container>(entry);
+                        if (!std::holds_alternative<T>(cpu_entry.value))
+                        {
+                            total += std::get<cpu_chunk>(cpu_entry.value).size();
+                        }
+                        else
+                        {
+                            total += sizeof(T);
+                        }
+                    }
+                }
+                return total;
+            }
+
+            /// The total uncompressed size of the lazy-schunk in elements, i.e. the sum of the element
+            /// counts recorded on every chunk (lazy or compressed).
+            size_t size() const noexcept override
+            {
+                size_t total_elements = 0;
+                for (const auto& entry : this->m_chunks)
+                {
+                    // Both container alternatives expose `num_elements`, so one generic visitor suffices.
+                    total_elements += std::visit(
+                        [](const auto& container) -> size_t { return container.num_elements; },
+                        entry
+                    );
+                }
+
+                return total_elements;
+            }
+
+        private:
+            /// Check whether this->m_chunks contain any still-lazy chunks (entries whose value is a plain T).
+            bool has_lazy_chunk() const noexcept
+            {
+                for (const auto& chunk : this->m_chunks) // each entry is a variant over the gpu/cpu containers
+                {
+                    if (std::visit([](const auto& c) { return std::holds_alternative<T>(c.value); }, chunk))
+                    {
+                        return true;
+                    }
+                }
+                return false;
+            }
+
+            /// Get the fill value shared by the lazy chunks. Since lazy chunks are only ever created with a
+            /// single value, the first lazy chunk found with a value other than T{} determines the result.
+            ///
+            /// \return That value, or T{} when there is no lazy chunk or when the lazy chunks themselves
+            ///         hold T{} (the two cases are indistinguishable here and give the same result).
+            T lazy_chunk_value() const noexcept
+            {
+                for (const auto& entry : this->m_chunks)
+                {
+                    // Non-lazy chunks are mapped to T{} so that they can never end the search.
+                    const T candidate = std::visit(
+                        [](const auto& container) -> T
+                        {
+                            return container.is_lazy() ? std::get<T>(container.value) : T{};
+                        },
+                        entry
+                    );
+
+                    if (candidate != T{})
+                    {
+                        return candidate;
+                    }
+                }
+
+                // Nothing but T{} was found.
+                return T{};
+            }
+        };
+    } // detail
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/blosc2/schunk.h b/compressed_image/include/compressed/blosc2/schunk.h
index c090527..fcdc730 100644
--- a/compressed_image/include/compressed/blosc2/schunk.h
+++ b/compressed_image/include/compressed/blosc2/schunk.h
@@ -12,269 +12,358 @@
 #include "wrapper.h"
 #include "schunk_mixin.h"
 
-namespace NAMESPACE_COMPRESSED_IMAGE
-{
+#include "compressed/cuda/compression.h"
 
-	namespace blosc2
-	{
-
-		template <typename T>
-		struct schunk final: public detail::schunk_mixin<T, std::vector<std::byte>>
-		{
-			using detail::schunk_mixin<T, std::vector<std::byte>>::chunk_bytes;
-
-			schunk() = default;
-
-			schunk(schunk&& other) noexcept 
-			{
-				this->m_Chunks = std::move(other.m_Chunks);
-				this->m_ChunkSize = other.m_ChunkSize;
-				this->m_BlockSize = other.m_BlockSize;
-			}
-			schunk& operator=(schunk&& other) noexcept 
-			{
-				if (this != &other) 
-				{
-					this->m_Chunks = std::move(other.m_Chunks);
-					this->m_ChunkSize = other.m_ChunkSize;
-					this->m_BlockSize = other.m_BlockSize;
-				}
-				return *this;
-			}
-			schunk(const schunk& other) = default;
-			schunk& operator=(const schunk& other) = default;
-
-
-			/// Initialize an empty schunk with just a schunk size. The data can then later
-			/// be filled with append_chunk for example.
-			schunk(size_t block_size, size_t chunk_size)
-			{
-				util::validate_chunk_size<T>(chunk_size, "schunk");
-				this->m_ChunkSize = chunk_size;
-				this->m_BlockSize = block_size;
-			}
-
-			/// Initialize a super-chunk from the given vector, compressing it
-			/// 
-			/// \param data The data to store
-			/// \param block_size The requested block size. It is up to the caller to ensure
-			///                   this is appropriately sized
-			/// \param chunk_size The requested chunk size. It is up to the caller to ensure
-			///                   this is appropriately sized (i.e. by using util::align_chunk_to_scanlines)
-			/// \param compression_ctx The compression context to be used for compressing the data.
-			schunk(std::span<const T> data, size_t block_size, size_t chunk_size, blosc2::context_ptr& compression_ctx)
-			{
-				util::validate_chunk_size<T>(chunk_size, "schunk");
-				this->m_BlockSize = block_size;
-				this->m_ChunkSize = chunk_size;
-
-				// Compression buffer we will continuously overwrite in our compression, the chunk data is then copied out
-				// of this on initialization.
-				util::default_init_vector<std::byte> compression_buffer(blosc2::min_compressed_size(chunk_size));
-				auto compression_span = std::span<std::byte>(compression_buffer);
-
-				size_t num_elements = data.size();
-				size_t num_bytes = num_elements * sizeof(T);
-
-				// Calculate all 'full' chunks and the final remainder (if any).
-				size_t num_full_chunks = num_bytes / this->chunk_bytes();
-				size_t remainder_bytes = num_bytes - (this->chunk_bytes() * num_full_chunks);
-
-				size_t data_offset = 0;
-				// Initialize the chunks by compressing them.
-				for ([[maybe_unused]] auto idx : std::views::iota(size_t{ 0 }, num_full_chunks))
-				{
-					auto subspan = std::span<const T>(data.data() + data_offset, this->chunk_elements());
-					auto csize = blosc2::compress<T>(compression_ctx, subspan, compression_span);
-
-					// copy over a new vector containing all the elements from the compression span.
-					this->m_Chunks.push_back(std::vector<std::byte>(compression_span.begin(), compression_span.begin() + csize));
-
-					data_offset += this->chunk_elements();
-				}
-				if (remainder_bytes > 0)
-				{
-					auto subspan = std::span<const T>(data.data() + data_offset, data.size() - data_offset);
-					auto csize = blosc2::compress<T>(compression_ctx, subspan, compression_span);
-
-					// copy over a new vector containing all the elements from the compression span.
-					this->m_Chunks.push_back(std::vector<std::byte>(compression_span.begin(), compression_span.begin() + csize));
-
-					// no need to move over the data_offset.
-				}
-			}
-
-			schunk_ptr to_schunk() override
-			{
-				_COMPRESSED_PROFILE_FUNCTION();
-				blosc2::schunk_ptr schunk = create_default_schunk();
-				for (auto& chunk : this->m_Chunks)
-				{
-					blosc2_schunk_append_chunk(
-						schunk.get(),
-						reinterpret_cast<uint8_t*>(chunk.data()),
-						true // copy, blosc2 will internally at some point do this anyways.
-					);
-				}
-
-				return schunk;
-			}
-
-			std::vector<T> to_uncompressed(blosc2::context_ptr& decompression_ctx) const override
-			{
-				_COMPRESSED_PROFILE_FUNCTION();
-				auto num_elems = this->size();
-				std::vector<T> data(num_elems);
-
-				size_t data_offset = 0;
-				for (auto idx : std::views::iota(size_t{ 0 }, this->m_Chunks.size()))
-				{
-					size_t chunk_elems = this->chunk_elements(idx);
-
-					auto subspan = std::span<T>(data.data() + data_offset, chunk_elems);
-					this->chunk(decompression_ctx, subspan, idx);
-
-					data_offset += chunk_elems;
-				}
-
-				return data;
-			}
-
-			std::vector<T> chunk(blosc2::context_ptr& decompression_ctx, size_t index) const override
-			{
-				return this->chunk(decompression_ctx.get(), index);
-			}
-
-			std::vector<T> chunk(blosc2::context_raw_ptr decompression_ctx, size_t index) const override
-			{
-				this->validate_chunk_index(index);
-
-				std::vector<T> decompressed(this->chunk_elements(index));
-				auto chunk_span = std::span<const std::byte>(this->m_Chunks[index].begin(), this->m_Chunks[index].end());
-				blosc2::decompress(decompression_ctx, std::span<T>(decompressed), chunk_span);
-
-				return std::move(decompressed);
-			}
-
-			void chunk(blosc2::context_ptr& decompression_ctx, std::span<T> buffer, size_t index) const override
-			{
-				this->chunk(decompression_ctx.get(), buffer, index);
-			}
-
-			void chunk(blosc2::context_raw_ptr decompression_ctx, std::span<T> buffer, size_t index) const override
-			{
-				this->validate_chunk_index(index);
-
-				if (buffer.size() < this->chunk_elements(index))
-				{
-					throw std::invalid_argument(
-						std::format(
-							"Unable to decompress chunk at idx {} into buffer as the buffer needs to at least have the size {:L}."
-							" Instead got {:L}", index, this->chunk_elements(index), buffer.size()
-						)
-					);
-				}
-
-				auto chunk_span = std::span<const std::byte>(this->m_Chunks[index].begin(), this->m_Chunks[index].end());
-				blosc2::decompress(decompression_ctx, std::span<T>(buffer), chunk_span);
-			}
-
-			void set_chunk(std::vector<std::byte> compressed, size_t index) override
-			{
-				this->validate_chunk_index(index);
-				this->m_Chunks[index] = std::move(compressed);
-				this->validate_chunk_sizes();
-			}
-
-			void set_chunk(std::span<const std::byte> compressed, size_t index) override
-			{
-				this->validate_chunk_index(index);
-				this->m_Chunks[index] = std::vector<std::byte>(compressed.begin(), compressed.end());
-				this->validate_chunk_sizes();
-			}
-
-			void set_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, size_t index) override
-			{
-				this->validate_chunk_index(index);
-
-				util::default_init_vector<std::byte> compression_buffer(blosc2::min_compressed_size(this->chunk_bytes()));
-				std::span<std::byte> compression_span(compression_buffer);
-
-				auto csize = blosc2::compress<T>(compression_ctx, uncompressed, compression_span);
-
-				// copy over a new vector containing all the elements from the compression span.
-				this->m_Chunks[index] = std::vector<std::byte>(compression_span.begin(), compression_span.begin() + csize);
-				this->validate_chunk_sizes();
-			}
-
-			/// Append to the schunk with the uncompressed data (compressing it).
-			///
-			/// \param compressed the compressed chunk
-			void append_chunk(std::vector<std::byte> compressed) override
-			{
-				this->m_Chunks.push_back(std::move(compressed));
-				this->validate_chunk_sizes();
-			};
-
-			/// Append to the schunk with the uncompressed data (compressing it).
-			///
-			/// \param compression_ctx the compression context to use for compression.
-			/// \param uncompressed the uncompressed chunk
-			void append_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed) override
-			{
-				util::default_init_vector<std::byte> compression_buffer(blosc2::min_compressed_size(this->chunk_bytes()));
-				std::span<std::byte> compression_span(compression_buffer);
-				this->append_chunk(compression_ctx, uncompressed, compression_span);
-				this->validate_chunk_sizes();
-			};
-
-			void append_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, std::span<std::byte> compression_buff) override
-			{
-				if (compression_buff.size() < blosc2::min_compressed_size(this->chunk_bytes()))
-				{
-					throw std::runtime_error(
-						std::format(
-							"Error while appending chunk to super-chunk. Expected compression buffer to be at least"
-							" {:L} bytes but instead we got {:L} bytes", blosc2::min_compressed_size(this->chunk_bytes()),
-							compression_buff.size()
-						)
-					);
-				}
-				auto csize = blosc2::compress<T>(compression_ctx, uncompressed, compression_buff);
-				assert(csize <= compression_buff.size());
-				// copy over a new vector containing all the elements from the compression span.
-				this->m_Chunks.push_back(std::vector<std::byte>(compression_buff.begin(), compression_buff.begin() + csize));
-				this->validate_chunk_sizes();
-			}
-
-			size_t chunk_bytes(size_t index) const override
-			{
-				return blosc2::chunk_num_elements<T>(this->m_Chunks[index]) * sizeof(T);
-			}
-
-			/// The total compressed size of the schunk
-			virtual size_t csize() const noexcept override
-			{
-				size_t _size = 0;
-				for (const auto& chunk : this->m_Chunks)
-				{
-					_size += chunk.size();
-				}
-				return _size;
-			};
-
-			size_t size() const noexcept override
-			{
-				size_t _size = 0;
-				for (const auto& chunk : this->m_Chunks)
-				{
-					_size += blosc2::chunk_num_elements<T>(chunk);
-				}
-				return _size;
-			};
-
-		};
-
-	} // blosc2
-
-} // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace detail
+    {
+        template <typename T>
+        struct schunk final :
+            public detail::schunk_mixin<T>
+        {
+            using detail::schunk_mixin<T>::gpu_container;
+            using detail::schunk_mixin<T>::cpu_container;
+
+            using detail::schunk_mixin<T>::chunk_bytes;
+            using detail::schunk_mixin<T>::chunk;
+            using detail::schunk_mixin<T>::is_gpu_chunk;
+            using detail::schunk_mixin<T>::to_uncompressed;
+
+            schunk() = default;
+
+            schunk(schunk&& other) noexcept // takes over the chunks, copies the size settings
+            {
+                this->m_block_size = other.m_block_size;
+                this->m_chunk_size = other.m_chunk_size;
+                this->m_chunks = std::move(other.m_chunks); // `other.m_chunks` is left moved-from
+            }
+
+            schunk& operator=(schunk&& other) noexcept // no-op when assigned to itself
+            {
+                if (&other != this)
+                {
+                    this->m_block_size = other.m_block_size;
+                    this->m_chunk_size = other.m_chunk_size;
+                    this->m_chunks = std::move(other.m_chunks); // `other.m_chunks` is left moved-from
+                }
+                return *this;
+            }
+
+            schunk(const schunk& other) = default;
+            schunk& operator=(const schunk& other) = default;
+
+
+            /// Initialize an empty schunk that only records its block and chunk size (the chunk size is
+            /// validated first). The data can then later be filled with append_chunk for example.
+            schunk(size_t block_size, size_t chunk_size)
+            {
+                util::validate_chunk_size<T>(chunk_size, "schunk");
+                this->m_block_size = block_size;
+                this->m_chunk_size = chunk_size;
+            }
+
+            /// Initialize a super-chunk from `data`, compressing it chunk by chunk (the last chunk may be smaller).
+            ///
+            /// \param data The uncompressed elements to store
+            /// \param block_size The requested block size. It is up to the caller to ensure
+            ///                   this is appropriately sized
+            /// \param chunk_size The requested chunk size. It is up to the caller to ensure
+            ///                   this is appropriately sized (i.e., by using util::align_chunk_to_scanlines)
+            /// \param compression_ctx The compression context to be used for compressing the data. A gpu context
+            ///                        compresses via append_chunk, otherwise the cpu context is used with blosc2.
+            schunk(std::span<const T> data,
+                   size_t block_size,
+                   size_t chunk_size,
+                   compression_context_var compression_ctx)
+            {
+                util::validate_chunk_size<T>(chunk_size, "schunk");
+                this->m_block_size = block_size;
+                this->m_chunk_size = chunk_size;
+
+                const size_t num_elements = data.size();
+                const size_t num_bytes = num_elements * sizeof(T);
+
+                // Split the input into 'full' chunks plus a final, smaller remainder chunk (if any).
+                const size_t num_full_chunks = num_bytes / this->chunk_bytes();
+                const size_t remainder_bytes = num_bytes - (this->chunk_bytes() * num_full_chunks);
+
+                // When compressing using gpu compression, we don't allocate a scratch buffer on the cpu as we internally
+                // use a memory-pool on the gpu that we reuse between compressions, making allocations quite cheap.
+                if (std::holds_alternative<gpu_compression_context>(compression_ctx))
+                {
+                    size_t data_offset = 0;
+
+                    for ([[maybe_unused]] auto idx : std::views::iota(size_t{0}, num_full_chunks))
+                    {
+                        auto subspan = std::span<const T>(data.data() + data_offset, this->chunk_elements());
+                        this->append_chunk(std::get<gpu_compression_context>(compression_ctx).ctx, subspan);
+
+                        data_offset += this->chunk_elements();
+                    }
+                    if (remainder_bytes > 0)
+                    {
+                        auto subspan = std::span<const T>(data.data() + data_offset, data.size() - data_offset);
+
+                        this->append_chunk(std::get<gpu_compression_context>(compression_ctx).ctx, subspan);
+                        // no need to move over the data_offset, this was the last chunk.
+                    }
+                }
+                else
+                {
+                    // Scratch buffer that every compression call below overwrites; the compressed bytes are copied
+                    // out of it into their own chunk right after each call.
+                    util::default_init_vector<std::byte> compression_buffer(blosc2::min_compressed_size(chunk_size));
+                    auto compression_span = std::span<std::byte>(compression_buffer);
+
+                    size_t data_offset = 0;
+                    // Compress every full chunk, advancing through `data` one chunk at a time.
+                    for ([[maybe_unused]] auto idx : std::views::iota(size_t{0}, num_full_chunks))
+                    {
+                        auto subspan = std::span<const T>(data.data() + data_offset, this->chunk_elements());
+                        auto csize = blosc2::compress<T>(
+                            std::get<cpu_compression_context>(compression_ctx).compression_ctx.get(),
+                            subspan,
+                            compression_span
+                        );
+
+                        // store a new vector holding only the first `csize` bytes of the compression span.
+                        this->m_chunks.push_back(
+                            util::default_init_vector<std::byte>(
+                                compression_span.begin(),
+                                compression_span.begin() + csize
+                            )
+                        );
+
+                        data_offset += this->chunk_elements();
+                    }
+                    if (remainder_bytes > 0)
+                    {
+                        auto subspan = std::span<const T>(data.data() + data_offset, data.size() - data_offset);
+                        auto csize = blosc2::compress<T>(
+                            std::get<cpu_compression_context>(compression_ctx).compression_ctx.get(),
+                            subspan,
+                            compression_span
+                        );
+
+                        // store a new vector holding only the first `csize` bytes of the compression span.
+                        this->m_chunks.push_back(
+                            util::default_init_vector<std::byte>(
+                                compression_span.begin(),
+                                compression_span.begin() + csize
+                            )
+                        );
+
+                        // no need to move over the data_offset, this was the last chunk.
+                    }
+                }
+            }
+
+
+            void chunk(std::span<T> buffer, size_t index) const override
+            {
+                this->validate_chunk_index(index);
+                const bool is_cpu_chunk = !this->is_gpu_chunk(index);
+                if (is_cpu_chunk)
+                {
+                    throw std::runtime_error(
+                        "Invalid function overload called for schunk::chunk. The given chunk is not a gpu"
+                        " chunk but a cpu chunk."
+                    );
+                }
+                const auto& gpu_data = std::get<gpu_container>(this->m_chunks.at(index));
+                auto decompressor = cuda::make_compressor<T>(gpu_data);
+                // Dispatch on the concrete compressor alternative and let it decompress into `buffer`.
+                const auto decompress_into_buffer = [&](auto& impl)
+                {
+                    impl.decompress(gpu_data, std::span<T>(buffer));
+                };
+                std::visit(decompress_into_buffer, decompressor);
+            }
+
+            void chunk(blosc2::context_raw_ptr decompression_ctx, std::span<T> buffer, size_t index) const override
+            {
+                this->validate_chunk_index(index);
+                if (this->is_gpu_chunk(index))
+                {
+                    throw std::runtime_error(
+                        "Invalid function overload called for schunk::chunk. The given chunk is not a cpu"
+                        " chunk but a gpu chunk."
+                    );
+                }
+                // The destination must be able to take every element of the requested chunk.
+                const size_t required_elems = this->chunk_elements(index);
+                if (buffer.size() < required_elems)
+                {
+                    throw std::invalid_argument(
+                        std::format(
+                            "Unable to decompress chunk at idx {} into buffer as the buffer needs to at least have the size {:L}."
+                            " Instead got {:L}",
+                            index,
+                            required_elems,
+                            buffer.size()
+                        )
+                    );
+                }
+                const auto& compressed = std::get<cpu_container>(this->m_chunks.at(index));
+                std::span<const std::byte> compressed_view(compressed.begin(), compressed.end());
+                blosc2::decompress(decompression_ctx, std::span<T>(buffer), compressed_view);
+            }
+
+            void set_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, size_t index) override
+            {
+                this->validate_chunk_index(index);
+
+                // Compress first; the slot at `index` is only replaced after compress_to_chunk returned.
+                auto replacement = blosc2::compress_to_chunk<T>(compression_ctx, uncompressed);
+                this->m_chunks[index] = std::move(replacement);
+
+                this->validate_chunk_sizes();
+            }
+
+            void set_chunk(cuda::nvcomp_context compression_ctx, std::span<T> uncompressed, size_t index) override
+            {
+                this->validate_chunk_index(index);
+
+                // Compress into a temporary; the slot at `index` is only replaced after compression returned.
+                auto codec_compressor = cuda::make_compressor<T>(compression_ctx.codec);
+                cuda::compressed_chunk<T> gpu_result{};
+                const auto run_compression = [&](auto& impl)
+                {
+                    gpu_result = impl.compress(uncompressed, compression_ctx);
+                };
+                std::visit(run_compression, codec_compressor);
+
+                this->m_chunks[index] = std::move(gpu_result);
+                // Re-check the chunk size invariants now that a chunk got swapped out.
+                this->validate_chunk_sizes();
+            }
+
+            void append_chunk(blosc2::context_ptr& compression_ctx,
+                              std::span<T> uncompressed,
+                              std::span<std::byte> compression_buff) override
+            {
+                const auto required_bytes = blosc2::min_compressed_size(this->chunk_bytes());
+                if (compression_buff.size() < required_bytes)
+                {
+                    throw std::runtime_error(
+                        std::format(
+                            "Error while appending chunk to super-chunk. Expected compression buffer to be at least"
+                            " {:L} bytes but instead we got {:L} bytes",
+                            required_bytes, compression_buff.size()
+                        )
+                    );
+                }
+                auto compressed_bytes = blosc2::compress<T>(compression_ctx, uncompressed, compression_buff);
+                assert(compressed_bytes <= compression_buff.size());
+                // Only the first `compressed_bytes` bytes of the scratch buffer are copied into the new chunk.
+                this->m_chunks.push_back(cpu_chunk(compression_buff.begin(), compression_buff.begin() + compressed_bytes));
+                this->validate_chunk_sizes();
+            }
+
+            void append_chunk(cuda::nvcomp_context compression_ctx, std::span<const T> uncompressed) override
+            {
+                // The codec stored on the context selects which compressor gets created.
+                auto codec_compressor = cuda::make_compressor<T>(compression_ctx.codec);
+
+                cuda::compressed_chunk<T> gpu_result{};
+                const auto run_compression = [&](auto& impl)
+                {
+                    gpu_result = impl.compress(uncompressed, compression_ctx);
+                };
+                std::visit(run_compression, codec_compressor);
+
+                // Store the compressed chunk at the end, then re-check the chunk size invariants.
+                this->m_chunks.push_back(std::move(gpu_result));
+                this->validate_chunk_sizes();
+            }
+
+            void append_chunk(compression_context_var compression_ctx, std::span<T> uncompressed) override
+            {
+                // Pick the compression path from the alternative the context currently holds.
+                if (auto* cpu_ctx = std::get_if<cpu_compression_context>(&compression_ctx))
+                {
+                    // cpu path: compress_to_chunk returns the finished chunk, which is stored as-is.
+                    auto cpu_result = blosc2::compress_to_chunk<T>(
+                        cpu_ctx->compression_ctx,
+                        uncompressed
+                    );
+                    this->m_chunks.push_back(std::move(cpu_result));
+                }
+                else
+                {
+                    // Every other alternative is accessed as a gpu context.
+                    auto& gpu_ctx = std::get<gpu_compression_context>(compression_ctx).ctx;
+                    auto codec_compressor = cuda::make_compressor<T>(gpu_ctx.codec);
+
+                    cuda::compressed_chunk<T> gpu_result{};
+                    const auto run_compression = [&](auto& impl)
+                    {
+                        gpu_result = impl.compress(uncompressed, gpu_ctx);
+                    };
+                    std::visit(run_compression, codec_compressor);
+
+                    this->m_chunks.push_back(std::move(gpu_result));
+                }
+
+                // Both paths appended exactly one chunk; make sure the size invariants still hold.
+                this->validate_chunk_sizes();
+            };
+
+            /// Byte size of the chunk at `index`; gpu chunks report it themselves, cpu chunks via their element count.
+            size_t chunk_bytes(size_t index) const override
+            {
+                if (this->is_gpu_chunk(index))
+                {
+                    return std::get<gpu_container>(this->m_chunks.at(index)).byte_size();
+                }
+                const auto& cpu_data = std::get<cpu_container>(this->m_chunks.at(index));
+                return blosc2::chunk_num_elements<T>(cpu_data) * sizeof(T);
+            }
+
+            /// The total compressed size of the schunk
+            ///
+            /// \returns the sum of the compressed sizes over all gpu and cpu chunks.
+            size_t csize() const noexcept override
+            {
+                size_t total = 0;
+                // Inspect each stored alternative directly instead of re-indexing through the throwing accessors.
+                for (const auto& stored : this->m_chunks)
+                {
+                    if (const auto* gpu_data = std::get_if<gpu_container>(&stored))
+                    {
+                        total += gpu_data->csize();
+                    }
+                    else
+                    {
+                        // Anything that is not a gpu chunk is accessed as a cpu container.
+                        total += std::get<cpu_container>(stored).size();
+                    }
+                }
+                return total;
+            };
+
+            /// The total number of uncompressed elements held by the schunk, summed over all chunks.
+            [[nodiscard]] size_t size() const noexcept override
+            {
+                size_t total = 0;
+                // Inspect each stored alternative directly instead of re-indexing through the throwing accessors.
+                for (const auto& stored : this->m_chunks)
+                {
+                    if (const auto* gpu_data = std::get_if<gpu_container>(&stored))
+                    {
+                        total += gpu_data->size();
+                    }
+                    else
+                    {
+                        // cpu chunks: the element count is read through blosc2::chunk_num_elements.
+                        total += blosc2::chunk_num_elements<T>(std::get<cpu_container>(stored));
+                    }
+                }
+
+                return total;
+            };
+        };
+    } // detail
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/blosc2/schunk_mixin.h b/compressed_image/include/compressed/blosc2/schunk_mixin.h
index b72e512..563089c 100644
--- a/compressed_image/include/compressed/blosc2/schunk_mixin.h
+++ b/compressed_image/include/compressed/blosc2/schunk_mixin.h
@@ -7,264 +7,431 @@
 #include "compressed/macros.h"
 #include "wrapper.h"
 #include "compressed/constants.h"
+#include "compressed/context.h"
+#include "compressed/cuda/compressors/base.h"
 
-namespace NAMESPACE_COMPRESSED_IMAGE
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-	namespace blosc2
-	{
-
-		namespace detail
-		{
-
-			/// Opaque mixin around a blosc2 super-chunk with the intention of not using a `blosc2_schunk`
-			/// itself but instead of using it directly the chunks should be stored individually.
-			/// Subclassed by either a `schunk` or a `lazy_schunk` depending on the needs of the 
-			/// consumer.
-			template <typename T, typename ContainerType>
-			struct schunk_mixin
-			{
-				virtual ~schunk_mixin() = default;
-
-				/// convert the struct into a blosc2 schunk.
-				virtual blosc2::schunk_ptr to_schunk() = 0;
-
-				/// Generate an uncompressed vector from all of the chunks.
-				///
-				/// \param decompression_ctx the decompression context pr.
-				/// 
-				/// \returns a contiguous vector representing the uncompressed schunk.
-				virtual std::vector<T> to_uncompressed(blosc2::context_ptr& decompression_ctx) const = 0;
-
-				/// Retrieve the uncompressed chunk at `index`.
-				///
-				/// \param decompression_ctx the decompression context ptr
-				/// \param index the index of the chunk within the schunk.
-				/// 
-				/// \throws std::out_of_range if the index is not valid
-				virtual std::vector<T> chunk(blosc2::context_ptr& decompression_ctx, size_t index) const = 0;
-
-				/// Retrieve the uncompressed chunk at `index`.
-				///
-				/// \param decompression_ctx the decompression context ptr
-				/// \param index the index of the chunk within the schunk.
-				/// 
-				/// \throws std::out_of_range if the index is not valid
-				virtual std::vector<T> chunk(blosc2::context_raw_ptr decompression_cx, size_t index) const = 0;
-
-				/// Retrieve the uncompressed chunk at `index`.
-				///
-				/// \param decompression_ctx the decompression context ptr
-				/// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size.
-				/// \param index the index of the chunk within the schunk.
-				/// 
-				/// \throws std::out_of_range if the index is not valid
-				virtual void chunk(blosc2::context_ptr& decompression_ctx, std::span<T> buffer, size_t index) const = 0;
-
-				/// Retrieve the uncompressed chunk at `index`.
-				///
-				/// \param decompression_ctx the decompression context ptr
-				/// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size.
-				/// \param index the index of the chunk within the schunk.
-				/// 
-				/// \throws std::out_of_range if the index is not valid
-				virtual void chunk(blosc2::context_raw_ptr decompression_ctx, std::span<T> buffer, size_t index) const = 0;
-
-				/// Set the chunk at `index` to the compressed data.
-				///
-				/// \param compressed the compressed chunk
-				/// \param index the index of the chunk within the schunk.
-				/// 
-				/// \throws std::out_of_range if the index is not valid
-				virtual void set_chunk(std::vector<std::byte> compressed, size_t index) = 0;
-
-				/// Set the chunk at `index` to the compressed data.
-				///
-				/// \param compressed the compressed chunk
-				/// \param index the index of the chunk within the schunk.
-				/// 
-				/// \throws std::out_of_range if the index is not valid
-				virtual void set_chunk(std::span<const std::byte> compressed, size_t index) = 0;
-
-				/// Set the chunk at `index` to the uncompressed data (compressing it).
-				///
-				/// \param compression_ctx the compression context to use for compression.
-				/// \param uncompressed the uncompressed chunk
-				/// \param index the index of the chunk within the schunk.
-				/// 
-				/// \throws std::out_of_range if the index is not valid
-				virtual void set_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, size_t index) = 0;
-
-				/// Append to the schunk with the uncompressed data (compressing it).
-				///
-				/// \param compressed the compressed chunk
-				virtual void append_chunk(std::vector<std::byte> compressed) = 0;
-
-				/// Append to the schunk with the uncompressed data (compressing it).
-				///
-				/// \param compression_ctx the compression context to use for compression.
-				/// \param uncompressed the uncompressed chunk
-				virtual void append_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed) = 0;
-
-				/// Append to the schunk with the uncompressed data (compressing it).
-				///
-				/// \param compression_ctx the compression context to use for compression.
-				/// \param uncompressed the uncompressed chunk
-				/// \param compression_buff the compression buffer to use for temporary storage.
-				virtual void append_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, std::span<std::byte> compression_buff) = 0;
-
-				/// Retrieve the number of elements (uncompressed) that the schunk stores.
-				///
-				/// \throws std::runtime_error if the chunk_bytes / sizeof(T) is not cleanly divisble
-				size_t chunk_elements() const
-				{
-					auto _size =  this->chunk_bytes();
-					if (_size % sizeof(T) != 0)
-					{
-						throw std::runtime_error(
-							std::format(
-								"Internal Error: The chunk byte size is not cleanly divisible by the sizeof T." 
-								" Chunk size is {:L} while sizeof(T) is {}", _size, sizeof(T)
-							)
-						);
-					}
-					return _size / sizeof(T);
-				};
-
-				/// Retrieve the number of elements (uncompressed) that the schunk stores at a given chunk.
-				/// In all cases except for chunk_elements(num_chunks() - 1) this will return chunk_elements.
-				///
-				/// \throws std::out_of_range if the index is not valid in the super-chunk.
-				/// \throws std::runtime_error if the chunk_bytes / sizeof(T) is not cleanly divisble
-				size_t chunk_elements(size_t index) const
-				{
-					auto _size = this->chunk_bytes(index);
-					if (_size % sizeof(T) != 0)
-					{
-						throw std::runtime_error(
-							std::format(
-								"Internal Error: The chunk byte size is not cleanly divisible by the sizeof T."
-								" Chunk size is {:L} while sizeof(T) is {}", _size, sizeof(T)
-							)
-						);
-					}
-					return _size / sizeof(T);
-				};
-
-				/// Retrieve the number of bytes stored by the super-chunk per-chunk. This will be equivalent
-				/// to the number of uncompressed bytes stored by each chunk up to num_chunks() - 1.
-				/// The last chunk may be smaller (but not bigger) in size than this value.
-				size_t chunk_bytes() const
-				{
-					return this->m_ChunkSize;
-				};
-				
-				/// Retrieve the number of bytes stored by the chunk at index `index`. This will be equivalent to 
-				/// chunk_bytes unless it is the last chunk in which case it may be smaller.
-				/// 
-				/// \throws std::out_of_range if the index is not valid in the super-chunk.
-				virtual size_t chunk_bytes(size_t index) const = 0;
-
-				/// The number of chunks in the super-chunk
-				size_t num_chunks() const noexcept
-				{
-					return m_Chunks.size();
-				}
-
-				/// The total compressed size of the schunk in bytes
-				virtual size_t csize() const noexcept = 0;
-
-				/// The total uncompressed size of the schunk in elements
-				virtual size_t size() const noexcept = 0;
-
-				/// The total number of bytes stored in the schunk when uncompressed.
-				/// equivalent to size() * sizeof(T)
-				size_t byte_size() const noexcept
-				{
-					return size() * sizeof(T);
-				}
-
-				size_t max_chunk_size()
-				{
-					return m_ChunkSize;
-				}
-
-				size_t max_block_size()
-				{
-					return m_BlockSize;
-				}
-
-			protected:
-				std::vector<ContainerType> m_Chunks{};
-				/// The maximum size a chunk is constrained to, in bytes. This will dictate the size of all chunks from
-				///  0 - (this->m_Chunks.size() - 1). The last chunk may be any other size smaller than or equal to this value.
-				size_t m_ChunkSize = s_default_chunksize;
-				size_t m_BlockSize = s_default_blocksize;
-
-				/// Validate the chunk index throwing a std::out_of_range if the index is not valid.
-				void validate_chunk_index(size_t index) const
-				{
-					if (index > m_Chunks.size() - 1)
-					{
-						throw std::out_of_range(
-							std::format("Cannot access index {} in schunk. Total amount of chunks is {}", index, m_Chunks.size())
-						);
-					}
-				}
-
-				/// Validate all the chunk sizes currently held by the super-chunk. This function
-				/// ensures that the chunks 
-				void validate_chunk_sizes() const
-				{
-					// Check that all chunks barring the last one are equal to m_ChunkSize
-					for (auto i : std::views::iota(size_t{ 0 }, this->num_chunks() - 1))
-					{
-						if (this->chunk_bytes(i) != this->chunk_bytes())
-						{
-							throw std::invalid_argument(
-								std::format(
-									"Error while validating chunk sizes; Expected all chunks to have a size equivalent to {:L} (m_ChunkSize)."
-									" However, chunk {} instead has a chunk size of {:L}. Having a size different from the rest of the chunks"
-									" is only supported for the last chunk (blosc2 limitation). Please ensure that all chunks are equally sized"
-									" when modifying the super-chunk (excluding the last one).",
-									this->chunk_bytes(), i, this->chunk_bytes(i)
-								)
-							);
-						}
-					}
-					
-					// Check that the last chunk is not larger than the rest.
-					if (this->chunk_bytes(this->num_chunks() - 1) > this->chunk_bytes())
-					{
-						throw std::runtime_error(
-							std::format(
-								"Error while validating chunk sizes; Expected the last chunk to be at most {:L} bytes,"
-								" instead got {:L} bytes.",
-								this->chunk_bytes(), this->chunk_bytes(this->num_chunks() - 1)
-							)
-						);
-					}
-				}
-
-				/// Get the buffer size for T for the given byte size. Checks that the buffer
-				/// can be divided cleanly by sizeof(T).
-				size_t get_T_buffer_size(size_t byte_size) const
-				{
-					if (byte_size % sizeof(T) != 0)
-					{
-						throw std::runtime_error(
-							std::format(
-								"Cannot get buffer size for type T of size {} because it is not evenly divisible for buffer size {:L}",
-								sizeof(T),
-								byte_size
-							)
-						);
-					}
-					return byte_size / sizeof(T);
-				}
-			};
-
-		} // detail
-
-	} // blosc2
-
-} // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
+    namespace detail
+    {
+        /// \brief The default storage class for a gpu compressed chunk (alias of cuda::compressed_chunk).
+        ///
+        /// \note this chunk may not live on the gpu, this just indicates it was generated
+        ///       on the gpu.
+        template <typename T>
+        using gpu_chunk = cuda::compressed_chunk<T>;
+
+        /// \brief The default storage class for a cpu compressed chunk (a vector of bytes).
+        using cpu_chunk = util::default_init_vector<std::byte>;
+
+        /// Mixin for representing a blosc2-style super-chunk for both cpu and gpu chunks.
+        ///
+        /// \tparam _gpu_container_type The type for a gpu compressed chunk
+        /// \tparam _cpu_container_type The type for a cpu compressed chunk
+        template <typename T, typename _gpu_container_type = gpu_chunk<T>, typename _cpu_container_type = cpu_chunk>
+        struct schunk_mixin
+        {
+            using gpu_container = _gpu_container_type;
+            using cpu_container = _cpu_container_type;
+
+            virtual ~schunk_mixin() = default;
+
+            /// Checks whether the chunk at `index` is a gpu chunk (as opposed to a cpu chunk).
+            ///
+            /// \param index The chunk index
+            /// \returns true if the chunk at `index` holds the gpu container type.
+            /// \throws std::runtime_error if the chunk index is not valid
+            bool is_gpu_chunk(size_t index) const
+            {
+                // Not `size() - 1`: that wraps around for an empty schunk and lets every index pass.
+                if (index >= m_chunks.size())
+                {
+                    throw std::runtime_error(
+                        std::format(
+                            "Invalid chunk index {}, the schunk only holds {} chunks",
+                            index,
+                            m_chunks.size()
+                        )
+                    );
+                }
+                return std::holds_alternative<_gpu_container_type>(m_chunks.at(index));
+            };
+
+            /// Decompress every chunk, in order, into one contiguous vector.
+            ///
+            /// \param cpu_ctx the context for all cpu based chunks; both its contexts must be valid if a cpu chunk exists.
+            /// \param gpu_ctx the decompression context for all gpu based chunks (not read by this implementation).
+            ///
+            /// \returns a contiguous vector representing the uncompressed schunk.
+            virtual std::vector<T> to_uncompressed(
+                cpu_compression_context& cpu_ctx,
+                [[maybe_unused]] gpu_compression_context gpu_ctx
+            ) const
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                std::vector<T> result(this->size());
+
+                // Position (in elements) inside `result` where the next chunk gets written.
+                size_t write_pos = 0;
+                for (size_t chunk_idx = 0; chunk_idx < this->m_chunks.size(); ++chunk_idx)
+                {
+                    const size_t elem_count = this->chunk_elements(chunk_idx);
+                    auto destination = std::span<T>(result.data() + write_pos, elem_count);
+                    if (this->is_gpu_chunk(chunk_idx))
+                    {
+                        this->chunk(destination, chunk_idx);
+                    }
+                    else
+                    {
+                        // cpu chunks are only decompressed when both cpu contexts are available.
+                        const bool contexts_ready = cpu_ctx.decompression_ctx && cpu_ctx.compression_ctx;
+                        if (!contexts_ready)
+                        {
+                            throw std::invalid_argument(
+                                std::format(
+                                    "Chunk {}: valid cpu decompression and compression contexts must be provided"
+                                    " for cpu chunks",
+                                    chunk_idx
+                                )
+                            );
+                        }
+
+                        this->chunk(cpu_ctx.decompression_ctx.get(), destination, chunk_idx);
+                    }
+
+                    write_pos += elem_count;
+                }
+
+                return result;
+            };
+
+            /// Decompress every chunk into one contiguous vector using only a cpu context.
+            ///
+            /// This overload may only be called if the schunk contains no gpu chunks; this is checked for all
+            /// chunks up front.
+            ///
+            /// \param context the decompression context for the chunks
+            /// \throws std::runtime_error if the schunk contains one or more gpu chunks.
+            /// \returns a contiguous vector representing the uncompressed schunk.
+            std::vector<T> to_uncompressed(cpu_compression_context& context) const
+            {
+                for (const auto chunk_idx : std::views::iota(size_t{0}, this->num_chunks()))
+                {
+                    if (!this->is_gpu_chunk(chunk_idx))
+                    {
+                        continue;
+                    }
+                    throw std::runtime_error(
+                        std::format(
+                            "Invalid overload of 'to_uncompressed' called. This overload may only be called if"
+                            " there are no GPU chunks. However, at least chunk {} is a gpu chunk. Please pass"
+                            " an explicit GPU decompressor.",
+                            chunk_idx
+                        )
+                    );
+                }
+                return this->to_uncompressed(context, gpu_compression_context{cuda::nvcomp_context{}});
+            }
+
+            /// Decompress every chunk into one contiguous vector using only a gpu context.
+            ///
+            /// This overload may only be called if the schunk contains no cpu chunks; this is checked for all
+            /// chunks up front.
+            ///
+            /// \param context the decompression context for the chunks
+            /// \throws std::runtime_error if the schunk contains one or more cpu chunks.
+            /// \returns a contiguous vector representing the uncompressed schunk.
+            std::vector<T> to_uncompressed(gpu_compression_context context) const
+            {
+                for (const auto chunk_idx : std::views::iota(size_t{0}, this->num_chunks()))
+                {
+                    if (!this->is_gpu_chunk(chunk_idx))
+                    {
+                        throw std::runtime_error(
+                            std::format(
+                                "Invalid overload of 'to_uncompressed' called. This overload may only be called if"
+                                " there are no CPU chunks. However, at least chunk {} is a cpu chunk. Please pass"
+                                " an explicit CPU decompressor.",
+                                chunk_idx
+                            )
+                        );
+                    }
+                }
+                // Placeholder cpu context: the loop above established that no cpu chunk is stored.
+                cpu_compression_context unused_cpu_ctx{};
+                return this->to_uncompressed(unused_cpu_ctx, context);
+            }
+
+            /// Retrieve the uncompressed chunk at `index` as a new vector.
+            ///
+            /// \param context the cpu context; its decompression context is forwarded to the raw-pointer overload
+            /// \param index the index of the chunk within the schunk.
+            /// \throws std::out_of_range if the index is not valid
+            virtual std::vector<T> chunk(cpu_compression_context& context, const size_t index) const
+            {
+                const auto raw_decompression_ctx = context.decompression_ctx.get();
+                return this->chunk(raw_decompression_ctx, index);
+            };
+
+            /// Retrieve the uncompressed gpu chunk at `index` as a new vector.
+            ///
+            /// \param context the decompression context (unused, the buffer-filling gpu overload takes none)
+            /// \param index the index of the chunk within the schunk.
+            /// \throws std::out_of_range if the index is not valid
+            virtual std::vector<T> chunk([[maybe_unused]] const cuda::nvcomp_context context, const size_t index) const
+            {
+                // Size the buffer for this chunk only and delegate to the buffer-filling gpu overload.
+                std::vector<T> buffer(this->chunk_elements(index));
+                this->chunk(std::span<T>(buffer), index);
+                return buffer;
+            };
+
+            /// Retrieve the uncompressed chunk at `index` as a newly allocated vector.
+            ///
+            /// \param decompression_ctx the decompression context ptr
+            /// \param index the index of the chunk within the schunk.
+            /// \throws std::out_of_range if the index is not valid
+            virtual std::vector<T> chunk(blosc2::context_raw_ptr decompression_ctx, size_t index) const
+            {
+                // Allocate exactly as many elements as this chunk holds, then decompress straight into them.
+                std::vector<T> decompressed(this->chunk_elements(index));
+                this->chunk(decompression_ctx, std::span<T>(decompressed), index);
+                return decompressed;
+            };
+
+            /// Retrieve the uncompressed chunk at `index`, forwarding to the raw context pointer overload.
+            ///
+            /// \param decompression_ctx the decompression context ptr
+            /// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size.
+            /// \param index the index of the chunk within the schunk.
+            /// \throws std::out_of_range if the index is not valid
+            virtual void chunk(blosc2::context_ptr& decompression_ctx, std::span<T> buffer, size_t index) const
+            {
+                const auto raw_ctx = decompression_ctx.get();
+                this->chunk(raw_ctx, buffer, index);
+            };
+
+            /// Retrieve the uncompressed GPU chunk at `index` into `buffer`. Pure virtual: subclasses provide it.
+            ///
+            /// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size.
+            /// \param index the index of the chunk within the schunk.
+            ///
+            /// \throws std::out_of_range if the index is not valid
+            virtual void chunk(std::span<T> buffer, size_t index) const = 0;
+
+            /// Retrieve the uncompressed chunk at `index` into `buffer`. Pure virtual: subclasses provide it.
+            ///
+            /// \param decompression_ctx the decompression context as a raw pointer
+            /// \param buffer the buffer to fill the uncompressed data with. Must be at least max chunk size.
+            /// \param index the index of the chunk within the schunk.
+            ///
+            /// \throws std::out_of_range if the index is not valid
+            virtual void chunk(blosc2::context_raw_ptr decompression_ctx, std::span<T> buffer, size_t index) const = 0;
+
+            /// Replace the chunk at `index` with `uncompressed` (compressing it). Pure virtual: subclasses provide it.
+            ///
+            /// \param compression_ctx the blosc2 compression context to use for compression.
+            /// \param uncompressed the uncompressed data to store in the chunk
+            /// \param index the index of the chunk within the schunk.
+            ///
+            /// \throws std::out_of_range if the index is not valid
+            virtual void set_chunk(blosc2::context_ptr& compression_ctx, std::span<T> uncompressed, size_t index) = 0;
+
+            /// Replace the chunk at `index` with `uncompressed` (compressing it). Pure virtual: subclasses provide it.
+            ///
+            /// \param compression_ctx the nvcomp compression context to use for compression.
+            /// \param uncompressed the uncompressed data to store in the chunk
+            /// \param index the index of the chunk within the schunk.
+            ///
+            /// \throws std::out_of_range if the index is not valid
+            virtual void set_chunk(cuda::nvcomp_context compression_ctx, std::span<T> uncompressed, size_t index) = 0;
+
+
+            /// Append `uncompressed` to the schunk as a new chunk (compressing it). Pure virtual.
+            ///
+            /// \param compression_ctx the nvcomp compression context to use for compression.
+            /// \param uncompressed the uncompressed data of the new chunk (read-only view)
+            virtual void append_chunk(cuda::nvcomp_context compression_ctx, std::span<const T> uncompressed) = 0;
+
+            /// Append `uncompressed` to the schunk as a new chunk (compressing it). Pure virtual.
+            ///
+            /// \param compression_ctx the blosc2 compression context to use for compression.
+            /// \param uncompressed the uncompressed data of the new chunk
+            /// \param compression_buff the compression buffer to use for temporary storage.
+            virtual void append_chunk(blosc2::context_ptr& compression_ctx,
+                                      std::span<T> uncompressed,
+                                      std::span<std::byte> compression_buff) = 0;
+
+            /// Append `uncompressed` to the schunk as a new chunk (compressing it). Pure virtual.
+            ///
+            /// \param compression_ctx the compression context variant to use for compression.
+            /// \param uncompressed the uncompressed data of the new chunk
+            virtual void append_chunk(compression_context_var compression_ctx,
+                                      std::span<T> uncompressed) = 0;
+
+
+            /// Number of elements of type T held by a full-sized chunk, i.e. chunk_bytes() / sizeof(T).
+            ///
+            /// \throws std::runtime_error if chunk_bytes() is not cleanly divisible by sizeof(T)
+            size_t chunk_elements() const
+            {
+                const size_t bytes_per_chunk = this->chunk_bytes();
+                if (bytes_per_chunk % sizeof(T) == 0)
+                {
+                    return bytes_per_chunk / sizeof(T);
+                }
+                throw std::runtime_error(
+                    std::format(
+                        "Internal Error: The chunk byte size is not cleanly divisible by the sizeof T."
+                        " Chunk size is {:L} while sizeof(T) is {}",
+                        bytes_per_chunk,
+                        sizeof(T)
+                    )
+                );
+            };
+
+            /// Number of elements of type T held by the chunk at `index`. For every index other than
+            /// num_chunks() - 1 this is expected to equal chunk_elements(); the last chunk may hold fewer.
+            ///
+            /// \throws std::out_of_range if the index is not valid in the super-chunk.
+            /// \throws std::runtime_error if chunk_bytes(index) is not cleanly divisible by sizeof(T)
+            size_t chunk_elements(size_t index) const
+            {
+                const size_t bytes_in_chunk = this->chunk_bytes(index);
+                if (bytes_in_chunk % sizeof(T) == 0)
+                {
+                    return bytes_in_chunk / sizeof(T);
+                }
+                throw std::runtime_error(
+                    std::format(
+                        "Internal Error: The chunk byte size is not cleanly divisible by the sizeof T."
+                        " Chunk size is {:L} while sizeof(T) is {}",
+                        bytes_in_chunk,
+                        sizeof(T)
+                    )
+                );
+            };
+
+            /// The nominal uncompressed size of a chunk in bytes (m_chunk_size). Every chunk except
+            /// the last one holds exactly this many uncompressed bytes; the last chunk may hold
+            /// fewer, but never more.
+            size_t chunk_bytes() const
+            {
+                return m_chunk_size;
+            };
+
+            /// Retrieve the number of uncompressed bytes stored by the chunk at `index`. This equals chunk_bytes()
+            /// for every chunk except the last one, which may be smaller. Pure virtual: subclasses provide it.
+            ///
+            /// \throws std::out_of_range if the index is not valid in the super-chunk.
+            virtual size_t chunk_bytes(size_t index) const = 0;
+
+            /// Number of chunks currently held by the super-chunk, i.e. the length of m_chunks.
+            size_t num_chunks() const noexcept
+            {
+                return this->m_chunks.size();
+            }
+
+            /// The total compressed size of the schunk in bytes. Pure virtual: subclasses provide it.
+            virtual size_t csize() const noexcept = 0;
+
+            /// The total uncompressed size of the schunk in elements of T. Pure virtual: subclasses provide it.
+            virtual size_t size() const noexcept = 0;
+
+            /// Uncompressed footprint of the whole schunk in bytes:
+            /// the element count reported by size() multiplied by sizeof(T).
+            size_t byte_size() const noexcept
+            {
+                return sizeof(T) * size();
+            }
+
+            size_t max_chunk_size() const noexcept
+            {
+                return this->m_chunk_size; // upper bound for a single chunk, in bytes
+            }
+
+            size_t max_block_size() const noexcept
+            {
+                return this->m_block_size; // the configured block size (m_block_size)
+            }
+
+        protected:
+            std::vector<std::variant<_cpu_container_type, _gpu_container_type>> m_chunks{};
+            /// The maximum size a chunk is constrained to, in bytes. Every chunk except the last one must be exactly
+            /// this size; the last chunk (index m_chunks.size() - 1) may be any size smaller than or equal to this value.
+            size_t m_chunk_size = s_default_chunksize;
+            size_t m_block_size = s_default_blocksize;
+
+            /// Validate the chunk index, throwing std::out_of_range unless `index` is below the chunk count.
+            void validate_chunk_index(size_t index) const
+            {
+                if (index >= m_chunks.size()) // not `> size() - 1`: that wraps to SIZE_MAX for an empty schunk
+                {
+                    throw std::out_of_range(
+                        std::format(
+                            "Cannot access index {} in schunk. Total amount of chunks is {}",
+                            index,
+                            m_chunks.size()
+                        )
+                    );
+                }
+            }
+
+            /// Validate all the chunk sizes currently held by the super-chunk: every chunk except the last must be
+            /// exactly chunk_bytes() large and the last one may not exceed it. An empty super-chunk passes trivially.
+            void validate_chunk_sizes() const
+            {
+                const size_t chunk_count = this->num_chunks();
+                // Check that all chunks barring the last one are equal to m_chunk_size (`i + 1` avoids unsigned wrap).
+                for (size_t i = 0; i + 1 < chunk_count; ++i)
+                {
+                    if (this->chunk_bytes(i) != this->chunk_bytes())
+                    {
+                        throw std::invalid_argument(
+                            std::format(
+                                "Error while validating chunk sizes; Expected all chunks to have a size equivalent to {:L} (m_chunk_size)."
+                                " However, chunk {} instead has a chunk size of {:L}. Having a size different from the rest of the chunks"
+                                " is only supported for the last chunk (blosc2 limitation). Please ensure that all chunks are equally sized"
+                                " when modifying the super-chunk (excluding the last one).",
+                                this->chunk_bytes(),
+                                i,
+                                this->chunk_bytes(i)
+                            )
+                        );
+                    }
+                }
+                // Check that the last chunk is not larger than the rest; skipped when there is no chunk at all.
+                if (chunk_count > 0 && this->chunk_bytes(chunk_count - 1) > this->chunk_bytes())
+                {
+                    throw std::runtime_error(
+                        std::format(
+                            "Error while validating chunk sizes; Expected the last chunk to be at most {:L} bytes,"
+                            " instead got {:L} bytes.",
+                            this->chunk_bytes(),
+                            this->chunk_bytes(chunk_count - 1)
+                        )
+                    );
+                }
+            }
+
+            /// Convert a size in bytes into the equivalent number of elements of type T.
+            /// Throws std::runtime_error when `byte_size` is not a whole multiple of sizeof(T).
+            size_t get_T_buffer_size(size_t byte_size) const
+            {
+                if (byte_size % sizeof(T) == 0)
+                {
+                    return byte_size / sizeof(T);
+                }
+                throw std::runtime_error(
+                    std::format(
+                        "Cannot get buffer size for type T of size {} because it is not evenly divisible for buffer size {:L}",
+                        sizeof(T),
+                        byte_size
+                    )
+                );
+            }
+        };
+    } // detail
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/blosc2/typedefs.h b/compressed_image/include/compressed/blosc2/typedefs.h
index 89f5f24..2059d5c 100644
--- a/compressed_image/include/compressed/blosc2/typedefs.h
+++ b/compressed_image/include/compressed/blosc2/typedefs.h
@@ -8,15 +8,10 @@
 
 namespace NAMESPACE_COMPRESSED_IMAGE
 {
-
-	namespace blosc2
-	{
 		
-		template <typename T>
-		using schunk_var_ptr = std::shared_ptr<std::variant<blosc2::schunk<T>, blosc2::lazy_schunk<T>>>;
-		template <typename T>
-		using schunk_var = std::variant<blosc2::schunk<T>, blosc2::lazy_schunk<T>>;
-
-	} // blosc2
+	template <typename T>
+	using schunk_var = std::variant<detail::schunk<T>, detail::lazy_schunk<T>>;
+	template <typename T>
+	using schunk_var_ptr = std::shared_ptr<schunk_var<T>>;
 
 } // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
diff --git a/compressed_image/include/compressed/blosc2/wrapper.h b/compressed_image/include/compressed/blosc2/wrapper.h
index d8982ab..3099fe3 100644
--- a/compressed_image/include/compressed/blosc2/wrapper.h
+++ b/compressed_image/include/compressed/blosc2/wrapper.h
@@ -6,468 +6,536 @@
 #include "compressed/enums.h"
 #include "compressed/blosc2/util.h"
 #include "compressed/detail/scoped_timer.h"
+#include "compressed/detail/scratch_buffer_pool.h"
 
 #include "blosc2.h"
 
-#include "blosc2/blosc2-common.h"
-#include "blosc2/blosc2-stdio.h"
 #include "blosc2/filters-registry.h"
 
 #include <span>
 
-namespace NAMESPACE_COMPRESSED_IMAGE
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-
-	namespace blosc2
-	{
-
-		namespace detail
-		{
-			static const inline bool g_filters_registered = false;
-
-			/// Initialize filters in c-blosc2. Since we don't have an explicit entry point this needs to be checked on every call to compress and decompress.
-			/// May be a no-op if detail::g_filters_registered is true.
-			inline void init_filters()
-			{
-				if (!detail::g_filters_registered)
-				{
-					register_filters();
-				}
-			}
-
-		}
-
-		// Custom deleter for blosc2 structs for use in a smart pointer
-		template <typename T>
-		struct deleter {};
-
-		template <>
-		struct deleter<blosc2_schunk>
-		{
-			void operator()(blosc2_schunk* schunk)
-			{
-				blosc2_schunk_free(schunk);
-			}
-		};
-
-		template <>
-		struct deleter<blosc2_context>
-		{
-			void operator()(blosc2_context* context)
-			{
-				blosc2_free_ctx(context);
-			}
-		};
-
-		/// Typedef the blosc2 primitives into both smart pointers and as raw ptrs
-		typedef std::unique_ptr<blosc2_schunk, deleter<blosc2_schunk>>		schunk_ptr;
-		typedef blosc2_schunk*												schunk_raw_ptr;
-		typedef void*														chunk_raw_ptr;
-		typedef std::unique_ptr<blosc2_context, deleter<blosc2_context>>	context_ptr;
-		typedef blosc2_context*												context_raw_ptr;
-		
-		/// Maps a codec enum into its blosc2 representation.
-		///
-		/// \param compcode the compression codec to get
-		/// 
-		/// \returns The mapped enum as uint8_t since blosc expects it that way
-		inline uint8_t codec_to_blosc2(enums::codec compcode)
-		{
-			if (compcode == enums::codec::blosclz)
-			{
-				return static_cast<uint8_t>(BLOSC_BLOSCLZ);
-			}
-			else if (compcode == enums::codec::lz4)
-			{
-				return static_cast<uint8_t>(BLOSC_LZ4);
-			}
-			else if (compcode == enums::codec::lz4hc)
-			{
-				return static_cast<uint8_t>(BLOSC_LZ4HC);
-			}
-			else if (compcode == enums::codec::zstd)
-			{
-				return static_cast<uint8_t>(BLOSC_ZSTD);
-			}
-			return BLOSC_BLOSCLZ;
-		}
-
-		/// Maps a blosc2 compression codec into an enum representation
-		///
-		/// \param compcode the compression codec to get
-		/// 
-		/// \returns The mapped enum
-		inline enums::codec blosc2_to_codec(uint8_t compcode)
-		{
-			if (compcode == BLOSC_BLOSCLZ)
-			{
-				return enums::codec::blosclz;
-			}
-			else if (compcode == BLOSC_LZ4)
-			{
-				return  enums::codec::lz4;
-			}
-			else if (compcode == BLOSC_LZ4HC)
-			{
-				return enums::codec::lz4hc;
-			}
-			else if (compcode == BLOSC_ZSTD)
-			{
-				return enums::codec::zstd;
-			}
-			return enums::codec::blosclz;
-		}
-	
-		/// Compress the `data` into `chunk` using the provided `context`. 
-		/// 
-		/// This function applies Blosc2 compression to the input `data` and stores the compressed 
-		/// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the 
-		/// corresponding Blosc2 error code.
-		/// 
-		/// \tparam T The data type of the input buffer.
-		/// \param context A raw pointer to the Blosc2 compression context.
-		/// \param data The input data to be compressed, provided as a `std::span<T>`.
-		/// \param chunk The output chunk where compressed data will be stored, provided as a `std::span<std::byte>`.
-		/// \returns The compressed byte size of the chunk. This size includes a header with metadata, 
-		///          which Blosc2 internally uses.
-		/// \throws std::runtime_error if compression fails, with the Blosc2 error code.
-		template <typename T>
-		size_t compress(context_raw_ptr context, std::span<T> data, std::span<std::byte> chunk)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			detail::init_filters();
-			const auto cbytes = blosc2_compress_ctx(
-				context,
-				static_cast<const void*>(data.data()),
-				static_cast<int32_t>(data.size() * sizeof(T)),
-				static_cast<void*>(chunk.data()),
-				static_cast<int32_t>(chunk.size())
-			);
-			if (cbytes < 0)
-			{
-				throw std::runtime_error(std::format("Unable to compress context using Blosc2 with error code {}", cbytes));
-			}
-
-			return cbytes;
-		}
-		
-		/// Compress the `data` into `chunk` using the provided `context`. 
-		/// 
-		/// This function applies Blosc2 compression to the input `data` and stores the compressed 
-		/// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the 
-		/// corresponding Blosc2 error code.
-		/// 
-		/// \tparam T The data type of the input buffer.
-		/// \param context A raw pointer to the Blosc2 compression context.
-		/// \param data The input data to be compressed, provided as a `std::span<T>`.
-		/// \param chunk The output chunk where compressed data will be stored, provided as a `std::span<std::byte>`.
-		/// \returns The compressed byte size of the chunk. This size includes a header with metadata, 
-		///          which Blosc2 internally uses.
-		/// \throws std::runtime_error if compression fails, with the Blosc2 error code.
-		template <typename T>
-		size_t compress(context_raw_ptr context, std::span<const T> data, std::span<std::byte> chunk)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			detail::init_filters();
-			const auto cbytes = blosc2_compress_ctx(
-				context,
-				static_cast<const void*>(data.data()),
-				static_cast<int32_t>(data.size() * sizeof(T)),
-				static_cast<void*>(chunk.data()),
-				static_cast<int32_t>(chunk.size())
-			);
-			if (cbytes < 0)
-			{
-				throw std::runtime_error(std::format("Unable to compress context using Blosc2 with error code {}", cbytes));
-			}
-
-			return cbytes;
-		}
-
-		/// Compress the `data` into `chunk` using the provided `context`. 
-		/// 
-		/// This function applies Blosc2 compression to the input `data` and stores the compressed 
-		/// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the 
-		/// corresponding Blosc2 error code.
-		/// 
-		/// \tparam T The data type of the input buffer.
-		/// \param context A unique pointer to the Blosc2 compression context.
-		/// \param data The input data to be compressed, provided as a `std::span<T>`.
-		/// \param chunk The output chunk where compressed data will be stored, provided as a `std::span<std::byte>`.
-		/// \returns The compressed byte size of the chunk. This size includes a header with metadata, 
-		///          which Blosc2 internally uses.
-		/// \throws std::runtime_error if compression fails, with the Blosc2 error code.
-		template <typename T>
-		size_t compress(context_ptr& context, std::span<T> data, std::span<std::byte> chunk)
-		{
-			return compress(context.get(), data, chunk);
-		}
-
-		/// Compress the `data` into `chunk` using the provided `context`. 
-		/// 
-		/// This function applies Blosc2 compression to the input `data` and stores the compressed 
-		/// result in `chunk`. If compression fails, it throws a `std::runtime_error` with the 
-		/// corresponding Blosc2 error code.
-		/// 
-		/// \tparam T The data type of the input buffer.
-		/// \param context A unique pointer to the Blosc2 compression context.
-		/// \param data The input data to be compressed, provided as a `std::span<T>`.
-		/// \param chunk The output chunk where compressed data will be stored, provided as a `std::span<std::byte>`.
-		/// \returns The compressed byte size of the chunk. This size includes a header with metadata, 
-		///          which Blosc2 internally uses.
-		/// \throws std::runtime_error if compression fails, with the Blosc2 error code.
-		template <typename T>
-		size_t compress(context_ptr& context, std::span<const T> data, std::span<std::byte> chunk)
-		{
-			return compress(context.get(), data, chunk);
-		}
-
-		/// Decompress a Blosc2 `chunk` into `buffer` using the provided `context`. 
-		/// 
-		/// This function reverses the Blosc2 compression, restoring the original uncompressed data. 
-		/// If decompression fails, it throws a `std::runtime_error` with the corresponding error code.
-		/// 
-		/// \tparam T The data type of the decompressed output.
-		/// \param context A raw pointer to the Blosc2 decompression context.
-		/// \param buffer The output buffer where decompressed data will be stored, provided as a `std::span<T>`.
-		/// \param chunk The compressed input data to be decompressed, provided as a `std::span<std::byte>`.
-		/// \returns The decompressed byte size of the buffer.
-		/// \throws std::runtime_error if decompression fails, with the Blosc2 error code.
-		template <typename T>
-		size_t decompress(context_raw_ptr context, std::span<T> buffer, std::span<const std::byte> chunk)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			detail::init_filters();
-			if (buffer.size() * sizeof(T) > std::numeric_limits<int32_t>::max())
-			{
-				throw std::out_of_range(std::format("Blosc2 chunk size may not exceed numeric limit of int32_t, got {:L} which would exceed that", buffer.size() * sizeof(T)));
-			}
-
-			int decompressed_size = blosc2_decompress_ctx(
-				context,
-				static_cast<const void*>(chunk.data()),
-				std::numeric_limits<int32_t>::max(),
-				buffer.data(),
-				static_cast<int32_t>(buffer.size() * sizeof(T))
-			);
-
-			if (decompressed_size < 0)
-			{
-				throw std::runtime_error(std::format("Error code {} while decompressing blosc2 chunk", decompressed_size));
-			}
-			return decompressed_size;
-		}
-
-
-		/// Decompress a Blosc2 `chunk` into `buffer` using the provided `context`. 
-		/// 
-		/// This function reverses the Blosc2 compression, restoring the original uncompressed data. 
-		/// If decompression fails, it throws a `std::runtime_error` with the corresponding error code.
-		/// 
-		/// \tparam T The data type of the decompressed output.
-		/// \param context A unique pointer to the Blosc2 decompression context.
-		/// \param buffer The output buffer where decompressed data will be stored, provided as a `std::span<T>`.
-		/// \param chunk The compressed input data to be decompressed, provided as a `std::span<std::byte>`.
-		/// \returns The decompressed byte size of the buffer.
-		/// \throws std::runtime_error if decompression fails, with the Blosc2 error code.
-		template <typename T>
-		size_t decompress(context_ptr& context, std::span<T> buffer, std::span<const std::byte> chunk)
-		{
-			return decompress(context.get(), buffer, chunk);
-		}
-
-		/// Append the chunk into the super-chunk. The chunk in this case does not need to be refitted as its actual
-		/// size since c-blosc will read the size from its header bytes.
-		inline size_t append_chunk(schunk_ptr& schunk, std::span<std::byte> chunk)
-		{
-			detail::init_filters();
-			// We don't expose the copy parameter as internally in c-blosc if the chunk was compressed at all (i.e. compressed size < 
-			// uncompressed size) the chunk gets realloc'd anyways effectively copying it.
-			auto nchunks = blosc2_schunk_append_chunk(
-				schunk.get(),
-				reinterpret_cast<uint8_t*>(chunk.data()),
-				true // copy
-			);
-
-			if (nchunks < 0)
-			{
-				throw std::runtime_error(std::format("Unable to append chunk into super-chunk with the following blosc2 error code {}", nchunks));
-			}
-
-			return nchunks;
-		}
-
-		/// Create a default schunk with BLOSC2_CPARAMS_DEFAULTS and BLOSC2_DPARAMS_DEFAULTS
-		inline blosc2::schunk_ptr create_default_schunk()
-		{
-			detail::init_filters();
-			auto cparams = BLOSC2_CPARAMS_DEFAULTS;
-			auto dparams = BLOSC2_DPARAMS_DEFAULTS;
-			blosc2_storage storage = BLOSC2_STORAGE_DEFAULTS;
-			storage.cparams = &cparams;
-			storage.dparams = &dparams;
-			return blosc2::schunk_ptr(blosc2_schunk_new(&storage));
-		}
-
-		/// Create blosc2 compression parameters for the given input.
-		template <typename T>
-		blosc2_cparams create_blosc2_cparams(schunk_ptr& schunk, size_t nthreads, enums::codec codec, uint8_t compression_level, size_t block_size)
-		{
-			if (nthreads > std::numeric_limits<int16_t>::max())
-			{
-				throw std::out_of_range(std::format("Number of threads may not exceed {}, got {:L}", std::numeric_limits<int16_t>::max(), nthreads));
-			}
-			nthreads = std::max(nthreads, static_cast<size_t>(1));
-
-			assert(std::numeric_limits<int32_t>::max() > block_size);
-
-			detail::init_filters();
-			blosc2_cparams cparams = BLOSC2_CPARAMS_DEFAULTS;
-			cparams.blocksize = static_cast<int32_t>(block_size);;
-			cparams.typesize = sizeof(T);
-			cparams.splitmode = BLOSC_AUTO_SPLIT;
-			cparams.clevel = compression_level;
-			cparams.nthreads = static_cast<int16_t>(nthreads);
-			cparams.schunk = schunk.get();
-			cparams.compcode = codec_to_blosc2(codec);
-
-			return cparams;
-		}
-
-		/// Create blosc2 compression parameters for the given input.
-		template <typename T>
-		blosc2_cparams create_blosc2_cparams(size_t nthreads, enums::codec codec, uint8_t compression_level, size_t block_size)
-		{
-			if (nthreads > std::numeric_limits<int16_t>::max())
-			{
-				throw std::out_of_range(std::format("Number of threads may not exceed {}, got {:L}", std::numeric_limits<int16_t>::max(), nthreads));
-			}
-			nthreads = std::max(nthreads, static_cast<size_t>(1));
-
-			assert(std::numeric_limits<int32_t>::max() > block_size);
-
-			detail::init_filters();
-			blosc2_cparams cparams = BLOSC2_CPARAMS_DEFAULTS;
-			cparams.blocksize = static_cast<int32_t>(block_size);
-			cparams.typesize = sizeof(T);
-			cparams.splitmode = BLOSC_AUTO_SPLIT;
-			cparams.clevel = compression_level;
-			cparams.nthreads = static_cast<int16_t>(nthreads);
-			cparams.compcode = codec_to_blosc2(codec);
-
-			return cparams;
-		}
-
-		/// Create a blosc2 compression context with the given number of threads.
-		template <typename T>
-		blosc2::context_ptr create_compression_context(schunk_ptr& schunk, size_t nthreads, enums::codec codec, uint8_t compression_level, size_t block_size)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			detail::init_filters();
-			auto cparams = create_blosc2_cparams<T>(schunk, nthreads, codec, compression_level, block_size);
-			return blosc2::context_ptr(blosc2_create_cctx(cparams));
-		}
-
-		template <typename T>
-		blosc2::context_ptr create_compression_context(size_t nthreads, enums::codec codec, uint8_t compression_level, size_t block_size)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			detail::init_filters();
-			auto cparams = create_blosc2_cparams<T>(nthreads, codec, compression_level, block_size);
-			return blosc2::context_ptr(blosc2_create_cctx(cparams));
-		}
-
-		/// Create a blosc2 decompression context with the given number of threads.
-		inline blosc2::context_ptr create_decompression_context(schunk_ptr& schunk, size_t nthreads)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			if (nthreads > std::numeric_limits<int16_t>::max())
-			{
-				throw std::out_of_range(std::format("Number of threads may not exceed {}, got {:L}", std::numeric_limits<int16_t>::max(), nthreads));
-			}
-			nthreads = std::min(nthreads, static_cast<size_t>(1));
-
-			detail::init_filters();
-			auto dparams = BLOSC2_DPARAMS_DEFAULTS;
-			dparams.schunk = schunk.get();
-			dparams.nthreads = static_cast<int16_t>(nthreads);
-
-			return blosc2::context_ptr(blosc2_create_dctx(dparams));
-		}
-
-		/// Create a blosc2 decompression context with the given number of threads.
-		inline blosc2::context_ptr create_decompression_context(size_t nthreads)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			if (nthreads > std::numeric_limits<int16_t>::max())
-			{
-				throw std::out_of_range(std::format("Number of threads may not exceed {}, got {:L}", std::numeric_limits<int16_t>::max(), nthreads));
-			}
-			nthreads = std::min(nthreads, static_cast<size_t>(1));
-
-			detail::init_filters();
-			auto dparams = BLOSC2_DPARAMS_DEFAULTS;
-			dparams.nthreads = static_cast<int16_t>(nthreads);
-
-			return blosc2::context_ptr(blosc2_create_dctx(dparams));
-		}
-
-		/// Get the minimum size needed to store the compressed data.
-		template <size_t ChunkSize>
-		constexpr size_t min_compressed_size()
-		{
-			return ChunkSize + BLOSC2_MAX_OVERHEAD;
-		}
-
-		/// Get the minimum size needed to store the compressed data.
-		inline constexpr size_t min_compressed_size(size_t chunk_size)
-		{
-			return chunk_size + BLOSC2_MAX_OVERHEAD;
-		}
-
-		/// Get the minimum size needed to store the decompressed data.
-		template <size_t ChunkSize>
-		constexpr size_t min_decompressed_size()
-		{
-			return ChunkSize;
-		}
-
-		/// Get the minimum size needed to store the decompressed data.
-		inline constexpr size_t min_decompressed_size(size_t chunk_size)
-		{
-			return chunk_size;
-		}
-
-		/// Get the number of elements of the uncompressed chunk.
-		///
-		/// \tparam T the type to check against
-		/// \param chunk the compressed chunk to query
-		/// 
-		/// \throws std::runtime_error if we encounter a blosc2 error.
-		template <typename T>
-		size_t chunk_num_elements(const std::vector<std::byte>& chunk)
-		{
-			int32_t nbytes{};
-			int32_t cbytes{};
-			int32_t blocksize{};
-			auto res = blosc2_cbuffer_sizes(
-				static_cast<const void*>(chunk.data()),
-				&nbytes,
-				&cbytes,
-				&blocksize
-			);
-			if (res < 0)
-			{
-				throw std::runtime_error(std::format("Unable to find buffer sizes due to blosc2 error: {}", map_error_code(res)));
-			}
-
-			assert(nbytes > 0);
-			assert(nbytes % sizeof(T) == 0);
-
-			return static_cast<size_t>(nbytes) / sizeof(T);
-		}
-
-	} // namespace blosc2
-
-
-} // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
+    namespace blosc2
+    {
+        namespace detail
+        {
+            /// Legacy flag kept for source compatibility. It is const and never set, so it is always false.
+            static const inline bool g_filters_registered = false;
+
+            /// Register the c-blosc2 filters. There is no explicit library entry point, so this is meant to be
+            /// called before compressing or decompressing; only the first call does work, later calls are no-ops.
+            inline void init_filters()
+            {
+                // Function-local static: initialised exactly once, thread-safely, on first use.
+                static const bool s_registered = (register_filters(), true);
+                static_cast<void>(s_registered);
+            }
+        }
+
+        // Deleter functors so blosc2 C structs can be owned by a smart pointer; specialised per struct type.
+        template <typename T>
+        struct deleter
+        {
+        };
+
+        template <>
+        struct deleter<blosc2_schunk>
+        {
+            void operator()(blosc2_schunk* ptr)
+            {
+                blosc2_schunk_free(ptr); // release through the c-blosc2 API
+            }
+        };
+
+        template <>
+        struct deleter<blosc2_context>
+        {
+            void operator()(blosc2_context* ptr)
+            {
+                blosc2_free_ctx(ptr); // release through the c-blosc2 API
+            }
+        };
+
+        /// Aliases for the blosc2 primitives, both as smart pointers (using `deleter`) and as raw pointers
+        using schunk_ptr = std::unique_ptr<blosc2_schunk, deleter<blosc2_schunk>>;
+        using schunk_raw_ptr = blosc2_schunk*;
+        using chunk_raw_ptr = void*;
+        using context_ptr = std::unique_ptr<blosc2_context, deleter<blosc2_context>>;
+        using context_raw_ptr = blosc2_context*;
+
+        /// Translate a codec enum value into the matching c-blosc2 compressor code.
+        ///
+        /// \param compcode the compression codec to translate
+        ///
+        /// \returns The blosc2 compressor code as uint8_t since blosc expects it that way;
+        ///          any value without an explicit mapping falls back to BLOSC_BLOSCLZ.
+        inline uint8_t codec_to_blosc2(enums::codec compcode)
+        {
+            switch (compcode)
+            {
+            case enums::codec::blosclz:
+                return static_cast<uint8_t>(BLOSC_BLOSCLZ);
+            case enums::codec::lz4:
+                return static_cast<uint8_t>(BLOSC_LZ4);
+            case enums::codec::lz4hc:
+                return static_cast<uint8_t>(BLOSC_LZ4HC);
+            case enums::codec::zstd:
+                return static_cast<uint8_t>(BLOSC_ZSTD);
+            default:
+                // Unmapped codec: use the fallback below.
+                break;
+            }
+
+            return BLOSC_BLOSCLZ;
+        }
+
+        /// Translate a c-blosc2 codec id back into the codec enum.
+        ///
+        /// Recognizes BLOSC_BLOSCLZ, BLOSC_LZ4, BLOSC_LZ4HC and BLOSC_ZSTD; every other id is reported as
+        /// enums::codec::blosclz.
+        ///
+        /// \param compcode the blosc2 codec id to translate
+        ///
+        /// \returns The mapped enum
+        inline enums::codec blosc2_to_codec(uint8_t compcode)
+        {
+            switch (compcode)
+            {
+            case BLOSC_BLOSCLZ:
+                return enums::codec::blosclz;
+            case BLOSC_LZ4:
+                return enums::codec::lz4;
+            case BLOSC_LZ4HC:
+                return enums::codec::lz4hc;
+            case BLOSC_ZSTD:
+                return enums::codec::zstd;
+            default:
+                // Unknown ids share the blosclz result.
+                return enums::codec::blosclz;
+            }
+        }
+
+
+        /// Destination size to reserve when compressing a chunk of ChunkSize bytes: ChunkSize plus BLOSC2_MAX_OVERHEAD.
+        template <size_t ChunkSize>
+        constexpr size_t min_compressed_size()
+        {
+            return ChunkSize + BLOSC2_MAX_OVERHEAD;
+        }
+
+        /// Destination size to reserve when compressing `chunk_size` bytes: `chunk_size` plus BLOSC2_MAX_OVERHEAD.
+        inline constexpr size_t min_compressed_size(size_t chunk_size)
+        {
+            return chunk_size + BLOSC2_MAX_OVERHEAD;
+        }
+
+        /// Get the minimum size needed to store the decompressed data of a chunk, which is ChunkSize itself (no overhead is added).
+        template <size_t ChunkSize>
+        constexpr size_t min_decompressed_size()
+        {
+            return ChunkSize;
+        }
+
+        /// Get the minimum size needed to store the decompressed data of a chunk, which is `chunk_size` itself (no overhead is added).
+        inline constexpr size_t min_decompressed_size(size_t chunk_size)
+        {
+            return chunk_size;
+        }
+
+
+        /// Compress the `data` into `chunk` using the provided `context`.
+        /// \tparam T The data type of the input buffer.
+        /// \param context A raw pointer to the Blosc2 compression context.
+        /// \param data The input data to be compressed; may hold at most INT32_MAX bytes.
+        /// \param chunk The output chunk receiving the compressed data (capacity beyond INT32_MAX bytes is ignored).
+        /// \returns The compressed byte size of the chunk, including the metadata header Blosc2 writes.
+        /// \throws std::out_of_range if `data` is too large, std::runtime_error (with the Blosc2 code) if compression fails.
+        template <typename T>
+        size_t compress(context_raw_ptr context, std::span<T> data, std::span<std::byte> chunk)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            detail::init_filters();
+            // blosc2 takes sizes as int32_t; reject input that the cast below would otherwise silently truncate.
+            if (data.size_bytes() > static_cast<size_t>(std::numeric_limits<int32_t>::max()))
+            {
+                throw std::out_of_range(
+                    std::format("Blosc2 input may not exceed the int32_t limit, got {:L} bytes", data.size_bytes())
+                );
+            }
+            const auto cbytes = blosc2_compress_ctx(
+                context,
+                static_cast<const void*>(data.data()),
+                static_cast<int32_t>(data.size_bytes()),
+                static_cast<void*>(chunk.data()),
+                static_cast<int32_t>(std::min<size_t>(chunk.size(), std::numeric_limits<int32_t>::max()))
+            );
+            if (cbytes < 0)
+            {
+                throw std::runtime_error(
+                    std::format("Unable to compress context using Blosc2 with error code {}", cbytes)
+                );
+            }
+            return static_cast<size_t>(cbytes);
+        }
+
+        /// Compress the read-only `data` into `chunk` using the provided `context`.
+        /// \tparam T The data type of the input buffer.
+        /// \param context A raw pointer to the Blosc2 compression context.
+        /// \param data The input data to be compressed; may hold at most INT32_MAX bytes.
+        /// \param chunk The output chunk receiving the compressed data (capacity beyond INT32_MAX bytes is ignored).
+        /// \returns The compressed byte size of the chunk, including the metadata header Blosc2 writes.
+        /// \throws std::out_of_range if `data` is too large, std::runtime_error (with the Blosc2 code) if compression fails.
+        template <typename T>
+        size_t compress(context_raw_ptr context, std::span<const T> data, std::span<std::byte> chunk)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            detail::init_filters();
+            // blosc2 takes sizes as int32_t; reject input that the cast below would otherwise silently truncate.
+            if (data.size_bytes() > static_cast<size_t>(std::numeric_limits<int32_t>::max()))
+            {
+                throw std::out_of_range(
+                    std::format("Blosc2 input may not exceed the int32_t limit, got {:L} bytes", data.size_bytes())
+                );
+            }
+            const auto cbytes = blosc2_compress_ctx(
+                context,
+                static_cast<const void*>(data.data()),
+                static_cast<int32_t>(data.size_bytes()),
+                static_cast<void*>(chunk.data()),
+                static_cast<int32_t>(std::min<size_t>(chunk.size(), std::numeric_limits<int32_t>::max()))
+            );
+            if (cbytes < 0)
+            {
+                throw std::runtime_error(
+                    std::format("Unable to compress context using Blosc2 with error code {}", cbytes)
+                );
+            }
+            return static_cast<size_t>(cbytes);
+        }
+
+        /// Convenience overload of `compress` for a context held in a `context_ptr`.
+        ///
+        /// Unwraps the smart pointer and defers to the overload taking a raw context pointer, so the
+        /// result and the error behaviour are exactly those of that overload.
+        ///
+        /// \tparam T The data type of the input buffer.
+        /// \param context A unique pointer to the Blosc2 compression context; it is only read, never reset.
+        /// \param data The input data to be compressed, provided as a `std::span<T>`.
+        /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span<std::byte>`.
+        /// \returns The compressed byte size of the chunk, as returned by the raw-pointer overload.
+        /// \throws Whatever the raw-pointer overload throws (std::runtime_error with the Blosc2 error code
+        ///         if compression fails).
+        template <typename T>
+        size_t compress(context_ptr& context, std::span<T> data, std::span<std::byte> chunk)
+        {
+            context_raw_ptr raw_context = context.get();
+            return compress(raw_context, data, chunk);
+        }
+
+        /// Convenience overload of `compress` for read-only input and a context held in a `context_ptr`.
+        ///
+        /// Unwraps the smart pointer and defers to the overload taking a raw context pointer, so the
+        /// result and the error behaviour are exactly those of that overload.
+        ///
+        /// \tparam T The data type of the input buffer.
+        /// \param context A unique pointer to the Blosc2 compression context; it is only read, never reset.
+        /// \param data The input data to be compressed, provided as a `std::span<const T>`.
+        /// \param chunk The output chunk where compressed data will be stored, provided as a `std::span<std::byte>`.
+        /// \returns The compressed byte size of the chunk, as returned by the raw-pointer overload.
+        /// \throws Whatever the raw-pointer overload throws (std::runtime_error with the Blosc2 error code
+        ///         if compression fails).
+        template <typename T>
+        size_t compress(context_ptr& context, std::span<const T> data, std::span<std::byte> chunk)
+        {
+            context_raw_ptr raw_context = context.get();
+            return compress(raw_context, data, chunk);
+        }
+
+        /// Compress `data` into a new chunk holding exactly the compressed bytes; the current scratch pool, if any, supplies the staging buffer.
+        template <typename T>
+        util::default_init_vector<std::byte> compress_to_chunk(context_raw_ptr context, std::span<T> data)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            const auto capacity = min_compressed_size(data.size_bytes());
+            const auto pack = [&](std::span<std::byte> staging)
+            {
+                const auto used = compress(context, data, staging);
+                return util::default_init_vector<std::byte>(staging.begin(), staging.begin() + used);
+            };
+            if (auto pool = NAMESPACE_COMPRESSED_IMAGE::detail::scratch_pool_registry::current())
+            {
+                auto lease = pool->acquire(capacity);
+                return pack(lease.span());
+            }
+            util::default_init_vector<std::byte> owned(capacity);
+            return pack(std::span<std::byte>(owned));
+        }
+
+        /// Compress read-only `data` into a new chunk holding exactly the compressed bytes; the current scratch pool, if any, supplies the staging buffer.
+        template <typename T>
+        util::default_init_vector<std::byte> compress_to_chunk(context_raw_ptr context, std::span<const T> data)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            const auto capacity = min_compressed_size(data.size_bytes());
+            const auto pack = [&](std::span<std::byte> staging)
+            {
+                const auto used = compress(context, data, staging);
+                return util::default_init_vector<std::byte>(staging.begin(), staging.begin() + used);
+            };
+            if (auto pool = NAMESPACE_COMPRESSED_IMAGE::detail::scratch_pool_registry::current())
+            {
+                auto lease = pool->acquire(capacity);
+                return pack(lease.span());
+            }
+            util::default_init_vector<std::byte> owned(capacity);
+            return pack(std::span<std::byte>(owned));
+        }
+
+        template <typename T>
+        util::default_init_vector<std::byte> compress_to_chunk(context_ptr& context, std::span<T> data)
+        {
+            return compress_to_chunk(context.get(), data); // forward to the raw-pointer overload; `context` is only read
+        }
+
+        template <typename T>
+        util::default_init_vector<std::byte> compress_to_chunk(context_ptr& context, std::span<const T> data)
+        {
+            return compress_to_chunk(context.get(), data); // forward to the raw-pointer overload; `context` is only read
+        }
+
+        /// Decompress a Blosc2 `chunk` into `buffer` using the provided `context`.
+        ///
+        /// This function reverses the Blosc2 compression, restoring the original uncompressed data. The byte size of
+        /// `chunk` is handed to Blosc2 as the source size so that it can bound its reads to the span.
+        ///
+        /// \tparam T The data type of the decompressed output.
+        /// \param context A raw pointer to the Blosc2 decompression context.
+        /// \param buffer The output buffer where decompressed data will be stored, provided as a `std::span<T>`.
+        /// \param chunk The compressed input data to be decompressed, provided as a `std::span<const std::byte>`.
+        /// \returns The decompressed byte size of the buffer.
+        /// \throws std::out_of_range if `buffer` exceeds INT32_MAX bytes; std::runtime_error if decompression fails.
+        template <typename T>
+        size_t decompress(context_raw_ptr context, std::span<T> buffer, std::span<const std::byte> chunk)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            detail::init_filters();
+            if (buffer.size() * sizeof(T) > std::numeric_limits<int32_t>::max())
+            {
+                throw std::out_of_range(
+                    std::format(
+                        "Blosc2 chunk size may not exceed numeric limit of int32_t, got {:L} which would exceed that",
+                        buffer.size() * sizeof(T)
+                    )
+                );
+            }
+
+            int decompressed_size = blosc2_decompress_ctx(
+                context,
+                static_cast<const void*>(chunk.data()),
+                static_cast<int32_t>(std::min<size_t>(chunk.size(), std::numeric_limits<int32_t>::max())),
+                buffer.data(),
+                static_cast<int32_t>(buffer.size() * sizeof(T))
+            );
+
+            if (decompressed_size < 0)
+            {
+                throw std::runtime_error(
+                    std::format("Error code {} while decompressing blosc2 chunk", decompressed_size)
+                );
+            }
+            return decompressed_size;
+        }
+
+
+        /// Convenience overload of `decompress` for a context held in a `context_ptr`.
+        ///
+        /// Unwraps the smart pointer and defers to the overload taking a raw context pointer, so the
+        /// result and the error behaviour are exactly those of that overload.
+        ///
+        /// \tparam T The data type of the decompressed output.
+        /// \param context A unique pointer to the Blosc2 decompression context; it is only read, never reset.
+        /// \param buffer The output buffer where decompressed data will be stored, provided as a `std::span<T>`.
+        /// \param chunk The compressed input data to be decompressed, provided as a `std::span<const std::byte>`.
+        /// \returns The decompressed byte size of the buffer, as returned by the raw-pointer overload.
+        template <typename T>
+        size_t decompress(context_ptr& context, std::span<T> buffer, std::span<const std::byte> chunk)
+        {
+            context_raw_ptr raw_context = context.get();
+            return decompress(raw_context, buffer, chunk);
+        }
+
+        /// Create blosc2 compression parameters bound to `schunk`: typesize is sizeof(T), nthreads is raised to at least 1, throws std::out_of_range if nthreads exceeds int16_t.
+        template <typename T>
+        blosc2_cparams create_blosc2_cparams(schunk_ptr& schunk,
+                                             size_t nthreads,
+                                             enums::codec codec,
+                                             uint8_t compression_level,
+                                             size_t block_size)
+        {
+            if (nthreads > std::numeric_limits<int16_t>::max())
+            {
+                throw std::out_of_range(
+                    std::format(
+                        "Number of threads may not exceed {}, got {:L}",
+                        std::numeric_limits<int16_t>::max(),
+                        nthreads
+                    )
+                );
+            }
+            nthreads = std::max(nthreads, static_cast<size_t>(1));
+
+            assert(std::numeric_limits<int32_t>::max() > block_size);
+
+            detail::init_filters();
+            blosc2_cparams cparams = BLOSC2_CPARAMS_DEFAULTS;
+            cparams.blocksize = static_cast<int32_t>(block_size);
+            cparams.typesize = sizeof(T);
+            cparams.splitmode = BLOSC_AUTO_SPLIT;
+            cparams.clevel = compression_level;
+            cparams.nthreads = static_cast<int16_t>(nthreads);
+            cparams.schunk = schunk.get();
+            cparams.compcode = codec_to_blosc2(codec);
+
+            return cparams;
+        }
+
+        /// Build blosc2 compression parameters that are not bound to a super-chunk (the `schunk` field keeps
+        /// its default). The element size is sizeof(T) and a thread count of 0 is raised to 1.
+        /// \throws std::out_of_range if `nthreads` does not fit into an int16_t.
+        template <typename T>
+        blosc2_cparams create_blosc2_cparams(size_t nthreads,
+                                             enums::codec codec,
+                                             uint8_t compression_level,
+                                             size_t block_size)
+        {
+            constexpr auto max_threads = std::numeric_limits<int16_t>::max();
+            if (nthreads > max_threads)
+            {
+                throw std::out_of_range(
+                    std::format("Number of threads may not exceed {}, got {:L}", max_threads, nthreads)
+                );
+            }
+            // Zero threads is bumped up to a single thread.
+            const size_t thread_count = std::max<size_t>(nthreads, 1);
+
+            assert(block_size < std::numeric_limits<int32_t>::max());
+
+            detail::init_filters();
+            blosc2_cparams params = BLOSC2_CPARAMS_DEFAULTS;
+            params.compcode = codec_to_blosc2(codec);
+            params.clevel = compression_level;
+            params.typesize = sizeof(T);
+            params.blocksize = static_cast<int32_t>(block_size);
+            params.splitmode = BLOSC_AUTO_SPLIT;
+            params.nthreads = static_cast<int16_t>(thread_count);
+
+            return params;
+        }
+
+        /// Create a blosc2 compression context bound to `schunk`, configured through create_blosc2_cparams<T> and returned in a context_ptr.
+        template <typename T>
+        blosc2::context_ptr create_compression_context(schunk_ptr& schunk,
+                                                       size_t nthreads,
+                                                       enums::codec codec,
+                                                       uint8_t compression_level,
+                                                       size_t block_size)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            detail::init_filters();
+            const auto params = create_blosc2_cparams<T>(schunk, nthreads, codec, compression_level, block_size);
+            return context_ptr(blosc2_create_cctx(params));
+        }
+
+        template <typename T>
+        blosc2::context_ptr create_compression_context(size_t nthreads,
+                                                       enums::codec codec,
+                                                       uint8_t compression_level,
+                                                       size_t block_size)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            detail::init_filters();
+            const auto params = create_blosc2_cparams<T>(nthreads, codec, compression_level, block_size);
+            return context_ptr(blosc2_create_cctx(params)); // context_ptr frees the context via blosc2_free_ctx
+        }
+
+        /// Create a blosc2 decompression context bound to `schunk` using `nthreads` threads (raised to at least 1); throws std::out_of_range above int16_t max.
+        inline blosc2::context_ptr create_decompression_context(schunk_ptr& schunk, size_t nthreads)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            if (nthreads > std::numeric_limits<int16_t>::max())
+            {
+                throw std::out_of_range(
+                    std::format(
+                        "Number of threads may not exceed {}, got {:L}",
+                        std::numeric_limits<int16_t>::max(),
+                        nthreads
+                    )
+                );
+            }
+            nthreads = std::max(nthreads, static_cast<size_t>(1)); // lower bound only: keep the requested count, never 0
+
+            detail::init_filters();
+            auto dparams = BLOSC2_DPARAMS_DEFAULTS;
+            dparams.schunk = schunk.get();
+            dparams.nthreads = static_cast<int16_t>(nthreads);
+
+            return blosc2::context_ptr(blosc2_create_dctx(dparams));
+        }
+
+        /// Create a blosc2 decompression context using `nthreads` threads (raised to at least 1); throws std::out_of_range above int16_t max.
+        inline blosc2::context_ptr create_decompression_context(size_t nthreads)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            if (nthreads > std::numeric_limits<int16_t>::max())
+            {
+                throw std::out_of_range(
+                    std::format(
+                        "Number of threads may not exceed {}, got {:L}",
+                        std::numeric_limits<int16_t>::max(),
+                        nthreads
+                    )
+                );
+            }
+            nthreads = std::max(nthreads, static_cast<size_t>(1)); // lower bound only: keep the requested count, never 0
+
+            detail::init_filters();
+            auto dparams = BLOSC2_DPARAMS_DEFAULTS;
+            dparams.nthreads = static_cast<int16_t>(nthreads);
+
+            return blosc2::context_ptr(blosc2_create_dctx(dparams));
+        }
+
+        /// Query how many elements of type T the chunk expands to once decompressed.
+        ///
+        /// The uncompressed byte size is read via `blosc2_cbuffer_sizes` and divided by sizeof(T).
+        ///
+        /// \tparam T the type to express the size in
+        /// \param chunk the compressed chunk to query
+        ///
+        /// \returns The uncompressed size of the chunk, counted in elements of T
+        /// \throws std::runtime_error if we encounter a blosc2 error.
+        template <typename T>
+        size_t chunk_num_elements(const std::span<const std::byte> chunk)
+        {
+            int32_t uncompressed_bytes = 0;
+            int32_t compressed_bytes = 0;
+            int32_t block_bytes = 0;
+            const int status = blosc2_cbuffer_sizes(chunk.data(), &uncompressed_bytes, &compressed_bytes, &block_bytes);
+            if (status < 0)
+            {
+                throw std::runtime_error(
+                    std::format("Unable to find buffer sizes due to blosc2 error: {}", map_error_code(status))
+                );
+            }
+
+            // Debug-only sanity checks: the size must be positive and a whole number of T.
+            assert(uncompressed_bytes > 0);
+            assert(uncompressed_bytes % sizeof(T) == 0);
+
+            const auto byte_count = static_cast<size_t>(uncompressed_bytes);
+            return byte_count / sizeof(T);
+        }
+    } // namespace blosc2
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/channel.h b/compressed_image/include/compressed/channel.h
index b5a4f41..2d26084 100644
--- a/compressed_image/include/compressed/channel.h
+++ b/compressed_image/include/compressed/channel.h
@@ -6,581 +6,787 @@
 #include <memory>
 #include <optional>
 #include <limits>
-#include <execution>
 
 #include "blosc2.h"
 #include "nlohmann/json.hpp"
 
 #include "macros.h"
-#include "enums.h"
 #include "fwd.h"
+#include "enums.h"
 #include "blosc2/wrapper.h"
 #include "blosc2/typedefs.h"
 #include "blosc2/schunk.h"
 #include "blosc2/lazyschunk.h"
 #include "constants.h"
+#include "context.h"
+#include "logger.h"
 #include "util.h"
-#include "json_alias.h"
 #include "detail/scoped_timer.h"
-#include "iterators/iterator.h"
+#include "detail/scratch_buffer_pool.h"
+#include "iterators/channel.h"
 
 
-namespace NAMESPACE_COMPRESSED_IMAGE
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-
-	template <typename T>
-	struct channel : public std::ranges::view_interface<channel<T>>
-	{
-		using value_type = T;
-		using iterator = channel_iterator<T>;
-		using const_iterator = channel_iterator<const T>;
-
-		channel(channel&& other)
-		{
-			m_Schunk = std::move(other.m_Schunk);
-			m_Codec = other.m_Codec;
-			m_Nthreads = other.m_Nthreads;			
-			m_CompressionContext = std::move(other.m_CompressionContext);
-			m_DecompressionContext = std::move(other.m_DecompressionContext);
-			m_CompressionLevel = other.m_CompressionLevel;
-			m_Width = other.m_Width;
-			m_Height = other.m_Height;
-		};
-		channel& operator=(channel&& other)
-		{
-			if (this != &other)
-			{
-				m_Schunk = std::move(other.m_Schunk);
-				m_Codec = other.m_Codec;
-				m_Nthreads = other.m_Nthreads;
-				m_CompressionContext = std::move(other.m_CompressionContext);
-				m_DecompressionContext = std::move(other.m_DecompressionContext);
-				m_CompressionLevel = other.m_CompressionLevel;
-				m_Width = other.m_Width;
-				m_Height = other.m_Height;
-			}
-			return *this;
-		};
-		channel(const channel&) = delete;
-		channel& operator=(const channel&) = delete;
-			
-
-		/// Default ctor, ensures the schunk and compression/decompression contexts are always initialized
-		/// into valid states. This will not generate a valid channel however and the ctor taking data or the static
-		/// functions `zeros` and `full` are preferred.
-		channel()
-		{
-			m_Schunk = std::make_shared<blosc2::schunk_var<T>>(blosc2::lazy_schunk<T>(0, 1, s_default_blocksize, s_default_chunksize));
-			m_CompressionContext = blosc2::create_compression_context<T>(
-				std::thread::hardware_concurrency() / 2,
-				enums::codec::lz4,
-				9,
-				s_default_blocksize
-			);
-			m_DecompressionContext = blosc2::create_decompression_context(std::thread::hardware_concurrency() / 2);
-		};
-
-		/// Initialize the channel with the given data.
-		/// 
-		/// \param data The span of input data to be compressed.
-		/// \param width The width of the image channel.
-		/// \param height The height of the image channel.
-		/// \param compression_codec The compression codec to be used (default is lz4).
-		/// \param compression_level The compression level (default is 5).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number although this may not increase performance
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		channel(
-			const std::span<const T> data,
-			size_t width,
-			size_t height,
-			enums::codec compression_codec = enums::codec::lz4,
-			uint8_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			m_Width = width;
-			m_Height = height;
-			m_Codec = compression_codec;
-			m_CompressionLevel = util::ensure_compression_level(compression_level);
-			if (data.size() != width * height)
-			{
-				throw std::runtime_error(
-					std::format(
-						"Invalid channel data passed. Expected its size to match up to width * height ({} * {}) which would be {:L}." \
-						" Instead received {:L}",
-						width, height, width * height, data.size()
-					)
-				);
-			}
-
-			// c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this.
-			assert(chunk_size < std::numeric_limits<int32_t>::max());
-			assert(block_size < chunk_size);
-
-			m_CompressionContext = blosc2::create_compression_context<T>(std::thread::hardware_concurrency() / 2, m_Codec, m_CompressionLevel, block_size);
-			m_DecompressionContext = blosc2::create_decompression_context(std::thread::hardware_concurrency() / 2);
-
-			// Align the chunks to the scanlines, makes our lifes a lot easier on read/write.
-			auto chunk_size_aligned = util::align_chunk_to_scanlines_bytes<T>(m_Width, chunk_size);
-			m_Schunk = std::make_shared<blosc2::schunk_var<T>>(blosc2::schunk<T>(data, block_size, chunk_size_aligned, m_CompressionContext));
-		}
-
-
-		/// Initialize the channel with the given data.
-		/// 
-		/// \param schunk The initialized super-chunk.
-		/// \param width The width of the image channel.
-		/// \param height The height of the image channel.
-		/// \param compression_codec The compression codec to be used.
-		/// \param compression_level The compression level (default is 5).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number although this may not increase performance
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		channel(
-			blosc2::schunk_var<T> schunk,
-			size_t width,
-			size_t height,
-			enums::codec compression_codec = enums::codec::lz4,
-			uint8_t compression_level = 9
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			m_Codec = compression_codec;
-			m_CompressionLevel = util::ensure_compression_level(compression_level);
-
-			if (std::holds_alternative<blosc2::schunk<T>>(schunk))
-			{
-				if (std::get<blosc2::schunk<T>>(schunk).size() != width * height)
-				{
-					throw std::invalid_argument(
-						std::format(
-							"Invalid schunk passed to compressed::channel constructor. Expected a size of {:L} but instead got {:L}",
-							width * height,
-							std::get<blosc2::schunk<T>>(schunk).size()
-						)
-					);
-				}
-			}
-			else if (std::holds_alternative<blosc2::lazy_schunk<T>>(schunk))
-			{
-				if (std::get<blosc2::lazy_schunk<T>>(schunk).size() != width * height)
-				{
-					throw std::invalid_argument(
-						std::format(
-							"Invalid schunk passed to compressed::channel constructor. Expected a size of {:L} but instead got {:L}",
-							width * height,
-							std::get<blosc2::schunk<T>>(schunk).size()
-						)
-					);
-				}
-			}
-
-			m_Schunk = std::make_shared<blosc2::schunk_var<T>>(std::move(schunk));
-			m_Width = width;
-			m_Height = height;
-
-			// Store the compression and decompression contexts, retrieving the block size from the underlying schunk
-			// wrapper
-			std::visit([&](auto& schunk)
-				{
-					m_CompressionContext = blosc2::create_compression_context<T>(std::thread::hardware_concurrency() / 2, m_Codec, m_CompressionLevel, schunk.max_block_size());
-					m_DecompressionContext = blosc2::create_decompression_context(std::thread::hardware_concurrency() / 2);
-				}, *m_Schunk);
-			
-		}
-
-
-		/// Create a channel filled with zeros.
-		///
-		/// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer
-		/// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way 
-		/// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading).
-		/// 
-		/// \param width The width of the image channel.
-		/// \param height The height of the image channel.
-		/// \param compression_codec The compression codec to be used.
-		/// \param compression_level The compression level (default is 9).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///                   comfortably fit into the L1 cache of most modern CPUs.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB. Should be no larger than the expected image size 
-		///                   for optimal performance and must be a multiple of sizeof(T).
-		/// \return A channel instance with all values initialized to zero.
-		static channel zeros(
-			size_t width, 
-			size_t height,
-			enums::codec compression_codec = enums::codec::lz4,
-			uint8_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			return channel<T>::full(width, height, static_cast<T>(0), compression_codec, compression_level, block_size, chunk_size);
-		}
-
-		/// Create a zero-initialized channel with the same shape and compression parameters as another channel.
-		///
-		/// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer
-		/// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way 
-		/// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading).
-		/// 
-		/// \param other The reference channel from which to copy shape and compression settings.
-		/// \return A new channel instance with the same dimensions and compression settings as \p other, filled with zeros.
-		static channel zeros_like(const channel& other)
-		{
-			return channel<T>::zeros(
-				other.width(), 
-				other.height(), 
-				other.compression(), 
-				other.compression_level(), 
-				other.block_size(), 
-				other.chunk_size()
-			);
-		}
-
-		/// Create a channel filled with a specific value.
-		///
-		/// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer
-		/// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way 
-		/// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading).
-		/// 
-		/// \param width The width of the image channel.
-		/// \param height The height of the image channel.
-		/// \param fill_value The value to fill the channel with.
-		/// \param compression_codec The compression codec to be used.
-		/// \param compression_level The compression level (default is 9).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB. Should be no larger than the expected image size 
-		///                   for optimal performance and must be a multiple of sizeof(T).
-		/// \return A channel instance with all values initialized to \p fill_value.
-		static channel full(
-			size_t width,
-			size_t height,
-			T fill_value,
-			enums::codec compression_codec = enums::codec::lz4,
-			uint8_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			const size_t chunk_size_aligned = util::align_chunk_to_scanlines_bytes<T>(width, chunk_size);
-			const size_t num_elements = width * height;
-
-			auto schunk = blosc2::lazy_schunk<T>(fill_value, num_elements, block_size, chunk_size_aligned);
-			return channel(std::move(schunk), width, height, compression_codec, compression_level);
-		}
-
-
-		/// Create a channel filled with a specific value and the same shape and compression settings as another channel.
-		///
-		/// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer
-		/// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way 
-		/// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading).
-		/// 
-		/// \param other The reference channel from which to copy shape and compression settings.
-		/// \param fill_value The value to fill the channel with.
-		/// \return A new channel instance filled with \p fill_value and the same dimensions and compression settings as \p other.
-		static channel full_like(const channel& other, T fill_value)
-		{
-			return channel<T>::full(
-				other.width(),
-				other.height(),
-				fill_value,
-				other.compression(),
-				other.compression_level(),
-				other.block_size(),
-				other.chunk_size()
-			);
-		}
-
-		/// Returns an iterator pointing to the beginning of the compressed data.
-		///
-		/// \return An iterator to the beginning of the compressed data.
-		iterator begin()
-		{
-			return iterator(m_Schunk, m_CompressionContext.get(), m_DecompressionContext.get(), 0, m_Width, m_Height);
-		}
-
-		/// Returns an iterator pointing to the end of the compressed data.
-		///
-		/// \return An iterator to the end of the compressed data.
-		iterator end()
-		{
-			if (m_Schunk)
-			{
-				return std::visit([&](auto& schunk)
-					{
-						return iterator(m_Schunk, m_CompressionContext.get(), m_DecompressionContext.get(), schunk.num_chunks(), m_Width, m_Height);
-					}, *m_Schunk);
-			}
-			throw std::runtime_error("Internal Error: Unable to create end iterator as m_Schunk is uninitialized.");
-		}
-
-		/// Retrieve a view to the compression context. In most cases users will not have to modify this.
-		///
-		/// \return A pointer to the compression context.
-		blosc2::context_raw_ptr compression_context() { return m_CompressionContext.get(); }
-
-		/// Retrieve a view to the decompression context. In most cases users will not have to modify this.
-		///
-		/// \return A pointer to the decompression context.
-		blosc2::context_raw_ptr decompression_context() { return m_DecompressionContext.get(); }
-
-		/// Update the number of threads used internally by c-blosc2 for compression and decompression.
-		/// 
-		/// \param nthreads The number of threads to use for compression and decompression.
-		/// \param block_size The block size to compress to
-		void update_nthreads(size_t nthreads, size_t block_size = s_default_blocksize)
-		{
-			m_CompressionContext = blosc2::create_compression_context<T>(nthreads, m_Codec, m_CompressionLevel, block_size);
-			m_DecompressionContext = blosc2::create_decompression_context(nthreads);
-			m_Nthreads = nthreads;
-		}
-
-		/// The channel width.
-		///
-		/// \return The width of the channel.
-		size_t width() const noexcept { return m_Width; }
-
-		/// The channel height.
-		///
-		/// \return The height of the channel.
-		size_t height() const noexcept { return m_Height; }
-		
-		/// Retrieve the compression codec used.
-		///
-		/// \return The compression codec.
-		enums::codec compression() const noexcept { return m_Codec; }
-
-		/// Retrieve the compression level used.
-		///
-		/// \return The compression level (typically from 1-9).
-		uint8_t compression_level() const noexcept
-		{
-			return m_CompressionLevel;
-		}
-
-		/// Retrieve the compressed data size.
-		///
-		/// \return The size of the compressed data in bytes.
-		size_t compressed_bytes() const
-		{
-			if (!m_Schunk)
-			{
-				throw std::runtime_error("Channel instance is not properly initialized, unable to get decompressed data");
-			}
-
-			if (std::holds_alternative<blosc2::schunk<T>>(*m_Schunk))
-			{
-				return std::get<blosc2::schunk<T>>(*m_Schunk).csize();
-			}
-			else if (std::holds_alternative<blosc2::lazy_schunk<T>>(*m_Schunk))
-			{
-				return std::get<blosc2::lazy_schunk<T>>(*m_Schunk).csize();
-			}
-			return {};
-		}
-		
-		/// Retrieve the uncompressed data size.
-		///
-		/// \return The size of the uncompressed data in elements.
-		size_t uncompressed_size() const
-		{
-			if (!m_Schunk)
-			{
-				throw std::runtime_error("Channel instance is not properly initialized, unable to get decompressed data");
-			}
-
-			if (std::holds_alternative<blosc2::schunk<T>>(*m_Schunk))
-			{
-				return std::get<blosc2::schunk<T>>(*m_Schunk).size();
-			}
-			else if (std::holds_alternative<blosc2::lazy_schunk<T>>(*m_Schunk))
-			{
-				return std::get<blosc2::lazy_schunk<T>>(*m_Schunk).size();
-			}
-			return {};
-		}
-		
-		/// Retrieve the total number of chunks the channel stores.
-		///
-		/// \return The number of chunks.
-		size_t num_chunks() const 
-		{ 
-			assert(m_Schunk != nullptr);
-
-			if (std::holds_alternative<blosc2::schunk<T>>(*m_Schunk))
-			{
-				return std::get<blosc2::schunk<T>>(*m_Schunk).num_chunks();
-			}
-			else if (std::holds_alternative<blosc2::lazy_schunk<T>>(*m_Schunk))
-			{
-				return std::get<blosc2::lazy_schunk<T>>(*m_Schunk).num_chunks();
-			}
-			return {};
-		}
-
-		/// \brief Retrieve the block size (in bytes) of the channel
-		///
-		/// The internal blosc2 implementation reserves changing this value on compression so it may be possible
-		/// that this is not the value you initially set.
-		/// 
-		/// \return The block size (in bytes).
-		size_t block_size() const
-		{
-			assert(m_Schunk != nullptr);
-			return std::visit([&](auto& schunk)
-				{
-					return schunk.max_block_size();
-				}, *m_Schunk);
-		}
-
-		/// \brief Retrieve the chunk size (in bytes) of the channel
-		/// 
-		/// This will be all of the chunk sizes except for the last chunk. The last chunk may be smaller so to accurately
-		/// capture it you should use the override with a size_t
-		/// 
-		/// \return The chunk size (in bytes).
-		size_t chunk_size() const noexcept
-		{
-			assert(m_Schunk != nullptr);
-			return std::visit([&](auto& schunk)
-				{
-					return schunk.chunk_bytes();
-				}, *m_Schunk);
-		}
-
-		size_t chunk_elems() const
-		{
-			auto chunk_size = this->chunk_size();
-			assert(chunk_size % sizeof(T) == 0);
-			return chunk_size / sizeof(T);
-		}
-
-		/// \brief Retrieve the chunk size (in bytes) of the channel at the given chunk index.
-		/// 
-		/// \return The chunk size (in bytes) at index `chunk_index`.
-		/// 
-		/// \throws std::out_of_range if the chunk index is invalid
-		size_t chunk_size(size_t chunk_index) const
-		{
-			assert(m_Schunk != nullptr);
-			return std::visit([&](auto& schunk)
-				{
-					return schunk.chunk_bytes(chunk_index);
-				}, *m_Schunk);
-		}
-
-		size_t chunk_elems(size_t chunk_index) const
-		{
-			auto chunk_size = this->chunk_size(chunk_index);
-			assert(chunk_size % sizeof(T) == 0);
-			return chunk_size / sizeof(T);
-		}
-
-
-		/// Retrieves and decompresses a chunk of data into the provided buffer.
-		///
-		/// This function retrieves the chunk at the given index from the internal `schunk`,
-		/// decompresses it using the current decompression context, and stores the result in `buffer`.
-		///
-		/// \param buffer A span representing the destination buffer to store the decompressed data.
-		///               Must be large enough to hold one chunk of decompressed data.
-		/// \param chunk_idx The index of the chunk to retrieve.
-		///
-		/// \throws std::runtime_error if the internal `schunk` pointer is not initialized.
-		void get_chunk(std::span<T> buffer, size_t chunk_idx) const
-		{
-			if (!m_Schunk)
-			{
-				throw std::runtime_error("Internal Error: Channel instance is not properly initialized, unable to get decompressed data");
-			}
-
-			return std::visit([&](const auto& schunk)
-				{
-					// We cheat a little bit here by creating this compression ctx on the fly, unfortunately this is 
-					// necessary as blosc2 will actually modify the ctx on decompression.
-					auto decomp_ctx = blosc2::create_decompression_context(m_Nthreads);
-					return schunk.chunk(decomp_ctx, buffer, chunk_idx);
-				}, *m_Schunk);
-		}
-
-		/// Compresses and sets a chunk of data from the provided buffer at the specified index.
-		///
-		/// This function compresses the data in the provided buffer using the current compression
-		/// context and writes it into the internal `schunk` at the given index.
-		///
-		/// \param buffer A span representing the source data to be compressed and stored.
-		/// \param chunk_idx The index of the chunk to overwrite or set with the compressed data.
-		///
-		/// \throws std::runtime_error if the internal `schunk` pointer is not initialized.
-		void set_chunk(std::span<T> buffer, size_t chunk_idx)
-		{
-			if (!m_Schunk)
-			{
-				throw std::runtime_error("Internal Error: Channel instance is not properly initialized, unable to set data");
-			}
-
-			return std::visit([&](auto& schunk)
-				{
-					return schunk.set_chunk(m_CompressionContext, buffer, chunk_idx);
-				}, *m_Schunk);
-		}
-
-		/// Get the decompressed data as a vector.
-		///
-		/// \throws std::runtime_error if the internal `schunk` pointer is not initialized.
-		/// 
-		/// \return A vector containing the decompressed data.
-		std::vector<T> get_decompressed() const
-		{
-			if (!m_Schunk)
-			{
-				throw std::runtime_error("Internal Error: Channel instance is not properly initialized, unable to get decompressed data");
-			}
-			return std::visit([&](const auto& schunk)
-				{
-					// We cheat a little bit here by creating this compression ctx on the fly, unfortunately this is 
-					// necessary as blosc2 will actually modify the ctx on decompression.
-					auto decomp_ctx = blosc2::create_decompression_context(m_Nthreads);
-					return schunk.to_uncompressed(decomp_ctx);
-				}, *m_Schunk);
-		}
-
-		/// Equality operators, compares pointers to check for equality
-		bool operator==(const channel<T>& other) const noexcept
-		{
-			return this == &other;
-		}
-
-	private:
-		/// The storage for the internal data, stored contiguously in a compressed data format
-		blosc2::schunk_var_ptr<T> m_Schunk = nullptr;
-		enums::codec m_Codec = enums::codec::lz4;
-
-		size_t m_Nthreads = std::thread::hardware_concurrency() / 2;
-
-		/// We store a compression and decompression context here to allow us to reuse them rather than having
-		/// to reinitialize them on launch. May be nullptr;
-		blosc2::context_ptr m_CompressionContext = nullptr;
-		blosc2::context_ptr m_DecompressionContext = nullptr;
-
-		/// Compression level.
-		uint8_t m_CompressionLevel = 9;
-
-		/// The width and height of the channel.
-		size_t m_Width = 1;
-		size_t m_Height = 1;
-	};
-
-} // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
+    template <typename T>
+    struct channel : public std::ranges::view_interface<channel<T>>
+    {
+        using value_type = T;
+        using iterator = channel_iterator<T>;
+        using const_iterator = channel_iterator<const T>;
+
+        channel(channel&& other) noexcept
+        {
+            m_schunk = std::move(other.m_schunk); // storage and scratch pool are moved out of `other`
+            m_scratch_pool = std::move(other.m_scratch_pool);
+            m_width = other.m_width; // remaining settings are copied, `other` keeps its values for these
+            m_height = other.m_height;
+            m_codec = other.m_codec;
+            m_compression_level = other.m_compression_level;
+            m_num_threads = other.m_num_threads;
+        }
+
+        channel& operator=(channel&& other) noexcept
+        {
+            if (&other != this) // self-move leaves the channel untouched
+            {
+                m_schunk = std::move(other.m_schunk);
+                m_scratch_pool = std::move(other.m_scratch_pool);
+                m_width = other.m_width;
+                m_height = other.m_height;
+                m_codec = other.m_codec;
+                m_compression_level = other.m_compression_level;
+                m_num_threads = other.m_num_threads;
+            }
+            return *this;
+        }
+        channel(const channel&) = delete; // copy construction is disabled, only the move operations above exist
+        channel& operator=(const channel&) = delete; // copy assignment is disabled as well
+
+
+        /// Default ctor, leaves the channel holding a placeholder lazy super-chunk of a single zero element so that
+        /// `m_schunk` is never null. This will not generate a valid channel however, and the ctor taking data or the
+        /// static functions `zeros` and `full` are preferred.
+        channel()
+        {
+            m_scratch_pool = detail::scratch_pool_registry::get_or_create_for_channel();
+            // One element with fill value 0, using the default block and chunk sizes.
+            auto placeholder = detail::lazy_schunk<T>(0, 1, s_default_blocksize, s_default_chunksize);
+            m_schunk = std::make_shared<schunk_var<T>>(std::move(placeholder));
+        }
+
+        /// Initialize the channel by compressing the given data.
+        ///
+        /// \param data The span of input data to be compressed, must hold exactly `width * height` elements.
+        /// \param width The width of the image channel.
+        /// \param height The height of the image channel.
+        /// \param compression_codec The compression codec to be used (default is lz4).
+        /// \param compression_level The compression level (default is 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///                   comfortably fit into the L1 cache of most modern CPUs.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048
+        ///                   channel. Should be no larger than the images you expect to compress for optimal
+        ///                   performance. Must be a multiple of sizeof(T).
+        /// \param gpu_device The GPU device to use for compression/decompression. This only has an effect if the codec
+        ///                   chosen is one of the gpu_* codecs. If not specified, the best default device will be used.
+        ///                   This is the logical index into the arrays returned by the utility functions
+        ///                   `NAMESPACE_COMPRESSED_IMAGE::cuda::device_names()` and
+        ///                   `NAMESPACE_COMPRESSED_IMAGE::cuda::devices()`.
+        ///
+        /// \throws std::runtime_error if the size of \p data does not equal `width * height`.
+        /// \throws std::invalid_argument if a gpu codec is in use and \p gpu_device is not a valid device index.
+        channel(
+            const std::span<const T> data,
+            const size_t width,
+            const size_t height,
+            const enums::codec compression_codec = enums::codec::lz4,
+            const uint8_t compression_level = 9,
+            const size_t block_size = s_default_blocksize,
+            const size_t chunk_size = s_default_chunksize,
+            std::optional<int> gpu_device = std::nullopt
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            m_scratch_pool = detail::scratch_pool_registry::get_or_create_for_channel();
+            m_width = width;
+            m_height = height;
+            m_codec = compression_codec;
+            m_num_threads = std::thread::hardware_concurrency() / 2;
+            m_compression_level = util::ensure_compression_level(compression_level);
+            if (data.size() != width * height)
+            {
+                throw std::runtime_error(
+                    std::format(
+                        "Invalid channel data passed. Expected its size to match up to width * height ({} * {}) which would be {:L}."
+                        " Instead received {:L}",
+                        width,
+                        height,
+                        width * height,
+                        data.size()
+                    )
+                );
+            }
+
+            if (enums::is_gpu_codec(m_codec) && !cuda::is_available())
+            {
+                m_codec = enums::s_gpu_codec_fallback.at(m_codec);
+                get_logger()->warn(
+                    "Unable to use the provided gpu codec '{}' as no cuda device is available."
+                    " Falling back to cpu codec '{}'.",
+                    enums::to_string(compression_codec),
+                    enums::to_string(m_codec)
+                );
+            }
+
+            if (enums::is_gpu_codec(m_codec))
+            {
+                // An invalid gpu index is a hard failure rather than a fallback; valid indices are [0, device count).
+                if (cuda::is_available() && gpu_device &&
+                    (gpu_device.value() < 0 || static_cast<size_t>(gpu_device.value()) >= cuda::devices().size()))
+                {
+                    throw std::invalid_argument(
+                        std::format(
+                            "Invalid GPU device index passed to compressed::channel constructor. Expected a value of at least 0 and less than {:L} but instead got {:L}",
+                            cuda::devices().size(),
+                            gpu_device.value()
+                        )
+                    );
+                }
+            }
+            else
+            {
+                // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this.
+                assert(chunk_size < std::numeric_limits<int32_t>::max());
+                assert(block_size <= chunk_size);
+            }
+
+            // Align the chunks to the scanlines, makes our life a lot easier on read / write.
+            auto chunk_size_aligned = util::align_chunk_to_scanlines_bytes<T>(m_width, chunk_size);
+            // NOTE(review): the validated `gpu_device` is not forwarded here, the current device is used. Confirm.
+            auto gpu_device_index = enums::is_gpu_codec(m_codec) ? cuda::current_device() : 0;
+            auto compression_ctx = this->create_compression_context(
+                m_codec,
+                m_num_threads,
+                m_compression_level,
+                block_size,
+                gpu_device_index
+            );
+
+            m_schunk = std::make_shared<schunk_var<T>>(
+                detail::schunk<T>(data, block_size, chunk_size_aligned, std::move(compression_ctx))
+            );
+        }
+
+
+        /// Initialize the channel from an already constructed super-chunk.
+        ///
+        /// \param schunk The initialized super-chunk, must hold exactly `width * height` elements.
+        /// \param width The width of the image channel.
+        /// \param height The height of the image channel.
+        /// \param compression_codec The compression codec to be used.
+        /// \param compression_level The compression level (default is 9).
+        /// \throws std::invalid_argument if the size of \p schunk does not equal `width * height`.
+        channel(
+            schunk_var<T> schunk,
+            const size_t width,
+            const size_t height,
+            const enums::codec compression_codec = enums::codec::lz4,
+            const uint8_t compression_level = 9
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            m_scratch_pool = detail::scratch_pool_registry::get_or_create_for_channel();
+            m_codec = compression_codec;
+            m_compression_level = util::ensure_compression_level(compression_level);
+            if (std::holds_alternative<detail::schunk<T>>(schunk))
+            {
+                if (std::get<detail::schunk<T>>(schunk).size() != width * height)
+                {
+                    throw std::invalid_argument(
+                        std::format(
+                            "Invalid schunk passed to compressed::channel constructor. Expected a size of {:L} but instead got {:L}",
+                            width * height,
+                            std::get<detail::schunk<T>>(schunk).size()
+                        )
+                    );
+                }
+            }
+            else if (std::holds_alternative<detail::lazy_schunk<T>>(schunk))
+            {
+                if (std::get<detail::lazy_schunk<T>>(schunk).size() != width * height)
+                {
+                    throw std::invalid_argument(
+                        std::format(
+                            "Invalid schunk passed to compressed::channel constructor. Expected a size of {:L} but instead got {:L}",
+                            width * height,
+                            std::get<detail::lazy_schunk<T>>(schunk).size()
+                        )
+                    );
+                }
+            }
+
+            m_schunk = std::make_shared<schunk_var<T>>(std::move(schunk));
+            m_num_threads = std::thread::hardware_concurrency() / 2;
+            m_width = width;
+            m_height = height;
+        }
+
+
+        /// Create a channel in which every value is zero.
+        ///
+        /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer
+        /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way
+        /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading).
+        ///
+        /// \param width The width of the image channel.
+        /// \param height The height of the image channel.
+        /// \param compression_codec The compression codec to be used (default is lz4).
+        /// \param compression_level The compression level (default is 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB. Should be no larger than the expected image size
+        ///                   for optimal performance and must be a multiple of sizeof(T).
+        /// \return A channel instance with all values initialized to zero.
+        static channel zeros(
+            const size_t width,
+            const size_t height,
+            const enums::codec compression_codec = enums::codec::lz4,
+            const uint8_t compression_level = 9,
+            const size_t block_size = s_default_blocksize,
+            const size_t chunk_size = s_default_chunksize
+        )
+        {
+            // Zero-filling is constant-filling with T's zero, so this simply forwards to `full`.
+            return full(
+                width,
+                height,
+                static_cast<T>(0),
+                compression_codec,
+                compression_level,
+                block_size,
+                chunk_size
+            );
+        }
+
+        /// Create a zero-initialized channel with the same shape and compression parameters as another channel.
+        ///
+        /// The result is a lazy-channel which only stores a single value T per-chunk until a chunk is explicitly set
+        /// (e.g. via `set_chunk`), making it a memory efficient starting point for sparsely filled channels.
+        ///
+        /// \param other The reference channel from which to copy shape and compression settings.
+        /// \return A new channel instance with the same dimensions and compression settings as \p other, filled with zeros.
+        static channel zeros_like(const channel& other)
+        {
+            return full(
+                other.width(),
+                other.height(),
+                static_cast<T>(0),
+                other.compression(),
+                other.compression_level(),
+                other.block_size(),
+                other.chunk_size()
+            );
+        }
+
+        /// Create a channel filled with a specific value.
+        ///
+        /// Generates a lazy-channel which only stores a single value T per-chunk, only setting this to a compressed buffer
+        /// if set with something like `set_chunk`. This is especially memory efficient and should be the preferred way
+        /// when wanting to generate an empty channel only filling out some parts (i.e. sparse cryptomatte loading).
+        ///
+        /// \param width The width of the image channel.
+        /// \param height The height of the image channel.
+        /// \param fill_value The value to fill the channel with.
+        /// \param compression_codec The compression codec to be used.
+        /// \param compression_level The compression level (default is 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB. Should be no larger than the expected image size
+        ///                   for optimal performance and must be a multiple of sizeof(T).
+        /// \return A channel instance with all values initialized to \p fill_value.
+        static channel full(
+            const size_t width,
+            const size_t height,
+            const T fill_value,
+            const enums::codec compression_codec = enums::codec::lz4,
+            const uint8_t compression_level = 9,
+            const size_t block_size = s_default_blocksize,
+            const size_t chunk_size = s_default_chunksize
+        )
+        {
+            const size_t element_count = width * height;
+            const size_t aligned_chunk_bytes = util::align_chunk_to_scanlines_bytes<T>(width, chunk_size);
+            // The lazy super-chunk is built from the fill value and sizes only, no pixel buffer is passed in.
+            detail::lazy_schunk<T> lazy_storage(fill_value, element_count, block_size, aligned_chunk_bytes);
+            return channel(std::move(lazy_storage), width, height, compression_codec, compression_level);
+        }
+
+
+        /// Create a channel filled with a specific value and the same shape and compression settings as another channel.
+        ///
+        /// The result is a lazy-channel which only stores a single value T per-chunk until a chunk is explicitly set
+        /// (e.g. via `set_chunk`), making it a memory efficient starting point for sparsely filled channels.
+        ///
+        /// \param other The reference channel from which to copy shape and compression settings.
+        /// \param fill_value The value to fill the channel with.
+        /// \return A new channel instance filled with \p fill_value and the same dimensions and compression settings as \p other.
+        static channel full_like(const channel& other, T fill_value)
+        {
+            // Shape and compression settings are read from `other`, only the fill value is caller-provided.
+            return full(
+                other.width(),
+                other.height(),
+                fill_value,
+                other.compression(),
+                other.compression_level(),
+                other.block_size(),
+                other.chunk_size()
+            );
+        }
+
+        /// Returns an iterator pointing to the first of the decompressed channel chunks.
+        ///
+        /// The iterator does not allocate on construction. Its internal buffers and compression/decompression
+        /// context are initialized lazily when the first chunk is dereferenced.
+        iterator begin()
+        {
+            if (m_schunk == nullptr)
+            {
+                throw std::runtime_error(
+                    "Internal Error: Unable to create begin iterator as m_schunk is uninitialized."
+                );
+            }
+
+            return iterator(
+                m_schunk,
+                0,
+                num_chunks(),
+                m_width,
+                m_height,
+                m_codec,
+                m_compression_level,
+                m_num_threads,
+                block_size(),
+                chunk_size()
+            );
+        }
+
+        /// Returns the iterator one past the last decompressed channel chunk.
+        iterator end()
+        {
+            if (m_schunk == nullptr)
+            {
+                throw std::runtime_error("Internal Error: Unable to create end iterator as m_schunk is uninitialized.");
+            }
+            const size_t total_chunks = num_chunks(); // passed twice below, so it is read only once
+            return iterator(
+                m_schunk,
+                total_chunks,
+                total_chunks,
+                m_width,
+                m_height,
+                m_codec,
+                m_compression_level,
+                m_num_threads,
+                block_size(),
+                chunk_size()
+            );
+        }
+
+        /// Update the thread count stored on the channel for compression and decompression. Only valid for
+        /// CPU compression/decompression
+        ///
+        /// \param nthreads The number of threads to use for compression and decompression.
+        void update_nthreads(size_t nthreads)
+        {
+            this->m_num_threads = nthreads;
+        }
+
+        /// Retrieve the width of the channel.
+        ///
+        /// \return The channel width.
+        size_t width() const noexcept
+        {
+            return this->m_width;
+        }
+
+        /// Retrieve the height of the channel.
+        ///
+        /// \return The channel height.
+        size_t height() const noexcept
+        {
+            return this->m_height;
+        }
+
+        /// The compression codec stored on this channel.
+        ///
+        /// \return The compression codec.
+        enums::codec compression() const noexcept
+        {
+            return this->m_codec;
+        }
+
+        /// The compression level stored on this channel.
+        ///
+        /// \return The compression level (typically from 1-9).
+        uint8_t compression_level() const noexcept
+        {
+            return this->m_compression_level;
+        }
+
+        /// Retrieve the compressed data size.
+        ///
+        /// \return The size of the compressed data in bytes, or 0 if neither known schunk type is held.
+        /// \throws std::runtime_error if the internal storage pointer is null.
+        size_t compressed_bytes() const
+        {
+            if (m_schunk == nullptr)
+            {
+                throw std::runtime_error(
+                    "Channel instance is not properly initialized, unable to get decompressed data"
+                );
+            }
+            if (auto* eager = std::get_if<detail::schunk<T>>(m_schunk.get()))
+            {
+                return eager->csize();
+            }
+            if (auto* lazy = std::get_if<detail::lazy_schunk<T>>(m_schunk.get()))
+            {
+                return lazy->csize();
+            }
+            return {};
+        }
+
+        /// Retrieve the uncompressed data size.
+        ///
+        /// \return The size of the uncompressed data in elements, or 0 if neither known schunk type is held.
+        /// \throws std::runtime_error if the internal storage pointer is null.
+        size_t uncompressed_size() const
+        {
+            if (m_schunk == nullptr)
+            {
+                throw std::runtime_error(
+                    "Channel instance is not properly initialized, unable to get decompressed data"
+                );
+            }
+            if (auto* eager = std::get_if<detail::schunk<T>>(m_schunk.get()))
+            {
+                return eager->size();
+            }
+            if (auto* lazy = std::get_if<detail::lazy_schunk<T>>(m_schunk.get()))
+            {
+                return lazy->size();
+            }
+            return {};
+        }
+
+        /// Query how many chunks the underlying storage is split into.
+        /// The storage must be initialized; this is only verified through an `assert`.
+        ///
+        /// \return The chunk count reported by the active storage alternative.
+        size_t num_chunks() const
+        {
+            assert(m_schunk != nullptr);
+
+            const auto count_chunks = [](const auto& storage)
+            {
+                return storage.num_chunks();
+            };
+
+            return std::visit(count_chunks, *m_schunk);
+        }
+
+        /// \brief Retrieve the block size (in bytes) of the channel
+        ///
+        /// The internal blosc2 implementation reserves the right to change this value on compression, so the
+        /// returned value is not necessarily the one that was initially requested.
+        ///
+        /// \return The block size (in bytes).
+        size_t block_size() const
+        {
+            assert(m_schunk != nullptr);
+
+            const auto query_block_size = [](auto& storage)
+            {
+                return storage.max_block_size();
+            };
+
+            return std::visit(query_block_size, *m_schunk);
+        }
+
+        /// \brief Retrieve the chunk size (in bytes) of the channel
+        ///
+        /// This is the size shared by every chunk except possibly the last one, which may be smaller. To get the
+        /// exact size of a specific chunk use the overload taking a `size_t` chunk index.
+        ///
+        /// \return The chunk size (in bytes).
+        size_t chunk_size() const noexcept
+        {
+            assert(m_schunk != nullptr);
+
+            const auto query_chunk_bytes = [](auto& storage)
+            {
+                return storage.chunk_bytes();
+            };
+
+            return std::visit(query_chunk_bytes, *m_schunk);
+        }
+
+        /// \brief Retrieve the chunk size of the channel in elements (`chunk_size() / sizeof(T)`).
+        size_t chunk_elems() const
+        {
+            assert(this->chunk_size() % sizeof(T) == 0);
+            return this->chunk_size() / sizeof(T);
+        }
+
+        /// \brief Size in bytes of one specific chunk of the channel.
+        ///
+        /// \param chunk_index The index of the chunk to query.
+        /// \return The chunk size (in bytes) at index `chunk_index`.
+        /// \throws std::out_of_range if the chunk index is invalid
+        size_t chunk_size(size_t chunk_index) const
+        {
+            assert(m_schunk != nullptr);
+
+            const auto query_chunk_bytes = [chunk_index](auto& storage)
+            {
+                return storage.chunk_bytes(chunk_index);
+            };
+
+            return std::visit(query_chunk_bytes, *m_schunk);
+        }
+
+        /// \brief Retrieve the element count (`chunk_size(chunk_index) / sizeof(T)`) of the chunk at `chunk_index`.
+        size_t chunk_elems(size_t chunk_index) const
+        {
+            assert(this->chunk_size(chunk_index) % sizeof(T) == 0);
+            return this->chunk_size(chunk_index) / sizeof(T);
+        }
+
+
+        /// Retrieves and decompresses a chunk of data into the provided buffer.
+        ///
+        /// This function retrieves the chunk at the given index from the internal `schunk` and stores the
+        /// decompressed result in `buffer`. CPU codecs decompress through a freshly created decompression context.
+        ///
+        /// \param buffer A span representing the destination buffer to store the decompressed data.
+        ///               Must be large enough to hold one chunk of decompressed data.
+        /// \param chunk_idx The index of the chunk to retrieve.
+        ///
+        /// \throws std::runtime_error if the internal `schunk` pointer is not initialized.
+        void get_chunk(std::span<T> buffer, size_t chunk_idx) const
+        {
+            if (!m_schunk)
+            {
+                throw std::runtime_error(
+                    "Internal Error: Channel instance is not properly initialized, unable to get decompressed data"
+                );
+            }
+
+            std::visit(
+                [&](const auto& schunk)
+                {
+                    if (enums::is_gpu_codec(m_codec))
+                    {
+                        schunk.chunk(buffer, chunk_idx);
+                    }
+                    else
+                    {
+                        auto compression_context = this->create_compression_context(
+                            m_codec,
+                            m_num_threads,
+                            m_compression_level,
+                            this->block_size(),
+                            0
+                        );
+
+                        schunk.chunk(
+                            std::get<cpu_compression_context>(compression_context).decompression_ctx.get(),
+                            buffer,
+                            chunk_idx
+                        );
+                    }
+                },
+                *m_schunk
+            );
+        }
+
+        /// Compress `buffer` and store it as the chunk at `chunk_idx`.
+        ///
+        /// A compression context matching the channel's codec (gpu or cpu) is created for the call and the
+        /// compressed result is written into the internal `schunk` at the given index.
+        ///
+        /// \param buffer A span over the source data; must hold exactly as many elements as the chunk at `chunk_idx`.
+        /// \param chunk_idx The index of the chunk to overwrite or set with the compressed data.
+        /// \throws std::invalid_argument if `buffer` does not hold the element count of the addressed chunk.
+        /// \throws std::runtime_error if the internal `schunk` pointer is not initialized.
+        void set_chunk(std::span<T> buffer, size_t chunk_idx)
+        {
+            if (!m_schunk)
+            {
+                throw std::runtime_error(
+                    "Internal Error: Channel instance is not properly initialized, unable to set data"
+                );
+            }
+
+            std::visit(
+                [&](auto& storage)
+                {
+                    if (buffer.size() != storage.chunk_elements(chunk_idx))
+                    {
+                        throw std::invalid_argument(
+                            std::format(
+                                "Invalid chunk passed to `set_chunk`. Expected this to contain exactly {} elements."
+                                " Instead it holds {}. This is likely due to having not correctly checked the number"
+                                " of elements.",
+                                storage.chunk_elements(chunk_idx),
+                                buffer.size()
+                            )
+                        );
+                    }
+
+                    if (!enums::is_gpu_codec(m_codec))
+                    {
+                        auto cpu_ctx = this->create_compression_context(
+                            m_codec,
+                            m_num_threads,
+                            m_compression_level,
+                            this->block_size(),
+                            0
+                        );
+
+                        storage.set_chunk(
+                            std::get<cpu_compression_context>(cpu_ctx).compression_ctx,
+                            buffer,
+                            chunk_idx
+                        );
+                    }
+                    else
+                    {
+                        auto gpu_ctx = this->create_compression_context(
+                            m_codec,
+                            m_num_threads,
+                            m_compression_level,
+                            this->block_size(),
+                            cuda::current_device()
+                        );
+
+                        storage.set_chunk(
+                            std::get<gpu_compression_context>(gpu_ctx).ctx,
+                            buffer,
+                            chunk_idx
+                        );
+                    }
+                },
+                *m_schunk
+            );
+        }
+
+        /// Decompress the entire channel and hand the result back as a vector.
+        ///
+        /// \throws std::runtime_error if the internal `schunk` pointer is not initialized.
+        ///
+        /// \return A vector containing the decompressed data.
+        std::vector<T> get_decompressed() const
+        {
+            if (!m_schunk)
+            {
+                throw std::runtime_error(
+                    "Internal Error: Channel instance is not properly initialized, unable to get decompressed data"
+                );
+            }
+
+            // The context type (cpu or gpu) is picked from the codec; both paths return the uncompressed data.
+            return std::visit(
+                [&](const auto& storage)
+                {
+                    if (!enums::is_gpu_codec(m_codec))
+                    {
+                        auto cpu_ctx = this->create_compression_context(
+                            m_codec,
+                            m_num_threads,
+                            m_compression_level,
+                            this->block_size(),
+                            0
+                        );
+                        return storage.to_uncompressed(std::get<cpu_compression_context>(cpu_ctx));
+                    }
+                    auto gpu_ctx = this->create_compression_context(
+                        m_codec,
+                        m_num_threads,
+                        m_compression_level,
+                        this->block_size(),
+                        cuda::current_device()
+                    );
+                    return storage.to_uncompressed(std::get<gpu_compression_context>(gpu_ctx));
+                },
+                *m_schunk
+            );
+        }
+
+        /// Identity comparison: two channels are equal only when they are the very same object.
+        bool operator==(const channel<T>& other) const noexcept
+        {
+            return &other == this;
+        }
+
+        /// \brief Create a compression context for the given codec.
+        ///
+        /// Builds either a gpu or a cpu compressor/decompressor context and returns it. This is primarily
+        /// for internal API usage.
+        ///
+        /// \param codec The compression codec, the type of context to initialize is inferred from this.
+        /// \param num_threads The compression/decompression threads. Only used when the codec is cpu-based
+        /// \param compression_level The compression level. Only used when the codec is cpu-based
+        /// \param block_size The block size for the compressed data.
+        /// \param gpu_device The GPU device to use for compression/decompression. Only used when the codec is gpu-based
+        ///
+        /// \return A variant holding the gpu context for gpu codecs and the cpu context otherwise.
+        static compression_context_var create_compression_context(
+            const enums::codec codec,
+            const size_t num_threads,
+            const size_t compression_level,
+            const size_t block_size,
+            const int gpu_device
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            if (!enums::is_gpu_codec(codec))
+            {
+                return cpu_compression_context{
+                    .compression_ctx = blosc2::create_compression_context<T>(
+                        num_threads,
+                        codec,
+                        static_cast<uint8_t>(compression_level),
+                        block_size
+                    ),
+                    .decompression_ctx = blosc2::create_decompression_context(num_threads),
+                    .nthreads = num_threads
+                };
+            }
+
+            // Gpu codecs only need the nvcomp context; thread count and compression level are not used.
+            return gpu_compression_context{
+                .ctx = cuda::make_compression_context<T>(codec, gpu_device, block_size)
+            };
+        }
+
+    private:
+        friend struct image<T>;
+
+        /// The storage for the internal data, stored contiguously in a compressed data format
+        schunk_var_ptr<T> m_schunk = nullptr;
+        /// Keeps the globally discoverable scratch pool alive for as long as this channel exists.
+        std::shared_ptr<detail::scratch_buffer_pool> m_scratch_pool = nullptr;
+        /// The compression codec in use.
+        enums::codec m_codec = enums::codec::lz4;
+        /// Compression level.
+        uint8_t m_compression_level = 9;
+        /// The number of threads used for cpu compression/decompression (blosc2 only). Defaults to half the
+        /// reported hardware threads, but never 0 (`hardware_concurrency()` may report 0 or 1).
+        size_t m_num_threads = std::thread::hardware_concurrency() > 1 ? std::thread::hardware_concurrency() / 2 : 1;
+
+        /// The width and height of the channel.
+        size_t m_width = 1;
+        size_t m_height = 1;
+    };
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/constants.h b/compressed_image/include/compressed/constants.h
index 6f548ed..afcf899 100644
--- a/compressed_image/include/compressed/constants.h
+++ b/compressed_image/include/compressed/constants.h
@@ -2,12 +2,11 @@
 
 #include "macros.h"
 
-
-namespace NAMESPACE_COMPRESSED_IMAGE
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-	/// Default chunk size for blosc2 super-chunks. This equates to 4MB or one 2048*2048 channel
-	constexpr static inline std::size_t s_default_chunksize = 4'194'304;
-	/// Default block size for blosc2 chunks. This equates to 16 scanlines in that same 2048*2048 channel.
-	constexpr static inline std::size_t s_default_blocksize = 32'768;
-
-} // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
+    /// Default chunk size for blosc2 super-chunks. This equates to 4 MiB or one 2048*2048 channel at 1 byte per sample.
+    constexpr static inline size_t s_default_chunksize = 4'194'304;
+    /// Default block size for blosc2 chunks. This equates to 16 scanlines in that same 2048*2048 channel.
+    constexpr static inline size_t s_default_blocksize = 32'768;
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/context.h b/compressed_image/include/compressed/context.h
new file mode 100644
index 0000000..9ae1176
--- /dev/null
+++ b/compressed_image/include/compressed/context.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <variant>
+
+#include "blosc2/wrapper.h"
+#include "compressed/macros.h"
+#include "cuda/compressors/base.h"
+
+namespace NAMESPACE_COMPRESSED_IMAGE
+{
+    /// Bundles the blosc2 compression and decompression contexts used by cpu codecs with their thread count.
+    struct cpu_compression_context
+    {
+        blosc2::context_ptr compression_ctx = nullptr;
+        blosc2::context_ptr decompression_ctx = nullptr;
+        size_t nthreads = 0;
+    };
+
+    /// Wraps the nvcomp context used by gpu codecs.
+    struct gpu_compression_context
+    {
+        cuda::nvcomp_context ctx{};
+    };
+
+    using compression_context_var = std::variant<cpu_compression_context, gpu_compression_context>;
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/compression.h b/compressed_image/include/compressed/cuda/compression.h
new file mode 100644
index 0000000..4be5f58
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compression.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include "compressed/enums.h"
+
+#include "compressed/cuda/compressors/lz4.h"
+#include "compressed/cuda/compressors/snappy.h"
+#include "compressed/cuda/compressors/zstd.h"
+#include "compressed/cuda/compressors/deflate.h"
+#include "compressed/cuda/compressors/gdeflate.h"
+#include "compressed/cuda/compressors/cascaded.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        /// \brief Variant over all nvcomp compressor implementations that `make_compressor` can produce.
+        template <typename T>
+        using compressor_var = std::variant<
+            lz4_compressor<T>,
+            snappy_compressor<T>,
+            zstd_compressor<T>,
+            deflate_compressor<T>,
+            gdeflate_compressor<T>,
+            cascaded_compressor<T>>;
+
+        /// \brief Create the (default-constructed) compressor for a gpu `codec`; throws for any other codec.
+        template <typename T>
+        compressor_var<T> make_compressor(NAMESPACE_COMPRESSED_IMAGE::enums::codec codec)
+        {
+            switch (codec)
+            {
+            case NAMESPACE_COMPRESSED_IMAGE::enums::codec::lz4_gpu:
+                return lz4_compressor<T>{};
+            case NAMESPACE_COMPRESSED_IMAGE::enums::codec::snappy_gpu:
+                return snappy_compressor<T>{};
+            case NAMESPACE_COMPRESSED_IMAGE::enums::codec::zstd_gpu:
+                return zstd_compressor<T>{};
+            case NAMESPACE_COMPRESSED_IMAGE::enums::codec::deflate_gpu:
+                return deflate_compressor<T>{};
+            case NAMESPACE_COMPRESSED_IMAGE::enums::codec::gdeflate_gpu:
+                return gdeflate_compressor<T>{};
+            case NAMESPACE_COMPRESSED_IMAGE::enums::codec::cascaded_gpu:
+                return cascaded_compressor<T>{};
+            default:
+                throw std::invalid_argument(
+                    std::format("Unknown or unsupported gpu codec: {}", static_cast<int>(codec)));
+            }
+        }
+
+        /// \brief Create the compressor matching the codec recorded in `chunk`'s compression context.
+        template <typename T>
+        compressor_var<T> make_compressor(const cuda::compressed_chunk<T>& chunk)
+        {
+            return make_compressor<T>(chunk.context.codec);
+        }
+
+        /// \brief Build an `nvcomp_context` for `codec` using that compressor's default (de)compression options.
+        /// \throws std::invalid_argument if `codec` is not a supported gpu codec (raised by `make_compressor`).
+        template <typename T>
+        nvcomp_context make_compression_context(
+            const NAMESPACE_COMPRESSED_IMAGE::enums::codec codec,
+            const int gpu_device,
+            const size_t block_size
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            return std::visit(
+                [&](auto&& backend)
+                {
+                    return nvcomp_context{
+                        .comp_options = backend.default_compression_opts(),
+                        .decomp_options = backend.default_decompression_opts(),
+                        .block_size = block_size,
+                        .codec = codec,
+                        .gpu_device = gpu_device
+                    };
+                },
+                make_compressor<T>(codec)
+            );
+        }
+    } // namespace cuda
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/compressors/base.h b/compressed_image/include/compressed/cuda/compressors/base.h
new file mode 100644
index 0000000..ecc028c
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compressors/base.h
@@ -0,0 +1,701 @@
+﻿/*
+Entry point for the various compressors of nvcomp
+*/
+
+#pragma once
+
+#include <variant>
+#include <cstddef>
+#include <vector>
+#include <ranges>
+#include <execution>
+
+#include <nvcomp.h>
+#include <nvcomp/lz4.h>
+#include <nvcomp/cascaded.h>
+#include <nvcomp/deflate.h>
+#include <nvcomp/gdeflate.h>
+#include <nvcomp/gzip.h>
+#include <nvcomp/snappy.h>
+#include <nvcomp/zstd.h>
+
+#include "compressed/macros.h"
+#include "compressed/constants.h"
+#include "compressed/enums.h"
+#include "compressed/util.h"
+
+#include "compressed/cuda/memory.h"
+#include "compressed/cuda/enums.h"
+#include "compressed/cuda/compressors/util.h"
+#include "compressed/cuda/cuda_hook.h"
+#include "compressed/cuda/gpu.h"
+
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        /// \brief Compression options for the various nvcomp compressors (default-constructed: LZ4 options).
+        using compression_options = std::variant<
+            nvcompBatchedLZ4CompressOpts_t,
+            nvcompBatchedCascadedCompressOpts_t,
+            nvcompBatchedDeflateCompressOpts_t,
+            nvcompBatchedGdeflateCompressOpts_t,
+            nvcompBatchedGzipCompressOpts_t,
+            nvcompBatchedSnappyCompressOpts_t,
+            nvcompBatchedZstdCompressOpts_t
+        >;
+
+        /// \brief Decompression options for the various nvcomp compressors (default-constructed: LZ4 options).
+        using decompression_options = std::variant<
+            nvcompBatchedLZ4DecompressOpts_t,
+            nvcompBatchedCascadedDecompressOpts_t,
+            nvcompBatchedDeflateDecompressOpts_t,
+            nvcompBatchedGdeflateDecompressOpts_t,
+            nvcompBatchedGzipDecompressOpts_t,
+            nvcompBatchedSnappyDecompressOpts_t,
+            nvcompBatchedZstdDecompressOpts_t
+        >;
+
+
+        /// \brief Compression context for nvcomp/cuda based compression and decompression.
+        ///
+        /// Similar in scope to a `blosc2_context` struct but instead holds gpu-specific information.
+        struct nvcomp_context
+        {
+            /// The options to use for compression
+            compression_options comp_options{};
+
+            /// The options to use for decompression
+            decompression_options decomp_options{};
+
+            /// The block size used for compression. All blocks will have this size except for the last one which may
+            /// be smaller. The input is split into blocks of this size. If the requested block size exceeds what the
+            /// compressor allows, it is internally reduced. A typical recommended value is 65536 (2^16).
+            size_t block_size = s_default_blocksize;
+
+            /// The compression codec to be used. Must be one of the GPU codecs to be valid.
+            NAMESPACE_COMPRESSED_IMAGE::enums::codec codec{};
+
+            /// The GPU device to use for compression/decompression.
+            int gpu_device = 0;
+        };
+
+
+        /// A single compressed chunk holding a collection of blocks inside it. Similar to a blosc2 chunk, but the
+        /// compressed and uncompressed size of every block is tracked explicitly next to the compressed bytes.
+        ///
+        /// .. note::
+        ///     The block sizes of the chunk are expected to be equal to `max_block_size` except for the last
+        ///     block, which may be smaller.
+        template <typename T>
+        struct compressed_chunk
+        {
+            /// \brief The compressed data of all blocks, held in a `cuda_device_buffer_async`.
+            cuda_device_buffer_async<std::byte> compressed_data;
+
+            /// \brief The compressed size of each block in bytes.
+            std::vector<size_t> compressed_block_sizes{};
+
+            /// \brief The uncompressed size of each block in bytes.
+            std::vector<size_t> uncompressed_block_sizes{};
+
+            /// \brief The compression context used for compression/decompression. Not meant to be modified once set.
+            nvcomp_context context{};
+
+            compressed_chunk() = default;
+
+            compressed_chunk(
+                cuda_device_buffer_async<std::byte> device_data,
+                std::vector<size_t> block_csizes,
+                std::vector<size_t> block_sizes,
+                nvcomp_context ctx)
+                : compressed_data(std::move(device_data)),
+                  compressed_block_sizes(std::move(block_csizes)),
+                  uncompressed_block_sizes(std::move(block_sizes)),
+                  context(std::move(ctx))
+            {
+            }
+
+            compressed_chunk(const compressed_chunk&) = delete;
+            compressed_chunk& operator=(const compressed_chunk&) = delete;
+            compressed_chunk(compressed_chunk&&) noexcept = default;
+            compressed_chunk& operator=(compressed_chunk&&) noexcept = default;
+
+            [[nodiscard]] size_t csize() const
+            {
+                return compressed_data.bytes();
+            }
+
+            [[nodiscard]] size_t size() const
+            {
+                return byte_size() / sizeof(T);
+            }
+
+            [[nodiscard]] size_t byte_size() const
+            {
+                size_t total_bytes = 0;
+                for (const size_t block_bytes : uncompressed_block_sizes)
+                    total_bytes += block_bytes;
+                return total_bytes;
+            }
+
+            [[nodiscard]] inline size_t max_block_size() const
+            {
+                return uncompressed_block_sizes.empty() ? size_t{0} : uncompressed_block_sizes.front();
+            }
+        };
+
+
+        namespace detail
+        {
+            /// \brief base cuda-based compressor base that provides utility functions for the various compressor implementations
+            ///
+            /// Note: we use blosc2 terminology for a lot of these calls i.e. what nvcomp may call 'chunks' we call blocks
+            ///		  as each unit we are compressing is already a chunk. This gives us the same 3D data structure where
+            ///		  we cascade from channel -> chunks -> blocks. Unlike with blosc2 the blocks here are transparent to us
+            ///		  but we hide them from the call site
+            ///
+            /// When re-implementing this for different compression procedures, one needs to implement:
+            ///
+            /// - codec							<-- The codec associated with this compressor
+            /// - default_compression_opts		<-- Default compression options
+            /// - default_decompression_opts	<-- Default decompression options
+            /// - get_temp_bytes				<-- The number of scratch bytes required for compression/decompression
+            /// - block_max_compressed_size		<-- The max block size of a compressed block, given the compression opts
+            /// - max_block_size				<-- The overall max (uncompressed) size a block may have.
+            /// - compression_impl				<-- The implementation of the compression procedure
+            /// - decompression_impl			<-- The implementation of the decompression procedure
+            template <typename T>
+            struct compressor
+            {
+                virtual ~compressor() = default;
+
+                /// \brief Compress a CPU buffer into a compressed chunk using CUDA.
+                ///
+                /// This function transfers the input data to the specified CUDA device, splits it
+                /// into fixed-size blocks, compresses each block asynchronously, and copies the
+                /// compressed results back to host memory. The compression is performed using
+                /// the algorithm configured by \p context.
+                ///
+                /// \param data        The input data buffer to compress (host memory).
+                /// \param context	   The compression/decompression context used for the generated blocks.
+                ///
+                /// \return A \c compressed_chunk containing the compressed blocks, their sizes,
+                ///         and codec metadata describing the compression algorithm used.
+                ///
+                /// \throws std::bad_variant_access  If the provided \p options are not valid for
+                ///                                  the current compression algorithm.
+                /// \throws std::runtime_error       If any CUDA memory allocation, copy, or kernel
+                ///                                  execution fails.
+                ///
+                /// \note Compression uses asynchronous CUDA operations with per-thread streams and
+                ///       memory pooling. Data is synchronized before returning, but overlapping
+                ///       work on other streams may proceed concurrently.
+                compressed_chunk<T> compress(std::span<const T> data, nvcomp_context context) const
+                {
+                    _COMPRESSED_PROFILE_FUNCTION();
+
+                    device_guard guard(context.gpu_device);
+                    cuda_api::instance().set_mem_pool_size(context.gpu_device);
+
+                    context.block_size = this->fit_block_size(context.block_size);
+
+                    // ##################################################################################
+                    // Set up device uncompressed memory
+                    // ##################################################################################
+                    const size_t num_blocks = (data.size() * sizeof(T) + context.block_size - 1) / context.block_size;
+                    auto block_sizes = this->generate_block_sizes(
+                        data.size() * sizeof(T),
+                        context.block_size,
+                        num_blocks
+                    );
+
+                    auto device_uncompressed_data = make_device_buffer_async<T>(data.size());
+                    cuda_api::instance().memcpy_async(
+                        device_uncompressed_data.get_raw(),
+                        static_cast<const void*>(data.data()),
+                        device_uncompressed_data.bytes(),
+                        cudaMemcpyHostToDevice
+                    );
+
+                    auto device_block_pointers = compressor::generate_device_block_pointers(
+                        device_uncompressed_data,
+                        context.block_size,
+                        num_blocks
+                    );
+                    auto device_block_sizes = cuda_device_buffer_async<size_t>::from_host(block_sizes);
+
+                    // ##################################################################################
+                    // Set up device temporary compressed memory (Flat Buffer Strategy)
+                    // ##################################################################################
+                    auto max_block_compressed_size = this->block_max_compressed_size(
+                        context.block_size,
+                        context.comp_options
+                    );
+
+                    auto temp_compressed_buffer = make_device_buffer_async<std::byte>(
+                        num_blocks * max_block_compressed_size
+                    );
+                    std::byte* base_device_ptr = temp_compressed_buffer.get();
+
+                    std::vector<void*> host_compressed_ptrs(num_blocks);
+                    for (size_t i = 0; i < num_blocks; ++i)
+                    {
+                        host_compressed_ptrs[i] = static_cast<void*>(base_device_ptr + (i * max_block_compressed_size));
+                    }
+
+                    auto device_compressed_ptrs = cuda_device_buffer_async<void*>::from_host(host_compressed_ptrs);
+
+                    // ##################################################################################
+                    // Set up scratch buffer
+                    // ##################################################################################
+                    auto device_temp_compression_buffer = this->generate_temp_buffer(
+                        context.block_size,
+                        num_blocks,
+                        context.comp_options
+                    );
+
+                    // ##################################################################################
+                    // Call the compression routine
+                    // ##################################################################################
+                    auto device_compressed_bytes = make_device_buffer_async<size_t>(num_blocks);
+                    auto device_statuses = make_device_buffer_async<nvcompStatus_t>(num_blocks);
+
+                    this->compression_impl(
+                        context.block_size,
+                        num_blocks,
+                        device_block_pointers,
+                        device_block_sizes,
+                        device_temp_compression_buffer,
+                        device_compressed_ptrs,
+                        device_compressed_bytes,
+                        device_statuses,
+                        context.comp_options
+                    );
+
+                    // ##################################################################################
+                    // Copy sizes back to host & allocate flat fitted GPU buffer
+                    // ##################################################################################
+                    auto compressed_bytes_pinned = make_host_mem<size_t>(num_blocks);
+
+                    device_compressed_bytes.to_host(
+                        std::span<size_t>(compressed_bytes_pinned.get(), num_blocks)
+                    );
+
+                    cuda_api::instance().stream_synchronize(cudaStreamPerThread);
+
+                    this->validate_per_block_statuses(device_statuses);
+
+                    // Compute tracking offsets and total required memory size (Prefix Sum)
+                    size_t total_fitted_bytes = 0;
+                    std::vector<size_t> host_block_offsets(num_blocks);
+                    std::vector<size_t> host_compressed_sizes(num_blocks);
+
+                    for (size_t i = 0; i < num_blocks; ++i)
+                    {
+                        size_t actual_size = compressed_bytes_pinned.get()[i];
+                        host_compressed_sizes[i] = actual_size;
+                        host_block_offsets[i] = total_fitted_bytes;
+                        total_fitted_bytes += actual_size;
+                    }
+
+                    // Allocate ONE single, tightly fitted GPU buffer for output
+                    auto fitted_device_buffer = make_device_buffer_async<std::byte>(total_fitted_bytes);
+                    std::byte* dest_base_ptr = fitted_device_buffer.get();
+
+                    // Parallel device-to-device streaming memcpys
+                    for (size_t i = 0; i < num_blocks; ++i)
+                    {
+                        const size_t actual_size = host_compressed_sizes[i];
+                        if (actual_size == 0) continue;
+
+                        void* src_ptr = static_cast<void*>(base_device_ptr + (i * max_block_compressed_size));
+                        void* dst_ptr = static_cast<void*>(dest_base_ptr + host_block_offsets[i]);
+
+                        cuda_api::instance().memcpy_async(
+                            dst_ptr,
+                            src_ptr,
+                            actual_size,
+                            cudaMemcpyDeviceToDevice
+                        );
+                    }
+
+                    // ##################################################################################
+                    // Finalize async work
+                    // ##################################################################################
+                    cuda_api::instance().stream_synchronize(cudaStreamPerThread);
+
+                    return compressed_chunk<T>{
+                        std::move(fitted_device_buffer),
+                        std::move(host_compressed_sizes),
+                        std::move(block_sizes),
+                        std::move(context)
+                    };
+                };
+
+
+                /// \brief Decompress a compressed_chunk directly into a preallocated CPU buffer.
+                ///
+                /// All blocks of \p chunk are decompressed on the GPU into one device buffer in which block `i`
+                /// starts at byte `i * chunk.context.block_size`; that buffer is then copied into \p output.
+                ///
+                /// \param chunk     The compressed data (blocks + sizes).
+                /// \param output    The preallocated span of memory where the uncompressed data
+                ///                  will be written. Its size is validated before any block is decompressed.
+                ///
+                /// \throws std::runtime_error if \p chunk is inconsistent or does not fit into \p output, and on
+                ///         CUDA or nvCOMP failure.
+                void decompress(const compressed_chunk<T>& chunk, std::span<T> output) const
+                {
+                    _COMPRESSED_PROFILE_FUNCTION();
+                    nvcomp_context context = chunk.context;
+                    device_guard guard(context.gpu_device);
+                    cuda_api::instance().set_mem_pool_size(context.gpu_device);
+
+                    const size_t num_blocks = chunk.compressed_block_sizes.size();
+                    std::vector<size_t> block_sizes = chunk.uncompressed_block_sizes;
+
+                    // nvCOMP reads one uncompressed size per block; fewer would be read out of bounds.
+                    if (block_sizes.size() < num_blocks)
+                    {
+                        throw std::runtime_error(
+                            std::format(
+                                "decompress: chunk has {} compressed blocks but only {} uncompressed block sizes",
+                                num_blocks,
+                                block_sizes.size()
+                            )
+                        );
+                    }
+                    // Block i is written at byte i * block_size, so its end must lie inside the output.
+                    size_t required_bytes = 0;
+                    for (size_t i = 0; i < num_blocks; ++i)
+                    {
+                        required_bytes = std::max<size_t>(required_bytes, i * context.block_size + block_sizes[i]);
+                    }
+                    if (required_bytes > output.size() * sizeof(T))
+                    {
+                        throw std::runtime_error(
+                            std::format(
+                                "decompress: output holds {} bytes but the chunk decompresses to {} bytes",
+                                output.size() * sizeof(T),
+                                required_bytes
+                            )
+                        );
+                    }
+
+                    // Slice the flat compressed buffer back into per-block pointers via a running offset.
+                    std::vector<const void*> host_src_ptrs(num_blocks);
+                    const std::byte* base_compressed_ptr = chunk.compressed_data.get();
+                    size_t running_offset = 0;
+                    for (size_t i = 0; i < num_blocks; ++i)
+                    {
+                        host_src_ptrs[i] = static_cast<const void*>(base_compressed_ptr + running_offset);
+                        running_offset += chunk.compressed_block_sizes[i];
+                    }
+
+                    auto device_src_ptrs = cuda_device_buffer_async<const void*>::from_host(host_src_ptrs);
+                    auto device_src_bytes = cuda_device_buffer_async<size_t>::from_host(chunk.compressed_block_sizes);
+
+                    // One contiguous device buffer receives every block, spaced block_size bytes apart.
+                    auto device_output = make_device_buffer_async<T>(output.size());
+                    std::vector<void*> host_dst_ptrs(num_blocks);
+                    size_t offset_bytes = 0;
+                    for (size_t i = 0; i < num_blocks; ++i)
+                    {
+                        host_dst_ptrs[i] = reinterpret_cast<void*>(device_output.get() + (offset_bytes / sizeof(T)));
+                        offset_bytes += context.block_size;
+                    }
+                    auto device_dst_ptrs = cuda_device_buffer_async<void*>::from_host(host_dst_ptrs);
+                    auto device_dst_bytes = cuda_device_buffer_async<size_t>::from_host(block_sizes);
+                    auto scratch = this->generate_temp_buffer(context.block_size, num_blocks, context.decomp_options);
+                    auto device_statuses = make_device_buffer_async<nvcompStatus_t>(num_blocks);
+
+                    decompression_impl(
+                        num_blocks,
+                        device_src_ptrs,
+                        device_src_bytes,
+                        scratch,
+                        device_dst_ptrs,
+                        device_dst_bytes,
+                        device_statuses,
+                        context.decomp_options
+                    );
+
+                    // Queue the copy back to the host and wait for the stream before checking the statuses.
+                    cuda_api::instance().memcpy_async(
+                        static_cast<void*>(output.data()),
+                        device_output.get_raw(),
+                        output.size() * sizeof(T),
+                        cudaMemcpyDeviceToHost
+                    );
+                    cuda_api::instance().stream_synchronize(cudaStreamPerThread);
+                    compressor<T>::validate_per_block_statuses(device_statuses);
+                }
+
+                /// \brief The codec associated with the compressor.
+                ///
+                /// Pure virtual: every concrete compressor reports its own enums::codec value.
+                [[nodiscard]] virtual NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept = 0;
+
+                /// \brief The default compression options of the concrete compressor.
+                /// Pure virtual: implemented per codec.
+                [[nodiscard]] virtual compression_options default_compression_opts() const noexcept = 0;
+                /// \brief The default decompression options of the concrete compressor.
+                /// Pure virtual: implemented per codec.
+                [[nodiscard]] virtual decompression_options default_decompression_opts() const noexcept = 0;
+
+                /// \brief The max block size allowed for a given compressor. Implementation defined.
+                ///
+                /// fit_block_size() uses this value as the upper bound for a requested block size.
+                [[nodiscard]] virtual size_t max_block_size() const noexcept = 0;
+
+                /// \brief Clamp `block_size` to max_block_size() and round it down to a multiple of sizeof(T).
+                ///
+                /// The alignment to T lets a block be viewed as std::byte or as T without a partial trailing element.
+                /// The result is 0 if the clamped size is smaller than sizeof(T).
+                /// \param block_size The block size to fit.
+                [[nodiscard]] size_t fit_block_size(size_t block_size) const noexcept
+                {
+                    const size_t clamped = std::min(block_size, this->max_block_size());
+                    return clamped - (clamped % sizeof(T));
+                }
+
+            private
+            :
+                /// ##################################################################################
+                /// Pure virtual function, dependent on compressor.
+                /// ##################################################################################
+
+                /// \brief Retrieve the number of temporary device bytes needed for the compression/decompression procedure
+                ///
+                /// \param block_size The block size of one of the sub-streams
+                /// \param num_blocks The total number of blocks
+                /// \param options    The compression/decompression options for which to get the number of temporary
+                ///                   bytes; the held alternative tells implementations which of the two is queried.
+                ///
+                /// \return The number of temporary bytes; generate_temp_buffer() allocates a device buffer of this size.
+                virtual size_t get_temp_bytes(
+                    size_t block_size,
+                    size_t num_blocks,
+                    std::variant<compression_options, decompression_options> options
+                ) const = 0;
+
+                /// \brief Retrieve the maximum size required for compressing a single block for the given options
+                ///
+                /// \param block_size The size of a single block
+                /// \param options    The compression options which are used for compression.
+                ///
+                /// \return The upper bound for the size of one compressed block.
+                virtual size_t block_max_compressed_size(size_t block_size, compression_options& options) const = 0;
+
+
+                /// \brief Call the underlying compression implementation of the set of blocks.
+                ///
+                /// All memory allocation needs to happen before this point, as this assumes all of these buffers have
+                /// been allocated and filled.
+                /// The public compression routine synchronizes cudaStreamPerThread after this call and only then
+                /// uses the compressed sizes and validates the per-block statuses.
+                ///
+                /// \param block_size               The overall block size, all blocks except for the last should have
+                ///                                 this size.
+                /// \param num_blocks               The overall number of blocks
+                /// \param uncompressed_block_ptrs  The pointers to the start of each uncompressed block
+                /// \param uncompressed_block_sizes The size of each uncompressed block
+                /// \param scratch_space            The scratch bytes used by the compressor during compression.
+                /// \param compressed_block_ptrs    The pointers to the start of each (preallocated) compressed block,
+                ///                                 the implementation writes the compressed data to these blocks.
+                /// \param compressed_block_sizes   To be filled out by the implementation, the sizes of the compressed
+                ///                                 blocks.
+                /// \param block_statuses           The statuses per compressed block, these live on the GPU and must be
+                ///                                 copied back for introspection.
+                /// \param options                  The compression options to use, must be valid for the current
+                ///                                 compressor.
+                virtual void compression_impl(
+                    size_t block_size,
+                    size_t num_blocks,
+                    const cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+                    const cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+                    cuda_device_buffer_async<std::byte>& scratch_space,
+                    cuda_device_buffer_async<void*>& compressed_block_ptrs,
+                    cuda_device_buffer_async<size_t>& compressed_block_sizes,
+                    cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+                    const compression_options& options
+                ) const = 0;
+
+                /// \brief Low-level device decompression implementation.
+                ///
+                /// This function should be implemented by the derived class for a specific
+                /// compression algorithm (e.g., lz, zstd). It operates entirely on device
+                /// memory and writes decompressed data to the blocks referenced by `uncompressed_block_ptrs`.
+                /// decompress() prepares all buffers beforehand and synchronizes the stream afterwards.
+                ///
+                /// \param num_blocks               The overall number of blocks
+                /// \param compressed_block_ptrs    Device buffer containing pointers to compressed blocks.
+                /// \param compressed_block_sizes   Device buffer containing sizes of compressed blocks.
+                /// \param scratch_space            Temporary device buffer allocated for decompression.
+                /// \param uncompressed_block_ptrs  Device buffer of destination pointers, one per block, set up
+                ///                                 by the caller; the decompressed data is written through them.
+                /// \param uncompressed_block_sizes Device buffer with the size of each uncompressed block, uploaded
+                ///                                 by the caller from the chunk being decompressed.
+                /// \param block_statuses           The statuses per compressed block, these live on the GPU and must be
+                ///                                 copied back for introspection.
+                /// \param options                  Algorithm-specific decompression options.
+                virtual void decompression_impl(
+                    size_t num_blocks,
+                    const cuda_device_buffer_async<const void*>& compressed_block_ptrs,
+                    const cuda_device_buffer_async<size_t>& compressed_block_sizes,
+                    cuda_device_buffer_async<std::byte>& scratch_space,
+                    cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+                    cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+                    cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+                    const decompression_options& options
+                ) const = 0;
+
+            private
+            :
+                /// ##################################################################################
+                /// Generic functions across all compressors
+                /// ##################################################################################
+
+
+                /// \brief Validate all the errors per-block collected during compression/decompression and throw these
+                ///        as one aggregated exception.
+                ///
+                /// The statuses are copied into host memory and the stream is synchronized before they are read.
+                /// This function NEEDS to be called after the compression/decompression operations were submitted
+                /// and the stream synchronized, as otherwise these statuses are not yet guaranteed to be valid.
+                ///
+                /// \param device_statuses The device-memory held nvcompStatus_t vector.
+                ///
+                /// \throws std::runtime_error if any status differs from nvcompSuccess; the message lists every
+                ///         failed block together with its status, one block per line.
+                void validate_per_block_statuses(cuda_device_buffer_async<nvcompStatus_t>& device_statuses) const
+                {
+                    _COMPRESSED_PROFILE_FUNCTION();
+
+                    const auto block_count = device_statuses.size;
+                    auto host_statuses = NAMESPACE_COMPRESSED_IMAGE::cuda::make_host_mem<nvcompStatus_t>(block_count);
+
+                    cuda_api::instance().memcpy_async(
+                        host_statuses.get(),
+                        device_statuses.get(),
+                        device_statuses.bytes(),
+                        cudaMemcpyDeviceToHost
+                    );
+                    cuda_api::instance().stream_synchronize(cudaStreamPerThread);
+
+                    // Build the report in one pass: one line per failed block, separated by '\n'.
+                    size_t failed_blocks = 0;
+                    std::string joined_errors;
+                    for (size_t i = 0; i < block_count; ++i)
+                    {
+                        const auto status = host_statuses.get()[i];
+                        if (status == nvcompStatus_t::nvcompSuccess)
+                        {
+                            continue;
+                        }
+                        if (failed_blocks > 0)
+                        {
+                            joined_errors += '\n';
+                        }
+                        joined_errors += std::format(
+                            "block {} failed with nvcomp status: '{}'",
+                            i,
+                            cuda::util::status_t_to_string(status)
+                        );
+                        ++failed_blocks;
+                    }
+
+                    if (failed_blocks == 0)
+                    {
+                        return;
+                    }
+
+                    throw std::runtime_error(
+                        std::format(
+                            "compression/decompression failed for {} out of {} blocks:\n{}",
+                            failed_blocks,
+                            block_count,
+                            joined_errors
+                        )
+                    );
+                }
+
+
+                /// \brief Allocate the temporary device buffer the compressor uses internally.
+                ///
+                /// \param block_size The block size to use for compression
+                /// \param num_blocks The number of blocks to compress
+                /// \param options    The compression/decompression options that will be used
+                /// \return A device byte buffer sized by get_temp_bytes() for the same arguments.
+                cuda_device_buffer_async<std::byte> generate_temp_buffer(
+                    const size_t block_size,
+                    const size_t num_blocks,
+                    std::variant<compression_options, decompression_options> options
+                ) const
+                {
+                    _COMPRESSED_PROFILE_FUNCTION();
+                    return make_device_buffer_async<std::byte>(this->get_temp_bytes(block_size, num_blocks, options));
+                }
+
+                /// \brief Generate a device buffer containing pointers to the start of the individual blocks.
+                ///
+                /// Pointer `i` addresses byte `i * block_size` of the memory held by \p device_uncompressed_data.
+                /// \param device_uncompressed_data The device memory buffer which holds the uncompressed data
+                /// \param block_size The size of a single block in bytes
+                /// \param num_blocks The number of blocks \p device_uncompressed_data stores
+                static cuda_device_buffer_async<void*> generate_device_block_pointers(
+                    const cuda_device_buffer_async<T>& device_uncompressed_data,
+                    const size_t block_size,
+                    const size_t num_blocks
+                )
+                {
+                    _COMPRESSED_PROFILE_FUNCTION();
+                    std::vector<const void*> ptrs(num_blocks);
+
+                    auto device_base_ptr = static_cast<const char*>(device_uncompressed_data.get_raw());
+                    for (size_t i = 0; i < num_blocks; ++i)
+                    {
+                        ptrs[i] = device_base_ptr + block_size * i;
+                    }
+
+                    // Upload the host-side table. NOTE(review): `ptrs` is destroyed on return, so this relies on
+                    // memcpy_async being done reading pageable host memory once it returns - confirm.
+                    auto device_buffer = make_device_buffer_async<void*>(num_blocks);
+                    cuda_api::instance().memcpy_async(
+                        device_buffer.get_raw(),
+                        ptrs.data(),
+                        device_buffer.bytes(),
+                        cudaMemcpyHostToDevice
+                    );
+
+                    return device_buffer;
+                }
+
+                /// \brief Compute a vector of all the block sizes of the uncompressed data.
+                ///
+                /// All entries equal `block_size`; the last one is `num_bytes - block_size * (num_blocks - 1)`.
+                /// \param num_bytes  The total uncompressed bytes
+                /// \param block_size The block size, already fitted
+                /// \param num_blocks The number of blocks to generate; 0 yields an empty vector.
+                std::vector<size_t> generate_block_sizes(
+                    const size_t num_bytes,
+                    const size_t block_size,
+                    const size_t num_blocks
+                ) const
+                {
+                    _COMPRESSED_PROFILE_FUNCTION();
+                    if (num_blocks == 0)
+                    {
+                        return {};
+                    }
+                    std::vector<size_t> sizes(num_blocks, block_size);
+                    sizes.back() = num_bytes - block_size * (num_blocks - 1);
+                    return sizes;
+                }
+            };
+        }
+    } // namespace cuda
+} // namespace NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/compressors/cascaded.h b/compressed_image/include/compressed/cuda/compressors/cascaded.h
new file mode 100644
index 0000000..3e02f9c
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compressors/cascaded.h
@@ -0,0 +1,197 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+#include "compressed/cuda/nvcomp_hook.h"
+#include "compressed/cuda/compressors/base.h"
+#include "compressed/cuda/compressors/util.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE::cuda
+{
+    /// \brief nvCOMP batched cascaded GPU compressor for elements of type \p T.
+    ///
+    /// Supplies the codec-specific pieces (default options, size queries and the batched compress /
+    /// decompress launches on cudaStreamPerThread) on top of detail::compressor<T>.
+    template <typename T>
+    struct cascaded_compressor final : public detail::compressor<T>
+    {
+        /// \brief The codec tag of this compressor.
+        [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override
+        {
+            return NAMESPACE_COMPRESSED_IMAGE::enums::codec::cascaded_gpu;
+        }
+
+        /// \brief nvCOMP's default cascaded compression options with the data type derived from \p T.
+        [[nodiscard]] compression_options default_compression_opts() const noexcept override
+        {
+            auto opts = nvcompBatchedCascadedCompressDefaultOpts;
+            opts.type = util::to_nvcomp_type<T>();
+            // FLOAT16 is remapped to SHORT. NOTE(review): the reason is not visible here - confirm.
+            if (opts.type == nvcompType_t::NVCOMP_TYPE_FLOAT16)
+            {
+                opts.type = nvcompType_t::NVCOMP_TYPE_SHORT;
+            }
+            return opts;
+        }
+
+        /// \brief nvCOMP's default cascaded decompression options.
+        [[nodiscard]] decompression_options default_decompression_opts() const noexcept override
+        {
+            return nvcompBatchedCascadedDecompressDefaultOpts;
+        }
+
+        /// \brief The largest block size allowed for cascaded compression, as defined by nvCOMP.
+        [[nodiscard]] size_t max_block_size() const noexcept override
+        {
+            return nvcompCascadedCompressionMaxAllowedChunkSize;
+        }
+
+    private:
+        /// \brief Throw std::runtime_error reading "<what>: '<status>'" unless \p status is nvcompSuccess.
+        static void throw_on_error(const nvcompStatus_t status, const char* what)
+        {
+            if (status != nvcompStatus_t::nvcompSuccess)
+            {
+                throw std::runtime_error(std::format("{}: '{}'", what, util::status_t_to_string(status)));
+            }
+        }
+
+        /// \brief Scratch bytes nvCOMP needs to compress or decompress \p num_blocks blocks.
+        /// The alternative held by \p options selects the compression or the decompression query.
+        /// \throws std::runtime_error if nvCOMP reports an error for the query.
+        size_t get_temp_bytes(
+            size_t block_size,
+            size_t num_blocks,
+            std::variant<compression_options, decompression_options> options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            size_t temp_bytes{};
+            // Both queries also receive the combined size of all blocks.
+            const size_t total_bytes = block_size * num_blocks;
+
+            if (std::holds_alternative<compression_options>(options))
+            {
+                const auto status = nvcomp_api::instance().CascadedCompressGetTempSizeAsync(
+                    num_blocks,
+                    block_size,
+                    std::get<nvcompBatchedCascadedCompressOpts_t>(std::get<compression_options>(options)),
+                    &temp_bytes,
+                    total_bytes
+                );
+                throw_on_error(
+                    status,
+                    "cascaded: Unable to retrieve the scratch bytes required for compression due to nvcomp error"
+                );
+                return temp_bytes;
+            }
+
+            const auto status = nvcomp_api::instance().CascadedDecompressGetTempSizeAsync(
+                num_blocks,
+                block_size,
+                std::get<nvcompBatchedCascadedDecompressOpts_t>(std::get<decompression_options>(options)),
+                &temp_bytes,
+                total_bytes
+            );
+            throw_on_error(
+                status,
+                "cascaded: Unable to retrieve the scratch bytes required for decompression due to nvcomp error"
+            );
+            return temp_bytes;
+        }
+
+        /// \brief Largest size a single compressed block can take for blocks of size \p block_size.
+        ///
+        /// \throws std::runtime_error if nvCOMP reports an error for the query.
+        size_t block_max_compressed_size(size_t block_size, compression_options& options) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            size_t max_bytes = 0;
+            const auto status = nvcomp_api::instance().CascadedCompressGetMaxOutputChunkSize(
+                block_size,
+                std::get<nvcompBatchedCascadedCompressOpts_t>(options),
+                &max_bytes
+            );
+            throw_on_error(
+                status,
+                "cascaded: Unable to retrieve the maximum compressed size for a block due to nvcomp error"
+            );
+            return max_bytes;
+        }
+
+        /// \brief Queue the batched cascaded compression of all blocks on cudaStreamPerThread.
+        ///
+        /// \throws std::runtime_error if nvCOMP reports an error for the launch itself; the per-block
+        ///         outcome is written to \p block_statuses.
+        void compression_impl(
+            size_t block_size,
+            size_t num_blocks,
+            const cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+            const cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+            cuda_device_buffer_async<std::byte>& scratch_space,
+            cuda_device_buffer_async<void*>& compressed_block_ptrs,
+            cuda_device_buffer_async<size_t>& compressed_block_sizes,
+            cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+            const compression_options& options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            const auto status = nvcomp_api::instance().CascadedCompressAsync(
+                uncompressed_block_ptrs.get(),
+                uncompressed_block_sizes.get(),
+                block_size,
+                num_blocks,
+                scratch_space.get_raw(),
+                scratch_space.bytes(),
+                compressed_block_ptrs.get(),
+                compressed_block_sizes.get(),
+                std::get<nvcompBatchedCascadedCompressOpts_t>(options),
+                block_statuses.get(),
+                cudaStreamPerThread
+            );
+            throw_on_error(
+                status,
+                "cascaded: nvcompBatchedCascadedCompressAsync failed to launch due to nvcomp error"
+            );
+        }
+
+        /// \brief Queue the batched cascaded decompression of all blocks on cudaStreamPerThread.
+        ///
+        /// \throws std::runtime_error if nvCOMP reports an error for the launch itself.
+        void decompression_impl(
+            size_t num_blocks,
+            const cuda_device_buffer_async<const void*>& compressed_block_ptrs,
+            const cuda_device_buffer_async<size_t>& compressed_block_sizes,
+            cuda_device_buffer_async<std::byte>& scratch_space,
+            cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+            cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+            cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+            const decompression_options& options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            // Passed as the fourth argument below and never read back here. NOTE(review): destroyed on
+            // return while the queued work may still use it - relies on stream-ordered release; confirm.
+            auto block_sizes_out = cuda::make_device_buffer_async<size_t>(num_blocks);
+
+            const auto status = nvcomp_api::instance().CascadedDecompressAsync(
+                compressed_block_ptrs.get(),
+                compressed_block_sizes.get(),
+                uncompressed_block_sizes.get(),
+                block_sizes_out.get(),
+                num_blocks,
+                scratch_space.get_raw(),
+                scratch_space.bytes(),
+                uncompressed_block_ptrs.get(),
+                std::get<nvcompBatchedCascadedDecompressOpts_t>(options),
+                block_statuses.get(),
+                cudaStreamPerThread
+            );
+            throw_on_error(
+                status,
+                "cascaded: nvcompBatchedCascadedDecompressAsync failed to launch due to nvcomp error"
+            );
+        }
+    };
+} // namespace NAMESPACE_COMPRESSED_IMAGE::cuda
diff --git a/compressed_image/include/compressed/cuda/compressors/deflate.h b/compressed_image/include/compressed/cuda/compressors/deflate.h
new file mode 100644
index 0000000..bac2c4c
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compressors/deflate.h
@@ -0,0 +1,195 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+#include "compressed/cuda/nvcomp_hook.h"
+#include "compressed/cuda/compressors/base.h"
+#include "compressed/cuda/compressors/util.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        /// GPU Deflate codec whose size queries and launches go through the `Deflate*` entry points of `nvcomp_api`.
+        /// A status other than `nvcompSuccess` is rethrown as a `std::runtime_error` prefixed with "deflate:".
+        /// @tparam T Forwarded to `detail::compressor<T>`; not otherwise used inside this struct.
+        template <typename T>
+        struct deflate_compressor final : public detail::compressor<T>
+        {
+            /// @return The codec tag identifying this compressor, `enums::codec::deflate_gpu`.
+            [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override
+            {
+                return NAMESPACE_COMPRESSED_IMAGE::enums::codec::deflate_gpu;
+            }
+
+            /// @return nvcomp's `nvcompBatchedDeflateCompressDefaultOpts`.
+            [[nodiscard]] compression_options default_compression_opts() const noexcept override
+            {
+                // This defaults to a low compression, high throughput mode.
+                return nvcompBatchedDeflateCompressDefaultOpts;
+            }
+
+            /// @return nvcomp's `nvcompBatchedDeflateDecompressDefaultOpts`.
+            [[nodiscard]] decompression_options default_decompression_opts() const noexcept override
+            {
+                return nvcompBatchedDeflateDecompressDefaultOpts;
+            }
+
+            /// @return nvcomp's `nvcompDeflateCompressionMaxAllowedChunkSize`.
+            [[nodiscard]] size_t max_block_size() const noexcept override
+            {
+                return nvcompDeflateCompressionMaxAllowedChunkSize;
+            }
+
+        private:
+            /// Throws `std::runtime_error("deflate: <what> due to nvcomp error: '<status>'")` unless `status` is success.
+            static void throw_on_failure(nvcompStatus_t status, const char* what)
+            {
+                if (status != nvcompStatus_t::nvcompSuccess)
+                {
+                    throw std::runtime_error(
+                        std::format("deflate: {} due to nvcomp error: '{}'", what, util::status_t_to_string(status))
+                    );
+                }
+            }
+
+            /// Asks nvcomp how many scratch bytes a batch needs.
+            /// @param block_size Forwarded as the per-block size; `block_size * num_blocks` is forwarded as the total.
+            /// @param num_blocks Number of blocks in the batch.
+            /// @param options    The held alternative selects the compression or the decompression query.
+            /// @return The scratch size reported by nvcomp.
+            /// @throws std::runtime_error if the nvcomp query does not succeed.
+            size_t get_temp_bytes(
+                size_t block_size,
+                size_t num_blocks,
+                std::variant<compression_options, decompression_options> options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                size_t temp_bytes{};
+                if (std::holds_alternative<compression_options>(options))
+                {
+                    const auto status = nvcomp_api::instance().DeflateCompressGetTempSizeAsync(
+                        num_blocks,
+                        block_size,
+                        std::get<nvcompBatchedDeflateCompressOpts_t>(std::get<compression_options>(options)),
+                        &temp_bytes,
+                        block_size * num_blocks
+                    );
+                    throw_on_failure(status, "Unable to retrieve the scratch bytes required for compression");
+                    return temp_bytes;
+                }
+
+                const auto status = nvcomp_api::instance().DeflateDecompressGetTempSizeAsync(
+                    num_blocks,
+                    block_size,
+                    std::get<nvcompBatchedDeflateDecompressOpts_t>(std::get<decompression_options>(options)),
+                    &temp_bytes,
+                    block_size * num_blocks
+                );
+                throw_on_failure(status, "Unable to retrieve the scratch bytes required for decompression");
+                return temp_bytes;
+            }
+
+            /// Asks nvcomp for the worst-case compressed size of one block.
+            /// @param block_size Uncompressed block size forwarded to nvcomp.
+            /// @param options    Compression options; must hold `nvcompBatchedDeflateCompressOpts_t`.
+            /// @return The maximum output size reported by nvcomp.
+            /// @throws std::runtime_error if the nvcomp query does not succeed.
+            size_t block_max_compressed_size(size_t block_size, compression_options& options) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                size_t max_bytes = 0;
+                const auto status = nvcomp_api::instance().DeflateCompressGetMaxOutputChunkSize(
+                    block_size,
+                    std::get<nvcompBatchedDeflateCompressOpts_t>(options),
+                    &max_bytes
+                );
+                throw_on_failure(status, "Unable to retrieve the maximum compressed size for a block");
+                return max_bytes;
+            }
+
+            /// Enqueues batched compression on `cudaStreamPerThread`; only the launch status is checked here.
+            /// @param block_size               Per-block size forwarded to nvcomp.
+            /// @param num_blocks               Number of blocks in the batch.
+            /// @param uncompressed_block_ptrs  Input block pointers.
+            /// @param uncompressed_block_sizes Input block sizes.
+            /// @param scratch_space            Temporary workspace; its pointer and byte size are forwarded.
+            /// @param compressed_block_ptrs    Output block pointers.
+            /// @param compressed_block_sizes   Receives the compressed block sizes.
+            /// @param block_statuses           Per-block status buffer forwarded to nvcomp.
+            /// @param options                  Compression options; must hold `nvcompBatchedDeflateCompressOpts_t`.
+            /// @throws std::runtime_error if nvcomp fails to launch the operation.
+            void compression_impl(
+                size_t block_size,
+                size_t num_blocks,
+                const cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+                const cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+                cuda_device_buffer_async<std::byte>& scratch_space,
+                cuda_device_buffer_async<void*>& compressed_block_ptrs,
+                cuda_device_buffer_async<size_t>& compressed_block_sizes,
+                cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+                const compression_options& options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                const auto status = nvcomp_api::instance().DeflateCompressAsync(
+                    uncompressed_block_ptrs.get(),
+                    uncompressed_block_sizes.get(),
+                    block_size,
+                    num_blocks,
+                    scratch_space.get_raw(),
+                    scratch_space.bytes(),
+                    compressed_block_ptrs.get(),
+                    compressed_block_sizes.get(),
+                    std::get<nvcompBatchedDeflateCompressOpts_t>(options),
+                    block_statuses.get(),
+                    cudaStreamPerThread
+                );
+                throw_on_failure(status, "nvcompBatchedDeflateCompressAsync failed to launch");
+            }
+
+            /// Enqueues batched decompression on `cudaStreamPerThread`; only the launch status is checked here.
+            /// @param num_blocks               Number of blocks in the batch.
+            /// @param compressed_block_ptrs    Input block pointers.
+            /// @param compressed_block_sizes   Input block sizes.
+            /// @param scratch_space            Temporary workspace; its pointer and byte size are forwarded.
+            /// @param uncompressed_block_ptrs  Output block pointers.
+            /// @param uncompressed_block_sizes Forwarded as the capacity of each output block.
+            /// @param block_statuses           Per-block status buffer forwarded to nvcomp.
+            /// @param options                  Decompression options; must hold `nvcompBatchedDeflateDecompressOpts_t`.
+            /// @throws std::runtime_error if nvcomp fails to launch the operation.
+            void decompression_impl(
+                size_t num_blocks,
+                const cuda_device_buffer_async<const void*>& compressed_block_ptrs,
+                const cuda_device_buffer_async<size_t>& compressed_block_sizes,
+                cuda_device_buffer_async<std::byte>& scratch_space,
+                cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+                cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+                cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+                const decompression_options& options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                // Sink for the actual decompressed sizes; never read back. NOTE(review): confirm release is stream-ordered.
+                auto block_sizes_out = cuda::make_device_buffer_async<size_t>(num_blocks);
+
+                const auto status = nvcomp_api::instance().DeflateDecompressAsync(
+                    compressed_block_ptrs.get(),
+                    compressed_block_sizes.get(),
+                    uncompressed_block_sizes.get(),
+                    block_sizes_out.get(),
+                    num_blocks,
+                    scratch_space.get_raw(),
+                    scratch_space.bytes(),
+                    uncompressed_block_ptrs.get(),
+                    std::get<nvcompBatchedDeflateDecompressOpts_t>(options),
+                    block_statuses.get(),
+                    cudaStreamPerThread
+                );
+                throw_on_failure(status, "nvcompBatchedDeflateDecompressAsync failed to launch");
+            }
+        };
+    } // namespace cuda
+} // namespace NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/compressors/gdeflate.h b/compressed_image/include/compressed/cuda/compressors/gdeflate.h
new file mode 100644
index 0000000..59579e6
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compressors/gdeflate.h
@@ -0,0 +1,192 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+#include "compressed/cuda/nvcomp_hook.h"
+#include "compressed/cuda/compressors/base.h"
+#include "compressed/cuda/compressors/util.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE::cuda
+{
+    /// GPU GDeflate codec whose size queries and launches go through the `Gdeflate*` entry points of `nvcomp_api`.
+    /// A status other than `nvcompSuccess` is rethrown as a `std::runtime_error` prefixed with "gdeflate:".
+    /// @tparam T Forwarded to `detail::compressor<T>`; not otherwise used inside this struct.
+    template <typename T>
+    struct gdeflate_compressor final : public detail::compressor<T>
+    {
+        /// @return The codec tag identifying this compressor, `enums::codec::gdeflate_gpu`.
+        [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override
+        {
+            return NAMESPACE_COMPRESSED_IMAGE::enums::codec::gdeflate_gpu;
+        }
+
+        /// @return nvcomp's `nvcompBatchedGdeflateCompressDefaultOpts`.
+        [[nodiscard]] compression_options default_compression_opts() const noexcept override
+        {
+            // This defaults to a low compression, high throughput mode.
+            return nvcompBatchedGdeflateCompressDefaultOpts;
+        }
+
+        /// @return nvcomp's `nvcompBatchedGdeflateDecompressDefaultOpts`.
+        [[nodiscard]] decompression_options default_decompression_opts() const noexcept override
+        {
+            return nvcompBatchedGdeflateDecompressDefaultOpts;
+        }
+
+        /// @return nvcomp's `nvcompGdeflateCompressionMaxAllowedChunkSize`.
+        [[nodiscard]] size_t max_block_size() const noexcept override
+        {
+            return nvcompGdeflateCompressionMaxAllowedChunkSize;
+        }
+
+    private:
+        /// Throws `std::runtime_error("gdeflate: <what> due to nvcomp error: '<status>'")` unless `status` is success.
+        static void throw_on_failure(nvcompStatus_t status, const char* what)
+        {
+            if (status != nvcompStatus_t::nvcompSuccess)
+            {
+                throw std::runtime_error(
+                    std::format("gdeflate: {} due to nvcomp error: '{}'", what, util::status_t_to_string(status))
+                );
+            }
+        }
+
+        /// Asks nvcomp how many scratch bytes a batch needs.
+        /// @param block_size Forwarded as the per-block size; `block_size * num_blocks` is forwarded as the total.
+        /// @param num_blocks Number of blocks in the batch.
+        /// @param options    The held alternative selects the compression or the decompression query.
+        /// @return The scratch size reported by nvcomp.
+        /// @throws std::runtime_error if the nvcomp query does not succeed.
+        size_t get_temp_bytes(
+            size_t block_size,
+            size_t num_blocks,
+            std::variant<compression_options, decompression_options> options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            size_t temp_bytes{};
+            if (std::holds_alternative<compression_options>(options))
+            {
+                const auto status = nvcomp_api::instance().GdeflateCompressGetTempSizeAsync(
+                    num_blocks,
+                    block_size,
+                    std::get<nvcompBatchedGdeflateCompressOpts_t>(std::get<compression_options>(options)),
+                    &temp_bytes,
+                    block_size * num_blocks
+                );
+                throw_on_failure(status, "Unable to retrieve the scratch bytes required for compression");
+                return temp_bytes;
+            }
+
+            const auto status = nvcomp_api::instance().GdeflateDecompressGetTempSizeAsync(
+                num_blocks,
+                block_size,
+                std::get<nvcompBatchedGdeflateDecompressOpts_t>(std::get<decompression_options>(options)),
+                &temp_bytes,
+                block_size * num_blocks
+            );
+            throw_on_failure(status, "Unable to retrieve the scratch bytes required for decompression");
+            return temp_bytes;
+        }
+
+        /// Asks nvcomp for the worst-case compressed size of one block.
+        /// @param block_size Uncompressed block size forwarded to nvcomp.
+        /// @param options    Compression options; must hold `nvcompBatchedGdeflateCompressOpts_t`.
+        /// @return The maximum output size reported by nvcomp.
+        /// @throws std::runtime_error if the nvcomp query does not succeed.
+        size_t block_max_compressed_size(size_t block_size, compression_options& options) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            size_t max_bytes = 0;
+            const auto status = nvcomp_api::instance().GdeflateCompressGetMaxOutputChunkSize(
+                block_size,
+                std::get<nvcompBatchedGdeflateCompressOpts_t>(options),
+                &max_bytes
+            );
+            throw_on_failure(status, "Unable to retrieve the maximum compressed size for a block");
+            return max_bytes;
+        }
+
+        /// Enqueues batched compression on `cudaStreamPerThread`; only the launch status is checked here.
+        /// @param block_size               Per-block size forwarded to nvcomp.
+        /// @param num_blocks               Number of blocks in the batch.
+        /// @param uncompressed_block_ptrs  Input block pointers.
+        /// @param uncompressed_block_sizes Input block sizes.
+        /// @param scratch_space            Temporary workspace; its pointer and byte size are forwarded.
+        /// @param compressed_block_ptrs    Output block pointers.
+        /// @param compressed_block_sizes   Receives the compressed block sizes.
+        /// @param block_statuses           Per-block status buffer forwarded to nvcomp.
+        /// @param options                  Compression options; must hold `nvcompBatchedGdeflateCompressOpts_t`.
+        /// @throws std::runtime_error if nvcomp fails to launch the operation.
+        void compression_impl(
+            size_t block_size,
+            size_t num_blocks,
+            const cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+            const cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+            cuda_device_buffer_async<std::byte>& scratch_space,
+            cuda_device_buffer_async<void*>& compressed_block_ptrs,
+            cuda_device_buffer_async<size_t>& compressed_block_sizes,
+            cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+            const compression_options& options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            const auto status = nvcomp_api::instance().GdeflateCompressAsync(
+                uncompressed_block_ptrs.get(),
+                uncompressed_block_sizes.get(),
+                block_size,
+                num_blocks,
+                scratch_space.get_raw(),
+                scratch_space.bytes(),
+                compressed_block_ptrs.get(),
+                compressed_block_sizes.get(),
+                std::get<nvcompBatchedGdeflateCompressOpts_t>(options),
+                block_statuses.get(),
+                cudaStreamPerThread
+            );
+            throw_on_failure(status, "nvcompBatchedGdeflateCompressAsync failed to launch");
+        }
+
+        /// Enqueues batched decompression on `cudaStreamPerThread`; only the launch status is checked here.
+        /// @param num_blocks               Number of blocks in the batch.
+        /// @param compressed_block_ptrs    Input block pointers.
+        /// @param compressed_block_sizes   Input block sizes.
+        /// @param scratch_space            Temporary workspace; its pointer and byte size are forwarded.
+        /// @param uncompressed_block_ptrs  Output block pointers.
+        /// @param uncompressed_block_sizes Forwarded as the capacity of each output block.
+        /// @param block_statuses           Per-block status buffer forwarded to nvcomp.
+        /// @param options                  Decompression options; must hold `nvcompBatchedGdeflateDecompressOpts_t`.
+        /// @throws std::runtime_error if nvcomp fails to launch the operation.
+        void decompression_impl(
+            size_t num_blocks,
+            const cuda_device_buffer_async<const void*>& compressed_block_ptrs,
+            const cuda_device_buffer_async<size_t>& compressed_block_sizes,
+            cuda_device_buffer_async<std::byte>& scratch_space,
+            cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+            cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+            cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+            const decompression_options& options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            // Sink for the actual decompressed sizes; never read back. NOTE(review): confirm release is stream-ordered.
+            auto block_sizes_out = cuda::make_device_buffer_async<size_t>(num_blocks);
+
+            const auto status = nvcomp_api::instance().GdeflateDecompressAsync(
+                compressed_block_ptrs.get(),
+                compressed_block_sizes.get(),
+                uncompressed_block_sizes.get(),
+                block_sizes_out.get(),
+                num_blocks,
+                scratch_space.get_raw(),
+                scratch_space.bytes(),
+                uncompressed_block_ptrs.get(),
+                std::get<nvcompBatchedGdeflateDecompressOpts_t>(options),
+                block_statuses.get(),
+                cudaStreamPerThread
+            );
+            throw_on_failure(status, "nvcompBatchedGdeflateDecompressAsync failed to launch");
+        }
+    };
+} // namespace NAMESPACE_COMPRESSED_IMAGE::cuda
diff --git a/compressed_image/include/compressed/cuda/compressors/lz4.h b/compressed_image/include/compressed/cuda/compressors/lz4.h
new file mode 100644
index 0000000..eef6e97
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compressors/lz4.h
@@ -0,0 +1,198 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+#include "compressed/cuda/compressors/base.h"
+#include "compressed/cuda/compressors/util.h"
+#include "compressed/cuda/nvcomp_hook.h"
+
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE::cuda
+{
+    /// GPU LZ4 codec whose size queries and launches go through the `LZ4*` entry points of `nvcomp_api`.
+    /// A status other than `nvcompSuccess` is rethrown as a `std::runtime_error` prefixed with "lz4:".
+    /// @tparam T Element type; forwarded to `detail::compressor<T>` and used to pick the default `data_type`.
+    template <typename T>
+    struct lz4_compressor : public detail::compressor<T>
+    {
+        /// @return The codec tag identifying this compressor, `enums::codec::lz4_gpu`.
+        [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override
+        {
+            return NAMESPACE_COMPRESSED_IMAGE::enums::codec::lz4_gpu;
+        }
+
+        /// @return `nvcompBatchedLZ4CompressDefaultOpts` with `data_type` set from `T` (FLOAT16 is replaced by SHORT).
+        [[nodiscard]] compression_options default_compression_opts() const noexcept override
+        {
+            auto opts = nvcompBatchedLZ4CompressDefaultOpts;
+            opts.data_type = util::to_nvcomp_type<T>();
+            if (opts.data_type == nvcompType_t::NVCOMP_TYPE_FLOAT16)
+            {
+                opts.data_type = nvcompType_t::NVCOMP_TYPE_SHORT;
+            }
+            return opts;
+        }
+
+        /// @return nvcomp's `nvcompBatchedLZ4DecompressDefaultOpts`.
+        [[nodiscard]] decompression_options default_decompression_opts() const noexcept override
+        {
+            return nvcompBatchedLZ4DecompressDefaultOpts;
+        }
+
+        /// @return nvcomp's `nvcompLZ4CompressionMaxAllowedChunkSize`.
+        [[nodiscard]] size_t max_block_size() const noexcept override
+        {
+            return nvcompLZ4CompressionMaxAllowedChunkSize;
+        }
+
+    private:
+        /// Throws `std::runtime_error("lz4: <what> due to nvcomp error: '<status>'")` unless `status` is success.
+        static void throw_on_failure(nvcompStatus_t status, const char* what)
+        {
+            if (status != nvcompStatus_t::nvcompSuccess)
+            {
+                throw std::runtime_error(
+                    std::format("lz4: {} due to nvcomp error: '{}'", what, util::status_t_to_string(status))
+                );
+            }
+        }
+
+        /// Asks nvcomp how many scratch bytes a batch needs.
+        /// @param block_size Forwarded as the per-block size; `block_size * num_blocks` is forwarded as the total.
+        /// @param num_blocks Number of blocks in the batch.
+        /// @param options    The held alternative selects the compression or the decompression query.
+        /// @return The scratch size reported by nvcomp.
+        /// @throws std::runtime_error if the nvcomp query does not succeed.
+        size_t get_temp_bytes(
+            size_t block_size,
+            size_t num_blocks,
+            std::variant<compression_options, decompression_options> options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            size_t temp_bytes{};
+            if (std::holds_alternative<compression_options>(options))
+            {
+                const auto status = nvcomp_api::instance().LZ4CompressGetTempSizeAsync(
+                    num_blocks,
+                    block_size,
+                    std::get<nvcompBatchedLZ4CompressOpts_t>(std::get<compression_options>(options)),
+                    &temp_bytes,
+                    block_size * num_blocks
+                );
+                throw_on_failure(status, "Unable to retrieve the scratch bytes required for compression");
+                return temp_bytes;
+            }
+
+            const auto status = nvcomp_api::instance().LZ4DecompressGetTempSizeAsync(
+                num_blocks,
+                block_size,
+                std::get<nvcompBatchedLZ4DecompressOpts_t>(std::get<decompression_options>(options)),
+                &temp_bytes,
+                block_size * num_blocks
+            );
+            throw_on_failure(status, "Unable to retrieve the scratch bytes required for decompression");
+            return temp_bytes;
+        }
+
+        /// Asks nvcomp for the worst-case compressed size of one block.
+        /// @param block_size Uncompressed block size forwarded to nvcomp.
+        /// @param options    Compression options; must hold `nvcompBatchedLZ4CompressOpts_t`.
+        /// @return The maximum output size reported by nvcomp.
+        /// @throws std::runtime_error if the nvcomp query does not succeed.
+        size_t block_max_compressed_size(size_t block_size, compression_options& options) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            size_t max_bytes = 0;
+            const auto status = nvcomp_api::instance().LZ4CompressGetMaxOutputChunkSize(
+                block_size,
+                std::get<nvcompBatchedLZ4CompressOpts_t>(options),
+                &max_bytes
+            );
+            throw_on_failure(status, "Unable to retrieve the maximum compressed size for a block");
+            return max_bytes;
+        }
+
+        /// Enqueues batched compression on `cudaStreamPerThread`; only the launch status is checked here.
+        /// @param block_size               Per-block size forwarded to nvcomp.
+        /// @param num_blocks               Number of blocks in the batch.
+        /// @param uncompressed_block_ptrs  Input block pointers.
+        /// @param uncompressed_block_sizes Input block sizes.
+        /// @param scratch_space            Temporary workspace; its pointer and byte size are forwarded.
+        /// @param compressed_block_ptrs    Output block pointers.
+        /// @param compressed_block_sizes   Receives the compressed block sizes.
+        /// @param block_statuses           Per-block status buffer forwarded to nvcomp.
+        /// @param options                  Compression options; must hold `nvcompBatchedLZ4CompressOpts_t`.
+        /// @throws std::runtime_error if nvcomp fails to launch the operation.
+        void compression_impl(
+            size_t block_size,
+            size_t num_blocks,
+            const cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+            const cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+            cuda_device_buffer_async<std::byte>& scratch_space,
+            cuda_device_buffer_async<void*>& compressed_block_ptrs,
+            cuda_device_buffer_async<size_t>& compressed_block_sizes,
+            cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+            const compression_options& options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            const auto status = nvcomp_api::instance().LZ4CompressAsync(
+                uncompressed_block_ptrs.get(),
+                uncompressed_block_sizes.get(),
+                block_size,
+                num_blocks,
+                scratch_space.get_raw(),
+                scratch_space.bytes(),
+                compressed_block_ptrs.get(),
+                compressed_block_sizes.get(),
+                std::get<nvcompBatchedLZ4CompressOpts_t>(options),
+                block_statuses.get(),
+                cudaStreamPerThread
+            );
+            throw_on_failure(status, "nvcompBatchedLZ4CompressAsync failed to launch");
+        }
+
+        /// Enqueues batched decompression on `cudaStreamPerThread`; only the launch status is checked here.
+        /// @param num_blocks               Number of blocks in the batch.
+        /// @param compressed_block_ptrs    Input block pointers.
+        /// @param compressed_block_sizes   Input block sizes.
+        /// @param scratch_space            Temporary workspace; its pointer and byte size are forwarded.
+        /// @param uncompressed_block_ptrs  Output block pointers.
+        /// @param uncompressed_block_sizes Forwarded as the capacity of each output block.
+        /// @param block_statuses           Per-block status buffer forwarded to nvcomp.
+        /// @param options                  Decompression options; must hold `nvcompBatchedLZ4DecompressOpts_t`.
+        /// @throws std::runtime_error if nvcomp fails to launch the operation.
+        void decompression_impl(
+            size_t num_blocks,
+            const cuda_device_buffer_async<const void*>& compressed_block_ptrs,
+            const cuda_device_buffer_async<size_t>& compressed_block_sizes,
+            cuda_device_buffer_async<std::byte>& scratch_space,
+            cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+            cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+            cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+            const decompression_options& options
+        ) const override
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            // Sink for the actual decompressed sizes; never read back. NOTE(review): confirm release is stream-ordered.
+            auto block_sizes_out = cuda::make_device_buffer_async<size_t>(num_blocks);
+
+            const auto status = nvcomp_api::instance().LZ4DecompressAsync(
+                compressed_block_ptrs.get(),
+                compressed_block_sizes.get(),
+                uncompressed_block_sizes.get(),
+                block_sizes_out.get(),
+                num_blocks,
+                scratch_space.get_raw(),
+                scratch_space.bytes(),
+                uncompressed_block_ptrs.get(),
+                std::get<nvcompBatchedLZ4DecompressOpts_t>(options),
+                block_statuses.get(),
+                cudaStreamPerThread
+            );
+            throw_on_failure(status, "nvcompBatchedLZ4DecompressAsync failed to launch");
+        }
+    };
+} // namespace NAMESPACE_COMPRESSED_IMAGE::cuda
diff --git a/compressed_image/include/compressed/cuda/compressors/snappy.h b/compressed_image/include/compressed/cuda/compressors/snappy.h
new file mode 100644
index 0000000..62986e5
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compressors/snappy.h
@@ -0,0 +1,194 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+#include "compressed/cuda/nvcomp_hook.h"
+#include "compressed/cuda/compressors/base.h"
+#include "compressed/cuda/compressors/util.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        /// GPU Snappy codec whose size queries and launches go through the `Snappy*` entry points of `nvcomp_api`.
+        /// A status other than `nvcompSuccess` is rethrown as a `std::runtime_error` prefixed with "snappy:".
+        /// @tparam T Forwarded to `detail::compressor<T>`; not otherwise used inside this struct.
+        template <typename T>
+        struct snappy_compressor final : public detail::compressor<T>
+        {
+            /// @return The codec tag identifying this compressor, `enums::codec::snappy_gpu`.
+            [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override
+            {
+                return NAMESPACE_COMPRESSED_IMAGE::enums::codec::snappy_gpu;
+            }
+
+            /// @return nvcomp's `nvcompBatchedSnappyCompressDefaultOpts`.
+            [[nodiscard]] compression_options default_compression_opts() const noexcept override
+            {
+                return nvcompBatchedSnappyCompressDefaultOpts;
+            }
+
+            /// @return nvcomp's `nvcompBatchedSnappyDecompressDefaultOpts`.
+            [[nodiscard]] decompression_options default_decompression_opts() const noexcept override
+            {
+                return nvcompBatchedSnappyDecompressDefaultOpts;
+            }
+
+            /// @return nvcomp's `nvcompSnappyCompressionMaxAllowedChunkSize`.
+            [[nodiscard]] size_t max_block_size() const noexcept override
+            {
+                return nvcompSnappyCompressionMaxAllowedChunkSize;
+            }
+
+        private:
+            /// Throws `std::runtime_error("snappy: <what> due to nvcomp error: '<status>'")` unless `status` is success.
+            static void throw_on_failure(nvcompStatus_t status, const char* what)
+            {
+                if (status != nvcompStatus_t::nvcompSuccess)
+                {
+                    throw std::runtime_error(
+                        std::format("snappy: {} due to nvcomp error: '{}'", what, util::status_t_to_string(status))
+                    );
+                }
+            }
+
+            /// Asks nvcomp how many scratch bytes a batch needs.
+            /// @param block_size Forwarded as the per-block size; `block_size * num_blocks` is forwarded as the total.
+            /// @param num_blocks Number of blocks in the batch.
+            /// @param options    The held alternative selects the compression or the decompression query.
+            /// @return The scratch size reported by nvcomp.
+            /// @throws std::runtime_error if the nvcomp query does not succeed.
+            size_t get_temp_bytes(
+                size_t block_size,
+                size_t num_blocks,
+                std::variant<compression_options, decompression_options> options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                size_t temp_bytes{};
+                if (std::holds_alternative<compression_options>(options))
+                {
+                    const auto status = nvcomp_api::instance().SnappyCompressGetTempSizeAsync(
+                        num_blocks,
+                        block_size,
+                        std::get<nvcompBatchedSnappyCompressOpts_t>(std::get<compression_options>(options)),
+                        &temp_bytes,
+                        block_size * num_blocks
+                    );
+                    throw_on_failure(status, "Unable to retrieve the scratch bytes required for compression");
+                    return temp_bytes;
+                }
+
+                const auto status = nvcomp_api::instance().SnappyDecompressGetTempSizeAsync(
+                    num_blocks,
+                    block_size,
+                    std::get<nvcompBatchedSnappyDecompressOpts_t>(std::get<decompression_options>(options)),
+                    &temp_bytes,
+                    block_size * num_blocks
+                );
+                throw_on_failure(status, "Unable to retrieve the scratch bytes required for decompression");
+                return temp_bytes;
+            }
+
+            /// Asks nvcomp for the worst-case compressed size of one block.
+            /// @param block_size Uncompressed block size forwarded to nvcomp.
+            /// @param options    Compression options; must hold `nvcompBatchedSnappyCompressOpts_t`.
+            /// @return The maximum output size reported by nvcomp.
+            /// @throws std::runtime_error if the nvcomp query does not succeed.
+            size_t block_max_compressed_size(size_t block_size, compression_options& options) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                size_t max_bytes = 0;
+                const auto status = nvcomp_api::instance().SnappyCompressGetMaxOutputChunkSize(
+                    block_size,
+                    std::get<nvcompBatchedSnappyCompressOpts_t>(options),
+                    &max_bytes
+                );
+                throw_on_failure(status, "Unable to retrieve the maximum compressed size for a block");
+                return max_bytes;
+            }
+
+            /// Enqueues batched compression on `cudaStreamPerThread`; only the launch status is checked here.
+            /// @param block_size               Per-block size forwarded to nvcomp.
+            /// @param num_blocks               Number of blocks in the batch.
+            /// @param uncompressed_block_ptrs  Input block pointers.
+            /// @param uncompressed_block_sizes Input block sizes.
+            /// @param scratch_space            Temporary workspace; its pointer and byte size are forwarded.
+            /// @param compressed_block_ptrs    Output block pointers.
+            /// @param compressed_block_sizes   Receives the compressed block sizes.
+            /// @param block_statuses           Per-block status buffer forwarded to nvcomp.
+            /// @param options                  Compression options; must hold `nvcompBatchedSnappyCompressOpts_t`.
+            /// @throws std::runtime_error if nvcomp fails to launch the operation.
+            void compression_impl(
+                size_t block_size,
+                size_t num_blocks,
+                const cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+                const cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+                cuda_device_buffer_async<std::byte>& scratch_space,
+                cuda_device_buffer_async<void*>& compressed_block_ptrs,
+                cuda_device_buffer_async<size_t>& compressed_block_sizes,
+                cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+                const compression_options& options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                const auto status = nvcomp_api::instance().SnappyCompressAsync(
+                    uncompressed_block_ptrs.get(),
+                    uncompressed_block_sizes.get(),
+                    block_size,
+                    num_blocks,
+                    scratch_space.get_raw(),
+                    scratch_space.bytes(),
+                    compressed_block_ptrs.get(),
+                    compressed_block_sizes.get(),
+                    std::get<nvcompBatchedSnappyCompressOpts_t>(options),
+                    block_statuses.get(),
+                    cudaStreamPerThread
+                );
+                throw_on_failure(status, "nvcompBatchedSnappyCompressAsync failed to launch");
+            }
+
+            /// Enqueues batched decompression on `cudaStreamPerThread`; only the launch status is checked here.
+            /// @param num_blocks               Number of blocks in the batch.
+            /// @param compressed_block_ptrs    Input block pointers.
+            /// @param compressed_block_sizes   Input block sizes.
+            /// @param scratch_space            Temporary workspace; its pointer and byte size are forwarded.
+            /// @param uncompressed_block_ptrs  Output block pointers.
+            /// @param uncompressed_block_sizes Forwarded as the capacity of each output block.
+            /// @param block_statuses           Per-block status buffer forwarded to nvcomp.
+            /// @param options                  Decompression options; must hold `nvcompBatchedSnappyDecompressOpts_t`.
+            /// @throws std::runtime_error if nvcomp fails to launch the operation.
+            void decompression_impl(
+                size_t num_blocks,
+                const cuda_device_buffer_async<const void*>& compressed_block_ptrs,
+                const cuda_device_buffer_async<size_t>& compressed_block_sizes,
+                cuda_device_buffer_async<std::byte>& scratch_space,
+                cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+                cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+                cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+                const decompression_options& options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                // Sink for the actual decompressed sizes; never read back. NOTE(review): confirm release is stream-ordered.
+                auto block_sizes_out = cuda::make_device_buffer_async<size_t>(num_blocks);
+
+                const auto status = nvcomp_api::instance().SnappyDecompressAsync(
+                    compressed_block_ptrs.get(),
+                    compressed_block_sizes.get(),
+                    uncompressed_block_sizes.get(),
+                    block_sizes_out.get(),
+                    num_blocks,
+                    scratch_space.get_raw(),
+                    scratch_space.bytes(),
+                    uncompressed_block_ptrs.get(),
+                    std::get<nvcompBatchedSnappyDecompressOpts_t>(options),
+                    block_statuses.get(),
+                    cudaStreamPerThread
+                );
+                throw_on_failure(status, "nvcompBatchedSnappyDecompressAsync failed to launch");
+            }
+        };
+    } // namespace cuda
+} // namespace NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/compressors/util.h b/compressed_image/include/compressed/cuda/compressors/util.h
new file mode 100644
index 0000000..e32ce25
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compressors/util.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+#include <nvcomp.h>
+#ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE
+#include <OpenImageIO/imageio.h>
+#endif
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        namespace util
+        {
+            /// \brief Map a C++ element type onto the nvcompType_t tag handed to nvcomp.
+            ///
+            /// Integer types map to the correspondingly named nvcomp tag. `float` falls back to NVCOMP_TYPE_UINT and
+            /// `double` to NVCOMP_TYPE_ULONGLONG. When COMPRESSED_IMAGE_OIIO_AVAILABLE is defined `Imath::half` maps
+            /// to NVCOMP_TYPE_FLOAT16. Every other type yields NVCOMP_TYPE_BITS.
+            ///
+            /// \tparam T The element type to look up.
+            /// \return The nvcompType_t tag selected for \p T.
+            template <typename T>
+            constexpr nvcompType_t to_nvcomp_type()
+            {
+                // One flag per nvcomp integer tag: true when T is exactly one of the listed spellings.
+                constexpr bool as_char = std::is_same_v<T, char> || std::is_same_v<T, int8_t>;
+                constexpr bool as_uchar = std::is_same_v<T, unsigned char> || std::is_same_v<T, uint8_t>;
+                constexpr bool as_short = std::is_same_v<T, short> || std::is_same_v<T, int16_t>;
+                constexpr bool as_ushort = std::is_same_v<T, unsigned short> || std::is_same_v<T, uint16_t>;
+                constexpr bool as_int = std::is_same_v<T, int> || std::is_same_v<T, int32_t>;
+                constexpr bool as_uint = std::is_same_v<T, unsigned int> || std::is_same_v<T, uint32_t>;
+                constexpr bool as_longlong = std::is_same_v<T, long long> || std::is_same_v<T, int64_t>;
+                constexpr bool as_ulonglong = std::is_same_v<T, unsigned long long> || std::is_same_v<T, uint64_t>;
+
+                if constexpr (as_char)
+                {
+                    return NVCOMP_TYPE_CHAR;
+                }
+                else if constexpr (as_uchar)
+                {
+                    return NVCOMP_TYPE_UCHAR;
+                }
+                else if constexpr (as_short)
+                {
+                    return NVCOMP_TYPE_SHORT;
+                }
+                else if constexpr (as_ushort)
+                {
+                    return NVCOMP_TYPE_USHORT;
+                }
+                else if constexpr (as_int)
+                {
+                    return NVCOMP_TYPE_INT;
+                }
+                else if constexpr (as_uint)
+                {
+                    return NVCOMP_TYPE_UINT;
+                }
+                else if constexpr (as_longlong)
+                {
+                    return NVCOMP_TYPE_LONGLONG;
+                }
+                else if constexpr (as_ulonglong)
+                {
+                    return NVCOMP_TYPE_ULONGLONG;
+                }
+#ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE
+                else if constexpr (std::is_same_v<T, Imath::half>)
+                {
+                    return NVCOMP_TYPE_FLOAT16;
+                }
+#endif
+                else if constexpr (std::is_same_v<T, float>)
+                {
+                    return NVCOMP_TYPE_UINT; // fallback: map float -> uint
+                }
+                else if constexpr (std::is_same_v<T, double>)
+                {
+                    return NVCOMP_TYPE_ULONGLONG; // fallback: map double -> ulonglong
+                }
+                else
+                {
+                    return NVCOMP_TYPE_BITS; // fallback default
+                }
+            }
+
+
+            /// \brief Convert a nvcompStatus_t object into a human-readable string for printing.
+            /// \param status The status to convert
+            /// \return A short description such as "success" or "bad checksum"; "unknown error" for any value
+            ///         that has no entry in the table below.
+            constexpr inline std::string_view status_t_to_string(const nvcompStatus_t& status) noexcept
+            {
+                // Pairs a status code with the text reported for it.
+                struct status_text
+                {
+                    nvcompStatus_t code;
+                    std::string_view text;
+                };
+
+                constexpr status_text known_statuses[] = {
+                    {nvcompStatus_t::nvcompSuccess, "success"},
+                    {nvcompStatus_t::nvcompErrorInvalidValue, "invalid value"},
+                    {nvcompStatus_t::nvcompErrorNotSupported, "not supported"},
+                    {nvcompStatus_t::nvcompErrorCannotDecompress, "cannot decompress"},
+                    {nvcompStatus_t::nvcompErrorBadChecksum, "bad checksum"},
+                    {nvcompStatus_t::nvcompErrorCannotVerifyChecksums, "cannot verify checksums"},
+                    {nvcompStatus_t::nvcompErrorOutputBufferTooSmall, "output buffer too small"},
+                    {nvcompStatus_t::nvcompErrorWrongHeaderLength, "wrong header length"},
+                    {nvcompStatus_t::nvcompErrorAlignment, "alignment error"},
+                    {nvcompStatus_t::nvcompErrorChunkSizeTooLarge, "chunk size too large"},
+                    {nvcompStatus_t::nvcompErrorCannotCompress, "cannot compress"},
+                    {nvcompStatus_t::nvcompErrorWrongInputLength, "wrong input length"},
+                    {nvcompStatus_t::nvcompErrorCudaError, "CUDA error"},
+                    {nvcompStatus_t::nvcompErrorInternal, "internal error"},
+                };
+
+                for (const status_text& entry : known_statuses)
+                {
+                    if (entry.code == status)
+                    {
+                        return entry.text;
+                    }
+                }
+                return "unknown error";
+            }
+        } // namespace util
+    } // namespace cuda
+} // namespace NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/compressors/zstd.h b/compressed_image/include/compressed/cuda/compressors/zstd.h
new file mode 100644
index 0000000..d3fb885
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/compressors/zstd.h
@@ -0,0 +1,194 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+#include "compressed/cuda/nvcomp_hook.h"
+#include "compressed/cuda/compressors/base.h"
+#include "compressed/cuda/compressors/util.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        /// \brief Zstandard implementation of detail::compressor<T> built on the Zstd entry points of nvcomp_api.
+        ///
+        /// Every nvcomp call made here is checked: a status other than nvcompSuccess is reported by throwing
+        /// std::runtime_error with a "zstd: ..." message that contains the textual nvcomp status.
+        ///
+        /// \tparam T Template argument forwarded to detail::compressor<T>.
+        template <typename T>
+        struct zstd_compressor final : public detail::compressor<T>
+        {
+            /// \brief Codec identifier of this compressor.
+            /// \return enums::codec::zstd_gpu.
+            [[nodiscard]] NAMESPACE_COMPRESSED_IMAGE::enums::codec codec() const noexcept override
+            {
+                return NAMESPACE_COMPRESSED_IMAGE::enums::codec::zstd_gpu;
+            }
+
+            /// \brief Default compression options.
+            /// \return nvcompBatchedZstdCompressDefaultOpts converted to compression_options.
+            [[nodiscard]] compression_options default_compression_opts() const noexcept override
+            {
+                return nvcompBatchedZstdCompressDefaultOpts;
+            }
+
+            /// \brief Default decompression options.
+            /// \return nvcompBatchedZstdDecompressDefaultOpts converted to decompression_options.
+            [[nodiscard]] decompression_options default_decompression_opts() const noexcept override
+            {
+                return nvcompBatchedZstdDecompressDefaultOpts;
+            }
+
+            /// \brief Largest block size accepted by this compressor.
+            /// \return nvcompZstdCompressionMaxAllowedChunkSize.
+            [[nodiscard]] size_t max_block_size() const noexcept override
+            {
+                return nvcompZstdCompressionMaxAllowedChunkSize;
+            }
+
+        private:
+            /// \brief Throw unless an nvcomp call succeeded.
+            /// \param status Status returned by the nvcomp call.
+            /// \param what Description of the operation, placed between "zstd: " and the nvcomp error text.
+            /// \throws std::runtime_error if \p status is not nvcompSuccess.
+            static void ensure_success(const nvcompStatus_t status, const char* what)
+            {
+                if (status == nvcompStatus_t::nvcompSuccess)
+                {
+                    return;
+                }
+
+                throw std::runtime_error(
+                    std::format("zstd: {} due to nvcomp error: '{}'", what, util::status_t_to_string(status))
+                );
+            }
+
+            /// \brief Ask nvcomp how many scratch bytes a batch needs.
+            ///
+            /// The alternative held by \p options selects between the compression and the decompression query.
+            /// Both queries receive `block_size * num_blocks` as their last argument.
+            ///
+            /// \param block_size Block size forwarded to nvcomp.
+            /// \param num_blocks Number of blocks forwarded to nvcomp.
+            /// \param options Compression or decompression options; must hold the Zstd option type.
+            /// \return The scratch size written by nvcomp.
+            /// \throws std::runtime_error if nvcomp does not return nvcompSuccess.
+            size_t get_temp_bytes(
+                size_t block_size,
+                size_t num_blocks,
+                std::variant<compression_options, decompression_options> options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                size_t temp_bytes{};
+
+                if (auto* compress_opts = std::get_if<compression_options>(&options))
+                {
+                    const auto status = nvcomp_api::instance().ZstdCompressGetTempSizeAsync(
+                        num_blocks,
+                        block_size,
+                        std::get<nvcompBatchedZstdCompressOpts_t>(*compress_opts),
+                        &temp_bytes,
+                        block_size * num_blocks
+                    );
+                    ensure_success(status, "Unable to retrieve the scratch bytes required for compression");
+                }
+                else
+                {
+                    const auto status = nvcomp_api::instance().ZstdDecompressGetTempSizeAsync(
+                        num_blocks,
+                        block_size,
+                        std::get<nvcompBatchedZstdDecompressOpts_t>(std::get<decompression_options>(options)),
+                        &temp_bytes,
+                        block_size * num_blocks
+                    );
+                    ensure_success(status, "Unable to retrieve the scratch bytes required for decompression");
+                }
+
+                return temp_bytes;
+            }
+
+            /// \brief Ask nvcomp for the maximum compressed size of one block.
+            /// \param block_size Block size forwarded to nvcomp.
+            /// \param options Compression options; must hold nvcompBatchedZstdCompressOpts_t.
+            /// \return The size written by nvcomp.
+            /// \throws std::runtime_error if nvcomp does not return nvcompSuccess.
+            size_t block_max_compressed_size(size_t block_size, compression_options& options) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                size_t max_bytes = 0;
+                auto& zstd_opts = std::get<nvcompBatchedZstdCompressOpts_t>(options);
+                const auto status =
+                    nvcomp_api::instance().ZstdCompressGetMaxOutputChunkSize(block_size, zstd_opts, &max_bytes);
+                ensure_success(status, "Unable to retrieve the maximum compressed size for a block");
+
+                return max_bytes;
+            }
+
+            /// \brief Launch the batched Zstd compression on cudaStreamPerThread.
+            ///
+            /// All buffers are passed to nvcomp as raw pointers. Only the launch status is checked here; the
+            /// per-block results are written to \p block_statuses by nvcomp.
+            ///
+            /// \throws std::runtime_error if the launch does not return nvcompSuccess.
+            void compression_impl(
+                size_t block_size,
+                size_t num_blocks,
+                const cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+                const cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+                cuda_device_buffer_async<std::byte>& scratch_space,
+                cuda_device_buffer_async<void*>& compressed_block_ptrs,
+                cuda_device_buffer_async<size_t>& compressed_block_sizes,
+                cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+                const compression_options& options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                const auto& zstd_opts = std::get<nvcompBatchedZstdCompressOpts_t>(options);
+                const auto status = nvcomp_api::instance().ZstdCompressAsync(
+                    uncompressed_block_ptrs.get(),
+                    uncompressed_block_sizes.get(),
+                    block_size,
+                    num_blocks,
+                    scratch_space.get_raw(),
+                    scratch_space.bytes(),
+                    compressed_block_ptrs.get(),
+                    compressed_block_sizes.get(),
+                    zstd_opts,
+                    block_statuses.get(),
+                    cudaStreamPerThread
+                );
+                ensure_success(status, "nvcompBatchedZstdCompressAsync failed to launch");
+            }
+
+            /// \brief Launch the batched Zstd decompression on cudaStreamPerThread.
+            ///
+            /// A temporary device buffer of \p num_blocks size_t values is allocated and handed to nvcomp as an
+            /// additional output; it is not read back here. Only the launch status is checked.
+            ///
+            /// \throws std::runtime_error if the launch does not return nvcompSuccess.
+            void decompression_impl(
+                size_t num_blocks,
+                const cuda_device_buffer_async<const void*>& compressed_block_ptrs,
+                const cuda_device_buffer_async<size_t>& compressed_block_sizes,
+                cuda_device_buffer_async<std::byte>& scratch_space,
+                cuda_device_buffer_async<void*>& uncompressed_block_ptrs,
+                cuda_device_buffer_async<size_t>& uncompressed_block_sizes,
+                cuda_device_buffer_async<nvcompStatus_t>& block_statuses,
+                const decompression_options& options
+            ) const override
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                auto sizes_out = cuda::make_device_buffer_async<size_t>(num_blocks);
+                const auto& zstd_opts = std::get<nvcompBatchedZstdDecompressOpts_t>(options);
+
+                // NOTE(review): compression_impl passes scratch_space.bytes() for the scratch size while this call
+                // passes scratch_space.size; presumably both are equal for a std::byte buffer - confirm.
+                const auto status = nvcomp_api::instance().ZstdDecompressAsync(
+                    compressed_block_ptrs.get(),
+                    compressed_block_sizes.get(),
+                    uncompressed_block_sizes.get(),
+                    sizes_out.get(),
+                    num_blocks,
+                    scratch_space.get_raw(),
+                    scratch_space.size,
+                    uncompressed_block_ptrs.get(),
+                    zstd_opts,
+                    block_statuses.get(),
+                    cudaStreamPerThread
+                );
+                ensure_success(status, "nvcompBatchedZstdDecompressAsync failed to launch");
+            }
+        };
+    } // namespace cuda
+} // namespace NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/cuda_hook.h b/compressed_image/include/compressed/cuda/cuda_hook.h
new file mode 100644
index 0000000..1888e78
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/cuda_hook.h
@@ -0,0 +1,371 @@
+/*
+Dynamic function hook for CUDA that loads the runtime library at runtime and hooks various functions such as
+cudaMalloc, cudaFree, etc.
+
+Unfortunately, there doesn't seem to be an open source library that does this, so we do the minimal hooking here.
+
+Note: This header should only ever be included on a machine that also has the cuda libraries!
+*/
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <limits>
+
+#include <cuda_runtime.h>
+
+#include "compressed/logger.h"
+#include "compressed/macros.h"
+#include "compressed/cuda/proc_util.h"
+
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        /// \brief Singleton that loads the CUDA runtime library at runtime and wraps selected entry points.
+        ///
+        /// This allows calling CUDA functions like cudaMalloc/cudaFree without linking against CUDA at compile
+        /// time. If no library handle was obtained, available() is false and calls that reach CUDA throw
+        /// std::runtime_error; they also throw when the CUDA function does not return cudaSuccess.
+        ///
+        /// Usage:
+        ///
+        /// compressed::cuda::cuda_api::instance().malloc(ptr, size);
+        /// compressed::cuda::cuda_api::instance().free(ptr);
+        class cuda_api
+        {
+        public:
+            /// Access the singleton instance (a function-local static, constructed on first use)
+            static cuda_api& instance()
+            {
+                static cuda_api inst;
+                return inst;
+            }
+
+            // --- Runtime queries ---
+            bool available() const noexcept { return handle_ != nullptr; } // true while a library handle is held
+            int device_count() const;
+            int current_device() const;
+            void set_device(int device);
+            bool has_device() const; // false, with a logged warning, if the count query throws std::exception
+            cudaDeviceProp device_properties(int device) const;
+            int device_attribute(cudaDeviceAttr attr, int device) const;
+
+            // --- Memory management (the malloc variants write the new pointer into `ptr`) ---
+            void malloc(void*& ptr, size_t size) const;
+            void malloc_host(void*& ptr, size_t size) const;
+            void malloc_async(void*& ptr, size_t size, cudaStream_t stream = cudaStreamPerThread);
+            void free(void* ptr) const;
+            void free_host(void* ptr) const;
+            void free_async(void* ptr, cudaStream_t stream = cudaStreamPerThread);
+
+            // --- Page-locking (Pinning) ---
+            void host_register(void* ptr, size_t size, unsigned int flags = cudaHostRegisterDefault) const;
+            void host_unregister(void* ptr) const;
+
+            // --- Data transfer ---
+            void memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind);
+            void memcpy_async(
+                void* dst,
+                const void* src,
+                size_t count,
+                cudaMemcpyKind kind,
+                cudaStream_t stream = cudaStreamPerThread
+            );
+
+            // --- Streams & Pools (set_mem_pool_size sets the release threshold of the device's default pool) ---
+            void stream_synchronize(cudaStream_t stream) const;
+            void set_mem_pool_size(int device, uint64_t threshold = std::numeric_limits<uint64_t>::max());
+
+            // Non-copyable and non-movable
+            cuda_api(const cuda_api&) = delete;
+            cuda_api& operator=(const cuda_api&) = delete;
+            cuda_api(cuda_api&&) = delete;
+            cuda_api& operator=(cuda_api&&) = delete;
+
+        private:
+            // Private constructor: loads the library and resolves the function pointers declared below
+            cuda_api();
+            // Calls func(args...); throws std::runtime_error if func is null or the result is not cudaSuccess.
+            template <typename Func, typename... Args>
+            static void cuda_call(Func func, std::string_view func_name, Args&&... args);
+
+            // CUDA library handle
+            proc::library_handle handle_ = nullptr;
+
+            // --- Function pointer typedefs (malloc ones written by hand; presumably decltype is ambiguous - confirm) ---
+            using cuda_malloc_t = cudaError_t(*)(void**, size_t);
+            using cuda_malloc_async_t = cudaError_t(*)(void**, size_t, cudaStream_t);
+            using cuda_free_t = decltype(&cudaFree);
+            using cuda_free_async_t = decltype(&cudaFreeAsync);
+            using cuda_malloc_host_t = cudaError_t(*)(void**, size_t);
+            using cuda_free_host_t = decltype(&cudaFreeHost);
+
+            using cuda_host_register_t = decltype(&cudaHostRegister);
+            using cuda_host_unregister_t = decltype(&cudaHostUnregister);;
+
+            using cuda_memcpy_t = decltype(&cudaMemcpy);
+            using cuda_memcpy_async_t = decltype(&cudaMemcpyAsync);
+            using cuda_stream_sync_t = decltype(&cudaStreamSynchronize);
+
+            using cuda_get_mempool_t = decltype(&cudaDeviceGetDefaultMemPool);
+            using cuda_set_mempool_t = decltype(&cudaMemPoolSetAttribute);
+
+            using cuda_set_device_t = decltype(&cudaSetDevice);
+            using cuda_get_device_count_t = decltype(&cudaGetDeviceCount);
+            using cuda_get_props_t = decltype(&cudaGetDeviceProperties);
+            using cuda_get_device_t = decltype(&cudaGetDevice);
+            using cuda_get_attr_t = decltype(&cudaDeviceGetAttribute);
+
+            using cuda_get_error_str_t = decltype(&cudaGetErrorString);
+
+            // --- Function pointers (nullptr until resolved by the constructor) ---
+            cuda_malloc_t malloc_fn_ = nullptr;
+            cuda_malloc_host_t malloc_host_fn_ = nullptr;
+            cuda_malloc_async_t malloc_async_fn_ = nullptr;
+            cuda_free_t free_fn_ = nullptr;
+            cuda_free_host_t free_host_fn_ = nullptr;
+            cuda_free_async_t free_async_fn_ = nullptr;
+
+            cuda_host_register_t host_register_fn_ = nullptr;
+            cuda_host_unregister_t host_unregister_fn_ = nullptr;
+
+            cuda_memcpy_t memcpy_fn_ = nullptr;
+            cuda_memcpy_async_t memcpy_async_fn_ = nullptr;
+            cuda_stream_sync_t stream_sync_fn_ = nullptr;
+
+            cuda_get_mempool_t get_mempool_fn_ = nullptr;
+            cuda_set_mempool_t set_mempool_fn_ = nullptr;
+
+            cuda_set_device_t set_device_fn_ = nullptr;
+            cuda_get_device_count_t get_count_fn_ = nullptr;
+            cuda_get_props_t get_props_fn_ = nullptr;
+            cuda_get_device_t get_device_fn_ = nullptr;
+            cuda_get_attr_t get_attr_fn_ = nullptr;
+
+            cuda_get_error_str_t get_error_str_fn_ = nullptr;
+        };
+
+        // ===========================================================
+        // Implementation
+        // ===========================================================
+
+        /// \brief Query the number of CUDA devices through cudaGetDeviceCount.
+        /// \return The count written by the runtime.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline int cuda_api::device_count() const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            int num_devices{};
+            cuda_call(get_count_fn_, "cudaGetDeviceCount", &num_devices);
+            return num_devices;
+        }
+
+        /// \brief Query the active device index through cudaGetDevice.
+        /// \return The index written by the runtime.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline int cuda_api::current_device() const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            int active_device{-1};
+            cuda_call(get_device_fn_, "cudaGetDevice", &active_device);
+            return active_device;
+        }
+
+        /// \brief Make \p device the active device through cudaSetDevice.
+        /// \param device Device index forwarded to cudaSetDevice.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::set_device(int device)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            constexpr std::string_view entry_point = "cudaSetDevice";
+            cuda_call(set_device_fn_, entry_point, device);
+        }
+
+        /// \brief Check whether at least one CUDA device is reported.
+        ///
+        /// Returns false without querying when available() is false. A std::exception raised by device_count()
+        /// is logged as a warning and also results in false.
+        ///
+        /// \return true if device_count() succeeds and is greater than zero.
+        inline bool cuda_api::has_device() const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            if (available())
+            {
+                try
+                {
+                    return device_count() > 0;
+                }
+                catch (const std::exception& failure)
+                {
+                    NAMESPACE_COMPRESSED_IMAGE::get_logger()->warn(
+                        std::format(
+                            "Unhandled exception while trying to retrieve the cuda device count: {}",
+                            failure.what()
+                        )
+                    );
+                }
+            }
+
+            return false;
+        }
+
+        /// \brief Fetch the properties of one device through cudaGetDeviceProperties.
+        /// \param device Device index forwarded to the runtime.
+        /// \return The structure filled in by the runtime.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline cudaDeviceProp cuda_api::device_properties(int device) const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            cudaDeviceProp properties{};
+            cuda_call(get_props_fn_, "cudaGetDeviceProperties", &properties, device);
+            return properties;
+        }
+
+        /// \brief Read one device attribute through cudaDeviceGetAttribute.
+        /// \param attr Attribute to query.
+        /// \param device Device index forwarded to the runtime.
+        /// \return The attribute value written by the runtime.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline int cuda_api::device_attribute(cudaDeviceAttr attr, int device) const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            int attribute_value{};
+            cuda_call(get_attr_fn_, "cudaDeviceGetAttribute", &attribute_value, attr, device);
+            return attribute_value;
+        }
+
+        /// \brief Allocate memory through the dynamically loaded cudaMalloc.
+        /// \param ptr Receives the pointer written by cudaMalloc.
+        /// \param size Size forwarded to cudaMalloc.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::malloc(void*& ptr, size_t size) const
+        {
+            constexpr std::string_view entry_point = "cudaMalloc";
+            cuda_call(malloc_fn_, entry_point, &ptr, size);
+        }
+
+        /// \brief Allocate host memory through the dynamically loaded cudaMallocHost.
+        /// \param ptr Receives the pointer written by cudaMallocHost.
+        /// \param size Size forwarded to cudaMallocHost.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::malloc_host(void*& ptr, size_t size) const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            constexpr std::string_view entry_point = "cudaMallocHost";
+            cuda_call(malloc_host_fn_, entry_point, &ptr, size);
+        }
+
+        /// \brief Allocate memory through the dynamically loaded cudaMallocAsync.
+        /// \param ptr Receives the pointer written by cudaMallocAsync.
+        /// \param size Size forwarded to cudaMallocAsync.
+        /// \param stream Stream forwarded to cudaMallocAsync.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::malloc_async(void*& ptr, size_t size, cudaStream_t stream)
+        {
+            constexpr std::string_view entry_point = "cudaMallocAsync";
+            cuda_call(malloc_async_fn_, entry_point, &ptr, size, stream);
+        }
+
+        /// \brief Release memory through the dynamically loaded cudaFree.
+        /// \param ptr Pointer forwarded to cudaFree.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::free(void* ptr) const
+        {
+            constexpr std::string_view entry_point = "cudaFree";
+            cuda_call(free_fn_, entry_point, ptr);
+        }
+
+        /// \brief Release host memory through the dynamically loaded cudaFreeHost.
+        /// \param ptr Pointer forwarded to cudaFreeHost.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::free_host(void* ptr) const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            constexpr std::string_view entry_point = "cudaFreeHost";
+            cuda_call(free_host_fn_, entry_point, ptr);
+        }
+
+        /// \brief Release memory through the dynamically loaded cudaFreeAsync.
+        /// \param ptr Pointer forwarded to cudaFreeAsync.
+        /// \param stream Stream forwarded to cudaFreeAsync.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::free_async(void* ptr, cudaStream_t stream)
+        {
+            constexpr std::string_view entry_point = "cudaFreeAsync";
+            cuda_call(free_async_fn_, entry_point, ptr, stream);
+        }
+
+        /// \brief Page-lock (pin) a host range through the dynamically loaded cudaHostRegister.
+        /// \param ptr Start of the range forwarded to cudaHostRegister.
+        /// \param size Size forwarded to cudaHostRegister.
+        /// \param flags Flags forwarded to cudaHostRegister.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::host_register(void* ptr, size_t size, unsigned int flags) const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            constexpr std::string_view entry_point = "cudaHostRegister";
+            cuda_call(host_register_fn_, entry_point, ptr, size, flags);
+        }
+
+        /// \brief Undo host_register() through the dynamically loaded cudaHostUnregister.
+        /// \param ptr Pointer forwarded to cudaHostUnregister.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::host_unregister(void* ptr) const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            constexpr std::string_view entry_point = "cudaHostUnregister";
+            cuda_call(host_unregister_fn_, entry_point, ptr);
+        }
+
+        /// \brief Copy memory through the dynamically loaded cudaMemcpy.
+        /// \param dst Destination pointer forwarded to cudaMemcpy.
+        /// \param src Source pointer forwarded to cudaMemcpy.
+        /// \param count Size forwarded to cudaMemcpy.
+        /// \param kind Copy direction forwarded to cudaMemcpy.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
+        {
+            constexpr std::string_view entry_point = "cudaMemcpy";
+            cuda_call(memcpy_fn_, entry_point, dst, src, count, kind);
+        }
+
+        /// \brief Enqueue a copy through the dynamically loaded cudaMemcpyAsync.
+        /// \param dst Destination pointer forwarded to cudaMemcpyAsync.
+        /// \param src Source pointer forwarded to cudaMemcpyAsync.
+        /// \param count Size forwarded to cudaMemcpyAsync.
+        /// \param kind Copy direction forwarded to cudaMemcpyAsync.
+        /// \param stream Stream forwarded to cudaMemcpyAsync.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::memcpy_async(
+            void* dst,
+            const void* src,
+            size_t count,
+            cudaMemcpyKind kind,
+            cudaStream_t stream)
+        {
+            constexpr std::string_view entry_point = "cudaMemcpyAsync";
+            cuda_call(memcpy_async_fn_, entry_point, dst, src, count, kind, stream);
+        }
+
+        /// \brief Synchronize a stream through the dynamically loaded cudaStreamSynchronize.
+        /// \param stream Stream forwarded to cudaStreamSynchronize.
+        /// \throws std::runtime_error if the entry point is not loaded or the call does not return cudaSuccess.
+        inline void cuda_api::stream_synchronize(cudaStream_t stream) const
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            constexpr std::string_view entry_point = "cudaStreamSynchronize";
+            cuda_call(stream_sync_fn_, entry_point, stream);
+        }
+
+        /// \brief Set the release threshold of a device's default memory pool.
+        /// \param device Device whose default pool is fetched with cudaDeviceGetDefaultMemPool.
+        /// \param threshold Value stored as cudaMemPoolAttrReleaseThreshold via cudaMemPoolSetAttribute.
+        /// \throws std::runtime_error if an entry point is not loaded or a call does not return cudaSuccess.
+        inline void cuda_api::set_mem_pool_size(int device, uint64_t threshold)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            cudaMemPool_t default_pool{};
+            cuda_call(get_mempool_fn_, "cudaDeviceGetDefaultMemPool", &default_pool, device);
+
+            // The attribute setter receives the address of the by-value parameter.
+            uint64_t* const threshold_ptr = &threshold;
+            cuda_call(
+                set_mempool_fn_,
+                "cudaMemPoolSetAttribute",
+                default_pool,
+                cudaMemPoolAttrReleaseThreshold,
+                threshold_ptr
+            );
+        }
+
+        // --- private helpers ---
+        /// \brief Invoke a dynamically resolved CUDA function and convert failures into exceptions.
+        /// \tparam Func Function pointer type; must return cudaError_t when called with \p args.
+        /// \tparam Args Argument types, perfectly forwarded to \p func.
+        /// \param func The entry point to call; may be null if it was never resolved.
+        /// \param func_name Name used in the exception messages.
+        /// \param args Arguments forwarded to \p func.
+        /// \throws std::runtime_error if \p func is null or the call does not return cudaSuccess. In the latter
+        ///         case the message holds the cudaGetErrorString text when that entry point is loaded and returns
+        ///         a non-null string, otherwise a generic text with the numeric error code.
+        template <typename Func, typename... Args>
+        void cuda_api::cuda_call(Func func, std::string_view func_name, Args&&... args)
+        {
+            if (!func)
+            {
+                throw std::runtime_error(
+                    std::format(
+                        "CUDA function '{}' is unavailable (library or entrypoint not loaded).",
+                        func_name
+                    )
+                );
+            }
+
+            const cudaError_t err = func(std::forward<Args>(args)...);
+            if (err == cudaSuccess)
+            {
+                return;
+            }
+
+            // The error-string lookup is itself a dynamically loaded entry point and may be missing.
+            const auto describe = instance().get_error_str_fn_;
+            const char* raw_msg = describe ? describe(err) : nullptr;
+            const std::string detail = raw_msg
+                ? std::string(raw_msg)
+                : std::format("unknown error or missing driver string table. Cuda code {}", static_cast<int>(err));
+
+            throw std::runtime_error(std::format("{} failed: {}", func_name, detail));
+        }
+
+        inline cuda_api::cuda_api() // Loads the CUDA runtime library and resolves every wrapped entry point.
+        {
+#if defined(_WIN32)
+            // Targets the 64-bit Runtime API for CUDA 12
+            const std::string cuda_name = "cudart64_12.dll";
+#elif defined(__linux__)
+            const std::string cuda_name = "libcudart.so"; // NOTE(review): unversioned name - confirm it exists at runtime
+#else
+            const std::string cuda_name; // other platforms: an empty name is passed to load_library
+#endif
+            // Without a handle every function pointer keeps its nullptr default and available() stays false.
+            handle_ = proc::load_library(cuda_name);
+            if (!handle_) return;
+            // LOAD(fn, member): look up symbol "fn" in handle_ via proc::get_symbol and store it in `member`.
+#define LOAD(fn, member) member = proc::get_symbol<decltype(member)>(handle_, #fn, cuda_name)
+
+            LOAD(cudaMalloc, malloc_fn_);
+            LOAD(cudaMallocHost, malloc_host_fn_);
+            LOAD(cudaMallocAsync, malloc_async_fn_);
+            LOAD(cudaFree, free_fn_);
+            LOAD(cudaFreeHost, free_host_fn_);
+            LOAD(cudaFreeAsync, free_async_fn_);
+
+            LOAD(cudaHostRegister, host_register_fn_);
+            LOAD(cudaHostUnregister, host_unregister_fn_);
+
+            LOAD(cudaMemcpy, memcpy_fn_);
+            LOAD(cudaMemcpyAsync, memcpy_async_fn_);
+            LOAD(cudaStreamSynchronize, stream_sync_fn_);
+
+            LOAD(cudaDeviceGetDefaultMemPool, get_mempool_fn_);
+            LOAD(cudaMemPoolSetAttribute, set_mempool_fn_);
+
+            LOAD(cudaSetDevice, set_device_fn_);
+            LOAD(cudaGetDeviceCount, get_count_fn_);
+            LOAD(cudaGetDeviceProperties, get_props_fn_);
+            LOAD(cudaGetDevice, get_device_fn_);
+            LOAD(cudaDeviceGetAttribute, get_attr_fn_);
+
+            LOAD(cudaGetErrorString, get_error_str_fn_); // used by cuda_call to describe failures
+
+#undef LOAD
+        }
+    } // namespace cuda
+} // namespace NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/enums.h b/compressed_image/include/compressed/cuda/enums.h
new file mode 100644
index 0000000..78db4ce
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/enums.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+
+namespace NAMESPACE_COMPRESSED_IMAGE
+{
+
+	namespace cuda
+	{
+
+		namespace enums
+		{
+			/// \brief The storage location of a given compressing data buffer.
+			enum class storage_location
+			{
+				device, ///< Data is stored on the device (gpu) and only pulled back to the cpu when accessed.
+				host	///< Data is stored on the host (cpu), incurring an additional cost for copying memory back and forth.
+			};
+
+		} // namespace enums
+
+	} // namespace cuda
+
+} // namespace NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
diff --git a/compressed_image/include/compressed/cuda/exceptions.h b/compressed_image/include/compressed/cuda/exceptions.h
new file mode 100644
index 0000000..1ef54c4
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/exceptions.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <stdexcept>
+#include <string_view>
+
+#include "compressed/macros.h"
+
+
+namespace NAMESPACE_COMPRESSED_IMAGE
+{
+
+	namespace cuda
+	{
+
+		/// \brief Exception thrown when a CUDA library cannot be loaded.
+		///
+		/// Derives from std::runtime_error, so what() returns the message given at construction.
+		class library_not_found : public std::runtime_error
+		{
+		public:
+			/// \param msg Message text; its characters are copied into the exception.
+			explicit library_not_found(std::string_view msg)
+				: std::runtime_error(std::string(msg.begin(), msg.end()))
+			{
+			}
+		};
+
+		/// \brief Exception thrown when a CUDA function cannot be found in the library.
+		///
+		/// Derives from std::runtime_error, so what() returns the message given at construction.
+		class symbol_not_found : public std::runtime_error
+		{
+		public:
+			/// \param msg Message text; its characters are copied into the exception.
+			explicit symbol_not_found(std::string_view msg)
+				: std::runtime_error(std::string(msg.begin(), msg.end()))
+			{
+			}
+		};
+
+	} // namespace cuda
+
+} // namespace NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
diff --git a/compressed_image/include/compressed/cuda/gpu.h b/compressed_image/include/compressed/cuda/gpu.h
new file mode 100644
index 0000000..7996e4c
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/gpu.h
@@ -0,0 +1,96 @@
+#pragma once
+
+#include "compressed/macros.h"
+
+#include "compressed/cuda/cuda_hook.h"
+#include "compressed/cuda/nvcomp_hook.h"
+
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        /// \brief Check if the CUDA runtime and nvcomp are available and at least one device exists.
+        ///
+        /// The checks run in this order and stop at the first failure: cuda_api::available(),
+        /// nvcomp_api::available(), cuda_api::has_device().
+        ///
+        /// \return true only if all three checks pass.
+        inline bool is_available()
+        {
+            auto& runtime = cuda_api::instance();
+            if (!runtime.available())
+            {
+                return false;
+            }
+            if (!nvcomp_api::instance().available())
+            {
+                return false;
+            }
+            return runtime.has_device();
+        }
+
+        /// \brief Get the number of available CUDA devices.
+        /// \return Number of devices reported by cuda_api::device_count().
+        inline int device_count()
+        {
+            auto& runtime = cuda_api::instance();
+            return runtime.device_count();
+        }
+
+        /// \brief Get the index of the currently active CUDA device.
+        /// \return Device index reported by cuda_api::current_device().
+        inline int current_device()
+        {
+            auto& runtime = cuda_api::instance();
+            return runtime.current_device();
+        }
+
+        /// \brief Set the active CUDA device by forwarding to cuda_api::set_device().
+        /// \param device The index of the device to make current.
+        inline void set_device(const int device)
+        {
+            auto& runtime = cuda_api::instance();
+            runtime.set_device(device);
+        }
+
+        /// \brief Retrieve full device property structures for all CUDA devices.
+        ///
+        /// The device count is queried once up front; indices 0 .. count-1 are then looked up in order.
+        ///
+        /// \return A vector of \c cudaDeviceProp, one for each device (empty when no device is reported).
+        inline std::vector<cudaDeviceProp> devices()
+        {
+            // Query once: every device_count() call goes through the CUDA runtime.
+            const int count = device_count();
+
+            std::vector<cudaDeviceProp> properties;
+            if (count > 0)
+            {
+                properties.reserve(static_cast<size_t>(count));
+            }
+
+            auto& runtime = cuda_api::instance();
+            for (int i = 0; i < count; ++i)
+            {
+                properties.push_back(runtime.device_properties(i));
+            }
+            return properties;
+        }
+
+        /// \brief Get the names of all available CUDA devices.
+        ///
+        /// The device count is queried once up front; the \c name field of each device's properties is copied.
+        ///
+        /// \return A vector of device name strings (empty when no device is reported).
+        inline std::vector<std::string> device_names()
+        {
+            // Query once: every device_count() call goes through the CUDA runtime.
+            const int count = device_count();
+
+            std::vector<std::string> names;
+            if (count > 0)
+            {
+                names.reserve(static_cast<size_t>(count));
+            }
+
+            auto& runtime = cuda_api::instance();
+            for (int i = 0; i < count; ++i)
+            {
+                names.emplace_back(runtime.device_properties(i).name);
+            }
+            return names;
+        }
+
+        /// \brief RAII guard to temporarily switch CUDA devices.
+        ///
+        /// Construction records the device returned by current_device() and then calls set_device() with the
+        /// requested index. Destruction calls set_device() with the recorded index and ignores any exception.
+        struct device_guard
+        {
+            /// \brief Remember the current device, then switch to \p new_device.
+            /// \param new_device The device index to switch to.
+            explicit device_guard(int new_device)
+                : prev_device_{current_device()}
+            {
+                set_device(new_device);
+            }
+
+            /// \brief Switch back to the device recorded at construction.
+            ~device_guard()
+            {
+                try
+                {
+                    set_device(prev_device_);
+                }
+                catch (...)
+                {
+                    // Deliberately ignored so that nothing escapes the destructor.
+                }
+            }
+
+            device_guard(const device_guard&) = delete;
+            device_guard& operator=(const device_guard&) = delete;
+
+        private:
+            int prev_device_{-1}; ///< Previously active device index.
+        };
+    } // namespace cuda
+} // namespace NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/memory.h b/compressed_image/include/compressed/cuda/memory.h
new file mode 100644
index 0000000..c03c90f
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/memory.h
@@ -0,0 +1,324 @@
+/*
+Wrappers around cuda memory allocation/deallocation that use std::unique_ptr to free the memory automatically
+instead of having to do this by hand
+
+This header file includes the following structs:
+
+scoped_host_pinner
+    A RAII struct that pins (page-locks) an existing host buffer for quicker gpu <-> cpu memory transfers. This should
+    only be used for staging buffers (like a chunk buffer). Pinning big buffers often degrades performance, so use it
+    with care.
+
+cuda_device_buffer
+    A RAII-managed GPU buffer allocated using CUDA's non-async memory functions.
+
+cuda_device_buffer_async
+    A RAII-managed GPU buffer allocated using CUDA's async memory functions.
+*/
+#pragma once
+
+#include <memory>
+
+#include <cuda_runtime.h>
+
+#include "compressed/macros.h"
+#include "compressed/util.h"
+#include "compressed/cuda/cuda_hook.h"
+
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace cuda
+    {
+        namespace detail
+        {
+            struct device_deleter
+            {
+                /// \brief Release `ptr` through `cuda_api::free`; a null `ptr` is skipped.
+                /// Exceptions raised by the free call are swallowed, keeping the call operator `noexcept`.
+                void operator()(void* ptr) const noexcept
+                {
+                    if (ptr == nullptr)
+                    {
+                        return;
+                    }
+                    try { cuda_api::instance().free(ptr); }
+                    catch (...)
+                    {
+                        // suppress exceptions in destructors
+                    }
+                }
+            };
+
+            struct device_deleter_async
+            {
+                /// Stream the pointer is released on. It has to be the same stream that was used for construction;
+                /// use the factory functions to ensure this holds.
+                cudaStream_t stream = cudaStreamPerThread;
+
+                /// \brief Release `ptr` through `cuda_api::free_async` on `stream`; a null `ptr` is skipped.
+                void operator()(void* ptr) const noexcept
+                {
+                    if (ptr == nullptr)
+                    {
+                        return;
+                    }
+                    try { cuda_api::instance().free_async(ptr, stream); }
+                    catch (...)
+                    {
+                        // suppress exceptions in destructors
+                    }
+                }
+            };
+
+            struct host_deleter
+            {
+                /// \brief Release `ptr` through `cuda_api::free_host`; a null `ptr` is skipped.
+                /// Exceptions raised by the free call are swallowed, keeping the call operator `noexcept`.
+                void operator()(void* ptr) const noexcept
+                {
+                    if (ptr == nullptr)
+                    {
+                        return;
+                    }
+                    try { cuda_api::instance().free_host(ptr); }
+                    catch (...)
+                    {
+                        // suppress exceptions in destructors
+                    }
+                }
+            };
+        } // namespace detail
+
+
+        /// \brief Scope-bound registration (pinning) of host memory for gpu-operations.
+        ///
+        /// The memory is registered on construction and unregistered again on destruction. A `scoped_host_pinner` is
+        /// merely a thin, non-owning view over that memory, so it must not outlive the memory it was created from.
+        struct scoped_host_pinner
+        {
+            void* ptr = nullptr;
+            size_t bytes = 0;
+
+            /// \brief Register `b` bytes starting at `p`. Nothing is registered for a null `p` or a zero `b`.
+            scoped_host_pinner(void* p, const size_t b, const unsigned int flags = cudaHostRegisterDefault)
+                : ptr(p), bytes(b)
+            {
+                if (has_range())
+                {
+                    cuda_api::instance().host_register(ptr, bytes, flags);
+                }
+            }
+
+            ~scoped_host_pinner() noexcept { unregister_quietly(); }
+
+            // Copying is disabled (move-only) so that the same memory is never deregistered more than once.
+            scoped_host_pinner(const scoped_host_pinner&) = delete;
+            scoped_host_pinner& operator=(const scoped_host_pinner&) = delete;
+
+            /// \brief Take over the range held by `other`, leaving `other` empty.
+            scoped_host_pinner(scoped_host_pinner&& other) noexcept
+                : ptr(std::exchange(other.ptr, nullptr)), bytes(std::exchange(other.bytes, 0))
+            {
+            }
+
+            /// \brief Unregister the currently held range (if any), then take over the range held by `other`.
+            scoped_host_pinner& operator=(scoped_host_pinner&& other) noexcept
+            {
+                if (this == &other)
+                {
+                    return *this;
+                }
+                unregister_quietly();
+                ptr = std::exchange(other.ptr, nullptr);
+                bytes = std::exchange(other.bytes, 0);
+                return *this;
+            }
+
+        private:
+            bool has_range() const noexcept { return ptr != nullptr && bytes > 0; } // non-null and non-empty
+
+            /// \brief Unregister the held range if there is one; exceptions are swallowed.
+            void unregister_quietly() const noexcept
+            {
+                if (has_range())
+                {
+                    try { cuda_api::instance().host_unregister(ptr); }
+                    catch (...)
+                    {
+                        // Suppress exceptions: this runs from the destructor and the noexcept move assignment
+                    }
+                }
+            }
+        };
+
+        // -------------------------------------------------------------------------
+        // Smart pointer aliases (void*, untyped)
+        // -------------------------------------------------------------------------
+        using cuda_device_mem = std::unique_ptr<void, detail::device_deleter>;
+        using cuda_device_mem_async = std::unique_ptr<void, detail::device_deleter_async>;
+        using cuda_host_mem = std::unique_ptr<void, detail::host_deleter>;
+
+        // -------------------------------------------------------------------------
+        // Allocation helpers (typed)
+        // -------------------------------------------------------------------------
+        template <typename T>
+        using cuda_device_ptr = std::unique_ptr<T, detail::device_deleter>;
+
+        /// \brief Owning wrapper around a gpu memory buffer from the synchronous (non-async) allocation APIs.
+        template <typename T>
+        struct cuda_device_buffer
+        {
+            /// \brief owning pointer to the device allocation.
+            cuda_device_ptr<T> data{};
+            /// \brief element count of the device buffer (in units of T, not bytes)
+            size_t size = 0;
+
+            T* get() noexcept { return data.get(); }
+            const T* get() const noexcept { return data.get(); }
+            void* get_raw() noexcept { return get(); }
+            [[nodiscard]] const void* get_raw() const noexcept { return get(); }
+
+            [[nodiscard]] size_t bytes() const noexcept { return size * sizeof(T); }
+        };
+
+        template <typename T>
+        using cuda_device_ptr_async = std::unique_ptr<T, detail::device_deleter_async>;
+
+        /// \brief RAII wrapper around a gpu memory buffer allocated using asynchronous (stream-ordered) APIs.
+        template <typename T>
+        struct cuda_device_buffer_async
+        {
+            /// \brief the underlying raw device ptr.
+            cuda_device_ptr_async<T> data = nullptr;
+            /// \brief the number of elements in the device buffer (expressed as a multiple of T)
+            size_t size{};
+
+            /// \brief Generate a device buffer (using asynchronous memory ops) from a host buffer copying the data.
+            ///
+            /// The allocation is made on `cudaStreamPerThread` and the copy is issued through `memcpy_async`.
+            /// NOTE(review): presumably `buffer` must stay valid until that copy has completed -- TODO confirm.
+            /// \param buffer The buffer to use as a size reference and to generate the device pointer from
+            /// \return A device buffer holding `buffer.size()` elements.
+            static cuda_device_buffer_async from_host(std::span<const T> buffer)
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                void* raw = nullptr;
+                cuda_api::instance().malloc_async(raw, buffer.size() * sizeof(T), cudaStreamPerThread);
+
+                // Hand ownership to the smart pointer before copying so it is released again should the copy throw.
+                auto gpu_buffer = cuda_device_buffer_async<T>{
+                    cuda_device_ptr_async<T>(static_cast<T*>(raw), detail::device_deleter_async{cudaStreamPerThread}),
+                    buffer.size()
+                };
+
+                // NOTE(review): no stream is passed here; confirm `memcpy_async` defaults to `cudaStreamPerThread`.
+                cuda_api::instance().memcpy_async(
+                    static_cast<void*>(gpu_buffer.data.get()),
+                    buffer.data(),
+                    gpu_buffer.bytes(),
+                    cudaMemcpyHostToDevice
+                );
+
+                return gpu_buffer;
+            }
+
+            /// \brief Convenience overload of `from_host` for a `std::vector`; the vector is only read from.
+            static cuda_device_buffer_async from_host(const std::vector<T>& buffer)
+            {
+                return cuda_device_buffer_async::from_host(std::span<const T>(buffer));
+            }
+
+            /// \brief memcpy the gpu buffer into `buffer`.
+            ///
+            /// The copy is issued through `memcpy_async`. NOTE(review): whether `buffer` is fully written when this
+            /// returns depends on that wrapper -- TODO confirm before reading it without a stream synchronization.
+            /// \throws std::invalid_argument if the size of `buffer` does not match the size of the gpu buffer.
+            void to_host(std::span<T> buffer)
+            {
+                _COMPRESSED_PROFILE_FUNCTION();
+                if (buffer.size() != this->size)
+                {
+                    throw std::invalid_argument(
+                        std::format(
+                            "Cuda: Invalid buffer passed to `to_host` function. Expected exactly {} elements but instead"
+                            " got {}.",
+                            this->size,
+                            buffer.size()
+                        )
+                    );
+                }
+
+                cuda_api::instance().memcpy_async(
+                    static_cast<void*>(buffer.data()),
+                    this->get_raw(),
+                    this->bytes(),
+                    cudaMemcpyDeviceToHost
+                );
+            }
+
+            /// \brief allocate and memcpy the compressed data back to the host.
+            NAMESPACE_COMPRESSED_IMAGE::util::default_init_vector<T> to_host()
+            {
+                util::default_init_vector<T> buffer(this->size);
+                this->to_host(std::span<T>(buffer.begin(), buffer.end()));
+                return buffer;
+            }
+
+            T* get() noexcept { return this->data.get(); }
+            const T* get() const noexcept { return this->data.get(); }
+            void* get_raw() noexcept { return static_cast<void*>(this->get()); }
+            [[nodiscard]] const void* get_raw() const noexcept { return static_cast<const void*>(this->get()); }
+
+            [[nodiscard]] size_t bytes() const noexcept { return this->size * sizeof(T); }
+        };
+
+        template <typename T>
+        using cuda_host_ptr = std::unique_ptr<T, detail::host_deleter>;
+
+        // -------------------------------------------------------------------------
+        // Factory functions, use these whenever possible!
+        // -------------------------------------------------------------------------
+        template <typename T = void>
+        inline cuda_device_ptr<T> make_device_mem(size_t count)
+        {
+            void* allocation = nullptr; // NOTE(review): sizeof(T) below is ill-formed for the default T = void
+            cuda_api::instance().malloc(allocation, count * sizeof(T));
+            return cuda_device_ptr<T>{static_cast<T*>(allocation)};
+        }
+
+        /// \brief Allocate `count` elements via `make_device_mem` and wrap them together with their element count.
+        template <typename T = void>
+        inline cuda_device_buffer<T> make_device_buffer(size_t count)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            return cuda_device_buffer<T>{make_device_mem<T>(count), count};
+        }
+
+        template <typename T = void>
+        inline cuda_device_ptr_async<T> make_device_mem_async(size_t count, cudaStream_t stream = cudaStreamPerThread)
+        {
+            void* allocation = nullptr; // NOTE(review): sizeof(T) below is ill-formed for the default T = void
+            cuda_api::instance().malloc_async(allocation, count * sizeof(T), stream);
+            return cuda_device_ptr_async<T>{static_cast<T*>(allocation), detail::device_deleter_async{stream}};
+        }
+
+        /// \brief Allocate `count` elements on `stream` via `make_device_mem_async` and wrap them with their count.
+        template <typename T = void>
+        inline cuda_device_buffer_async<T> make_device_buffer_async(size_t count,
+                                                                    cudaStream_t stream = cudaStreamPerThread)
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            return cuda_device_buffer_async<T>{make_device_mem_async<T>(count, stream), count};
+        }
+
+        template <typename T = void>
+        inline cuda_host_ptr<T> make_host_mem(size_t count)
+        {
+            void* allocation = nullptr; // NOTE(review): sizeof(T) below is ill-formed for the default T = void
+            cuda_api::instance().malloc_host(allocation, count * sizeof(T));
+            return cuda_host_ptr<T>{static_cast<T*>(allocation)};
+        }
+    }
+} // namespace NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/cuda/nvcomp_hook.h b/compressed_image/include/compressed/cuda/nvcomp_hook.h
new file mode 100644
index 0000000..7dbdfa0
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/nvcomp_hook.h
@@ -0,0 +1,576 @@
+#pragma once
+
+#include <string>
+
+#include <nvcomp.h>
+#include <nvcomp/lz4.h>
+#include <nvcomp/cascaded.h>
+#include <nvcomp/deflate.h>
+#include <nvcomp/gdeflate.h>
+#include <nvcomp/snappy.h>
+#include <nvcomp/zstd.h>
+
+#include "compressed/macros.h"
+#include "compressed/cuda/proc_util.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE::cuda
+{
+    /// \brief Singleton class for dynamically loading nvcomp functions and constants at runtime.
+    class nvcomp_api
+    {
+    public:
+        static nvcomp_api& instance()
+        {
+            static nvcomp_api inst;
+            return inst;
+        }
+
+        bool available() const noexcept { return handle_ != nullptr; }
+
+        // --- LZ4 API ---
+        nvcompStatus_t LZ4CompressGetTempSizeAsync(size_t n,
+                                                   size_t s,
+                                                   nvcompBatchedLZ4CompressOpts_t o,
+                                                   size_t* t,
+                                                   size_t m) const { return lz4_comp_temp_fn_(n, s, o, t, m); }
+
+        nvcompStatus_t LZ4DecompressGetTempSizeAsync(size_t n,
+                                                     size_t s,
+                                                     nvcompBatchedLZ4DecompressOpts_t o,
+                                                     size_t* t,
+                                                     size_t m) const { return lz4_decomp_temp_fn_(n, s, o, t, m); }
+
+        nvcompStatus_t LZ4CompressGetMaxOutputChunkSize(size_t s, nvcompBatchedLZ4CompressOpts_t o, size_t* m) const
+        {
+            return lz4_max_out_fn_(s, o, m);
+        }
+
+        nvcompStatus_t LZ4CompressAsync(const void* const* u,
+                                        const size_t* us,
+                                        size_t ms,
+                                        size_t b,
+                                        void* tp,
+                                        size_t tb,
+                                        void* const* cp,
+                                        size_t* cs,
+                                        nvcompBatchedLZ4CompressOpts_t o,
+                                        nvcompStatus_t* st,
+                                        cudaStream_t stream) const
+        {
+            return lz4_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream);
+        }
+
+        nvcompStatus_t LZ4DecompressAsync(const void* const* cp,
+                                          const size_t* cs,
+                                          const size_t* us,
+                                          size_t* ac,
+                                          size_t b,
+                                          void* tp,
+                                          size_t tb,
+                                          void* const* up,
+                                          nvcompBatchedLZ4DecompressOpts_t o,
+                                          nvcompStatus_t* st,
+                                          cudaStream_t stream) const
+        {
+            return lz4_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream);
+        }
+
+        // --- Cascaded API ---
+        nvcompStatus_t CascadedCompressGetTempSizeAsync(size_t n,
+                                                        size_t s,
+                                                        nvcompBatchedCascadedCompressOpts_t o,
+                                                        size_t* t,
+                                                        size_t m) const
+        {
+            return cascaded_comp_temp_fn_(n, s, o, t, m);
+        }
+
+        nvcompStatus_t CascadedDecompressGetTempSizeAsync(size_t n,
+                                                          size_t s,
+                                                          nvcompBatchedCascadedDecompressOpts_t o,
+                                                          size_t* t,
+                                                          size_t m) const
+        {
+            return cascaded_decomp_temp_fn_(n, s, o, t, m);
+        }
+
+        nvcompStatus_t CascadedCompressGetMaxOutputChunkSize(size_t s,
+                                                             nvcompBatchedCascadedCompressOpts_t o,
+                                                             size_t* m) const { return cascaded_max_out_fn_(s, o, m); }
+
+        nvcompStatus_t CascadedCompressAsync(const void* const* u,
+                                             const size_t* us,
+                                             size_t ms,
+                                             size_t b,
+                                             void* tp,
+                                             size_t tb,
+                                             void* const* cp,
+                                             size_t* cs,
+                                             nvcompBatchedCascadedCompressOpts_t o,
+                                             nvcompStatus_t* st,
+                                             cudaStream_t stream) const
+        {
+            return cascaded_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream);
+        }
+
+        nvcompStatus_t CascadedDecompressAsync(const void* const* cp,
+                                               const size_t* cs,
+                                               const size_t* us,
+                                               size_t* ac,
+                                               size_t b,
+                                               void* tp,
+                                               size_t tb,
+                                               void* const* up,
+                                               nvcompBatchedCascadedDecompressOpts_t o,
+                                               nvcompStatus_t* st,
+                                               cudaStream_t stream) const
+        {
+            return cascaded_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream);
+        }
+
+        // --- Deflate API ---
+        nvcompStatus_t DeflateCompressGetTempSizeAsync(size_t n,
+                                                       size_t s,
+                                                       nvcompBatchedDeflateCompressOpts_t o,
+                                                       size_t* t,
+                                                       size_t m) const { return deflate_comp_temp_fn_(n, s, o, t, m); }
+
+        nvcompStatus_t DeflateDecompressGetTempSizeAsync(size_t n,
+                                                         size_t s,
+                                                         nvcompBatchedDeflateDecompressOpts_t o,
+                                                         size_t* t,
+                                                         size_t m) const
+        {
+            return deflate_decomp_temp_fn_(n, s, o, t, m);
+        }
+
+        nvcompStatus_t DeflateCompressGetMaxOutputChunkSize(size_t s,
+                                                            nvcompBatchedDeflateCompressOpts_t o,
+                                                            size_t* m) const { return deflate_max_out_fn_(s, o, m); }
+
+        nvcompStatus_t DeflateCompressAsync(const void* const* u,
+                                            const size_t* us,
+                                            size_t ms,
+                                            size_t b,
+                                            void* tp,
+                                            size_t tb,
+                                            void* const* cp,
+                                            size_t* cs,
+                                            nvcompBatchedDeflateCompressOpts_t o,
+                                            nvcompStatus_t* st,
+                                            cudaStream_t stream) const
+        {
+            return deflate_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream);
+        }
+
+        nvcompStatus_t DeflateDecompressAsync(const void* const* cp,
+                                              const size_t* cs,
+                                              const size_t* us,
+                                              size_t* ac,
+                                              size_t b,
+                                              void* tp,
+                                              size_t tb,
+                                              void* const* up,
+                                              nvcompBatchedDeflateDecompressOpts_t o,
+                                              nvcompStatus_t* st,
+                                              cudaStream_t stream) const
+        {
+            return deflate_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream);
+        }
+
+        // --- Gdeflate API ---
+        nvcompStatus_t GdeflateCompressGetTempSizeAsync(size_t n,
+                                                        size_t s,
+                                                        nvcompBatchedGdeflateCompressOpts_t o,
+                                                        size_t* t,
+                                                        size_t m) const
+        {
+            return gdeflate_comp_temp_fn_(n, s, o, t, m);
+        }
+
+        nvcompStatus_t GdeflateDecompressGetTempSizeAsync(size_t n,
+                                                          size_t s,
+                                                          nvcompBatchedGdeflateDecompressOpts_t o,
+                                                          size_t* t,
+                                                          size_t m) const
+        {
+            return gdeflate_decomp_temp_fn_(n, s, o, t, m);
+        }
+
+        nvcompStatus_t GdeflateCompressGetMaxOutputChunkSize(size_t s,
+                                                             nvcompBatchedGdeflateCompressOpts_t o,
+                                                             size_t* m) const { return gdeflate_max_out_fn_(s, o, m); }
+
+        nvcompStatus_t GdeflateCompressAsync(const void* const* u,
+                                             const size_t* us,
+                                             size_t ms,
+                                             size_t b,
+                                             void* tp,
+                                             size_t tb,
+                                             void* const* cp,
+                                             size_t* cs,
+                                             nvcompBatchedGdeflateCompressOpts_t o,
+                                             nvcompStatus_t* st,
+                                             cudaStream_t stream) const
+        {
+            return gdeflate_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream);
+        }
+
+        nvcompStatus_t GdeflateDecompressAsync(const void* const* cp,
+                                               const size_t* cs,
+                                               const size_t* us,
+                                               size_t* ac,
+                                               size_t b,
+                                               void* tp,
+                                               size_t tb,
+                                               void* const* up,
+                                               nvcompBatchedGdeflateDecompressOpts_t o,
+                                               nvcompStatus_t* st,
+                                               cudaStream_t stream) const
+        {
+            return gdeflate_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream);
+        }
+
+        // --- Snappy API ---
+        nvcompStatus_t SnappyCompressGetTempSizeAsync(size_t n,
+                                                      size_t s,
+                                                      nvcompBatchedSnappyCompressOpts_t o,
+                                                      size_t* t,
+                                                      size_t m) const { return snappy_comp_temp_fn_(n, s, o, t, m); }
+
+        nvcompStatus_t SnappyDecompressGetTempSizeAsync(size_t n,
+                                                        size_t s,
+                                                        nvcompBatchedSnappyDecompressOpts_t o,
+                                                        size_t* t,
+                                                        size_t m) const
+        {
+            return snappy_decomp_temp_fn_(n, s, o, t, m);
+        }
+
+        nvcompStatus_t SnappyCompressGetMaxOutputChunkSize(size_t s,
+                                                           nvcompBatchedSnappyCompressOpts_t o,
+                                                           size_t* m) const { return snappy_max_out_fn_(s, o, m); }
+
+        nvcompStatus_t SnappyCompressAsync(const void* const* u,
+                                           const size_t* us,
+                                           size_t ms,
+                                           size_t b,
+                                           void* tp,
+                                           size_t tb,
+                                           void* const* cp,
+                                           size_t* cs,
+                                           nvcompBatchedSnappyCompressOpts_t o,
+                                           nvcompStatus_t* st,
+                                           cudaStream_t stream) const
+        {
+            return snappy_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream);
+        }
+
+        nvcompStatus_t SnappyDecompressAsync(const void* const* cp,
+                                             const size_t* cs,
+                                             const size_t* us,
+                                             size_t* ac,
+                                             size_t b,
+                                             void* tp,
+                                             size_t tb,
+                                             void* const* up,
+                                             nvcompBatchedSnappyDecompressOpts_t o,
+                                             nvcompStatus_t* st,
+                                             cudaStream_t stream) const
+        {
+            return snappy_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream);
+        }
+
+        // --- Zstd API ---
+        nvcompStatus_t ZstdCompressGetTempSizeAsync(size_t n,
+                                                    size_t s,
+                                                    nvcompBatchedZstdCompressOpts_t o,
+                                                    size_t* t,
+                                                    size_t m) const { return zstd_comp_temp_fn_(n, s, o, t, m); }
+
+        nvcompStatus_t ZstdDecompressGetTempSizeAsync(size_t n,
+                                                      size_t s,
+                                                      nvcompBatchedZstdDecompressOpts_t o,
+                                                      size_t* t,
+                                                      size_t m) const { return zstd_decomp_temp_fn_(n, s, o, t, m); }
+
+        nvcompStatus_t ZstdCompressGetMaxOutputChunkSize(size_t s, nvcompBatchedZstdCompressOpts_t o, size_t* m) const
+        {
+            return zstd_max_out_fn_(s, o, m);
+        }
+
+        nvcompStatus_t ZstdCompressAsync(const void* const* u,
+                                         const size_t* us,
+                                         size_t ms,
+                                         size_t b,
+                                         void* tp,
+                                         size_t tb,
+                                         void* const* cp,
+                                         size_t* cs,
+                                         nvcompBatchedZstdCompressOpts_t o,
+                                         nvcompStatus_t* st,
+                                         cudaStream_t stream) const
+        {
+            return zstd_compress_fn_(u, us, ms, b, tp, tb, cp, cs, o, st, stream);
+        }
+
+        nvcompStatus_t ZstdDecompressAsync(const void* const* cp,
+                                           const size_t* cs,
+                                           const size_t* us,
+                                           size_t* ac,
+                                           size_t b,
+                                           void* tp,
+                                           size_t tb,
+                                           void* const* up,
+                                           nvcompBatchedZstdDecompressOpts_t o,
+                                           nvcompStatus_t* st,
+                                           cudaStream_t stream) const
+        {
+            return zstd_decompress_fn_(cp, cs, us, ac, b, tp, tb, up, o, st, stream);
+        }
+
+        nvcomp_api(const nvcomp_api&) = delete;
+        nvcomp_api& operator=(const nvcomp_api&) = delete;
+
+    private:
+        /// \brief Load the nvcomp library and resolve the wrapped entry points; on failure `available()` stays false.
+        nvcomp_api()
+        {
+#if defined(_WIN32)
+            const std::string lib_name = "nvcomp64_5.dll";
+#elif defined(__linux__)
+            const std::string lib_name = "libnvcomp.so";
+#else
+            const std::string lib_name; // unsupported platform: stay unavailable, but keep the code below compiling
+#endif
+            if (lib_name.empty()) return;
+            handle_ = proc::load_library(lib_name);
+            if (!handle_) return;
+
+#define LOAD_FN(fn, member) member = proc::get_symbol<decltype(member)>(handle_, #fn, lib_name)
+
+            // --- LZ4 ---
+            LOAD_FN(nvcompBatchedLZ4CompressGetTempSizeAsync, lz4_comp_temp_fn_);
+            LOAD_FN(nvcompBatchedLZ4DecompressGetTempSizeAsync, lz4_decomp_temp_fn_);
+            LOAD_FN(nvcompBatchedLZ4CompressGetMaxOutputChunkSize, lz4_max_out_fn_);
+            LOAD_FN(nvcompBatchedLZ4CompressAsync, lz4_compress_fn_);
+            LOAD_FN(nvcompBatchedLZ4DecompressAsync, lz4_decompress_fn_);
+
+            // --- Cascaded ---
+            LOAD_FN(nvcompBatchedCascadedCompressGetTempSizeAsync, cascaded_comp_temp_fn_);
+            LOAD_FN(nvcompBatchedCascadedDecompressGetTempSizeAsync, cascaded_decomp_temp_fn_);
+            LOAD_FN(nvcompBatchedCascadedCompressGetMaxOutputChunkSize, cascaded_max_out_fn_);
+            LOAD_FN(nvcompBatchedCascadedCompressAsync, cascaded_compress_fn_);
+            LOAD_FN(nvcompBatchedCascadedDecompressAsync, cascaded_decompress_fn_);
+
+            // --- Deflate ---
+            LOAD_FN(nvcompBatchedDeflateCompressGetTempSizeAsync, deflate_comp_temp_fn_);
+            LOAD_FN(nvcompBatchedDeflateDecompressGetTempSizeAsync, deflate_decomp_temp_fn_);
+            LOAD_FN(nvcompBatchedDeflateCompressGetMaxOutputChunkSize, deflate_max_out_fn_);
+            LOAD_FN(nvcompBatchedDeflateCompressAsync, deflate_compress_fn_);
+            LOAD_FN(nvcompBatchedDeflateDecompressAsync, deflate_decompress_fn_);
+
+            // --- Gdeflate ---
+            LOAD_FN(nvcompBatchedGdeflateCompressGetTempSizeAsync, gdeflate_comp_temp_fn_);
+            LOAD_FN(nvcompBatchedGdeflateDecompressGetTempSizeAsync, gdeflate_decomp_temp_fn_);
+            LOAD_FN(nvcompBatchedGdeflateCompressGetMaxOutputChunkSize, gdeflate_max_out_fn_);
+            LOAD_FN(nvcompBatchedGdeflateCompressAsync, gdeflate_compress_fn_);
+            LOAD_FN(nvcompBatchedGdeflateDecompressAsync, gdeflate_decompress_fn_);
+
+            // --- Snappy ---
+            LOAD_FN(nvcompBatchedSnappyCompressGetTempSizeAsync, snappy_comp_temp_fn_);
+            LOAD_FN(nvcompBatchedSnappyDecompressGetTempSizeAsync, snappy_decomp_temp_fn_);
+            LOAD_FN(nvcompBatchedSnappyCompressGetMaxOutputChunkSize, snappy_max_out_fn_);
+            LOAD_FN(nvcompBatchedSnappyCompressAsync, snappy_compress_fn_);
+            LOAD_FN(nvcompBatchedSnappyDecompressAsync, snappy_decompress_fn_);
+
+            // --- Zstd ---
+            LOAD_FN(nvcompBatchedZstdCompressGetTempSizeAsync, zstd_comp_temp_fn_);
+            LOAD_FN(nvcompBatchedZstdDecompressGetTempSizeAsync, zstd_decomp_temp_fn_);
+            LOAD_FN(nvcompBatchedZstdCompressGetMaxOutputChunkSize, zstd_max_out_fn_);
+            LOAD_FN(nvcompBatchedZstdCompressAsync, zstd_compress_fn_);
+            LOAD_FN(nvcompBatchedZstdDecompressAsync, zstd_decompress_fn_);
+
+#undef LOAD_FN
+        }
+
+        proc::library_handle handle_ = nullptr;
+
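+        // --- Function pointer members ---
+        // One pointer per nvcomp entry point, declared with the exact signature of the function it is bound to.
+        // Every pointer defaults to nullptr until the constructor resolves it from the loaded library.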
+        nvcompStatus_t (*lz4_comp_temp_fn_)(size_t, size_t, nvcompBatchedLZ4CompressOpts_t, size_t*, size_t) = nullptr;
+        nvcompStatus_t (*lz4_decomp_temp_fn_)(size_t, size_t, nvcompBatchedLZ4DecompressOpts_t, size_t*, size_t) =
+            nullptr;
+        nvcompStatus_t (*lz4_max_out_fn_)(size_t, nvcompBatchedLZ4CompressOpts_t, size_t*) = nullptr;
+        nvcompStatus_t (*lz4_compress_fn_)(const void* const*,
+                                           const size_t*,
+                                           size_t,
+                                           size_t,
+                                           void*,
+                                           size_t,
+                                           void* const*,
+                                           size_t*,
+                                           nvcompBatchedLZ4CompressOpts_t,
+                                           nvcompStatus_t*,
+                                           cudaStream_t) = nullptr;
+        nvcompStatus_t (*lz4_decompress_fn_)(const void* const*,
+                                             const size_t*,
+                                             const size_t*,
+                                             size_t*,
+                                             size_t,
+                                             void*,
+                                             size_t,
+                                             void* const*,
+                                             nvcompBatchedLZ4DecompressOpts_t,
+                                             nvcompStatus_t*,
+                                             cudaStream_t) = nullptr;
+
+        nvcompStatus_t (*cascaded_comp_temp_fn_)(size_t, size_t, nvcompBatchedCascadedCompressOpts_t, size_t*, size_t) =
+            nullptr;
+        nvcompStatus_t (*cascaded_decomp_temp_fn_)(size_t,
+                                                   size_t,
+                                                   nvcompBatchedCascadedDecompressOpts_t,
+                                                   size_t*,
+                                                   size_t) = nullptr;
+        nvcompStatus_t (*cascaded_max_out_fn_)(size_t, nvcompBatchedCascadedCompressOpts_t, size_t*) = nullptr;
+        nvcompStatus_t (*cascaded_compress_fn_)(const void* const*,
+                                                const size_t*,
+                                                size_t,
+                                                size_t,
+                                                void*,
+                                                size_t,
+                                                void* const*,
+                                                size_t*,
+                                                nvcompBatchedCascadedCompressOpts_t,
+                                                nvcompStatus_t*,
+                                                cudaStream_t) = nullptr;
+        nvcompStatus_t (*cascaded_decompress_fn_)(const void* const*,
+                                                  const size_t*,
+                                                  const size_t*,
+                                                  size_t*,
+                                                  size_t,
+                                                  void*,
+                                                  size_t,
+                                                  void* const*,
+                                                  nvcompBatchedCascadedDecompressOpts_t,
+                                                  nvcompStatus_t*,
+                                                  cudaStream_t) = nullptr;
+
+        nvcompStatus_t (*deflate_comp_temp_fn_)(size_t, size_t, nvcompBatchedDeflateCompressOpts_t, size_t*, size_t) =
+            nullptr;
+        nvcompStatus_t (*deflate_decomp_temp_fn_)(size_t, size_t, nvcompBatchedDeflateDecompressOpts_t, size_t*, size_t)
+            = nullptr;
+        nvcompStatus_t (*deflate_max_out_fn_)(size_t, nvcompBatchedDeflateCompressOpts_t, size_t*) = nullptr;
+        nvcompStatus_t (*deflate_compress_fn_)(const void* const*,
+                                               const size_t*,
+                                               size_t,
+                                               size_t,
+                                               void*,
+                                               size_t,
+                                               void* const*,
+                                               size_t*,
+                                               nvcompBatchedDeflateCompressOpts_t,
+                                               nvcompStatus_t*,
+                                               cudaStream_t) = nullptr;
+        nvcompStatus_t (*deflate_decompress_fn_)(const void* const*,
+                                                 const size_t*,
+                                                 const size_t*,
+                                                 size_t*,
+                                                 size_t,
+                                                 void*,
+                                                 size_t,
+                                                 void* const*,
+                                                 nvcompBatchedDeflateDecompressOpts_t,
+                                                 nvcompStatus_t*,
+                                                 cudaStream_t) = nullptr;
+
+        nvcompStatus_t (*gdeflate_comp_temp_fn_)(size_t, size_t, nvcompBatchedGdeflateCompressOpts_t, size_t*, size_t) =
+            nullptr;
+        nvcompStatus_t (*gdeflate_decomp_temp_fn_)(size_t,
+                                                   size_t,
+                                                   nvcompBatchedGdeflateDecompressOpts_t,
+                                                   size_t*,
+                                                   size_t) = nullptr;
+        nvcompStatus_t (*gdeflate_max_out_fn_)(size_t, nvcompBatchedGdeflateCompressOpts_t, size_t*) = nullptr;
+        nvcompStatus_t (*gdeflate_compress_fn_)(const void* const*,
+                                                const size_t*,
+                                                size_t,
+                                                size_t,
+                                                void*,
+                                                size_t,
+                                                void* const*,
+                                                size_t*,
+                                                nvcompBatchedGdeflateCompressOpts_t,
+                                                nvcompStatus_t*,
+                                                cudaStream_t) = nullptr;
+        nvcompStatus_t (*gdeflate_decompress_fn_)(const void* const*,
+                                                  const size_t*,
+                                                  const size_t*,
+                                                  size_t*,
+                                                  size_t,
+                                                  void*,
+                                                  size_t,
+                                                  void* const*,
+                                                  nvcompBatchedGdeflateDecompressOpts_t,
+                                                  nvcompStatus_t*,
+                                                  cudaStream_t) = nullptr;
+
+        nvcompStatus_t (*snappy_comp_temp_fn_)(size_t, size_t, nvcompBatchedSnappyCompressOpts_t, size_t*, size_t) =
+            nullptr;
+        nvcompStatus_t (*snappy_decomp_temp_fn_)(size_t, size_t, nvcompBatchedSnappyDecompressOpts_t, size_t*, size_t) =
+            nullptr;
+        nvcompStatus_t (*snappy_max_out_fn_)(size_t, nvcompBatchedSnappyCompressOpts_t, size_t*) = nullptr;
+        nvcompStatus_t (*snappy_compress_fn_)(const void* const*,
+                                              const size_t*,
+                                              size_t,
+                                              size_t,
+                                              void*,
+                                              size_t,
+                                              void* const*,
+                                              size_t*,
+                                              nvcompBatchedSnappyCompressOpts_t,
+                                              nvcompStatus_t*,
+                                              cudaStream_t) = nullptr;
+        nvcompStatus_t (*snappy_decompress_fn_)(const void* const*,
+                                                const size_t*,
+                                                const size_t*,
+                                                size_t*,
+                                                size_t,
+                                                void*,
+                                                size_t,
+                                                void* const*,
+                                                nvcompBatchedSnappyDecompressOpts_t,
+                                                nvcompStatus_t*,
+                                                cudaStream_t) = nullptr;
+
+        nvcompStatus_t (*zstd_comp_temp_fn_)(size_t, size_t, nvcompBatchedZstdCompressOpts_t, size_t*, size_t) =
+            nullptr;
+        nvcompStatus_t (*zstd_decomp_temp_fn_)(size_t, size_t, nvcompBatchedZstdDecompressOpts_t, size_t*, size_t) =
+            nullptr;
+        nvcompStatus_t (*zstd_max_out_fn_)(size_t, nvcompBatchedZstdCompressOpts_t, size_t*) = nullptr;
+        nvcompStatus_t (*zstd_compress_fn_)(const void* const*,
+                                            const size_t*,
+                                            size_t,
+                                            size_t,
+                                            void*,
+                                            size_t,
+                                            void* const*,
+                                            size_t*,
+                                            nvcompBatchedZstdCompressOpts_t,
+                                            nvcompStatus_t*,
+                                            cudaStream_t) = nullptr;
+        nvcompStatus_t (*zstd_decompress_fn_)(const void* const*,
+                                              const size_t*,
+                                              const size_t*,
+                                              size_t*,
+                                              size_t,
+                                              void*,
+                                              size_t,
+                                              void* const*,
+                                              nvcompBatchedZstdDecompressOpts_t,
+                                              nvcompStatus_t*,
+                                              cudaStream_t) = nullptr;
+    };
+}
diff --git a/compressed_image/include/compressed/cuda/proc_util.h b/compressed_image/include/compressed/cuda/proc_util.h
new file mode 100644
index 0000000..5b92a37
--- /dev/null
+++ b/compressed_image/include/compressed/cuda/proc_util.h
@@ -0,0 +1,127 @@
+/*
+Helpers for loading shared libraries (dll/so) and resolving their symbols, raising the appropriate error on failure.
+
+Note: This expects Windows/Linux as it is part of the cuda subfolder, which only supports Windows and Linux. It is not
+	  intended to be a generic dll/so module. It is an implementation detail of compressed-image and should not be used
+	  outside of it!
+*/
+
+#pragma once
+
+#include <format>
+#include <memory>
+
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include "compressed/macros.h"
+#include "compressed/cuda/exceptions.h"
+
+
+namespace NAMESPACE_COMPRESSED_IMAGE
+{
+
+	namespace cuda
+	{
+
+		namespace proc
+		{
+
+			// Raw, platform specific handle type as returned by the OS loader (LoadLibraryA / dlopen).
+#if defined(_WIN32)
+			// HMODULE is already a pointer type (HINSTANCE__* under STRICT, void* otherwise), so alias it directly.
+			using library_handle_impl_ = HMODULE;
+#else
+			using library_handle_impl_ = void*;
+#endif
+
+			/// Deleter used by library_handle: unloads the dll/so behind the raw handle when the owner is destroyed.
+			/// Null handles are ignored; the return value of FreeLibrary / dlclose is not inspected.
+			struct library_deleter
+			{
+				void operator()(library_handle_impl_ handle) const
+				{
+#if defined(_WIN32)
+					if (handle != nullptr) FreeLibrary(handle);
+#else
+					if (handle != nullptr) dlclose(handle);
+#endif
+				}
+			};
+
+			/// Owning handle to a loaded dll/so; library_deleter unloads the library when the handle is destroyed.
+			using library_handle = std::unique_ptr<std::remove_pointer_t<library_handle_impl_>, library_deleter>;
+
+			/// \brief Loads the shared library `name` and returns an owning handle to it.
+			/// \throws library_not_found if loading fails (always the case on platforms other than Windows/Linux).
+			inline library_handle load_library(const std::string& name)
+			{
+#if defined(_WIN32)
+				library_handle handle(LoadLibraryA(name.c_str()));
+#elif defined(__linux__)
+				library_handle handle(dlopen(name.c_str(), RTLD_GLOBAL | RTLD_LAZY));
+#else
+				library_handle handle{};
+#endif
+
+				if (!handle)
+				{
+					throw library_not_found(std::format("Failed to load library: {}", name));
+				}
+				return handle;
+			}
+
+			/// \brief Resolves `symbol_name` from the given library handle and returns it cast to `func_sig`.
+			/// \tparam func_sig Pointer type the resolved address is cast to.
+			/// \param handle The library handle to load the symbol from, must not be empty.
+			/// \param symbol_name The symbol name to load.
+			/// \param object_name The name of the library. May be left empty, only used for error messages.
+			/// \throws std::invalid_argument if `handle` is empty.
+			/// \throws symbol_not_found if the symbol cannot be resolved or the platform is unsupported.
+			template <typename func_sig>
+			func_sig get_symbol(
+				const library_handle& handle,
+				const std::string& symbol_name,
+				const std::string& object_name
+			)
+			{
+				if (!handle)
+				{
+					throw std::invalid_argument(std::format(
+						"Internal: passed empty library handle while retrieving symbol of name {} from object {}",
+						symbol_name,
+						object_name
+					));
+				}
+
+#if defined(_WIN32)
+				const func_sig func_ptr = reinterpret_cast<func_sig>(GetProcAddress(handle.get(), symbol_name.c_str()));
+#elif defined(__linux__)
+				const func_sig func_ptr = reinterpret_cast<func_sig>(dlsym(handle.get(), symbol_name.c_str()));
+#else
+				func_sig func_ptr = nullptr;
+				throw symbol_not_found(std::format(
+					"Unable to find symbol {} in library {} as we are on an unsupported platform for CUDA.",
+					symbol_name, object_name
+				));
+#endif
+
+				if (!func_ptr)
+				{
+					throw symbol_not_found(std::format(
+						"Unable to find symbol {} in library {} while dynamically loading it.",
+						symbol_name, object_name
+					));
+				}
+				return func_ptr;
+			}
+
+		} // namespace proc
+
+	} // namespace cuda
+
+
+} // namespace NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
diff --git a/compressed_image/include/compressed/detail/oiio_util.h b/compressed_image/include/compressed/detail/oiio_util.h
index b93838c..a472163 100644
--- a/compressed_image/include/compressed/detail/oiio_util.h
+++ b/compressed_image/include/compressed/detail/oiio_util.h
@@ -17,12 +17,11 @@
 #include "scoped_timer.h"
 #include "compressed/json_alias.h"
 
-namespace NAMESPACE_COMPRESSED_IMAGE
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-
     namespace detail
     {
-
         /// \brief Create a mapping of contiguous begin-end pairs from the passed channel names
         /// 
         /// Takes the input channel names and constructs a list of (sorted) pairs for the begin and end channel ranges.
@@ -38,9 +37,9 @@ namespace NAMESPACE_COMPRESSED_IMAGE
         /// \param channelnames The channelnames to construct pairings for, invalid channelnames throw a std::out_of_range
         /// 
         /// \return A mapping of begin-end pairs for the channels
-        inline std::vector<std::pair<int, int>>get_contiguous_channels(
+        inline std::vector<std::pair<int, int>> get_contiguous_channels(
             const std::unique_ptr<OIIO::ImageInput>& input_ptr,
-            std::vector<std::string> channelnames
+            const std::vector<std::string> channelnames
         )
         {
             std::unordered_map<std::string, int> map_name_to_index;
@@ -57,7 +56,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE
             }
 
             // Sort them to ensure we can map them correctly.
-            std::sort(indices.begin(), indices.end());
+            std::ranges::sort(indices);
 
             std::vector<std::pair<int, int>> result;
             if (indices.empty())
@@ -83,12 +82,10 @@ namespace NAMESPACE_COMPRESSED_IMAGE
         }
 
 
-
         // Utilities related to OIIO ParamValue (the internal metadata type) helping us convert them into json-able
         // types
         namespace param_value
         {
-
             /// \brief JSON-like types that we can store 
             enum class _JSONType
             {
@@ -97,7 +94,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE
                 _string
             };
 
-            inline _JSONType to_json_type(OIIO::ParamValue pvalue)
+            inline _JSONType to_json_type(const OIIO::ParamValue& pvalue)
             {
                 _COMPRESSED_PROFILE_FUNCTION();
                 auto type = pvalue.type();
@@ -116,18 +113,20 @@ namespace NAMESPACE_COMPRESSED_IMAGE
                     type == OIIO::TypeDesc::INT32 ||
                     type == OIIO::TypeDesc::UINT64 ||
                     type == OIIO::TypeDesc::INT64
-                    )
+                )
                 {
                     return _JSONType::_int;
                 }
-                else if (type == OIIO::TypeDesc::HALF || type == OIIO::TypeDesc::FLOAT || type == OIIO::TypeDesc::DOUBLE)
+                else if (type == OIIO::TypeDesc::HALF || type == OIIO::TypeDesc::FLOAT || type ==
+                    OIIO::TypeDesc::DOUBLE)
                 {
                     return _JSONType::_float;
                 }
 
                 throw std::invalid_argument(
                     std::format(
-                        "Unknown json type for param value: {}", pvalue.name().string()
+                        "Unknown json type for param value: {}",
+                        pvalue.name().string()
                     )
                 );
             }
@@ -240,10 +239,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE
                 return out;
             }
         }
-
     } // detail
-
-
 } // NAMESPACE_COMPRESSED_IMAGE
 
-#endif // COMPRESSED_IMAGE_OIIO_AVAILABLE
\ No newline at end of file
+#endif // COMPRESSED_IMAGE_OIIO_AVAILABLE
diff --git a/compressed_image/include/compressed/detail/scoped_timer.h b/compressed_image/include/compressed/detail/scoped_timer.h
index 2e2feba..7646149 100644
--- a/compressed_image/include/compressed/detail/scoped_timer.h
+++ b/compressed_image/include/compressed/detail/scoped_timer.h
@@ -22,19 +22,21 @@
 #include <thread>
 
 #ifdef _COMPRESSED_PROFILE
-#define _COMPRESSED_PROFILE_SCOPE(name) NAMESPACE_COMPRESSED_IMAGE::detail::InstrumentationTimer timer##__LINE__(name)
-#define _COMPRESSED_PROFILE_FUNCTION()  NAMESPACE_COMPRESSED_IMAGE::detail::InstrumentationTimer timer##__FUNCTION__##__LINE__(__FUNCTION__)
+#define CONCAT_2_IMPL(x, y) x##y
+#define CONCAT_2(x, y) CONCAT_2_IMPL(x, y)
+// CONCAT_2 goes through CONCAT_2_IMPL so that arguments such as __LINE__ are expanded before ## pastes them.
+#define _COMPRESSED_PROFILE_SCOPE(name) NAMESPACE_COMPRESSED_IMAGE::detail::InstrumentationTimer CONCAT_2(timer, __LINE__)(name)
+#define _COMPRESSED_PROFILE_FUNCTION()  NAMESPACE_COMPRESSED_IMAGE::detail::InstrumentationTimer CONCAT_2(timer, __LINE__)(__FUNCTION__)
 #else
 #define _COMPRESSED_PROFILE_SCOPE(name)
 #define _COMPRESSED_PROFILE_FUNCTION()
 #endif
 
-namespace NAMESPACE_COMPRESSED_IMAGE
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-
     namespace detail
     {
-
         struct ProfileResult
         {
             std::string Name;
@@ -54,6 +56,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE
             std::ofstream m_OutputStream;
             int m_ProfileCount;
             std::mutex m_lock;
+
         public:
             Instrumentor()
                 : m_CurrentSession(nullptr), m_ProfileCount(0)
@@ -64,7 +67,7 @@ namespace NAMESPACE_COMPRESSED_IMAGE
             {
                 m_OutputStream.open(filepath);
                 WriteHeader();
-                m_CurrentSession = new InstrumentationSession{ name };
+                m_CurrentSession = new InstrumentationSession{name};
             }
 
             void EndSession()
@@ -137,19 +140,21 @@ namespace NAMESPACE_COMPRESSED_IMAGE
             {
                 auto endTimepoint = std::chrono::high_resolution_clock::now();
 
-                long long start = std::chrono::time_point_cast<std::chrono::microseconds>(m_StartTimepoint).time_since_epoch().count();
-                long long end = std::chrono::time_point_cast<std::chrono::microseconds>(endTimepoint).time_since_epoch().count();
+                long long start = std::chrono::time_point_cast<std::chrono::microseconds>(m_StartTimepoint).
+                                  time_since_epoch().count();
+                long long end = std::chrono::time_point_cast<std::chrono::microseconds>(endTimepoint).time_since_epoch()
+                    .count();
 
                 uint32_t threadID = static_cast<uint32_t>(std::hash<std::thread::id>{}(std::this_thread::get_id()));
-                Instrumentor::Get().WriteProfile({ m_Name, start, end, threadID });
+                Instrumentor::Get().WriteProfile({m_Name, start, end, threadID});
 
                 m_Stopped = true;
             }
+
         private:
             const std::string m_Name{};
             std::chrono::time_point<std::chrono::high_resolution_clock> m_StartTimepoint;
             bool m_Stopped = false;
         };
-
     } // detail
 } // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/detail/scratch_buffer_pool.h b/compressed_image/include/compressed/detail/scratch_buffer_pool.h
new file mode 100644
index 0000000..9fc9bf8
--- /dev/null
+++ b/compressed_image/include/compressed/detail/scratch_buffer_pool.h
@@ -0,0 +1,407 @@
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <memory>
+#include <mutex>
+#include <span>
+#include <thread>
+#include <vector>
+
+#include "compressed/macros.h"
+#include "compressed/util.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace detail
+    {
+        class scratch_buffer_pool;
+
+        /// \brief Move-only RAII handle for a temporary scratch buffer checked out from a scratch_buffer_pool.
+        ///
+        /// The lease owns the byte buffer for as long as compression needs it. Destroying the lease (or move-assigning
+        /// over it) calls release(), which hands the buffer back to the originating pool; the pool only keeps it if
+        /// its caching limits allow.
+        ///
+        /// Copying is disabled. Moving transfers both the buffer and the responsibility to return it.
+        class scratch_buffer_lease
+        {
+        public:
+            scratch_buffer_lease() = default;
+
+            // Leases are move-only: exactly one lease is responsible for returning a given buffer.
+            scratch_buffer_lease(const scratch_buffer_lease&) = delete;
+            scratch_buffer_lease& operator=(const scratch_buffer_lease&) = delete;
+
+            /// Wrap an already allocated buffer together with the pool it belongs to.
+            ///
+            /// \param pool The pool the buffer is handed back to once the lease ends.
+            /// \param buffer The byte buffer this lease takes ownership of.
+            /// \param size The logical number of bytes exposed through span(); may be smaller than the buffer.
+            scratch_buffer_lease(
+                std::shared_ptr<scratch_buffer_pool> pool,
+                util::default_init_vector<std::byte> buffer,
+                const size_t size
+            )
+                : m_pool(std::move(pool)),
+                  m_buffer(std::move(buffer)),
+                  m_size(size)
+            {
+            }
+
+            scratch_buffer_lease(scratch_buffer_lease&& other) noexcept
+                : m_pool(std::move(other.m_pool)),
+                  m_buffer(std::move(other.m_buffer)),
+                  m_size(other.m_size)
+            {
+                other.m_size = 0; // the moved-from lease no longer exposes any bytes
+            }
+
+            scratch_buffer_lease& operator=(scratch_buffer_lease&& other) noexcept
+            {
+                if (this == &other)
+                {
+                    return *this;
+                }
+
+                release(); // give back the buffer we currently hold before adopting other's
+                m_pool = std::move(other.m_pool);
+                m_buffer = std::move(other.m_buffer);
+                m_size = other.m_size;
+                other.m_size = 0;
+                return *this;
+            }
+
+            ~scratch_buffer_lease()
+            {
+                release();
+            }
+
+            /// Mutable view of the leased scratch memory.
+            ///
+            /// The view starts at the buffer's data pointer, spans size() bytes and must not outlive or survive a move of the lease.
+            ///
+            /// \return A mutable span covering the logical scratch region.
+            std::span<std::byte> span() noexcept
+            {
+                return {m_buffer.data(), m_size};
+            }
+
+            /// Read-only view of the leased scratch memory.
+            ///
+            /// The view starts at the buffer's data pointer, spans size() bytes and must not outlive or survive a move of the lease.
+            ///
+            /// \return A const span covering the logical scratch region.
+            std::span<const std::byte> span() const noexcept
+            {
+                return {m_buffer.data(), m_size};
+            }
+
+            /// Logical size of the leased scratch region in bytes.
+            ///
+            /// \return The size passed in when the lease was created, or 0 for an empty / moved-from lease.
+            size_t size() const noexcept
+            {
+                return m_size;
+            }
+
+            /// Check that the held buffer is at least as large as the logical size.
+            ///
+            /// \return True if the buffer holds size() bytes or more, false otherwise.
+            bool valid() const noexcept
+            {
+                return m_size <= m_buffer.size();
+            }
+
+        private:
+            /// Hand the currently held buffer (if any) back to the pool. Only declared here.
+            void release();
+
+            std::shared_ptr<scratch_buffer_pool> m_pool{};   // pool the buffer is handed back to
+            util::default_init_vector<std::byte> m_buffer{}; // backing storage of the lease
+            size_t m_size = 0;                               // logical size exposed via span()
+        };
+
+
+        /// \brief Tuning knobs for a scratch_buffer_pool.
+        ///
+        /// Both limits apply to buffers being returned to the pool: a buffer that would exceed either limit is freed
+        /// instead of cached. A max_cached_buffers of 0 lets the pool pick std::thread::hardware_concurrency().
+        struct scratch_buffer_pool_options
+        {
+            size_t max_cached_buffers{0};
+            size_t max_cached_bytes{size_t{1} << 30}; // 1 GiB
+        };
+
+
+        /// \brief Thread-safe pool for temporary compression scratch buffers.
+        ///
+        /// The pool is used by low-level CPU compression paths to avoid repeatedly allocating temporary output
+        /// buffers for Blosc2 compression. Buffers are handed out as move-only scratch_buffer_lease objects and are
+        /// automatically returned to the pool when the lease goes out of scope.
+        ///
+        /// The pool itself does not have global ownership. Channels keep a shared reference to the active pool,
+        /// while the global registry only stores a weak reference. This allows the pool to be globally discoverable
+        /// while still being destroyed when the last channel / iterator reference disappears.
+        class scratch_buffer_pool : public std::enable_shared_from_this<scratch_buffer_pool>
+        {
+        public:
+            /// Create a pool. A max_cached_buffers of 0 is replaced by the hardware thread count (at least 1).
+            explicit scratch_buffer_pool(const scratch_buffer_pool_options options = {})
+                : m_options(options)
+            {
+                size_t& limit = m_options.max_cached_buffers;
+                // hardware_concurrency() may report 0 when the count is unknown, hence the clamp to 1.
+                limit = (limit != 0) ? limit : std::max<size_t>(1, std::thread::hardware_concurrency());
+            }
+
+            /// Check out a scratch buffer holding at least \p size bytes.
+            ///
+            /// Cached buffers are searched for the tightest fit (smallest buffer that is still large enough). When nothing
+            /// fits, a fresh buffer of exactly \p size bytes is allocated instead. The returned lease holds a shared
+            /// reference to this pool, so the pool must itself be owned by a std::shared_ptr.
+            /// \param size The minimum scratch buffer size in bytes.
+            /// \return A move-only lease whose span() exposes \p size bytes.
+            scratch_buffer_lease acquire(const size_t size)
+            {
+                util::default_init_vector<std::byte> leased{};
+                {
+                    std::scoped_lock lock(m_mutex);
+                    const auto none = m_available.end();
+                    auto tightest = none;
+                    for (auto candidate = m_available.begin(); candidate != none; ++candidate)
+                    {
+                        const bool fits = candidate->size() >= size;
+                        if (fits && (tightest == none || candidate->size() < tightest->size()))
+                        {
+                            tightest = candidate;
+                        }
+                    }
+
+                    if (tightest != none)
+                    {
+                        m_cached_bytes -= tightest->size();
+                        leased = std::move(*tightest);
+                        m_available.erase(tightest);
+                    }
+                }
+
+                // Nothing suitable was cached: allocate here, outside the lock. The buffer can join the pool on return.
+                if (leased.size() < size)
+                {
+                    leased.resize(size);
+                }
+
+                return scratch_buffer_lease(shared_from_this(), std::move(leased), size);
+            }
+
+            /// Drop every buffer that is currently held in the cache and reset the cached byte counter.
+            ///
+            /// Only `m_available` and `m_cached_bytes` are touched, both under the pool mutex. Buffers that were
+            /// moved out by `acquire` are no longer stored in `m_available`, so this call does not modify them.
+            void clear()
+            {
+                std::scoped_lock guard(m_mutex);
+                m_cached_bytes = 0;
+                m_available.clear();
+            }
+
+            /// Report how many bytes the cache currently accounts for.
+            ///
+            /// The counter only covers buffers stored in the pool, not buffers checked out by active leases.
+            ///
+            /// \return The value of the cached byte counter, read under the pool mutex.
+            size_t cached_bytes() const
+            {
+                const std::scoped_lock guard(m_mutex);
+                return m_cached_bytes;
+            }
+
+            /// Report how many buffers are sitting in the cache right now.
+            ///
+            /// Only buffers stored in the pool are counted, not buffers checked out by active leases.
+            ///
+            /// \return The element count of `m_available`, read under the pool mutex.
+            size_t cached_buffers() const
+            {
+                const std::scoped_lock guard(m_mutex);
+                return m_available.size();
+            }
+
+        private:
+            friend class scratch_buffer_lease;
+
+            /// Put \p buffer back into the cache unless it is empty or a configured cache limit forbids it.
+            ///
+            /// \param buffer The buffer to return. Taken by value: when it is not cached it is destroyed on return.
+            void release(util::default_init_vector<std::byte> buffer)
+            {
+                if (buffer.empty())
+                {
+                    return;
+                }
+
+                const auto incoming = buffer.size();
+                std::scoped_lock guard(m_mutex);
+
+                // Reject when the cache already holds the maximum number of buffers or would exceed its byte limit.
+                const bool slots_exhausted = m_available.size() >= m_options.max_cached_buffers;
+                const bool bytes_exhausted = m_cached_bytes + incoming > m_options.max_cached_bytes;
+                if (slots_exhausted || bytes_exhausted)
+                {
+                    return;
+                }
+
+                // Account for the buffer first, then store it.
+                m_cached_bytes += incoming;
+                m_available.push_back(std::move(buffer));
+            }
+
+            scratch_buffer_pool_options m_options{};
+            mutable std::mutex m_mutex{};
+            std::vector<util::default_init_vector<std::byte>> m_available{};
+            size_t m_cached_bytes = 0;
+        };
+
+        inline void scratch_buffer_lease::release()
+        {
+            const bool returnable = m_pool && !m_buffer.empty();
+            if (!returnable)
+            {
+                return;
+            }
+
+            // Detach the pool from the lease and reset the logical size before handing the buffer back.
+            m_size = 0;
+            auto owner = std::move(m_pool);
+            owner->release(std::move(m_buffer));
+        }
+
+        /// \brief Process-wide registry that weakly tracks the scratch buffer pool which is currently alive.
+        ///
+        /// Code that needs a scratch pool can look it up here instead of receiving a pool reference through its
+        /// arguments. Only a `std::weak_ptr` is stored, so the registry never keeps a pool alive on its own; the
+        /// pool is destroyed once every shared pointer obtained from this class has been released.
+        class scratch_pool_registry
+        {
+        public:
+            /// Return the pool that is currently alive, creating and registering a new one when there is none.
+            ///
+            /// The lookup and the possible creation happen under the registry mutex. Since only a weak reference
+            /// is remembered, the caller has to hold on to the returned pointer to keep the pool alive.
+            ///
+            /// \return A shared pointer to the active scratch buffer pool, never nullptr.
+            static std::shared_ptr<scratch_buffer_pool> get_or_create_for_channel()
+            {
+                std::scoped_lock guard(mutex());
+                auto& slot = pool_ref();
+                auto alive = slot.lock();
+                if (!alive)
+                {
+                    // No pool is alive (never registered or already expired): register a fresh one.
+                    alive = std::make_shared<scratch_buffer_pool>();
+                    slot = alive;
+                }
+                return alive;
+            }
+
+            /// Look up the registered pool without ever creating one.
+            ///
+            /// The weak reference is locked under the registry mutex. Callers that receive nullptr have no pool
+            /// to draw from and should fall back to local temporary allocation.
+            ///
+            /// \return The active pool, or nullptr if no pool is alive.
+            static std::shared_ptr<scratch_buffer_pool> current()
+            {
+                const std::scoped_lock guard(mutex());
+                return pool_ref().lock();
+            }
+
+            /// Ask the active pool, if any, to clear its cached buffers.
+            ///
+            /// The pool object itself stays registered; without an active pool this is a no-op.
+            static void clear()
+            {
+                if (const auto active = current(); active != nullptr)
+                {
+                    active->clear();
+                }
+            }
+
+            /// Query the cached byte count of the active pool.
+            ///
+            /// \return `cached_bytes()` of the active pool, or 0 if no pool exists.
+            static size_t cached_bytes()
+            {
+                const auto active = current();
+                if (!active)
+                {
+                    return 0;
+                }
+                return active->cached_bytes();
+            }
+
+            /// Query the cached buffer count of the active pool.
+            ///
+            /// \return `cached_buffers()` of the active pool, or 0 if no pool exists.
+            static size_t cached_buffers()
+            {
+                const auto active = current();
+                if (!active)
+                {
+                    return 0;
+                }
+                return active->cached_buffers();
+            }
+
+        private:
+            /// Access the mutex that serialises reads and writes of the weak pool reference.
+            ///
+            /// \return A reference to a function-local static mutex.
+            static std::mutex& mutex()
+            {
+                static std::mutex registry_mutex{};
+                return registry_mutex;
+            }
+
+            /// Access the weak reference to the active pool.
+            ///
+            /// \return A reference to a function-local static weak pointer.
+            static std::weak_ptr<scratch_buffer_pool>& pool_ref()
+            {
+                static std::weak_ptr<scratch_buffer_pool> registered_pool{};
+                return registered_pool;
+            }
+        };
+    } // namespace detail
+
+    /// \brief Drop the buffers cached by the active global scratch pool, if there is one.
+    ///
+    /// Forwards to the scratch pool registry; buffers that are checked out by leases are not released by this.
+    inline void clear_scratch_pool()
+    {
+        using registry = detail::scratch_pool_registry;
+        registry::clear();
+    }
+
+    /// \brief Number of bytes currently cached by the active global scratch pool.
+    ///
+    /// Buffers that are checked out by active compression operations are not part of this number.
+    /// \return The cached byte count reported by the registry, which is 0 when no pool is alive.
+    inline size_t scratch_pool_cached_bytes()
+    {
+        using registry = detail::scratch_pool_registry;
+        return registry::cached_bytes();
+    }
+
+    /// \brief Number of buffers currently cached by the active global scratch pool.
+    ///
+    /// Buffers that are checked out by active compression operations are not part of this number.
+    /// \return The cached buffer count reported by the registry, which is 0 when no pool is alive.
+    inline size_t scratch_pool_cached_buffers()
+    {
+        using registry = detail::scratch_pool_registry;
+        return registry::cached_buffers();
+    }
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/enums.h b/compressed_image/include/compressed/enums.h
index bdd9dd1..4cd76d6 100644
--- a/compressed_image/include/compressed/enums.h
+++ b/compressed_image/include/compressed/enums.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <map>
+
 #include "macros.h"
 
 #ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE
@@ -7,70 +9,144 @@
 #include <OpenImageIO/half.h>
 #endif
 
-namespace NAMESPACE_COMPRESSED_IMAGE
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-	namespace enums
-	{
-		/// Enum representing available compression codecs.
-		///
-		/// These codecs are inherited from `blosc2` and define different compression algorithms
-		/// that can be used when storing or transmitting compressed images.
-		enum class codec
-		{
-			blosclz, ///< Lightweight, fast compression optimized for high-speed decompression.
-			lz4,     ///< Extremely fast compression and decompression with moderate compression ratio.
-			lz4hc,   ///< High-compression variant of LZ4 with slower compression but similar fast decompression.
-			zstd,	 ///< Zstandard compression providing high compression ratios with decent speed.
-		};
+    namespace enums
+    {
+        /// Enum representing available compression codecs.
+        ///
+        /// These codecs are inherited from `blosc2`/`nvcomp` and define different compression algorithms
+        /// that can be used when storing compressed images. Any gpu codecs rely on nvidia gpus to function
+        /// but will fall back gracefully to a cpu equivalent should there be no nvidia gpu or missing cuda
+        /// libraries.
+        enum class codec
+        {
+            blosclz, ///< Lightweight, fast compression optimized for high-speed decompression.
+            lz4, ///< Extremely fast compression and decompression with moderate compression ratio.
+            lz4hc, ///< High-compression variant of LZ4 with slower compression but similar fast decompression.
+            zstd, ///< Zstandard compression providing high compression ratios with decent speed.
+            lz4_gpu, ///< (cuda) gpu variant of lz4 compression, faster throughput compared to regular lz4.
+            snappy_gpu, ///< (cuda) gpu variant of snappy, a fast compression codec with moderate throughput.
+            zstd_gpu, ///< (cuda) gpu variant of zstd, faster throughput compared to regular zstd.
+            deflate_gpu, ///< (cuda) gpu variant of deflate, faster throughput compared to regular deflate.
+            gdeflate_gpu, ///< (cuda) a bit-swizzled variant of deflate, optimized for gpu performance.
+            cascaded_gpu
+            ///< (cuda) proprietary compression scheme built up from several simple schemes like rle, bitpacking and delta.
+        };
 
+        [[nodiscard]] constexpr std::string_view to_string(const codec value) // maps each enumerator to its spelling
+        {
+            switch (value)
+            {
+            case codec::blosclz: return "blosclz";
+            case codec::lz4: return "lz4";
+            case codec::lz4hc: return "lz4hc";
+            case codec::zstd: return "zstd";
+            case codec::lz4_gpu: return "lz4_gpu";
+            case codec::snappy_gpu: return "snappy_gpu";
+            case codec::zstd_gpu: return "zstd_gpu";
+            case codec::deflate_gpu: return "deflate_gpu";
+            case codec::gdeflate_gpu: return "gdeflate_gpu";
+            case codec::cascaded_gpu: return "cascaded_gpu";
+            }
 
-#ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE
+            return "unknown"; // only reached when value is none of the enumerators handled above
+        }
 
-		/// Get a OpenImageIO TypeDesc based on the given template parameter returning OIIO::TypeDesc::Unknown
-		/// if the image coordinate is not part of the valid template specializations for photoshop buffers
-		template <typename T>
-		constexpr OIIO::TypeDesc get_type_desc()
-		{
-			if constexpr (std::is_same_v<T, uint8_t>)
-			{
-				return OIIO::TypeDesc::UINT8;
-			}
-			else if constexpr (std::is_same_v<T, int8_t>)
-			{
-				return OIIO::TypeDesc::INT8;
-			}
-			else if constexpr (std::is_same_v<T, uint16_t>)
-			{
-				return OIIO::TypeDesc::UINT16;
-			}
-			else if constexpr (std::is_same_v<T, int16_t>)
-			{
-				return OIIO::TypeDesc::INT16;
-			}
-			else if constexpr (std::is_same_v<T, uint32_t>)
-			{
-				return OIIO::TypeDesc::UINT32;
-			}
-			else if constexpr (std::is_same_v<T, int32_t>)
-			{
-				return OIIO::TypeDesc::INT32;
-			}
-			else if constexpr (std::is_same_v<T, float>)
-			{
-				return OIIO::TypeDesc::FLOAT;
-			}
-			else if constexpr (std::is_same_v<T, half>)
-			{
-				return OIIO::TypeDesc::HALF;
-			}
-			else
-			{
-				return OIIO::TypeDesc::UNKNOWN;
-			}
-		}
+        /// Whether \p value is a gpu codec, i.e. anything other than the four cpu codecs listed below.
+        [[nodiscard]] constexpr bool is_gpu_codec(const codec value) noexcept
+        {
+            // The parameter is deliberately not named `codec`: that name would hide the enum type in this scope.
+            const bool is_cpu = value == codec::blosclz || value == codec::lz4 || value == codec::lz4hc ||
+                                value == codec::zstd;
+            return !is_cpu;
+        }
 
-#endif // COMPRESSED_IMAGE_OIIO_AVAILABLE
+        /// \brief Map from each gpu codec to the cpu codec used as its fallback if no nvidia gpu is detected.
+        ///
+        /// Constant after initialization. Declared `inline` so all translation units share one map object.
+        inline const std::map<codec, codec> s_gpu_codec_fallback = {
+            {codec::lz4_gpu, codec::lz4},
+            {codec::snappy_gpu, codec::lz4},
+            {codec::zstd_gpu, codec::zstd},
+            {codec::deflate_gpu, codec::zstd},
+            {codec::gdeflate_gpu, codec::zstd},
+            {codec::cascaded_gpu, codec::lz4}
+        };
+
+        namespace detail
+        {
+            /// \brief The underlying compression/decompression wrappers that codecs are dispatched to.
+            enum class compression_library
+            {
+                c_blosc2, ///< Mapped to by the cpu codecs (blosclz, lz4, lz4hc, zstd).
+                nvcomp ///< Mapped to by all `*_gpu` codecs.
+            };
+
+            /// \brief Mapping of every compression codec to the library that implements it.
+            ///
+            /// Used internally to dispatch the calls. `inline` so all translation units share one map object.
+            inline const std::map<codec, compression_library> s_library_mapping = {
+                {codec::blosclz, compression_library::c_blosc2},
+                {codec::lz4, compression_library::c_blosc2},
+                {codec::lz4hc, compression_library::c_blosc2},
+                {codec::zstd, compression_library::c_blosc2},
+                {codec::lz4_gpu, compression_library::nvcomp},
+                {codec::snappy_gpu, compression_library::nvcomp},
+                {codec::zstd_gpu, compression_library::nvcomp},
+                {codec::deflate_gpu, compression_library::nvcomp},
+                {codec::gdeflate_gpu, compression_library::nvcomp},
+                {codec::cascaded_gpu, compression_library::nvcomp}
+            };
+        } // namespace detail
 
-	} // namespace enums
 
+#ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE
+
+        /// Get the OpenImageIO TypeDesc matching the template parameter T. Handled are uint8_t, int8_t, uint16_t,
+        /// int16_t, uint32_t, int32_t, float and half; any other T yields OIIO::TypeDesc::UNKNOWN.
+        template <typename T>
+        constexpr OIIO::TypeDesc get_type_desc()
+        {
+            if constexpr (std::is_same_v<T, uint8_t>)
+            {
+                return OIIO::TypeDesc::UINT8;
+            }
+            else if constexpr (std::is_same_v<T, int8_t>)
+            {
+                return OIIO::TypeDesc::INT8;
+            }
+            else if constexpr (std::is_same_v<T, uint16_t>)
+            {
+                return OIIO::TypeDesc::UINT16;
+            }
+            else if constexpr (std::is_same_v<T, int16_t>)
+            {
+                return OIIO::TypeDesc::INT16;
+            }
+            else if constexpr (std::is_same_v<T, uint32_t>)
+            {
+                return OIIO::TypeDesc::UINT32;
+            }
+            else if constexpr (std::is_same_v<T, int32_t>)
+            {
+                return OIIO::TypeDesc::INT32;
+            }
+            else if constexpr (std::is_same_v<T, float>)
+            {
+                return OIIO::TypeDesc::FLOAT;
+            }
+            else if constexpr (std::is_same_v<T, half>)
+            {
+                return OIIO::TypeDesc::HALF;
+            }
+            else
+            {
+                return OIIO::TypeDesc::UNKNOWN;
+            }
+        }
+
+#endif // COMPRESSED_IMAGE_OIIO_AVAILABLE
+    } // namespace enums
 } // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/gpu_util.h b/compressed_image/include/compressed/gpu_util.h
new file mode 100644
index 0000000..df3b30b
--- /dev/null
+++ b/compressed_image/include/compressed/gpu_util.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "compressed/macros.h"
+#include "compressed/cuda/cuda_hook.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE::gpu
+{
+    [[nodiscard]] inline bool is_available() noexcept // true iff cuda::cuda_api::instance() does not throw
+    {
+        try
+        {
+            static_cast<void>(cuda::cuda_api::instance()); // probe only, the instance itself is not needed
+            return true;
+        }
+        catch (...) // any failure to obtain the instance is reported as "not available"
+        {
+            return false;
+        }
+    }
+} // namespace NAMESPACE_COMPRESSED_IMAGE::gpu
diff --git a/compressed_image/include/compressed/image.h b/compressed_image/include/compressed/image.h
index 6bf0832..6940566 100644
--- a/compressed_image/include/compressed/image.h
+++ b/compressed_image/include/compressed/image.h
@@ -18,7 +18,6 @@
 #endif
 
 #include "macros.h"
-#include "fwd.h"
 #include "blosc2/wrapper.h"
 #include "blosc2/schunk.h"
 #include "blosc2/lazyschunk.h"
@@ -29,1884 +28,1942 @@
 #include "detail/oiio_util.h"
 #include "detail/scoped_timer.h"
 
-#include "iterators/iterator.h"
-
-namespace NAMESPACE_COMPRESSED_IMAGE 
+namespace
+NAMESPACE_COMPRESSED_IMAGE
 {
-
-	/// Compressed Image representation with easy access to different channels. Internally functions very similar to an NDArray
-	/// with the important distinction that the number of dimensions is fixed to be 3-Dimensional (width, height, channels).
-	/// They are laid out in scanline order with each channel being its own distinct object which may have any size.
-	/// 
-	/// The image is stored in a non-resizable fashion so whatever the resolution was going into it, is what the image will be.
-	/// To rescale or refit the image a new `image` has to be constructed.
-	/// 
-	/// The data is compressed in memory and we store it as part of a blosc2 super-chunk which is essentially a 3d array of 
-	/// super-chunk -> chunk -> block. Where having the block size fit into L1 cache and the Chunk size into L3 cache is desirable
-	/// as each block can be handled by a single cpu core while the chunk fits well within shared L3 memory.
-	template <typename T>
-	struct image : public std::ranges::view_interface<image<T>>
-	{
-		using value_type = T;
-
-		image() = default;
-		image(image&&) = default;
-		image& operator=(image&&) = default;
-		image(const image&) = delete;
-		image& operator=(const image&) = delete;
-		~image() = default;
-
-
-		/// Constructs an image object with the specified channels, dimensions, and optional compression parameters.
-		/// 
-		/// This constructor creates an image from a given set of channels. The channel names can optionally be specified. 
-		/// The image is then compressed using the provided codec and compression level.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::vector<std::span<const uint8_t>> channels = ...;
-		/// compressed::image<uint8_t> my_image(channels, 1920, 1080, {"r", "g", "b"}, codec::lz4, 5);
-		/// \endcode
-		/// 
-		/// \param channels A vector of spans containing the image channels (each channel is a 2D array of pixel data).
-		///					on construction these will be compressed thus the data can be safely freed after this function.
-		/// \param width The width of the image in pixels.
-		/// \param height The height of the image in pixels.
-		/// \param channel_names (Optional) A list of channel names, must match the number of channels provided. 
-		///					     If omitted or incorrect, channel names are ignored.
-		/// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`.
-		/// \param compression_level (Optional) The compression level, default is `9`.
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		/// \throws std::runtime_error if a channel fails to be inserted.
-		image(
-			std::vector<std::span<const T>> channels,
-			size_t width,
-			size_t height,
-			std::vector<std::string> channel_names = {},
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			m_Width = width;
-			m_Height = height;
-			m_ChannelNames = channel_names;
-			auto comp_level_adjusted = util::ensure_compression_level(compression_level);
-
-			// c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this.
-			assert(chunk_size < std::numeric_limits<int32_t>::max());
-			assert(block_size < chunk_size);
-			if (channel_names.size() != channels.size() && channel_names.size() != 0)
-			{
-				std::cout << std::format(
-					"Invalid channelnames passed to image constructor, required them to match the number of" \
-					" channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names",
-					channels.size(), channel_names.size()) << std::endl;
-
-				m_ChannelNames = {};
-			}
-
-			// Iterate all channels and start creating channels for it.
-			size_t channel_idx = 0;
-			for (const auto& _channel : channels)
-			{
-				try
-				{
-					// Generate the channel and append it.
-					m_Channels.push_back(compressed::channel<T>(
-						_channel,
-						width,
-						height,
-						compression_codec,
-						comp_level_adjusted,
-						block_size,
-						chunk_size
-					));
-				}
-				catch (const std::exception& e)
-				{
-					if (m_ChannelNames.size() > 0)
-					{
-						throw std::runtime_error(
-							std::format(
-								"Failed to insert channel '{}' at position {}. Full error: \n{}",
-								m_ChannelNames[channel_idx],
-								channel_idx,
-								e.what()
-							)
-						);
-					}
-					else
-					{
-						throw std::runtime_error(
-							std::format(
-								"Failed to insert channel at position {}. Full error: \n{}",
-								channel_idx,
-								e.what()
-							)
-						);
-					}
-				}
-				++channel_idx;
-			}
-		}
-
-
-		/// Constructs an image object with the specified channels, dimensions, and optional compression parameters.
-		/// 
-		/// This constructor creates an image from a given set of channels. The channel names can optionally be specified. 
-		/// The image is then compressed using the provided codec and compression level.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::vector<std::vector<uint8_t>> channels = ...;
-		/// compressed::image<uint8_t> my_image(channels, 1920, 1080, {"r", "g", "b"}, codec::lz4, 5);
-		/// \endcode
-		/// 
-		/// \param channels A vector of vectors containing the image channels (each channel is a 2D array of pixel data).
-		/// \param width The width of the image in pixels.
-		/// \param height The height of the image in pixels.
-		/// \param channel_names (Optional) A list of channel names, must match the number of channels provided. 
-		///					     If omitted or incorrect, channel names are ignored.
-		/// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`.
-		/// \param compression_level (Optional) The compression level, default is `9`.
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		/// \throws std::runtime_error if a channel fails to be inserted.
-		image(
-			std::vector<std::vector<T>> channels,
-			size_t width,
-			size_t height,
-			std::vector<std::string> channel_names = {},
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			m_Width = width;
-			m_Height = height;
-			m_ChannelNames = channel_names;
-			auto comp_level_adjusted = util::ensure_compression_level(compression_level);
-
-			// c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this.
-			assert(chunk_size < std::numeric_limits<int32_t>::max());
-			assert(block_size < chunk_size);
-			if (channel_names.size() != channels.size() && channel_names.size() != 0)
-			{
-				std::cout << std::format(
-					"Invalid channelnames passed to image constructor, required them to match the number of" \
-					" channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names", 
-					channels.size(), channel_names.size()) << std::endl;
-
-				m_ChannelNames = {};
-			}
-
-			// Iterate all channels and start creating channels for it.
-			size_t channel_idx = 0;
-			for (const auto& _channel : channels)
-			{
-				try
-				{
-					// Generate the channel and append it.
-					m_Channels.push_back(compressed::channel<T>(
-						std::span<const T>(_channel.begin(), _channel.end()),
-						width,
-						height,
-						compression_codec,
-						comp_level_adjusted,
-						block_size,
-						chunk_size
-					));
-				}
-				catch (const std::exception& e)
-				{
-					if (m_ChannelNames.size() > 0)
-					{
-						throw std::runtime_error(
-							std::format(
-								"Failed to insert channel '{}' at position {}. Full error: \n{}",
-								m_ChannelNames[channel_idx],
-								channel_idx,
-								e.what()
-							)
-						);
-					}
-					else
-					{
-						throw std::runtime_error(
-							std::format(
-								"Failed to insert channel at position {}. Full error: \n{}",
-								channel_idx,
-								e.what()
-							)
-						);
-					}
-				}
-				++channel_idx;
-			}
-		}
-
-
-		/// Constructs an image object with the specified channels and dimensions, optionally passing channelnames.
-		/// 
-		/// This constructor creates an image from a given set of channels. The channel names can optionally be specified. 
-		/// The passed channels should already be compressed::channel instances.
-		/// 
-		/// 
-		/// \param channels A vector of compressed::channel instances to initialize the image with.
-		/// \param width The width of the image in pixels.
-		/// \param height The height of the image in pixels.
-		/// \param channel_names (Optional) A list of channel names, must match the number of channels provided. 
-		///					     If omitted or incorrect, channel names are ignored.
-		image(
-			std::vector<compressed::channel<T>> channels,
-			size_t width,
-			size_t height,
-			std::vector<std::string> channel_names = {}
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			m_Width = width;
-			m_Height = height;
-			m_ChannelNames = channel_names;
-
-			if (channel_names.size() != channels.size() && channel_names.size() != 0)
-			{
-				std::cout << std::format(
-					"Invalid channelnames passed to image constructor, required them to match the number of" \
-					" channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names",
-					channels.size(), channel_names.size()) << std::endl;
-
-				m_ChannelNames = {};
-			}
-
-			size_t counter = 0;
-			for (auto& channel : channels)
-			{
-				if (channel.width() != width)
-				{
-					throw std::invalid_argument(
-						std::format(
-							"Invalid channel passed to compressed::image constructor at index {}. It's width does not"
-							" equal {} but instead is {}",
-							counter, width, channel.width()
-						)
-					);
-				}
-				if (channel.height() != height)
-				{
-					throw std::invalid_argument(
-						std::format(
-							"Invalid channel passed to compressed::image constructor at index {}. It's height does not"
-							" equal {} but instead is {}",
-							counter, height, channel.height()
-						)
-					);
-				}
-
-				++counter;
-			}
-			m_Channels = std::move(channels);
-		}
+    /// Compressed Image representation with easy access to different channels. Internally functions very similar to an NDArray
+    /// with the important distinction that the number of dimensions is fixed to be 3-Dimensional (width, height, channels).
+    /// They are laid out in scanline order with each channel being its own distinct object which may have any size.
+    ///
+    /// The image is stored in a non-resizable fashion so whatever the resolution was going into it, is what the image will be.
+    /// To rescale or refit the image a new `image` has to be constructed.
+    ///
+    /// The data is compressed in memory and we store it as part of a blosc2 super-chunk which is essentially a 3d array of
+    /// super-chunk -> chunk -> block. Where having the block size fit into L1 cache and the Chunk size into L3 cache is desirable
+    /// as each block can be handled by a single cpu core while the chunk fits well within shared L3 memory.
+    template <typename T>
+    struct image : public std::ranges::view_interface<image<T>>
+    {
+        using value_type = T;
+
+        image() = default;
+        image(image&&) = default;
+        image& operator=(image&&) = default;
+        image(const image&) = delete;
+        image& operator=(const image&) = delete;
+        ~image() = default;
+
+
+        /// Constructs an image object with the specified channels, dimensions, and optional compression parameters.
+        ///
+        /// This constructor creates an image from a given set of channels. The channel names can optionally be specified.
+        /// The image is then compressed using the provided codec and compression level.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::vector<std::span<const uint8_t>> channels = ...;
+        /// compressed::image<uint8_t> my_image(channels, 1920, 1080, {"r", "g", "b"}, codec::lz4, 5);
+        /// \endcode
+        ///
+        /// \param channels A vector of spans containing the image channels (each channel is a 2D array of pixel data).
+        ///					on construction these will be compressed thus the data can be safely freed after this function.
+        /// \param width The width of the image in pixels.
+        /// \param height The height of the image in pixels.
+        /// \param channel_names (Optional) A list of channel names, must match the number of channels provided.
+        ///					     If omitted or incorrect, channel names are ignored.
+        /// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`.
+        /// \param compression_level (Optional) The compression level, default is `9`.
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle
+        ///					  larger blocks feel free to up this number.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel.
+        ///					  This should be tweaked to be no larger than the size of the usual images you are expecting
+        ///					  to compress for optimal performance but this could be upped which might give better compression
+        ///					  ratios. Must be a multiple of sizeof(T).
+        /// \throws std::runtime_error if a channel fails to be inserted.
+        image(
+            std::vector<std::span<const T>> channels,
+            size_t width,
+            size_t height,
+            const std::vector<std::string>& channel_names = {},
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            m_Width = width;
+            m_Height = height;
+            m_ChannelNames = channel_names;
+            auto comp_level_adjusted = util::ensure_compression_level(compression_level);
+
+            // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this.
+            assert(chunk_size < std::numeric_limits<int32_t>::max());
+            assert(block_size < chunk_size);
+            if (channel_names.size() != channels.size() && channel_names.size() != 0)
+            {
+                std::cout << std::format(
+                    "Invalid channelnames passed to image constructor, required them to match the number of"
+                    " channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names",
+                    channels.size(),
+                    channel_names.size()
+                ) << std::endl;
+
+                m_ChannelNames = {};
+            }
+
+            // Size the storage once so appending never has to relocate channels that were already built.
+            m_Channels.reserve(channels.size());
+            size_t channel_idx = 0;
+            for (const auto& _channel : channels)
+            {
+                try
+                {
+                    // Construct the channel directly inside m_Channels instead of moving a temporary in.
+                    m_Channels.emplace_back(
+                        _channel,
+                        width,
+                        height,
+                        compression_codec,
+                        comp_level_adjusted,
+                        block_size,
+                        chunk_size
+                    );
+                }
+                catch (const std::exception& e)
+                {
+                    // Re-raise with the channel's position (and its name, when names are in use) for context.
+                    if (!m_ChannelNames.empty())
+                    {
+                        throw std::runtime_error(
+                            std::format(
+                                "Failed to insert channel '{}' at position {}. Full error: \n{}",
+                                m_ChannelNames[channel_idx],
+                                channel_idx,
+                                e.what()
+                            )
+                        );
+                    }
+                    else
+                    {
+                        throw std::runtime_error(
+                            std::format(
+                                "Failed to insert channel at position {}. Full error: \n{}",
+                                channel_idx,
+                                e.what()
+                            )
+                        );
+                    }
+                }
+                ++channel_idx;
+            }
+        }
+
+
+        /// Constructs an image object with the specified channels, dimensions, and optional compression parameters.
+        ///
+        /// This constructor creates an image from a given set of channels. The channel names can optionally be specified.
+        /// The image is then compressed using the provided codec and compression level.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::vector<std::vector<uint8_t>> channels = ...;
+        /// compressed::image<uint8_t> my_image(channels, 1920, 1080, {"r", "g", "b"}, compressed::enums::codec::lz4, 5);
+        /// \endcode
+        ///
+        /// \param channels A vector of vectors containing the image channels (each channel is a 2D array of pixel data).
+        /// \param width The width of the image in pixels.
+        /// \param height The height of the image in pixels.
+        /// \param channel_names (Optional) A list of channel names, must match the number of channels provided.
+        ///					     If omitted or incorrect, channel names are ignored.
+        /// \param compression_codec (Optional) The codec used for compression, default is `codec::lz4`.
+        /// \param compression_level (Optional) The compression level, default is `9`.
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle
+        ///					  larger blocks feel free to up this number.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel.
+        ///					  This should be tweaked to be no larger than the size of the usual images you are expecting
+        ///					  to compress for optimal performance but this could be upped which might give better compression
+        ///					  ratios. Must be a multiple of sizeof(T).
+        /// \throws std::runtime_error if a channel fails to be inserted.
+        image(
+            std::vector<std::vector<T>> channels,
+            size_t width,
+            size_t height,
+            std::vector<std::string> channel_names = {},
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            m_Width = width;
+            m_Height = height;
+            auto comp_level_adjusted = util::ensure_compression_level(compression_level);
+
+            // c-blosc2 chunks can at most be 2 gigabytes so the set chunk size should not exceed this.
+            assert(chunk_size < std::numeric_limits<int32_t>::max());
+            assert(block_size < chunk_size);
+            if (channel_names.size() != channels.size() && channel_names.size() != 0)
+            {
+                std::cout << std::format(
+                    "Invalid channelnames passed to image constructor, required them to match the number of"
+                    " channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names",
+                    channels.size(),
+                    channel_names.size()
+                ) << std::endl;
+                // A mismatched name list is discarded rather than treated as an error.
+                channel_names.clear();
+            }
+            // channel_names was taken by value, so hand its storage over instead of copying it.
+            m_ChannelNames = std::move(channel_names);
+
+            // Size the storage once so appending never has to relocate channels that were already built.
+            m_Channels.reserve(channels.size());
+            size_t channel_idx = 0;
+            for (const auto& _channel : channels)
+            {
+                try
+                {
+                    // Construct the channel directly inside m_Channels instead of moving a temporary in.
+                    m_Channels.emplace_back(
+                        std::span<const T>(_channel.begin(), _channel.end()),
+                        width,
+                        height,
+                        compression_codec,
+                        comp_level_adjusted,
+                        block_size,
+                        chunk_size
+                    );
+                }
+                catch (const std::exception& e)
+                {
+                    if (!m_ChannelNames.empty())
+                    {
+                        throw std::runtime_error(
+                            std::format(
+                                "Failed to insert channel '{}' at position {}. Full error: \n{}",
+                                m_ChannelNames[channel_idx],
+                                channel_idx,
+                                e.what()
+                            )
+                        );
+                    }
+                    else
+                    {
+                        throw std::runtime_error(
+                            std::format(
+                                "Failed to insert channel at position {}. Full error: \n{}",
+                                channel_idx,
+                                e.what()
+                            )
+                        );
+                    }
+                }
+                ++channel_idx;
+            }
+        }
+
+
+        /// Constructs an image object with the specified channels and dimensions, optionally passing channelnames.
+        ///
+        /// This constructor creates an image from a given set of channels. The channel names can optionally be specified.
+        /// The passed channels should already be compressed::channel instances.
+        ///
+        /// \param channels A vector of compressed::channel instances to initialize the image with.
+        /// \param width The width of the image in pixels.
+        /// \param height The height of the image in pixels.
+        /// \param channel_names (Optional) A list of channel names, must match the number of channels provided.
+        ///					     If omitted or incorrect, channel names are ignored.
+        /// \throws std::invalid_argument if a channel's width or height does not match the passed width or height.
+        image(
+            std::vector<compressed::channel<T>> channels,
+            size_t width,
+            size_t height,
+            std::vector<std::string> channel_names = {}
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            m_Width = width;
+            m_Height = height;
+
+            if (channel_names.size() != channels.size() && channel_names.size() != 0)
+            {
+                std::cout << std::format(
+                    "Invalid channelnames passed to image constructor, required them to match the number of"
+                    " channels in the channels parameter.Expected {} items but instead got {} names. Ignoring channel names",
+                    channels.size(),
+                    channel_names.size()
+                ) << std::endl;
+                // A mismatched name list is discarded rather than treated as an error.
+                channel_names.clear();
+            }
+
+            size_t counter = 0;
+            for (auto& channel : channels)
+            {
+                if (channel.width() != width)
+                {
+                    throw std::invalid_argument(
+                        std::format(
+                            "Invalid channel passed to compressed::image constructor at index {}. It's width does not"
+                            " equal {} but instead is {}",
+                            counter,
+                            width,
+                            channel.width()
+                        )
+                    );
+                }
+                if (channel.height() != height)
+                {
+                    throw std::invalid_argument(
+                        std::format(
+                            "Invalid channel passed to compressed::image constructor at index {}. It's height does not"
+                            " equal {} but instead is {}",
+                            counter,
+                            height,
+                            channel.height()
+                        )
+                    );
+                }
+                ++counter;
+            }
+            // Both parameters were taken by value, so hand their storage over instead of copying it.
+            m_ChannelNames = std::move(channel_names);
+            m_Channels = std::move(channels);
+        }
 
 
 #ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE
 
-		/// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
-		/// 
-		/// Requires CompressedImage to have been compiled with OpenImageIO support.
-		/// 
-		/// This function reads an image file in chunks and compresses it on the fly leading to much
-		/// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
-		/// that is well compressible this can easily achieve a compression ratio of 5-10x.
-		/// 
-		/// The type does not have to match that of the underlying image as OpenImageIO will take
-		/// care of converting the files into the specified format. It is perfectly valid to read 
-		/// a floating point image as e.g. uint16_t etc.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::filesystem::path filepath = "image.exr";
-		/// auto img = compressed::image::read<uint8_t>(filepath, 0, compressed::enums::codec::lz4, 5);
-		/// \endcode
-		///
-		/// \param filepath The file path of the image to read.
-		/// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
-		/// \param compression_codec The compression codec to use (default: LZ4).
-		/// \param compression_level The compression level (default: 9).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		/// \return A compressed image instance.
-		static image read(
-			std::filesystem::path filepath,
-			int subimage = 0,
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			// Initialize the OIIO primitives
-			auto input_ptr = OIIO::ImageInput::open(filepath);
-			if (!input_ptr)
-			{
-				throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string()));
-			}
-
-			// Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent.
-			auto res = input_ptr->seek_subimage(subimage, 0);
-			if (!res)
-			{
-				throw std::invalid_argument(
-					std::format(
-						"File '{}' does not have a subimage {}, cannot seek to it", filepath.string(), subimage
-					)
-				);
-			}
-			const OIIO::ImageSpec& spec = input_ptr->spec();
-
-			return image<T>::read(
-				std::move(input_ptr),
-				spec.channelnames,
-				subimage,
-				compression_codec,
-				compression_level,
-				block_size,
-				chunk_size
-			);
-		}
-
-		/// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
-		/// 
-		/// Requires CompressedImage to have been compiled with OpenImageIO support.
-		/// 
-		/// This function reads an image file in chunks and compresses it on the fly leading to much
-		/// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
-		/// that is well compressible this can easily achieve a compression ratio of 5-10x.
-		/// 
-		/// The type does not have to match that of the underlying image as OpenImageIO will take
-		/// care of converting the files into the specified format. It is perfectly valid to read 
-		/// a floating point image as e.g. uint16_t etc.
-		/// 
-		/// This overload allows you to specify a custom invocable function which is executed after a chunk has been read
-		/// and before it is compressed. If you have some common operations like color management or a filter which you
-		/// wish to apply this would go in here.
-		/// Specifying these right away in the read is much more efficient than iterating over the image again later and
-		/// applying these.
-		/// 
-		/// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block
-		/// of data (that is part of an image) as well as the channel index we are currently operating over.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::filesystem::path filepath = "image.exr";
-		/// 
-		/// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2).
-		/// 
-		/// auto postprocess = [](size_t channel_idx, std::span<T> chunk)
-		///		{
-		///			if (channel_idx > 2)
-		///			{
-		///				return;
-		///			}
-		/// 
-		///			std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value)
-		///			{
-		///				value += 1;
-		///			}
-		///		};
-		/// 
-		/// auto img = compressed::image::read<uint8_t>(
-		///		filepath, 
-		///		std::forward(postprocess),
-		///		0, // subimage
-		///		compressed::enums::codec::lz4, // compression_code
-		///		5 // compression_level
-		/// );
-		/// \endcode
-		///
-		/// \param filepath The file path of the image to read.
-		/// \param postprocess A postprocessing function to run after read but before re-compression. This function should
-		///					   take a `size_t` and a `std::span<T>` where the `size_t` is the channel index we are currently
-		///					   iterating over (e.g. 3 for the alpha channel) and the `std::span<T>` is a chunk within that
-		///					   channel, where this chunk is and what coordinates it represents is not passed along.
-		/// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
-		/// \param compression_codec The compression codec to use (default: LZ4).
-		/// \param compression_level The compression level (default: 9).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		/// \return A compressed image instance.
-		template <typename PostProcess>
-			requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>
-		static image read(
-			std::filesystem::path filepath,
-			PostProcess&& postprocess,
-			int subimage = 0,
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			// Initialize the OIIO primitives
-			auto input_ptr = OIIO::ImageInput::open(filepath);
-			if (!input_ptr)
-			{
-				throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string()));
-			}
-			
-			// Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent.
-			auto res = input_ptr->seek_subimage(subimage, 0);
-			if (!res)
-			{
-				throw std::invalid_argument(
-					std::format(
-						"File '{}' does not have a subimage {}, cannot seek to it", filepath.string(), subimage
-					)
-				);
-			}
-			const OIIO::ImageSpec& spec = input_ptr->spec();
-
-			return image<T>::read(
-				std::move(input_ptr),
-				std::forward<PostProcess>(postprocess),
-				spec.channelnames,
-				subimage,
-				compression_codec,
-				compression_level,
-				block_size,
-				chunk_size
-			);
-		}
-
-		/// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
-		/// 
-		/// Requires CompressedImage to have been compiled with OpenImageIO support.
-		/// 
-		/// This function reads an image file in chunks and compresses it on the fly leading to much
-		/// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
-		/// that is well compressible this can easily achieve a compression ratio of 5-10x.
-		/// 
-		/// The type does not have to match that of the underlying image as OpenImageIO will take
-		/// care of converting the files into the specified format. It is perfectly valid to read 
-		/// a floating point image as e.g. uint16_t etc.
-		/// 
-		/// This overload allows you to only extract the channels specified which is useful if you have e.g. 
-		/// a multilayer file but only wish to extract the RGBA components.
-		/// 
-		/// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::filesystem::path filepath = "image.exr";
-		/// 
-		/// auto input_ptr = OIIO::ImageInput::open(filepath);
-		/// if (!input_ptr)
-		/// {
-		/// 	throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string()));
-		/// }
-		/// 
-		/// auto img = compressed::image::read<uint8_t>(input_ptr, {0, 1, 2, 3});
-		/// \endcode
-		///
-		/// \param input_ptr The opened OIIO input pointer.
-		/// \param channelindices The channels you wish to extract. These may be specified in any order. We throw a 
-		///						  std::out_of_range if one of the passed channels does not exist. It is perfectly valid
-		///						  to e.g. call this with {3, 1, 2} when the underlying channel structure may be 
-		///						  RGBA. Sorting these back into their underlying channel structure is done on read.
-		/// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
-		/// \param compression_codec The compression codec to use (default: LZ4).
-		/// \param compression_level The compression level (default: 9).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		/// \return A compressed image instance.
-		static image read(
-			std::unique_ptr<OIIO::ImageInput> input_ptr,
-			std::vector<int> channelindices,
-			int subimage = 0,
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			std::vector<std::string> channelnames{};
-
-			// Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent.
-			auto res = input_ptr->seek_subimage(subimage, 0);
-			if (!res)
-			{
-				throw std::invalid_argument(
-					std::format(
-						"File does not have a subimage {}, cannot seek to it", subimage
-					)
-				);
-			}
-			const auto& spec = input_ptr->spec();
-
-			for (int i : channelindices)
-			{
-				channelnames.push_back(spec.channelnames.at(i));
-			}
-
-			return image<T>::read(
-				std::move(input_ptr),
-				std::move(channelnames),
-				subimage,
-				compression_codec,
-				compression_level,
-				block_size,
-				chunk_size
-			);
-		}
-
-		/// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
-		/// 
-		/// Requires CompressedImage to have been compiled with OpenImageIO support.
-		/// 
-		/// This function reads an image file in chunks and compresses it on the fly leading to much
-		/// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
-		/// that is well compressible this can easily achieve a compression ratio of 5-10x.
-		/// 
-		/// The type does not have to match that of the underlying image as OpenImageIO will take
-		/// care of converting the files into the specified format. It is perfectly valid to read 
-		/// a floating point image as e.g. uint16_t etc.
-		/// 
-		/// This overload allows you to only extract the channels specified which is useful if you have e.g. 
-		/// a multilayer file but only wish to extract the RGBA components.
-		/// 
-		/// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput.
-		/// 
-		/// This function allows you to specify a custom invocable function which is executed after a chunk has been read
-		/// and before it is compressed. If you have some common operations like color management or a filter which you
-		/// wish to apply this would go in here.
-		/// Specifying these right away in the read is much more efficient than iterating over the image again later and
-		/// applying these.
-		/// 
-		/// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block
-		/// of data (that is part of an image) as well as the channel index we are currently operating over.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::filesystem::path filepath = "image.exr";
-		/// 
-		/// auto input_ptr = OIIO::ImageInput::open(filepath);
-		/// if (!input_ptr)
-		/// {
-		/// 	throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string()));
-		/// }
-		/// 
-		/// auto postprocess = [](size_t channel_idx, std::span<T> chunk)
-		///		{
-		///			if (channel_idx > 2)
-		///			{
-		///				return;
-		///			}
-		///		
-		///			std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value)
-		///			{
-		///				value += 1;
-		///			}
-		///		};
-		/// 
-		/// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2).
-		/// auto img = compressed::image::read<uint8_t>(
-		///		std::move(input_ptr), 
-		///		std::forward(postprocess),
-		///		{ 0, 1, 2, 3}, // only read the RGBA channels
-		///		0, // subimage
-		///		compressed::enums::codec::lz4, 
-		///		5
-		/// );
-		/// \endcode
-		///
-		/// \param input_ptr The opened OIIO input pointer.
-		/// \param postprocess A postprocessing function to run after read but before re-compression. This function should
-		///					   take a `size_t` and a `std::span<T>` where the `size_t` is the channel index we are currently
-		///					   iterating over (e.g. 3 for the alpha channel) and the `std::span<T>` is a chunk within that
-		///					   channel, where this chunk is and what coordinates it represents is not passed along.
-		/// \param channelindices The channels you wish to extract. These may be specified in any order. We throw a 
-		///						  std::out_of_range if one of the passed channels does not exist. It is perfectly valid
-		///						  to e.g. call this with {3, 1, 2} when the underlying channel structure may be 
-		///						  RGBA. Sorting these back into their underlying channel structure is done on read.
-		/// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
-		/// \param compression_codec The compression codec to use (default: LZ4).
-		/// \param compression_level The compression level (default: 9).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		/// \return A compressed image instance.
-		template <typename PostProcess>
-			requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>
-		static image read(
-			std::unique_ptr<OIIO::ImageInput> input_ptr,
-			PostProcess&& postprocess,
-			std::vector<int> channelindices,
-			int subimage = 0,
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			std::vector<std::string> channelnames{};
-
-			// Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent.
-			auto res = input_ptr->seek_subimage(subimage, 0);
-			if (!res)
-			{
-				throw std::invalid_argument(
-					std::format(
-						"File does not have a subimage {}, cannot seek to it", subimage
-					)
-				);
-			}
-			const auto& spec = input_ptr->spec();
-
-			for (int i : channelindices)
-			{
-				channelnames.push_back(spec.channelnames.at(i));
-			}
-
-			return image<T>::read(
-				std::move(input_ptr),
-				std::forward<PostProcess>(postprocess),
-				subimage,
-				std::move(channelnames),
-				compression_codec,
-				compression_level,
-				block_size,
-				chunk_size
-			);
-		}
-
-
-		/// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
-		/// 
-		/// Requires CompressedImage to have been compiled with OpenImageIO support.
-		/// 
-		/// This function reads an image file in chunks and compresses it on the fly leading to much
-		/// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
-		/// that is well compressible this can easily achieve a compression ratio of 5-10x.
-		/// 
-		/// The type does not have to match that of the underlying image as OpenImageIO will take
-		/// care of converting the files into the specified format. It is perfectly valid to read 
-		/// a floating point image as e.g. uint16_t etc.
-		/// 
-		/// This overload allows you to only extract the channels specified which is useful if you have e.g. 
-		/// a multilayer file but only wish to extract the RGBA components.
-		/// 
-		/// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::filesystem::path filepath = "image.exr";
-		/// 
-		/// auto input_ptr = OIIO::ImageInput::open(filepath);
-		/// if (!input_ptr)
-		/// {
-		/// 	throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string()));
-		/// }
-		/// 
-		/// auto img = compressed::image::read<uint8_t>(std::move(input_ptr), {"R", "G", "B", "A"});
-		/// \endcode
-		///
-		/// \param input_ptr The opened OIIO input pointer.
-		/// \param channelnames The channels you wish to extract. These may be specified in any order. We throw a 
-		///						std::out_of_range if one of the passed channels does not exist. It is perfectly valid
-		///						to e.g. call this with {"G", "R", "A"} when the underlying channel structure may be 
-		///						RGBA. Sorting these back into their underlying channel structure is done on read.
-		/// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
-		/// \param compression_codec The compression codec to use (default: LZ4).
-		/// \param compression_level The compression level (default: 9).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		/// \return A compressed image instance.
-		static image read(
-			std::unique_ptr<OIIO::ImageInput> input_ptr,
-			std::vector<std::string> channelnames,
-			int subimage = 0,
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			return image<T>::read_impl(
-				std::move(input_ptr),
-				std::move(channelnames),
-				std::nullopt,
-				subimage,
-				compression_codec,
-				compression_level,
-				block_size,
-				chunk_size
-			);
-		}
-
-
-		/// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
-		/// 
-		/// Requires CompressedImage to have been compiled with OpenImageIO support.
-		/// 
-		/// This function reads an image file in chunks and compresses it on the fly leading to much
-		/// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
-		/// that is well compressible this can easily achieve a compression ratio of 5-10x.
-		/// 
-		/// The type does not have to match that of the underlying image as OpenImageIO will take
-		/// care of converting the files into the specified format. It is perfectly valid to read 
-		/// a floating point image as e.g. uint16_t etc.
-		/// 
-		/// This overload allows you to only extract the channels specified which is useful if you have e.g. 
-		/// a multilayer file but only wish to extract the RGBA components.
-		/// 
-		/// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput.
-		/// 
-		/// This function allows you to specify a custom invocable function which is executed after a chunk has been read
-		/// and before it is compressed. If you have some common operations like color management or a filter which you
-		/// wish to apply this would go in here.
-		/// Specifying these right away in the read is much more efficient than iterating over the image again later and
-		/// applying these.
-		/// 
-		/// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block
-		/// of data (that is part of an image) as well as the channel index we are currently operating over.
-		/// 
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::filesystem::path filepath = "image.exr";
-		/// 
-		/// auto input_ptr = OIIO::ImageInput::open(filepath);
-		/// if (!input_ptr)
-		/// {
-		/// 	throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string()));
-		/// }
-		/// 
-		/// auto postprocess = [](size_t channel_idx, std::span<T> chunk)
-		///		{
-		///			if (channel_idx > 2)
-		///			{
-		///				return;
-		///			}
-		/// 
-		///			std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](T& value)
-		///			{
-		///				value += 1;
-		///			}
-		///		};
-		/// 
-		/// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2).
-		/// auto img = compressed::image::read<uint8_t>(
-		///		std::move(input_ptr), 
-		///		std::forward(postprocess),
-		///		{ 0, 1, 2, 3}, // only read the RGBA channels
-		///		0, // subimage
-		///		compressed::enums::codec::lz4, 
-		///		5
-		/// );
-		/// \endcode
-		///
-		/// \param input_ptr The opened OIIO input pointer.
-		/// \param postprocess A postprocessing function to run after read but before re-compression. This function should
-		///					   take a `size_t` and a `std::span<T>` where the `size_t` is the channel index we are currently
-		///					   iterating over (e.g. 3 for the alpha channel) and the `std::span<T>` is a chunk within that
-		///					   channel, where this chunk is and what coordinates it represents is not passed along.
-		/// \param channelnames The channels you wish to extract. These may be specified in any order. We throw a 
-		///						std::out_of_range if one of the passed channels does not exist. It is perfectly valid
-		///						to e.g. call this with {"G", "R", "A"} when the underlying channel structure may be 
-		///						RGBA. Sorting these back into their underlying channel structure is done on read.
-		/// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
-		/// \param compression_codec The compression codec to use (default: LZ4).
-		/// \param compression_level The compression level (default: 9).
-		/// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to 
-		///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle 
-		///					  larger blocks feel free to up this number.
-		/// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel. 
-		///					  This should be tweaked to be no larger than the size of the usual images you are expecting  
-		///					  to compress for optimal performance but this could be upped which might give better compression
-		///					  ratios. Must be a multiple of sizeof(T).
-		/// \return A compressed image instance.
-		template <typename PostProcess>
-			requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>
-		static image read(
-			std::unique_ptr<OIIO::ImageInput> input_ptr,
-			PostProcess&& postprocess,
-			std::vector<std::string> channelnames,
-			int subimage = 0,
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-			)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			return image<T>::read_impl(
-				std::move(input_ptr),
-				std::move(channelnames),
-				std::forward<PostProcess>(postprocess),
-				subimage,
-				compression_codec,
-				compression_level,
-				block_size,
-				chunk_size
-			);
-		}
-
-
-		/// \brief Read the metadata from the openimageio pointer into a json representation
-		/// \param input_ptr The input file to query
-		/// \return The metadata encoded as json. This does not recursively parse jsons!
-		static json_ordered read_oiio_metadata(const OIIO::ImageSpec& spec)
-		{
-			return detail::param_value::to_json(spec.extra_attribs);
-		}
-
-		/// \brief Read the metadata from the file into a json representation
-		/// \param input_ptr The input file to query
-		/// 
-		/// \throws std::invalid_argument if the file does not exist on disk.
-		/// 
-		/// \return The metadata encoded as json. This does not recursively parse jsons!
-		static json_ordered read_oiio_metadata(std::filesystem::path filepath)
-		{
-			// Initialize the OIIO primitives
-			auto input_ptr = OIIO::ImageInput::open(filepath);
-			if (!input_ptr)
-			{
-				throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string()));
-			}
-
-			return detail::param_value::to_json(input_ptr->spec().extra_attribs);
-		}
+        /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
+        ///
+        /// Requires CompressedImage to have been compiled with OpenImageIO support.
+        ///
+        /// This function reads an image file in chunks and compresses it on the fly leading to much
+        /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
+        /// that is well compressible this can easily achieve a compression ratio of 5-10x.
+        ///
+        /// The type does not have to match that of the underlying image as OpenImageIO will take
+        /// care of converting the files into the specified format. It is perfectly valid to read
+        /// a floating point image as e.g. uint16_t etc.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::filesystem::path filepath = "image.exr";
+        /// auto img = compressed::image<uint8_t>::read(filepath, 0, compressed::enums::codec::lz4, 5);
+        /// \endcode
+        /// \throws std::invalid_argument if the file could not be opened or the requested subimage does not exist.
+        /// \param filepath The file path of the image to read.
+        /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
+        /// \param compression_codec The compression codec to use (default: LZ4).
+        /// \param compression_level The compression level (default: 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle
+        ///					  larger blocks feel free to up this number.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel.
+        ///					  This should be tweaked to be no larger than the size of the usual images you are expecting
+        ///					  to compress for optimal performance but this could be upped which might give better compression
+        ///					  ratios. Must be a multiple of sizeof(T).
+        /// \return A compressed image instance.
+        static image read(
+            std::filesystem::path filepath,
+            int subimage = 0,
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            // Open the file through OpenImageIO; a null pointer means the open failed.
+            auto input_ptr = OIIO::ImageInput::open(filepath);
+            if (!input_ptr)
+            {
+                throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string()));
+            }
+
+            // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent.
+            auto res = input_ptr->seek_subimage(subimage, 0);
+            if (!res)
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "File '{}' does not have a subimage {}, cannot seek to it",
+                        filepath.string(),
+                        subimage
+                    )
+                );
+            }
+            const OIIO::ImageSpec& spec = input_ptr->spec();
+
+            return image<T>::read(
+                std::move(input_ptr),
+                spec.channelnames, // request every channel listed in the spec of the selected subimage
+                subimage,
+                compression_codec,
+                compression_level,
+                block_size,
+                chunk_size
+            );
+        }
+
+        /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
+        ///
+        /// Requires CompressedImage to have been compiled with OpenImageIO support.
+        ///
+        /// This function reads an image file in chunks and compresses it on the fly leading to much
+        /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
+        /// that is well compressible this can easily achieve a compression ratio of 5-10x.
+        ///
+        /// The type does not have to match that of the underlying image as OpenImageIO will take
+        /// care of converting the files into the specified format. It is perfectly valid to read
+        /// a floating point image as e.g. uint16_t etc.
+        ///
+        /// This overload allows you to specify a custom invocable function which is executed after a chunk has been read
+        /// and before it is compressed. If you have some common operations like color management or a filter which you
+        /// wish to apply this would go in here.
+        /// Specifying these right away in the read is much more efficient than iterating over the image again later and
+        /// applying these.
+        ///
+        /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block
+        /// of data (that is part of an image) as well as the channel index we are currently operating over.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::filesystem::path filepath = "image.exr";
+        ///
+        /// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2).
+        ///
+        /// auto postprocess = [](size_t channel_idx, std::span<uint8_t> chunk)
+        ///		{
+        ///			if (channel_idx > 2)
+        ///			{
+        ///				return;
+        ///			}
+        ///
+        ///			std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](uint8_t& value)
+        ///			{
+        ///				value += 1;
+        ///			});
+        ///		};
+        ///
+        /// auto img = compressed::image<uint8_t>::read(
+        ///		filepath,
+        ///		postprocess,
+        ///		0, // subimage
+        ///		compressed::enums::codec::lz4, // compression_codec
+        ///		5 // compression_level
+        /// );
+        /// \endcode
+        /// \throws std::invalid_argument if the file could not be opened or the requested subimage does not exist.
+        /// \param filepath The file path of the image to read.
+        /// \param postprocess A postprocessing function to run after read but before re-compression. This function should
+        ///					   take a `size_t` and a `std::span<T>` where the `size_t` is the channel index we are currently
+        ///					   iterating over (e.g. 3 for the alpha channel) and the `std::span<T>` is a chunk within that
+        ///					   channel, where this chunk is and what coordinates it represents is not passed along.
+        /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
+        /// \param compression_codec The compression codec to use (default: LZ4).
+        /// \param compression_level The compression level (default: 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle
+        ///					  larger blocks feel free to up this number.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel.
+        ///					  This should be tweaked to be no larger than the size of the usual images you are expecting
+        ///					  to compress for optimal performance but this could be upped which might give better compression
+        ///					  ratios. Must be a multiple of sizeof(T).
+        /// \return A compressed image instance.
+        template <typename PostProcess>
+            requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>
+        static image read(
+            std::filesystem::path filepath,
+            PostProcess&& postprocess,
+            int subimage = 0,
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            // Open the file through OpenImageIO; a null pointer means the open failed.
+            auto input_ptr = OIIO::ImageInput::open(filepath);
+            if (!input_ptr)
+            {
+                throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string()));
+            }
+
+            // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent.
+            auto res = input_ptr->seek_subimage(subimage, 0);
+            if (!res)
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "File '{}' does not have a subimage {}, cannot seek to it",
+                        filepath.string(),
+                        subimage
+                    )
+                );
+            }
+            const OIIO::ImageSpec& spec = input_ptr->spec();
+
+            return image<T>::read(
+                std::move(input_ptr),
+                std::forward<PostProcess>(postprocess),
+                spec.channelnames, // request every channel listed in the spec of the selected subimage
+                subimage,
+                compression_codec,
+                compression_level,
+                block_size,
+                chunk_size
+            );
+        }
+
+        /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
+        ///
+        /// Requires CompressedImage to have been compiled with OpenImageIO support.
+        ///
+        /// This function reads an image file in chunks and compresses it on the fly leading to much
+        /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
+        /// that is well compressible this can easily achieve a compression ratio of 5-10x.
+        ///
+        /// The type does not have to match that of the underlying image as OpenImageIO will take
+        /// care of converting the files into the specified format. It is perfectly valid to read
+        /// a floating point image as e.g. uint16_t etc.
+        ///
+        /// This overload allows you to only extract the channels specified which is useful if you have e.g.
+        /// a multilayer file but only wish to extract the RGBA components.
+        ///
+        /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::filesystem::path filepath = "image.exr";
+        ///
+        /// auto input_ptr = OIIO::ImageInput::open(filepath);
+        /// if (!input_ptr)
+        /// {
+        /// 	throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string()));
+        /// }
+        ///
+        /// auto img = compressed::image<uint8_t>::read(std::move(input_ptr), {0, 1, 2, 3});
+        /// \endcode
+        /// \throws std::invalid_argument if the requested subimage does not exist in the opened input.
+        /// \param input_ptr The opened OIIO input pointer.
+        /// \param channelindices The channels you wish to extract. These may be specified in any order. We throw a
+        ///						  std::out_of_range if one of the passed channels does not exist. It is perfectly valid
+        ///						  to e.g. call this with {3, 1, 2} when the underlying channel structure may be
+        ///						  RGBA. Sorting these back into their underlying channel structure is done on read.
+        /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
+        /// \param compression_codec The compression codec to use (default: LZ4).
+        /// \param compression_level The compression level (default: 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle
+        ///					  larger blocks feel free to up this number.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel.
+        ///					  This should be tweaked to be no larger than the size of the usual images you are expecting
+        ///					  to compress for optimal performance but this could be upped which might give better compression
+        ///					  ratios. Must be a multiple of sizeof(T).
+        /// \return A compressed image instance.
+        static image read(
+            std::unique_ptr<OIIO::ImageInput> input_ptr,
+            std::vector<int> channelindices,
+            int subimage = 0,
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            std::vector<std::string> channelnames{};
+
+            // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent.
+            auto res = input_ptr->seek_subimage(subimage, 0);
+            if (!res)
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "File does not have a subimage {}, cannot seek to it",
+                        subimage
+                    )
+                );
+            }
+            const auto& spec = input_ptr->spec();
+
+            for (int i : channelindices)
+            {
+                channelnames.push_back(spec.channelnames.at(i)); // at() throws std::out_of_range on a bad index
+            }
+
+            return image<T>::read(
+                std::move(input_ptr),
+                std::move(channelnames),
+                subimage,
+                compression_codec,
+                compression_level,
+                block_size,
+                chunk_size
+            );
+        }
+
+        /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
+        ///
+        /// Requires CompressedImage to have been compiled with OpenImageIO support.
+        ///
+        /// This function reads an image file in chunks and compresses it on the fly leading to much
+        /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
+        /// that is well compressible this can easily achieve a compression ratio of 5-10x.
+        ///
+        /// The type does not have to match that of the underlying image as OpenImageIO will take
+        /// care of converting the files into the specified format. It is perfectly valid to read
+        /// a floating point image as e.g. uint16_t etc.
+        ///
+        /// This overload allows you to only extract the channels specified which is useful if you have e.g.
+        /// a multilayer file but only wish to extract the RGBA components.
+        ///
+        /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput.
+        ///
+        /// This function allows you to specify a custom invocable function which is executed after a chunk has been read
+        /// and before it is compressed. If you have some common operations like color management or a filter which you
+        /// wish to apply this would go in here.
+        /// Specifying these right away in the read is much more efficient than iterating over the image again later and
+        /// applying these.
+        ///
+        /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block
+        /// of data (that is part of an image) as well as the channel index we are currently operating over.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::filesystem::path filepath = "image.exr";
+        ///
+        /// auto input_ptr = OIIO::ImageInput::open(filepath);
+        /// if (!input_ptr)
+        /// {
+        /// 	throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string()));
+        /// }
+        ///
+        /// auto postprocess = [](size_t channel_idx, std::span<uint8_t> chunk)
+        ///		{
+        ///			if (channel_idx > 2)
+        ///			{
+        ///				return;
+        ///			}
+        ///
+        ///			std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](uint8_t& value)
+        ///			{
+        ///				value += 1;
+        ///			});
+        ///		};
+        ///
+        /// // Read an image file and apply a post-process which adds 1 to the pixel value for all RGB channels (0, 1, 2).
+        /// auto img = compressed::image<uint8_t>::read(
+        ///		std::move(input_ptr),
+        ///		postprocess,
+        ///		{ 0, 1, 2, 3}, // only read the RGBA channels
+        ///		0, // subimage
+        ///		compressed::enums::codec::lz4,
+        ///		5
+        /// );
+        /// \endcode
+        /// \throws std::invalid_argument if the requested subimage does not exist in the opened input.
+        /// \param input_ptr The opened OIIO input pointer.
+        /// \param postprocess A postprocessing function to run after read but before re-compression. This function should
+        ///					   take a `size_t` and a `std::span<T>` where the `size_t` is the channel index we are currently
+        ///					   iterating over (e.g. 3 for the alpha channel) and the `std::span<T>` is a chunk within that
+        ///					   channel, where this chunk is and what coordinates it represents is not passed along.
+        /// \param channelindices The channels you wish to extract. These may be specified in any order. We throw a
+        ///						  std::out_of_range if one of the passed channels does not exist. It is perfectly valid
+        ///						  to e.g. call this with {3, 1, 2} when the underlying channel structure may be
+        ///						  RGBA. Sorting these back into their underlying channel structure is done on read.
+        /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
+        /// \param compression_codec The compression codec to use (default: LZ4).
+        /// \param compression_level The compression level (default: 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle
+        ///					  larger blocks feel free to up this number.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel.
+        ///					  This should be tweaked to be no larger than the size of the usual images you are expecting
+        ///					  to compress for optimal performance but this could be upped which might give better compression
+        ///					  ratios. Must be a multiple of sizeof(T).
+        /// \return A compressed image instance.
+        template <typename PostProcess>
+            requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>
+        static image read(
+            std::unique_ptr<OIIO::ImageInput> input_ptr,
+            PostProcess&& postprocess,
+            std::vector<int> channelindices,
+            int subimage = 0,
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            std::vector<std::string> channelnames{};
+
+            // Ensure we seek to the right subimage before retrieving the spec as it is subimage dependent.
+            auto res = input_ptr->seek_subimage(subimage, 0);
+            if (!res)
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "File does not have a subimage {}, cannot seek to it",
+                        subimage
+                    )
+                );
+            }
+            const auto& spec = input_ptr->spec();
+
+            for (int i : channelindices)
+            {
+                channelnames.push_back(spec.channelnames.at(i)); // at() throws std::out_of_range on a bad index
+            }
+
+            return image<T>::read(
+                std::move(input_ptr),
+                std::forward<PostProcess>(postprocess),
+                std::move(channelnames), // names come before subimage in the channelnames overload
+                subimage,
+                compression_codec,
+                compression_level,
+                block_size,
+                chunk_size
+            );
+        }
+
+
+        /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
+        ///
+        /// Requires CompressedImage to have been compiled with OpenImageIO support.
+        ///
+        /// This function reads an image file in chunks and compresses it on the fly leading to much
+        /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
+        /// that is well compressible this can easily achieve a compression ratio of 5-10x.
+        ///
+        /// The type does not have to match that of the underlying image as OpenImageIO will take
+        /// care of converting the files into the specified format. It is perfectly valid to read
+        /// a floating point image as e.g. uint16_t etc.
+        ///
+        /// This overload allows you to only extract the channels specified which is useful if you have e.g.
+        /// a multilayer file but only wish to extract the RGBA components.
+        ///
+        /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::filesystem::path filepath = "image.exr";
+        ///
+        /// auto input_ptr = OIIO::ImageInput::open(filepath);
+        /// if (!input_ptr)
+        /// {
+        /// 	throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string()));
+        /// }
+        ///
+        /// auto img = compressed::image<uint8_t>::read(std::move(input_ptr), {"R", "G", "B", "A"});
+        /// \endcode
+        ///
+        /// \param input_ptr The opened OIIO input pointer.
+        /// \param channelnames The channels you wish to extract. These may be specified in any order. We throw a
+        ///						std::out_of_range if one of the passed channels does not exist. It is perfectly valid
+        ///						to e.g. call this with {"G", "R", "A"} when the underlying channel structure may be
+        ///						RGBA. Sorting these back into their underlying channel structure is done on read.
+        /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
+        /// \param compression_codec The compression codec to use (default: LZ4).
+        /// \param compression_level The compression level (default: 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///					  comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle
+        ///					  larger blocks feel free to up this number.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel.
+        ///					  This should be tweaked to be no larger than the size of the usual images you are expecting
+        ///					  to compress for optimal performance but this could be upped which might give better compression
+        ///					  ratios. Must be a multiple of sizeof(T).
+        /// \return A compressed image instance.
+        static image read(
+            std::unique_ptr<OIIO::ImageInput> input_ptr,
+            std::vector<std::string> channelnames,
+            int subimage = 0,
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            return image<T>::read_impl(
+                std::move(input_ptr),
+                std::move(channelnames),
+                std::nullopt, // no post-process callback for this overload
+                subimage,
+                compression_codec,
+                compression_level,
+                block_size,
+                chunk_size
+            );
+        }
+
+
+        /// \brief Reads a compressed image from a file using OpenImageIO and compresses it during reading.
+        ///
+        /// Requires CompressedImage to have been compiled with OpenImageIO support.
+        ///
+        /// This function reads an image file in chunks and compresses it on the fly leading to much
+        /// lower memory usage at near-identical performance to raw OpenImageIO reads. On an image
+        /// that is well compressible this can easily achieve a compression ratio of 5-10x.
+        ///
+        /// The type does not have to match that of the underlying image as OpenImageIO will take
+        /// care of converting the files into the specified format. It is perfectly valid to read
+        /// a floating point image as e.g. uint16_t etc.
+        ///
+        /// This overload allows you to only extract the channels specified which is useful if you have e.g.
+        /// a multilayer file but only wish to extract the RGBA components.
+        ///
+        /// We will internally take care of optimizing the calls to the OpenImageIO API for maximum read throughput.
+        ///
+        /// This function allows you to specify a custom invocable function which is executed after a chunk has been read
+        /// and before it is compressed. If you have some common operations like color management or a filter which you
+        /// wish to apply this would go in here.
+        /// Specifying these right away in the read is much more efficient than iterating over the image again later and
+        /// applying these.
+        ///
+        /// The function passed should have no notion of coordinates or similar, it should simply assume to receive a block
+        /// of data (that is part of an image) as well as the channel index we are currently operating over.
+        ///
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::filesystem::path filepath = "image.exr";
+        ///
+        /// auto input_ptr = OIIO::ImageInput::open(filepath);
+        /// if (!input_ptr)
+        /// {
+        ///     throw std::runtime_error(std::format("file {} does not exist on disk", filepath.string()));
+        /// }
+        ///
+        /// // Adds 1 to every pixel value of the first three channels (indices 0, 1, 2), later channels are untouched.
+        /// auto postprocess = [](size_t channel_idx, std::span<uint8_t> chunk)
+        /// {
+        ///     if (channel_idx > 2)
+        ///     {
+        ///         return;
+        ///     }
+        ///
+        ///     std::for_each(std::execution::par_unseq, chunk.begin(), chunk.end(), [](uint8_t& value)
+        ///     {
+        ///         value += 1;
+        ///     });
+        /// };
+        ///
+        /// // Read the RGBA channels of the image file and apply the post-process while doing so.
+        /// auto img = compressed::image<uint8_t>::read(
+        ///     std::move(input_ptr),
+        ///     postprocess,
+        ///     { "R", "G", "B", "A" }, // only read the RGBA channels
+        ///     0, // subimage
+        ///     compressed::enums::codec::lz4,
+        ///     5
+        /// );
+        /// \endcode
+        ///
+        /// \tparam PostProcess The type of the callable, it must be invocable with a `size_t` and a `std::span<T>`.
+        ///
+        /// \param input_ptr The opened OIIO input pointer.
+        /// \param postprocess A postprocessing function to run after read but before re-compression. This function should
+        ///                    take a `size_t` and a `std::span<T>` where the `size_t` is the channel index we are currently
+        ///                    iterating over (e.g. 3 for the alpha channel) and the `std::span<T>` is a chunk within that
+        ///                    channel, where this chunk is and what coordinates it represents is not passed along.
+        /// \param channelnames The channels you wish to extract. These may be specified in any order. We throw a
+        ///                     std::out_of_range if one of the passed channels does not exist. It is perfectly valid
+        ///                     to e.g. call this with {"G", "R", "A"} when the underlying channel structure may be
+        ///                     RGBA. Sorting these back into their underlying channel structure is done on read.
+        /// \param subimage The subimage to extract the channels from (default: 0). Only relevant for multi-part images.
+        /// \param compression_codec The compression codec to use (default: LZ4).
+        /// \param compression_level The compression level (default: 9).
+        /// \param block_size The size of the blocks stored inside the chunks, defaults to 32KB which is enough to
+        ///                   comfortably fit into the L1 cache of most modern CPUs. If you know your cpu can handle
+        ///                   larger blocks feel free to up this number.
+        /// \param chunk_size The size of each individual chunk, defaults to 4MB which is enough to hold a 2048x2048 channel.
+        ///                   This should be tweaked to be no larger than the size of the usual images you are expecting
+        ///                   to compress for optimal performance but this could be upped which might give better compression
+        ///                   ratios. Must be a multiple of sizeof(T).
+        /// \return A compressed image instance.
+        template <typename PostProcess>
+            requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>
+        static image read(
+            std::unique_ptr<OIIO::ImageInput> input_ptr,
+            PostProcess&& postprocess,
+            std::vector<std::string> channelnames,
+            int subimage = 0,
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            // Thin forwarding overload: ownership of the input pointer and the channel names is handed over to
+            // read_impl(). Note that read_impl() takes the callable *after* the channel names, unlike this overload.
+            // The callable is forwarded so that the value category the caller passed it with is preserved.
+            return image<T>::read_impl(
+                std::move(input_ptr),
+                std::move(channelnames),
+                std::forward<PostProcess>(postprocess),
+                subimage,
+                compression_codec,
+                compression_level,
+                block_size,
+                chunk_size
+            );
+        }
+
+
+        /// \brief Convert the extra attributes of an OpenImageIO image spec into a json representation.
+        ///
+        /// \param spec The image spec whose `extra_attribs` get converted.
+        /// \return The metadata encoded as json. This does not recursively parse jsons!
+        static json_ordered read_oiio_metadata(const OIIO::ImageSpec& spec)
+        {
+            const auto& attributes = spec.extra_attribs;
+            return detail::param_value::to_json(attributes);
+        }
+
+        /// \brief Open the given file with OpenImageIO and convert the extra attributes of its spec into json.
+        ///
+        /// \param filepath The image file to read the metadata from.
+        ///
+        /// \throws std::invalid_argument if OpenImageIO could not open the file, this is reported as the file not
+        ///                               existing on disk.
+        ///
+        /// \return The metadata encoded as json. This does not recursively parse jsons!
+        static json_ordered read_oiio_metadata(std::filesystem::path filepath)
+        {
+            // Happy path first: a valid input gives us access to the spec and with that its attributes.
+            if (auto image_input = OIIO::ImageInput::open(filepath); image_input)
+            {
+                return detail::param_value::to_json(image_input->spec().extra_attribs);
+            }
+
+            throw std::invalid_argument(std::format("File {} does not exist on disk", filepath.string()));
+        }
 
 
 #endif // COMPRESSED_IMAGE_OIIO_AVAILABLE
 
-		/// Adds a compressed channel to the image.
-		/// 
-		/// This method moves the provided channel into the image's internal storage, adding it to the list of channels.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// compressed::channel<uint8_t, BlockSize, ChunkSize> channel = ...;
-		/// my_image.add_channel(std::move(channel));
-		/// \endcode
-		/// 
-		/// \param _channel The channel to be added to the image.
-		/// \param name (Optional) Channel name of the channel to be inserted. If no channel names are set this argument is ignored.
-		void add_channel(compressed::channel<T> _channel, std::optional<std::string> name = std::nullopt)
-		{
-			if (_channel.width() != this->width())
-			{
-				throw std::invalid_argument(
-					std::format(
-						"Cannot add channel '{}' to the image as its width does not match that of the image."
-						" Expected {:L} pixels but instead got {:L} pixels",
-						name.value_or(""),
-						this->width(), _channel.width()
-					)
-				);
-			}
-			if (_channel.height() != this->height())
-			{
-				throw std::invalid_argument(
-					std::format(
-						"Cannot add channel '{}' to the image as its height does not match that of the image."
-						" Expected {:L} pixels but instead got {:L} pixels",
-						name.value_or(""),
-						this->height(), _channel.height()
-					)
-				);
-			}
-
-			if (name.has_value() && m_ChannelNames.size() == m_Channels.size())
-			{
-				m_ChannelNames.push_back(name.value());
-			}
-			else if (m_ChannelNames.size() > 0)
-			{
-				m_ChannelNames.push_back(name.value_or(""));
-			}
-
-			m_Channels.push_back(std::move(_channel));
-		}
-
-		/// Adds a channel to the image.
-		/// 
-		/// This method moves the provided channel into the image's internal storage, compressing it and adding it to the list of channels.
-		/// 
-		/// Example:
-		/// \code{.cpp}
-		/// std::span<constT> channel = ...;
-		/// my_image.add_channel(channel, 1920, 1080, "red"));
-		/// \endcode
-		/// 
-		/// \param data The channel to be added to the image.
-		/// \param width The width of the channel
-		/// \param height The height of the channel
-		/// \param name (Optional) Channel name of the channel to be inserted. If no channel names are set this argument is ignored.
-		/// \param compression_codec (Optional) Compression codec to apply to the channel, every channel is allowed to have a different one.
-		/// \param compression_level (Optional) Compression level, defaults to 5.
-		void add_channel(
-			std::span<const T> data, 
-			size_t width,
-			size_t height,
-			std::optional<std::string> name = std::nullopt,
-			enums::codec compression_codec = enums::codec::lz4,
-			uint8_t compression_level = 5
-		)
-		{
-			if (width != this->width())
-			{
-				throw std::invalid_argument(
-					std::format(
-						"Cannot add channel '{}' to the image as its width does not match that of the image."
-						" Expected {:L} pixels but instead got {:L} pixels",
-						name.value_or(""),
-						width, this->width()
-					)
-				);
-			}
-			if (height != this->height())
-			{
-				throw std::invalid_argument(
-					std::format(
-						"Cannot add channel '{}' to the image as its height does not match that of the image."
-						" Expected {:L} pixels but instead got {:L} pixels",
-						name.value_or(""),
-						height, this->height()
-					)
-				);
-			}
-
-			if (name.has_value() && m_ChannelNames.size() == m_Channels.size())
-			{
-				m_ChannelNames.push_back(name.value());
-			}
-			else if (m_ChannelNames.size() > 0)
-			{
-				m_ChannelNames.push_back(name.value_or(""));
-			}
-
-			m_Channels.push_back(compressed::channel(
-				std::span<const T>(data.begin(), data.end()),
-				width,
-				height,
-				compression_codec,
-				compression_level
-			));
-		}
-
-
-		/// Remove a channel by its index.
-		/// 
-		/// \param index The index of the channel to remove.
-		/// \throws std::out_of_range if the index is out of bounds.
-		void remove_channel(size_t index)
-		{
-			// Extract the channel and let it exit the scope to destruct
-			auto channel = this->extract_channel(index);
-		}
-
-		/// Remove a channel by its name.
-		/// 
-		/// \param name The name of the channel to remove.
-		/// \throws std::out_of_range if the channel name is invalid.
-		void remove_channel(const std::string_view name)
-		{
-			// Extract the channel and let it exit the scope to destruct
-			auto channel = this->extract_channel(name);
-		}
-
-		/// Extracts a channel by its index.
-		/// 
-		/// Remove the channel from the image and gives you full control over the channel. Also erases
-		/// its channel name.
-		/// 
-		/// \param index The index of the channel to retrieve.
-		/// \return The channel object.
-		/// \throws std::out_of_range if the index is out of bounds.
-		compressed::channel<T> extract_channel(size_t index)
-		{
-			if (index >= m_Channels.size())
-			{
-				throw std::out_of_range("Channel index out of range");
-			}
-			auto ret = std::move(m_Channels[index]);
-
-			m_Channels.erase(m_Channels.begin() + index);
-			m_ChannelNames.erase(m_ChannelNames.begin() + index);
-
-			return std::move(ret);
-		}
-
-		/// Extracts a channel by its name.
-		/// 
-		/// Remove the channel from the image and gives you full control over the channel. Also erases
-		/// its channel name.
-		/// 
-		/// \param name The name of the channel to retrieve.
-		/// \return The channel object.
-		/// \throws std::out_of_range if the channel name is invalid.
-		compressed::channel<T> extract_channel(const std::string_view name)
-		{
-			size_t index = get_channel_offset(name);
-			return extract_channel(index);
-		}
-
-		/// \brief Prints statistical information about the image file structure.
-		/// 
-		/// This function outputs various details about the compressed image, 
-		/// including dimensions, number of channels, compression ratio, and metadata.
-		/// 
-		/// Example output:
-		/// 
-		///		Statistics for image buffer:
-		///		 Width:             1024
-		///		 Height:            768
-		///		 Channels:          3
-		///		 Channelnames:      [R, G, B]
-		///		 --------------
-		///		 Compressed Size:   123456 bytes
-		///		 Uncompressed Size: 3145728 bytes
-		///		 Compression ratio: 25.5x
-		///		 Num Chunks:        512
-		///		 Metadata:
-		///		 {
-		///		    "author": "User",
-		///		    "timestamp": "2024-03-15"
-		///		 }
-		void print_statistics()
-		{
-			size_t compressed_size = 0;
-			size_t uncompressed_size = 0;
-			size_t num_chunks = 0;
-			for (const auto& channel : m_Channels)
-			{
-				compressed_size += channel.compressed_bytes();
-				uncompressed_size += channel.uncompressed_size();
-				num_chunks += channel.num_chunks();
-			}
-
-			std::cout << "Statistics for image buffer:" << std::endl;
-			std::cout << " Width:             " << m_Width << std::endl;
-			std::cout << " Height:            " << m_Height << std::endl;
-			std::cout << " Channels:          " << m_Channels.size() << std::endl;
-			std::cout << " Channelnames:      [";
-
-			for (size_t i = 0; i < m_ChannelNames.size(); ++i)
-			{
-				std::cout << m_ChannelNames[i];
-				if (i < m_ChannelNames.size() - 1)
-				{
-					std::cout << ", ";
-				}
-			}
-
-			std::cout << "]" << std::endl;
-			std::cout << " --------------     " << std::endl;
-			std::cout << " Compressed Size:   " << compressed_size << std::endl;
-			std::cout << " Uncompressed Size: " << uncompressed_size << std::endl;
-			std::cout << " Compression ratio: " << static_cast<double>(uncompressed_size) / compressed_size << "x" << std::endl;
-			std::cout << " Num Chunks:        " << num_chunks << std::endl;
-			std::cout << " Metadata:          " << "\n " << m_Metadata.dump(4) << std::endl;
-		}
-
-
-		/// Return the compression ratio over all channels.
-		double compression_ratio() const noexcept
-		{
-			size_t total_uncompressed = 1;
-			size_t total_compressed = 1;
-			for (const auto& channel : m_Channels)
-			{
-				total_compressed += channel.compressed_bytes();
-				total_uncompressed += channel.uncompressed_size();
-			}
-			return static_cast<double>(total_uncompressed) / total_compressed;
-		}
-
-
-		// ---------------------------------------------------------------------------------------------------------------------
-		// Iterators
-		// ---------------------------------------------------------------------------------------------------------------------
-
-		auto begin() noexcept { return m_Channels.begin(); }
-		auto begin() const noexcept { return m_Channels.begin(); }
-		auto end() noexcept { return m_Channels.end(); }
-		auto end() const noexcept { return m_Channels.end(); }
-
-		
-		// ---------------------------------------------------------------------------------------------------------------------
-		// Accessors
-		// ---------------------------------------------------------------------------------------------------------------------
-
-		/// Retrieves a reference to a channel by its index.
-		/// 
-		/// \param index The index of the channel to retrieve.
-		/// \return A reference to the requested channel.
-		/// \throws std::out_of_range if the index is out of bounds.
-		compressed::channel<T>& channel(size_t index)
-		{
-			if (index >= m_Channels.size())
-			{
-				throw std::out_of_range("Channel index out of range");
-			}
-			return m_Channels[index];
-		}
-
-		/// Retrieves a reference to a channel by its name.
-		/// 
-		/// \param name The name of the channel to retrieve.
-		/// \return A reference to the requested channel.
-		/// \throws std::out_of_range if the channel name is invalid.
-		compressed::channel<T>& channel(const std::string_view name)
-		{
-			size_t index = get_channel_offset(name);
-			return m_Channels[index];
-		}
-
-		/// Retrieves references to multiple channels by name and returns them as a tuple.
-		/// 
-		/// Can be used with structured bindings to quickly get the specified channels from an image.
-		/// These are returned as references (but don't have to be bound as such)
-		/// 
-		/// Example:
-		/// 
-		/// \code{.cpp}
-		/// compressed::image my_image = ...;
-		/// auto [r, g, b] = my_image.channels("r", "g", "b");
-		/// \endcode
-		/// 
-		/// \tparam Args Variadic template arguments, each convertible to std::string.
-		/// \param channel_names The names of the channels to retrieve.
-		/// \return A tuple containing references to the requested channels.
-		template <typename... Args>
-			requires (std::conjunction_v<std::is_constructible<std::string, Args>...>)
-		auto channels(Args... channel_names)
-		{
-			return std::tie(this->channel(std::forward<Args>(channel_names))...);
-		}
-
-		/// Retrieves references to multiple channels by index and returns them as a tuple.
-		/// 
-		/// Can be used with structured bindings to quickly get the specified channels from an image.
-		/// These are returned as references (but don't have to be bound as such)
-		/// 
-		/// Example:
-		/// 
-		/// \code{.cpp}
-		/// compressed::image my_image = ...;
-		/// auto [r, g, b] = my_image.channels(0, 1, 2);
-		/// \endcode
-		/// 
-		/// \tparam Args Variadic template arguments, each convertible to size_t.
-		/// \param channel_indices The indices of the channels to get
-		/// \return A tuple containing references to the requested channels.
-		template <typename... Args>
-			requires (std::conjunction_v<std::is_convertible<size_t, Args>...>)
-		auto channels(Args... channel_indices)
-		{
-			return std::tie(this->channel(std::forward<Args>(channel_indices))...);
-		}
-
-		/// Retrieves references to multiple channels their indices and returns them in a vector.
-		/// 
-		/// \param channel_indices A vector of channel indices.
-		/// \return A vector containing references to the requested channels.
-		/// \throws std::out_of_range if any channel indec is invalid.
-		std::vector<compressed::channel<T>&> channels(std::vector<size_t> channel_indices)
-		{
-			std::vector<compressed::channel<T>> result{};
-			for (const auto& index : channel_indices)
-			{
-				result.append(this->channel(index));
-			}
-			return result;
-		}
-
-		/// Retrieves references to multiple channels by name and returns them in a vector.
-		///  
-		/// \param channel_names A vector of channel names.
-		/// \return A vector containing references to the requested channels.
-		/// \throws std::out_of_range if any channel name is invalid.
-		std::vector<compressed::channel<T>&> channels(std::vector<std::string> channel_names)
-		{
-			std::vector<compressed::channel<T>> result{};
-			for (const auto& name : channel_names)
-			{
-				result.append(this->channel(name));
-			}
-			return result;
-		}
-
-		/// Retrieves references to all of the channels in the image
-		/// 
-		/// \return A vector containing references to the all the channels.
-		std::vector<compressed::channel<T>>& channels()
-		{
-			return m_Channels;
-		}
-
-		/// Retrieves const references to all of the channels in the image
-		/// 
-		/// \return A vector containing references to the all the channels.
-		const std::vector<compressed::channel<T>>& channels() const
-		{
-			return m_Channels;
-		}
-
-		/// Decompress all of the channels and return them in planar fashion.
-		/// 
-		/// Each channel's decompressed data is stored as a separate vector.
-		/// 
-		/// \return A vector of decompressed channel data, where each inner vector corresponds to a channel.
-		std::vector<std::vector<T>> get_decompressed() const
-		{
-			std::vector<std::vector<T>> result{};
-			for (const auto& channel : m_Channels)
-			{
-				result.push_back(channel.get_decompressed());
-			}
-			return result;
-		}
-
-
-		/// Retrieve the logical index of the given channel.
-		/// 
-		/// This function searches for the specified channel name in the list of available channels.
-		/// If the channel is not found, it throws a `std::invalid_argument`.
-		/// 
-		/// \param channelname The name of the channel to search for.
-		/// \return The index of the channel if found.
-		/// \throws std::invalid_argument if the channel is not available.
-		size_t get_channel_offset(const std::string_view channelname) const
-		{
-			for (size_t i = 0; i < m_ChannelNames.size(); ++i)
-			{
-				if (m_ChannelNames[i] == channelname)
-				{
-					return i;
-				}
-			}
-			throw std::invalid_argument(std::format("Unknown channelname '{}' encountered", channelname));
-		}
-
-		/// Width of the Image
-		size_t width() const noexcept
-		{
-			return m_Width;
-		}
-
-		/// Height of the image
-		size_t height() const noexcept
-		{
-			return m_Height;
-		}
-
-		/// Total number of channels in the image
-		size_t num_channels() const noexcept
-		{
-			return m_Channels.size();
-		}
-
-		/// Names of the channels stored on the image, are stored in the same order as the logical indices. So if the channelnames
-		/// are { "B", "G", "R" } accessing channel "R" would be index 2.
-		std::vector<std::string> channelnames() const noexcept
-		{
-			return m_ChannelNames;
-		}
-
-		/// Set the channelnames according to their logical indices, 
-		void channelnames(std::vector<std::string> _channelnames) 
-		{
-			if (_channelnames.size() != m_Channels.size())
-			{
-				throw std::invalid_argument(std::format(
-					"Invalid number of arguments received for setting channelnames. Expected vector size to be exactly {} but instead got {}", 
-					m_Channels.size(),
-					_channelnames.size()
-				).c_str()
-				);
-			}
-			m_ChannelNames = _channelnames;
-		}
-
-		/// Arbitrary user metadata, not authored or managed by image class, it's up to the caller to handle what goes in and comes out
-		void metadata(const json_ordered& _metadata) noexcept
-		{
-			m_Metadata = _metadata;
-		}
-
-		/// Arbitrary user metadata, not authored or managed by the image class, it's up to the caller to handle what goes in and comes out
-		json_ordered& metadata() noexcept
-		{
-			return m_Metadata;
-		}
-
-		/// Arbitrary user metadata, not authored or managed by image class, it's up to the caller to handle what goes in and comes out
-		const json_ordered& metadata() const noexcept
-		{
-			return m_Metadata;
-		}
-
-		/// Update the number of threads used internally by c-blosc2 for compression and decompression.
-		/// This is automatically set when iterating through the images with compressed::for_each for example
-		/// by specifying the compression codec.
-		void update_nthreads(size_t nthreads)
-		{
-			for (auto& channel : m_Channels)
-			{
-				channel.update_nthreads(nthreads);
-			}
-		}
-
-		/// \brief Get the chunk size used for compression, this is the same across all channels.
-		/// 
-		/// \throws std::runtime_error If the channels of the image do not all share the same chunk size as this is 
-		///							   currently unsupported.
-		/// 
-		/// \return The chunk size in bytes.
-		size_t chunk_size() const
-		{
-			size_t chunk_size = 0;
-			for (const auto& channel : m_Channels)
-			{
-				if (chunk_size != 0 && channel.chunk_size() != chunk_size)
-				{
-					throw std::runtime_error(
-						"Validation Error: Channels in image do not all have the same chunk size. This is currently"
-						" unsupported."
-					);
-				}
-				chunk_size = channel.chunk_size();
-			}
-			return chunk_size;
-		}
-
-		size_t block_size() const
-		{
-			size_t block_size = 0;
-			for (const auto& channel : m_Channels)
-			{
-				if (block_size != 0 && channel.block_size() != block_size)
-				{
-					throw std::runtime_error(
-						"Validation Error: Channels in image do not all have the same block size. This is currently"
-						" unsupported."
-					);
-				}
-				block_size = channel.block_size();
-			}
-			return block_size;
-		}
-
-	private:
-		/// All the channels, each holding their own decompression and compression context.
-		std::vector<compressed::channel<T>> m_Channels{};
-
-		/// Arbitrary user metadata, not authored or managed by us, it's up to the caller to handle what goes in and comes out
-		json_ordered m_Metadata{};
-
-		/// Optional set of channelnames to associate to the channels. If not specified sensible defaults are chosen. For example,
-		/// if 3 channels are provided we default to { "R", "G", "B" }
-		std::vector<std::string> m_ChannelNames{};
-
-		/// The width of the image file
-		size_t m_Width = 1;
-
-		/// The height of the image file
-		size_t m_Height = 1;
-
-	private:
-
-
-// Implementations for the read() functions.
-// -----------------------------------------------------------------------------------
-// -----------------------------------------------------------------------------------
+        /// Adds an already compressed channel to the image.
+        ///
+        /// The channel is taken by value and moved into the image's internal channel storage. Its dimensions have
+        /// to match those of the image.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// compressed::channel<uint8_t> channel = ...;
+        /// my_image.add_channel(std::move(channel));
+        /// \endcode
+        ///
+        /// \param _channel The channel to be added to the image.
+        /// \param name (Optional) Channel name of the channel to be inserted. It is stored if the image currently holds
+        ///             exactly one name per channel. If the image holds names but none was passed (or the names are
+        ///             out of sync with the channels) `name` or an empty string is appended. If the image holds
+        ///             channels but no names at all this argument is ignored.
+        /// \throws std::invalid_argument if the width or height of the channel does not match that of the image.
+        void add_channel(compressed::channel<T> _channel, std::optional<std::string> name = std::nullopt)
+        {
+            const auto channel_width = _channel.width();
+            if (channel_width != this->width())
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "Cannot add channel '{}' to the image as its width does not match that of the image."
+                        " Expected {:L} pixels but instead got {:L} pixels",
+                        name.value_or(""),
+                        this->width(),
+                        channel_width
+                    )
+                );
+            }
+
+            const auto channel_height = _channel.height();
+            if (channel_height != this->height())
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "Cannot add channel '{}' to the image as its height does not match that of the image."
+                        " Expected {:L} pixels but instead got {:L} pixels",
+                        name.value_or(""),
+                        this->height(),
+                        channel_height
+                    )
+                );
+            }
+
+            // Only record the name as-is while there is exactly one name per channel, otherwise keep padding an
+            // already existing list of names and leave an empty list untouched.
+            const bool names_in_sync = m_ChannelNames.size() == m_Channels.size();
+            if (names_in_sync && name.has_value())
+            {
+                m_ChannelNames.push_back(*name);
+            }
+            else if (!m_ChannelNames.empty())
+            {
+                m_ChannelNames.push_back(name.value_or(""));
+            }
+
+            m_Channels.push_back(std::move(_channel));
+        }
+
+        /// Adds a channel to the image.
+        ///
+        /// This method compresses the provided (uncompressed) pixel data into a new channel and appends that channel
+        /// to the image's internal storage. The passed data itself is only read.
+        ///
+        /// Example:
+        /// \code{.cpp}
+        /// std::span<const T> channel = ...;
+        /// my_image.add_channel(channel, 1920, 1080, "red");
+        /// \endcode
+        ///
+        /// \param data The channel to be added to the image.
+        /// \param width The width of the channel
+        /// \param height The height of the channel
+        /// \param name (Optional) Channel name of the channel to be inserted. It is stored if the image currently holds
+        ///             exactly one name per channel. If the image holds names but none was passed (or the names are
+        ///             out of sync with the channels) `name` or an empty string is appended. If the image holds
+        ///             channels but no names at all this argument is ignored.
+        /// \param compression_codec (Optional) Compression codec to apply to the channel, every channel is allowed to have a different one.
+        /// \param compression_level (Optional) Compression level, defaults to 5.
+        /// \throws std::invalid_argument if `width` or `height` does not match that of the image.
+        void add_channel(
+            std::span<const T> data,
+            size_t width,
+            size_t height,
+            std::optional<std::string> name = std::nullopt,
+            enums::codec compression_codec = enums::codec::lz4,
+            uint8_t compression_level = 5
+        )
+        {
+            // The image dictates the expected size, the passed width/height is what we actually received. This
+            // argument order matches the overload taking an already compressed channel.
+            if (width != this->width())
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "Cannot add channel '{}' to the image as its width does not match that of the image."
+                        " Expected {:L} pixels but instead got {:L} pixels",
+                        name.value_or(""),
+                        this->width(),
+                        width
+                    )
+                );
+            }
+            if (height != this->height())
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "Cannot add channel '{}' to the image as its height does not match that of the image."
+                        " Expected {:L} pixels but instead got {:L} pixels",
+                        name.value_or(""),
+                        this->height(),
+                        height
+                    )
+                );
+            }
+
+            // Compress before touching any member: if constructing the channel throws, the channel names and the
+            // channels must not go out of sync.
+            auto compressed_channel = compressed::channel(
+                std::span<const T>(data.begin(), data.end()),
+                width,
+                height,
+                compression_codec,
+                compression_level
+            );
+
+            if (name.has_value() && m_ChannelNames.size() == m_Channels.size())
+            {
+                m_ChannelNames.push_back(std::move(name).value());
+            }
+            else if (!m_ChannelNames.empty())
+            {
+                m_ChannelNames.push_back(std::move(name).value_or(""));
+            }
+
+            m_Channels.push_back(std::move(compressed_channel));
+        }
+
+
+        /// Drop the channel at the given index from the image.
+        ///
+        /// \param index The index of the channel to remove.
+        /// \throws std::out_of_range if the index is out of bounds.
+        void remove_channel(size_t index)
+        {
+            // Removal is an extraction whose result is thrown away, the temporary destructs immediately.
+            static_cast<void>(this->extract_channel(index));
+        }
+
+        /// Drop the channel with the given name from the image.
+        ///
+        /// \param name The name of the channel to remove.
+        /// \throws std::out_of_range if the channel name is invalid.
+        ///         NOTE(review): the name lookup goes through get_channel_offset(), confirm which exception type it
+        ///         raises for an unknown name.
+        void remove_channel(const std::string_view name)
+        {
+            // Removal is an extraction whose result is thrown away, the temporary destructs immediately.
+            static_cast<void>(this->extract_channel(name));
+        }
+
+        /// Extracts a channel by its index.
+        ///
+        /// Remove the channel from the image and gives you full control over the channel. Also erases
+        /// its channel name if the image holds a name at that index.
+        ///
+        /// \param index The index of the channel to retrieve.
+        /// \return The channel object.
+        /// \throws std::out_of_range if the index is out of bounds.
+        compressed::channel<T> extract_channel(size_t index)
+        {
+            if (index >= m_Channels.size())
+            {
+                throw std::out_of_range("Channel index out of range");
+            }
+            auto extracted = std::move(m_Channels[index]);
+            m_Channels.erase(m_Channels.begin() + index);
+
+            // Channel names are optional and the list may hold fewer entries than there are channels (it may even
+            // be empty). Erasing past its end would be undefined behaviour so only erase a name that exists.
+            if (index < m_ChannelNames.size())
+            {
+                m_ChannelNames.erase(m_ChannelNames.begin() + index);
+            }
+
+            // Returned by name on purpose: wrapping this in std::move would only inhibit copy elision.
+            return extracted;
+        }
+
+        /// Extracts the channel registered under the given name.
+        ///
+        /// The channel is taken out of the image and returned by value, giving the caller full control over it.
+        /// Its channel name is erased from the image as well.
+        ///
+        /// \param name The name of the channel to extract.
+        /// \return The extracted channel object.
+        /// \throws std::invalid_argument if no channel with that name is known (raised by `get_channel_offset`).
+        compressed::channel<T> extract_channel(const std::string_view name)
+        {
+            // Resolve the name to its logical index and defer to the index-based overload.
+            return this->extract_channel(this->get_channel_offset(name));
+        }
+
+        /// \brief Prints statistical information about the image file structure.
+        ///
+        /// Writes the dimensions, the number of channels and their names, the summed compressed and uncompressed
+        /// sizes, the resulting compression ratio, the number of chunks and the user metadata to `std::cout`.
+        /// The function does not modify the image and can therefore be called on const images.
+        ///
+        /// Example output:
+        ///
+        ///		Statistics for image buffer:
+        ///		 Width:             1024
+        ///		 Height:            768
+        ///		 Channels:          3
+        ///		 Channelnames:      [R, G, B]
+        ///		 --------------
+        ///		 Compressed Size:   123456
+        ///		 Uncompressed Size: 3145728
+        ///		 Compression ratio: 25.5x
+        ///		 Num Chunks:        512
+        ///		 Metadata:
+        ///		 {
+        ///		    "author": "User",
+        ///		    "timestamp": "2024-03-15"
+        ///		 }
+        void print_statistics() const
+        {
+            size_t compressed_size = 0;
+            size_t uncompressed_size = 0;
+            size_t num_chunks = 0;
+            for (const auto& channel : m_Channels)
+            {
+                compressed_size += channel.compressed_bytes();
+                uncompressed_size += channel.uncompressed_size();
+                num_chunks += channel.num_chunks();
+            }
+
+            // Without any compressed data there is no meaningful ratio; report 0 rather than dividing by zero
+            // (which would print inf/nan).
+            double ratio = 0.0;
+            if (compressed_size != 0)
+            {
+                ratio = static_cast<double>(uncompressed_size) / static_cast<double>(compressed_size);
+            }
+
+            // Intermediate lines end in '\n'; the stream is flushed once by the final std::endl.
+            std::cout << "Statistics for image buffer:" << '\n';
+            std::cout << " Width:             " << m_Width << '\n';
+            std::cout << " Height:            " << m_Height << '\n';
+            std::cout << " Channels:          " << m_Channels.size() << '\n';
+            std::cout << " Channelnames:      [";
+
+            for (size_t i = 0; i < m_ChannelNames.size(); ++i)
+            {
+                // Separator goes in front of every name but the first, which avoids `size() - 1` arithmetic.
+                if (i != 0)
+                {
+                    std::cout << ", ";
+                }
+                std::cout << m_ChannelNames[i];
+            }
+
+            std::cout << "]" << '\n';
+            std::cout << " --------------     " << '\n';
+            std::cout << " Compressed Size:   " << compressed_size << '\n';
+            std::cout << " Uncompressed Size: " << uncompressed_size << '\n';
+            std::cout << " Compression ratio: " << ratio << "x" << '\n';
+            std::cout << " Num Chunks:        " << num_chunks << '\n';
+            std::cout << " Metadata:          " << "\n " << m_Metadata.dump(4) << std::endl;
+        }
+
+
+        /// Return the compression ratio over all channels.
+        ///
+        /// The ratio is the summed uncompressed size of all channels divided by their summed compressed size,
+        /// i.e. the same quantity `print_statistics` reports.
+        ///
+        /// \return The compression ratio, or 1.0 if the image holds no compressed data (e.g. it has no channels).
+        double compression_ratio() const noexcept
+        {
+            size_t total_uncompressed = 0;
+            size_t total_compressed = 0;
+            for (const auto& channel : m_Channels)
+            {
+                total_compressed += channel.compressed_bytes();
+                total_uncompressed += channel.uncompressed_size();
+            }
+
+            // Guard the division explicitly instead of seeding both totals with 1, which skewed every result.
+            if (total_compressed == 0)
+            {
+                return 1.0;
+            }
+            return static_cast<double>(total_uncompressed) / static_cast<double>(total_compressed);
+        }
+
+
+        // ---------------------------------------------------------------------------------------------------------------------
+        // Iterators
+        // ---------------------------------------------------------------------------------------------------------------------
+
+        /// Mutable iterator to the first channel of the image.
+        auto begin() noexcept
+        {
+            return m_Channels.begin();
+        }
+
+        /// Read-only iterator to the first channel of the image.
+        auto begin() const noexcept
+        {
+            return m_Channels.begin();
+        }
+
+        /// Mutable past-the-end iterator over the channels of the image.
+        auto end() noexcept
+        {
+            return m_Channels.end();
+        }
+
+        /// Read-only past-the-end iterator over the channels of the image.
+        auto end() const noexcept
+        {
+            return m_Channels.end();
+        }
+
+
+        // ---------------------------------------------------------------------------------------------------------------------
+        // Accessors
+        // ---------------------------------------------------------------------------------------------------------------------
+
+        /// Access a single channel through its logical index.
+        ///
+        /// \param index The index of the channel to access.
+        /// \return A mutable reference to the channel stored at `index`.
+        /// \throws std::out_of_range if `index` is not smaller than the number of channels.
+        compressed::channel<T>& channel(size_t index)
+        {
+            const bool in_bounds = index < m_Channels.size();
+            if (in_bounds)
+            {
+                return m_Channels[index];
+            }
+            throw std::out_of_range("Channel index out of range");
+        }
+
+        /// Retrieves a reference to a channel by its name.
+        ///
+        /// \param name The name of the channel to retrieve.
+        /// \return A reference to the requested channel.
+        /// \throws std::invalid_argument if no channel with that name is known (raised by `get_channel_offset`).
+        /// \throws std::out_of_range if the name resolves to an index for which no channel is stored.
+        compressed::channel<T>& channel(const std::string_view name)
+        {
+            // Go through the index overload so the looked-up offset is bounds-checked against the channels
+            // rather than used for unchecked indexing.
+            return this->channel(this->get_channel_offset(name));
+        }
+
+        /// Retrieves references to multiple channels by name and returns them as a tuple.
+        ///
+        /// Can be used with structured bindings to quickly get the specified channels from an image.
+        /// The tuple is built with `std::tie`, so its elements are lvalue references to the channels stored in the
+        /// image; each name is resolved through `channel()`.
+        ///
+        /// Example:
+        ///
+        /// \code{.cpp}
+        /// compressed::image my_image = ...;
+        /// auto [r, g, b] = my_image.channels("r", "g", "b");
+        /// \endcode
+        ///
+        /// \tparam Args Variadic template arguments; the overload only participates if a `std::string` can be
+        ///              constructed from each of them.
+        /// \param channel_names The names of the channels to retrieve.
+        /// \return A tuple containing references to the requested channels, in the order the names were passed.
+        /// \throws std::invalid_argument if any of the names is unknown (raised by `get_channel_offset`).
+        template <typename... Args>
+            requires (std::conjunction_v<std::is_constructible<std::string, Args>...>)
+        auto channels(Args... channel_names)
+        {
+            // One `channel()` lookup per name; the pack expansion keeps the caller's argument order.
+            return std::tie(this->channel(std::forward<Args>(channel_names))...);
+        }
+
+        /// Retrieves references to multiple channels by index and returns them as a tuple.
+        ///
+        /// Can be used with structured bindings to quickly get the specified channels from an image.
+        /// The tuple is built with `std::tie`, so its elements are lvalue references to the channels stored in the
+        /// image.
+        ///
+        /// Example:
+        ///
+        /// \code{.cpp}
+        /// compressed::image my_image = ...;
+        /// auto [r, g, b] = my_image.channels(0, 1, 2);
+        /// \endcode
+        ///
+        /// \tparam Args Variadic template arguments, each convertible to size_t.
+        /// \param channel_indices The indices of the channels to get
+        /// \return A tuple containing references to the requested channels, in the order the indices were passed.
+        /// \throws std::out_of_range if any of the indices is out of bounds.
+        template <typename... Args>
+            requires (std::conjunction_v<std::is_convertible<Args, size_t>...>)
+        auto channels(Args... channel_indices)
+        {
+            // `std::is_convertible<From, To>`: each argument has to convert *to* size_t (the previous constraint
+            // had the two swapped). The explicit cast makes that conversion visible and picks the index overload
+            // of `channel()`.
+            return std::tie(this->channel(static_cast<size_t>(channel_indices))...);
+        }
+
+        /// Retrieves references to multiple channels by their indices and returns them in a vector.
+        ///
+        /// `std::vector` cannot store plain references, so the channels are handed out as
+        /// `std::reference_wrapper`s. These convert implicitly to `compressed::channel<T>&`; use `.get()` to call
+        /// members directly.
+        ///
+        /// \param channel_indices A vector of channel indices.
+        /// \return A vector containing references to the requested channels, in the order of `channel_indices`.
+        /// \throws std::out_of_range if any channel index is invalid.
+        std::vector<std::reference_wrapper<compressed::channel<T>>> channels(
+            const std::vector<size_t>& channel_indices
+        )
+        {
+            std::vector<std::reference_wrapper<compressed::channel<T>>> result{};
+            result.reserve(channel_indices.size());
+            for (const auto index : channel_indices)
+            {
+                result.push_back(std::ref(this->channel(index)));
+            }
+            return result;
+        }
+
+        /// Retrieves references to multiple channels by name and returns them in a vector.
+        ///
+        /// `std::vector` cannot store plain references, so the channels are handed out as
+        /// `std::reference_wrapper`s. These convert implicitly to `compressed::channel<T>&`; use `.get()` to call
+        /// members directly.
+        ///
+        /// \param channel_names A vector of channel names.
+        /// \return A vector containing references to the requested channels, in the order of `channel_names`.
+        /// \throws std::invalid_argument if any channel name is unknown (raised by `get_channel_offset`).
+        std::vector<std::reference_wrapper<compressed::channel<T>>> channels(
+            const std::vector<std::string>& channel_names
+        )
+        {
+            std::vector<std::reference_wrapper<compressed::channel<T>>> result{};
+            result.reserve(channel_names.size());
+            for (const auto& name : channel_names)
+            {
+                result.push_back(std::ref(this->channel(name)));
+            }
+            return result;
+        }
+
+        /// Mutable access to the container holding every channel of the image.
+        ///
+        /// \return A reference to the internal vector of channels.
+        std::vector<compressed::channel<T>>& channels()
+        {
+            return this->m_Channels;
+        }
+
+        /// Read-only access to the container holding every channel of the image.
+        ///
+        /// \return A const reference to the internal vector of channels.
+        const std::vector<compressed::channel<T>>& channels() const
+        {
+            return this->m_Channels;
+        }
+
+        /// Decompress all of the channels and return them in planar fashion.
+        ///
+        /// Each channel's decompressed data is stored as a separate vector, in the same order as the channels
+        /// are stored on the image.
+        ///
+        /// \return A vector of decompressed channel data, where each inner vector corresponds to a channel.
+        std::vector<std::vector<T>> get_decompressed() const
+        {
+            std::vector<std::vector<T>> result{};
+            // The number of planes is known up front, reserve once rather than reallocating while appending.
+            result.reserve(m_Channels.size());
+            for (const auto& channel : m_Channels)
+            {
+                result.push_back(channel.get_decompressed());
+            }
+            return result;
+        }
+
+
+        /// Look up the logical index that belongs to a channel name.
+        ///
+        /// The stored channel names are scanned front to back and the position of the first name that compares
+        /// equal to `channelname` is returned.
+        ///
+        /// \param channelname The name of the channel to search for.
+        /// \return The index of the first channel carrying that name.
+        /// \throws std::invalid_argument if none of the stored names matches.
+        size_t get_channel_offset(const std::string_view channelname) const
+        {
+            size_t offset = 0;
+            for (const auto& candidate : m_ChannelNames)
+            {
+                if (candidate == channelname)
+                {
+                    return offset;
+                }
+                ++offset;
+            }
+            throw std::invalid_argument(std::format("Unknown channelname '{}' encountered", channelname));
+        }
+
+        /// \return The width stored for the image.
+        size_t width() const noexcept
+        {
+            return this->m_Width;
+        }
+
+        /// \return The height stored for the image.
+        size_t height() const noexcept
+        {
+            return this->m_Height;
+        }
+
+        /// \return How many channels the image currently holds.
+        size_t num_channels() const noexcept
+        {
+            const size_t count = m_Channels.size();
+            return count;
+        }
+
+        /// Names of the channels stored on the image. These are stored in the same order as the logical indices,
+        /// so if the channelnames are { "B", "G", "R" } accessing channel "R" would be index 2.
+        ///
+        /// The names are returned as a copy. Copying allocates and may therefore throw (e.g. `std::bad_alloc`),
+        /// which is why this accessor is deliberately not marked `noexcept`.
+        ///
+        /// \return A copy of the channel names.
+        std::vector<std::string> channelnames() const
+        {
+            return m_ChannelNames;
+        }
+
+        /// Set the channelnames according to their logical indices.
+        ///
+        /// \param _channelnames The new names, exactly one per channel and ordered like the channels.
+        /// \throws std::invalid_argument if the number of names differs from the number of channels.
+        void channelnames(std::vector<std::string> _channelnames)
+        {
+            if (_channelnames.size() != m_Channels.size())
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "Invalid number of arguments received for setting channelnames. Expected vector size to be exactly {} but instead got {}",
+                        m_Channels.size(),
+                        _channelnames.size()
+                    )
+                );
+            }
+            // The parameter is taken by value as a sink, so move it into place instead of copying a second time.
+            m_ChannelNames = std::move(_channelnames);
+        }
+
+        /// Replace the arbitrary user metadata. It is not authored or managed by the image class; it's up to the
+        /// caller to handle what goes in and comes out.
+        ///
+        /// The metadata is copied into the image. Copying allocates and may therefore throw, which is why this
+        /// setter is deliberately not marked `noexcept`.
+        ///
+        /// \param _metadata The metadata to store on the image.
+        void metadata(const json_ordered& _metadata)
+        {
+            m_Metadata = _metadata;
+        }
+
+        /// Mutable access to the user metadata. The image class neither authors nor manages its contents, that is
+        /// entirely up to the caller.
+        ///
+        /// \return A reference to the stored metadata.
+        json_ordered& metadata() noexcept
+        {
+            return this->m_Metadata;
+        }
+
+        /// Read-only access to the user metadata. The image class neither authors nor manages its contents, that
+        /// is entirely up to the caller.
+        ///
+        /// \return A const reference to the stored metadata.
+        const json_ordered& metadata() const noexcept
+        {
+            return this->m_Metadata;
+        }
+
+        /// Forward a new thread count to every channel of the image.
+        ///
+        /// Per the original notes this is the number of threads c-blosc2 uses internally for compression and
+        /// decompression, and it is set automatically when iterating through the images with e.g.
+        /// compressed::for_each.
+        ///
+        /// \param nthreads The thread count handed to each channel's `update_nthreads`.
+        void update_nthreads(size_t nthreads)
+        {
+            for (size_t i = 0; i < m_Channels.size(); ++i)
+            {
+                m_Channels[i].update_nthreads(nthreads);
+            }
+        }
+
+        /// \brief Get the chunk size shared by all channels of the image.
+        ///
+        /// Walks over the channels and checks each channel's chunk size against that of the channel before it.
+        ///
+        /// \throws std::runtime_error If the channels of the image do not all share the same chunk size as this is
+        ///							   currently unsupported.
+        ///
+        /// \return The chunk size in bytes, or 0 if the image has no channels.
+        size_t chunk_size() const
+        {
+            size_t shared_size = 0;
+            for (const auto& current : m_Channels)
+            {
+                const size_t candidate = current.chunk_size();
+                // A value of 0 means nothing has been recorded yet, so there is nothing to compare against.
+                const bool mismatch = (shared_size != 0) && (candidate != shared_size);
+                if (mismatch)
+                {
+                    throw std::runtime_error(
+                        "Validation Error: Channels in image do not all have the same chunk size. This is currently"
+                        " unsupported."
+                    );
+                }
+                shared_size = candidate;
+            }
+            return shared_size;
+        }
+
+        /// \brief Get the block size shared by all channels of the image.
+        ///
+        /// Walks over the channels and checks each channel's block size against that of the channel before it.
+        ///
+        /// \throws std::runtime_error If the channels of the image do not all share the same block size as this is
+        ///							   currently unsupported.
+        ///
+        /// \return The shared block size, or 0 if the image has no channels.
+        size_t block_size() const
+        {
+            size_t shared_size = 0;
+            for (const auto& current : m_Channels)
+            {
+                const size_t candidate = current.block_size();
+                // A value of 0 means nothing has been recorded yet, so there is nothing to compare against.
+                const bool mismatch = (shared_size != 0) && (candidate != shared_size);
+                if (mismatch)
+                {
+                    throw std::runtime_error(
+                        "Validation Error: Channels in image do not all have the same block size. This is currently"
+                        " unsupported."
+                    );
+                }
+                shared_size = candidate;
+            }
+            return shared_size;
+        }
+
+    private:
+        /// Move-only bundle of scratch buffers together with the handle of an asynchronous task.
+        ///
+        /// NOTE(review): presumably one slot represents one in-flight unit of work of the read pipeline; confirm
+        /// against the code using `ring_buffer_t`.
+        ///
+        /// Members are destroyed in reverse declaration order, i.e. `processing_future` first and
+        /// `interleaved_buffer` last. Keep that in mind before reordering them.
+        struct ring_buffer_slot
+        {
+            /// Single buffer, going by its name holding interleaved data.
+            util::default_init_vector<T> interleaved_buffer;
+            /// Collection of buffers, going by its name one per deinterleaved channel.
+            std::vector<util::default_init_vector<T>> deinterleaved_buffer;
+            /// NOTE(review): presumably keeps the buffers above pinned for CUDA transfers for as long as the slot
+            /// lives; confirm against cuda::scoped_host_pinner.
+            std::vector<cuda::scoped_host_pinner> memory_pinners;
+            /// Handle to asynchronous work associated with this slot.
+            std::future<void> processing_future;
+
+            // Default-constructible and movable (noexcept), but not copyable.
+            ring_buffer_slot() = default;
+            ring_buffer_slot(ring_buffer_slot&&) noexcept = default;
+            ring_buffer_slot& operator=(ring_buffer_slot&&) noexcept = default;
+            ring_buffer_slot(const ring_buffer_slot&) = delete;
+            ring_buffer_slot& operator=(const ring_buffer_slot&) = delete;
+        };
+
+        /// A ring buffer is represented as a plain vector of slots.
+        using ring_buffer_t = std::vector<ring_buffer_slot>;
+
+    private:
+        /// All the channels, each holding their own decompression and compression context.
+        std::vector<compressed::channel<T>> m_Channels{};
+
+        /// Arbitrary user metadata, not authored or managed by us. It's up to the caller to handle what goes in and
+        /// comes out.
+        json_ordered m_Metadata{};
+
+        /// Optional set of channelnames to associate to the channels, indexed like `m_Channels`. If not specified
+        /// sensible defaults are chosen. For example, if 3 channels are provided we default to { "R", "G", "B" }
+        std::vector<std::string> m_ChannelNames{};
+
+        /// The width of the image file, defaults to 1.
+        size_t m_Width = 1;
+
+        /// The height of the image file, defaults to 1.
+        size_t m_Height = 1;
+
+    private:
+        // Implementations for the read() functions.
+        // -----------------------------------------------------------------------------------
+        // -----------------------------------------------------------------------------------
 
 #ifdef COMPRESSED_IMAGE_OIIO_AVAILABLE
 
 
-		/// \brief Read implementation for all the call to image<T>::read().
-		/// 
-		/// This function takes care of reading data from the input pointer and propagating it to read_contiguous_channels_impl.
-		/// 
-		/// \param input_ptr The pointer to read the data from
-		/// \param channelnames The channels to read from the file, non-existant channels throw std::out_of_range
-		/// \param postprocess An optional postprocessing step to apply to the chunks before they get compressed.
-		/// \param compression_codec The compression codec to apply
-		/// \param compression_level The compression level to compress with
-		/// \param block_size The block size to apply to the compressed data
-		/// \param chunk_size The chunk size to apply to the compressed data
-		/// 
-		/// \returns The decoded image.
-		template <typename PostProcess = std::nullopt_t>
-			requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>> || std::is_same_v<std::remove_cvref_t<PostProcess>, std::nullopt_t>
-		static image read_impl(
-			std::unique_ptr<OIIO::ImageInput> input_ptr,
-			std::vector<std::string> channelnames,
-			PostProcess&& postprocess,
-			int subimage,
-			enums::codec compression_codec = enums::codec::lz4,
-			size_t compression_level = 9,
-			size_t block_size = s_default_blocksize,
-			size_t chunk_size = s_default_chunksize
-			)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			assert(chunk_size % sizeof(T) == 0);
-			auto comp_level_adjusted = util::ensure_compression_level(compression_level);
-
-			// Seek to the right subimage before getting the spec.
-			auto res = input_ptr->seek_subimage(subimage, 0);
-			if (!res)
-			{
-				throw std::invalid_argument(
-					std::format(
-						"File does not have a subimage {}, cannot seek to it", subimage
-					)
-				);
-			}
-			const OIIO::ImageSpec& spec = input_ptr->spec();
-
-			// Align the chunk size to the scanlines and tiles (if applicable), this makes our life considerably 
-			// easier and allows us to not deal with partial scanlines.
-			size_t chunk_size_aligned = 0;
-			if (spec.tile_height != 0)
-			{
-				chunk_size_aligned = util::align_chunk_to_tile_bytes<T>(spec.width, spec.tile_height, chunk_size);
-			}
-			else
-			{
-				chunk_size_aligned = util::align_chunk_to_scanlines_bytes<T>(spec.width, chunk_size);
-			}
-
-			// Get a std::vector containing a begin-end pair for all contiguous channels in our channelnames.
-			// So if we pass 'R', 'B' and 'A' in a rgba image we would get the following
-			// { {0 - 1}, {2 - 4} }
-			// This allows us to both maximize performance by handling as many channels in one go as we can while also
-			// minimizing memory footprint by only ever allocating as much as we need for the max amount of contiguous
-			// channels we can encounter.
-			std::vector<compressed::channel<T>> channels;
-			auto channel_ranges_contiguous = detail::get_contiguous_channels(input_ptr, channelnames);
-			size_t max_num_channels = 0;
-			for (const auto& [chbegin, chend] : channel_ranges_contiguous)
-			{
-				if (static_cast<size_t>(chend) - chbegin > max_num_channels)
-				{
-					max_num_channels = static_cast<size_t>(chend) - chbegin;
-				}
-			}
-
-
-			// Set up scratch buffers
-			// -----------------------------------------------------------------------------------
-			// -----------------------------------------------------------------------------------
-
-			// Maximum chunk size we will need to account for (times number of channels).
-			const size_t max_chunk_size = chunk_size_aligned * max_num_channels;
-
-			// Initialize our swap buffers, these are going to be either discarded after
-			// or compressed from.
-			util::default_init_vector<T> interleaved_buffer(max_chunk_size / sizeof(T));
-			std::vector<util::default_init_vector<T>> deinterleaved_buffer(max_num_channels);
-			std::for_each(std::execution::par_unseq, deinterleaved_buffer.begin(), deinterleaved_buffer.end(), [&](auto& buffer)
-				{
-					buffer.resize(chunk_size_aligned / sizeof(T));
-				});
-
-			// Buffer to hold a single chunk. We will reuse this quite frequently
-			auto chunk_buffer = util::default_init_vector<std::byte>(blosc2::min_compressed_size(chunk_size_aligned));
-
-			// Read and compress the channel pairs in chunks
-			// -----------------------------------------------------------------------------------
-			// -----------------------------------------------------------------------------------
-
-			// This will be the channelnames we will construct the image with. This is to avoid cases where the user
-			// passes the channel names in a different order than they appear in such as 'A', 'G', 'R'. This should
-			// still create the channel names as expected in correct order.
-			std::vector<std::string> new_channelnames{};
-
-			// Iterate all the pair and extract them, refitting the buffers as needed.
-			// This is where the actual work of reading start. 
-			for (auto [chbegin, chend] : channel_ranges_contiguous)
-			{
-				// Calculate some preliminary data for computing how many scanlines to extract in one go.
-				int nchannels = chend - chbegin;
-				const size_t bytes_per_scanline = static_cast<size_t>(spec.width) * nchannels * sizeof(T);
-
-				const size_t chunk_size_all = chunk_size_aligned * nchannels;
-				const size_t scanlines_per_chunk = chunk_size_all / bytes_per_scanline;
-
-				// Refit the swap buffers as `read_contiguous_channels_impl` expects these to be exactly sized.
-				auto interleaved_fitted = std::span<T>(interleaved_buffer.begin(), chunk_size_all / sizeof(T));
-				std::vector<std::span<T>> deinterleaved_fitted{};
-				for (auto idx : std::views::iota(0, nchannels))
-				{
-					// construct a span from the util::default_init_vector
-					deinterleaved_fitted.push_back(
-						std::span<T>(deinterleaved_buffer.at(idx).begin(), deinterleaved_buffer.at(idx).end())
-					);
-				}
-
-				// Create and initialize the contexts and schunks. These are pretty light weight so we don't need
-				// to worry about creating them outside of the loop/reusing them.
-				std::vector<blosc2::context_ptr> contexts;
-				std::vector<blosc2::schunk<T>> schunks;
-				for ([[maybe_unused]] auto _ : std::views::iota(0, nchannels))
-				{
-					schunks.push_back(blosc2::schunk<T>(block_size, chunk_size_aligned));
-					contexts.push_back(blosc2::create_compression_context<T>(
-						std::thread::hardware_concurrency(),
-						compression_codec,
-						comp_level_adjusted,
-						block_size
-					));
-				}
-
-				// Read the contiguous channel sequence into the contexts and schunks.
-				if constexpr (std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>)
-				{
-					if (spec.tile_height != 0)
-					{
-						image<T>::read_contiguous_channels_impl<true>(
-							input_ptr,
-							subimage,
-							chbegin,
-							chend,
-							interleaved_fitted,
-							deinterleaved_fitted,
-							scanlines_per_chunk,
-							contexts,
-							schunks,
-							chunk_buffer,
-							std::forward<PostProcess>(postprocess)
-						);
-					}
-					else
-					{
-						image<T>::read_contiguous_channels_impl<false>(
-							input_ptr,
-							subimage,
-							chbegin,
-							chend,
-							interleaved_fitted,
-							deinterleaved_fitted,
-							scanlines_per_chunk,
-							contexts,
-							schunks,
-							chunk_buffer,
-							std::forward<PostProcess>(postprocess)
-						);
-					}
-				}
-				else
-				{
-					if (spec.tile_height != 0)
-					{
-						image<T>::read_contiguous_channels_impl<true>(
-							input_ptr,
-							subimage,
-							chbegin,
-							chend,
-							interleaved_fitted,
-							deinterleaved_fitted,
-							scanlines_per_chunk,
-							contexts,
-							schunks,
-							chunk_buffer,
-							std::nullopt
-						);
-					}
-					else
-					{
-						image<T>::read_contiguous_channels_impl<false>(
-							input_ptr,
-							subimage,
-							chbegin,
-							chend,
-							interleaved_fitted,
-							deinterleaved_fitted,
-							scanlines_per_chunk,
-							contexts,
-							schunks,
-							chunk_buffer,
-							std::nullopt
-						);
-					}
-				}
-
-
-				// Finally create the channels from the schunks
-				for (const auto channel_idx : std::views::iota(0, nchannels))
-				{
-					_COMPRESSED_PROFILE_SCOPE("generate channels");
-					channels.push_back(
-						compressed::channel<T>(
-							std::move(schunks[channel_idx]),
-							spec.width,
-							spec.height,
-							compression_codec,
-							comp_level_adjusted
-						)
-					);
-				}
-
-				// Store the correctly mapped channelnames
-				for (auto channel_idx : std::views::iota(chbegin, chend))
-				{
-					new_channelnames.push_back(spec.channelnames.at(channel_idx));
-				}
-			}
-
-			// Construct the image instance.
-			auto img = compressed::image<T>(std::move(channels), spec.width, spec.height, new_channelnames);
-			img.metadata(compressed::image<T>::read_oiio_metadata(spec));
-			return std::move(img);
-		}
-
-
-		/// \brief Read a contiguous channel sequence from the passed input pointer
-		///
-		/// When reading with OpenImageIO it is a lot more efficient to parse as many channels as possible in one go
-		/// rather than reading one channel at a time as the ImageInput keeps the data as compressed (in many cases).
-		/// If we were to read one channel at a time this would significantly slow down our read speeds.
-		/// 
-		/// Due to us only being able to read contiguous channels at a time this helper function allows us to do that.
-		/// 
-		/// \param input_ptr The opened OpenImageIO ImageInput.
-		/// \param chbegin The start channel to read
-		/// \param chend The end channel to read
-		/// \param interleaved_buffer The buffer into which we will read the channels (before then interleaving).
-		///							  must be sized to exactly fit nchannels * width * height
-		/// \param deinterleaved_buffer The buffers to deinterleave into, must be exactly of size nchannels with each
-		///								sub-buffer being exactly width * height.
-		/// \param scanlines_per_chunk The number of scanlines that fit into one chunk (exactly).
-		/// \param contexts The contexts for compression, must be exactly nchannels amount
-		/// \param schunks The schunks for compression, must be exactly nchannels amount
-		/// \param chunk_buffer A scratch buffer for compression (from which we copy).
-		/// 
-		/// \throws std::invalid_argument if any of the above conditions is not met.
-		template <bool read_tiles, typename PostProcess = std::nullopt_t>
-			requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>> || std::is_same_v<std::remove_cvref_t<PostProcess>, std::nullopt_t>
-		static void read_contiguous_channels_impl(
-			std::unique_ptr<OIIO::ImageInput>& input_ptr,
-			int subimage,
-			int chbegin,
-			int chend,
-			std::span<T> interleaved_buffer,
-			std::vector<std::span<T>>& deinterleaved_buffer,
-			size_t scanlines_per_chunk,
-			std::vector<blosc2::context_ptr>& contexts,
-			std::vector<blosc2::schunk<T>>& schunks,
-			util::default_init_vector<std::byte>& chunk_buffer,
-			PostProcess&& postprocess
-		)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			const int nchannels = chend - chbegin;
-			assert(input_ptr->current_subimage() == subimage);
-			const OIIO::ImageSpec& spec = input_ptr->spec();
-			const auto typedesc = enums::get_type_desc<T>();
-
-			// Ensure this function is called with at least 1 channel to read.
-			if (nchannels < 1)
-			{
-				throw std::runtime_error(
-					std::format(
-						"read_contiguous_channels_impl: passed number of channels is less than one. This should not happen. Got {}",
-						nchannels
-					)
-				);
-			}
-
-			// Ensure the interleaved buffer is correctly sized.
-			if (interleaved_buffer.size() != static_cast<size_t>(nchannels) * spec.width * scanlines_per_chunk)
-			{
-				throw std::invalid_argument(
-					std::format(
-						"read_contiguous_channels_impl: Received incorrectly sized interleaved buffer, should be exactly"
-						" {:L} elements large but instead got {:L}.", 
-						static_cast<size_t>(nchannels) * spec.width * scanlines_per_chunk,
-						interleaved_buffer.size()
-					)
-				);
-			}
-			// Ensure the deinterleaved buffer, and its subbuffers, are correctly sized.
-			if (deinterleaved_buffer.size() != static_cast<size_t>(nchannels))
-			{
-				throw std::invalid_argument(
-					std::format(
-						"read_contiguous_channels_impl: Received incorrectly sized deinterleaved buffer, should be exactly"
-						" {:L} elements large but instead got {:L}.",
-						nchannels,
-						deinterleaved_buffer.size()
-					)
-				);
-			}
-			for (const auto& buffer : deinterleaved_buffer)
-			{
-				if (buffer.size() != spec.width * scanlines_per_chunk)
-				{
-					throw std::invalid_argument(
-						std::format(
-							"read_contiguous_channels_impl: Received incorrectly sized deinterleaved buffer,"
-							" should be exactly {:L} elements large but instead got {:L}.",
-							static_cast<size_t>(nchannels) * spec.width * scanlines_per_chunk,
-							interleaved_buffer.size()
-						)
-					);
-				}
-			}
-			// Ensure the contexts and schunks are correctly sized
-			if (contexts.size() != static_cast<size_t>(nchannels) || schunks.size() != static_cast<size_t>(nchannels))
-			{
-				throw std::runtime_error(
-					std::format(
-						"read_contiguous_channels_impl: Internal error: Expected the number of passed schunks and contexts"
-						" to exactly match the number of requested channels. Instead got {} and {} while {} was the expected"
-						" number.",
-						schunks.size(),
-						contexts.size(),
-						nchannels
-					)
-				);
-			}
-
-			// Iterate all scanlines and read as many scanlines as possible in one go, compressing them on the fly 
-			// into all of the super-chunks. This works for data windows as well where the y and x may not start at zero
-			int y = spec.y;
-			while (y < (spec.height + spec.y))
-			{
-				_COMPRESSED_PROFILE_SCOPE("Read Scanlines/Tiles and compress");
-				int scanlines_to_read = static_cast<int>(std::min<size_t>(
-					scanlines_per_chunk, static_cast<size_t>(spec.height + spec.y - y)
-				));
-
-
-				bool read_successful = false;
-				// Since the passed `scanlines_per_chunk` is already appropriately aligned to either tiles or scanlines,
-				// we can safely call either `read_tiles` or `read_scanlines` here making sure we are correctly aligned
-				if constexpr (read_tiles)
-				{
-					read_successful = input_ptr->read_tiles(
-						subimage,
-						0, // miplevel
-						spec.x, // xbegin
-						spec.width, // xend
-						y, // ybegin
-						y + scanlines_to_read, // yend
-						0, // zbegin	
-						1, // zend
-						chbegin,
-						chend,
-						typedesc,
-						static_cast<void*>(interleaved_buffer.data())
-					);
-				}
-				else
-				{
-					read_successful = input_ptr->read_scanlines(
-						subimage,
-						0, // miplevel
-						y, // ybegin
-						y + scanlines_to_read, // yend
-						0, // z
-						chbegin,
-						chend,
-						typedesc,
-						static_cast<void*>(interleaved_buffer.data())
-					);
-				}
-
-				if (!read_successful)
-				{
-					throw std::runtime_error(
-						std::format(
-							"OIIO read failure when reading scanlines {}-{} for channels {}-{}: '{}'",
-							y, y + scanlines_to_read, chbegin, chend, input_ptr->geterror()
-						)
-					);
-				}
-
-				// Deinterleave the buffers, in some cases we may be deinterleaving empty space here but that 
-				// is ok as we refit the buffers. Since in most cases the size will only be off by at most one
-				// scanline. In the case of the last chunk, we may be at worst deinterleaving only one scanline
-				// with the rest being empty space but that is also ok.
-				image_algo::deinterleave(std::span<const T>(interleaved_buffer), deinterleaved_buffer);
-
-				// Now start compressing the chunks and appending them into the super-chunks.
-				for (auto channel_idx : std::views::iota(0, nchannels))
-				{
-					// How many elements we actually read per buffer
-					size_t read_elements = static_cast<size_t>(scanlines_to_read) * spec.width;
-					auto deinterleaved_fitted = std::span<T>(deinterleaved_buffer[channel_idx].data(), read_elements);
-
-					// Perform the user-passed postprocessing, this may be anything and it's up to the user to decide
-					// what goes here.
-					if constexpr (std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>)
-					{
-						auto absolute_channel_idx = chbegin + channel_idx;
-						postprocess(absolute_channel_idx, deinterleaved_fitted);
-					}
-
-					schunks[channel_idx].append_chunk(
-						contexts[channel_idx],
-						deinterleaved_fitted,
-						std::span<std::byte>(chunk_buffer)
-					);
-				}
-				y += scanlines_to_read;
-			}
-		}
+        /// \brief Read implementation shared by all calls to image<T>::read().
+        ///
+        /// Seeks to the requested subimage, then streams every contiguous channel range through
+        /// read_contiguous_channels_impl and assembles the resulting compressed channels into an image.
+        /// \param input_ptr The pointer to read the data from
+        /// \param channelnames The channels to read from the file, non-existent channels throw std::out_of_range
+        /// \param postprocess An optional postprocessing step to apply to the chunks before they get compressed.
+        /// \param subimage The subimage to seek to and read from, throws std::invalid_argument if the seek fails
+        /// \param compression_codec The compression codec to apply
+        /// \param compression_level The compression level to compress with
+        /// \param block_size The block size to apply to the compressed data
+        /// \param chunk_size The chunk size to apply to the compressed data
+        /// \returns The decoded image.
+        template <typename PostProcess = std::nullopt_t>
+            requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>> || std::is_same_v<
+                std::remove_cvref_t<PostProcess>, std::nullopt_t>
+        static image read_impl(
+            std::unique_ptr<OIIO::ImageInput> input_ptr,
+            std::vector<std::string> channelnames,
+            PostProcess&& postprocess,
+            int subimage,
+            enums::codec compression_codec = enums::codec::lz4,
+            size_t compression_level = 9,
+            size_t block_size = s_default_blocksize,
+            size_t chunk_size = s_default_chunksize
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            assert(chunk_size % sizeof(T) == 0);
+            auto comp_level_adjusted = util::ensure_compression_level(compression_level);
+
+            // Seek to the right subimage before getting the spec.
+            auto res = input_ptr->seek_subimage(subimage, 0);
+            if (!res)
+            {
+                throw std::invalid_argument(
+                    std::format(
+                        "File does not have a subimage {}, cannot seek to it",
+                        subimage
+                    )
+                );
+            }
+            const OIIO::ImageSpec& spec = input_ptr->spec();
+
+            // Align the chunk size to the scanlines and tiles (if applicable), this makes our life considerably
+            // easier and allows us to not deal with partial scanlines.
+            size_t chunk_size_aligned = 0;
+            if (spec.tile_height != 0)
+            {
+                chunk_size_aligned = util::align_chunk_to_tile_bytes<T>(spec.width, spec.tile_height, chunk_size);
+            }
+            else
+            {
+                chunk_size_aligned = util::align_chunk_to_scanlines_bytes<T>(spec.width, chunk_size);
+            }
+
+            // Get a std::vector containing a begin-end pair for all contiguous channels in our channelnames.
+            // So if we pass 'R', 'B' and 'A' in a rgba image we would get the following
+            // { {0 - 1}, {2 - 4} }
+            // This allows us to both maximize performance by handling as many channels in one go as we can while also
+            // minimizing memory footprint by only ever allocating as much as we need for the max amount of contiguous
+            // channels we can encounter.
+            std::vector<compressed::channel<T>> channels;
+            auto channel_ranges_contiguous = detail::get_contiguous_channels(input_ptr, channelnames);
+            size_t max_num_channels = 0;
+            for (const auto& [chbegin, chend] : channel_ranges_contiguous)
+            {
+                if (static_cast<size_t>(chend) - chbegin > max_num_channels)
+                {
+                    max_num_channels = static_cast<size_t>(chend) - chbegin;
+                }
+            }
+
+            // Set up the Ring Buffer (Double Buffering)
+            // -----------------------------------------------------------------------------------
+            constexpr size_t ring_buffer_size = 2;
+            const size_t max_chunk_size = chunk_size_aligned * max_num_channels;
+            ring_buffer_t ring_buffer(ring_buffer_size);
+
+            for (auto& slot : ring_buffer)
+            {
+                slot.interleaved_buffer.resize(max_chunk_size / sizeof(T));
+                slot.deinterleaved_buffer.resize(max_num_channels);
+                for (auto& buffer : slot.deinterleaved_buffer)
+                {
+                    buffer.resize(chunk_size_aligned / sizeof(T));
+                }
+
+                if (enums::is_gpu_codec(compression_codec))
+                {
+                    slot.memory_pinners.reserve(1 + slot.deinterleaved_buffer.size());
+                    slot.memory_pinners.emplace_back(
+                        slot.interleaved_buffer.data(),
+                        slot.interleaved_buffer.size() * sizeof(T)
+                    );
+                    for (auto& buffer : slot.deinterleaved_buffer)
+                    {
+                        slot.memory_pinners.emplace_back(buffer.data(), buffer.size() * sizeof(T));
+                    }
+                }
+            }
+
+            // Read and compress the channel pairs in chunks
+            // -----------------------------------------------------------------------------------
+            // -----------------------------------------------------------------------------------
+
+            // This will be the channelnames we will construct the image with. This is to avoid cases where the user
+            // passes the channel names in a different order than they appear in such as 'A', 'G', 'R'. This should
+            // still create the channel names as expected in correct order.
+            std::vector<std::string> new_channelnames{};
+
+            // Scratch pool handle; NOTE(review): unused below, presumably held to keep the pool alive - confirm.
+            auto scratch_buffer = detail::scratch_pool_registry::get_or_create_for_channel();
+
+            // Iterate all the pairs and extract them, refitting the buffers as needed.
+            // This is where the actual work of reading starts.
+            for (auto [chbegin, chend] : channel_ranges_contiguous)
+            {
+                // Calculate some preliminary data for computing how many scanlines to extract in one go.
+                int nchannels = chend - chbegin;
+                const size_t bytes_per_scanline = static_cast<size_t>(spec.width) * nchannels * sizeof(T);
+                const size_t chunk_size_all = chunk_size_aligned * nchannels;
+                const size_t scanlines_per_chunk = chunk_size_all / bytes_per_scanline;
+                std::vector<detail::schunk<T>> schunks;
+                for ([[maybe_unused]] auto _ : std::views::iota(0, nchannels))
+                {
+                    schunks.push_back(detail::schunk<T>(block_size, chunk_size_aligned));
+                }
+
+                // Pass the managed ring buffer into our streaming implementation
+                if constexpr (std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>)
+                {
+                    if (spec.tile_height != 0)
+                    {
+                        image<T>::template read_contiguous_channels_impl<true>(
+                            input_ptr,
+                            subimage,
+                            chbegin,
+                            chend,
+                            compression_codec,
+                            comp_level_adjusted,
+                            block_size,
+                            ring_buffer,
+                            scanlines_per_chunk,
+                            schunks,
+                            std::forward<PostProcess>(postprocess)
+                        );
+                    }
+                    else
+                    {
+                        image<T>::template read_contiguous_channels_impl<false>(
+                            input_ptr,
+                            subimage,
+                            chbegin,
+                            chend,
+                            compression_codec,
+                            comp_level_adjusted,
+                            block_size,
+                            ring_buffer,
+                            scanlines_per_chunk,
+                            schunks,
+                            std::forward<PostProcess>(postprocess)
+                        );
+                    }
+                }
+                else
+                {
+                    if (spec.tile_height != 0)
+                    {
+                        image<T>::template read_contiguous_channels_impl<true>(
+                            input_ptr,
+                            subimage,
+                            chbegin,
+                            chend,
+                            compression_codec,
+                            comp_level_adjusted,
+                            block_size,
+                            ring_buffer,
+                            scanlines_per_chunk,
+                            schunks,
+                            std::nullopt
+                        );
+                    }
+                    else
+                    {
+                        image<T>::template read_contiguous_channels_impl<false>(
+                            input_ptr,
+                            subimage,
+                            chbegin,
+                            chend,
+                            compression_codec,
+                            comp_level_adjusted,
+                            block_size,
+                            ring_buffer,
+                            scanlines_per_chunk,
+                            schunks,
+                            std::nullopt
+                        );
+                    }
+                }
+
+                for (const auto channel_idx : std::views::iota(0, nchannels))
+                {
+                    _COMPRESSED_PROFILE_SCOPE("generate channels");
+                    channels.push_back(
+                        compressed::channel<T>(
+                            std::move(schunks[channel_idx]),
+                            spec.width,
+                            spec.height,
+                            compression_codec,
+                            comp_level_adjusted
+                        )
+                    );
+                }
+
+                for (auto channel_idx : std::views::iota(chbegin, chend))
+                {
+                    new_channelnames.push_back(spec.channelnames.at(channel_idx));
+                }
+            }
+
+            auto img = compressed::image<T>(std::move(channels), spec.width, spec.height, new_channelnames);
+            img.metadata(compressed::image<T>::read_oiio_metadata(spec));
+            return img;
+        }
+
+
+        /// \brief Read a contiguous channel sequence from the passed input pointer
+        ///
+        /// When reading with OpenImageIO it is a lot more efficient to parse as many channels as possible in one go
+        /// rather than reading one channel at a time as the ImageInput keeps the data as compressed (in many cases).
+        /// Reading happens on the calling thread while the previously read chunk is compressed by a std::async task.
+        ///
+        /// \tparam read_tiles Whether to read via read_tiles() (true) or read_scanlines() (false).
+        /// \param input_ptr The opened OpenImageIO ImageInput, must already be positioned on `subimage`.
+        /// \param subimage The subimage to read from
+        /// \param chbegin The start channel to read
+        /// \param chend The end channel to read (exclusive)
+        /// \param compression_codec The codec used to create the per-chunk compression contexts
+        /// \param compression_level The compression level used to create the per-chunk compression contexts
+        /// \param block_size The block size used to create the per-chunk compression contexts
+        /// \param ring_buffer The buffer slots that reads alternate between, each slot holds the interleaved and
+        ///                    deinterleaved staging buffers.
+        /// \param scanlines_per_chunk The number of scanlines that fit into one chunk (exactly).
+        /// \param schunks The schunks to append to, indexed from 0 to nchannels - 1
+        /// \param postprocess Optional callable invoked with (absolute channel index, chunk data) before compression.
+        ///
+        /// \throws std::runtime_error if fewer than one channel is requested or if the OIIO read fails.
+        template <bool read_tiles, typename PostProcess = std::nullopt_t>
+            requires std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>> || std::is_same_v<
+                std::remove_cvref_t<PostProcess>, std::nullopt_t>
+        static void read_contiguous_channels_impl(
+            std::unique_ptr<OIIO::ImageInput>& input_ptr,
+            const int subimage,
+            const int chbegin,
+            const int chend,
+            const enums::codec compression_codec,
+            const size_t compression_level,
+            const size_t block_size,
+            ring_buffer_t& ring_buffer,
+            size_t scanlines_per_chunk,
+            std::vector<detail::schunk<T>>& schunks,
+            PostProcess&& postprocess
+        )
+        {
+            _COMPRESSED_PROFILE_FUNCTION();
+            const int nchannels = chend - chbegin;
+            assert(input_ptr->current_subimage() == subimage);
+            const OIIO::ImageSpec& spec = input_ptr->spec();
+            const auto typedesc = enums::get_type_desc<T>();
+
+            // Ensure this function is called with at least 1 channel to read.
+            if (nchannels < 1)
+            {
+                throw std::runtime_error(
+                    std::format(
+                        "read_contiguous_channels_impl: passed number of channels is less than one. This should not happen. Got {}",
+                        nchannels
+                    )
+                );
+            }
+
+
+            // Iterate all scanlines and read as many scanlines as possible in one go, compressing them on the fly
+            // into all of the super-chunks. This works for data windows as well where the y and x may not start at zero
+            std::future<void> previous_compute_future;
+            size_t ring_index = 0;
+            int y = spec.y;
+
+            while (y < (spec.height + spec.y))
+            {
+                _COMPRESSED_PROFILE_SCOPE("Read Scanlines/Tiles and compress");
+                int scanlines_to_read = static_cast<int>(std::min<size_t>(
+                    scanlines_per_chunk,
+                    static_cast<size_t>(spec.height + spec.y - y)
+                ));
+
+                // Select active slot in our ring buffer
+                auto& slot = ring_buffer[ring_index];
+
+                // 1. Defensive wait on the slot's future; usually a no-op since it is moved out of the slot below
+                if (slot.processing_future.valid())
+                {
+                    slot.processing_future.get();
+                }
+
+                // Slice out exact span dimensions required for the OIIO validation checks and bounds
+                const size_t chunk_size_all = scanlines_per_chunk * spec.width * nchannels;
+                auto interleaved_fitted = std::span<T>(slot.interleaved_buffer.data(), chunk_size_all);
+
+                // 2. STAGE 1 (I/O): Synchronously read next file chunk on main thread
+                bool read_successful = false;
+                if constexpr (read_tiles)
+                {
+                    _COMPRESSED_PROFILE_SCOPE("read tiles");
+                    read_successful = input_ptr->read_tiles(
+                        subimage,
+                        0, // miplevel
+                        spec.x, // xbegin
+                        spec.x + spec.width, // xend (exclusive), offset by the data window origin
+                        y, // ybegin
+                        y + scanlines_to_read, // yend
+                        0, // zbegin
+                        1, // zend
+                        chbegin,
+                        chend,
+                        typedesc,
+                        static_cast<void*>(interleaved_fitted.data())
+                    );
+                }
+                else
+                {
+                    _COMPRESSED_PROFILE_SCOPE("read scanlines");
+                    read_successful = input_ptr->read_scanlines(
+                        subimage,
+                        0, // miplevel
+                        y, // ybegin
+                        y + scanlines_to_read, // yend
+                        0, // z
+                        chbegin,
+                        chend,
+                        typedesc,
+                        static_cast<void*>(interleaved_fitted.data())
+                    );
+                }
+
+                if (!read_successful)
+                {
+                    throw std::runtime_error(
+                        std::format(
+                            "OIIO read failure when reading scanlines {}-{} for channels {}-{}: '{}'",
+                            y,
+                            y + scanlines_to_read,
+                            chbegin,
+                            chend,
+                            input_ptr->geterror()
+                        )
+                    );
+                }
+
+                // 3. ORDER ENFORCEMENT: Wait for chunk k-1's compression to completely finish
+                // before spawning chunk k's compute task. This guarantees blocks append to schunks in sequential order.
+                if (previous_compute_future.valid())
+                {
+                    previous_compute_future.get();
+                }
+
+                size_t read_elements = static_cast<size_t>(scanlines_to_read) * spec.width;
+
+                // 4. STAGE 2 (COMPUTE): Delegate processing & compression of the freshly read chunk to a background task.
+                // Main thread loops back immediately to read chunk k+1 into the alternate buffer slot.
+                slot.processing_future = std::async(
+                    std::launch::async,
+                    [
+                        &slot, interleaved_fitted, nchannels, read_elements, compression_codec, compression_level,
+                        block_size, chbegin,
+                        y, scanlines_to_read, spec_height = spec.height, spec_y = spec.y,
+                        &schunks, &postprocess
+                    ]()
+                    {
+                        // Slice deinterleaved spans for this active task
+                        std::vector<std::span<T>> deinterleaved_fitted_views;
+                        deinterleaved_fitted_views.reserve(nchannels);
+                        for (int idx = 0; idx < nchannels; ++idx)
+                        {
+                            deinterleaved_fitted_views.emplace_back(
+                                slot.deinterleaved_buffer[idx].data(),
+                                slot.deinterleaved_buffer[idx].size()
+                            );
+                        }
+
+                        // Compute steps
+                        image_algo::deinterleave(std::span<const T>(interleaved_fitted), deinterleaved_fitted_views);
+
+                        for (auto channel_idx : std::views::iota(0, nchannels))
+                        {
+                            auto context = NAMESPACE_COMPRESSED_IMAGE::channel<T>::create_compression_context(
+                                compression_codec,
+                                std::thread::hardware_concurrency(),
+                                compression_level,
+                                block_size,
+                                0
+                            );
+
+                            auto channel_span = std::span<T>(
+                                slot.deinterleaved_buffer[channel_idx].data(),
+                                read_elements
+                            );
+
+                            if constexpr (std::invocable<std::remove_reference_t<PostProcess>, size_t, std::span<T>>)
+                            {
+                                auto absolute_channel_idx = chbegin + channel_idx;
+                                postprocess(absolute_channel_idx, channel_span);
+                            }
+
+                            schunks[channel_idx].append_chunk(std::move(context), channel_span);
+
+                            // Logging, only emitted once the final chunk of the image has been appended
+                            if (y + scanlines_to_read == (spec_height + spec_y))
+                            {
+                                std::string_view codec_name = enums::to_string(compression_codec);
+                                std::string backend = enums::is_gpu_codec(compression_codec) ? "cuda" : "blosc2";
+                                get_logger()->debug(
+                                    std::format(
+                                        "[channel: {}] {} {}: uncompressed {} bytes; compressed {} bytes; cratio {}",
+                                        channel_idx,
+                                        backend,
+                                        codec_name,
+                                        schunks[channel_idx].chunk_bytes(),
+                                        schunks[channel_idx].csize(),
+                                        static_cast<double>(schunks[channel_idx].chunk_bytes()) / schunks[channel_idx].
+                                        csize()
+                                    )
+                                );
+                            }
+                        }
+                    }
+                );
+
+                // Hand the task over to the local handle so that it is awaited before this function is left
+                previous_compute_future = std::move(slot.processing_future);
+
+                // Cycle the Ring Buffer index and step image coordinate offset
+                ring_index = (ring_index + 1) % ring_buffer.size();
+                y += scanlines_to_read;
+            }
+
+            // 5. Final sync: block until the last chunk's processing pipeline completely winds down
+            if (previous_compute_future.valid())
+            {
+                previous_compute_future.get();
+            }
+        }
 
 
 #endif // COMPRESSED_IMAGE_OIIO_AVAILABLE
-
-	};
-
-} // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
+    };
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/iterators/channel.h b/compressed_image/include/compressed/iterators/channel.h
new file mode 100644
index 0000000..35c1613
--- /dev/null
+++ b/compressed_image/include/compressed/iterators/channel.h
@@ -0,0 +1,444 @@
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <format>
+#include <iterator>
+#include <memory>
+#include <optional>
+#include <ranges>
+#include <span>
+#include <type_traits>
+#include <variant>
+#include <vector>
+
+#include "compressed/blosc2/typedefs.h"
+#include "compressed/blosc2/wrapper.h"
+#include "compressed/containers/chunk_span.h"
+#include "compressed/context.h"
+#include "compressed/cuda/compression.h"
+#include "compressed/detail/scoped_timer.h"
+#include "compressed/enums.h"
+#include "compressed/macros.h"
+#include "compressed/util.h"
+
+namespace
+NAMESPACE_COMPRESSED_IMAGE
+{
+    template <typename T>
+    struct fitted_buffer
+    {
+        fitted_buffer() = default;
+        /// Allocates `initial_size` elements and exposes all of them through get().
+        explicit fitted_buffer(size_t initial_size)
+        {
+            m_storage.resize(initial_size);
+            m_view_size = initial_size;
+        }
+        /// Mutable view over the currently visible leading elements of the storage.
+        std::span<T> get()
+        {
+            return std::span<T>(m_storage.begin(), m_view_size);
+        }
+        /// Read-only view over the currently visible leading elements of the storage.
+        std::span<const T> get() const
+        {
+            return std::span<const T>(m_storage.begin(), m_view_size);
+        }
+        /// Drops any previous refit() so that get() spans the whole storage again.
+        void reset()
+        {
+            m_fitted = false;
+            m_view_size = m_storage.size();
+        }
+        /// Grows (never shrinks) the storage so that it holds at least `capacity` elements.
+        void ensure_capacity(size_t capacity)
+        {
+            if (m_storage.size() < capacity)
+            {
+                m_storage.resize(capacity);
+            }
+            // An explicit refit() keeps its size, otherwise the view tracks the full storage.
+            if (!m_fitted)
+            {
+                m_view_size = m_storage.size();
+            }
+        }
+        /// Fits get() to the first `new_size` elements; throws std::invalid_argument if larger than capacity().
+        void refit(size_t new_size)
+        {
+            if (m_storage.size() < new_size)
+            {
+                throw std::invalid_argument(
+                    std::format("New size exceeds buffer capacity. Maximum size is {:L}", m_storage.size())
+                );
+            }
+            m_fitted = true;
+            m_view_size = new_size;
+        }
+        /// Number of elements currently allocated in the storage.
+        size_t capacity() const noexcept
+        {
+            return m_storage.size();
+        }
+
+    private:
+        util::default_init_vector<T> m_storage;
+        bool m_fitted = false;
+        size_t m_view_size = 0;
+    };
+
+    template <typename T>
+    struct channel_iterator
+    {
+        using storage_type = std::remove_const_t<T>;
+        using schunk_pointer = schunk_var_ptr<storage_type>;
+
+        using iterator_category = std::forward_iterator_tag;
+        using difference_type = std::ptrdiff_t;
+        using value_type = container::chunk_span<T>;
+        using pointer = value_type*;
+        using reference = value_type&;
+
+        channel_iterator() = default;
+
+        channel_iterator(
+            schunk_pointer schunk,
+            size_t chunk_index,
+            size_t num_chunks,
+            size_t width,
+            size_t height,
+            enums::codec codec,
+            uint8_t compression_level,
+            size_t num_threads,
+            size_t block_size,
+            size_t chunk_size
+        )
+            : m_state(
+                std::make_shared<state>(
+                    std::move(schunk),
+                    detail::scratch_pool_registry::get_or_create_for_channel(),
+                    chunk_index,
+                    num_chunks,
+                    width,
+                    height,
+                    codec,
+                    compression_level,
+                    num_threads,
+                    block_size,
+                    chunk_size
+                )
+            )
+        {
+        }
+
+        ~channel_iterator()
+        {
+            if constexpr (!std::is_const_v<T>)
+            {
+                try
+                {
+                    flush();
+                }
+                catch (...)
+                {
+                    // Iterators must not throw from destructors.
+                }
+            }
+        }
+
+        value_type operator*()
+        {
+            ensure_dereferenceable();
+            load_current_chunk();
+
+            if constexpr (!std::is_const_v<T>)
+            {
+                m_state->dirty = true;
+            }
+
+            return m_state->current_chunk;
+        }
+
+        channel_iterator& operator++()
+        {
+            ensure_state();
+
+            if constexpr (!std::is_const_v<T>)
+            {
+                flush();
+            }
+
+            if (m_state->chunk_index < m_state->num_chunks)
+            {
+                ++m_state->chunk_index;
+            }
+
+            m_state->loaded = false;
+            return *this;
+        }
+
+        channel_iterator operator++(int)
+        {
+            channel_iterator copy = *this;
+            ++(*this);
+            return copy;
+        }
+
+        bool operator==(const channel_iterator& other) const noexcept
+        {
+            if (!m_state || !other.m_state)
+            {
+                return !m_state && !other.m_state;
+            }
+
+            return m_state->schunk == other.m_state->schunk && m_state->chunk_index == other.m_state->chunk_index;
+        }
+
+        bool operator!=(const channel_iterator& other) const noexcept
+        {
+            return !(*this == other);
+        }
+
+    private:
+        struct state
+        {
+            state(
+                schunk_pointer schunk_,
+                std::shared_ptr<detail::scratch_buffer_pool> scratch_pool_,
+                size_t chunk_index_,
+                size_t num_chunks_,
+                size_t width_,
+                size_t height_,
+                enums::codec codec_,
+                uint8_t compression_level_,
+                size_t num_threads_,
+                size_t block_size_,
+                size_t chunk_size_
+            )
+                : schunk(std::move(schunk_)),
+                  scratch_pool(std::move(scratch_pool_)),
+                  chunk_index(chunk_index_),
+                  num_chunks(num_chunks_),
+                  width(width_),
+                  height(height_),
+                  codec(codec_),
+                  compression_level(compression_level_),
+                  num_threads(num_threads_),
+                  block_size(block_size_),
+                  chunk_size(chunk_size_)
+            {
+            }
+
+            schunk_pointer schunk = nullptr;
+            std::shared_ptr<detail::scratch_buffer_pool> scratch_pool = nullptr;
+            size_t chunk_index = 0;
+            size_t num_chunks = 0;
+            size_t width = 0;
+            size_t height = 0;
+            enums::codec codec = enums::codec::lz4;
+            uint8_t compression_level = 9;
+            size_t num_threads = 1;
+            size_t block_size = 0;
+            size_t chunk_size = 0;
+
+            fitted_buffer<storage_type> decompressed_buffer{};
+            fitted_buffer<std::byte> compressed_buffer{};
+            value_type current_chunk{};
+
+            std::optional<compression_context_var> context{};
+            bool loaded = false;
+            bool dirty = false;
+        };
+
+        std::shared_ptr<state> m_state{};
+
+        /// Validate that the iterator is bound to a state object that holds a super-chunk.
+        ///
+        /// \throws std::runtime_error if `m_state` or its `schunk` member is null.
+        void ensure_state() const
+        {
+            const bool has_schunk = m_state && m_state->schunk;
+            if (has_schunk)
+            {
+                return;
+            }
+
+            throw std::runtime_error("Invalid channel iterator state.");
+        }
+
+        /// Validate that the iterator points at an existing chunk.
+        ///
+        /// \throws std::runtime_error if the iterator state is invalid (checked through `ensure_state`).
+        /// \throws std::out_of_range if the chunk index is at or past `num_chunks`, i.e. this is an end iterator.
+        void ensure_dereferenceable() const
+        {
+            ensure_state();
+
+            const bool at_or_past_end = !(m_state->chunk_index < m_state->num_chunks);
+            if (at_or_past_end)
+            {
+                throw std::out_of_range("Cannot dereference end channel iterator.");
+            }
+        }
+
+        /// Create the compression context held by the shared state if it does not exist yet.
+        ///
+        /// Does nothing when a context is already present. For GPU codecs the context is created for the device
+        /// reported by `cuda::current_device()`; for every other codec device 0 is passed along.
+        ///
+        /// \throws std::runtime_error if the iterator state is invalid (checked through `ensure_state`).
+        void ensure_context()
+        {
+            ensure_state();
+
+            auto& context_slot = m_state->context;
+            if (!context_slot.has_value())
+            {
+                int gpu_device = 0;
+                if (enums::is_gpu_codec(m_state->codec))
+                {
+                    gpu_device = cuda::current_device();
+                }
+
+                context_slot = create_context(
+                    m_state->codec,
+                    m_state->num_threads,
+                    m_state->compression_level,
+                    m_state->block_size,
+                    gpu_device
+                );
+            }
+        }
+
+        /// Decompress the chunk at the current index and expose it through `current_chunk`.
+        ///
+        /// Returns immediately if the chunk is already loaded. Otherwise the decompression buffer is sized for the
+        /// chunk, the chunk is decompressed into it and `current_chunk` is rebuilt as a view over that buffer. On
+        /// completion the state is marked as loaded and not dirty.
+        ///
+        /// NOTE(review): `m_state` is dereferenced before any validation runs -- callers are presumably expected to
+        /// have checked the iterator already; confirm against the call sites.
+        void load_current_chunk()
+        {
+            auto& st = *m_state;
+            if (st.loaded)
+            {
+                return;
+            }
+
+            ensure_context();
+
+            const size_t index = st.chunk_index;
+            // Element count of a chunk as configured, as opposed to the count the chunk at `index` actually holds.
+            const size_t nominal_chunk_elems = st.chunk_size / sizeof(storage_type);
+
+            const size_t chunk_elems = std::visit(
+                [&](const auto& schunk)
+                {
+                    return schunk.chunk_elements(index);
+                },
+                *st.schunk
+            );
+            const size_t schunk_total = std::visit(
+                [&](const auto& schunk)
+                {
+                    return schunk.size();
+                },
+                *st.schunk
+            );
+
+            // Optimize for small chunks by allocating at most what is held in total.
+            const size_t capacity = std::min(nominal_chunk_elems, schunk_total);
+
+            st.decompressed_buffer.ensure_capacity(capacity);
+            st.decompressed_buffer.refit(chunk_elems);
+
+            auto writable_buffer = st.decompressed_buffer.get();
+
+            // GPU codecs decompress without an explicit context, CPU codecs go through the CPU decompression context.
+            const bool on_gpu = enums::is_gpu_codec(st.codec);
+            std::visit(
+                [&](const auto& schunk)
+                {
+                    if (!on_gpu)
+                    {
+                        auto& cpu_context = std::get<cpu_compression_context>(*st.context);
+                        schunk.chunk(cpu_context.decompression_ctx.get(), writable_buffer, index);
+                        return;
+                    }
+
+                    schunk.chunk(writable_buffer, index);
+                },
+                *st.schunk
+            );
+
+            if constexpr (std::is_const_v<T>)
+            {
+                // A const iterator hands out a read-only view over the same buffer.
+                st.current_chunk = value_type(
+                    std::span<const storage_type>(writable_buffer.data(), writable_buffer.size()),
+                    st.width,
+                    st.height,
+                    index,
+                    nominal_chunk_elems
+                );
+            }
+            else
+            {
+                st.current_chunk = value_type(writable_buffer, st.width, st.height, index, nominal_chunk_elems);
+            }
+
+            st.dirty = false;
+            st.loaded = true;
+        }
+
+        /// Write the currently loaded chunk back into the super-chunk if it has been marked dirty.
+        ///
+        /// For a const element type the body is discarded at compile time and the call does nothing. Otherwise the
+        /// call is a no-op unless there is a state object whose chunk is loaded, dirty and within range. After a
+        /// successful write the dirty flag is cleared.
+        ///
+        /// \throws std::invalid_argument if the decompressed buffer does not hold exactly as many elements as the
+        ///         super-chunk reports for the current chunk.
+        void flush()
+        {
+            if constexpr (!std::is_const_v<T>)
+            {
+                const bool has_pending_write = m_state && m_state->loaded && m_state->dirty &&
+                                               m_state->chunk_index < m_state->num_chunks;
+                if (!has_pending_write)
+                {
+                    return;
+                }
+
+                ensure_context();
+
+                auto buffer = m_state->decompressed_buffer.get();
+
+                std::visit(
+                    [&](auto& schunk)
+                    {
+                        const auto expected_elems = schunk.chunk_elements(m_state->chunk_index);
+                        if (buffer.size() != expected_elems)
+                        {
+                            throw std::invalid_argument(
+                                std::format(
+                                    "Invalid iterator chunk buffer size. Expected {} elements, got {}.",
+                                    expected_elems,
+                                    buffer.size()
+                                )
+                            );
+                        }
+
+                        // Pick the context alternative that matches the codec family.
+                        if (!enums::is_gpu_codec(m_state->codec))
+                        {
+                            auto& cpu_context = std::get<cpu_compression_context>(*m_state->context);
+                            schunk.set_chunk(cpu_context.compression_ctx, buffer, m_state->chunk_index);
+                            return;
+                        }
+
+                        auto& gpu_context = std::get<gpu_compression_context>(*m_state->context);
+                        schunk.set_chunk(gpu_context.ctx, buffer, m_state->chunk_index);
+                    },
+                    *m_state->schunk
+                );
+
+                m_state->dirty = false;
+            }
+        }
+
+        /// Build the compression context that matches the given codec.
+        ///
+        /// \param codec             Codec to create the context for; decides between the CPU and the GPU alternative.
+        /// \param num_threads       Thread count for the CPU compression and decompression contexts. Unused for GPU
+        ///                          codecs.
+        /// \param compression_level Compression level, narrowed to `uint8_t` for the CPU context. Unused for GPU
+        ///                          codecs.
+        /// \param block_size        Block size forwarded to whichever context is created.
+        /// \param gpu_device        Device passed to the GPU context. Unused for CPU codecs.
+        /// \return The GPU context alternative for GPU codecs, the CPU context alternative otherwise.
+        static compression_context_var create_context(
+            const enums::codec codec,
+            const size_t num_threads,
+            const size_t compression_level,
+            const size_t block_size,
+            const int gpu_device
+        )
+        {
+            if (!enums::is_gpu_codec(codec))
+            {
+                return cpu_compression_context{
+                    .compression_ctx = blosc2::create_compression_context<storage_type>(
+                        num_threads,
+                        codec,
+                        static_cast<uint8_t>(compression_level),
+                        block_size
+                    ),
+                    .decompression_ctx = blosc2::create_decompression_context(num_threads),
+                    .nthreads = num_threads
+                };
+            }
+
+            return gpu_compression_context{
+                .ctx = cuda::make_compression_context<storage_type>(codec, gpu_device, block_size)
+            };
+        }
+    };
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/compressed_image/include/compressed/iterators/iterator.h b/compressed_image/include/compressed/iterators/iterator.h
deleted file mode 100644
index d436895..0000000
--- a/compressed_image/include/compressed/iterators/iterator.h
+++ /dev/null
@@ -1,308 +0,0 @@
-#pragma once
-
-#include <ranges>
-#include <vector>
-#include <span>
-#include <future>
-
-#include "compressed/detail/scoped_timer.h"
-#include "compressed/macros.h"
-#include "compressed/blosc2/wrapper.h"
-#include "compressed/containers/chunk_span.h"
-
-namespace NAMESPACE_COMPRESSED_IMAGE
-{
-
-	// Image iterator, cannot be used in parallel as it iterates the chunks. Dereferencing it gives a span view over the current decompressed 
-	// context.
-	template <typename T>
-	struct channel_iterator
-	{
-		// Iterator type definitions
-		using iterator_category = std::forward_iterator_tag;
-		using difference_type = std::ptrdiff_t;
-		using value_type = container::chunk_span<T>;
-		using pointer = value_type*;
-		using reference = value_type&;
-
-		channel_iterator() = default;
-
-		channel_iterator(
-			blosc2::schunk_var_ptr<T> schunk,
-			blosc2::context_raw_ptr compression_context,
-			blosc2::context_raw_ptr decompression_context,
-			size_t chunk_index,
-			size_t width,
-			size_t height
-			)
-			: m_Schunk(schunk),
-			m_CompressionContext(compression_context),
-			m_DecompressionContext(decompression_context),
-			m_ChunkIndex(chunk_index),
-			m_Width(width),
-			m_Height(height)
-		{
-			// Check that we are not out of range, throw if we are
-			std::visit([&](auto& schunk)
-				{
-					if (m_ChunkIndex > schunk.num_chunks())
-					{
-						throw std::out_of_range(
-							std::format(
-								"chunk_index is out of range for total number of chunks in blosc2_schunk."
-								" Max chunk number is {} but received {}",
-							schunk.num_chunks(), m_ChunkIndex
-							)
-						);
-					}
-				}, *m_Schunk);
-			
-			// Check that we don't pass zero width or height as e.g. the x() and y() functions of chunk_span require division by these dimensions
-			if (m_Width == 0 || m_Height == 0)
-			{
-				throw std::runtime_error(
-					std::format(
-						"passed zero width or height to iterator which is not valid, expected at least 1 pixel in either dimensions. Got [width: {} px, height: {} px]",
-						m_Width, m_Height
-					)
-				);
-			}
-		}
-
-		~channel_iterator()
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			// We need to ensure that the last chunk also gets compressed on destruction
-			// because of e.g. scope exit
-			if (m_DecompressionBufferWasRefitted)
-			{
-				compress_chunk(m_CompressionContext);
-				// If we iterated through the whole range at this point we'd have a
-				// chunk index == nchunks but the last chunk was not yet compressed. In this case
-				// we have to ensure we set the index back to compress again.
-				auto chunk_idx = m_ChunkIndex;
-				std::visit([&](auto& schunk)
-					{
-						if (m_ChunkIndex == schunk.num_chunks())
-						{
-							chunk_idx = chunk_idx - 1;
-						}
-					}, *m_Schunk);
-				update_chunk(chunk_idx);
-			}
-		}
-
-		/// Dereference operator: decompress the current chunk and recompress (if necessary) the previously compressed
-		/// chunk. value_type is a view over the current buffers. Iterator going out of scope while value_type is accessed is UB.
-		value_type operator*()
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-
-			// Initialize the data, this allows the base iterator to be copied over
-			// quite cheaply
-			if (!m_Initialized)
-			{
-				m_CompressionBuffer.resize(blosc2::min_compressed_size(this->chunk_bytes()));
-				m_CompressionBufferSize = m_CompressionBuffer.size();
-				m_DecompressionBuffer.resize(blosc2::min_decompressed_size(this->chunk_bytes()));
-				m_DecompressionBufferSize = m_DecompressionBuffer.size();
-				m_Initialized = true;
-			}
-
-			if (!this->valid())
-			{
-				throw std::runtime_error("Invalid Iterator struct encountered, cannot dereference item");
-			}
-
-			// Compress the previously decompressed chunk if it has been modified.
-			if (m_DecompressionBufferWasRefitted && m_ChunkIndex != 0)
-			{
-				this->compress_chunk(m_CompressionContext);
-				this->update_chunk(m_ChunkIndex - 1);
-			}
-
-			// In most cases m_Decompressed.fitted_data should be identical to m_Decompressed.data. However, this is not true
-			// for the last chunk in the schunk which may not be the same decompressed size.
-			this->decompress_chunk(m_DecompressionContext);
-
-			if (this->decompression_buffer_byte_size() % sizeof(T) != 0)
-			{
-				throw std::runtime_error(
-					std::format(
-						"Unable to dereference iterator as the decompressed size is not a multiple of {}." \
-						" Got {:L} bytes. This is likely an internal decompression error.",
-						sizeof(T), decompression_buffer_byte_size()
-					)
-				);
-			}
-
-			std::span<T> item_span(reinterpret_cast<T*>(m_DecompressionBuffer.data()), m_DecompressionBufferSize / sizeof(T));
-			return container::chunk_span<T>(item_span, m_Width, m_Height, m_ChunkIndex, this->chunk_bytes());
-		}
-
-		// Pre-increment operator: move to the next chunk
-		channel_iterator& operator++()
-		{
-			++m_ChunkIndex;
-			std::visit([&](auto& schunk)
-				{
-					if (m_ChunkIndex > schunk.num_chunks())
-					{
-						throw std::out_of_range("Iterator: count exceeds number of chunks");
-					};
-				}, *m_Schunk);
-			return *this;
-		}
-
-		channel_iterator& operator++(int)
-		{
-			channel_iterator temp = *this;
-			++(*this);
-			return temp;
-		}
-
-		bool operator==(const channel_iterator& other) const noexcept
-		{
-			return m_ChunkIndex == other.m_ChunkIndex && m_Schunk == other.m_Schunk;
-		}
-
-		bool operator!=(const channel_iterator& other) const noexcept
-		{
-			return m_ChunkIndex != other.m_ChunkIndex || m_Schunk != other.m_Schunk;
-		}
-
-		/// Return the chunk index the iterator is currently at.
-		size_t chunk_index() const noexcept { return m_ChunkIndex; }
-
-		/// Return the chunk size of all but the last chunk.
-		size_t chunk_elements() const noexcept
-		{
-			return std::visit([&](auto& schunk) -> size_t
-				{
-					return schunk.chunk_elements();
-				}, *m_Schunk);
-		}
-
-		/// Return the chunk size of all but the last chunk.
-		size_t chunk_bytes() const noexcept
-		{
-			return std::visit([&](auto& schunk) -> size_t
-				{
-					return schunk.chunk_bytes();
-				}, *m_Schunk);
-		}
-
-	private:
-
-		/// Buffers for storing compressed and decompressed data. These hold enough data for ChunkSize
-		/// but may be smaller, thus we keep track of m_CompressionBufferSize and m_DecompressionBufferSize
-		util::default_init_vector<std::byte> m_CompressionBuffer;
-		bool m_CompressionBufferWasRefitted = false;
-		size_t m_CompressionBufferSize = 0;	// The fitted size of the container (only holding the compressed size)
-
-		std::vector<std::byte> m_DecompressionBuffer;
-		bool m_DecompressionBufferWasRefitted = false;
-		size_t m_DecompressionBufferSize = 0;	// The fitted size of the container (only holding the decompressed size)
-
-		/// Pointers to the blosc2 structs. The data is owned by the `channel` struct and we just have a view over it.
-		blosc2::schunk_var_ptr<T> m_Schunk;
-		blosc2::context_raw_ptr m_CompressionContext = nullptr;
-		blosc2::context_raw_ptr	m_DecompressionContext = nullptr;
-
-		size_t m_ChunkIndex = 0;
-		size_t m_Width = 0;
-		size_t m_Height = 0;
-
-		/// this is set in the dereference operator to only initialize on first access
-		/// not on setup.
-		bool m_Initialized = false;
-
-	private:
-
-		size_t compression_buffer_byte_size() const noexcept
-		{
-			return m_CompressionBufferSize;
-		}
-
-		size_t compression_buffer_max_byte_size() const noexcept
-		{
-			return m_CompressionBuffer.size();
-		}
-
-		size_t decompression_buffer_byte_size() const noexcept
-		{
-			return m_DecompressionBufferSize;
-		}
-
-		size_t decompression_buffer_max_byte_size() const noexcept
-		{
-			return m_DecompressionBuffer.size();
-		}
-
-		/// Check for validity of this struct.
-		bool valid() const
-		{
-			if (!m_Schunk)
-			{
-				return false;
-			}
-			return std::visit([&](auto& schunk)
-				{
-					// Check that the schunk, compression and decompression ptrs are not null
-					bool ptrs_valid = m_Schunk && m_CompressionContext && m_DecompressionContext;
-					if (!ptrs_valid)
-					{
-						return false;
-					}
-
-					bool compression_size_valid = m_CompressionBufferSize <= m_CompressionBuffer.size();
-					bool decompression_size_valid = m_DecompressionBufferSize <= m_DecompressionBuffer.size();
-
-					bool idx_valid = m_ChunkIndex < schunk.num_chunks();
-					bool compressed_data_valid = compression_buffer_max_byte_size() >= blosc2::min_compressed_size(this->chunk_bytes());
-					bool decompressed_data_valid = decompression_buffer_max_byte_size() >= blosc2::min_decompressed_size(this->chunk_bytes());
-
-					return idx_valid && compressed_data_valid && decompressed_data_valid && compression_size_valid && decompression_size_valid;
-				}, *m_Schunk);
-		}
-
-		/// Decompress a chunk using the given context and chunk pointer. Decompressing into the buffer
-		void decompress_chunk(blosc2::context_raw_ptr decompression_context_ptr)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			auto buffer_span = std::span<T>(reinterpret_cast<T*>(m_DecompressionBuffer.data()), m_DecompressionBufferSize / sizeof(T));
-			
-			// apply the decompression.
-			std::visit([&](auto& schunk) 
-				{
-					schunk.chunk(decompression_context_ptr, buffer_span, m_ChunkIndex);
-					m_DecompressionBufferSize = schunk.chunk_bytes(m_ChunkIndex);
-					m_DecompressionBufferWasRefitted = true;
-				}, *m_Schunk);
-		}
-
-		/// Compress a chunk from the decompressed view into the compressed view
-		void compress_chunk(blosc2::context_raw_ptr compression_context_ptr)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			std::span<T> fitted = { reinterpret_cast<T*>(m_DecompressionBuffer.data()), m_DecompressionBufferSize / sizeof(T) };
-			auto compressed_size = blosc2::compress(compression_context_ptr, fitted, m_CompressionBuffer);
-			
-			m_CompressionBufferSize = compressed_size;
-			m_CompressionBufferWasRefitted = true;
-		}
-
-		/// Update and replace the chunk inside of the superchunk at the given index.
-		void update_chunk(size_t chunk_index)
-		{
-			_COMPRESSED_PROFILE_FUNCTION();
-			auto byte_span = std::span<std::byte>(m_CompressionBuffer.data(), this->compression_buffer_byte_size());
-			std::visit([&](auto& schunk)
-				{
-					schunk.set_chunk(byte_span, chunk_index);
-				}, *m_Schunk);
-		}
-	};
-
-
-} // NAMESPACE_COMPRESSED_IMAGE
\ No newline at end of file
diff --git a/compressed_image/include/compressed/logger.h b/compressed_image/include/compressed/logger.h
new file mode 100644
index 0000000..fcb0b39
--- /dev/null
+++ b/compressed_image/include/compressed/logger.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "macros.h"
+
+#include <spdlog/spdlog.h>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <memory>
+
+
+namespace NAMESPACE_COMPRESSED_IMAGE
+{
+    namespace detail
+    {
+        /// \brief The logger used by the library; null until one is set or lazily created.
+        ///
+        /// Declared `inline` rather than `static` so that every translation unit including this header refers to
+        /// the same object. With internal linkage each translation unit would hold its own copy, and a logger
+        /// installed through `set_logger` would only be visible to the translation unit that made the call.
+        inline std::shared_ptr<spdlog::logger> s_logger = nullptr;
+        /// \brief The default logger name used internally if the user does not provide one.
+        inline std::string s_default_logger_name = "compressed_image";
+    } // namespace detail
+
+
+    /// \brief Set the logger instance used by the compressed-image api.
+    ///
+    /// This function allows consumers of the library to provide their own `spdlog::logger` instance.
+    /// This can be useful to integrate the library's logging output into an existing logging system,
+    /// route messages to a file, or change verbosity dynamically.
+    ///
+    /// If no logger is set, the library will lazily create a default one that logs to `stdout` at info level.
+    /// Passing a null pointer clears the current logger, so the next `get_logger` call falls back to the default.
+    ///
+    /// \param logger The `spdlog::logger` instance to use for all library logging.
+    inline void set_logger(std::shared_ptr<spdlog::logger> logger)
+    {
+        detail::s_logger = logger;
+    }
+
+    /// \brief Retrieve the current logger instance used by the compressed-image api.
+    ///
+    /// If no logger has been previously set via `set_logger`, this function falls back to the logger registered
+    /// with spdlog under the name `"compressed_image"`. If no such logger is registered yet, one is created that
+    /// logs to standard output with color support at `spdlog::level::info` verbosity.
+    ///
+    /// \return A shared pointer to the currently active `spdlog::logger`.
+    inline std::shared_ptr<spdlog::logger> get_logger()
+    {
+        if (!detail::s_logger)
+        {
+            // Reuse a logger that is already registered under the default name: `spdlog::stdout_color_mt` throws
+            // `spdlog::spdlog_ex` when asked to create a second logger with an existing name. An existing logger
+            // keeps whatever level it was configured with.
+            detail::s_logger = spdlog::get(detail::s_default_logger_name);
+            if (!detail::s_logger)
+            {
+                // Lazy init with a sensible default
+                detail::s_logger = spdlog::stdout_color_mt(detail::s_default_logger_name);
+                detail::s_logger->set_level(spdlog::level::info);
+            }
+        }
+        return detail::s_logger;
+    }
+} // NAMESPACE_COMPRESSED_IMAGE
diff --git a/examples/gpu_compression/CMakeLists.txt b/examples/gpu_compression/CMakeLists.txt
new file mode 100644
index 0000000..469f5d7
--- /dev/null
+++ b/examples/gpu_compression/CMakeLists.txt
@@ -0,0 +1,2 @@
+# NOTE(review): this target was named `lazy_channels`, which looks copy-pasted from another example. CMake target
+# names must be unique within a build, so the target is named after this example's directory instead.
+add_executable(gpu_compression main.cpp)
+target_link_libraries(gpu_compression PRIVATE compressed_image)
diff --git a/examples/gpu_compression/main.cpp b/examples/gpu_compression/main.cpp
new file mode 100644
index 0000000..4f479fe
--- /dev/null
+++ b/examples/gpu_compression/main.cpp
@@ -0,0 +1,57 @@
+
+#include <string>
+#include <filesystem>
+#include <algorithm>
+#include <execution>
+
+#include <compressed/channel.h>
+
+
+auto main() -> int
+{
+	// The compressed_image API offers several factory functions that create lazy chunks for sparse data. Each lazy
+	// chunk is represented by a single value and only takes up a couple of bytes. This is especially useful when you
+	// plan to fill the channel with sparse data before handing it to an image or somewhere else.
+	auto channel_zeros = compressed::channel<uint16_t>::zeros(1920, 1080);
+	auto channel_full = compressed::channel<uint16_t>::full(1920, 1080, 65535 /* fill value */);
+
+	// Another channel can be mirrored directly as well, and that channel does not have to be lazy itself!
+	auto channel_zeros_like = compressed::channel<uint16_t>::zeros_like(channel_zeros);
+	auto channel_full_like = compressed::channel<uint16_t>::full_like(channel_full, 24 /* fill value */);
+
+	// Working with lazy channels requires slightly rethinking how chunks within a channel get modified. The usual
+	// `set_chunk` method turns the chunk into a non-lazy one, which uses more memory and is slower.
+	//
+	// So instead of:
+	for ([[maybe_unused]] auto chunk : channel_zeros)
+	{
+		// modify the chunk
+	}
+
+	// One should instead do the following:
+
+	// Allocate a scratch buffer with uninitialized data, it is filled by `get_chunk` right before use.
+	compressed::util::default_init_vector<uint16_t> scratch(channel_zeros.chunk_size());
+
+	// Placeholder for some arbitrary condition that decides whether a chunk needs to be touched at all.
+	const bool should_modify = true;
+
+	for (size_t i = 0; i < channel_zeros.num_chunks(); ++i)
+	{
+		// Only modify chunks conditionally so that the laziness of a chunk is not broken unless necessary.
+		if (!should_modify)
+		{
+			continue;
+		}
+
+		// Note: the view has to cover exactly chunk_elems(i) elements because the last chunk of a channel may be
+		// smaller than the other chunks. Sizing it this way means we never have to worry about the chunk size.
+		std::span<uint16_t> chunk_view(scratch.data(), channel_zeros.chunk_elems(i));
+
+		channel_zeros.get_chunk(chunk_view, i);
+
+		// modify the data to your heart's content
+
+		channel_zeros.set_chunk(chunk_view, i);
+	}
+
+	// Lazy chunks are not only a good way of generating sparse data, they are generally also the fastest way to
+	// initialize a channel that will be populated fully: they are very cheap to instantiate and you only pay the
+	// memory price as you go!
+}
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b202d9b..06f38cd 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -4,9 +4,9 @@ file(GLOB_RECURSE MY_SOURCES CONFIGURE_DEPENDS "src/*.cpp")
 enable_testing()
 add_executable(compressed_image_test ${MY_SOURCES} "main.cpp")
 
-if(MSVC)
-    target_compile_options(compressed_image_test PRIVATE /MP /utf-8)
-endif()
+# MSVC-only flags for the test executable: /MP builds sources in parallel, /utf-8 treats the source and execution
+# character sets as UTF-8 and /bigobj raises the number of sections a single object file may contain.
+if (MSVC)
+   target_compile_options(compressed_image_test PRIVATE /MP /utf-8 /bigobj)
+endif ()
 target_link_libraries(compressed_image_test PRIVATE compressed_image)
 target_link_libraries(compressed_image_test PRIVATE doctest)
 
@@ -15,8 +15,18 @@ add_test(test_compressed_image compressed_image_test)
 
 # Copy the images/ folder to the build dir to run the tests
 add_custom_command(TARGET compressed_image_test POST_BUILD
-                   COMMAND ${CMAKE_COMMAND} -E copy_directory
-                       ${CMAKE_CURRENT_SOURCE_DIR}/images/ $<TARGET_FILE_DIR:compressed_image_test>/images)
+   COMMAND ${CMAKE_COMMAND} -E copy_directory
+   ${CMAKE_CURRENT_SOURCE_DIR}/images/ $<TARGET_FILE_DIR:compressed_image_test>/images)
 add_custom_command(TARGET compressed_image_test POST_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E echo 
-                    "Finished copying test files to output directory $<TARGET_FILE_DIR:compressed_image_test>/images")
\ No newline at end of file
+   COMMAND ${CMAKE_COMMAND} -E echo
+   "Finished copying test files to output directory $<TARGET_FILE_DIR:compressed_image_test>/images")
+
+# When NVCOMP_RUNTIME_BINARIES is set, copy the listed files into the directory of the test executable as a
+# post-build step; `copy_if_different` skips files whose content has not changed.
+# NOTE(review): presumably these are the nvcomp shared libraries needed at runtime -- confirm where the variable is set.
+if (NVCOMP_RUNTIME_BINARIES)
+   add_custom_command(
+      TARGET compressed_image_test POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different
+      ${NVCOMP_RUNTIME_BINARIES}
+      $<TARGET_FILE_DIR:compressed_image_test>
+      COMMENT "Syncing nvcomp runtime dependencies next to test executable..."
+   )
+endif ()
\ No newline at end of file
diff --git a/test/main.cpp b/test/main.cpp
index 7c5687a..dee8622 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -2,103 +2,362 @@
 #define DOCTEST_CONFIG_IMPLEMENT
 #include "doctest.h"
 
+
 #include <filesystem>
 #include <vector>
 #include <memory>
 #include <format>
 #include <mutex>
 
+#define _COMPRESSED_PROFILE 1
+#include "compressed/detail/scoped_timer.h"
 
-/// Create a reporter which prints out failure statistics at the end
-struct FailureReporter : public doctest::ConsoleReporter
+struct test_log_reporter : public doctest::ConsoleReporter
 {
-	FailureReporter(const  doctest::ContextOptions& opt) : doctest::ConsoleReporter(opt) {}
+    const doctest::TestCaseData* tc = nullptr;
+    std::mutex mutex;
+
+    // Track both test cases and subcases for the final report summary
+    std::vector<std::pair<std::string, double>> test_durations;
+
+    // Flat raw structure captured during test runs
+    struct subcase_report
+    {
+        std::string name;
+        double seconds = 0.0;
+        size_t depth = 0;
+    };
+
+    std::vector<subcase_report> buffered_subcases;
+
+    // Reconstructed tree structure used for grouped reporting
+    struct subcase_node
+    {
+        std::string name;
+        double total_seconds = 0.0;
+        int call_count = 0;
+        std::vector<subcase_node> children;
+    };
+
+    test_log_reporter(const doctest::ContextOptions& opt) : doctest::ConsoleReporter(opt)
+    {
+    }
+
+    void test_case_start(const doctest::TestCaseData& in) override
+    {
+        std::lock_guard<std::mutex> lock(mutex);
+        tc = &in;
+        buffered_subcases.clear();
+        active_subcase_stack.clear();
+    }
+
+    void test_case_reenter(const doctest::TestCaseData& in) override
+    {
+        std::lock_guard<std::mutex> lock(mutex);
+        tc = &in;
+        active_subcase_stack.clear();
+    }
+
+    void test_case_end(const doctest::CurrentTestCaseStats& in) override
+    {
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!tc) return;
+
+        // Track the root parent test case
+        test_durations.push_back({std::string(tc->m_name), in.seconds});
+
+        constexpr int column_width = 120;
+        constexpr const char* blue = "\033[36m";
+        constexpr const char* red = "\033[31m";
+        constexpr const char* gold = "\033[33m";
+        constexpr const char* reset = "\033[0m";
+        constexpr const char* dim_dot = "\033[90m";
+
+        if (!in.testCaseSuccess)
+        {
+            std::string reason = "Assertion failure";
+            if (in.failure_flags & doctest::TestCaseFailureReason::Exception) reason = "Unhandled Exception";
+            else if (in.failure_flags & doctest::TestCaseFailureReason::Crash) reason = "Crash";
+            else if (in.failure_flags & doctest::TestCaseFailureReason::ShouldHaveFailedButDidnt)
+                reason = "Expected failure missing";
+
+            std::cout
+                << gold << "\n===============================================================================\n"
+                << blue << "[doctest]" << reset << " Failure in test case: " << red << tc->m_name << reset
+                << " | Reason: " << gold << reason << "\n"
+                << "===============================================================================\n" << reset
+                << std::endl;
+        }
+        else
+        {
+            std::string time_str = format_duration_clean(in.seconds);
+            std::string time_color = get_duration_color(in.seconds);
+            std::string right_side = std::format("{}{} ok{}", time_color, time_str, reset);
+            std::string right_side_plain = std::format("{} ok", time_str);
+
+            size_t max_name_len = column_width - std::string("[doctest] ").length() - right_side_plain.length() - 3;
+            std::string display_name = tc->m_name;
+            if (display_name.length() > max_name_len)
+            {
+                display_name = truncate_string(display_name, max_name_len);
+            }
+
+            std::string left_side = std::format("[doctest] {}", display_name);
+            int fill_dots = column_width - static_cast<int>(left_side.length() + right_side_plain.length());
+            if (fill_dots < 3) fill_dots = 3;
+
+            std::cout << left_side << dim_dot << std::string(fill_dots, '.') << reset << right_side << std::endl;
+        }
+
+        // 1. Reconstruct hierarchical tree from raw flat loop history
+        std::vector<subcase_node> root_nodes;
+        std::vector<std::string> current_path_names;
+
+        for (const auto& sub : buffered_subcases)
+        {
+            if (current_path_names.size() >= sub.depth)
+            {
+                current_path_names.resize(sub.depth - 1);
+            }
+            current_path_names.push_back(sub.name);
+
+            std::vector<subcase_node>* current_level = &root_nodes;
+            for (size_t d = 0; d < current_path_names.size(); ++d)
+            {
+                const std::string& name = current_path_names[d];
+                auto it = std::find_if(
+                    current_level->begin(),
+                    current_level->end(),
+                    [&name](const subcase_node& node) { return node.name == name; }
+                );
+
+                if (it == current_level->end())
+                {
+                    current_level->push_back(subcase_node{name, 0.0, 0, {}});
+                    it = current_level->end() - 1;
+                }
+
+                if (d == current_path_names.size() - 1)
+                {
+                    it->total_seconds += sub.seconds;
+                    it->call_count++;
+                }
+                current_level = &it->children;
+            }
+        }
+
+        // 2. Extract and recursively add subcase profiles into final summary metrics
+        collect_subcase_durations(root_nodes, std::string(tc->m_name));
+
+        // 3. Print the collapsed tree down the console pipe using clean ASCII configurations
+        print_subcase_tree(root_nodes, "", column_width);
 
-	void test_case_start(const doctest::TestCaseData& in) override { tc = &in; }
+        buffered_subcases.clear();
+        tc = nullptr;
+    }
 
+    void subcase_start(const doctest::SubcaseSignature& in) override
+    {
+        std::lock_guard<std::mutex> lock(mutex);
 
-	void test_case_end(const doctest::CurrentTestCaseStats& in) override
-	{
-		if (in.failure_flags == doctest::TestCaseFailureReason::Exception ||
-			in.failure_flags == doctest::TestCaseFailureReason::Crash ||
-			in.failure_flags == doctest::TestCaseFailureReason::ShouldHaveFailedButDidnt
-			)
-		{
-			std::lock_guard<std::mutex> lock(mutex);
-			if (tc)
-			{
-				std::string reason = "";
-				if (in.failure_flags == doctest::TestCaseFailureReason::Exception)
-				{
-					reason = "Exception";
-				}
-				else if (in.failure_flags == doctest::TestCaseFailureReason::Crash)
-				{
-					reason = "Crash";
-				}
-				else if (in.failure_flags == doctest::TestCaseFailureReason::ShouldHaveFailedButDidnt)
-				{
-					reason = "Expected failure not failing";
-				}
+        size_t idx = buffered_subcases.size();
+        buffered_subcases.push_back({in.m_name.c_str(), 0.0, active_subcase_stack.size() + 1});
+        active_subcase_stack.push_back({in.m_name.c_str(), std::chrono::high_resolution_clock::now(), idx});
+    }
 
-				constexpr const char* blue = "\033[36m";
-				constexpr const char* red = "\033[31m";
-				constexpr const char* gold = "\033[33m";
-				constexpr const char* reset = "\033[0m";
-				std::cout
-					<< gold << "===============================================================================\n"
-					<< blue << "[doctest]" << reset << " Failure in test case: " << red << std::string(tc->m_name) << reset
-					<< " with reason: " << gold << reason << "\n"
-					<< "===============================================================================\n" << reset
-					<< std::endl;
-			}
-		}
-		else
-		{
-			constexpr int column_width = 100;
-			std::cout << std::format("[doctest] {:.<{}} ok", std::string(tc->m_name), column_width - 12) << std::endl;
-		}
-	}
+    void subcase_end() override // Closes the innermost open subcase and stores its elapsed seconds.
+    {
+        auto end_time = std::chrono::high_resolution_clock::now(); // Sampled first, before any other work.
+        if (active_subcase_stack.empty()) return; // No open frame on this thread: nothing to time.
 
+        auto top = active_subcase_stack.back(); // Copy the frame out, then drop it from the stack.
+        active_subcase_stack.pop_back();
 
-	void report_query(const doctest::QueryData& /*in*/) override {}
+        std::chrono::duration<double> elapsed = end_time - top.start_time; // Duration expressed in seconds.
 
-	void test_run_start() override {}
+        std::lock_guard<std::mutex> lock(mutex); // Taken only for the write into buffered_subcases.
+        if (top.report_index < buffered_subcases.size()) // Skip the write if the entry no longer exists.
+        {
+            buffered_subcases[top.report_index].seconds = elapsed.count();
+        }
+    }
 
-	void test_run_end(const doctest::TestRunStats& /*in*/) override {}
+    void log_assert(const doctest::AssertData& in) override // Prints failed assertions; passing ones are ignored.
+    {
+        if (!in.m_failed) return; // Only failures produce output.
 
-	void test_case_reenter(const doctest::TestCaseData& /*in*/) override {}
+        std::lock_guard<std::mutex> lock(mutex); // Held while the whole message is written to std::cout.
+        constexpr const char* red = "\033[31m";
+        constexpr const char* gold = "\033[33m";
+        constexpr const char* reset = "\033[0m";
 
-	void test_case_exception(const doctest::TestCaseException& /*in*/) override {}
+        std::cout << "\n" // Emits file:line, the asserted expression and its decomposition.
+            << red << "  `-- ASSERTION FAILURE:\n" << reset
+            << "      " << gold << "File:   " << reset << in.m_file << ":" << in.m_line << "\n"
+            << "      " << gold << "Expr:   " << reset << in.m_expr << "\n"
+            << "      " << gold << "Decomp: " << red << in.m_decomp << reset << "\n"
+            << std::endl;
+    }
 
-	void subcase_start(const doctest::SubcaseSignature& /*in*/) override {}
+    void test_run_end(const doctest::TestRunStats& /*in*/) override // Prints the ten longest recorded durations.
+    {
+        std::lock_guard<std::mutex> lock(mutex);
+        if (test_durations.empty()) return; // Nothing was recorded: print no summary at all.
 
-	void subcase_end() override {}
+        constexpr const char* gold = "\033[33m";
+        constexpr const char* reset = "\033[0m";
 
-	void log_assert([[maybe_unused]] const doctest::AssertData& in) override {}
+        std::sort( // Descending by duration (the pair's second member); this reorders test_durations in place.
+            test_durations.begin(),
+            test_durations.end(),
+            [](const auto& a, const auto& b) { return a.second > b.second; }
+        );
 
-	void log_message(const doctest::MessageData& /*in*/) override {}
+        std::cout << "\n" << gold << "Slowest test paths & subcases (Top 10):" << reset << "\n";
 
-	void test_case_skipped(const doctest::TestCaseData& /*in*/) override {}
+        size_t display_count = std::min(size_t(10), test_durations.size()); // Fewer than ten entries is fine.
+        for (size_t i = 0; i < display_count; ++i)
+        {
+            const auto& [name, seconds] = test_durations[i];
+            std::string time_str = format_duration_clean(seconds, true); // Fixed-width form keeps rows aligned.
+            std::string time_color = get_duration_color(seconds);
 
+            std::cout << std::format("  [{}{}{}] {}\n", time_color, time_str, reset, name);
+        }
+        std::cout << std::endl;
+    }
+
+    void report_query(const doctest::QueryData&) override // Intentionally empty: this reporter prints nothing here.
+    {
+    }
+
+    void test_run_start() override // Intentionally empty: no work is done when the run starts.
+    {
+    }
+
+    void test_case_exception(const doctest::TestCaseException&) override // Intentionally empty: argument is ignored.
+    {
+    }
+
+    void log_message(const doctest::MessageData&) override // Intentionally empty: messages are not printed.
+    {
+    }
+
+    void test_case_skipped(const doctest::TestCaseData&) override // Intentionally empty: skipped cases print nothing.
+    {
+    }
 
 private:
+    struct subcase_timing // One open subcase: pushed in subcase_start, popped in subcase_end.
+    {
+        std::string name; // Subcase name copied from the doctest signature.
+        std::chrono::high_resolution_clock::time_point start_time; // Clock reading taken when the subcase started.
+        size_t report_index; // Position of the matching entry in buffered_subcases.
+    };
+
+    inline static thread_local std::vector<subcase_timing> active_subcase_stack; // thread_local: one per thread.
+
+    // Appends one {path, seconds} record to test_durations for every node in `nodes`, parents before children.
+    // A node's path is `parent_path`, then " > ", then the node's own name.
+    void collect_subcase_durations(const std::vector<subcase_node>& nodes, const std::string& parent_path)
+    {
+        for (const auto& node : nodes)
+        {
+            const std::string current_path = parent_path + " > " + node.name;
+            test_durations.push_back({current_path, node.total_seconds});
+
+            if (!node.children.empty())
+            {
+                collect_subcase_durations(node.children, current_path); // Children extend this node's path.
+            }
+        }
+    }
+
+    void print_subcase_tree(const std::vector<subcase_node>& nodes, const std::string& prefix, int column_width)
+    {
+        constexpr const char* reset = "\033[0m";
+        constexpr const char* dim_dot = "\033[90m";
+
+        for (size_t i = 0; i < nodes.size(); ++i) // One output line per node, then its children indented below it.
+        {
+            const auto& node = nodes[i];
+            bool is_last = (i == nodes.size() - 1);
+            std::string branch = is_last ? "`-- " : "|-- "; // The last sibling gets the closing corner glyph.
 
-	const doctest::TestCaseData* tc = nullptr;
-	std::mutex						mutex;
+            std::string display_name = node.name;
+
+            std::string sub_time_str = format_duration_clean(node.total_seconds);
+            std::string sub_time_color = get_duration_color(node.total_seconds);
+            std::string right_side = std::format("{}{} ok{}", sub_time_color, sub_time_str, reset);
+            std::string right_side_plain = std::format("{} ok", sub_time_str); // Same text without colour codes.
+            // NOTE(review): the budget below is unsigned; if the fixed parts exceed column_width it wraps around.
+            size_t left_base_len = 10 + prefix.length() + branch.length(); // 10 = the leading spaces of left_side.
+            size_t space_budget = column_width - left_base_len - right_side_plain.length() - 3;
+
+            if (display_name.length() > space_budget)
+            {
+                display_name = truncate_string(display_name, space_budget);
+            }
+
+            std::string left_side = std::format("          {}{}{}", prefix, branch, display_name);
+            int fill_dots = column_width - static_cast<int>(left_side.length() + right_side_plain.length());
+            if (fill_dots < 3) fill_dots = 3; // Always print at least three dots between name and timing.
+
+            std::cout << left_side << dim_dot << std::string(fill_dots, '.') << reset << right_side << std::endl;
+
+            if (!node.children.empty())
+            {
+                std::string next_prefix = prefix + (is_last ? "    " : "|   "); // Keep the rail under open branches.
+                print_subcase_tree(node.children, next_prefix, column_width);
+            }
+        }
+    }
+
+    static std::string format_duration_clean(double seconds, bool fixed_width = false) // Seconds -> short label.
+    {
+        if (fixed_width) // Right-aligned forms padded to 8 characters; test_run_end requests these.
+        {
+            if (seconds < 0.001) return std::format("{:>8}", "<1ms");
+            if (seconds < 1.0) return std::format("{:>6.1f}ms", seconds * 1000.0); // Milliseconds, one decimal.
+            return std::format("{:>6.2f}s ", seconds); // Trailing space makes up for the shorter "s" suffix.
+        }
+        if (seconds < 0.001) return "(<1ms)"; // Parenthesised forms; print_subcase_tree uses these.
+        if (seconds < 1.0) return std::format("({:.1f}ms)", seconds * 1000.0);
+        return std::format("({:.2f}s)", seconds);
+    }
+
+    static std::string get_duration_color(const double seconds) // ANSI colour escape chosen by duration in seconds.
+    {
+        if (seconds < 0.010) return "\033[90m"; // Dim Grey (<10ms)
+        if (seconds < 0.250) return "\033[32m"; // Clean Green (<250ms)
+        if (seconds < 1.000) return "\033[0m"; // Standard Text (<1s); this is the plain reset sequence.
+        if (seconds < 3.000) return "\033[33m"; // Warning Yellow (<3s)
+        return "\033[1;31m"; // Bold Panic Red (>=3s)
+    }
+
+    static std::string truncate_string(const std::string& str, size_t max_len) // Result never exceeds max_len.
+    {
+        if (str.length() <= max_len) return str; // Already fits: returned unchanged.
+        if (max_len <= 3) return std::string(max_len, '.'); // No room for text: only as many dots as fit.
+        return str.substr(0, max_len - 3) + "..."; // Keep a prefix and mark the cut with "...".
+    }
 };
 
-REGISTER_LISTENER("failure", /*priority=*/1, FailureReporter);
+REGISTER_LISTENER("test_log", /*priority=*/1, test_log_reporter);
 
 
 int main()
 {
-	doctest::Context context;
-	int res = context.run();
-
-	if (context.shouldExit())
-	{
-		return res;
-	}
-	return res;
-}
\ No newline at end of file
+    // Bracket the whole doctest run in a single profiling session named "Tests".
+    compressed::detail::Instrumentor::Get().BeginSession("Tests");
+
+    doctest::Context context;
+    int res = context.run();
+
+    // context.shouldExit() is deliberately not consulted: nothing runs after the tests, so the
+    // early-exit path and the normal path were identical (end the session, return res).
+    // A single exit keeps BeginSession/EndSession visibly paired.
+    compressed::detail::Instrumentor::Get().EndSession();
+    // The value returned by context.run() becomes the process exit code.
+    return res;
+}
diff --git a/test/src/test_channel.cpp b/test/src/test_channel.cpp
index 405bea2..429c525 100644
--- a/test/src/test_channel.cpp
+++ b/test/src/test_channel.cpp
@@ -8,36 +8,39 @@
 
 #include <OpenImageIO/half.h>
 
+#define _COMPRESSED_PROFILE 1
 #include <compressed/channel.h>
+#include <compressed/blosc2/schunk.h>
 #include <compressed/blosc2/wrapper.h>
 
 #include "util.h"
 
 
-
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
-TEST_CASE("Initialize channel from incorrect schunk"
-	* doctest::no_breaks(true)
-	* doctest::no_output(true)
-	* doctest::should_fail(true)
+TEST_CASE(
+    "Initialize channel from incorrect schunk"
+    * doctest::no_breaks(true)
+    * doctest::no_output(true)
+    * doctest::should_fail(true) // The case is expected to fail; the body has no CHECKs of its own.
 )
 {
-	auto schunk = compressed::blosc2::schunk<uint8_t>();
-	auto channel = compressed::channel<uint8_t>(std::move(schunk), 1, 1);
+    auto schunk = compressed::detail::schunk<uint8_t>(); // Default-constructed schunk.
+    auto channel = compressed::channel<uint8_t>(std::move(schunk), 1, 1); // presumably throws - TODO confirm
 }
 
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
-TEST_CASE("Initialize channel from incorrect span"
-	* doctest::no_breaks(true)
-	* doctest::no_output(true)
-	* doctest::should_fail(true)
+TEST_CASE(
+    "Initialize channel from incorrect span"
+    * doctest::no_breaks(true)
+    * doctest::no_output(true)
+    * doctest::should_fail(true) // The case is expected to fail; the body has no CHECKs of its own.
 )
 {
-	auto vec = std::vector<uint8_t>(50);
-	auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 1, 1);
+    auto vec = std::vector<uint8_t>(50); // 50 elements, which does not match the 1 x 1 passed below.
+    auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 1, 1); // presumably throws - TODO confirm
 }
 
 
@@ -45,29 +48,29 @@ TEST_CASE("Initialize channel from incorrect span"
 // -----------------------------------------------------------------------------------
 TEST_CASE("Empty channel creation")
 {
-	auto vec = std::vector<uint8_t>(0);
+    auto vec = std::vector<uint8_t>(0);
+
+    auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 0, 0);
 
-	auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 0, 0);
-	
-	CHECK(channel.uncompressed_size() == 0);
-	CHECK(channel.width() == 0);
-	CHECK(channel.height() == 0);
+    CHECK(channel.uncompressed_size() == 0);
+    CHECK(channel.width() == 0);
+    CHECK(channel.height() == 0);
 
-	auto decompressed = channel.get_decompressed();
-	CHECK(decompressed.size() == 0);
+    auto decompressed = channel.get_decompressed();
+    CHECK(decompressed.size() == 0);
 }
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("Roundtrip channel creation")
 {
-	auto vec = std::vector<uint8_t>(50);
-	std::iota(vec.begin(), vec.end(), 0);
+    auto vec = std::vector<uint8_t>(50);
+    std::iota(vec.begin(), vec.end(), 0); // Distinct values 0..49.
 
-	auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 10, 5);
-	auto roundtripped = channel.get_decompressed();
+    auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 10, 5); // 10 * 5 matches the 50 inputs.
+    auto roundtripped = channel.get_decompressed();
 
-	CHECK(vec == roundtripped);
+    CHECK(vec == roundtripped); // Decompressed data must equal the input element for element.
 }
 
 
@@ -75,68 +78,117 @@ TEST_CASE("Roundtrip channel creation")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Roundtrip channel creation larger than chunksize")
 {
-	auto vec = std::vector<uint8_t>(8192);
-	std::iota(vec.begin(), vec.end(), 0);
-
-	auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 128, 64, compressed::enums::codec::lz4, 9, 128, 4096);
-	auto roundtripped = channel.get_decompressed();
-
-	CHECK(vec == roundtripped);
+    auto vec = std::vector<uint8_t>(8192);
+    std::iota(vec.begin(), vec.end(), 0); // The int counter is narrowed on store, so values repeat every 256.
+
+    auto channel = compressed::channel<uint8_t>(
+        std::span<uint8_t>(vec),
+        128, // 128 * 64 = 8192, the number of input elements.
+        64,
+        compressed::enums::codec::lz4,
+        9,
+        128, // presumably the block size - TODO confirm against the channel constructor
+        4096 // presumably the chunk size; half the 8192 input elements - TODO confirm
+    );
+    auto roundtripped = channel.get_decompressed();
+
+    CHECK(vec == roundtripped); // Decompressed data must equal the input element for element.
 }
 
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
-TEST_CASE("Channel get attributes"
+TEST_CASE(
+    "Channel get attributes"
 )
 {
-	auto vec = std::vector<uint8_t>(50);
-	auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 10, 5, compressed::enums::codec::blosclz, 9);
-
-	CHECK(channel.width() == 10);
-	CHECK(channel.height() == 5);
-	CHECK(channel.compression() == compressed::enums::codec::blosclz);
-	CHECK(channel.compression_context() != nullptr);
-	CHECK(channel.decompression_context() != nullptr);
-	CHECK(channel.uncompressed_size() == 50);
-	CHECK(channel.num_chunks() == 1);
+    auto vec = std::vector<uint8_t>(50);
+    auto channel = compressed::channel<uint8_t>(std::span<uint8_t>(vec), 10, 5, compressed::enums::codec::blosclz, 9);
+
+    CHECK(channel.width() == 10); // Each getter must report the matching constructor argument.
+    CHECK(channel.height() == 5);
+    CHECK(channel.compression() == compressed::enums::codec::blosclz);
+    CHECK(channel.uncompressed_size() == 50); // Equals vec.size() for this uint8_t input.
+    CHECK(channel.num_chunks() == 1);
 }
 
-
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("Channel iterate")
 {
-	auto vec = std::vector<uint16_t>(128, 255);
-	auto channel = compressed::channel<uint16_t>(std::span<uint16_t>(vec), 16, 8);
-
-	SUBCASE("Read")
-	{
-		for (auto chunk_span : channel)
-		{
-			for (auto& pixel : chunk_span)
-			{
-				CHECK(pixel == 255);
-			}
-		}
-	}
-
-	SUBCASE("Modify")
-	{
-		for (auto chunk_span : channel)
-		{
-			for (auto& pixel : chunk_span)
-			{
-				pixel = 128;
-			}
-		}
-
-		for (auto chunk_span : channel)
-		{
-			for (auto& pixel : chunk_span)
-			{
-				CHECK(pixel == 128);
-			}
-		}
-	}
-}
\ No newline at end of file
+    auto vec = std::vector<uint16_t>(128, 255);
+    auto channel = compressed::channel<uint16_t>(std::span<uint16_t>(vec), 16, 8);
+
+    SUBCASE("Read")
+    {
+        size_t count = 0;
+
+        for (auto chunk_span : channel)
+        {
+            for (auto& pixel : chunk_span)
+            {
+                CHECK(pixel == 255);
+                ++count;
+            }
+        }
+
+        CHECK(count == vec.size());
+    }
+
+    SUBCASE("Modify")
+    {
+        for (auto chunk_span : channel)
+        {
+            for (auto& pixel : chunk_span)
+            {
+                pixel = 128;
+            }
+        }
+
+        for (auto chunk_span : channel)
+        {
+            for (auto& pixel : chunk_span)
+            {
+                CHECK(pixel == 128);
+            }
+        }
+
+        auto decompressed = channel.get_decompressed();
+        test_util::check_vector_verbose(decompressed, static_cast<uint16_t>(128));
+    }
+}
+
+
+// -----------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------
+TEST_CASE("Channel iterate multiple chunks")
+{
+    auto vec = std::vector<uint16_t>(128);
+    std::iota(vec.begin(), vec.end(), uint16_t{0}); // Fills the input with 0..127.
+
+    auto channel = compressed::channel<uint16_t>(
+        std::span<uint16_t>(vec),
+        16, // 16 * 8 = 128, the number of input elements.
+        8,
+        compressed::enums::codec::lz4,
+        9,
+        64, // presumably the block size - TODO confirm against the channel constructor
+        64 // presumably the chunk size, chosen so the data spans several chunks - TODO confirm
+    );
+
+    size_t count = 0; // Pixels visited across all chunk spans.
+    for (auto chunk_span : channel)
+    {
+        for (auto& pixel : chunk_span)
+        {
+            pixel = 42; // Written through the reference handed out by the span.
+            ++count;
+        }
+    }
+
+    CHECK(count == vec.size()); // Iteration must visit every element exactly once.
+
+    auto decompressed = channel.get_decompressed();
+    CHECK(decompressed.size() == vec.size());
+    CHECK(std::ranges::all_of(decompressed, [](auto value) { return value == 42; })); // Every write was kept.
+}
diff --git a/test/src/test_chunk_span.cpp b/test/src/test_chunk_span.cpp
index f5dbffe..72bc633 100644
--- a/test/src/test_chunk_span.cpp
+++ b/test/src/test_chunk_span.cpp
@@ -6,6 +6,7 @@
 #include <algorithm>
 #include <string>
 
+#define _COMPRESSED_PROFILE 1
 #include <compressed/containers/chunk_span.h>
 
 #include "util.h"
@@ -15,14 +16,20 @@
 // -----------------------------------------------------------------------------------
 TEST_CASE("Get coordinates in base-chunk")
 {
-	std::vector<uint8_t> data(50);
-	auto span_container = compressed::container::chunk_span<uint8_t>(std::span<uint8_t>(data), 10, 5, 0, compressed::s_default_chunksize);
+    std::vector<uint8_t> data(50);
+    auto span_container = compressed::container::chunk_span<uint8_t>(
+        std::span<uint8_t>(data),
+        10, // presumably the image width - TODO confirm against chunk_span's constructor
+        5, // presumably the image height - TODO confirm
+        0, // presumably the chunk index (the test name says base chunk) - TODO confirm
+        compressed::s_default_chunksize
+    );
 
-	CHECK(span_container.x(9) == 9);
-	CHECK(span_container.y(5) == 0);
+    CHECK(span_container.x(9) == 9);
+    CHECK(span_container.y(5) == 0);
 
-	CHECK(span_container.x(15) == 5);
-	CHECK(span_container.y(15) == 1);
+    CHECK(span_container.x(15) == 5); // Consistent with 15 % 10 for a row length of 10.
+    CHECK(span_container.y(15) == 1); // Consistent with 15 / 10 for a row length of 10.
 }
 
 
@@ -30,13 +37,13 @@ TEST_CASE("Get coordinates in base-chunk")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Get coordinates in non-base chunk")
 {
-	std::vector<uint8_t> data(50);
-	auto span_container = compressed::container::chunk_span<uint8_t>(std::span<uint8_t>(data), 128, 128, 1, 128);
+    std::vector<uint8_t> data(50);
+    auto span_container = compressed::container::chunk_span<uint8_t>(std::span<uint8_t>(data), 128, 128, 1, 128);
 
-	CHECK(span_container.x(9) == 9);
-	CHECK(span_container.y(5) == 1);
-	CHECK(span_container.x(135) == 7);
-	CHECK(span_container.y(129) == 2);
+    CHECK(span_container.x(9) == 9);
+    CHECK(span_container.y(5) == 1); // Consistent with local indices being offset by one 128-element chunk.
+    CHECK(span_container.x(135) == 7); // (128 + 135) % 128 under that reading - TODO confirm.
+    CHECK(span_container.y(129) == 2); // (128 + 129) / 128 under that reading - TODO confirm.
 }
 
 
@@ -44,14 +51,20 @@ TEST_CASE("Get coordinates in non-base chunk")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Iter over chunk")
 {
-	std::vector<uint8_t> data(50, 5);
-	auto span_container = compressed::container::chunk_span<uint8_t>(std::span<uint8_t>(data), 50, 1, 0, compressed::s_default_chunksize);
-
-	size_t count = 0;
-	for (const auto& pixel : span_container)
-	{
-		CHECK(pixel == 5);
-		++count;
-	}
-	CHECK(count == 50);
+    std::vector<uint8_t> data(50, 5); // 50 elements, all equal to 5.
+    auto span_container = compressed::container::chunk_span<uint8_t>(
+        std::span<uint8_t>(data),
+        50,
+        1,
+        0,
+        compressed::s_default_chunksize
+    );
+
+    size_t count = 0; // Elements visited by the range-for below.
+    for (const auto& pixel : span_container)
+    {
+        CHECK(pixel == 5); // Every visited element must carry the fill value.
+        ++count;
+    }
+    CHECK(count == 50); // Iteration must cover exactly the 50 elements of data.
 }
diff --git a/test/src/test_image.cpp b/test/src/test_image.cpp
index 0810ab1..fa0298f 100644
--- a/test/src/test_image.cpp
+++ b/test/src/test_image.cpp
@@ -2,14 +2,12 @@
 
 #include <ranges>
 #include <string>
-#include <cstdint>
 
 #include <OpenImageIO/half.h>
 
+#define _COMPRESSED_PROFILE 1
 #include <compressed/image.h>
 #include <compressed/ranges.h>
-#include <compressed/util.h>
-#include <compressed/blosc2/wrapper.h>
 
 #include "util.h"
 
@@ -18,67 +16,82 @@
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file smaller than one chunk")
 {
-	std::string name = "uv_grid_2048x2048.jpg";
-	auto path = std::filesystem::current_path() / "images" / name;
-
-	auto image = compressed::image<uint8_t>::read(
-		path, 
-		0,
-		compressed::enums::codec::lz4, 
-		9,
-		compressed::s_default_blocksize, 
-		compressed::s_default_chunksize * 2
-	);
-	auto image_data = image.get_decompressed();
-	auto image_ref = test_util::read_oiio<uint8_t>(path);
-
-
-	test_util::compare_images(image_data, image_ref, name);
+    test_util::parametrize_codecs(
+        [&](compressed::enums::codec codec)
+        {
+            std::string name = "uv_grid_2048x2048.jpg";
+            auto path = std::filesystem::current_path() / "images" / name;
+
+            auto image = compressed::image<uint8_t>::read(
+                path,
+                0,
+                codec,
+                9,
+                compressed::s_default_blocksize,
+                compressed::s_default_chunksize * 2
+            );
+            auto image_data = image.get_decompressed();
+            auto image_ref = test_util::read_oiio<uint8_t>(path);
+
+
+            test_util::compare_images(image_data, image_ref, name);
+        }
+    );
 }
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed tiled file and extract channels")
 {
-	std::string name = "tiled_cryptomatte.exr";
-	auto path = std::filesystem::current_path() / "images" / name;
-
-	auto image = compressed::image<uint8_t>::read(
-		path,
-		0,
-		compressed::enums::codec::lz4,
-		9,
-		compressed::s_default_blocksize,
-		compressed::s_default_chunksize * 2
-	);
-	auto image_data = image.get_decompressed();
-	auto image_ref = test_util::read_oiio<uint8_t>(path);
-
-	test_util::compare_images(image_data, image_ref, name);
+    test_util::parametrize_codecs(
+        [&](compressed::enums::codec codec)
+        {
+            std::string name = "tiled_cryptomatte.exr";
+            auto path = std::filesystem::current_path() / "images" / name;
+
+            auto image = compressed::image<uint8_t>::read(
+                path,
+                0,
+                codec,
+                9,
+                compressed::s_default_blocksize,
+                compressed::s_default_chunksize * 2
+            );
+            auto image_data = image.get_decompressed();
+            auto image_ref = test_util::read_oiio<uint8_t>(path);
+
+            test_util::compare_images(image_data, image_ref, name);
+        }
+    );
 }
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed multipart file and extract channels")
 {
-	std::string name = "multipart.0001.exr";
-	auto path = std::filesystem::current_path() / "images" / name;
-
-	for (int subimage = 0; subimage < 10; ++subimage)
-	{
-		auto image = compressed::image<uint8_t>::read(
-			path,
-			subimage,
-			compressed::enums::codec::lz4,
-			9,
-			compressed::s_default_blocksize,
-			compressed::s_default_chunksize * 2
-		);
-		auto image_data = image.get_decompressed();
-		auto image_ref = test_util::read_oiio<uint8_t>(path, subimage);
-
-		test_util::compare_images(image_data, image_ref, name);
-	}
+    test_util::parametrize_codecs(
+        [&](compressed::enums::codec codec)
+        {
+            std::string name = "multipart.0001.exr";
+            auto path = std::filesystem::current_path() / "images" / name;
+
+            for (int subimage = 0; subimage < 10; ++subimage)
+            {
+                auto image = compressed::image<uint8_t>::read(
+                    path,
+                    subimage,
+                    codec,
+                    9,
+                    compressed::s_default_blocksize,
+                    compressed::s_default_chunksize * 2
+                );
+                auto image_data = image.get_decompressed();
+                auto image_ref = test_util::read_oiio<uint8_t>(path, subimage);
+
+                test_util::compare_images(image_data, image_ref, name);
+            }
+        }
+    );
 }
 
 
@@ -86,26 +99,31 @@ TEST_CASE("Read compressed multipart file and extract channels")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file and extract channels")
 {
-	std::string name = "uv_grid_2048x2048.jpg";
-	auto path = std::filesystem::current_path() / "images" / name;
-
-	auto image = compressed::image<uint8_t>::read(path);
-
-	std::vector<std::vector<uint8_t>> decompressed;
-	for ([[maybe_unused]] auto _: std::views::iota(size_t{ 0 }, image.num_channels()))
-	{
-		// Since we keep pulling out the channels the indices change back to zero
-		auto channel = image.extract_channel(0);
-		decompressed.push_back(channel.get_decompressed());
-	}
-	auto image_ref = test_util::read_oiio<uint8_t>(path);
-
-	// Since we extracted the channels, the number of channels should be zero with the channelnames 
-	// also being empty
-	CHECK(image.num_channels() == 0);
-	CHECK(image.channelnames() == std::vector<std::string>{});
-
-	test_util::compare_images(decompressed, image_ref, name);
+    test_util::parametrize_codecs(
+        [&](compressed::enums::codec codec)
+        {
+            std::string name = "uv_grid_2048x2048.jpg";
+            auto path = std::filesystem::current_path() / "images" / name;
+
+            auto image = compressed::image<uint8_t>::read(path, 0, codec);
+
+            std::vector<std::vector<uint8_t>> decompressed;
+            for ([[maybe_unused]] auto _ : std::views::iota(size_t{0}, image.num_channels()))
+            {
+                // Since we keep pulling out the channels the indices change back to zero
+                auto channel = image.extract_channel(0);
+                decompressed.push_back(channel.get_decompressed());
+            }
+            auto image_ref = test_util::read_oiio<uint8_t>(path);
+
+            // Since we extracted the channels, the number of channels should be zero with the channelnames
+            // also being empty
+            CHECK(image.num_channels() == 0);
+            CHECK(image.channelnames() == std::vector<std::string>{});
+
+            test_util::compare_images(decompressed, image_ref, name);
+        }
+    );
 }
 
 
@@ -113,17 +131,17 @@ TEST_CASE("Read compressed file and extract channels")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file get attributes")
 {
-	std::string name = "uv_grid_2048x2048.jpg";
-	auto path = std::filesystem::current_path() / "images" / name;
+    std::string name = "uv_grid_2048x2048.jpg";
+    auto path = std::filesystem::current_path() / "images" / name;
 
-	auto image = compressed::image<uint8_t>::read(path);
+    auto image = compressed::image<uint8_t>::read(path);
 
-	CHECK(image.width() == 2048);
-	CHECK(image.height() == 2048);
-	CHECK(image.num_channels() == 3);
-	CHECK(image.channelnames() == std::vector<std::string>{"R", "G", "B"});
-	CHECK(image.metadata().size() > 0);
-	CHECK(image.chunk_size() == compressed::s_default_chunksize);
+    CHECK(image.width() == 2048);
+    CHECK(image.height() == 2048);
+    CHECK(image.num_channels() == 3);
+    CHECK(image.channelnames() == std::vector<std::string>{"R", "G", "B"});
+    CHECK(image.metadata().size() > 0);
+    CHECK(image.chunk_size() == compressed::s_default_chunksize);
 }
 
 
@@ -131,15 +149,20 @@ TEST_CASE("Read compressed file get attributes")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file exactly than one chunk")
 {
-	std::string name = "uv_grid_2048x2048.jpg";
-	auto path = std::filesystem::current_path() / "images" / name;
+    test_util::parametrize_codecs(
+        [&](compressed::enums::codec codec)
+        {
+            std::string name = "uv_grid_2048x2048.jpg";
+            auto path = std::filesystem::current_path() / "images" / name;
 
-	auto image = compressed::image<uint8_t>::read(path);
-	auto image_data = image.get_decompressed();
-	auto image_ref = test_util::read_oiio<uint8_t>(path);
+            auto image = compressed::image<uint8_t>::read(path, 0, codec);
+            auto image_data = image.get_decompressed();
+            auto image_ref = test_util::read_oiio<uint8_t>(path);
 
 
-	test_util::compare_images(image_data, image_ref, name);
+            test_util::compare_images(image_data, image_ref, name);
+        }
+    );
 }
 
 
@@ -147,22 +170,27 @@ TEST_CASE("Read compressed file exactly than one chunk")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file larger than one chunk")
 {
-	std::string name = "multilayer_2560x1440.exr";
-	auto path = std::filesystem::current_path() / "images" / name;
-
-	auto image = compressed::image<float>::read(
-		path,
-		0,
-		compressed::enums::codec::lz4, 
-		9, 
-		compressed::s_default_blocksize, 
-		compressed::s_default_chunksize / 2
-	);
-	auto image_data = image.get_decompressed();
-	auto image_ref = test_util::read_oiio<float>(path);
-
-
-	test_util::compare_images(image_data, image_ref, name);
+    test_util::parametrize_codecs(
+        [&](compressed::enums::codec codec)
+        {
+            std::string name = "multilayer_2560x1440.exr";
+            auto path = std::filesystem::current_path() / "images" / name;
+
+            auto image = compressed::image<float>::read(
+                path,
+                0,
+                codec,
+                9,
+                compressed::s_default_blocksize,
+                compressed::s_default_chunksize / 2
+            );
+            auto image_data = image.get_decompressed();
+            auto image_ref = test_util::read_oiio<float>(path);
+
+
+            test_util::compare_images(image_data, image_ref, name);
+        }
+    );
 }
 
 
@@ -170,25 +198,33 @@ TEST_CASE("Read compressed file larger than one chunk")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file, subset of channel indices")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				{ 0, 1, 2, 3 },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{"R", "G", "B", "A"});
-		});
+    test_util::parametrize_codecs(
+        [&](compressed::enums::codec codec)
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type)
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name;
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        {0, 1, 2, 3},
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2
+                    );
+
+                    CHECK(image.num_channels() == 4);
+                    CHECK(image.channelnames() == std::vector<std::string>{"R", "G", "B", "A"});
+                }
+
+            );
+        }
+    );
 }
 
 
@@ -196,25 +232,32 @@ TEST_CASE("Read compressed file, subset of channel indices")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file, non contiguous channel indices")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				{ 0, 2, 3, 11 },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
-		});
+    test_util::parametrize_codecs( // read a gapped set of channel indices and verify count and names
+        [&](compressed::enums::codec codec) // codec is forwarded to image<T>::read() below
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        {0, 2, 3, 11}, // indices 1 and 4..10 are skipped on purpose
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2 // half the default chunk size
+                    );
+
+                    CHECK(image.num_channels() == 4);
+                    CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
+                }
+            );
+        }
+    );
 }
 
 
@@ -222,57 +265,66 @@ TEST_CASE("Read compressed file, non contiguous channel indices")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file, non contiguous channel indices, out of order")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				{ 11, 0, 2, 3 },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			// Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should
-			// have the same ordering as the file
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
-		});
+    test_util::parametrize_codecs( // request gapped channel indices out of order and verify the resulting order
+        [&](compressed::enums::codec codec) // codec is forwarded to image<T>::read() below
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        {11, 0, 2, 3}, // index 11 is deliberately listed before the lower indices
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2 // half the default chunk size
+                    );
+
+                    // "VRayCryptomatte00.R" is requested first but appears later in the file, so the returned
+                    // channels should keep the file's ordering rather than the requested one
+                    CHECK(image.num_channels() == 4);
+                    CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
+                }
+            );
+        }
+    );
 }
 
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE(
-	"Read compressed file, invalid channel index"
-	* doctest::no_breaks(true)
-	* doctest::no_output(true)
-	* doctest::should_fail(true)
+    "Read compressed file, invalid channel index"
+    * doctest::no_breaks(true) // do not break into the debugger on the expected failure
+    * doctest::no_output(true) // suppress assertion output for the expected failure
+    * doctest::should_fail(true) // the case passes only when its body fails
 )
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			// this should fail as this file does not have a 64th channel
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				{ 0, 1, 64 },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(
+        [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+        {
+            std::string name = "multilayer_2560x1440.exr";
+            auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+            auto input_ptr = OIIO::ImageInput::open(path.string());
+
+            // this should fail as this file does not have a channel at index 64 (the case is marked should_fail)
+            auto image = compressed::image<T>::read(
+                std::move(input_ptr),
+                {0, 1, 64},
+                0,
+                compressed::enums::codec::lz4, // NOTE(review): fixed codec; other read tests use parametrize_codecs
+                9,
+                compressed::s_default_blocksize,
+                compressed::s_default_chunksize / 2 // half the default chunk size
+            );
+        }
+
+    );
 }
 
 
@@ -280,25 +332,33 @@ TEST_CASE(
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file, subset of channel names")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				{ "R", "G", "B", "A" },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{"R", "G", "B", "A"});
-		});
+    test_util::parametrize_codecs( // read four channels by name and verify their count and names
+        [&](compressed::enums::codec codec) // codec is forwarded to image<T>::read() below
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        {"R", "G", "B", "A"}, // channel names to load
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2 // half the default chunk size
+                    );
+
+                    CHECK(image.num_channels() == 4);
+                    CHECK(image.channelnames() == std::vector<std::string>{"R", "G", "B", "A"});
+                }
+
+            );
+        }
+    );
 }
 
 
@@ -306,25 +366,32 @@ TEST_CASE("Read compressed file, subset of channel names")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file, non contiguous channel names")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				{ "R", "B", "A", "VRayCryptomatte00.R" },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
-		});
+    test_util::parametrize_codecs( // read a gapped set of channels by name and verify count and names
+        [&](compressed::enums::codec codec) // codec is forwarded to image<T>::read() below
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        {"R", "B", "A", "VRayCryptomatte00.R"}, // "G" is left out on purpose
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2 // half the default chunk size
+                    );
+
+                    CHECK(image.num_channels() == 4);
+                    CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
+                }
+            );
+        }
+    );
 }
 
 
@@ -332,98 +399,113 @@ TEST_CASE("Read compressed file, non contiguous channel names")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file, non contiguous channel names, out of order")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				{ "VRayCryptomatte00.R", "R", "B", "A" },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			// Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should
-			// have the same ordering as the file
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
-		});
+    test_util::parametrize_codecs( // request channel names out of file order and verify the resulting order
+        [&](compressed::enums::codec codec) // codec is forwarded to image<T>::read() below
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        {"VRayCryptomatte00.R", "R", "B", "A"}, // deliberately not in file order
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2 // half the default chunk size
+                    );
+
+                    // "VRayCryptomatte00.R" is requested first but appears later in the file, so the returned
+                    // channels should keep the file's ordering rather than the requested one
+                    CHECK(image.num_channels() == 4);
+                    CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
+                }
+            );
+        }
+    );
 }
 
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE(
-	"Read compressed file, invalid channel name"
-	* doctest::no_breaks(true)
-	* doctest::no_output(true)
-	* doctest::should_fail(true)
+    "Read compressed file, invalid channel name"
+    * doctest::no_breaks(true) // do not break into the debugger on the expected failure
+    * doctest::no_output(true) // suppress assertion output for the expected failure
+    * doctest::should_fail(true) // the case passes only when its body fails
 )
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			// this should fail as this file does not have a z channel
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				{ "R", "G", "Z" },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(
+        [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+        {
+            std::string name = "multilayer_2560x1440.exr";
+            auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+            auto input_ptr = OIIO::ImageInput::open(path.string());
+
+            // this should fail as this file does not have a "Z" channel (the case is marked should_fail)
+            auto image = compressed::image<T>::read(
+                std::move(input_ptr),
+                {"R", "G", "Z"},
+                0,
+                compressed::enums::codec::lz4, // NOTE(review): fixed codec; other read tests use parametrize_codecs
+                9,
+                compressed::s_default_blocksize,
+                compressed::s_default_chunksize / 2 // half the default chunk size
+            );
+        }
+    );
 }
 
 
-
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file with postprocess, subset of channel names")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				[]([[maybe_unused]] size_t channel_idx, std::span<T> values)
-				{
-					for (auto& value : values)
-					{
-						value = static_cast<T>(25);
-					}
-				},
-				{ "R", "G", "B", "A" },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{"R", "G", "B", "A"});
-
-			// Check that our postprocess worked
-			auto decompressed = image.get_decompressed();
-			for (const auto& channel : decompressed)
-			{
-				test_util::check_vector_verbose(channel, static_cast<T>(25));
-			}
-		});
+    test_util::parametrize_codecs( // read four named channels through a postprocess callback and verify the result
+        [&](compressed::enums::codec codec) // codec is forwarded to image<T>::read() below
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        []([[maybe_unused]] size_t channel_idx, std::span<T> values) // postprocess callback
+                        {
+                            for (auto& value : values)
+                            {
+                                value = static_cast<T>(25); // sentinel verified after decompression below
+                            }
+                        },
+                        {"R", "G", "B", "A"}, // channel names to load
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2 // half the default chunk size
+                    );
+
+                    CHECK(image.num_channels() == 4);
+                    CHECK(image.channelnames() == std::vector<std::string>{"R", "G", "B", "A"});
+
+                    // Check that our postprocess worked: every decompressed sample must equal the sentinel 25
+                    auto decompressed = image.get_decompressed();
+                    for (const auto& channel : decompressed)
+                    {
+                        test_util::check_vector_verbose(channel, static_cast<T>(25));
+                    }
+                }
+
+            );
+        }
+    );
 }
 
 
@@ -431,39 +513,51 @@ TEST_CASE("Read compressed file with postprocess, subset of channel names")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file with postprocess, non contiguous channel names")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				[]([[maybe_unused]] size_t channel_idx, std::span<T> values)
-				{
-					for (auto& value : values)
-					{
-						value = static_cast<T>(25);
-					}
-				},
-				{ "R", "B", "A", "VRayCryptomatte00.R" },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
-
-			// Check that our postprocess worked
-			auto decompressed = image.get_decompressed();
-			for (const auto& channel : decompressed)
-			{
-				test_util::check_vector_verbose(channel, static_cast<T>(25));
-			}
-		});
+    test_util::parametrize_codecs( // read gapped named channels through a postprocess callback and verify the result
+        [&](compressed::enums::codec codec) // codec is forwarded to image<T>::read() below
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        []([[maybe_unused]] size_t channel_idx, std::span<T> values) // postprocess callback
+                        {
+                            for (auto& value : values)
+                            {
+                                value = static_cast<T>(25); // sentinel verified after decompression below
+                            }
+                        },
+                        {"R", "B", "A", "VRayCryptomatte00.R"}, // "G" is left out on purpose
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2 // half the default chunk size
+                    );
+
+                    CHECK(image.num_channels() == 4);
+                    CHECK( // NOTE(review): initializer list below is split awkwardly, presumably by clang-format
+                        image.channelnames() == std::vector<std::string>{ "R",
+                        "B",
+                        "A",
+                        "VRayCryptomatte00.R"}
+                    );
+
+                    // Check that our postprocess worked: every decompressed sample must equal the sentinel 25
+                    auto decompressed = image.get_decompressed();
+                    for (const auto& channel : decompressed)
+                    {
+                        test_util::check_vector_verbose(channel, static_cast<T>(25));
+                    }
+                }
+            );
+        }
+    );
 }
 
 
@@ -471,131 +565,144 @@ TEST_CASE("Read compressed file with postprocess, non contiguous channel names")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Read compressed file with postprocess, non contiguous channel names, out of order")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				[]([[maybe_unused]] size_t channel_idx, std::span<T> values)
-				{
-					for (auto& value : values)
-					{
-						value = static_cast<T>(25);
-					}
-				},
-				{ "VRayCryptomatte00.R", "R", "B", "A" },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-
-			// Despite us specifying "VRayCryptomatte00.R" first, since it appears later in the channels this should
-			// have the same ordering as the file
-			CHECK(image.num_channels() == 4);
-			CHECK(image.channelnames() == std::vector<std::string>{ "R", "B", "A", "VRayCryptomatte00.R"});
-
-			// Check that our postprocess worked
-			auto decompressed = image.get_decompressed();
-			for (const auto& channel : decompressed)
-			{
-				test_util::check_vector_verbose(channel, static_cast<T>(25));
-			}
-		});
+    test_util::parametrize_codecs( // out-of-order named channels read through a postprocess callback
+        [&](compressed::enums::codec codec) // codec is forwarded to image<T>::read() below
+        {
+            test_util::parametrize<uint8_t, uint16_t, uint32_t, Imath::half, float>(
+                [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+                {
+                    std::string name = "multilayer_2560x1440.exr";
+                    auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+                    auto input_ptr = OIIO::ImageInput::open(path.string());
+
+                    auto image = compressed::image<T>::read(
+                        std::move(input_ptr),
+                        []([[maybe_unused]] size_t channel_idx, std::span<T> values) // postprocess callback
+                        {
+                            for (auto& value : values)
+                            {
+                                value = static_cast<T>(25); // sentinel verified after decompression below
+                            }
+                        },
+                        {"VRayCryptomatte00.R", "R", "B", "A"}, // deliberately not in file order
+                        0,
+                        codec,
+                        9,
+                        compressed::s_default_blocksize,
+                        compressed::s_default_chunksize / 2 // half the default chunk size
+                    );
+
+                    // "VRayCryptomatte00.R" is requested first but appears later in the file, so the returned
+                    // channels should keep the file's ordering rather than the requested one
+                    CHECK(image.num_channels() == 4);
+                    CHECK( // NOTE(review): initializer list below is split awkwardly, presumably by clang-format
+                        image.channelnames() == std::vector<std::string>{ "R",
+                        "B",
+                        "A",
+                        "VRayCryptomatte00.R"}
+                    );
+
+                    // Check that our postprocess worked: every decompressed sample must equal the sentinel 25
+                    auto decompressed = image.get_decompressed();
+                    for (const auto& channel : decompressed)
+                    {
+                        test_util::check_vector_verbose(channel, static_cast<T>(25));
+                    }
+                }
+
+            );
+        }
+    );
 }
 
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE(
-	"Read compressed file with postprocess, invalid channel name"
-	* doctest::no_breaks(true)
-	* doctest::no_output(true)
-	* doctest::should_fail(true)
+    "Read compressed file with postprocess, invalid channel name"
+    * doctest::no_breaks(true) // do not break into the debugger on the expected failure
+    * doctest::no_output(true) // suppress assertion output for the expected failure
+    * doctest::should_fail(true) // the case passes only when its body fails
 )
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::string name = "multilayer_2560x1440.exr";
-			auto path = std::filesystem::current_path() / "images" / name;
-			auto input_ptr = OIIO::ImageInput::open(path.string());
-
-			// this should fail as this file does not have a z channel
-			auto image = compressed::image<T>::read(
-				std::move(input_ptr),
-				[]([[maybe_unused]] size_t channel_idx, std::span<T> values)
-				{
-					for (auto& value : values)
-					{
-						value = static_cast<T>(25);
-					}
-				},
-				{ "R", "G", "Z" },
-				0,
-				compressed::enums::codec::lz4,
-				9,
-				compressed::s_default_blocksize,
-				compressed::s_default_chunksize / 2
-			);
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(
+        [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+        {
+            std::string name = "multilayer_2560x1440.exr";
+            auto path = std::filesystem::current_path() / "images" / name; // <cwd>/images/<name>
+            auto input_ptr = OIIO::ImageInput::open(path.string());
+
+            // this should fail as this file does not have a "Z" channel (the case is marked should_fail)
+            auto image = compressed::image<T>::read(
+                std::move(input_ptr),
+                []([[maybe_unused]] size_t channel_idx, std::span<T> values) // postprocess callback
+                {
+                    for (auto& value : values)
+                    {
+                        value = static_cast<T>(25);
+                    }
+                },
+                {"R", "G", "Z"},
+                0,
+                compressed::enums::codec::lz4, // NOTE(review): fixed codec; other read tests use parametrize_codecs
+                9,
+                compressed::s_default_blocksize,
+                compressed::s_default_chunksize / 2 // half the default chunk size
+            );
+        }
+    );
 }
 
 
-
-
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("Initialize image and iterate parametrized")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			auto channel_r_data = std::vector<T>(128, static_cast<T>(255));
-
-			auto image = compressed::image<T>(
-				std::vector<std::vector<T>>{ channel_r_data},
-				16,
-				8
-			);
-
-			SUBCASE("Read")
-			{
-				auto& r_ref = image.channel(0);
-				for (auto chunk : r_ref)
-				{
-					for (auto& pixel : chunk)
-					{
-						CHECK(pixel == static_cast<T>(255));
-					}
-				}
-			}
-
-			SUBCASE("Modify")
-			{
-				auto& r_ref = image.channel(0);
-				for (auto chunk : r_ref)
-				{
-					for (auto& pixel : chunk)
-					{
-						pixel = static_cast<T>(128);
-					}
-				}
-
-				auto& r_ref_2 = image.channel(0);
-				for (auto chunk_ : r_ref_2)
-				{
-					for (auto& pixel : chunk_)
-					{
-						CHECK(pixel == static_cast<T>(128));
-					}
-				}
-			}
-		}
-	);
-
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(
+        [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+        {
+            auto channel_r_data = std::vector<T>(128, static_cast<T>(255)); // 128 samples, all set to 255
+
+            auto image = compressed::image<T>(
+                std::vector<std::vector<T>>{channel_r_data}, // a single channel
+                16, // presumably width and height: 16 * 8 equals the 128 samples above -- confirm
+                8
+            );
+
+            SUBCASE("Read") // every pixel of channel 0 must still hold the initial 255
+            {
+                auto& r_ref = image.channel(0);
+                for (auto chunk : r_ref)
+                {
+                    for (auto& pixel : chunk)
+                    {
+                        CHECK(pixel == static_cast<T>(255));
+                    }
+                }
+            }
+
+            SUBCASE("Modify") // overwrite every pixel, then iterate again to confirm the new value
+            {
+                auto& r_ref = image.channel(0);
+                for (auto chunk : r_ref)
+                {
+                    for (auto& pixel : chunk)
+                    {
+                        pixel = static_cast<T>(128); // write through the pixel reference
+                    }
+                }
+
+                auto& r_ref_2 = image.channel(0); // re-fetch the channel to check that the writes persisted
+                for (auto chunk_ : r_ref_2)
+                {
+                    for (auto& pixel : chunk_)
+                    {
+                        CHECK(pixel == static_cast<T>(128));
+                    }
+                }
+            }
+        }
+    );
 }
 
 
@@ -603,34 +710,35 @@ TEST_CASE("Initialize image and iterate parametrized")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Zip image channels parametrized")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			auto channel_r_data = std::vector<T>(128, static_cast<T>(255));
-			auto channel_g_data = std::vector<T>(128, static_cast<T>(0));
-			auto channel_b_data = std::vector<T>(128, static_cast<T>(199));
-
-			auto image = compressed::image<T>(
-				std::vector<std::vector<T>>{ channel_r_data, channel_g_data, channel_b_data },
-				16,
-				8
-			);
-
-			auto [r, g, b] = image.channels(0, 1, 2);
-			CHECK(r == image.channel(0));
-			CHECK(g == image.channel(1));
-			CHECK(b == image.channel(2));
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					CHECK(r_pixel == static_cast<T>(255));
-					CHECK(g_pixel == static_cast<T>(0));
-					CHECK(b_pixel == static_cast<T>(199));
-				}
-			}
-		}
-	);
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(
+        [&]<typename T>([[maybe_unused]] T type) // only T matters; the argument's value is unused
+        {
+            auto channel_r_data = std::vector<T>(128, static_cast<T>(255)); // distinct fill value per channel
+            auto channel_g_data = std::vector<T>(128, static_cast<T>(0));
+            auto channel_b_data = std::vector<T>(128, static_cast<T>(199));
+
+            auto image = compressed::image<T>(
+                std::vector<std::vector<T>>{channel_r_data, channel_g_data, channel_b_data},
+                16, // presumably width and height: 16 * 8 equals the 128 samples per channel -- confirm
+                8
+            );
+
+            auto [r, g, b] = image.channels(0, 1, 2); // fetch three channels in one call
+            CHECK(r == image.channel(0)); // must agree with single-channel access
+            CHECK(g == image.channel(1));
+            CHECK(b == image.channel(2));
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b)) // one chunk per channel
+            {
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {
+                    CHECK(r_pixel == static_cast<T>(255)); // each pixel must carry its own channel's fill value
+                    CHECK(g_pixel == static_cast<T>(0));
+                    CHECK(b_pixel == static_cast<T>(199));
+                }
+            }
+        }
+    );
 }
 
 
@@ -638,39 +746,40 @@ TEST_CASE("Zip image channels parametrized")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Zip image channels equal to chunk size parametrized")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			auto channel_r_data = std::vector<T>(1024, static_cast<T>(255));
-			auto channel_g_data = std::vector<T>(1024, static_cast<T>(0));
-			auto channel_b_data = std::vector<T>(1024, static_cast<T>(199));
-
-			auto image = compressed::image<T>(
-				std::vector<std::vector<T>>{ channel_r_data, channel_g_data, channel_b_data },
-				64,
-				16,
-				{},
-				compressed::enums::codec::lz4,
-				9,
-				256, 
-				1024
-			);
-
-			auto [r, g, b] = image.channels(0, 1, 2);
-			CHECK(r == image.channel(0));
-			CHECK(g == image.channel(1));
-			CHECK(b == image.channel(2));
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					CHECK(r_pixel == static_cast<T>(255));
-					CHECK(g_pixel == static_cast<T>(0));
-					CHECK(b_pixel == static_cast<T>(199));
-				}
-			}
-		}
-	);
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            auto channel_r_data = std::vector<T>(1024, static_cast<T>(255));  // 1024 elements, all 255
+            auto channel_g_data = std::vector<T>(1024, static_cast<T>(0));
+            auto channel_b_data = std::vector<T>(1024, static_cast<T>(199));
+
+            auto image = compressed::image<T>(
+                std::vector<std::vector<T>>{channel_r_data, channel_g_data, channel_b_data},
+                64,  // presumably width and height: 64 * 16 == 1024 elements per channel (TODO confirm)
+                16,
+                {},  // empty braces: a value-initialised argument; its meaning is not visible here
+                compressed::enums::codec::lz4,
+                9,
+                256,
+                1024  // presumably the chunk size named in the test title (TODO confirm)
+            );
+
+            auto [r, g, b] = image.channels(0, 1, 2);  // fetch all three channels in one call
+            CHECK(r == image.channel(0));  // must agree with the single-channel accessor
+            CHECK(g == image.channel(1));
+            CHECK(b == image.channel(2));
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
+            {  // outer zip: one chunk from each channel per step
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {  // inner zip: one element from each chunk per step
+                    CHECK(r_pixel == static_cast<T>(255));  // each element must equal its channel's fill value
+                    CHECK(g_pixel == static_cast<T>(0));
+                    CHECK(b_pixel == static_cast<T>(199));
+                }
+            }
+        }
+    );
 }
 
 
@@ -678,39 +787,40 @@ TEST_CASE("Zip image channels equal to chunk size parametrized")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Zip image channels larger to chunk size parametrized")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			auto channel_r_data = std::vector<T>(1024, static_cast<T>(255));
-			auto channel_g_data = std::vector<T>(1024, static_cast<T>(0));
-			auto channel_b_data = std::vector<T>(1024, static_cast<T>(199));
-
-			auto image = compressed::image<T>(
-				std::vector<std::vector<T>>{ channel_r_data, channel_g_data, channel_b_data },
-				64,
-				16,
-				{},
-				compressed::enums::codec::lz4,
-				9,
-				256, 
-				768
-			);
-
-			auto [r, g, b] = image.channels(0, 1, 2);
-			CHECK(r == image.channel(0));
-			CHECK(g == image.channel(1));
-			CHECK(b == image.channel(2));
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					CHECK(r_pixel == static_cast<T>(255));
-					CHECK(g_pixel == static_cast<T>(0));
-					CHECK(b_pixel == static_cast<T>(199));
-				}
-			}
-		}
-	);
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            auto channel_r_data = std::vector<T>(1024, static_cast<T>(255));  // 1024 elements, all 255
+            auto channel_g_data = std::vector<T>(1024, static_cast<T>(0));
+            auto channel_b_data = std::vector<T>(1024, static_cast<T>(199));
+
+            auto image = compressed::image<T>(
+                std::vector<std::vector<T>>{channel_r_data, channel_g_data, channel_b_data},
+                64,  // presumably width and height: 64 * 16 == 1024 elements per channel (TODO confirm)
+                16,
+                {},  // empty braces: a value-initialised argument; its meaning is not visible here
+                compressed::enums::codec::lz4,
+                9,
+                256,
+                768  // presumably the chunk size; below the 1024 elements per channel (TODO confirm)
+            );
+
+            auto [r, g, b] = image.channels(0, 1, 2);  // fetch all three channels in one call
+            CHECK(r == image.channel(0));  // must agree with the single-channel accessor
+            CHECK(g == image.channel(1));
+            CHECK(b == image.channel(2));
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
+            {  // outer zip: one chunk from each channel per step
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {  // inner zip: one element from each chunk per step
+                    CHECK(r_pixel == static_cast<T>(255));  // each element must equal its channel's fill value
+                    CHECK(g_pixel == static_cast<T>(0));
+                    CHECK(b_pixel == static_cast<T>(199));
+                }
+            }
+        }
+    );
 }
 
 
@@ -718,44 +828,45 @@ TEST_CASE("Zip image channels larger to chunk size parametrized")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Zip modify image channels parametrized")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			auto channel_r_data = std::vector<T>(128, static_cast<T>(255));
-			auto channel_g_data = std::vector<T>(128, static_cast<T>(0));
-			auto channel_b_data = std::vector<T>(128, static_cast<T>(199));
-
-			auto image = compressed::image<T>(
-				std::vector<std::vector<T>>{ channel_r_data, channel_g_data, channel_b_data },
-				16,
-				8
-			);
-
-			auto [r, g, b] = image.channels(0, 1, 2);
-			CHECK(r == image.channel(0));
-			CHECK(g == image.channel(1));
-			CHECK(b == image.channel(2));
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					r_pixel = static_cast<T>(12);
-					g_pixel = static_cast<T>(13);
-					b_pixel = static_cast<T>(14);
-				}
-			}
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					CHECK(r_pixel == static_cast<T>(12));
-					CHECK(g_pixel == static_cast<T>(13));
-					CHECK(b_pixel == static_cast<T>(14));
-				}
-			}
-		}
-	);
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            auto channel_r_data = std::vector<T>(128, static_cast<T>(255));  // 128 elements, all 255
+            auto channel_g_data = std::vector<T>(128, static_cast<T>(0));
+            auto channel_b_data = std::vector<T>(128, static_cast<T>(199));
+
+            auto image = compressed::image<T>(
+                std::vector<std::vector<T>>{channel_r_data, channel_g_data, channel_b_data},
+                16,  // presumably width and height: 16 * 8 == 128 elements per channel (TODO confirm)
+                8
+            );
+
+            auto [r, g, b] = image.channels(0, 1, 2);  // fetch all three channels in one call
+            CHECK(r == image.channel(0));  // must agree with the single-channel accessor
+            CHECK(g == image.channel(1));
+            CHECK(b == image.channel(2));
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
+            {  // first pass: overwrite every element of all three channels
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {
+                    r_pixel = static_cast<T>(12);  // expected to write through to the channel data (checked below)
+                    g_pixel = static_cast<T>(13);
+                    b_pixel = static_cast<T>(14);
+                }
+            }
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
+            {  // second pass: read the elements back
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {
+                    CHECK(r_pixel == static_cast<T>(12));  // the values written in the first pass must be visible
+                    CHECK(g_pixel == static_cast<T>(13));
+                    CHECK(b_pixel == static_cast<T>(14));
+                }
+            }
+        }
+    );
 }
 
 
@@ -763,49 +874,50 @@ TEST_CASE("Zip modify image channels parametrized")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Zip modify image channels equal to chunk size parametrized")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			auto channel_r_data = std::vector<T>(1024, static_cast<T>(255));
-			auto channel_g_data = std::vector<T>(1024, static_cast<T>(0));
-			auto channel_b_data = std::vector<T>(1024, static_cast<T>(199));
-
-			auto image = compressed::image<T>(
-				std::vector<std::vector<T>>{ channel_r_data, channel_g_data, channel_b_data },
-				64,
-				16,
-				{},
-				compressed::enums::codec::lz4,
-				9,
-				256,
-				1024
-			);
-
-			auto [r, g, b] = image.channels(0, 1, 2);
-			CHECK(r == image.channel(0));
-			CHECK(g == image.channel(1));
-			CHECK(b == image.channel(2));
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					r_pixel = static_cast<T>(12);
-					g_pixel = static_cast<T>(13);
-					b_pixel = static_cast<T>(14);
-				}
-			}
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					CHECK(r_pixel == static_cast<T>(12));
-					CHECK(g_pixel == static_cast<T>(13));
-					CHECK(b_pixel == static_cast<T>(14));
-				}
-			}
-		}
-	);
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            auto channel_r_data = std::vector<T>(1024, static_cast<T>(255));  // 1024 elements, all 255
+            auto channel_g_data = std::vector<T>(1024, static_cast<T>(0));
+            auto channel_b_data = std::vector<T>(1024, static_cast<T>(199));
+
+            auto image = compressed::image<T>(
+                std::vector<std::vector<T>>{channel_r_data, channel_g_data, channel_b_data},
+                64,  // presumably width and height: 64 * 16 == 1024 elements per channel (TODO confirm)
+                16,
+                {},  // empty braces: a value-initialised argument; its meaning is not visible here
+                compressed::enums::codec::lz4,
+                9,
+                256,
+                1024  // presumably the chunk size named in the test title (TODO confirm)
+            );
+
+            auto [r, g, b] = image.channels(0, 1, 2);  // fetch all three channels in one call
+            CHECK(r == image.channel(0));  // must agree with the single-channel accessor
+            CHECK(g == image.channel(1));
+            CHECK(b == image.channel(2));
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
+            {  // first pass: overwrite every element of all three channels
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {
+                    r_pixel = static_cast<T>(12);  // expected to write through to the channel data (checked below)
+                    g_pixel = static_cast<T>(13);
+                    b_pixel = static_cast<T>(14);
+                }
+            }
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
+            {  // second pass: read the elements back
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {
+                    CHECK(r_pixel == static_cast<T>(12));  // the values written in the first pass must be visible
+                    CHECK(g_pixel == static_cast<T>(13));
+                    CHECK(b_pixel == static_cast<T>(14));
+                }
+            }
+        }
+    );
 }
 
 
@@ -813,47 +925,48 @@ TEST_CASE("Zip modify image channels equal to chunk size parametrized")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Zip modify image channels larger to chunk size parametrized")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			auto channel_r_data = std::vector<T>(1024, static_cast<T>(255));
-			auto channel_g_data = std::vector<T>(1024, static_cast<T>(0));
-			auto channel_b_data = std::vector<T>(1024, static_cast<T>(199));
-
-			auto image = compressed::image<T>(
-				std::vector<std::vector<T>>{ channel_r_data, channel_g_data, channel_b_data },
-				64,
-				16,
-				{},
-				compressed::enums::codec::lz4,
-				9,
-				256,
-				768
-			);
-
-			auto [r, g, b] = image.channels(0, 1, 2);
-			CHECK(r == image.channel(0));
-			CHECK(g == image.channel(1));
-			CHECK(b == image.channel(2));
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					r_pixel = static_cast<T>(12);
-					g_pixel = static_cast<T>(13);
-					b_pixel = static_cast<T>(14);
-				}
-			}
-
-			for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
-			{
-				for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
-				{
-					CHECK(r_pixel == static_cast<T>(12));
-					CHECK(g_pixel == static_cast<T>(13));
-					CHECK(b_pixel == static_cast<T>(14));
-				}
-			}
-		}
-	);
-}
\ No newline at end of file
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            auto channel_r_data = std::vector<T>(1024, static_cast<T>(255));  // 1024 elements, all 255
+            auto channel_g_data = std::vector<T>(1024, static_cast<T>(0));
+            auto channel_b_data = std::vector<T>(1024, static_cast<T>(199));
+
+            auto image = compressed::image<T>(
+                std::vector<std::vector<T>>{channel_r_data, channel_g_data, channel_b_data},
+                64,  // presumably width and height: 64 * 16 == 1024 elements per channel (TODO confirm)
+                16,
+                {},  // empty braces: a value-initialised argument; its meaning is not visible here
+                compressed::enums::codec::lz4,
+                9,
+                256,
+                768  // presumably the chunk size; below the 1024 elements per channel (TODO confirm)
+            );
+
+            auto [r, g, b] = image.channels(0, 1, 2);  // fetch all three channels in one call
+            CHECK(r == image.channel(0));  // must agree with the single-channel accessor
+            CHECK(g == image.channel(1));
+            CHECK(b == image.channel(2));
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
+            {  // first pass: overwrite every element of all three channels
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {
+                    r_pixel = static_cast<T>(12);  // expected to write through to the channel data (checked below)
+                    g_pixel = static_cast<T>(13);
+                    b_pixel = static_cast<T>(14);
+                }
+            }
+
+            for (auto [r_chunk, g_chunk, b_chunk] : compressed::ranges::zip(r, g, b))
+            {  // second pass: read the elements back
+                for (auto [r_pixel, g_pixel, b_pixel] : compressed::ranges::zip(r_chunk, g_chunk, b_chunk))
+                {
+                    CHECK(r_pixel == static_cast<T>(12));  // the values written in the first pass must be visible
+                    CHECK(g_pixel == static_cast<T>(13));
+                    CHECK(b_pixel == static_cast<T>(14));
+                }
+            }
+        }
+    );
+}
diff --git a/test/src/test_iterator.cpp b/test/src/test_iterator.cpp
deleted file mode 100644
index 854a48d..0000000
--- a/test/src/test_iterator.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-#include "doctest.h"
-
-#include <ranges>
-#include <span>
-#include <vector>
-#include <algorithm>
-#include <string>
-
-#include <compressed/image.h>
-#include <compressed/iterators/iterator.h>
-
-#include "util.h"
-
-
-TEST_CASE("Iterator: serial access")
-{
-	std::string name = "uv_grid_2048x2048.jpg";
-	auto path = std::filesystem::current_path() / "images" / name;
-	auto image = compressed::image<uint8_t>::read(path);
-
-	auto& r = image.channel(0);
-	size_t count = 0;
-	for (const auto& chunk : r)
-	{
-		CHECK(chunk.chunk_index() == count);
-		++count;
-	}
-}
-
-
-TEST_CASE("Iterator: iterate out of bounds"
-	* doctest::no_breaks(true)
-	* doctest::no_output(true)
-	* doctest::should_fail(true))
-{
-	std::string name = "uv_grid_2048x2048.jpg";
-	auto path = std::filesystem::current_path() / "images" / name;
-	auto image = compressed::image<uint8_t>::read(path);
-
-	auto& r = image.channel(0);
-	auto it = r.begin();
-	++it;
-	++it;
-}
-
-
-
-TEST_CASE("Iterator: comparison")
-{
-	std::string name = "uv_grid_2048x2048.jpg";
-	auto path = std::filesystem::current_path() / "images" / name;
-	auto image = compressed::image<uint8_t>::read(
-		path,
-		0,
-		compressed::enums::codec::lz4,
-		9,
-		4096, 
-		16384
-	);
-
-	auto& r = image.channel(0);
-	auto it = r.begin();
-	auto it_2 = r.begin();
-
-	CHECK(it == it_2);
-	++it;
-	CHECK(it != it_2);
-
-	// Different image, iterator should not match
-	auto image_2 = compressed::image<uint8_t>::read(
-		path,
-		0, 
-		compressed::enums::codec::lz4,
-		9,
-		4096,
-		16384
-	);
-	auto& r_2 = image_2.channel(0);
-	auto it_other = r_2.begin();
-
-	CHECK(it_other != it);
-	CHECK(it_other != it_2);
-}
diff --git a/test/src/test_schunk.cpp b/test/src/test_schunk.cpp
index 6384d5d..3929226 100644
--- a/test/src/test_schunk.cpp
+++ b/test/src/test_schunk.cpp
@@ -5,32 +5,38 @@
 #include <vector>
 #include <algorithm>
 #include <thread>
-#include <string>
 #include <numeric>
 
+#define _COMPRESSED_PROFILE 1
 #include <compressed/blosc2/schunk.h>
 #include <compressed/blosc2/wrapper.h>
 
 #include "util.h"
+#include "compressed/channel.h"
 
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("Schunk: initialize with chunk size")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			compressed::blosc2::schunk<T> super_chunk(128, 4096);
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            compressed::detail::schunk<T> super_chunk(128, 4096);  // built from two sizes only; no data is passed
 
-			auto ctx = compressed::blosc2::create_decompression_context(std::thread::hardware_concurrency());
+            auto ctx = compressed::blosc2::create_decompression_context(std::thread::hardware_concurrency());
 
-			// this schunk is empty so we expect no items
-			auto decompressed = super_chunk.to_uncompressed(ctx);
-			CHECK(decompressed.size() == 0);
+            auto compression_ctx = compressed::cpu_compression_context{
+                .compression_ctx = nullptr,  // no compressor supplied; this test only calls to_uncompressed
+                .decompression_ctx = std::move(ctx),  // ctx is moved from here and must not be reused
+                .nthreads = std::thread::hardware_concurrency()
+            };
 
-			// similarly converting to schunk should work, but be empty
-			auto raw_schunk = super_chunk.to_schunk();
-		});
+            // The schunk was built without any data, so decompressing it must yield zero elements.
+            auto decompressed = super_chunk.to_uncompressed(compression_ctx);
+            CHECK(decompressed.size() == 0);
+        }
+    );
 }
 
 
@@ -38,38 +44,42 @@ TEST_CASE("Schunk: initialize with chunk size")
 // -----------------------------------------------------------------------------------
 TEST_CASE("Schunk: initialize with data")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::vector<T> data(4096);
-			std::iota(data.begin(), data.end(), T{ 0 });
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            std::vector<T> data(4096);
+            std::iota(data.begin(), data.end(), T{0});  // ascending from 0 (wraps for narrow T such as uint8_t)
 
-			auto ctx = compressed::blosc2::create_compression_context<T>(
-				std::thread::hardware_concurrency(), 
-				compressed::enums::codec::lz4, 
-				9,
-				128
-			);
-			compressed::blosc2::schunk<T> super_chunk(std::span<const T>(data), 64, 256, ctx);
+            auto ctx = compressed::channel<T>::create_compression_context(
+                compressed::enums::codec::lz4,
+                std::thread::hardware_concurrency(),  // presumably the thread count (TODO confirm)
+                9,  // presumably the compression level (TODO confirm)
+                128,
+                0
+            );  // ctx is moved into super_chunk on the next line and must not be reused afterwards
+            compressed::detail::schunk<T> super_chunk(std::span<const T>(data), 64, 256, std::move(ctx));
 
-			auto decomp_ctx = compressed::blosc2::create_decompression_context(std::thread::hardware_concurrency());
-			SUBCASE("Check decompressed")
-			{
-				// We expect the same number of elements
-				auto decompressed = super_chunk.to_uncompressed(decomp_ctx);
-				CHECK(decompressed.size() == 4096);
-				CHECK(decompressed == data);
-			}
-			SUBCASE("Check blosc2 schunk result")
-			{
-				// we also expect the right result converting to schunk
-				auto raw_schunk = super_chunk.to_schunk();
-				CHECK(raw_schunk->nchunks == 4096 * sizeof(T) / 256);
-				CHECK(raw_schunk->nbytes / sizeof(T) == 4096);
-			}
-			SUBCASE("Get chunk")
-			{
-				auto chunk = super_chunk.chunk(decomp_ctx, 0);
-				CHECK(chunk.size() == 256 / sizeof(T));
-			}
-		});
-}
\ No newline at end of file
+            auto decomp_ctx = compressed::channel<T>::create_compression_context(
+                compressed::enums::codec::lz4,
+                std::thread::hardware_concurrency(),
+                9,
+                128,
+                0
+            );  // a second context for reading; the first one was moved into super_chunk
+            SUBCASE("Check decompressed")
+            {
+                // Decompressing must return all 4096 elements, equal to the input data.
+                auto decompressed = super_chunk.to_uncompressed(
+                    std::get<compressed::cpu_compression_context>(decomp_ctx)
+                );  // std::get assumes decomp_ctx holds the CPU context alternative (TODO confirm)
+                CHECK(decompressed.size() == 4096);
+                CHECK(decompressed == data);
+            }
+            SUBCASE("Get chunk")
+            {
+                auto chunk = super_chunk.chunk(std::get<compressed::cpu_compression_context>(decomp_ctx), size_t{0});
+                CHECK(chunk.size() == 256 / sizeof(T));  // chunk 0 is expected to hold 256 bytes' worth of T
+            }
+        }
+    );
+}
diff --git a/test/src/test_zip.cpp b/test/src/test_zip.cpp
index 1083711..fc25d1f 100644
--- a/test/src/test_zip.cpp
+++ b/test/src/test_zip.cpp
@@ -1,11 +1,8 @@
 #include "doctest.h"
 
-#include <ranges>
-#include <span>
 #include <vector>
 #include <algorithm>
-#include <string>
-
+#define _COMPRESSED_PROFILE 1
 #include <compressed/ranges.h>
 
 #include "util.h"
@@ -15,63 +12,78 @@
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip sequenced loops")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::vector<T> data_a(25, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(25, static_cast<T>(75));
-
-			auto gen = compressed::ranges::zip(data_a, data_b, data_c);
-			std::for_each(std::execution::seq, gen.begin(), gen.end(), [](auto vals)
-				{
-					auto& [a, b, c] = vals;
-					CHECK(a == static_cast<T>(25));
-					CHECK(b == static_cast<T>(50));
-					CHECK(c == static_cast<T>(75));
-				});
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            std::vector<T> data_a(25, static_cast<T>(25));  // three equally sized inputs with distinct fill values
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(25, static_cast<T>(75));
+
+            auto gen = compressed::ranges::zip(data_a, data_b, data_c);
+            std::for_each(
+                std::execution::seq,  // sequential execution policy
+                gen.begin(),
+                gen.end(),
+                [](auto vals)  // each zipped element is received by value
+                {
+                    auto& [a, b, c] = vals;  // one binding per zipped input, in argument order
+                    CHECK(a == static_cast<T>(25));
+                    CHECK(b == static_cast<T>(50));
+                    CHECK(c == static_cast<T>(75));
+                }
+            );
+        }
+    );
 }
 
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip parallel loops")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::vector<T> data_a(25, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(25, static_cast<T>(75));
-
-			auto gen = compressed::ranges::zip(data_a, data_b, data_c);
-			std::for_each(std::execution::par_unseq, gen.begin(), gen.end(), [](auto vals)
-				{
-					auto& [a, b, c] = vals;
-					CHECK(a == static_cast<T>(25));
-					CHECK(b == static_cast<T>(50));
-					CHECK(c == static_cast<T>(75));
-				});
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            std::vector<T> data_a(25, static_cast<T>(25));  // three equally sized inputs with distinct fill values
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(25, static_cast<T>(75));
+
+            auto gen = compressed::ranges::zip(data_a, data_b, data_c);
+            std::for_each(
+                std::execution::par_unseq,  // parallel + vectorised policy: the callback may run concurrently
+                gen.begin(),
+                gen.end(),
+                [](auto vals)  // each zipped element is received by value
+                {
+                    auto& [a, b, c] = vals;  // one binding per zipped input, in argument order
+                    CHECK(a == static_cast<T>(25));
+                    CHECK(b == static_cast<T>(50));
+                    CHECK(c == static_cast<T>(75));
+                }
+            );
+        }
+    );
 }
 
 
-
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip regular for loop")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			std::vector<T> data_a(25, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(25, static_cast<T>(75));
-
-			for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c))
-			{
-				CHECK(a == static_cast<T>(25));
-				CHECK(b == static_cast<T>(50));
-				CHECK(c == static_cast<T>(75));
-			}
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            std::vector<T> data_a(25, static_cast<T>(25));  // three equally sized inputs with distinct fill values
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(25, static_cast<T>(75));
+
+            for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c))
+            {  // the zip is consumed directly by a range-based for loop
+                CHECK(a == static_cast<T>(25));
+                CHECK(b == static_cast<T>(50));
+                CHECK(c == static_cast<T>(75));
+            }
+        }
+    );
 }
 
 
@@ -79,23 +91,30 @@ TEST_CASE("compressed::ranges::zip regular for loop")
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip serial mismatched sizes")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			// We expect to only iterate up until index 25 here
-			std::vector<T> data_a(30, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(45, static_cast<T>(75));
-
-			auto gen = compressed::ranges::zip(data_a, data_b, data_c);
-			CHECK(gen.size() == 25);
-			std::for_each(std::execution::seq, gen.begin(), gen.end(), [](auto vals)
-				{
-					auto& [a, b, c] = vals;
-					CHECK(a == static_cast<T>(25));
-					CHECK(b == static_cast<T>(50));
-					CHECK(c == static_cast<T>(75));
-				});
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            // The inputs differ in length; iteration is expected to stop after 25 elements, the shortest input.
+            std::vector<T> data_a(30, static_cast<T>(25));
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(45, static_cast<T>(75));
+
+            auto gen = compressed::ranges::zip(data_a, data_b, data_c);
+            CHECK(gen.size() == 25);  // 25 == data_b.size(), the smallest of the three
+            std::for_each(
+                std::execution::seq,  // sequential execution policy
+                gen.begin(),
+                gen.end(),
+                [](auto vals)  // each zipped element is received by value
+                {
+                    auto& [a, b, c] = vals;  // one binding per zipped input, in argument order
+                    CHECK(a == static_cast<T>(25));
+                    CHECK(b == static_cast<T>(50));
+                    CHECK(c == static_cast<T>(75));
+                }
+            );
+        }
+    );
 }
 
 
@@ -103,23 +122,30 @@ TEST_CASE("compressed::ranges::zip serial mismatched sizes")
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip parallel mismatched sizes")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			// We expect to only iterate up until index 25 here
-			std::vector<T> data_a(30, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(45, static_cast<T>(75));
-
-			auto gen = compressed::ranges::zip(data_a, data_b, data_c);
-			CHECK(gen.size() == 25);
-			std::for_each(std::execution::par_unseq, gen.begin(), gen.end(), [](auto vals)
-				{
-					auto& [a, b, c] = vals;
-					CHECK(a == static_cast<T>(25));
-					CHECK(b == static_cast<T>(50));
-					CHECK(c == static_cast<T>(75));
-				});
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            // The inputs differ in length; iteration is expected to stop after 25 elements, the shortest input.
+            std::vector<T> data_a(30, static_cast<T>(25));
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(45, static_cast<T>(75));
+
+            auto gen = compressed::ranges::zip(data_a, data_b, data_c);
+            CHECK(gen.size() == 25);  // 25 == data_b.size(), the smallest of the three
+            std::for_each(
+                std::execution::par_unseq,  // parallel + vectorised policy: the callback may run concurrently
+                gen.begin(),
+                gen.end(),
+                [](auto vals)  // each zipped element is received by value
+                {
+                    auto& [a, b, c] = vals;  // one binding per zipped input, in argument order
+                    CHECK(a == static_cast<T>(25));
+                    CHECK(b == static_cast<T>(50));
+                    CHECK(c == static_cast<T>(75));
+                }
+            );
+        }
+    );
 }
 
 
@@ -127,23 +153,25 @@ TEST_CASE("compressed::ranges::zip parallel mismatched sizes")
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip regular for loop mismatched sizes")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			// We expect to only iterate up until index 25 here
-			std::vector<T> data_a(30, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(45, static_cast<T>(75));
-
-			size_t count = 0;
-			for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c))
-			{
-				CHECK(a == static_cast<T>(25));
-				CHECK(b == static_cast<T>(50));
-				CHECK(c == static_cast<T>(75));
-				++count;
-			}
-			CHECK(count == 25);
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(  // presumably runs the lambda once per listed type
+        [&]<typename T>([[maybe_unused]] T type)  // `type` is never read; only its type T matters
+        {
+            // The inputs differ in length; iteration is expected to stop after 25 elements, the shortest input.
+            std::vector<T> data_a(30, static_cast<T>(25));
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(45, static_cast<T>(75));
+
+            size_t count = 0;  // number of zipped elements actually visited
+            for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c))
+            {
+                CHECK(a == static_cast<T>(25));
+                CHECK(b == static_cast<T>(50));
+                CHECK(c == static_cast<T>(75));
+                ++count;
+            }
+            CHECK(count == 25);  // 25 == data_b.size(), the smallest of the three
+        }
+    );
 }
 
 
@@ -151,36 +179,48 @@ TEST_CASE("compressed::ranges::zip regular for loop mismatched sizes")
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip parallel mismatched sizes modify")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			// We expect to only iterate up until index 25 here
-			std::vector<T> data_a(30, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(45, static_cast<T>(75));
-
-			auto gen = compressed::ranges::zip(data_a, data_b, data_c);
-			CHECK(gen.size() == 25);
-			std::for_each(std::execution::par_unseq, gen.begin(), gen.end(), [](auto vals)
-				{
-					auto& [a, b, c] = vals;
-					CHECK(a == static_cast<T>(25));
-					CHECK(b == static_cast<T>(50));
-					CHECK(c == static_cast<T>(75));
-
-					a = 75;
-					b = 49;
-					c = 25;
-				});
-
-			auto gen_2 = compressed::ranges::zip(data_a, data_b, data_c);
-			std::for_each(std::execution::par_unseq, gen_2.begin(), gen_2.end(), [](auto vals)
-				{
-					auto& [a, b, c] = vals;
-					CHECK(a == static_cast<T>(75));
-					CHECK(b == static_cast<T>(49));
-					CHECK(c == static_cast<T>(25));
-				});
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(
+        [&]<typename T>([[maybe_unused]] T type)
+        {
+            // data_b is the shortest input (25 elements), so the zip is expected to cover only 25 elements
+            std::vector<T> data_a(30, static_cast<T>(25));
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(45, static_cast<T>(75));
+
+            auto gen = compressed::ranges::zip(data_a, data_b, data_c);
+            CHECK(gen.size() == 25);
+            std::for_each(
+                std::execution::par_unseq,
+                gen.begin(),
+                gen.end(),
+                [](auto vals)
+                {
+                    auto& [a, b, c] = vals; // writes are expected to reach data_a/b/c (re-checked below)
+                    CHECK(a == static_cast<T>(25));
+                    CHECK(b == static_cast<T>(50));
+                    CHECK(c == static_cast<T>(75));
+
+                    a = 75;
+                    b = 49;
+                    c = 25;
+                }
+            );
+
+            auto gen_2 = compressed::ranges::zip(data_a, data_b, data_c); // re-zip to verify the writes
+            std::for_each(
+                std::execution::par_unseq,
+                gen_2.begin(),
+                gen_2.end(),
+                [](auto vals)
+                {
+                    auto& [a, b, c] = vals;
+                    CHECK(a == static_cast<T>(75));
+                    CHECK(b == static_cast<T>(49));
+                    CHECK(c == static_cast<T>(25));
+                }
+            );
+        }
+    );
 }
 
 
@@ -188,36 +228,48 @@ TEST_CASE("compressed::ranges::zip parallel mismatched sizes modify")
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip serial mismatched sizes modify")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			// We expect to only iterate up until index 25 here
-			std::vector<T> data_a(30, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(45, static_cast<T>(75));
-
-			auto gen = compressed::ranges::zip(data_a, data_b, data_c);
-			CHECK(gen.size() == 25);
-			std::for_each(std::execution::seq, gen.begin(), gen.end(), [](auto vals)
-				{
-					auto& [a, b, c] = vals;
-					CHECK(a == static_cast<T>(25));
-					CHECK(b == static_cast<T>(50));
-					CHECK(c == static_cast<T>(75));
-
-					a = 75;
-					b = 49;
-					c = 25;
-				});
-
-			auto gen_2 = compressed::ranges::zip(data_a, data_b, data_c);
-			std::for_each(std::execution::seq, gen_2.begin(), gen_2.end(), [](auto vals)
-				{
-					auto& [a, b, c] = vals;
-					CHECK(a == static_cast<T>(75));
-					CHECK(b == static_cast<T>(49));
-					CHECK(c == static_cast<T>(25));
-				});
-		});
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(
+        [&]<typename T>([[maybe_unused]] T type)
+        {
+            // data_b is the shortest input (25 elements), so the zip is expected to cover only 25 elements
+            std::vector<T> data_a(30, static_cast<T>(25));
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(45, static_cast<T>(75));
+
+            auto gen = compressed::ranges::zip(data_a, data_b, data_c);
+            CHECK(gen.size() == 25);
+            std::for_each(
+                std::execution::seq,
+                gen.begin(),
+                gen.end(),
+                [](auto vals)
+                {
+                    auto& [a, b, c] = vals; // writes are expected to reach data_a/b/c (re-checked below)
+                    CHECK(a == static_cast<T>(25));
+                    CHECK(b == static_cast<T>(50));
+                    CHECK(c == static_cast<T>(75));
+
+                    a = 75;
+                    b = 49;
+                    c = 25;
+                }
+            );
+
+            auto gen_2 = compressed::ranges::zip(data_a, data_b, data_c); // re-zip to verify the writes
+            std::for_each(
+                std::execution::seq,
+                gen_2.begin(),
+                gen_2.end(),
+                [](auto vals)
+                {
+                    auto& [a, b, c] = vals;
+                    CHECK(a == static_cast<T>(75));
+                    CHECK(b == static_cast<T>(49));
+                    CHECK(c == static_cast<T>(25));
+                }
+            );
+        }
+    );
 }
 
 
@@ -225,47 +277,49 @@ TEST_CASE("compressed::ranges::zip serial mismatched sizes modify")
 // -----------------------------------------------------------------------------------
 TEST_CASE("compressed::ranges::zip regular for loop mismatched sizes modify")
 {
-	test_util::parametrize<uint8_t, uint16_t, uint32_t, float>([&]<typename T>([[maybe_unused]] T type)
-		{
-			// We expect to only iterate up until index 25 here
-			std::vector<T> data_a(30, static_cast<T>(25));
-			std::vector<T> data_b(25, static_cast<T>(50));
-			std::vector<T> data_c(45, static_cast<T>(75));
-
-			size_t count = 0;
-			for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c))
-			{
-				CHECK(a == static_cast<T>(25));
-				CHECK(b == static_cast<T>(50));
-				CHECK(c == static_cast<T>(75));
-
-				a = static_cast<T>(75);
-				b = static_cast<T>(49);
-				c = static_cast<T>(25);
-				++count;
-			}
-			CHECK(count == 25);
-
-			for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c))
-			{
-				CHECK(a == static_cast<T>(75));
-				CHECK(b == static_cast<T>(49));
-				CHECK(c == static_cast<T>(25));
-			}
-
-			// The zip should have only touched the first 25 elements with the rest being the same
-			size_t count_2 = 0;
-			for (const auto& elem : data_a)
-			{
-				if (count_2 < 25)
-				{
-					CHECK(elem == 75);
-				}
-				else
-				{
-					CHECK(elem == 25);
-				}
-				++count_2;
-			}
-		});
-}
\ No newline at end of file
+    test_util::parametrize<uint8_t, uint16_t, uint32_t, float>(
+        [&]<typename T>([[maybe_unused]] T type)
+        {
+            // data_b is the shortest input (25 elements), so the zip is expected to stop after 25 iterations
+            std::vector<T> data_a(30, static_cast<T>(25));
+            std::vector<T> data_b(25, static_cast<T>(50));
+            std::vector<T> data_c(45, static_cast<T>(75));
+
+            size_t count = 0;
+            for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c))
+            {
+                CHECK(a == static_cast<T>(25));
+                CHECK(b == static_cast<T>(50));
+                CHECK(c == static_cast<T>(75));
+
+                a = static_cast<T>(75);
+                b = static_cast<T>(49);
+                c = static_cast<T>(25);
+                ++count;
+            }
+            CHECK(count == 25);
+
+            for (auto [a, b, c] : compressed::ranges::zip(data_a, data_b, data_c))
+            {
+                CHECK(a == static_cast<T>(75));
+                CHECK(b == static_cast<T>(49));
+                CHECK(c == static_cast<T>(25));
+            }
+
+            // Only the first 25 elements of data_a may have been overwritten; its last 5 must keep their value
+            size_t count_2 = 0;
+            for (const auto& elem : data_a)
+            {
+                if (count_2 < 25)
+                {
+                    CHECK(elem == 75);
+                }
+                else
+                {
+                    CHECK(elem == 25);
+                }
+                ++count_2;
+            }
+        }
+    );
+}
diff --git a/test/src/util.h b/test/src/util.h
index 2b34ec7..4331b0e 100644
--- a/test/src/util.h
+++ b/test/src/util.h
@@ -1,158 +1,234 @@
 #pragma once
 
-#include <execution>
 #include <ranges>
 #include <vector>
 #include <span>
 #include <filesystem>
 
 #include <OpenImageIO/imageio.h>
-#include <OpenImageIO/half.h>
 
 #include <compressed/image_algo.h>
 #include <compressed/enums.h>
+#include <compressed/cuda/gpu.h>
 
+// doctest StringMaker specialization so that Imath::half operands can be printed in assertion output
+namespace doctest
+{
+    template <>
+    struct StringMaker<Imath::half>
+    {
+        static String convert(const Imath::half& value)
+        {
+            // Widen to float first; the result is handed to doctest's own toString for float
+            return toString(static_cast<float>(value));
+        }
+    };
+} // namespace doctest
 
 namespace test_util
 {
-
-	/// Read the image using OpenImageIO (OIIO) and deinterleave all the channels into discrete buffers.
-	/// 
-	/// This function opens an image file using OIIO, reads its pixel data into a single buffer, 
-	/// and then separates the interleaved channel data into individual channel buffers.
-	/// 
-	/// \tparam T The pixel data type (e.g., uint8_t, float).
-	/// \param filepath The file path to the image.
-	/// \return A vector of vectors, where each inner vector represents a deinterleaved channel.
-	/// \throws std::runtime_error if the image fails to open or read.
-	template <typename T>
-	std::vector<std::vector<T>> read_oiio(std::filesystem::path filepath, int subimage = 0)
-	{
-		auto input_ptr = OIIO::ImageInput::open(filepath.string());
-		if (!input_ptr)
-		{
-			throw std::runtime_error(std::format("Failed to open image {}", filepath.string()));
-		}
-		auto res = input_ptr->seek_subimage(subimage, 0);
-		if (!res)
-		{
-			throw std::runtime_error(std::format("Image {} does not contain subimage {}", filepath.string(), subimage));
-		}
-		const OIIO::ImageSpec& spec = input_ptr->spec();
-		std::vector<T> pixels(static_cast<size_t>(spec.width) * spec.height * spec.nchannels);
-		std::vector<std::vector<T>> channels;
-		for ([[maybe_unused]] auto _ : std::views::iota(0, spec.nchannels))
-		{
-			channels.push_back(std::vector<T>(static_cast<size_t>(spec.width) * spec.height));
-		}
-
-		auto typedesc = compressed::enums::get_type_desc<T>();
-		auto ok = input_ptr->read_image(subimage, 0, 0, spec.nchannels, typedesc, static_cast<void*>(pixels.data()));
-		if (!ok)
-		{
-			throw std::runtime_error(std::format("Image {} failed to read because: {}", filepath.string(), input_ptr->geterror()));
-		}
-		compressed::image_algo::deinterleave(std::span<const T>(pixels), channels);
-		return channels;
-	}
-
-
-	/// Compare two nested vectors (representing two multi-channel images), ensuring their contents are equal.
-	/// 
-	/// This function checks if two images (stored as `std::vector<std::vector<T>>`) have the same number of channels, 
-	/// that each channel contains the same number of elements, and that all pixel values match. 
-	/// If any discrepancy is found, a detailed exception is thrown.
-	/// 
-	/// \tparam T The pixel data type (e.g., uint8_t, float).
-	/// \param a The first image to compare.
-	/// \param b The second image to compare.
-	/// \param name A label for the images, used in error messages.
-	/// \throws std::runtime_error if the images differ in structure or content.
-	template <typename T>
-	void compare_images(std::vector<std::vector<T>> a, std::vector<std::vector<T>> b, std::string name)
-	{
-		if (a.size() != b.size())
-		{
-			throw std::runtime_error(std::format("{}: Error while comparing images, mismatch in number of channels {} : {}", name, a.size(), b.size()));
-		}
-
-		for (auto channel_idx : std::views::iota(static_cast<size_t>(0), a.size()))
-		{
-			if (a[channel_idx].size() != b[channel_idx].size())
-			{
-				throw std::runtime_error(
-					std::format("{}: Error while comparing images, mismatch in number of items while comparing channel {} a: {:L} b: {:L}",
-						name,
-						channel_idx,
-						a[channel_idx].size(),
-						b[channel_idx].size())
-				);
-			}
-
-			for (auto i : std::views::iota(static_cast<size_t>(0), a[channel_idx].size()))
-			{
-				if (a[channel_idx][i] != b[channel_idx][i])
-				{
-					throw std::runtime_error(
-						std::format("{}: Error while comparing images, mismatch at element {} in channel {}. a: {}, b: {}",
-							name,
-							i,
-							channel_idx,
-							a[channel_idx][i],
-							b[channel_idx][i])
-					);
-				}
-			}
-		}
-	}
-
-
-	/// Parametrize the given test lambda for the given types.
-	template <typename... Types, typename Lambda>
-	void parametrize(Lambda&& lambda)
-	{
-		(lambda(Types{}), ...);
-	}
-
-	namespace detail
-	{
-		template <typename Container1, typename Container2>
-		concept ContainerPair = requires(Container1 x, Container2 y) {
-			{ x.size() } -> std::convertible_to<std::size_t>;
-			{ y.size() } -> std::convertible_to<std::size_t>;
-			{ x[0] } -> std::same_as<decltype(y[0])>;
-		};
-
-		template <typename Container, typename T>
-		concept ContainerAndValue = requires(Container x, T val) {
-			{ x.size() } -> std::convertible_to<std::size_t>;
-			{ x[0] != val } -> std::convertible_to<bool>;
-		};
-
-	} // detail
-
-
-	template <typename Container1, typename Container2>
-		requires detail::ContainerPair<Container1, Container2>
-	void check_vector_verbose(const Container1& x, const Container2& y) 
-	{
-		REQUIRE(x.size() == y.size());
-		for (size_t i = 0; i < x.size(); ++i) {
-			if (x[i] != y[i]) {
-				REQUIRE_MESSAGE(x[i] == y[i], "Failed vector index: " << i);
-			}
-		}
-	}
-
-	template <typename Container, typename T>
-		requires detail::ContainerAndValue<Container, T>
-	void check_vector_verbose(const Container& x, T y)
-	{
-		for (size_t i = 0; i < x.size(); ++i) {
-			if (x[i] != y) {
-				REQUIRE_MESSAGE(x[i] == y, "Failed vector index: " << i);
-			}
-		}
-	}
-
-}
\ No newline at end of file
+    /// Read the image using OpenImageIO (OIIO) and deinterleave all the channels into discrete buffers.
+    ///
+    /// This function opens an image file using OIIO, seeks to the requested subimage (mip level 0), reads its
+    /// pixel data into a single interleaved buffer, and then separates it into one buffer per channel.
+    /// \tparam T The pixel data type (e.g., uint8_t, float).
+    /// \param filepath The file path to the image.
+    /// \param subimage Index of the subimage to read; defaults to 0.
+    /// \return A vector of vectors, where each inner vector represents a deinterleaved channel.
+    /// \throws std::runtime_error if the image fails to open, the subimage does not exist, or the read fails.
+    template <typename T>
+    std::vector<std::vector<T>> read_oiio(std::filesystem::path filepath, int subimage = 0)
+    {
+        auto input_ptr = OIIO::ImageInput::open(filepath.string());
+        if (!input_ptr)
+        {
+            throw std::runtime_error(std::format("Failed to open image {}", filepath.string()));
+        }
+        auto res = input_ptr->seek_subimage(subimage, 0);
+        if (!res)
+        {
+            throw std::runtime_error(std::format("Image {} does not contain subimage {}", filepath.string(), subimage));
+        }
+        const OIIO::ImageSpec& spec = input_ptr->spec();
+        std::vector<T> pixels(static_cast<size_t>(spec.width) * spec.height * spec.nchannels);
+        std::vector<std::vector<T>> channels;
+        for ([[maybe_unused]] auto _ : std::views::iota(0, spec.nchannels))
+        {
+            channels.push_back(std::vector<T>(static_cast<size_t>(spec.width) * spec.height));
+        }
+
+        auto typedesc = compressed::enums::get_type_desc<T>();
+        auto ok = input_ptr->read_image(subimage, 0, 0, spec.nchannels, typedesc, static_cast<void*>(pixels.data()));
+        if (!ok)
+        {
+            throw std::runtime_error(
+                std::format("Image {} failed to read because: {}", filepath.string(), input_ptr->geterror())
+            );
+        }
+        compressed::image_algo::deinterleave(std::span<const T>(pixels), channels);
+        return channels;
+    }
+
+
+    /// Compare two nested vectors (representing two multi-channel images), ensuring their contents are equal.
+    ///
+    /// This function checks if two images (stored as `std::vector<std::vector<T>>`) have the same number of channels,
+    /// that each channel contains the same number of elements, and that all pixel values match.
+    /// The first discrepancy found is reported by throwing; both images are only read, never copied or modified.
+    ///
+    /// \tparam T The pixel data type (e.g., uint8_t, float).
+    /// \param a The first image to compare.
+    /// \param b The second image to compare.
+    /// \param name A label for the images, used in error messages.
+    /// \throws std::runtime_error if the images differ in structure or content.
+    template <typename T>
+    void compare_images(const std::vector<std::vector<T>>& a, const std::vector<std::vector<T>>& b, std::string name)
+    {
+        if (a.size() != b.size())
+        {
+            throw std::runtime_error(
+                std::format(
+                    "{}: Error while comparing images, mismatch in number of channels {} : {}",
+                    name,
+                    a.size(),
+                    b.size()
+                )
+            );
+        }
+
+        for (auto channel_idx : std::views::iota(static_cast<size_t>(0), a.size()))
+        {
+            if (a[channel_idx].size() != b[channel_idx].size())
+            {
+                throw std::runtime_error(
+                    std::format(
+                        "{}: Error while comparing images, mismatch in number of items while comparing channel {} a: {:L} b: {:L}",
+                        name,
+                        channel_idx,
+                        a[channel_idx].size(),
+                        b[channel_idx].size()
+                    )
+                );
+            }
+
+            for (auto i : std::views::iota(static_cast<size_t>(0), a[channel_idx].size()))
+            {
+                if (a[channel_idx][i] != b[channel_idx][i])
+                {
+                    throw std::runtime_error(
+                        std::format(
+                            "{}: Error while comparing images, mismatch at element {} in channel {}. a: {}, b: {}",
+                            name,
+                            i,
+                            channel_idx,
+                            a[channel_idx][i],
+                            b[channel_idx][i]
+                        )
+                    );
+                }
+            }
+        }
+    }
+
+
+    /// Invoke `lambda` once per type in `Types` with a value-initialized instance, each inside its own SUBCASE.
+    template <typename... Types, typename Lambda>
+    void parametrize(Lambda&& lambda)
+    {
+        ([&]<typename T>()
+        {
+            if constexpr (std::is_same_v<T, Imath::half>) // half gets a fixed, readable subcase name
+            {
+                SUBCASE("<half>")
+                    lambda(T{});
+            }
+            else
+            {
+                const std::string name = std::format("<{}>", typeid(T).name()); // name() is implementation-defined
+                SUBCASE(name.c_str())
+                    lambda(T{});
+            }
+        }.template operator()<Types>(), ...); // comma fold: one invocation per type, in declaration order
+    }
+
+    template <typename Lambda>
+    void parametrize_codecs(Lambda&& lambda)
+    {
+        // 1. The codecs to exercise: the CPU codecs first, followed by the *_gpu ones
+        constexpr std::array all_codecs = {
+            compressed::enums::codec::blosclz,
+            compressed::enums::codec::lz4,
+            compressed::enums::codec::lz4hc,
+            compressed::enums::codec::zstd,
+            compressed::enums::codec::lz4_gpu,
+            compressed::enums::codec::snappy_gpu,
+            compressed::enums::codec::zstd_gpu,
+            compressed::enums::codec::deflate_gpu,
+            compressed::enums::codec::gdeflate_gpu,
+            compressed::enums::codec::cascaded_gpu
+        };
+
+        for (const auto codec : all_codecs)
+        {
+            // 2. Skip GPU codecs at runtime when CUDA is not available
+            if (compressed::enums::is_gpu_codec(codec) && !compressed::cuda::is_available())
+            {
+                continue;
+            }
+
+            // 3. Keep the name in a named local so the c_str() pointer stays valid while the SUBCASE runs
+            std::string codec_name = std::string(compressed::enums::to_string(codec));
+
+            SUBCASE(codec_name.c_str())
+            {
+                lambda(codec);
+            }
+        }
+    }
+
+    namespace detail
+    {
+        template <typename Container1, typename Container2>
+        concept ContainerPair = requires(Container1 x, Container2 y) // two sized, indexable containers
+        {
+            { x.size() } -> std::convertible_to<std::size_t>;
+            { y.size() } -> std::convertible_to<std::size_t>;
+            { x[0] } -> std::same_as<decltype(y[0])>; // indexing both must yield exactly the same type
+        };
+
+        template <typename Container, typename T>
+        concept ContainerAndValue = requires(Container x, T val) // a container plus a value comparable to its elements
+        {
+            { x.size() } -> std::convertible_to<std::size_t>;
+            { x[0] != val } -> std::convertible_to<bool>;
+        };
+    } // namespace detail
+
+    /// Require that `x` and `y` have equal size and equal elements, reporting the index of a mismatching element.
+    template <typename Container1, typename Container2>
+        requires detail::ContainerPair<Container1, Container2>
+    void check_vector_verbose(const Container1& x, const Container2& y)
+    {
+        REQUIRE(x.size() == y.size());
+        for (size_t i = 0; i < x.size(); ++i)
+        {
+            if (x[i] != y[i]) // the assertion macro is only reached for mismatching elements
+            {
+                REQUIRE_MESSAGE(x[i] == y[i], "Failed vector index: " << i);
+            }
+        }
+    }
+
+    template <typename Container, typename T>
+        requires detail::ContainerAndValue<Container, T>
+    void check_vector_verbose(const Container& x, T y) // overload: every element of x must equal the value y
+    {
+        for (size_t i = 0; i < x.size(); ++i)
+        {
+            if (x[i] != y) // the assertion macro is only reached for mismatching elements
+            {
+                REQUIRE_MESSAGE(x[i] == y, "Failed vector index: " << i);
+            }
+        }
+    }
+}
diff --git a/thirdparty/spdlog b/thirdparty/spdlog
new file mode 160000
index 0000000..79524dd
--- /dev/null
+++ b/thirdparty/spdlog
@@ -0,0 +1 @@
+Subproject commit 79524ddd08a4ec981b7fea76afd08ee05f83755d
diff --git a/thirdparty/vcpkg b/thirdparty/vcpkg
index 5a2324f..120deac 160000
--- a/thirdparty/vcpkg
+++ b/thirdparty/vcpkg
@@ -1 +1 @@
-Subproject commit 5a2324f6667233aeb903d3117f6fd259a2be6f8b
+Subproject commit 120deac3062162151622ca4860575a33844ba10b
diff --git a/vcpkg.json b/vcpkg.json
index 0f1e71b..d4b713c 100644
--- a/vcpkg.json
+++ b/vcpkg.json
@@ -6,7 +6,7 @@
   "overrides": [
     {
       "name": "openimageio",
-      "version": "2.5.16.0"
+      "version": "3.0.9.1"
     }
   ]
 }
\ No newline at end of file