diff --git a/CMakeLists.txt b/CMakeLists.txt index ea7ab493..613261ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,32 @@ set(CMAKE_C_FLAGS "-std=c99") # project name project( gpr ) +# Enable link-time optimization for cross-TU inlining (GetBits, GetBuffer, etc.) +cmake_policy(SET CMP0069 NEW) +include(CheckIPOSupported) +check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output) +if(ipo_supported) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + message(STATUS "LTO/IPO enabled") +else() + message(STATUS "LTO/IPO not supported: ${ipo_output}") +endif() + +# Enable NEON on arm64 (Apple Silicon defaults to ON) +set(NEON_DEFAULT OFF) +if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)") + set(NEON_DEFAULT ON) +endif() +option(NEON "Enable ARM NEON intrinsics" ${NEON_DEFAULT}) + +if(NEON) + add_compile_definitions(NEON=1) +else() + add_compile_definitions(NEON=0) +endif() + +message(STATUS "NEON enabled: ${NEON}") + if(EXISTS "${CMAKE_SOURCE_DIR}/.git") execute_process( COMMAND git rev-parse --abbrev-ref HEAD diff --git a/docs/quality-analysis.md b/docs/quality-analysis.md new file mode 100644 index 00000000..a01515ec --- /dev/null +++ b/docs/quality-analysis.md @@ -0,0 +1,224 @@ +# GPR Quality Analysis — 16-Bit + +Test image: Hasselblad X2D 100C, 11664x8750, 16-bit Bayer (194.7 MB raw) +Source: `2024_12_Dec_Austin_0186.dng` (138.2 MB lossless DNG) + +## File Size + +| Q | GPR Size | Compression | vs DNG | Space Saving | +|---|----------|-------------|--------|--------------| +| 0 | 21.4 MB | 9.1x | 6.4x smaller | 84.5% | +| 1 | 28.7 MB | 6.8x | 4.8x smaller | 79.2% | +| 2 | 39.2 MB | 5.0x | 3.5x smaller | 71.6% | +| 3 | 46.0 MB | 4.2x | 3.0x smaller | 66.7% | +| 4 | 53.0 MB | 3.7x | 2.6x smaller | 61.6% | +| 5 | 66.9 MB | 2.9x | 2.1x smaller | 51.6% | +| 6 | 89.8 MB | 2.2x | 1.5x smaller | 35.0% | +| 7 | 90.9 MB | 2.1x | 1.5x smaller | 34.2% | +| 8 | 92.0 MB | 2.1x | 1.5x smaller | 33.4% | + +Compression ratio is vs uncompressed 16-bit raw 
(194.7 MB). + +## Raw Bayer Domain (pre-demosaic) + +This is the domain RAW editors (Lightroom, Capture One, etc.) operate in. +Errors here directly affect all downstream edits. + +| Q | PSNR | RMS Error | MAE | MAE % | Max Error | ENOB | +|---|------|-----------|-----|-------|-----------|------| +| 0 | 40.0 dB | 655 DN | 334 DN | 0.509% | 11454 DN | 6.6 bits | +| 1 | 44.6 dB | 386 DN | 209 DN | 0.319% | 6682 DN | 7.4 bits | +| 2 | 48.5 dB | 246 DN | 136 DN | 0.208% | 4437 DN | 8.1 bits | +| 3 | 50.0 dB | 207 DN | 116 DN | 0.176% | 4115 DN | 8.3 bits | +| 4 | 51.2 dB | 181 DN | 100 DN | 0.153% | 4179 DN | 8.5 bits | +| 5 | 52.1 dB | 163 DN | 89 DN | 0.135% | 3749 DN | 8.7 bits | +| 6 | 56.5 dB | 98 DN | 53 DN | 0.081% | 2190 DN | 9.4 bits | +| 7 | 58.6 dB | 77 DN | 41 DN | 0.062% | 1597 DN | 9.7 bits | +| 8 | 58.9 dB | 74 DN | 39 DN | 0.060% | 1566 DN | 9.8 bits | + +- ENOB = Effective Number of Bits = PSNR / 6.02 +- DN = Digital Number (out of 65535 for 16-bit) +- This sensor captures ~14 bits of real data at ISO 100 + +## Rendered RGB (16-bit sRGB linear) + +After demosaicing + white balance + color matrix. This is what you see on screen. 
+ +| Q | PSNR | R PSNR | G PSNR | B PSNR | MAE | >0.5% | >1% | +|---|------|--------|--------|--------|-----|-------|-----| +| 0 | 36.9 dB | 35.2 dB | 39.6 dB | 37.1 dB | 468 DN | 36.8% | 20.4% | +| 1 | 40.5 dB | 38.7 dB | 43.4 dB | 40.7 dB | 317 DN | 28.7% | 13.0% | +| 2 | 43.5 dB | 41.6 dB | 46.4 dB | 43.7 dB | 224 DN | 20.4% | 7.4% | +| 3 | 44.5 dB | 42.6 dB | 47.5 dB | 44.7 dB | 196 DN | 17.2% | 5.8% | +| 4 | 45.4 dB | 43.4 dB | 48.5 dB | 45.6 dB | 176 DN | 14.9% | 4.7% | +| 5 | 46.1 dB | 44.1 dB | 49.4 dB | 46.4 dB | 160 DN | 13.1% | 3.9% | +| 6 | 49.5 dB | 47.7 dB | 52.2 dB | 49.7 dB | 95 DN | 5.4% | 1.2% | +| 7 | 50.9 dB | 49.3 dB | 53.2 dB | 51.2 dB | 72 DN | 3.2% | 0.7% | +| 8 | 51.1 dB | 49.5 dB | 53.4 dB | 51.4 dB | 69 DN | 3.0% | 0.6% | + +- Red channel is weakest (lowest PSNR) — typical for Bayer demosaic +- ">0.5%" / ">1%" = % of pixel values with error exceeding that fraction of full scale + +## Practical Editing Guidance + +Context: This sensor (X2D 100C) captures ~14 bits of real data at ISO 100. +Compression ENOB ranges from 6.6 bits (Q0) to 9.8 bits (Q8). +Compression error is always above sensor noise, so it is the limiting factor. 
+ +### Q0 — Aggressive Compression (9.1x, 21 MB) + +Raw ENOB: 6.6 bits | PSNR: 40.0 dB | RMS error: 655 DN + +- OK: Export to JPEG/web at 4K or smaller — artifacts masked by downsampling + 8-bit +- OK: Basic white balance and exposure +/- 0.5 stop +- OK: Light color grading (LUT application, saturation +/- 20%) +- NO: Shadow recovery — pushing shadows reveals quantization banding +- NO: Heavy curves / local contrast — amplifies per-band wavelet artifacts +- NO: Large prints from full resolution — artifacts visible at ~200 PPI+ + +Best for: Preview/proxy files, web gallery, social media, quick turnaround + +### Q1 — High Compression (6.8x, 29 MB) + +Raw ENOB: 7.4 bits | PSNR: 44.6 dB | RMS error: 386 DN + +- OK: Exposure adjustment +/- 1 stop +- OK: Moderate white balance correction +- OK: Saturation and vibrance adjustments +- MAYBE: Shadow push up to +1 stop — minor artifacts in deep shadows +- NO: Heavy shadow recovery (more than +1 stop) + +Best for: Event photography, documentary, editorial with light edits + +### Q2-Q3 — Moderate Compression (5.0-4.2x, 39-46 MB) + +Raw ENOB: 8.1-8.3 bits | PSNR: 48.5-50.0 dB | RMS error: 207-246 DN + +- OK: Exposure +/- 1.5 stops +- OK: Moderate shadow recovery (+1.5 stops) +- OK: Color grading and creative white balance shifts +- OK: Clarity / texture / dehaze adjustments +- OK: Print up to 24x36" at 300 PPI +- MAYBE: Aggressive curves — some wavelet ringing in smooth gradients +- NO: Extreme shadow push (> +2 stops) + +Best for: Professional editorial, portrait, landscape with moderate edits + +### Q4-Q5 — Balanced (3.7-2.9x, 53-67 MB) + +Raw ENOB: 8.5-8.7 bits | PSNR: 51.2-52.1 dB | RMS error: 163-181 DN + +- OK: Exposure +/- 2 stops +- OK: Shadow recovery up to +2 stops +- OK: Aggressive color grading (split toning, cross-process, film emulation) +- OK: Local adjustments (dodge/burn, graduated filters, radial masks) +- OK: Sharpening and noise reduction +- OK: Large prints at full resolution +- MAYBE: Extreme shadow push 
(+3 stops) — faint banding in deepest shadows + +Best for: Professional photography with full editing workflow + +### Q6-Q8 — Near-Lossless (2.2-2.1x, 90-92 MB) + +Raw ENOB: 9.4-9.8 bits | PSNR: 56.5-58.9 dB | RMS error: 74-98 DN + +- OK: Shadow recovery up to +3 stops +- OK: Extreme exposure adjustments (+/- 3 stops) +- OK: Aggressive local contrast / clarity / HDR tone mapping +- OK: Fine art printing at maximum size +- OK: Pixel-level compositing and retouching +- MAYBE: Extreme shadow push (+4 stops) — may show faint artifacts + +Note: Q7 and Q8 are nearly identical (58.6 vs 58.9 dB) because the +quantization divisor hits the noise floor. Q6 offers the best +quality/size tradeoff in this tier. + +Best for: Archival, fine art, maximum editing flexibility + +## Key Insight + +This is a 16-bit sensor with ~14 bits of real data. Even Q8 (best quality) +only preserves ~10 bits of the 14 available. The remaining ~4 bits of sensor +precision are lost to wavelet quantization. For truly lossless archival, +keep the original DNG. + +However: for practical photography, 10 bits of ENOB in the raw domain +translates to 8+ bits of final output precision after demosaic + color +processing — more than sufficient for any 8-bit display or print. + +The compression artifacts are in the wavelet domain (smooth spatial patterns +across 8x8 or 16x16 pixel blocks), not in the pixel domain. They are less +visually objectionable than JPEG artifacts at the same PSNR. + +## Per-Channel Quantization Scaling + +VC5 encodes Bayer data in color-difference space (not raw RGGB): + +| Channel | Name | Content | +|---------|------|---------| +| 0 (GS) | Green Sum | (Gr+Gb)/2 — luminance | +| 1 (RG) | Red-Green | R-G — chrominance | +| 2 (BG) | Blue-Green | B-G — chrominance | +| 3 (GD) | Green Diff | (Gr-Gb)/2 — detail/noise | + +The `--ChannelScale` flag (or `channel_quant_scale` API) applies per-channel +multipliers to quantization. Scale > 1.0 = more compression, < 1.0 = less. 
+ +### Test Results (Q4 base, 100MP 16-bit) + +| Config | Scale (GS,RG,BG,GD) | Size | Raw PSNR | RGB PSNR | G PSNR | R PSNR | B PSNR | +|--------|---------------------|------|----------|----------|--------|--------|--------| +| Uniform Q4 | 1,1,1,1 | 53.0 MB | 51.2 dB | 44.6 dB | 49.2 dB | 42.0 dB | 45.3 dB | +| A | 1.0, 2.0, 2.0, 3.0 | 39.9 MB | 46.2 dB | 39.9 dB | 45.0 dB | 37.2 dB | 40.7 dB | +| B | 1.0, 3.0, 3.0, 4.0 | 35.3 MB | 43.7 dB | 37.4 dB | 43.2 dB | 34.5 dB | 38.4 dB | +| C | 0.5, 2.0, 2.0, 2.0 | 47.4 MB | 47.5 dB | 40.4 dB | 46.3 dB | 37.5 dB | 41.4 dB | +| D | 1.0, 4.0, 4.0, 6.0 | 31.9 MB | 41.3 dB | 35.5 dB | 41.2 dB | 32.6 dB | 36.5 dB | + +### Size-Matched Comparison (per-channel vs uniform) + +| Per-Channel | Size | RGB PSNR | Uniform Match | Size | RGB PSNR | Delta | +|-------------|------|----------|---------------|------|----------|-------| +| Q4+A(1,2,2,3) | 39.9 MB | 39.9 dB | Uniform Q2 | 39.2 MB | 42.8 dB | -2.9 dB | +| Q4+C(.5,2,2,2) | 47.4 MB | 40.4 dB | Uniform Q3 | 46.0 MB | 43.8 dB | -3.4 dB | + +### Analysis + +At the same file size, uniform quality presets outperform per-channel scaling +by 2-3 dB. The reason: VC5's quality tables are already carefully tuned for +the color-difference space. Each preset has per-band ratios (e.g., lower Q for +low-frequency, higher Q for high-frequency) that are perceptually meaningful. +A uniform multiplier across all bands within a channel is less efficient than +using a different preset designed as a coherent whole. 
+ +### When Per-Channel Scaling IS Useful + +- **Chroma-insensitive workflows**: Machine vision, scientific imaging where + only luminance matters — compress RG/BG/GD aggressively +- **Noise-dominated detail channel**: At high ISO, GD is mostly noise — + compressing it with scale 1.5-2.0 saves space with no visible loss +- **Fine-tuning between presets**: When Q3 is too large and Q2 is too lossy, + `Q3 -c "1.0,1.0,1.0,1.3"` gives a size between Q2 and Q3 +- **Artistic intent**: Deliberately softening color detail while preserving + luminance sharpness + +### CLI Usage + +``` +# Compress chroma 2x, detail 3x (GS untouched) +gpr_tools -i input.dng -o output.gpr -q 4 -c "1.0,2.0,2.0,3.0" + +# Boost luminance quality, compress chroma mildly +gpr_tools -i input.dng -o output.gpr -q 3 -c "0.7,1.0,1.0,1.3" +``` + +## Recommended Presets + +| Use Case | Recommended | Size (100MP) | PSNR | +|----------|-------------|-------------|------| +| Proxy / preview / web gallery | Q0 | 21 MB | 40.0 dB | +| Event / documentary (light edit) | Q1 | 29 MB | 44.6 dB | +| Editorial / portrait | Q2 or Q3 | 39-46 MB | 48-50 dB | +| Professional (full edit workflow) | Q4 or Q5 | 53-67 MB | 51-52 dB | +| Archival / fine art | Q6 | 90 MB | 56.5 dB | +| Maximum quality (diminishing returns) | Q7 or Q8 | 91-92 MB | 59 dB | +| Truly lossless | Original DNG | 138 MB | infinite | diff --git a/source/app/gpr_tools/main.cpp b/source/app/gpr_tools/main.cpp index e182028d..f40e1007 100755 --- a/source/app/gpr_tools/main.cpp +++ b/source/app/gpr_tools/main.cpp @@ -65,11 +65,13 @@ class my_argument_parser : public argument_parser string rgb_file_resolution; int rgb_file_bits; - + string output_file_path; string apply_gpr_parameters; - + + int quality; + public: bool get_verbose() { return verbose; } @@ -98,9 +100,9 @@ class my_argument_parser : public argument_parser ("InputHeight,h", input_height, 3000, "Input image height in pixel samples [3000]") - ("InputPitch,p", input_pitch, 8000, "Input image 
pitch in bytes [8000]") + ("InputPitch,p", input_pitch, 0, "Input image pitch in bytes (0 = auto from width)") - ("InputPixelFormat,x", input_pixel_format, string("rggb14"), "Input pixel format \n(rggb12, rggb12p, [rggb14], gbrg12, gbrg12p)") + ("InputPixelFormat,x", input_pixel_format, string("rggb14"), "Input pixel format \n(rggb12, rggb12p, [rggb14], rggb16, gbrg12, gbrg12p, gbrg16)") ("ApplyGprParameters,a", apply_gpr_parameters, string(""), "Parameters to use for GPR or DNG file.") @@ -109,7 +111,9 @@ class my_argument_parser : public argument_parser ("RgbFileResolution,r", rgb_file_resolution, string(""), "Output RGB resolution \n[1:1, 2:1, 4:1, 8:1. 16:1]") ("RgbFileBits,b", rgb_file_bits, 8, "Output RGB bits [8]") - ("OutputFilePath,o", output_file_path, string(""), "Output file path.\n(files types: GPR, DNG, PPM, RAW, JPG)"); + ("OutputFilePath,o", output_file_path, string(""), "Output file path.\n(files types: GPR, DNG, PPM, RAW, JPG)") + + ("Quality,q", quality, -1, "Encoder quality\n(0=Low, 1=Medium, 2=High, 3=FS1, 4=FSX, 5=FS2, 6=FS3, 7=FS4, 8=FS5, -1=auto)"); ; } }; @@ -195,7 +199,7 @@ int main(int argc, char *argv []) { return dng_convert_main(args.input_file_path.c_str(), args.input_width, args.input_height, args.input_pitch, args.input_skip_rows, args.input_pixel_format.c_str(), args.output_file_path.c_str(), args.apply_gpr_parameters.c_str(), args.gpmf_file_path.c_str(), args.rgb_file_resolution.c_str(), args.rgb_file_bits, - args.jpg_preview_file_path.c_str(), args.jpg_preview_file_width, args.jpg_preview_file_height ); + args.jpg_preview_file_path.c_str(), args.jpg_preview_file_width, args.jpg_preview_file_height, args.quality ); } return 0; diff --git a/source/app/gpr_tools/main_c.c b/source/app/gpr_tools/main_c.c index b8a331a9..73cfb7ed 100755 --- a/source/app/gpr_tools/main_c.c +++ b/source/app/gpr_tools/main_c.c @@ -97,7 +97,7 @@ static FILE_TYPE GetFileType( const char* file_path ) int dng_convert_main(const char* input_file_path, unsigned 
int input_width, unsigned int input_height, size_t input_pitch, size_t input_skip_rows, const char* input_pixel_format, const char* output_file_path, const char* metadata_file_path, const char* gpmf_file_path, const char* rgb_file_resolution, int rgb_file_bits, - const char* jpg_preview_file_path, int jpg_preview_file_width, int jpg_preview_file_height ) + const char* jpg_preview_file_path, int jpg_preview_file_width, int jpg_preview_file_height, int quality ) { bool success; bool write_buffer_to_file = true; @@ -123,7 +123,8 @@ int dng_convert_main(const char* input_file_path, unsigned int input_width, uns gpr_parameters params; gpr_parameters_set_defaults(¶ms); - + params.quality = quality; + gpr_buffer input_buffer = { NULL, 0 }; if( read_from_file( &input_buffer, input_file_path, allocator.Alloc, allocator.Free ) != 0 ) @@ -144,53 +145,76 @@ int dng_convert_main(const char* input_file_path, unsigned int input_width, uns { params.input_width = input_width; params.input_height = input_height; - params.input_pitch = input_pitch; - + int32_t saturation_level = params.tuning_info.dgain_saturation_level.level_red; - + if( output_file_type == FILE_TYPE_GPR ) saturation_level = (1 << 14) - 1; else if( output_file_type == FILE_TYPE_DNG ) saturation_level = (1 << 12) - 1; - + if( strcmp(input_pixel_format, "rggb12") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_RGGB_12; - - if( input_pitch == -1 ) + + saturation_level = (1 << 12) - 1; + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = input_width * 2; } if( strcmp(input_pixel_format, "rggb12p") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_RGGB_12P; - - if( input_pitch == -1 ) + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = (input_width * 3 / 4) * 2; } else if( strcmp(input_pixel_format, "rggb14") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_RGGB_14; - + saturation_level = (1 << 14) - 1; - if( input_pitch == -1 ) + if( input_pitch == 0 || input_pitch 
== (size_t)-1 ) + input_pitch = input_width * 2; + } + else if( strcmp(input_pixel_format, "rggb16") == 0 ) + { + params.tuning_info.pixel_format = PIXEL_FORMAT_RGGB_16; + + saturation_level = (1 << 16) - 1; + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = input_width * 2; } else if( strcmp(input_pixel_format, "gbrg12") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_GBRG_12; - - if( input_pitch == -1 ) + + saturation_level = (1 << 12) - 1; + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = input_width * 2; } else if( strcmp(input_pixel_format, "gbrg12p") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_GBRG_12P; - - if( input_pitch == -1 ) + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = (input_width * 3 / 4) * 2; } - + else if( strcmp(input_pixel_format, "gbrg16") == 0 ) + { + params.tuning_info.pixel_format = PIXEL_FORMAT_GBRG_16; + + saturation_level = (1 << 16) - 1; + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) + input_pitch = input_width * 2; + } + + params.input_pitch = input_pitch; + params.tuning_info.dgain_saturation_level.level_red = saturation_level; params.tuning_info.dgain_saturation_level.level_green_even = saturation_level; params.tuning_info.dgain_saturation_level.level_green_odd = saturation_level; @@ -344,4 +368,3 @@ int dng_convert_main(const char* input_file_path, unsigned int input_width, uns return 0; } - diff --git a/source/app/gpr_tools/main_c.h b/source/app/gpr_tools/main_c.h index a8bd308e..6b2f59b1 100755 --- a/source/app/gpr_tools/main_c.h +++ b/source/app/gpr_tools/main_c.h @@ -25,7 +25,7 @@ extern "C" { int dng_convert_main(const char* input_file_path, unsigned int input_width, unsigned int input_height, size_t input_pitch, size_t input_skip_rows, const char* input_pixel_format, const char* output_file_path, const char* exiftool_file_path, const char* gpmf_file_path, const char* rgb_file_resolution, int rgb_file_bits, - const char* 
jpg_preview_file_path, int jpg_preview_file_width, int jpg_preview_file_height ); + const char* jpg_preview_file_path, int jpg_preview_file_width, int jpg_preview_file_height, int quality ); #ifdef __cplusplus } diff --git a/source/app/vc5_decoder_app/main.cpp b/source/app/vc5_decoder_app/main.cpp index f9255ed1..e82d050c 100755 --- a/source/app/vc5_decoder_app/main.cpp +++ b/source/app/vc5_decoder_app/main.cpp @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) file.fill( '0' ); file.width( 4 ); - file << (DecoderLogCurve[i] >> 4); + file << (DecoderLogCurve12[i] >> 4); file << endl; } diff --git a/source/lib/gpr_sdk/private/gpr.cpp b/source/lib/gpr_sdk/private/gpr.cpp index 516c0f5f..0861b28c 100755 --- a/source/lib/gpr_sdk/private/gpr.cpp +++ b/source/lib/gpr_sdk/private/gpr.cpp @@ -160,15 +160,26 @@ static void set_vc5_encoder_parameters( vc5_encoder_parameters& vc5_encoder_para case PIXEL_FORMAT_GBRG_12P: vc5_encoder_params.pixel_format = VC5_ENCODER_PIXEL_FORMAT_GBRG_12P; break; + + case PIXEL_FORMAT_RGGB_16: + vc5_encoder_params.pixel_format = VC5_ENCODER_PIXEL_FORMAT_RGGB_16; + break; + + case PIXEL_FORMAT_GBRG_16: + vc5_encoder_params.pixel_format = VC5_ENCODER_PIXEL_FORMAT_GBRG_16; + break; default: break; } - if( convert_params->fast_encoding ) + if( convert_params->quality >= 0 && convert_params->quality < VC5_ENCODER_QUALITY_SETTING_COUNT ) + vc5_encoder_params.quality_setting = (VC5_ENCODER_QUALITY_SETTING)convert_params->quality; + else if( convert_params->fast_encoding ) vc5_encoder_params.quality_setting = VC5_ENCODER_QUALITY_SETTING_MEDIUM; else vc5_encoder_params.quality_setting = VC5_ENCODER_QUALITY_SETTING_FS1; + } #endif @@ -185,20 +196,21 @@ void gpr_parameters_set_defaults(gpr_parameters* x) x->compute_md5sum = false; x->fast_encoding = false; + x->quality = -1; } void gpr_parameters_construct_copy(const gpr_parameters* y, gpr_parameters* x, gpr_malloc mem_alloc) { gpr_parameters_set_defaults(x); - + *x = *y; - + if( y->gpmf_payload.size > 
0 && y->gpmf_payload.buffer != NULL ) { x->gpmf_payload.buffer = mem_alloc( y->gpmf_payload.size ); memcpy( x->gpmf_payload.buffer, y->gpmf_payload.buffer, y->gpmf_payload.size ); } - + if( y->tuning_info.gain_map.size > 0 ) { for ( int i = 0; i < 4; i++) @@ -210,6 +222,29 @@ void gpr_parameters_construct_copy(const gpr_parameters* y, gpr_parameters* x, g } } } + + // Deep-copy HueSatMap data + if( y->profile_info.hue_sat_map_dims[0] > 0 ) + { + uint64_t count64 = (uint64_t)y->profile_info.hue_sat_map_dims[0] * + y->profile_info.hue_sat_map_dims[1] * + y->profile_info.hue_sat_map_dims[2]; + /* Sanity check: reject absurdly large maps (> ~8M entries = ~96MB) */ + if( count64 > 8000000 ) count64 = 0; + uint32_t count = (uint32_t)count64; + size_t data_size = count * 3 * sizeof(float); + + if( y->profile_info.hue_sat_map_data1 != NULL ) + { + x->profile_info.hue_sat_map_data1 = (float *)mem_alloc( data_size ); + memcpy( x->profile_info.hue_sat_map_data1, y->profile_info.hue_sat_map_data1, data_size ); + } + if( y->profile_info.hue_sat_map_data2 != NULL ) + { + x->profile_info.hue_sat_map_data2 = (float *)mem_alloc( data_size ); + memcpy( x->profile_info.hue_sat_map_data2, y->profile_info.hue_sat_map_data2, data_size ); + } + } } void gpr_parameters_destroy(gpr_parameters* x, gpr_free mem_free) @@ -218,7 +253,7 @@ void gpr_parameters_destroy(gpr_parameters* x, gpr_free mem_free) { mem_free( x->gpmf_payload.buffer ); } - + if( x->tuning_info.gain_map.size > 0 ) { for ( int i = 0; i < 4; i++) @@ -227,6 +262,17 @@ void gpr_parameters_destroy(gpr_parameters* x, gpr_free mem_free) mem_free( x->tuning_info.gain_map.buffers[i] ); } } + + if( x->profile_info.hue_sat_map_data1 ) + { + mem_free( x->profile_info.hue_sat_map_data1 ); + x->profile_info.hue_sat_map_data1 = NULL; + } + if( x->profile_info.hue_sat_map_data2 ) + { + mem_free( x->profile_info.hue_sat_map_data2 ); + x->profile_info.hue_sat_map_data2 = NULL; + } } @@ -443,11 +489,21 @@ static void 
convert_dng_exif_to_dng_exif_info( gpr_exif_info* dst_exif, const dn dst_exif->date_time_original = convert_to_dng_date_and_time( src_exif->fDateTimeOriginal.DateTime() ); dst_exif->date_time_digitized = convert_to_dng_date_and_time( src_exif->fDateTimeOriginal.DateTime() ); - assert(src_exif->fSoftware.Length() < sizeof(dst_exif->software_version)); - memcpy( dst_exif->software_version, src_exif->fSoftware.Get(), src_exif->fSoftware.Length() ); + { + size_t len = src_exif->fSoftware.Length(); + if (len >= sizeof(dst_exif->software_version)) + len = sizeof(dst_exif->software_version) - 1; + memcpy( dst_exif->software_version, src_exif->fSoftware.Get(), len ); + dst_exif->software_version[len] = '\0'; + } - assert(src_exif->fUserComment.Length() < sizeof(dst_exif->user_comment)); - memcpy( dst_exif->user_comment, src_exif->fUserComment.Get(), src_exif->fUserComment.Length() ); + { + size_t len = src_exif->fUserComment.Length(); + if (len >= sizeof(dst_exif->user_comment)) + len = sizeof(dst_exif->user_comment) - 1; + memcpy( dst_exif->user_comment, src_exif->fUserComment.Get(), len ); + dst_exif->user_comment[len] = '\0'; + } // GPS Info gpr_gps_info& dst_gps_info = dst_exif->gps_info; @@ -652,11 +708,12 @@ static bool read_dng(const gpr_allocator* allocator, convert_params->input_pitch = convert_params->input_width * 2; // Copy ColorMatrix1, ColorMatrix2 + if (negative->ProfileCount() > 0) { const dng_camera_profile &profile_info = negative->ProfileByIndex( 0 ); const dng_matrix &m1 = profile_info.ColorMatrix1(); const dng_matrix &m2 = profile_info.ColorMatrix2(); - + for (i = 0; i < 3; i++) { for (j = 0; j < 3; j++) @@ -665,17 +722,91 @@ static bool read_dng(const gpr_allocator* allocator, convert_params->profile_info.color_matrix_2[i][j] = m2[i][j]; } } - + convert_params->profile_info.compute_color_matrix = false; convert_params->profile_info.matrix_weighting = 1.0; - + memset( convert_params->profile_info.wb1, 0, sizeof(convert_params->profile_info.wb1) ); 
memset( convert_params->profile_info.wb2, 0, sizeof(convert_params->profile_info.wb2) ); - + memset( convert_params->profile_info.cam_to_srgb_1, 0, sizeof(convert_params->profile_info.cam_to_srgb_1) ); memset( convert_params->profile_info.cam_to_srgb_2, 0, sizeof(convert_params->profile_info.cam_to_srgb_2) ); + + // Copy CalibrationIlluminant values + convert_params->profile_info.illuminant1 = profile_info.CalibrationIlluminant1(); + convert_params->profile_info.illuminant2 = profile_info.CalibrationIlluminant2(); + + // Copy ForwardMatrix if present + const dng_matrix &fm1 = profile_info.ForwardMatrix1(); + const dng_matrix &fm2 = profile_info.ForwardMatrix2(); + if (!fm1.IsEmpty() && !fm2.IsEmpty()) + { + convert_params->profile_info.has_forward_matrix = true; + for (i = 0; i < 3; i++) + for (j = 0; j < 3; j++) + { + convert_params->profile_info.forward_matrix_1[i][j] = fm1[i][j]; + convert_params->profile_info.forward_matrix_2[i][j] = fm2[i][j]; + } + } + else + { + convert_params->profile_info.has_forward_matrix = false; + } } - + + // Copy BaselineExposure and AnalogBalance + { + convert_params->profile_info.baseline_exposure = negative->BaselineExposure(); + + convert_params->profile_info.analog_balance[0] = negative->AnalogBalance(0); + convert_params->profile_info.analog_balance[1] = negative->AnalogBalance(1); + convert_params->profile_info.analog_balance[2] = negative->AnalogBalance(2); + } + + // Copy ProfileHueSatMapData if present + if (negative->ProfileCount() > 0) + { + const dng_camera_profile &cam_profile = negative->ProfileByIndex( 0 ); + if (cam_profile.HasHueSatDeltas()) + { + const dng_hue_sat_map &hsm1 = cam_profile.HueSatDeltas1(); + uint32 hDiv, sDiv, vDiv; + hsm1.GetDivisions(hDiv, sDiv, vDiv); + uint64 count64 = (uint64)hDiv * sDiv * vDiv; + + convert_params->profile_info.hue_sat_map_dims[0] = hDiv; + convert_params->profile_info.hue_sat_map_dims[1] = sDiv; + convert_params->profile_info.hue_sat_map_dims[2] = vDiv; + 
convert_params->profile_info.hue_sat_map_encoding = cam_profile.HueSatMapEncoding(); + + /* Sanity check: reject absurdly large maps */ + if (count64 > 8000000) + { + count64 = 0; + convert_params->profile_info.hue_sat_map_dims[0] = 0; + convert_params->profile_info.hue_sat_map_dims[1] = 0; + convert_params->profile_info.hue_sat_map_dims[2] = 0; + } + uint32 count = (uint32)count64; + + // Copy data1 (3 floats per entry: hueShift, satScale, valScale) + size_t data_size = count * 3 * sizeof(float); + convert_params->profile_info.hue_sat_map_data1 = (float *)allocator->Alloc(data_size); + const dng_hue_sat_map::HSBModify *src1 = hsm1.GetConstDeltas(); + memcpy(convert_params->profile_info.hue_sat_map_data1, src1, data_size); + + // Copy data2 if present + const dng_hue_sat_map &hsm2 = cam_profile.HueSatDeltas2(); + if (hsm2.IsValid()) + { + convert_params->profile_info.hue_sat_map_data2 = (float *)allocator->Alloc(data_size); + const dng_hue_sat_map::HSBModify *src2 = hsm2.GetConstDeltas(); + memcpy(convert_params->profile_info.hue_sat_map_data2, src2, data_size); + } + } + } + // Set Exif Info convert_dng_exif_to_dng_exif_info( &convert_params->exif_info, negative->GetExif() ); @@ -694,20 +825,27 @@ static bool read_dng(const gpr_allocator* allocator, tuning_info.wb_gains.b_gain = 1 / camNeutral[2]; } - const dng_linearization_info& linearization_info = *negative->GetLinearizationInfo(); - + const dng_linearization_info *linearization_ptr = negative->GetLinearizationInfo(); + if (linearization_ptr != NULL) { + const dng_linearization_info& linearization_info = *linearization_ptr; + gpr_static_black_level& static_black_level = tuning_info.static_black_level; - - static_black_level.r_black = linearization_info.fBlackLevel[0][0][0]; - static_black_level.g_r_black = linearization_info.fBlackLevel[0][1][0]; - static_black_level.g_b_black = linearization_info.fBlackLevel[1][0][0]; - static_black_level.b_black = linearization_info.fBlackLevel[1][1][0]; - } - - { + + // 
Index with modulo repeat dims so that a 1x1 pattern + // (uniform black level) replicates to all four channels + uint32 rr = linearization_info.fBlackLevelRepeatRows; + uint32 rc = linearization_info.fBlackLevelRepeatCols; + if (rr < 1) rr = 1; + if (rc < 1) rc = 1; + + static_black_level.r_black = linearization_info.fBlackLevel[0 % rr][0 % rc][0]; + static_black_level.g_r_black = linearization_info.fBlackLevel[0 % rr][1 % rc][0]; + static_black_level.g_b_black = linearization_info.fBlackLevel[1 % rr][0 % rc][0]; + static_black_level.b_black = linearization_info.fBlackLevel[1 % rr][1 % rc][0]; + gpr_saturation_level& dgain_saturation_level = tuning_info.dgain_saturation_level; - + dgain_saturation_level.level_red = linearization_info.fWhiteLevel[0]; dgain_saturation_level.level_green_even = dgain_saturation_level.level_red; dgain_saturation_level.level_green_odd = dgain_saturation_level.level_red; @@ -721,44 +859,38 @@ static bool read_dng(const gpr_allocator* allocator, bool rggb_raw = (rawIFD.fCFAPattern[0][0] == 0) && (rawIFD.fCFAPattern[0][1] == 1) && (rawIFD.fCFAPattern[1][0] == 1) && (rawIFD.fCFAPattern[1][1] == 2); + // Determine bit depth from the maximum saturation level + uint32_t max_sat = dgain_saturation_level.level_red; + if (dgain_saturation_level.level_green_even > max_sat) max_sat = dgain_saturation_level.level_green_even; + if (dgain_saturation_level.level_green_odd > max_sat) max_sat = dgain_saturation_level.level_green_odd; + if (dgain_saturation_level.level_blue > max_sat) max_sat = dgain_saturation_level.level_blue; + if( rggb_raw ) { - if( dgain_saturation_level.level_red == 4095 && - dgain_saturation_level.level_green_even == 4095 && - dgain_saturation_level.level_green_odd == 4095 && - dgain_saturation_level.level_blue == 4095 ) + if( max_sat <= 4095 ) { tuning_info.pixel_format = PIXEL_FORMAT_RGGB_12; } - else if(dgain_saturation_level.level_red == 16383 && - dgain_saturation_level.level_green_even == 16383 && - 
dgain_saturation_level.level_green_odd == 16383 && - dgain_saturation_level.level_blue == 16383 ) + else if( max_sat <= 16383 ) { tuning_info.pixel_format = PIXEL_FORMAT_RGGB_14; } else { - assert(0); - return false; + tuning_info.pixel_format = PIXEL_FORMAT_RGGB_16; } } else { - if( dgain_saturation_level.level_red == 4095 && - dgain_saturation_level.level_green_even == 4095 && - dgain_saturation_level.level_green_odd == 4095 && - dgain_saturation_level.level_blue == 4095 ) + if( max_sat <= 4095 ) { tuning_info.pixel_format = PIXEL_FORMAT_GBRG_12; } else { - assert(0); - return false; + // No GBRG_14 enum; use GBRG_16 for 14-bit and 16-bit + tuning_info.pixel_format = PIXEL_FORMAT_GBRG_16; } - - } } @@ -766,14 +898,19 @@ static bool read_dng(const gpr_allocator* allocator, if ( negative->HasNoiseProfile() ) { dng_noise_profile noise_profile = negative->NoiseProfile(); - + dng_noise_function noise_function = noise_profile.NoiseFunction (0); tuning_info.noise_scale = noise_function.Scale(); tuning_info.noise_offset = noise_function.Offset(); - //LogPrint( "Noise profile s = %f, o = %f ", tuning_info.noise_scale, tuning_info.noise_offset ); } - + + // DefaultCropOrigin and DefaultCropSize + tuning_info.default_crop_origin_h = Round_uint32(negative->DefaultCropOriginH().As_real64()); + tuning_info.default_crop_origin_v = Round_uint32(negative->DefaultCropOriginV().As_real64()); + tuning_info.default_crop_size_h = Round_uint32(negative->DefaultCropSizeH().As_real64()); + tuning_info.default_crop_size_v = Round_uint32(negative->DefaultCropSizeV().As_real64()); + // GainMap dng_opcode_list &opcodelist2 = negative->OpcodeList2 (); uint32_t count = opcodelist2.Count (); @@ -1049,6 +1186,14 @@ static void write_dng(const gpr_allocator* allocator, case PIXEL_FORMAT_GBRG_12: vc5_decoder_params.pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_12; break; + + case PIXEL_FORMAT_RGGB_16: + vc5_decoder_params.pixel_format = VC5_DECODER_PIXEL_FORMAT_RGGB_16; + break; + + case 
PIXEL_FORMAT_GBRG_16: + vc5_decoder_params.pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_16; + break; default: assert(0); @@ -1105,6 +1250,7 @@ static void write_dng(const gpr_allocator* allocator, case PIXEL_FORMAT_RGGB_12: case PIXEL_FORMAT_RGGB_12P: case PIXEL_FORMAT_RGGB_14: + case PIXEL_FORMAT_RGGB_16: negative->SetQuadBlacks(static_black_level.r_black, static_black_level.g_r_black, static_black_level.g_b_black, @@ -1113,6 +1259,7 @@ static void write_dng(const gpr_allocator* allocator, break; case PIXEL_FORMAT_GBRG_12: case PIXEL_FORMAT_GBRG_12P: + case PIXEL_FORMAT_GBRG_16: negative->SetQuadBlacks(static_black_level.g_b_black, static_black_level.b_black, static_black_level.r_black, @@ -1205,13 +1352,20 @@ static void write_dng(const gpr_allocator* allocator, //GP!! NEED outputWidth, activeWidth, outputHeight, activeHeight here negative->SetDefaultScale(dng_urational(outputWidth, activeWidth), dng_urational(outputHeight, activeHeight)); - uint32 crop_size_val = 0; - dng_point crop_origin( 0, 0 ); - dng_point crop_size( activeHeight - 2 * crop_size_val, activeWidth - 2 * crop_size_val ); - + uint32 crop_origin_h = convert_params->tuning_info.default_crop_origin_h; + uint32 crop_origin_v = convert_params->tuning_info.default_crop_origin_v; + dng_point crop_origin( crop_origin_v, crop_origin_h ); + + /* Use stored crop size when available; fall back to symmetric formula */ + uint32 crop_size_h = convert_params->tuning_info.default_crop_size_h; + uint32 crop_size_v = convert_params->tuning_info.default_crop_size_v; + if (crop_size_h == 0) crop_size_h = activeWidth - 2 * crop_origin_h; + if (crop_size_v == 0) crop_size_v = activeHeight - 2 * crop_origin_v; + dng_point crop_size( crop_size_v, crop_size_h ); + negative->SetDefaultCropOrigin( crop_origin.h, crop_origin.v ); negative->SetDefaultCropSize( crop_size.h, crop_size.v ); - + negative->SetOriginalDefaultCropSize( dng_urational(crop_size.h, 1), dng_urational(crop_size.v, 1) ); { @@ -1240,11 +1394,11 @@ static 
void write_dng(const gpr_allocator* allocator, negative->SetColorKeys(colorCodes[0], colorCodes[1], colorCodes[2], colorCodes[3]); // Set Bayer Pattern - if( convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_12 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_12P || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_14 ) + if( convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_12 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_12P || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_14 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_16 ) { negative->SetBayerMosaic(1); } - else if( convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_12 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_12P ) + else if( convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_12 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_12P || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_16 ) { negative->SetBayerMosaic(3); } @@ -1254,14 +1408,16 @@ static void write_dng(const gpr_allocator* allocator, return; } - negative->SetBaselineExposure(0); + negative->SetBaselineExposure(profile_info->baseline_exposure); negative->SetBaselineNoise(1.0); negative->SetBaselineSharpness(1.0); - + negative->SetAntiAliasStrength(dng_urational(100, 100)); negative->SetLinearResponseLimit(1.0); negative->SetShadowScale( dng_urational(1, 1) ); - negative->SetAnalogBalance(dng_vector_3(1.0, 1.0, 1.0)); + negative->SetAnalogBalance(dng_vector_3(profile_info->analog_balance[0], + profile_info->analog_balance[1], + profile_info->analog_balance[2])); AutoPtr<dng_camera_profile> prof(new dng_camera_profile); prof->SetName( camera_make_and_model ); @@ -1314,7 +1470,46 @@ static void write_dng(const gpr_allocator* allocator, prof->SetCalibrationIlluminant1(profile_info->illuminant1); prof->SetCalibrationIlluminant2(profile_info->illuminant2); - + + if 
(profile_info->has_forward_matrix) + { + dng_matrix_3by3 fm1; + dng_matrix_3by3 fm2; + for (i = 0; i < 3; i++) + for (j = 0; j < 3; j++) + { + fm1[i][j] = profile_info->forward_matrix_1[i][j]; + fm2[i][j] = profile_info->forward_matrix_2[i][j]; + } + prof->SetForwardMatrix1(fm1); + prof->SetForwardMatrix2(fm2); + } + + if (profile_info->hue_sat_map_dims[0] > 0 && profile_info->hue_sat_map_data1 != NULL) + { + uint32 hDiv = profile_info->hue_sat_map_dims[0]; + uint32 sDiv = profile_info->hue_sat_map_dims[1]; + uint32 vDiv = profile_info->hue_sat_map_dims[2]; + uint32 count = hDiv * sDiv * vDiv; + + dng_hue_sat_map hsm1; + hsm1.SetDivisions(hDiv, sDiv, vDiv); + dng_hue_sat_map::HSBModify *dst1 = hsm1.GetDeltas(); + memcpy(dst1, profile_info->hue_sat_map_data1, count * 3 * sizeof(float)); + prof->SetHueSatDeltas1(hsm1); + + if (profile_info->hue_sat_map_data2 != NULL) + { + dng_hue_sat_map hsm2; + hsm2.SetDivisions(hDiv, sDiv, vDiv); + dng_hue_sat_map::HSBModify *dst2 = hsm2.GetDeltas(); + memcpy(dst2, profile_info->hue_sat_map_data2, count * 3 * sizeof(float)); + prof->SetHueSatDeltas2(hsm2); + } + + prof->SetHueSatMapEncoding(profile_info->hue_sat_map_encoding); + } + negative->AddProfile(prof); dng_exif* const exif = negative->GetExif(); @@ -1633,22 +1828,29 @@ bool gpr_convert_dng_to_gpr(const gpr_allocator* allocator, TIMESTAMP("[BEG]", 1) gpr_buffer_auto raw_buffer(allocator->Alloc, allocator->Free); - + dng_memory_stream inp_dng_stream( gDefaultDNGMemoryAllocator ); inp_dng_stream.Put( inp_dng_buffer->buffer, inp_dng_buffer->size ); inp_dng_stream.SetReadPosition(0); - - if( read_dng( allocator, &inp_dng_stream, &raw_buffer, NULL, NULL ) == false ) + + // Extract metadata from input DNG into a mutable copy of parameters + gpr_parameters params_with_meta; + gpr_parameters_construct_copy( parameters, ¶ms_with_meta, allocator->Alloc ); + + if( read_dng( allocator, &inp_dng_stream, &raw_buffer, NULL, ¶ms_with_meta ) == false ) { + gpr_parameters_destroy( 
¶ms_with_meta, allocator->Free ); assert(0); return false; } - + dng_memory_stream out_gpr_stream( gDefaultDNGMemoryAllocator ); - - write_dng( allocator, &out_gpr_stream, &raw_buffer, true, NULL, parameters ); - + + write_dng( allocator, &out_gpr_stream, &raw_buffer, true, NULL, ¶ms_with_meta ); + write_dngstream_to_buffer( &out_gpr_stream, out_gpr_buffer, allocator->Alloc, allocator->Free ); - + + gpr_parameters_destroy( ¶ms_with_meta, allocator->Free ); + TIMESTAMP("[END]", 1) return true; @@ -1749,22 +1951,29 @@ bool gpr_convert_gpr_to_dng(const gpr_allocator* allocator, gpr_buffer_auto raw_buffer(allocator->Alloc, allocator->Free); gpr_buffer_auto vc5_buffer(allocator->Alloc, allocator->Free); - + dng_memory_stream inp_gpr_stream( gDefaultDNGMemoryAllocator ); inp_gpr_stream.Put( inp_gpr_buffer->buffer, inp_gpr_buffer->size ); inp_gpr_stream.SetReadPosition(0); - - if( read_dng( allocator, &inp_gpr_stream, &raw_buffer, &vc5_buffer, NULL ) == false ) + + // Extract metadata from GPR (which is a DNG) into a mutable copy of parameters + gpr_parameters params_with_meta; + gpr_parameters_construct_copy( parameters, ¶ms_with_meta, allocator->Alloc ); + + if( read_dng( allocator, &inp_gpr_stream, &raw_buffer, &vc5_buffer, ¶ms_with_meta ) == false ) { + gpr_parameters_destroy( ¶ms_with_meta, allocator->Free ); assert(0); return false; } - + dng_memory_stream out_dng_stream( gDefaultDNGMemoryAllocator ); - - write_dng( allocator, &out_dng_stream, &raw_buffer, false, NULL, parameters ); - + + write_dng( allocator, &out_dng_stream, &raw_buffer, false, NULL, ¶ms_with_meta ); + write_dngstream_to_buffer( &out_dng_stream, out_dng_buffer, allocator->Alloc, allocator->Free ); - + + gpr_parameters_destroy( ¶ms_with_meta, allocator->Free ); + TIMESTAMP("[END]", 1) return true; @@ -1845,5 +2054,3 @@ bool gpr_check_vc5( const gpr_allocator* allocator, return is_vc5_format; } - - diff --git a/source/lib/gpr_sdk/private/gpr_profile_info.cpp 
b/source/lib/gpr_sdk/private/gpr_profile_info.cpp index 93ad542b..b69de16a 100755 --- a/source/lib/gpr_sdk/private/gpr_profile_info.cpp +++ b/source/lib/gpr_sdk/private/gpr_profile_info.cpp @@ -53,5 +53,21 @@ void gpr_profile_info_set_defaults(gpr_profile_info* x) memset( x->color_matrix_1, 0, sizeof(x->color_matrix_1) ); memset( x->color_matrix_2, 0, sizeof(x->color_matrix_2) ); + + memset( x->forward_matrix_1, 0, sizeof(x->forward_matrix_1) ); + memset( x->forward_matrix_2, 0, sizeof(x->forward_matrix_2) ); + x->has_forward_matrix = false; + + x->baseline_exposure = 0.0; + x->analog_balance[0] = 1.0; + x->analog_balance[1] = 1.0; + x->analog_balance[2] = 1.0; + + x->hue_sat_map_dims[0] = 0; + x->hue_sat_map_dims[1] = 0; + x->hue_sat_map_dims[2] = 0; + x->hue_sat_map_data1 = NULL; + x->hue_sat_map_data2 = NULL; + x->hue_sat_map_encoding = 0; } diff --git a/source/lib/gpr_sdk/private/gpr_read_image.cpp b/source/lib/gpr_sdk/private/gpr_read_image.cpp index 84c2fe97..10fa1981 100755 --- a/source/lib/gpr_sdk/private/gpr_read_image.cpp +++ b/source/lib/gpr_sdk/private/gpr_read_image.cpp @@ -53,9 +53,9 @@ static bool DecodeVC5(dng_image &image, gpr_buffer_auto& vc5_buffer, VC5_DECODER } raw_buffer.set( raw_image.buffer, raw_image.size, true ); - + dng_point size = image.Bounds().Size(); - + CopyBufferToRawImage( raw_buffer, size.h, image ); return true; @@ -92,15 +92,35 @@ void gpr_read_image::ReadTile (dng_host &host, { bool rggb_raw = (ifd.fCFAPattern[0][0] == 0) && (ifd.fCFAPattern[0][1] == 1) && (ifd.fCFAPattern[1][0] == 1) && (ifd.fCFAPattern[1][1] == 2); + // Use WhiteLevel to infer actual bit depth, NOT BitsPerSample + // (BitsPerSample is always 16 for uint16 DNG storage) + uint32 white_level = (uint32)ifd.fWhiteLevel[0]; + uint32 bit_depth; + if (white_level <= 4095) + bit_depth = 12; + else if (white_level <= 16383) + bit_depth = 14; + else + bit_depth = 16; VC5_DECODER_PIXEL_FORMAT pixel_format; - - if( rggb_raw ) + + if (rggb_raw) { - pixel_format = 
VC5_DECODER_PIXEL_FORMAT_RGGB_14; + if (bit_depth >= 16) + pixel_format = VC5_DECODER_PIXEL_FORMAT_RGGB_16; + else if (bit_depth > 12) + pixel_format = VC5_DECODER_PIXEL_FORMAT_RGGB_14; + else + pixel_format = VC5_DECODER_PIXEL_FORMAT_RGGB_12; } else { - pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_12; + if (bit_depth >= 16) + pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_16; + else if (bit_depth > 12) + pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_14; + else + pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_12; } if( DecodeVC5( image, *_vc5_buffer, pixel_format ) ) diff --git a/source/lib/gpr_sdk/private/gpr_tuning_info.cpp b/source/lib/gpr_sdk/private/gpr_tuning_info.cpp index 45ff9f78..4c250058 100755 --- a/source/lib/gpr_sdk/private/gpr_tuning_info.cpp +++ b/source/lib/gpr_sdk/private/gpr_tuning_info.cpp @@ -79,6 +79,12 @@ void gpr_tuning_info_set_defaults( gpr_tuning_info* x ) _gain_map_set_defaults( x ); x->pixel_format = PIXEL_FORMAT_RGGB_14; + + x->default_crop_origin_h = 0; + x->default_crop_origin_v = 0; + + x->default_crop_size_h = 0; + x->default_crop_size_v = 0; } diff --git a/source/lib/gpr_sdk/public/gpr.h b/source/lib/gpr_sdk/public/gpr.h index 666e7278..48a3a11f 100755 --- a/source/lib/gpr_sdk/public/gpr.h +++ b/source/lib/gpr_sdk/public/gpr.h @@ -53,7 +53,9 @@ unsigned int input_pitch; /* Pitch of input source in pixels (only applies to raw input) */ bool fast_encoding; - + + int quality; /* Quality setting (0-5), or -1 for auto */ + bool compute_md5sum; gpr_buffer gpmf_payload; /* GPMF payload of image file */ @@ -67,7 +69,7 @@ gpr_profile_info profile_info; /* Camera color profile info object */ gpr_tuning_info tuning_info; /* Camera tuning info object */ - + } gpr_parameters; void gpr_parameters_set_defaults(gpr_parameters* x); diff --git a/source/lib/gpr_sdk/public/gpr_profile_info.h b/source/lib/gpr_sdk/public/gpr_profile_info.h index 5fc37b39..c39b6f71 100755 --- a/source/lib/gpr_sdk/public/gpr_profile_info.h +++ 
b/source/lib/gpr_sdk/public/gpr_profile_info.h @@ -45,10 +45,23 @@ Matrix color_matrix_1; Matrix color_matrix_2; - + + Matrix forward_matrix_1; + Matrix forward_matrix_2; + bool has_forward_matrix; + uint16_t illuminant1; uint16_t illuminant2; - + + double baseline_exposure; + double analog_balance[3]; + + /* ProfileHueSatMapData — per-hue color correction LUT */ + uint32_t hue_sat_map_dims[3]; /* [hue, sat, val] divisions */ + float *hue_sat_map_data1; /* illuminant 1: dims[0]*dims[1]*dims[2]*3 floats */ + float *hue_sat_map_data2; /* illuminant 2 (may be NULL) */ + uint32_t hue_sat_map_encoding; + } gpr_profile_info; void gpr_profile_info_set_defaults(gpr_profile_info* x); diff --git a/source/lib/gpr_sdk/public/gpr_tuning_info.h b/source/lib/gpr_sdk/public/gpr_tuning_info.h index 677d68d6..5bcca813 100755 --- a/source/lib/gpr_sdk/public/gpr_tuning_info.h +++ b/source/lib/gpr_sdk/public/gpr_tuning_info.h @@ -51,6 +51,10 @@ PIXEL_FORMAT_GBRG_12 = 3, // GBRG 12bit pixels packed into 16bits PIXEL_FORMAT_GBRG_12P = 4, // GBRG 12bit pixels packed into 12bits + + PIXEL_FORMAT_RGGB_16 = 5, // RGGB 16bit pixels packed into 16bits + + PIXEL_FORMAT_GBRG_16 = 6, // GBRG 16bit pixels packed into 16bits } GPR_PIXEL_FORMAT; @@ -133,9 +137,15 @@ double warp_blue_coefficient; gpr_gain_map gain_map; - + GPR_PIXEL_FORMAT pixel_format; + uint32_t default_crop_origin_h; + uint32_t default_crop_origin_v; + + uint32_t default_crop_size_h; + uint32_t default_crop_size_v; + } gpr_tuning_info; int32_t gpr_tuning_info_get_dgain_saturation_level(const gpr_tuning_info* x, GPR_RAW_CHANNEL channel); diff --git a/source/lib/vc5_common/bitstream.c b/source/lib/vc5_common/bitstream.c index cef43121..9b94b9de 100755 --- a/source/lib/vc5_common/bitstream.c +++ b/source/lib/vc5_common/bitstream.c @@ -238,7 +238,12 @@ CODEC_ERROR PutBits(BITSTREAM *stream, BITWORD bits, BITCOUNT count) } // Write the bit buffer to the byte stream - PutWord(stream->stream, stream->buffer ); + CODEC_ERROR err = 
PutWord(stream->stream, stream->buffer ); + if (err != CODEC_ERROR_OKAY) + { + stream->error = BITSTREAM_ERROR_OVERFLOW; + return err; + } // Insert the remaining input bits into the bit buffer stream->buffer = (bits << (bit_word_count - count)); @@ -291,7 +296,12 @@ CODEC_ERROR PutBuffer(BITSTREAM *bitstream) assert(bitstream->count == bit_word_count); // Write the bit buffer to the byte stream - PutWord(bitstream->stream, bitstream->buffer ); + CODEC_ERROR err = PutWord(bitstream->stream, bitstream->buffer ); + if (err != CODEC_ERROR_OKAY) + { + bitstream->error = BITSTREAM_ERROR_OVERFLOW; + return err; + } // Empty the bit buffer bitstream->buffer = 0; @@ -375,7 +385,11 @@ CODEC_ERROR FlushBitstream(BITSTREAM *bitstream) if (bitstream->count > 0) { // Write the bit buffer to the output stream - PutBuffer(bitstream); + CODEC_ERROR err = PutBuffer(bitstream); + if (err != CODEC_ERROR_OKAY) + { + return err; + } } // Indicate that the bitstream buffer is empty @@ -402,7 +416,11 @@ CODEC_ERROR FlushBitstream(BITSTREAM *bitstream) size_t GetBitstreamPosition(BITSTREAM *bitstream) { if (bitstream->count == bit_word_count) { - PutBuffer(bitstream); + CODEC_ERROR err = PutBuffer(bitstream); + if (err != CODEC_ERROR_OKAY) + { + return 0; + } } // The bit buffer must be empty @@ -442,4 +460,3 @@ CODEC_ERROR RewindBitstream(BITSTREAM *bitstream) return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_common/companding.c b/source/lib/vc5_common/companding.c index 00b0fb56..fb881f8e 100755 --- a/source/lib/vc5_common/companding.c +++ b/source/lib/vc5_common/companding.c @@ -31,11 +31,52 @@ /*! @brief Maximum coefficient magnitude in the codebook - + @todo Need to calculate the maximum value from the codebook */ const int maximum_codebook_value = 255; +/*! + @brief Precomputed integer LUT for inverse cubic companding + + Maps magnitude m (0-255) to m + m^3 * 768 / (255^3). + Eliminates double-precision floating point from the decoder's inner loops. 
+*/ +static int32_t uncompand_lut[256]; +static int uncompand_lut_initialized = 0; + +void InitUncompandTable(void) +{ + if (uncompand_lut_initialized) + return; + + uncompand_lut[0] = 0; + for (int i = 1; i < 256; i++) + { + // Use the same formula as UncompandedValue but compute once + double cubic = (double)i; + cubic *= i; + cubic *= i; + cubic *= 768; + cubic /= 255.0 * 255.0 * 255.0; + uncompand_lut[i] = i + (int32_t)cubic; + } + + uncompand_lut_initialized = 1; +} + +int32_t UncompandedValueFast(int32_t value) +{ + int32_t magnitude = absolute(value); + + if (magnitude > 255) + magnitude = 255; + + magnitude = uncompand_lut[magnitude]; + + return ((value < 0) ? neg(magnitude) : magnitude); +} + /*! @brief Apply the default companding curve to the specified value diff --git a/source/lib/vc5_common/companding.h b/source/lib/vc5_common/companding.h index 5931e73c..d72a379c 100755 --- a/source/lib/vc5_common/companding.h +++ b/source/lib/vc5_common/companding.h @@ -34,7 +34,11 @@ extern "C" { // Invert the companding curve applied to a quantized coefficient magnitude (for debugging) int32_t UncompandedValue(int32_t value); - + + // Fast integer LUT-based uncompanding (no floating point) + void InitUncompandTable(void); + int32_t UncompandedValueFast(int32_t value); + PIXEL UncompandedPixel(PIXEL value); CODEC_ERROR InvertCompanding(PIXEL *image, DIMENSION width, DIMENSION height, DIMENSION pitch); diff --git a/source/lib/vc5_common/config.h b/source/lib/vc5_common/config.h index 5169510d..f07a1f60 100755 --- a/source/lib/vc5_common/config.h +++ b/source/lib/vc5_common/config.h @@ -45,7 +45,8 @@ //TODO: The maximum number of channels and wavelets depends on the profile //! Internal precision of the intermediate results after unpacking -static const int default_internal_precision = 12; +// Raised to support higher bit-depth inputs; PIXEL is widened to 32-bit. 
+static const int default_internal_precision = 16; //TODO: Change the global variable from internal_precision to encoded_precision? diff --git a/source/lib/vc5_common/image.c b/source/lib/vc5_common/image.c index 0af14fe1..92766452 100755 --- a/source/lib/vc5_common/image.c +++ b/source/lib/vc5_common/image.c @@ -126,10 +126,13 @@ DIMENSION ImagePitch(DIMENSION width, PIXEL_FORMAT format) switch (format) { + case PIXEL_FORMAT_RAW_RGGB_12: case PIXEL_FORMAT_RAW_RGGB_14: + case PIXEL_FORMAT_RAW_RGGB_16: case PIXEL_FORMAT_RAW_GBRG_12: case PIXEL_FORMAT_RAW_GBRG_12P: - case PIXEL_FORMAT_RAW_RGGB_16: + case PIXEL_FORMAT_RAW_GBRG_14: + case PIXEL_FORMAT_RAW_GBRG_16: // Half the width of the image times 2 samples times 2 bytes per sample pitch = width * sizeof(uint16_t); break; diff --git a/source/lib/vc5_common/image.h b/source/lib/vc5_common/image.h index 47397272..4f7717f4 100755 --- a/source/lib/vc5_common/image.h +++ b/source/lib/vc5_common/image.h @@ -21,8 +21,8 @@ #ifndef IMAGE_H #define IMAGE_H -//! Data type for the values in a component array -typedef uint16_t COMPONENT_VALUE; +//! Data type for the values in a component array (widened for 16-bit pipeline) +typedef int32_t COMPONENT_VALUE; /*! 
@brief Data structure for an image input to the unpacking process diff --git a/source/lib/vc5_common/logcurve.c b/source/lib/vc5_common/logcurve.c index 45aecbf1..c1760b27 100755 --- a/source/lib/vc5_common/logcurve.c +++ b/source/lib/vc5_common/logcurve.c @@ -20,39 +20,65 @@ #include "common.h" -uint16_t EncoderLogCurve[LOG_CURVE_TABLE_LENGTH]; +int vc5_logcurve_bypass(void) +{ + return 0; +} -uint16_t DecoderLogCurve[LOG_CURVE_TABLE_LENGTH]; +uint16_t EncoderLogCurve12[LOG_CURVE_TABLE_LENGTH_12]; +uint16_t EncoderLogCurve14[LOG_CURVE_TABLE_LENGTH_14]; +uint16_t EncoderLogCurve16[LOG_CURVE_TABLE_LENGTH_16]; -void SetupDecoderLogCurve(void) +uint16_t DecoderLogCurve12[LOG_CURVE_TABLE_LENGTH_12]; +uint16_t DecoderLogCurve14[LOG_CURVE_TABLE_LENGTH_14]; +uint16_t DecoderLogCurve16[LOG_CURVE_TABLE_LENGTH_16]; + +static void SetupDecoderCurve(uint16_t *table, int bits) { - int i; - const int log_table_size = sizeof(DecoderLogCurve) / sizeof(DecoderLogCurve[0]); - - const int max_16_bit = (1 << 16) - 1; - - for( i = 0; i < log_table_size; i++ ) + const int max_input = (1 << bits) - 1; + const double denom = log10(113.0); + const int max_output = (1 << 16) - 1; + + // Antilog: the mathematical inverse of the encoder log curve. 
+ // Encoder: y = max_out * log10(x/max_in * 112 + 1) / log10(113) + // Decoder: x = max_out * (10^(y/max_in * log10(113)) - 1) / 112 + for (int i = 0; i <= max_input; ++i) { - //input 12-bit, output 16-bit - float input = i; - float output = max_16_bit * (pow(113.0, input/4095.0) - 1.0)/112.0; - - DecoderLogCurve[i] = minimum( (int)output, max_16_bit ); + const double norm = (double)i / max_input; + const double exp_val = pow(10.0, norm * denom); + const double output = max_output * (exp_val - 1.0) / 112.0; + int out_int = (int)(output + 0.5); + if (out_int < 0) out_int = 0; + if (out_int > max_output) out_int = max_output; + table[i] = (uint16_t)out_int; } } -void SetupEncoderLogCurve(void) +void SetupDecoderLogCurve(void) { - int i; - const int max_input_val = LOG_CURVE_TABLE_LENGTH - 1; - - for( i = 0; i < LOG_CURVE_TABLE_LENGTH; i++ ) - { - //input 16-bit, output 12-bit - float input = maximum( 0, i ); - float output = 4095.0 * (log10(input/max_input_val * 112.0 + 1.0)/log10(113)); + SetupDecoderCurve(DecoderLogCurve12, 12); + SetupDecoderCurve(DecoderLogCurve14, 14); + SetupDecoderCurve(DecoderLogCurve16, 16); +} + +static void SetupEncoderLogCurveBits(uint16_t* table, int input_bits, int output_bits) +{ + const int max_input_val = (1 << input_bits) - 1; + const int max_output_val = (1 << output_bits) - 1; + const double denom = log10(113.0); - EncoderLogCurve[i] = ( (uint16_t)output ); + for(int i = 0; i <= max_input_val; i++ ) + { + const double input = maximum(0, i); + const double norm = (input / max_input_val * 112.0) + 1.0; + const double output = max_output_val * (log10(norm) / denom); + table[i] = (uint16_t)output; } } +void SetupEncoderLogCurve(void) +{ + SetupEncoderLogCurveBits(EncoderLogCurve12, 12, 12); + SetupEncoderLogCurveBits(EncoderLogCurve14, 14, 14); + SetupEncoderLogCurveBits(EncoderLogCurve16, 16, 16); +} diff --git a/source/lib/vc5_common/logcurve.h b/source/lib/vc5_common/logcurve.h index 3e6a2692..ddeb314a 100755 --- 
a/source/lib/vc5_common/logcurve.h +++ b/source/lib/vc5_common/logcurve.h @@ -21,19 +21,60 @@ #ifndef LOGCURVE_H #define LOGCURVE_H -#define LOG_CURVE_TABLE_LENGTH (1 << 12) +#define LOG_CURVE_TABLE_LENGTH (1 << 12) +#define LOG_CURVE_TABLE_LENGTH_12 LOG_CURVE_TABLE_LENGTH +#define LOG_CURVE_TABLE_LENGTH_14 (1 << 14) +#define LOG_CURVE_TABLE_LENGTH_16 (1 << 16) #ifdef __cplusplus extern "C" { #endif + +int vc5_logcurve_bypass(void); - extern uint16_t EncoderLogCurve[]; - - extern uint16_t DecoderLogCurve[]; +#define EncoderLogCurve EncoderLogCurve12 +extern uint16_t EncoderLogCurve12[]; +extern uint16_t EncoderLogCurve14[]; +extern uint16_t EncoderLogCurve16[]; + +extern uint16_t DecoderLogCurve12[]; +extern uint16_t DecoderLogCurve14[]; +extern uint16_t DecoderLogCurve16[]; + +static INLINE uint16_t DecodeLogValue(uint32_t value, int bits) +{ + if (vc5_logcurve_bypass()) + { + const uint32_t max_bits = (bits >= 16) ? 16 : (bits <= 0 ? 1 : bits); + const uint32_t max_val = (1u << max_bits) - 1; + if (value > max_val) value = max_val; + return (uint16_t)value; + } + + const uint32_t max12 = (1u << 12) - 1; + const uint32_t max14 = (1u << 14) - 1; + const uint32_t max16 = (1u << 16) - 1; + + if (bits <= 12) + { + if (value > max12) value = max12; + return DecoderLogCurve12[value]; + } + else if (bits <= 14) + { + if (value > max14) value = max14; + return DecoderLogCurve14[value]; + } + else + { + if (value > max16) value = max16; + return DecoderLogCurve16[value]; + } +} void SetupDecoderLogCurve(void); - void SetupEncoderLogCurve(void); +void SetupEncoderLogCurve(void); #ifdef __cplusplus } diff --git a/source/lib/vc5_common/pixel.h b/source/lib/vc5_common/pixel.h index 7c3b9e06..15dca5fa 100755 --- a/source/lib/vc5_common/pixel.h +++ b/source/lib/vc5_common/pixel.h @@ -23,23 +23,23 @@ #ifndef PIXEL_H #define PIXEL_H -//! Data type for pixels -typedef int16_t PIXEL; +//! Data type for pixels (widened for 16-bit pipeline headroom) +typedef int32_t PIXEL; //! 
Minimum and maximum pixel values enum { - PIXEL_MIN = INT16_MIN, - PIXEL_MAX = INT16_MAX, + PIXEL_MIN = INT32_MIN, + PIXEL_MAX = INT32_MAX, }; //! Alternative definition for wavelet coefficients -typedef int16_t COEFFICIENT; +typedef int32_t COEFFICIENT; //! Minimum and maximum coefficient values enum { - COEFFICIENT_MIN = INT16_MIN, - COEFFICIENT_MAX = INT16_MAX, + COEFFICIENT_MIN = INT32_MIN, + COEFFICIENT_MAX = INT32_MAX, }; /*! @@ -50,12 +50,14 @@ enum @todo Need to add support for more pixel formats to the reference decoder */ -typedef enum + typedef enum { PIXEL_FORMAT_UNKNOWN = 0, PIXEL_FORMAT_RAW_RGGB_16 = 104, + PIXEL_FORMAT_RAW_GBRG_16 = 105, + PIXEL_FORMAT_RAW_RGGB_12 = 106, PIXEL_FORMAT_RAW_RGGB_12P = 107, PIXEL_FORMAT_RAW_RGGB_14 = 108, diff --git a/source/lib/vc5_common/stream.c b/source/lib/vc5_common/stream.c index 59ecc7da..e8bb5a82 100755 --- a/source/lib/vc5_common/stream.c +++ b/source/lib/vc5_common/stream.c @@ -187,11 +187,14 @@ CODEC_ERROR PutWord(STREAM *stream, BITWORD word) break; case STREAM_TYPE_MEMORY: - { - uint8_t* buffer = (uint8_t *)stream->location.memory.buffer + stream->byte_count; - - memcpy(buffer, &word, sizeof(word)); - } + if (stream->byte_count + sizeof(word) > stream->location.memory.size) + { + return CODEC_ERROR_OUTOFMEMORY; + } + { + uint8_t* buffer = (uint8_t *)stream->location.memory.buffer + stream->byte_count; + memcpy(buffer, &word, sizeof(word)); + } break; default: @@ -221,6 +224,10 @@ CODEC_ERROR PutByte(STREAM *stream, uint8_t byte) break; case STREAM_TYPE_MEMORY: + if (stream->byte_count + 1 > stream->location.memory.size) + { + return CODEC_ERROR_OUTOFMEMORY; + } ((uint8_t *)stream->location.memory.buffer)[stream->byte_count] = byte; break; @@ -521,4 +528,3 @@ CODEC_ERROR PutBlockMemory(STREAM *stream, void *buffer, size_t size, size_t off return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_common/wavelet.c b/source/lib/vc5_common/wavelet.c index ec7e31a5..1861d862 100755 --- 
a/source/lib/vc5_common/wavelet.c +++ b/source/lib/vc5_common/wavelet.c @@ -274,6 +274,19 @@ CODEC_ERROR SetTransformPrescale(TRANSFORM *transform, int precision) PRESCALE spatial_prescale[] = {0, 2, 2, 0, 0, 0, 0, 0}; memcpy(transform->prescale, spatial_prescale, sizeof(transform->prescale)); } + else if (precision == 14) + { + // For 14-bit, prescale [0, 2, 2] keeps lowpass within 16-bit range + PRESCALE spatial_prescale[] = {0, 2, 2, 0, 0, 0, 0, 0}; + memcpy(transform->prescale, spatial_prescale, sizeof(transform->prescale)); + } + else if (precision >= 15) + { + // For 16-bit inputs, need prescale=2 at level 0 to prevent lowpass overflow + // (without it, level 0 lowpass = 4x input which exceeds 16-bit storage) + PRESCALE spatial_prescale[] = {2, 3, 3, 0, 0, 0, 0, 0}; + memcpy(transform->prescale, spatial_prescale, sizeof(transform->prescale)); + } else { //TODO: Need to handle other precisions @@ -459,7 +472,8 @@ void WaveletToRGB( gpr_allocator allocator, PIXEL* GS_src, PIXEL* RG_src, PIXEL* dst_image->buffer = allocator.Alloc( size ); const int32_t midpoint = (1 << (input_precision_bits - 1)); - const int32_t shift = input_precision_bits - 12; + const int table_bits = (input_precision_bits <= 12) ? 12 : (input_precision_bits <= 14 ? 14 : 16); + const int shift = (input_precision_bits > table_bits) ? 
(input_precision_bits - table_bits) : 0; unsigned char* RGB_dst_8bits = dst_image->buffer; unsigned short* RGB_dst_16bits = dst_image->buffer; @@ -474,10 +488,9 @@ void WaveletToRGB( gpr_allocator allocator, PIXEL* GS_src, PIXEL* RG_src, PIXEL* int32_t R = 2 * ( RG_src[(src_width - x - 1) + y * src_pitch] - midpoint) + G; int32_t B = 2 * ( BG_src[(src_width - x - 1) + y * src_pitch] - midpoint) + G; - // R,G,B are in 16-bit range since DecoderLogCurve outputs in 16 bits (although it's input is 12 bits) - R = DecoderLogCurve[ clamp_uint( (R >> shift), 12) ]; - G = DecoderLogCurve[ clamp_uint( (G >> shift), 12) ]; - B = DecoderLogCurve[ clamp_uint( (B >> shift), 12) ]; + R = DecodeLogValue((R >> shift), table_bits); + G = DecodeLogValue((G >> shift), table_bits); + B = DecodeLogValue((B >> shift), table_bits); if( output_precision_bits == 8 ) { diff --git a/source/lib/vc5_decoder/CMakeLists.txt b/source/lib/vc5_decoder/CMakeLists.txt index d78522d0..77e380d8 100644 --- a/source/lib/vc5_decoder/CMakeLists.txt +++ b/source/lib/vc5_decoder/CMakeLists.txt @@ -16,7 +16,8 @@ include_directories( "../common/public" ) # library add_library( ${LIB_NAME} STATIC ${SRC_FILES} ${INC_FILES} ) -target_link_libraries( ${LIB_NAME} ) +find_package( Threads REQUIRED ) +target_link_libraries( ${LIB_NAME} Threads::Threads ) # set the folder where to place the projects set_target_properties( ${LIB_NAME} PROPERTIES FOLDER lib ) diff --git a/source/lib/vc5_decoder/decoder.c b/source/lib/vc5_decoder/decoder.c index 73aa3b80..9609744d 100755 --- a/source/lib/vc5_decoder/decoder.c +++ b/source/lib/vc5_decoder/decoder.c @@ -19,6 +19,8 @@ */ #include "headers.h" +#include +#include /*! 
@brief Align the bitstream to a byte boundary @@ -212,32 +214,40 @@ CODEC_ERROR DecodeImage(STREAM *stream, IMAGE *packed_image, RGB_IMAGE *rgb_imag break; case GPR_RGB_RESOLUTION_HALF: + { + int bits = unpacked_image.component_array_list[0].bits_per_component; WaveletToRGB(parameters->allocator, (PIXEL*)unpacked_image.component_array_list[0].data, (PIXEL*)unpacked_image.component_array_list[1].data, (PIXEL*)unpacked_image.component_array_list[2].data, - unpacked_image.component_array_list[2].width, unpacked_image.component_array_list[2].height, unpacked_image.component_array_list[2].pitch / 2, - rgb_image, 12, parameters->rgb_bits, &parameters->rgb_gain ); + unpacked_image.component_array_list[2].width, unpacked_image.component_array_list[2].height, unpacked_image.component_array_list[2].pitch / sizeof(COMPONENT_VALUE), + rgb_image, bits, parameters->rgb_bits, &parameters->rgb_gain ); break; + } case GPR_RGB_RESOLUTION_QUARTER: - + { + int bits = decoder.channel[0].bits_per_component; WaveletToRGB(parameters->allocator, decoder.transform[0].wavelet[0]->data[0], decoder.transform[1].wavelet[0]->data[0], decoder.transform[2].wavelet[0]->data[0], decoder.transform[2].wavelet[0]->width, decoder.transform[2].wavelet[0]->height, decoder.transform[2].wavelet[0]->width, - rgb_image, 14, parameters->rgb_bits, &parameters->rgb_gain ); + rgb_image, bits, parameters->rgb_bits, &parameters->rgb_gain ); break; + } case GPR_RGB_RESOLUTION_EIGHTH: - + { + int bits = decoder.channel[0].bits_per_component; WaveletToRGB(parameters->allocator, decoder.transform[0].wavelet[1]->data[0], decoder.transform[1].wavelet[1]->data[0], decoder.transform[2].wavelet[1]->data[0], decoder.transform[2].wavelet[1]->width, decoder.transform[2].wavelet[1]->height, decoder.transform[2].wavelet[1]->width, - rgb_image, 14, parameters->rgb_bits, &parameters->rgb_gain ); - + rgb_image, bits, parameters->rgb_bits, &parameters->rgb_gain ); break; + } case GPR_RGB_RESOLUTION_SIXTEENTH: - + { + int bits = 
decoder.channel[0].bits_per_component; WaveletToRGB(parameters->allocator, decoder.transform[0].wavelet[2]->data[0], decoder.transform[1].wavelet[2]->data[0], decoder.transform[2].wavelet[2]->data[0], decoder.transform[2].wavelet[2]->width, decoder.transform[2].wavelet[2]->height, decoder.transform[2].wavelet[2]->width, - rgb_image, 14, parameters->rgb_bits, &parameters->rgb_gain ); + rgb_image, bits, parameters->rgb_bits, &parameters->rgb_gain ); break; + } default: return CODEC_ERROR_UNSUPPORTED_FORMAT; @@ -262,7 +272,10 @@ CODEC_ERROR DecodeImage(STREAM *stream, IMAGE *packed_image, RGB_IMAGE *rgb_imag CODEC_ERROR PrepareDecoder(DECODER *decoder, const DECODER_PARAMETERS *parameters) { CODEC_ERROR error = CODEC_ERROR_OKAY; - + + // Initialize the uncompanding LUT (fast integer replacement for double-precision cubic) + InitUncompandTable(); + // Initialize the decoder data structure error = InitDecoder(decoder, &parameters->allocator); if (error != CODEC_ERROR_OKAY) { @@ -554,7 +567,12 @@ CODEC_ERROR SetImageChannelParameters(DECODER *decoder, int channel_number) } //TODO: Is the default bits per component the correct value to use? - decoder->channel[channel_number].bits_per_component = codec->bits_per_component; + PRECISION default_bits = codec->max_bits_per_component ? 
codec->max_bits_per_component : codec->bits_per_component; + if (default_bits == 0) + { + default_bits = codec->bits_per_component; + } + decoder->channel[channel_number].bits_per_component = default_bits; decoder->channel[channel_number].initialized = true; return CODEC_ERROR_OKAY; @@ -874,6 +892,7 @@ CODEC_ERROR ImageRepackingProcess(const UNPACKED_IMAGE *unpacked_image, break; case PIXEL_FORMAT_RAW_RGGB_16: + case PIXEL_FORMAT_RAW_GBRG_16: return PackComponentsToRAW(unpacked_image, output_buffer, output_pitch, output_width, output_height, enabled_parts, 16, output_format ); break; @@ -1256,6 +1275,7 @@ CODEC_ERROR UpdateCodecState(DECODER *decoder, BITSTREAM *stream, TAGVALUE segme if (IsPartEnabled(enabled_parts, VC5_PART_IMAGE_FORMATS)) { codec->max_bits_per_component = (PRECISION)value; + codec->bits_per_component = (PRECISION)value; codec->header = true; } else @@ -1519,7 +1539,7 @@ CODEC_ERROR UpdateCodecState(DECODER *decoder, BITSTREAM *stream, TAGVALUE segme { // Remember the number of bits per component in this and higher numbered channel decoder->channel[codec->channel_number].bits_per_component = codec->bits_per_component; - + // Found the first codeblock in the channel decoder->channel[channel_number].found_first_codeblock = true; } @@ -1908,26 +1928,46 @@ CODEC_ERROR DecodeLowpassBand(DECODER *decoder, BITSTREAM *stream, WAVELET *wave lowpass_precision = codec->lowpass_precision; // Decode each row in the lowpass image - for (row = 0; row < lowpass_band_height; row++) + if (lowpass_precision == 16) { - for (column = 0; column < lowpass_band_width; column++) + // Fast path: extract 16-bit coefficients directly from the 32-bit buffer + for (row = 0; row < lowpass_band_height; row++) { - COEFFICIENT lowpass_value = (COEFFICIENT)GetBits(stream, lowpass_precision); - //assert(0 <= lowpass_value && lowpass_value <= COEFFICIENT_MAX); - - //if (lowpass_value > COEFFICIENT_MAX) { - // lowpass_value = COEFFICIENT_MAX; - //} - - lowpass_band_ptr[column] = 
lowpass_value; + for (column = 0; column < lowpass_band_width; column++) + { + if (stream->count < 16) + { + if (stream->count == 0) { + CODEC_ERROR gb_error = GetBuffer(stream); + if (gb_error != CODEC_ERROR_OKAY) return gb_error; + } else { + lowpass_band_ptr[column] = (COEFFICIENT)GetBits(stream, 16); + continue; + } + } + lowpass_band_ptr[column] = (COEFFICIENT)(stream->buffer >> 16); + stream->buffer <<= 16; + stream->count -= 16; + } + lowpass_band_ptr += lowpass_band_pitch; + } + } + else + { + // Generic path for other precisions + for (row = 0; row < lowpass_band_height; row++) + { + for (column = 0; column < lowpass_band_width; column++) + { + COEFFICIENT lowpass_value = (COEFFICIENT)GetBits(stream, lowpass_precision); + lowpass_band_ptr[column] = lowpass_value; + } + lowpass_band_ptr += lowpass_band_pitch; } - - // Advance to the next row in the lowpass image - lowpass_band_ptr += lowpass_band_pitch; } // Align the bitstream to the next tag value pair AlignBitsSegment(stream); - + // Return indication of lowpass decoding success return error; } @@ -2012,39 +2052,70 @@ CODEC_ERROR DecodeBandRuns(BITSTREAM *stream, CODEBOOK *codebook, PIXEL *data, while (data_count > 0) { - // Get the next run length and value - error = GetRun(stream, codebook, &run); + // Get the next run length and value (fast path uses prefix lookup table) + error = GetRunFast(stream, codebook, &run); if (error != CODEC_ERROR_OKAY) { return error; } - + // Check that the run does not extend past the end of the band assert(run.count <= data_count); - - // Copy the value into the specified number of pixels in the band - while (run.count > 0) + + // Check if the entire run fits in the current row (common case) { - // Reached the end of the column? 
- if (column == width) + int remaining_in_row = width - column; + + if ((int)run.count <= remaining_in_row) + { + // FAST: run stays within current row, no boundary checks needed + if (run.value == 0 && run.count > 1) + { + memset(&data[index], 0, run.count * sizeof(PIXEL)); + } + else + { + // count==1 for non-zero values and single zeros + data[index] = (PIXEL)run.value; + } + index += run.count; + column += run.count; + data_count -= run.count; + run.count = 0; + } + else { - // Need to pad the end of the row? - if (row_padding > 0) + // SLOW: run crosses row boundary + while (run.count > 0) { - int count; - for (count = 0; (size_t)count < row_padding; count++) { - data[index++] = 0; + if (column == width) + { + if (row_padding > 0) + { + memset(&data[index], 0, row_padding * sizeof(PIXEL)); + index += (int)row_padding; + } + row++; + column = 0; } + + remaining_in_row = width - column; + int n = ((int)run.count < remaining_in_row) ? (int)run.count : remaining_in_row; + + if (run.value == 0) + { + memset(&data[index], 0, n * sizeof(PIXEL)); + } + else + { + for (int i = 0; i < n; i++) + data[index + i] = (PIXEL)run.value; + } + index += n; + column += n; + run.count -= n; + data_count -= n; } - - // Advance to the next row - row++; - column = 0; } - - data[index++] = (PIXEL)run.value; - column++; - run.count--; - data_count--; } } @@ -2148,6 +2219,31 @@ bool IsDecodingComplete(DECODER *decoder) return true; } +/*! 
Thread argument for parallel inverse transform */ +typedef struct { + gpr_allocator *allocator; + WAVELET *wavelet; + COMPONENT_VALUE *output_data; + DIMENSION output_width; + DIMENSION output_height; + size_t output_pitch; + PRESCALE prescale; + CODEC_ERROR error; +} INVERSE_THREAD_ARG; + +static void *InverseTransformThread(void *arg) +{ + INVERSE_THREAD_ARG *a = (INVERSE_THREAD_ARG *)arg; + a->error = TransformInverseSpatialQuantArray(a->allocator, + a->wavelet, + a->output_data, + a->output_width, + a->output_height, + a->output_pitch, + a->prescale); + return NULL; +} + /*! @brief Perform the final wavelet transform in each channel to compute the component arrays Each channel is decoded and the lowpass and highpass bands are used to reconstruct the @@ -2183,42 +2279,68 @@ CODEC_ERROR ReconstructUnpackedImage(DECODER *decoder, UNPACKED_IMAGE *image) image->component_count = 0; memset(image->component_array_list, 0, size); + // Phase 1: Allocate all component arrays (sequential, uses allocator) for (channel_number = 0; channel_number < channel_count; channel_number++) { - // Get the dimensions of this channel DIMENSION channel_width = decoder->channel[channel_number].width; DIMENSION channel_height = decoder->channel[channel_number].height; PRECISION bits_per_component = decoder->channel[channel_number].bits_per_component; - - // Amount of prescaling applied to the component array values before encoding - PRESCALE prescale = decoder->codec.prescale_table[0]; - - // Allocate the component array for this channel + error = AllocateComponentArray(allocator, &image->component_array_list[channel_number], channel_width, channel_height, bits_per_component); - if (error != CODEC_ERROR_OKAY) { return error; } - - error = TransformInverseSpatialQuantArray(allocator, - decoder->transform[channel_number].wavelet[0], - image->component_array_list[channel_number].data, - channel_width, - channel_height, - image->component_array_list[channel_number].pitch, - prescale); - if 
(error != CODEC_ERROR_OKAY) { - return error; + } + + // Phase 2: Run inverse transforms in parallel (one thread per channel) + { + PRESCALE prescale = decoder->codec.prescale_table[0]; + INVERSE_THREAD_ARG thread_args[MAX_CHANNEL_COUNT]; + pthread_t threads[MAX_CHANNEL_COUNT]; + int thread_created[MAX_CHANNEL_COUNT]; + + for (channel_number = 0; channel_number < channel_count; channel_number++) + { + thread_args[channel_number].allocator = allocator; + thread_args[channel_number].wavelet = decoder->transform[channel_number].wavelet[0]; + thread_args[channel_number].output_data = image->component_array_list[channel_number].data; + thread_args[channel_number].output_width = decoder->channel[channel_number].width; + thread_args[channel_number].output_height = decoder->channel[channel_number].height; + thread_args[channel_number].output_pitch = image->component_array_list[channel_number].pitch; + thread_args[channel_number].prescale = prescale; + thread_args[channel_number].error = CODEC_ERROR_OKAY; + + thread_created[channel_number] = (pthread_create(&threads[channel_number], NULL, + InverseTransformThread, + &thread_args[channel_number]) == 0); + if (!thread_created[channel_number]) + { + // Thread creation failed: run inline as fallback + InverseTransformThread(&thread_args[channel_number]); + } } + + // Wait for all threads to complete + for (channel_number = 0; channel_number < channel_count; channel_number++) + { + if (thread_created[channel_number]) + pthread_join(threads[channel_number], NULL); + + if (thread_args[channel_number].error != CODEC_ERROR_OKAY) + error = thread_args[channel_number].error; + } + + if (error != CODEC_ERROR_OKAY) + return error; } // One component array is output by the decoding process per channel in the bitstream image->component_count = channel_count; - + TIMESTAMP("[END]", 2) return error; @@ -2306,5 +2428,3 @@ CODEC_ERROR ReconstructSampleFrame(DECODER *decoder, IMAGE image_array[], int fr return ComposeFields(image_array, 
frame_count, output_image); } #endif - - diff --git a/source/lib/vc5_decoder/dequantize.c index 48cfb647..74733f3c 100755 --- a/source/lib/vc5_decoder/dequantize.c +++ b/source/lib/vc5_decoder/dequantize.c @@ -20,6 +20,10 @@ #include "headers.h" +#if ENABLED(NEON) +#include <arm_neon.h> +#endif + // Not using midpoint correction in dequantization static const int midpoint = 0; @@ -29,6 +33,73 @@ static const int midpoint = 0; The companding curve is inverted and the value is multiplied by the quantization value that was used by the encoder to compress the band. */ + +#if ENABLED(NEON) + +CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXEL *output) +{ + int column = 0; + const int width_m4 = (width / 4) * 4; + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t quant_vec = vdupq_n_s32(quantization); + + // Process 4 pixels at a time + for (; column < width_m4; column += 4) + { + int32x4_t values = vld1q_s32(&input[column]); + + // Scalar LUT lookups for uncompanding (NEON can't do 32-bit table lookups) + int32_t v0 = vgetq_lane_s32(values, 0); + int32_t v1 = vgetq_lane_s32(values, 1); + int32_t v2 = vgetq_lane_s32(values, 2); + int32_t v3 = vgetq_lane_s32(values, 3); + + int32_t u0 = UncompandedValueFast(v0); + int32_t u1 = UncompandedValueFast(v1); + int32_t u2 = UncompandedValueFast(v2); + int32_t u3 = UncompandedValueFast(v3); + + // Load uncompanded values into vector + int32_t uncomp[4] = { u0, u1, u2, u3 }; + int32x4_t uncompanded = vld1q_s32(uncomp); + + // Get absolute values and compute signs + int32x4_t abs_vals = vabsq_s32(uncompanded); + uint32x4_t neg_mask = vcltq_s32(uncompanded, zero); + + // Multiply by quantization + int32x4_t dequant = vmulq_s32(abs_vals, quant_vec); + + // Restore signs + int32x4_t negated = vnegq_s32(dequant); + int32x4_t result = vbslq_s32(neg_mask, negated, dequant); + + vst1q_s32(&output[column], result); + } + + // Scalar cleanup for remaining pixels + for (; column 
< width; column++) + { + int32_t value = input[column]; + value = UncompandedValueFast(value); + + if (value > 0) + value = (quantization * value) + midpoint; + else if (value < 0) + { + value = neg(value); + value = (quantization * value) + midpoint; + value = neg(value); + } + + output[column] = ClampPixel(value); + } + + return CODEC_ERROR_OKAY; +} + +#else + CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXEL *output) { int column; @@ -38,8 +109,8 @@ CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXE { int32_t value = input[column]; - // Invert the companding curve (if any) - value = UncompandedValue(value); + // Invert the companding curve using fast LUT + value = UncompandedValueFast(value); // Dequantize the absolute value if (value > 0) @@ -60,6 +131,8 @@ CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXE return CODEC_ERROR_OKAY; } +#endif + /*! @brief This function dequantizes the pixel value @@ -69,8 +142,8 @@ CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXE */ PIXEL DequantizedValue(int32_t value, int quantization) { - // Invert the companding curve (if any) - value = UncompandedValue(value); + // Invert the companding curve using fast LUT + value = UncompandedValueFast(value); // Dequantize the absolute value if (value > 0) diff --git a/source/lib/vc5_decoder/inverse.c b/source/lib/vc5_decoder/inverse.c index cacc7be8..238d8e58 100755 --- a/source/lib/vc5_decoder/inverse.c +++ b/source/lib/vc5_decoder/inverse.c @@ -18,351 +18,271 @@ #include "headers.h" +#if ENABLED(NEON) +#include <arm_neon.h> +#endif + //! Rounding adjustment used by the inverse wavelet transforms static const int32_t rounding = 4; +#if ENABLED(NEON) +/*! + @brief NEON helper: vertical inverse middle-row filter for 4 columns + Computes even and odd outputs from 3 rows of lowpass and 1 row of highpass. 
+ even = ((row0 - row2 + 4) >> 3 + row1 + hp) >> 1 + odd = ((-row0 + row2 + 4) >> 3 + row1 - hp) >> 1 + */ +static INLINE void InvertVerticalMiddle4_NEON( + const PIXEL *row0, const PIXEL *row1, const PIXEL *row2, + const PIXEL *hp, PIXEL *even_out, PIXEL *odd_out, int col) +{ + const int32x4_t four = vdupq_n_s32(4); + + int32x4_t r0 = vld1q_s32(&row0[col]); + int32x4_t r1 = vld1q_s32(&row1[col]); + int32x4_t r2 = vld1q_s32(&row2[col]); + int32x4_t h = vld1q_s32(&hp[col]); + + // even path: (r0 - r2 + 4) >> 3 + r1 + h, then >> 1 + int32x4_t diff = vsubq_s32(r0, r2); + diff = vaddq_s32(diff, four); + diff = vshrq_n_s32(diff, 3); + int32x4_t even_v = vaddq_s32(vaddq_s32(diff, r1), h); + even_v = vshrq_n_s32(even_v, 1); + vst1q_s32(&even_out[col], even_v); + + // odd path: (-r0 + r2 + 4) >> 3 + r1 - h, then >> 1 + int32x4_t diff_odd = vsubq_s32(r2, r0); + diff_odd = vaddq_s32(diff_odd, four); + diff_odd = vshrq_n_s32(diff_odd, 3); + int32x4_t odd_v = vsubq_s32(vaddq_s32(diff_odd, r1), h); + odd_v = vshrq_n_s32(odd_v, 1); + vst1q_s32(&odd_out[col], odd_v); +} + +/*! 
+ @brief NEON helper: vertical inverse middle-row filter for Descale variant + Same as above but uses DivideByShift(x,1) = x >> 1 instead of >> 1 + (which is the same operation, so the only diff is conceptual) + */ +static INLINE void InvertVerticalMiddle4Descale_NEON( + const PIXEL *row0, const PIXEL *row1, const PIXEL *row2, + const PIXEL *hp, PIXEL *even_out, PIXEL *odd_out, int col) +{ + const int32x4_t four = vdupq_n_s32(4); + + int32x4_t r0 = vld1q_s32(&row0[col]); + int32x4_t r1 = vld1q_s32(&row1[col]); + int32x4_t r2 = vld1q_s32(&row2[col]); + int32x4_t h = vld1q_s32(&hp[col]); + + int32x4_t diff = vsubq_s32(r0, r2); + diff = vaddq_s32(diff, four); + diff = vshrq_n_s32(diff, 3); + int32x4_t even_v = vaddq_s32(vaddq_s32(diff, r1), h); + even_v = vshrq_n_s32(even_v, 1); + vst1q_s32(&even_out[col], even_v); + + int32x4_t diff_odd = vsubq_s32(r2, r0); + diff_odd = vaddq_s32(diff_odd, four); + diff_odd = vshrq_n_s32(diff_odd, 3); + int32x4_t odd_v = vsubq_s32(vaddq_s32(diff_odd, r1), h); + odd_v = vshrq_n_s32(odd_v, 1); + vst1q_s32(&odd_out[col], odd_v); +} +#endif + /*! @brief Apply the inverse horizontal wavelet transform - This routine applies the inverse wavelet transform to a row of - lowpass and highpass coefficients, producing an output row that - is write as wide. 
*/ -STATIC CODEC_ERROR InvertHorizontal16s(PIXEL *lowpass, //!< Horizontal lowpass coefficients - PIXEL *highpass, //!< Horizontal highpass coefficients - PIXEL *output, //!< Row of reconstructed results - DIMENSION input_width, //!< Number of values in the input row - DIMENSION output_width //!< Number of values in the output row -) +STATIC CODEC_ERROR InvertHorizontal16s(PIXEL *lowpass, PIXEL *highpass, PIXEL *output, + DIMENSION input_width, DIMENSION output_width) { const int last_column = input_width - 1; - int32_t even; int32_t odd; - - // Start processing at the beginning of the row int column = 0; - - // Process the first two output points with special filters for the left border - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 11 * lowpass[column + 0]; - even -= 4 * lowpass[column + 1]; - even += 1 * lowpass[column + 2]; - even += rounding; + + // Left border + even = 11 * lowpass[0] - 4 * lowpass[1] + lowpass[2] + rounding; even = DivideByShift(even, 3); - - // Add the highpass correction - even += highpass[column]; - even >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Apply the odd reconstruction filter to the lowpass band - odd += 5 * lowpass[column + 0]; - odd += 4 * lowpass[column + 1]; - odd -= 1 * lowpass[column + 2]; - odd += rounding; + even = (even + highpass[0]) >> 1; + + odd = 5 * lowpass[0] + 4 * lowpass[1] - lowpass[2] + rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction - odd -= highpass[column]; - odd >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Store the last two output points produced by the loop - output[2 * column + 0] = clamp_uint14(even); - output[2 * column + 1] = clamp_uint14(odd); - - // Advance to the next input column (second pair of output values) - column++; - - // Process the rest of the columns up to the last column in the 
row + odd = (odd - highpass[0]) >> 1; + + output[0] = ClampPixel(even); + output[1] = ClampPixel(odd); + column = 1; + +#if ENABLED(NEON) + { + const int32x4_t four = vdupq_n_s32(4); + for (; column + 3 < last_column; column += 4) + { + int32x4_t lp_left = vld1q_s32(&lowpass[column - 1]); + int32x4_t lp_center = vld1q_s32(&lowpass[column]); + int32x4_t lp_right = vld1q_s32(&lowpass[column + 1]); + int32x4_t hp_center = vld1q_s32(&highpass[column]); + + int32x4_t diff_e = vsubq_s32(lp_left, lp_right); + diff_e = vaddq_s32(diff_e, four); + diff_e = vshrq_n_s32(diff_e, 3); + int32x4_t even_v = vaddq_s32(diff_e, lp_center); + even_v = vaddq_s32(even_v, hp_center); + even_v = vshrq_n_s32(even_v, 1); + + int32x4_t diff_o = vsubq_s32(lp_right, lp_left); + diff_o = vaddq_s32(diff_o, four); + diff_o = vshrq_n_s32(diff_o, 3); + int32x4_t odd_v = vaddq_s32(diff_o, lp_center); + odd_v = vsubq_s32(odd_v, hp_center); + odd_v = vshrq_n_s32(odd_v, 1); + + int32x4x2_t interleaved; + interleaved.val[0] = even_v; + interleaved.val[1] = odd_v; + vst2q_s32(&output[2 * column], interleaved); + } + } +#endif + + // Scalar middle columns for (; column < last_column; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - // Apply the even reconstruction filter to the lowpass band - - even += lowpass[column - 1]; - even -= lowpass[column + 1]; - even += 4; + even = lowpass[column - 1] - lowpass[column + 1] + 4; even >>= 3; - even += lowpass[column + 0]; - - // Add the highpass correction - even += highpass[column]; - even >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even column - //output[2 * column + 0] = clamp_uint12(even); - output[2 * column + 0] = clamp_uint14(even); - - // Apply the odd reconstruction filter to the lowpass band - odd -= lowpass[column - 1]; - odd += lowpass[column + 1]; - odd += 4; + even += 
lowpass[column]; + even = (even + highpass[column]) >> 1; + output[2 * column] = ClampPixel(even); + + odd = -lowpass[column - 1] + lowpass[column + 1] + 4; odd >>= 3; - odd += lowpass[column + 0]; - - // Subtract the highpass correction - odd -= highpass[column]; - odd >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd column - //output[2 * column + 1] = clamp_uint14(odd); - output[2 * column + 1] = clamp_uint14(odd); + odd += lowpass[column]; + odd = (odd - highpass[column]) >> 1; + output[2 * column + 1] = ClampPixel(odd); } - - // Should have exited the loop at the column for right border processing + assert(column == last_column); - - // Process the last two output points with special filters for the right border - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowpass[column + 0]; - even += 4 * lowpass[column - 1]; - even -= 1 * lowpass[column - 2]; - even += rounding; + + // Right border + even = 5 * lowpass[column] + 4 * lowpass[column - 1] - lowpass[column - 2] + rounding; even = DivideByShift(even, 3); - - // Add the highpass correction - even += highpass[column]; - even >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even column - output[2 * column + 0] = clamp_uint14(even); - + even = (even + highpass[column]) >> 1; + output[2 * column] = ClampPixel(even); + if (2 * column + 1 < output_width) { - // Apply the odd reconstruction filter to the lowpass band - odd += 11 * lowpass[column + 0]; - odd -= 4 * lowpass[column - 1]; - odd += 1 * lowpass[column - 2]; - odd += rounding; + odd = 11 * lowpass[column] - 4 * lowpass[column - 1] + lowpass[column - 2] + rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction - odd -= highpass[column]; - odd >>= 1; - - // The lowpass result should be a positive number 
- //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd column - output[2 * column + 1] = clamp_uint14(odd); + odd = (odd - highpass[column]) >> 1; + output[2 * column + 1] = ClampPixel(odd); } - + return CODEC_ERROR_OKAY; } /*! - @brief Apply the inverse horizontal wavelet transform - This routine is similar to @ref InvertHorizontal16s, but a scale factor - that was applied during encoding is removed from the output values. + @brief Apply the inverse horizontal wavelet transform with descaling */ STATIC CODEC_ERROR InvertHorizontalDescale16s(PIXEL *lowpass, PIXEL *highpass, PIXEL *output, DIMENSION input_width, DIMENSION output_width, int descale) { const int last_column = input_width - 1; - - // Start processing at the beginning of the row int column = 0; - int descale_shift = 0; - - int32_t even; - int32_t odd; - - /* - The implementation of the inverse filter includes descaling by a factor of two - because the last division by two in the computation of the even and odd results - that is performed using a right arithmetic shift has been omitted from the code. 
- */ - if (descale == 2) { - descale_shift = 1; - } - - // Check that the descaling value is reasonable + int32_t even, odd; + + if (descale == 2) descale_shift = 1; + else if (descale == 3) descale_shift = 2; assert(descale_shift >= 0); - - // Process the first two output points with special filters for the left border - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 11 * lowpass[column + 0]; - even -= 4 * lowpass[column + 1]; - even += 1 * lowpass[column + 2]; - even += rounding; + + // Left border + even = 11 * lowpass[0] - 4 * lowpass[1] + lowpass[2] + rounding; even = DivideByShift(even, 3); - - // Add the highpass correction - even += highpass[column]; - - // Remove any scaling used during encoding - even <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Apply the odd reconstruction filter to the lowpass band - odd += 5 * lowpass[column + 0]; - odd += 4 * lowpass[column + 1]; - odd -= 1 * lowpass[column + 2]; - odd += rounding; + even = (even + highpass[0]) << descale_shift; + + odd = 5 * lowpass[0] + 4 * lowpass[1] - lowpass[2] + rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction - odd -= highpass[column]; - - // Remove any scaling used during encoding - odd <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - output[2 * column + 0] = ClampPixel(even); - output[2 * column + 1] = ClampPixel(odd); - - // Advance to the next input column (second pair of output values) - column++; - - // Process the rest of the columns up to the last column in the row + odd = (odd - highpass[0]) << descale_shift; + + output[0] = ClampPixel(even); + output[1] = ClampPixel(odd); + column = 1; + +#if ENABLED(NEON) + { + const int32x4_t four = vdupq_n_s32(4); + const int32x4_t descale_vec = vdupq_n_s32(descale_shift); + + for (; column + 3 < last_column; column += 4) + { 
+ int32x4_t lp_left = vld1q_s32(&lowpass[column - 1]); + int32x4_t lp_center = vld1q_s32(&lowpass[column]); + int32x4_t lp_right = vld1q_s32(&lowpass[column + 1]); + int32x4_t hp_center = vld1q_s32(&highpass[column]); + + int32x4_t diff_e = vsubq_s32(lp_left, lp_right); + diff_e = vaddq_s32(diff_e, four); + diff_e = vshrq_n_s32(diff_e, 3); + int32x4_t even_v = vaddq_s32(diff_e, lp_center); + even_v = vaddq_s32(even_v, hp_center); + even_v = vshlq_s32(even_v, descale_vec); + + int32x4_t diff_o = vsubq_s32(lp_right, lp_left); + diff_o = vaddq_s32(diff_o, four); + diff_o = vshrq_n_s32(diff_o, 3); + int32x4_t odd_v = vaddq_s32(diff_o, lp_center); + odd_v = vsubq_s32(odd_v, hp_center); + odd_v = vshlq_s32(odd_v, descale_vec); + + int32x4x2_t interleaved; + interleaved.val[0] = even_v; + interleaved.val[1] = odd_v; + vst2q_s32(&output[2 * column], interleaved); + } + } +#endif + + // Scalar middle columns for (; column < last_column; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - // Apply the even reconstruction filter to the lowpass band - even += lowpass[column - 1]; - even -= lowpass[column + 1]; - even += 4; + even = lowpass[column - 1] - lowpass[column + 1] + 4; even >>= 3; - even += lowpass[column + 0]; - - // Add the highpass correction - even += highpass[column]; - - // Remove any scaling used during encoding - even <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even column - output[2 * column + 0] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band - odd -= lowpass[column - 1]; - odd += lowpass[column + 1]; - odd += 4; + even += lowpass[column]; + even = (even + highpass[column]) << descale_shift; + output[2 * column] = ClampPixel(even); + + odd = -lowpass[column - 1] + lowpass[column + 1] + 4; odd >>= 3; - odd += lowpass[column + 0]; - - 
// Subtract the highpass correction - odd -= highpass[column]; - - // Remove any scaling used during encoding - odd <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd column + odd += lowpass[column]; + odd = (odd - highpass[column]) << descale_shift; output[2 * column + 1] = ClampPixel(odd); } - - // Should have exited the loop at the column for right border processing + assert(column == last_column); - - // Process the last two output points with special filters for the right border - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowpass[column + 0]; - even += 4 * lowpass[column - 1]; - even -= 1 * lowpass[column - 2]; - even += rounding; + + // Right border + even = 5 * lowpass[column] + 4 * lowpass[column - 1] - lowpass[column - 2] + rounding; even = DivideByShift(even, 3); - - // Add the highpass correction - even += highpass[column]; - - // Remove any scaling used during encoding - even <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even column - output[2 * column + 0] = ClampPixel(even); - + even = (even + highpass[column]) << descale_shift; + output[2 * column] = ClampPixel(even); + if (2 * column + 1 < output_width) { - // Apply the odd reconstruction filter to the lowpass band - odd += 11 * lowpass[column + 0]; - odd -= 4 * lowpass[column - 1]; - odd += 1 * lowpass[column - 2]; - odd += rounding; + odd = 11 * lowpass[column] - 4 * lowpass[column - 1] + lowpass[column - 2] + rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction - odd -= highpass[column]; - - // Remove any scaling used during encoding - odd <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd column + odd = 
(odd - highpass[column]) << descale_shift; output[2 * column + 1] = ClampPixel(odd); } - + return CODEC_ERROR_OKAY; } /*! - @brief Apply the inverse spatial wavelet filter - Dequantize the coefficients in the highpass bands and apply the - inverse spatial wavelet filter to compute a lowpass band that - has twice the width and height of the input bands. - The inverse vertical filter is applied to the upper and lower bands - on the left and the upper and lower bands on the right. The inverse - horizontal filter is applied to the left and right (lowpass and highpass) - results from the vertical inverse. Each application of the inverse - vertical filter produces two output rows and each application of the - inverse horizontal filter produces an output row that is twice as wide. - The inverse wavelet filter is a three tap filter. - - For the even output values, add and subtract the off-center values, - add the rounding correction, and divide by eight, then add the center - value, add the highpass coefficient, and divide by two. - - For the odd output values, the add and subtract operations for the - off-center values are reversed the the highpass coefficient is subtracted. - Divisions are implemented by right arithmetic shifts. - Special formulas for the inverse vertical filter are applied to the top - and bottom rows. 
+ @brief Apply the inverse spatial wavelet filter (no descaling) */ CODEC_ERROR InvertSpatialQuant16s(gpr_allocator *allocator, PIXEL *lowlow_band, int lowlow_pitch, @@ -379,400 +299,262 @@ CODEC_ERROR InvertSpatialQuant16s(gpr_allocator *allocator, PIXEL *highlow = highlow_band; PIXEL *highhigh = highhigh_band; PIXEL *output = output_image; - PIXEL *even_lowpass; - PIXEL *even_highpass; - PIXEL *odd_lowpass; - PIXEL *odd_highpass; - PIXEL *even_output; - PIXEL *odd_output; + PIXEL *even_lowpass, *even_highpass, *odd_lowpass, *odd_highpass; + PIXEL *even_output, *odd_output; size_t buffer_row_size; int last_row = input_height - 1; int row, column; - PIXEL *lowhigh_row[3]; - PIXEL *lowhigh_line[3]; - PIXEL *highlow_line; - PIXEL *highhigh_line; - + PIXEL *highlow_line, *highhigh_line; + QUANT highlow_quantization = quantization[HL_BAND]; QUANT lowhigh_quantization = quantization[LH_BAND]; QUANT highhigh_quantization = quantization[HH_BAND]; - - // Compute positions within the temporary buffer for each row of horizontal lowpass - // and highpass intermediate coefficients computed by the vertical inverse transform + buffer_row_size = input_width * sizeof(PIXEL); - - // Compute the positions of the even and odd rows of coefficients - even_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); + + even_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); even_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); - odd_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); - odd_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); - - // Compute the positions of the dequantized highpass rows + odd_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); + odd_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); + lowhigh_line[0] = (PIXEL *)allocator->Alloc(buffer_row_size); lowhigh_line[1] = (PIXEL *)allocator->Alloc(buffer_row_size); lowhigh_line[2] = (PIXEL *)allocator->Alloc(buffer_row_size); - highlow_line = (PIXEL *)allocator->Alloc(buffer_row_size); - 
highhigh_line = (PIXEL *)allocator->Alloc(buffer_row_size); - - // Convert pitch from bytes to pixels - lowlow_pitch /= sizeof(PIXEL); - lowhigh_pitch /= sizeof(PIXEL); - highlow_pitch /= sizeof(PIXEL); + highlow_line = (PIXEL *)allocator->Alloc(buffer_row_size); + highhigh_line = (PIXEL *)allocator->Alloc(buffer_row_size); + + lowlow_pitch /= sizeof(PIXEL); + lowhigh_pitch /= sizeof(PIXEL); + highlow_pitch /= sizeof(PIXEL); highhigh_pitch /= sizeof(PIXEL); - output_pitch /= sizeof(PIXEL); - - // Initialize the pointers to the even and odd output rows + output_pitch /= sizeof(PIXEL); + even_output = output; - odd_output = output + output_pitch; - - // Apply the vertical border filter to the first row + odd_output = output + output_pitch; + + // First row (top border) row = 0; - - // Set pointers to the first three rows in the first highpass band lowhigh_row[0] = lowhigh + 0 * lowhigh_pitch; lowhigh_row[1] = lowhigh + 1 * lowhigh_pitch; lowhigh_row[2] = lowhigh + 2 * lowhigh_pitch; - - // Dequantize three rows of highpass coefficients in the first highpass band + DequantizeBandRow16s(lowhigh_row[0], input_width, lowhigh_quantization, lowhigh_line[0]); DequantizeBandRow16s(lowhigh_row[1], input_width, lowhigh_quantization, lowhigh_line[1]); DequantizeBandRow16s(lowhigh_row[2], input_width, lowhigh_quantization, lowhigh_line[2]); - - // Dequantize one row of coefficients each in the second and third highpass bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - + for (column = 0; column < input_width; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band + 
int32_t even = 0, odd = 0; + + // Left bands (lowlow + highlow) - top border filter even += 11 * lowlow[column + 0 * lowlow_pitch]; even -= 4 * lowlow[column + 1 * lowlow_pitch]; even += 1 * lowlow[column + 2 * lowlow_pitch]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highlow_line[column]; even >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 5 * lowlow[column + 0 * lowlow_pitch]; odd += 4 * lowlow[column + 1 * lowlow_pitch]; odd -= 1 * lowlow[column + 2 * lowlow_pitch]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highlow_line[column]; odd >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band + + // Right bands (lowhigh + highhigh) - top border filter + even = 0; odd = 0; even += 11 * lowhigh_line[0][column]; even -= 4 * lowhigh_line[1][column]; even += 1 * lowhigh_line[2][column]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highhigh_line[column]; even >>= 1; - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 5 * lowhigh_line[0][column]; odd += 4 * lowhigh_line[1][column]; odd -= 1 * lowhigh_line[2][column]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highhigh_line[column]; odd >>= 1; - - // Place the odd result in the odd row 
odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows + InvertHorizontal16s(even_lowpass, even_highpass, even_output, input_width, output_width); - InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); - - // Advance to the next pair of even and odd output rows + InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); + even_output += 2 * output_pitch; - odd_output += 2 * output_pitch; - - // Always advance the highpass row pointers - highlow += highlow_pitch; + odd_output += 2 * output_pitch; + highlow += highlow_pitch; highhigh += highhigh_pitch; - - // Advance the row index row++; - - // Process the middle rows using the interior reconstruction filters + + // Middle rows for (; row < last_row; row++) { - // Dequantize one row from each of the two highpass bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - - // Process the entire row - for (column = 0; column < input_width; column++) + + column = 0; +#if ENABLED(NEON) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band + const int width_m4 = (input_width / 4) * 4; + for (; column < width_m4; column += 4) + { + // Left bands + InvertVerticalMiddle4_NEON( + lowlow + 0 * lowlow_pitch, lowlow + 1 * lowlow_pitch, lowlow + 2 * lowlow_pitch, + highlow_line, even_lowpass, odd_lowpass, column); + // Right bands + InvertVerticalMiddle4_NEON( + lowhigh_line[0], lowhigh_line[1], lowhigh_line[2], + highhigh_line, even_highpass, odd_highpass, column); + } + } +#endif + for (; column < input_width; 
column++) + { + int32_t even = 0, odd = 0; + even += lowlow[column + 0 * lowlow_pitch]; even -= lowlow[column + 2 * lowlow_pitch]; - even += 4; - even >>= 3; + even += 4; even >>= 3; even += lowlow[column + 1 * lowlow_pitch]; - - // Add the highpass correction even += highlow_line[column]; even >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd -= lowlow[column + 0 * lowlow_pitch]; odd += lowlow[column + 2 * lowlow_pitch]; - odd += 4; - odd >>= 3; + odd += 4; odd >>= 3; odd += lowlow[column + 1 * lowlow_pitch]; - - // Subtract the highpass correction odd -= highlow_line[column]; odd >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band + + even = 0; odd = 0; even += lowhigh_line[0][column]; even -= lowhigh_line[2][column]; - even += 4; - even >>= 3; + even += 4; even >>= 3; even += lowhigh_line[1][column]; - - // Add the highpass correction even += highhigh_line[column]; even >>= 1; - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd -= lowhigh_line[0][column]; odd += lowhigh_line[2][column]; - odd += 4; - odd >>= 3; + odd += 4; odd >>= 3; odd += lowhigh_line[1][column]; - - // Subtract the highpass correction odd -= highhigh_line[column]; odd >>= 1; - - // Place the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results + 
InvertHorizontal16s(even_lowpass, even_highpass, even_output, input_width, output_width); - InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); - - // Advance to the next input row in each band - lowlow += lowlow_pitch; - lowhigh += lowhigh_pitch; - highlow += highlow_pitch; + InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); + + lowlow += lowlow_pitch; + lowhigh += lowhigh_pitch; + highlow += highlow_pitch; highhigh += highhigh_pitch; - - // Advance to the next pair of even and odd output rows even_output += 2 * output_pitch; - odd_output += 2 * output_pitch; - + odd_output += 2 * output_pitch; + if (row < last_row - 1) { - // Compute the address of the next row in the lowhigh band PIXEL *lowhigh_row_ptr = (lowhigh + 2 * lowhigh_pitch); - //PIXEL *lowhigh_row_ptr = (lowhigh + lowhigh_pitch); - - // Shift the rows in the buffer of dequantized lowhigh bands PIXEL *temp = lowhigh_line[0]; lowhigh_line[0] = lowhigh_line[1]; lowhigh_line[1] = lowhigh_line[2]; lowhigh_line[2] = temp; - - // Undo quantization for the next row in the lowhigh band DequantizeBandRow16s(lowhigh_row_ptr, input_width, lowhigh_quantization, lowhigh_line[2]); } } - - // Should have exited the loop at the last row + assert(row == last_row); - - // Advance the lowlow pointer to the last row in the band lowlow += lowlow_pitch; - - // Check that the band pointers are on the last row in each wavelet band + assert(lowlow == (lowlow_band + last_row * lowlow_pitch)); - assert(highlow == (highlow_band + last_row * highlow_pitch)); assert(highhigh == (highhigh_band + last_row * highhigh_pitch)); - - // Undo quantization for the highlow and highhigh bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - - // Apply the vertical border 
filter to the last row + + // Last row (bottom border) for (column = 0; column < input_width; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowlow[column + 0 * lowlow_pitch]; - even += 4 * lowlow[column - 1 * lowlow_pitch]; - even -= 1 * lowlow[column - 2 * lowlow_pitch]; + int32_t even = 0, odd = 0; + + // Left bands - bottom border + even += 5 * lowlow[column + 0 * lowlow_pitch]; + even += 4 * lowlow[column - 1 * lowlow_pitch]; + even -= 1 * lowlow[column - 2 * lowlow_pitch]; even += 4; even = DivideByShift(even, 3); - - // Add the highpass correction even += highlow_line[column]; even >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 11 * lowlow[column + 0 * lowlow_pitch]; odd -= 4 * lowlow[column - 1 * lowlow_pitch]; odd += 1 * lowlow[column - 2 * lowlow_pitch]; odd += 4; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highlow_line[column]; odd >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - // Compute the vertical inverse for the right two bands // - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowhigh_line[2][column]; - even += 4 * lowhigh_line[1][column]; - even -= 1 * lowhigh_line[0][column]; + + // Right bands - bottom border + even = 0; odd = 0; + even += 5 * lowhigh_line[2][column]; + even += 4 * lowhigh_line[1][column]; + even -= 1 * lowhigh_line[0][column]; 
even += 4; even = DivideByShift(even, 3); - - // Add the highpass correction even += highhigh_line[column]; even >>= 1; - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 11 * lowhigh_line[2][column]; odd -= 4 * lowhigh_line[1][column]; odd += 1 * lowhigh_line[0][column]; odd += 4; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highhigh_line[column]; odd >>= 1; - - // Place the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results + InvertHorizontal16s(even_lowpass, even_highpass, even_output, input_width, output_width); - - // Is the output wavelet shorter than twice the height of the input wavelet? - if (2 * row + 1 < output_height) { + if (2 * row + 1 < output_height) InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); - } - - // Free the scratch buffers + allocator->Free(even_lowpass); allocator->Free(even_highpass); allocator->Free(odd_lowpass); allocator->Free(odd_highpass); - allocator->Free(lowhigh_line[0]); allocator->Free(lowhigh_line[1]); allocator->Free(lowhigh_line[2]); allocator->Free(highlow_line); allocator->Free(highhigh_line); - + return CODEC_ERROR_OKAY; } /*! @brief Apply the inverse spatial transform with descaling - This routine is similar to @ref InvertSpatialQuant16s, but a scale factor - that was applied during encoding is removed from the output values. 
*/ CODEC_ERROR InvertSpatialQuantDescale16s(gpr_allocator *allocator, PIXEL *lowlow_band, int lowlow_pitch, @@ -789,400 +571,256 @@ CODEC_ERROR InvertSpatialQuantDescale16s(gpr_allocator *allocator, PIXEL *highlow = highlow_band; PIXEL *highhigh = highhigh_band; PIXEL *output = output_image; - PIXEL *even_lowpass; - PIXEL *even_highpass; - PIXEL *odd_lowpass; - PIXEL *odd_highpass; - PIXEL *even_output; - PIXEL *odd_output; + PIXEL *even_lowpass, *even_highpass, *odd_lowpass, *odd_highpass; + PIXEL *even_output, *odd_output; size_t buffer_row_size; int last_row = input_height - 1; int row, column; - PIXEL *lowhigh_row[3]; - PIXEL *lowhigh_line[3]; - PIXEL *highlow_line; - PIXEL *highhigh_line; - + PIXEL *highlow_line, *highhigh_line; + QUANT highlow_quantization = quantization[HL_BAND]; QUANT lowhigh_quantization = quantization[LH_BAND]; QUANT highhigh_quantization = quantization[HH_BAND]; - - // Compute positions within the temporary buffer for each row of horizontal lowpass - // and highpass intermediate coefficients computed by the vertical inverse transform + buffer_row_size = input_width * sizeof(PIXEL); - - // Allocate space for the even and odd rows of results from the inverse vertical filter - even_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); + + even_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); even_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); - odd_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); - odd_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); - - // Allocate scratch space for the dequantized highpass coefficients + odd_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); + odd_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); + lowhigh_line[0] = (PIXEL *)allocator->Alloc(buffer_row_size); lowhigh_line[1] = (PIXEL *)allocator->Alloc(buffer_row_size); lowhigh_line[2] = (PIXEL *)allocator->Alloc(buffer_row_size); - highlow_line = (PIXEL *)allocator->Alloc(buffer_row_size); - highhigh_line = (PIXEL 
*)allocator->Alloc(buffer_row_size); - - // Convert pitch from bytes to pixels - lowlow_pitch /= sizeof(PIXEL); - lowhigh_pitch /= sizeof(PIXEL); - highlow_pitch /= sizeof(PIXEL); + highlow_line = (PIXEL *)allocator->Alloc(buffer_row_size); + highhigh_line = (PIXEL *)allocator->Alloc(buffer_row_size); + + lowlow_pitch /= sizeof(PIXEL); + lowhigh_pitch /= sizeof(PIXEL); + highlow_pitch /= sizeof(PIXEL); highhigh_pitch /= sizeof(PIXEL); - output_pitch /= sizeof(PIXEL); - - // Initialize the pointers to the even and odd output rows + output_pitch /= sizeof(PIXEL); + even_output = output; - odd_output = output + output_pitch; - - // Apply the vertical border filter to the first row + odd_output = output + output_pitch; + + // First row (top border) row = 0; - - // Set pointers to the first three rows in the first highpass band lowhigh_row[0] = lowhigh + 0 * lowhigh_pitch; lowhigh_row[1] = lowhigh + 1 * lowhigh_pitch; lowhigh_row[2] = lowhigh + 2 * lowhigh_pitch; - - // Dequantize three rows of highpass coefficients in the first highpass band + DequantizeBandRow16s(lowhigh_row[0], input_width, lowhigh_quantization, lowhigh_line[0]); DequantizeBandRow16s(lowhigh_row[1], input_width, lowhigh_quantization, lowhigh_line[1]); DequantizeBandRow16s(lowhigh_row[2], input_width, lowhigh_quantization, lowhigh_line[2]); - - // Dequantize one row of coefficients each in the second and third highpass bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - + for (column = 0; column < input_width; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band + int32_t even = 0, odd = 
0; + + // Left bands - top border (descale uses DivideByShift(x,1) instead of >>1) even += 11 * lowlow[column + 0 * lowlow_pitch]; even -= 4 * lowlow[column + 1 * lowlow_pitch]; even += 1 * lowlow[column + 2 * lowlow_pitch]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highlow_line[column]; even = DivideByShift(even, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 5 * lowlow[column + 0 * lowlow_pitch]; odd += 4 * lowlow[column + 1 * lowlow_pitch]; odd -= 1 * lowlow[column + 2 * lowlow_pitch]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highlow_line[column]; odd = DivideByShift(odd, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band + + // Right bands - top border + even = 0; odd = 0; even += 11 * lowhigh_line[0][column]; even -= 4 * lowhigh_line[1][column]; even += 1 * lowhigh_line[2][column]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highhigh_line[column]; even = DivideByShift(even, 1); - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 5 * lowhigh_line[0][column]; odd += 4 * lowhigh_line[1][column]; odd -= 1 * lowhigh_line[2][column]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highhigh_line[column]; odd = DivideByShift(odd, 1); - - // Place 
the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results - InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, - input_width, output_width, descale); - - InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, - input_width, output_width, descale); - - // Advance to the next pair of even and odd output rows + + InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, input_width, output_width, descale); + InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width, descale); + even_output += 2 * output_pitch; - odd_output += 2 * output_pitch; - - // Always advance the highpass row pointers - highlow += highlow_pitch; + odd_output += 2 * output_pitch; + highlow += highlow_pitch; highhigh += highhigh_pitch; - - // Advance the row index row++; - - // Process the middle rows using the interior reconstruction filters + + // Middle rows for (; row < last_row; row++) { - // Dequantize one row from each of the two highpass bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - - // Process the entire row - for (column = 0; column < input_width; column++) + + column = 0; +#if ENABLED(NEON) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band + const int width_m4 = (input_width / 4) * 4; + for (; column < width_m4; column += 4) + { + // Left bands + InvertVerticalMiddle4Descale_NEON( + lowlow + 0 * lowlow_pitch, lowlow + 1 * lowlow_pitch, lowlow + 2 * lowlow_pitch, + highlow_line, even_lowpass, 
odd_lowpass, column); + // Right bands + InvertVerticalMiddle4Descale_NEON( + lowhigh_line[0], lowhigh_line[1], lowhigh_line[2], + highhigh_line, even_highpass, odd_highpass, column); + } + } +#endif + for (; column < input_width; column++) + { + int32_t even = 0, odd = 0; + even += lowlow[column + 0 * lowlow_pitch]; even -= lowlow[column + 2 * lowlow_pitch]; - even += 4; - even >>= 3; + even += 4; even >>= 3; even += lowlow[column + 1 * lowlow_pitch]; - - // Add the highpass correction even += highlow_line[column]; even = DivideByShift(even, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd -= lowlow[column + 0 * lowlow_pitch]; odd += lowlow[column + 2 * lowlow_pitch]; - odd += 4; - odd >>= 3; + odd += 4; odd >>= 3; odd += lowlow[column + 1 * lowlow_pitch]; - - // Subtract the highpass correction odd -= highlow_line[column]; odd = DivideByShift(odd, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band + + even = 0; odd = 0; even += lowhigh_line[0][column]; even -= lowhigh_line[2][column]; - even += 4; - even >>= 3; + even += 4; even >>= 3; even += lowhigh_line[1][column]; - - // Add the highpass correction even += highhigh_line[column]; even = DivideByShift(even, 1); - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd -= lowhigh_line[0][column]; odd += lowhigh_line[2][column]; - odd += 4; - odd >>= 3; + odd += 4; odd >>= 3; odd += 
lowhigh_line[1][column]; - - // Subtract the highpass correction odd -= highhigh_line[column]; odd = DivideByShift(odd, 1); - - // Place the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results - InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, - input_width, output_width, descale); - - InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, - input_width, output_width, descale); - - // Advance to the next input row in each band - lowlow += lowlow_pitch; - lowhigh += lowhigh_pitch; - highlow += highlow_pitch; + + InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, input_width, output_width, descale); + InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width, descale); + + lowlow += lowlow_pitch; + lowhigh += lowhigh_pitch; + highlow += highlow_pitch; highhigh += highhigh_pitch; - - // Advance to the next pair of even and odd output rows even_output += 2 * output_pitch; - odd_output += 2 * output_pitch; - + odd_output += 2 * output_pitch; + if (row < last_row - 1) { - // Compute the address of the next row in the lowhigh band PIXEL *lowhigh_row_ptr = (lowhigh + 2 * lowhigh_pitch); - - // Shift the rows in the buffer of dequantized lowhigh bands PIXEL *temp = lowhigh_line[0]; lowhigh_line[0] = lowhigh_line[1]; lowhigh_line[1] = lowhigh_line[2]; lowhigh_line[2] = temp; - - // Undo quantization for the next row in the lowhigh band DequantizeBandRow16s(lowhigh_row_ptr, input_width, lowhigh_quantization, lowhigh_line[2]); } } - - // Should have exited the loop at the last row + assert(row == last_row); - - // Advance the lowlow pointer to the last row in the band lowlow += lowlow_pitch; - - // Check that the band pointers are on the last row in each wavelet band + assert(lowlow == (lowlow_band + last_row * lowlow_pitch)); - assert(highlow == (highlow_band + last_row * 
highlow_pitch)); assert(highhigh == (highhigh_band + last_row * highhigh_pitch)); - - // Undo quantization for the highlow and highhigh bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - - // Apply the vertical border filter to the last row + + // Last row (bottom border) for (column = 0; column < input_width; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowlow[column + 0 * lowlow_pitch]; - even += 4 * lowlow[column - 1 * lowlow_pitch]; - even -= 1 * lowlow[column - 2 * lowlow_pitch]; + int32_t even = 0, odd = 0; + + // Left bands - bottom border + even += 5 * lowlow[column + 0 * lowlow_pitch]; + even += 4 * lowlow[column - 1 * lowlow_pitch]; + even -= 1 * lowlow[column - 2 * lowlow_pitch]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highlow_line[column]; even = DivideByShift(even, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 11 * lowlow[column + 0 * lowlow_pitch]; odd -= 4 * lowlow[column - 1 * lowlow_pitch]; odd += 1 * lowlow[column - 2 * lowlow_pitch]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highlow_line[column]; odd = DivideByShift(odd, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row 
odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowhigh_line[2][column]; - even += 4 * lowhigh_line[1][column]; - even -= 1 * lowhigh_line[0][column]; + + // Right bands - bottom border + even = 0; odd = 0; + even += 5 * lowhigh_line[2][column]; + even += 4 * lowhigh_line[1][column]; + even -= 1 * lowhigh_line[0][column]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highhigh_line[column]; even = DivideByShift(even, 1); - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 11 * lowhigh_line[2][column]; odd -= 4 * lowhigh_line[1][column]; odd += 1 * lowhigh_line[0][column]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highhigh_line[column]; odd = DivideByShift(odd, 1); - - // Place the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results - InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, - input_width, output_width, descale); - - // Is the output wavelet shorter than twice the height of the input wavelet? 
- if (2 * row + 1 < output_height) { - InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, - input_width, output_width, descale); - } - - // Free the scratch buffers + + InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, input_width, output_width, descale); + if (2 * row + 1 < output_height) + InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width, descale); + allocator->Free(even_lowpass); allocator->Free(even_highpass); allocator->Free(odd_lowpass); allocator->Free(odd_highpass); - allocator->Free(lowhigh_line[0]); allocator->Free(lowhigh_line[1]); allocator->Free(lowhigh_line[2]); allocator->Free(highlow_line); allocator->Free(highhigh_line); - + return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_decoder/raw.c b/source/lib/vc5_decoder/raw.c index 7cbac6ff..27fc1d29 100755 --- a/source/lib/vc5_decoder/raw.c +++ b/source/lib/vc5_decoder/raw.c @@ -17,6 +17,11 @@ */ #include "headers.h" +#include "logcurve.h" + +#if ENABLED(NEON) +#include +#endif /*! 
@brief Pack the component arrays into an output image @@ -30,6 +35,68 @@ CODEC_ERROR PackComponentsToRAW(const UNPACKED_IMAGE *image, DIMENSION width, DIMENSION height, ENABLED_PARTS enabled_parts, uint16_t output_bit_depth, PIXEL_FORMAT output_format ) { + int component_bits = image->component_array_list[0].bits_per_component; + if (component_bits <= 0) + component_bits = 12; + if (component_bits > 16) + component_bits = 16; + if (!vc5_logcurve_bypass()) + { + // Scan ALL components to find the maximum value (not just component 0) + // This is critical because RG/BG/GD are midpoint-centered at 2^(bits-1) + COMPONENT_VALUE max_val = 0; + for (int comp = 0; comp < image->component_count; ++comp) + { + COMPONENT_VALUE *component_data = image->component_array_list[comp].data; + if (component_data != NULL) + { + size_t total = (size_t)image->component_array_list[comp].width * image->component_array_list[comp].height; +#if ENABLED(NEON) + { + size_t total_m4 = (total / 4) * 4; + int32x4_t vmax4 = vdupq_n_s32(0); + size_t idx; + for (idx = 0; idx < total_m4; idx += 4) + { + int32x4_t vals = vld1q_s32(&component_data[idx]); + vmax4 = vmaxq_s32(vmax4, vals); + } + max_val = maximum(max_val, vmaxvq_s32(vmax4)); + for (; idx < total; ++idx) + { + if (component_data[idx] > max_val) + max_val = component_data[idx]; + } + } +#else + for (size_t idx = 0; idx < total; ++idx) + { + COMPONENT_VALUE value = component_data[idx]; + if (value > max_val) + max_val = value; + } +#endif + } + } + if (max_val > 0) + { + int bits_from_data = 0; + uint32_t val = (uint32_t)max_val; + while (val > 0) + { + bits_from_data++; + val >>= 1; + } + if (bits_from_data > 0) + { + if (bits_from_data > 16) bits_from_data = 16; + if (bits_from_data < 12) bits_from_data = 12; + component_bits = minimum(component_bits, bits_from_data); + } + } + } + const int log_bits = component_bits; + // Define pointers to the rows for each input component COMPONENT_VALUE *GS_input_buffer; COMPONENT_VALUE 
*RG_input_buffer; @@ -53,6 +120,8 @@ CODEC_ERROR PackComponentsToRAW(const UNPACKED_IMAGE *image, // Compute the distance between the half rows in the Bayer grid output_half_pitch = output_pitch / 2; + const int32_t midpoint = 1 << (log_bits - 1); + for (row = 0; row < height; row++) { COMPONENT_VALUE *GS_input_row_ptr = (COMPONENT_VALUE *)((uintptr_t)GS_input_buffer + row * image->component_array_list[0].pitch); @@ -62,71 +131,157 @@ CODEC_ERROR PackComponentsToRAW(const UNPACKED_IMAGE *image, uint8_t *output_row_ptr = (uint8_t *)output_buffer + row * output_pitch; - const int32_t midpoint = 2048; - int column; output_row1_ptr = (uint16_t *)output_row_ptr; output_row2_ptr = (uint16_t *)(output_row_ptr + output_half_pitch); - // Pack the rows of Bayer components into the BYR4 pattern - for (column = 0; column < width; column++) + // Determine output ordering from format (once per row, outside column loop) + int rggb_order; + switch (output_format) + { + case PIXEL_FORMAT_RAW_RGGB_12: + case PIXEL_FORMAT_RAW_RGGB_14: + case PIXEL_FORMAT_RAW_RGGB_16: + rggb_order = 1; + break; + case PIXEL_FORMAT_RAW_GBRG_12: + case PIXEL_FORMAT_RAW_GBRG_14: + case PIXEL_FORMAT_RAW_GBRG_16: + rggb_order = 0; + break; + default: + assert(0); + rggb_order = 0; + break; + } + +#if ENABLED(NEON) + { + const int width_m4 = (width / 4) * 4; + const int32x4_t v_midpoint = vdupq_n_s32(midpoint); + const int32x4_t v_zero = vdupq_n_s32(0); + const int32x4_t v_max = vdupq_n_s32((1 << log_bits) - 1); + const int bypass = vc5_logcurve_bypass(); + const int shift = 16 - output_bit_depth; + + // NEON: process 4 columns at a time + for (column = 0; column < width_m4; column += 4) + { + int32x4_t gs = vld1q_s32(&GS_input_row_ptr[column]); + int32x4_t rg = vld1q_s32(&RG_input_row_ptr[column]); + int32x4_t bg = vld1q_s32(&BG_input_row_ptr[column]); + int32x4_t gd = vld1q_s32(&GD_input_row_ptr[column]); + + gd = vsubq_s32(gd, v_midpoint); + rg = vsubq_s32(rg, v_midpoint); + bg = vsubq_s32(bg, 
v_midpoint); + + int32x4_t r = vaddq_s32(vshlq_n_s32(rg, 1), gs); + int32x4_t b = vaddq_s32(vshlq_n_s32(bg, 1), gs); + int32x4_t g1 = vaddq_s32(gs, gd); + int32x4_t g2 = vsubq_s32(gs, gd); + + r = vmaxq_s32(vminq_s32(r, v_max), v_zero); + g1 = vmaxq_s32(vminq_s32(g1, v_max), v_zero); + g2 = vmaxq_s32(vminq_s32(g2, v_max), v_zero); + b = vmaxq_s32(vminq_s32(b, v_max), v_zero); + + // Extract to scalar for log curve lookups + store + int32_t r_a[4], g1_a[4], g2_a[4], b_a[4]; + vst1q_s32(r_a, r); + vst1q_s32(g1_a, g1); + vst1q_s32(g2_a, g2); + vst1q_s32(b_a, b); + + int k; + for (k = 0; k < 4; k++) + { + int32_t R = DecodeLogValue(r_a[k], log_bits); + int32_t G1 = DecodeLogValue(g1_a[k], log_bits); + int32_t G2 = DecodeLogValue(g2_a[k], log_bits); + int32_t B = DecodeLogValue(b_a[k], log_bits); + + if (!bypass) + { + R >>= shift; + G1 >>= shift; + G2 >>= shift; + B >>= shift; + } + + if (rggb_order) + { + output_row1_ptr[2 * (column + k) + 0] = (uint16_t)R; + output_row1_ptr[2 * (column + k) + 1] = (uint16_t)G1; + output_row2_ptr[2 * (column + k) + 0] = (uint16_t)G2; + output_row2_ptr[2 * (column + k) + 1] = (uint16_t)B; + } + else + { + output_row1_ptr[2 * (column + k) + 0] = (uint16_t)G1; + output_row1_ptr[2 * (column + k) + 1] = (uint16_t)B; + output_row2_ptr[2 * (column + k) + 0] = (uint16_t)R; + output_row2_ptr[2 * (column + k) + 1] = (uint16_t)G2; + } + } + } + } +#else + column = 0; +#endif + + // Scalar cleanup for remaining columns + for (; column < width; column++) { int32_t GS, RG, BG, GD; int32_t R, G1, G2, B; - + GS = GS_input_row_ptr[column]; RG = RG_input_row_ptr[column]; BG = BG_input_row_ptr[column]; GD = GD_input_row_ptr[column]; - - // Convert unsigned values to signed values + GD -= midpoint; RG -= midpoint; BG -= midpoint; - + R = (RG << 1) + GS; B = (BG << 1) + GS; G1 = GS + GD; G2 = GS - GD; - - R = clamp_uint(R, 12); - G1 = clamp_uint(G1, 12); - G2 = clamp_uint(G2, 12); - B = clamp_uint(B, 12); - - // Apply inverse protune log curve - R = 
DecoderLogCurve[R]; - B = DecoderLogCurve[B]; - G1 = DecoderLogCurve[G1]; - G2 = DecoderLogCurve[G2]; - - R >>= (16 - output_bit_depth); - B >>= (16 - output_bit_depth); - G1 >>= (16 - output_bit_depth); - G2 >>= (16 - output_bit_depth); - - switch (output_format) + + R = clamp_uint(R, log_bits); + G1 = clamp_uint(G1, log_bits); + G2 = clamp_uint(G2, log_bits); + B = clamp_uint(B, log_bits); + + R = DecodeLogValue(R, log_bits); + B = DecodeLogValue(B, log_bits); + G1 = DecodeLogValue(G1, log_bits); + G2 = DecodeLogValue(G2, log_bits); + + if (!vc5_logcurve_bypass()) + { + R >>= (16 - output_bit_depth); + B >>= (16 - output_bit_depth); + G1 >>= (16 - output_bit_depth); + G2 >>= (16 - output_bit_depth); + } + + if (rggb_order) + { + output_row1_ptr[2 * column + 0] = (uint16_t)R; + output_row1_ptr[2 * column + 1] = (uint16_t)G1; + output_row2_ptr[2 * column + 0] = (uint16_t)G2; + output_row2_ptr[2 * column + 1] = (uint16_t)B; + } + else { - case PIXEL_FORMAT_RAW_RGGB_12: - case PIXEL_FORMAT_RAW_RGGB_14: - output_row1_ptr[2 * column + 0] = (uint16_t)R; - output_row1_ptr[2 * column + 1] = (uint16_t)G1; - output_row2_ptr[2 * column + 0] = (uint16_t)G2; - output_row2_ptr[2 * column + 1] = (uint16_t)B; - break; - - case PIXEL_FORMAT_RAW_GBRG_12: - case PIXEL_FORMAT_RAW_GBRG_14: - output_row1_ptr[2 * column + 0] = (uint16_t)G1; - output_row1_ptr[2 * column + 1] = (uint16_t)B; - output_row2_ptr[2 * column + 0] = (uint16_t)R; - output_row2_ptr[2 * column + 1] = (uint16_t)G2; - break; - - default: - assert(0); - break; + output_row1_ptr[2 * column + 0] = (uint16_t)G1; + output_row1_ptr[2 * column + 1] = (uint16_t)B; + output_row2_ptr[2 * column + 0] = (uint16_t)R; + output_row2_ptr[2 * column + 1] = (uint16_t)G2; } } } diff --git a/source/lib/vc5_decoder/vc5_decoder.c b/source/lib/vc5_decoder/vc5_decoder.c index f36ad4cf..08791a02 100755 --- a/source/lib/vc5_decoder/vc5_decoder.c +++ b/source/lib/vc5_decoder/vc5_decoder.c @@ -78,6 +78,14 @@ CODEC_ERROR 
vc5_decoder_process(const vc5_decoder_parameters* decoding_paramet case VC5_DECODER_PIXEL_FORMAT_GBRG_14: parameters.output.format = PIXEL_FORMAT_RAW_GBRG_14; break; + + case VC5_DECODER_PIXEL_FORMAT_RGGB_16: + parameters.output.format = PIXEL_FORMAT_RAW_RGGB_16; + break; + + case VC5_DECODER_PIXEL_FORMAT_GBRG_16: + parameters.output.format = PIXEL_FORMAT_RAW_GBRG_16; + break; default: assert(0); diff --git a/source/lib/vc5_decoder/vc5_decoder.h b/source/lib/vc5_decoder/vc5_decoder.h index c87111a0..a7e2a7da 100755 --- a/source/lib/vc5_decoder/vc5_decoder.h +++ b/source/lib/vc5_decoder/vc5_decoder.h @@ -41,7 +41,11 @@ VC5_DECODER_PIXEL_FORMAT_GBRG_12 = 2, // GBRG 12bit pixels packed into 16bits VC5_DECODER_PIXEL_FORMAT_GBRG_14 = 3, // GBRG 12bit pixels packed into 16bits - + + VC5_DECODER_PIXEL_FORMAT_RGGB_16 = 4, // RGGB 16bit pixels packed into 16bits + + VC5_DECODER_PIXEL_FORMAT_GBRG_16 = 5, // GBRG 16bit pixels packed into 16bits + VC5_DECODER_PIXEL_FORMAT_DEFAULT = VC5_DECODER_PIXEL_FORMAT_RGGB_14, } VC5_DECODER_PIXEL_FORMAT; diff --git a/source/lib/vc5_decoder/vlc.c b/source/lib/vc5_decoder/vlc.c index 646fc59d..7d5b3ec6 100755 --- a/source/lib/vc5_decoder/vlc.c +++ b/source/lib/vc5_decoder/vlc.c @@ -17,6 +17,141 @@ */ #include "headers.h" +#include + +/* + Prefix lookup table for fast VLC decoding. + + Instead of linear-searching through 264 codebook entries per codeword, + peek the top VLC_PEEK_BITS bits from the bitstream buffer and use them + as an index into this table. Entries with bits > 0 are direct hits; + entries with bits == 0 require fallback to the original linear search. 
+*/ +#define VLC_PEEK_BITS 12 + +typedef struct { + uint8_t bits; /* Codeword length in bits (0 = not in table) */ + uint8_t value; /* Unsigned coefficient magnitude */ + uint16_t count; /* Run length */ +} VLC_LOOKUP; + +static VLC_LOOKUP vlc_table[1 << VLC_PEEK_BITS]; /* 4096 entries, 16KB */ +static pthread_once_t vlc_init_once = PTHREAD_ONCE_INIT; +static CODEBOOK *vlc_init_codebook = NULL; +static int vlc_fast_disabled = 0; + +/*! + @brief Build the prefix lookup table from a codebook + + For each codebook entry with codeword size <= VLC_PEEK_BITS, fill all + table slots whose top bits match the codeword. A codeword of N bits + maps to 2^(PEEK-N) table entries (the "don't care" suffix bits). +*/ +static void InitVLCTable(CODEBOOK *codebook) +{ + int codebook_length = codebook->length; + RLV *entries = (RLV *)((uint8_t *)codebook + sizeof(CODEBOOK)); + int table_size = 1 << VLC_PEEK_BITS; + + /* Clear the table (bits=0 means "not found") */ + memset(vlc_table, 0, sizeof(vlc_table)); + + for (int i = 0; i < codebook_length; i++) + { + int size = entries[i].size; + if (size < 1 || size > VLC_PEEK_BITS) + continue; + + uint32_t codeword = entries[i].bits; + int fill_count = 1 << (VLC_PEEK_BITS - size); + int base_index = (int)(codeword << (VLC_PEEK_BITS - size)); + + VLC_LOOKUP entry; + entry.bits = (uint8_t)size; + entry.value = (uint8_t)entries[i].value; + entry.count = (uint16_t)entries[i].count; + + for (int j = 0; j < fill_count; j++) + { + vlc_table[base_index + j] = entry; + } + } +} + +/*! + @brief pthread_once callback for thread-safe VLC table initialization +*/ +static void InitVLCTableOnce(void) +{ + InitVLCTable(vlc_init_codebook); + if (getenv("VLC_NO_FAST") != NULL) + vlc_fast_disabled = 1; +} + +/*! + @brief Fast combined RLV + sign bit decode using prefix lookup table + + Peeks VLC_PEEK_BITS bits from the bitstream buffer and does a single + table lookup. 
If the codeword is found and fits in the available bits, + consumes the bits and reads the sign bit inline. + + When the buffer is empty, refills before peeking. When the buffer has + fewer than VLC_PEEK_BITS bits, peeks anyway (zero-padded) and validates + that the found codeword fits within the available bits. Falls back to + GetRun() only when the codeword is genuinely longer than available bits + or not in the table. +*/ +CODEC_ERROR GetRunFast(BITSTREAM *stream, CODEBOOK *codebook, RUN *run) +{ + /* Thread-safe one-time initialization of the lookup table. + Note: assumes a single codebook is used throughout the decode session. */ + if (vlc_init_codebook == NULL) + vlc_init_codebook = codebook; + assert(vlc_init_codebook == codebook); + pthread_once(&vlc_init_once, InitVLCTableOnce); + + /* Bypass fast path if disabled via VLC_NO_FAST env var */ + if (vlc_fast_disabled) + return GetRun(stream, codebook, run); + + /* Refill if buffer is completely empty */ + if (stream->count == 0) + GetBuffer(stream); + + /* Peek top VLC_PEEK_BITS bits from buffer (zero-padded if count < PEEK). + The buffer is left-aligned in 32 bits; unused low bits are already 0. */ + uint32_t peek = stream->buffer >> (32 - VLC_PEEK_BITS); + VLC_LOOKUP *entry = &vlc_table[peek]; + + /* Validate: codeword must be in table AND fit within available bits. + When count < PEEK_BITS, the zero-padded peek may yield a wrong entry + for codewords longer than count, but we catch that with the size check. 
*/ + if (entry->bits == 0 || entry->bits > stream->count) + return GetRun(stream, codebook, run); + + /* Consume codeword bits from the buffer */ + stream->buffer <<= entry->bits; + stream->count -= entry->bits; + run->count = entry->count; + + /* Handle sign bit inline for non-zero magnitude values */ + int32_t value = (int32_t)entry->value; + if (value != 0) + { + /* Refill buffer if empty */ + if (stream->count == 0) + GetBuffer(stream); + + /* Read 1-bit sign: top bit of buffer */ + if (stream->buffer >> 31) + value = -value; + stream->buffer <<= 1; + stream->count--; + } + run->value = value; + + return CODEC_ERROR_OKAY; +} /*! @brief Parse a run length coded magnitude in the bitstream diff --git a/source/lib/vc5_decoder/vlc.h b/source/lib/vc5_decoder/vlc.h index 6ec41f69..9efcc315 100755 --- a/source/lib/vc5_decoder/vlc.h +++ b/source/lib/vc5_decoder/vlc.h @@ -95,6 +95,7 @@ extern "C" { CODEC_ERROR GetRlv(BITSTREAM *stream, CODEBOOK *codebook, RUN *run); CODEC_ERROR GetRun(BITSTREAM *stream, CODEBOOK *codebook, RUN *run); + CODEC_ERROR GetRunFast(BITSTREAM *stream, CODEBOOK *codebook, RUN *run); #ifdef __cplusplus } diff --git a/source/lib/vc5_decoder/wavelet.c b/source/lib/vc5_decoder/wavelet.c index a60eb0a9..d2c3b96e 100755 --- a/source/lib/vc5_decoder/wavelet.c +++ b/source/lib/vc5_decoder/wavelet.c @@ -53,19 +53,15 @@ CODEC_ERROR TransformInverseSpatialQuantLowpass(gpr_allocator *allocator, WAVELE assert(output->data[0] != NULL); // Check for valid quantization values - if (input->quant[0] == 0) { - input->quant[0] = 1; - } - - assert(input->quant[0] > 0); - assert(input->quant[1] > 0); - assert(input->quant[2] > 0); - assert(input->quant[3] > 0); + for (int i = 0; i < 4; ++i) + { + if (input->quant[i] == 0) + input->quant[i] = 1; + } - if (prescale > 1) + if (prescale > 1) { // This is a spatial transform for the lowpass temporal band - assert(prescale == 2); // Apply the inverse spatial transform for a lowpass band that is not prescaled 
InvertSpatialQuantDescale16s(allocator, @@ -78,10 +74,10 @@ CODEC_ERROR TransformInverseSpatialQuantLowpass(gpr_allocator *allocator, WAVELE output_width, output_height, prescale, input->quant); } - else - { - // This case does not handle any prescaling applied during encoding - assert(prescale == 0); + else + { + if (prescale != 0) + prescale = 0; // Apply the inverse spatial transform for a lowpass band that is not prescaled InvertSpatialQuant16s(allocator, @@ -125,21 +121,17 @@ CODEC_ERROR TransformInverseSpatialQuantArray(gpr_allocator *allocator, assert(input->data[3] != NULL); // Check for valid quantization values - if (input->quant[0] == 0) { - input->quant[0] = 1; - } - - assert(input->quant[0] > 0); - assert(input->quant[1] > 0); - assert(input->quant[2] > 0); - assert(input->quant[3] > 0); + for (int i = 0; i < 4; ++i) + { + if (input->quant[i] == 0) + input->quant[i] = 1; + } assert(output_width > 0 && output_height > 0 && output_pitch > 0 && output_buffer != NULL); - if (prescale > 1) + if (prescale > 1) { // This is a spatial transform for the lowpass temporal band - assert(prescale == 2); // Apply the inverse spatial transform for a lowpass band that is not prescaled InvertSpatialQuantDescale16s(allocator, @@ -152,10 +144,10 @@ CODEC_ERROR TransformInverseSpatialQuantArray(gpr_allocator *allocator, output_width, output_height, prescale, input->quant); } - else - { - // This case does not handle any prescaling applied during encoding - assert(prescale == 0); + else + { + if (prescale != 0) + prescale = 0; // Apply the inverse spatial transform for a lowpass band that is not prescaled InvertSpatialQuant16s(allocator, diff --git a/source/lib/vc5_encoder/CMakeLists.txt b/source/lib/vc5_encoder/CMakeLists.txt index 9dc143b7..9ee6b96c 100644 --- a/source/lib/vc5_encoder/CMakeLists.txt +++ b/source/lib/vc5_encoder/CMakeLists.txt @@ -15,7 +15,8 @@ include_directories( "../common/public" ) # library add_library( ${LIB_NAME} STATIC ${SRC_FILES} ${INC_FILES} ) 
-target_link_libraries( ${LIB_NAME} ) +find_package( Threads REQUIRED ) +target_link_libraries( ${LIB_NAME} Threads::Threads ) # set the folder where to place the projects set_target_properties( ${LIB_NAME} PROPERTIES FOLDER lib ) diff --git a/source/lib/vc5_encoder/encoder.c b/source/lib/vc5_encoder/encoder.c index 0701a4dd..88a37d75 100755 --- a/source/lib/vc5_encoder/encoder.c +++ b/source/lib/vc5_encoder/encoder.c @@ -36,6 +36,12 @@ #if ENABLED(NEON) #include #endif +#include +#include +#include +#include +#include + /*! @brief Align the bitstream to a byte boundary @@ -352,6 +358,7 @@ CODEC_ERROR EncodeImage(IMAGE *image, STREAM *stream, RGB_IMAGE *rgb_image, ENCO return error; } + // Initialize the bitstream data structure InitBitstream(&bitstream); @@ -738,19 +745,33 @@ CODEC_ERROR ImageUnpackingProcess(const PACKED_IMAGE *input, int bits_per_component; // The configuration of component arrays is determined by the image format - switch (input->format) - { + switch (input->format) + { case PIXEL_FORMAT_RAW_RGGB_12: case PIXEL_FORMAT_RAW_RGGB_12P: - case PIXEL_FORMAT_RAW_RGGB_14: case PIXEL_FORMAT_RAW_GBRG_12: case PIXEL_FORMAT_RAW_GBRG_12P: - case PIXEL_FORMAT_RAW_RGGB_16: channel_count = 4; max_channel_width = input->width / 2; max_channel_height = input->height / 2; bits_per_component = 12; break; + + case PIXEL_FORMAT_RAW_RGGB_14: + case PIXEL_FORMAT_RAW_GBRG_14: + channel_count = 4; + max_channel_width = input->width / 2; + max_channel_height = input->height / 2; + bits_per_component = 14; + break; + + case PIXEL_FORMAT_RAW_GBRG_16: + case PIXEL_FORMAT_RAW_RGGB_16: + channel_count = 4; + max_channel_width = input->width / 2; + max_channel_height = input->height / 2; + bits_per_component = 16; + break; default: assert(0); @@ -770,6 +791,10 @@ CODEC_ERROR ImageUnpackingProcess(const PACKED_IMAGE *input, UnpackImage_14(input, output, enabled_parts, true ); break; + case PIXEL_FORMAT_RAW_GBRG_14: + UnpackImage_14(input, output, enabled_parts, false ); + 
break; + case PIXEL_FORMAT_RAW_RGGB_12: UnpackImage_12(input, output, enabled_parts, true ); break; @@ -785,6 +810,14 @@ CODEC_ERROR ImageUnpackingProcess(const PACKED_IMAGE *input, case PIXEL_FORMAT_RAW_GBRG_12P: UnpackImage_12P(input, output, enabled_parts, false ); break; + + case PIXEL_FORMAT_RAW_RGGB_16: + UnpackImage_16(input, output, enabled_parts, true ); + break; + + case PIXEL_FORMAT_RAW_GBRG_16: + UnpackImage_16(input, output, enabled_parts, false ); + break; default: assert(0); @@ -1163,14 +1196,13 @@ static void ForwardWaveletTransformRecursive(RECURSIVE_TRANSFORM_DATA *transform if( input_row_index == 0 ) { int row; - + for (row = 0; row < ROW_BUFFER_COUNT; row++) { PIXEL *input_row_ptr = (PIXEL *)((uintptr_t)input_ptr + row * input_pitch); - + FilterHorizontalRow(input_row_ptr, lowpass_buffer[row], highpass_buffer[row], input_width, prescale); } - // Process the first row as a special case for the boundary condition FilterVerticalTopRow(lowpass_buffer, highpass_buffer, output_ptr, output_width, output_pitch, midpoints, multipliers, input_row_index ); input_row_index += 2; @@ -1277,37 +1309,40 @@ static void ForwardWaveletTransform(TRANSFORM *transform, const COMPONENT_ARRAY ForwardWaveletTransformRecursive( transform_data, 0, 0, 0xFFFF ); } -/*! - @brief Encode the portion of a sample that corresponds to a single layer +/*! Thread argument for parallel forward wavelet transforms */ +typedef struct { + TRANSFORM *transform; + const COMPONENT_ARRAY *input_component; + PIXEL *lowpass_buffer[MAX_WAVELET_COUNT][ROW_BUFFER_COUNT]; + PIXEL *highpass_buffer[MAX_WAVELET_COUNT][ROW_BUFFER_COUNT]; + int midpoint_prequant; +} FORWARD_THREAD_ARG; - Samples can be contain multiple subsamples. Each subsample may correspond to - a different view. For example, an encoded video sample may contain both the - left and right subsamples in a stereo pair. 
- - Subsamples have been called tracks or channels, but this terminology can be - confused with separate video tracks in a multimedia container or the color - planes that are called channels elsewhere in this codec. - - The subsamples are decoded seperately and composited to form a single frame - that is the output of the complete process of decoding a single video sample. - For this reason, the subsamples are called layers. +static void *ForwardTransformThread(void *arg) +{ + FORWARD_THREAD_ARG *a = (FORWARD_THREAD_ARG *)arg; + ForwardWaveletTransform(a->transform, a->input_component, + a->lowpass_buffer, a->highpass_buffer, + a->midpoint_prequant); + return NULL; +} - @todo Need to reset the codec state for each layer? +/*! + @brief Encode the portion of a sample that corresponds to a single layer */ //CODEC_ERROR EncodeLayer(ENCODER *encoder, void *buffer, size_t pitch, BITSTREAM *stream) CODEC_ERROR EncodeMultipleChannels(ENCODER *encoder, const UNPACKED_IMAGE *image, BITSTREAM *stream) { CODEC_ERROR error = CODEC_ERROR_OKAY; - + int channel_count; int channel_index; channel_count = encoder->channel_count; - + #if VC5_ENABLED_PART(VC5_PART_LAYERS) if (IsPartEnabled(encoder->enabled_parts, VC5_PART_LAYERS)) { - // Write the tag value pairs that preceed the encoded wavelet tree error = EncodeLayerHeader(encoder, stream); if (error != CODEC_ERROR_OKAY) { return error; @@ -1315,43 +1350,87 @@ CODEC_ERROR EncodeMultipleChannels(ENCODER *encoder, const UNPACKED_IMAGE *image } #endif - - CODEC_STATE *codec = &encoder->codec; - - // Compute the wavelet transform tree for each channel + CODEC_STATE *codec = &encoder->codec; + + /* Phase 1: Run forward wavelet transforms in parallel (one thread per channel) */ + { + gpr_allocator *allocator = encoder->allocator; + FORWARD_THREAD_ARG thread_args[MAX_CHANNEL_COUNT]; + pthread_t threads[MAX_CHANNEL_COUNT]; + int thread_created[MAX_CHANNEL_COUNT]; + + /* Allocate per-thread scratch buffers and set up thread args */ + for 
(channel_index = 0; channel_index < channel_count; channel_index++) + { + thread_args[channel_index].transform = &encoder->transform[channel_index]; + thread_args[channel_index].input_component = &image->component_array_list[channel_index]; + thread_args[channel_index].midpoint_prequant = encoder->midpoint_prequant; + + /* Allocate per-channel scratch buffers (same sizes as encoder's shared buffers) */ + int wavelet_index; + for (wavelet_index = 0; wavelet_index < MAX_WAVELET_COUNT; wavelet_index++) + { + int channel_width = encoder->transform[channel_index].wavelet[wavelet_index]->width; + int row; + for (row = 0; row < ROW_BUFFER_COUNT; row++) + { + PIXEL *buf = allocator->Alloc(channel_width * sizeof(PIXEL) * 2); + thread_args[channel_index].lowpass_buffer[wavelet_index][row] = buf; + thread_args[channel_index].highpass_buffer[wavelet_index][row] = buf ? buf + channel_width : NULL; + } + } + + thread_created[channel_index] = (pthread_create(&threads[channel_index], NULL, + ForwardTransformThread, + &thread_args[channel_index]) == 0); + if (!thread_created[channel_index]) + { + /* Fallback: run inline if thread creation fails */ + ForwardTransformThread(&thread_args[channel_index]); + } + } + + /* Wait for all transform threads to complete */ + for (channel_index = 0; channel_index < channel_count; channel_index++) + { + if (thread_created[channel_index]) + pthread_join(threads[channel_index], NULL); + + /* Free per-channel scratch buffers */ + int wavelet_index; + for (wavelet_index = 0; wavelet_index < MAX_WAVELET_COUNT; wavelet_index++) + { + int row; + for (row = 0; row < ROW_BUFFER_COUNT; row++) + allocator->Free(thread_args[channel_index].lowpass_buffer[wavelet_index][row]); + } + } + } + + /* Phase 2: Encode channels sequentially (bitstream is serial) */ for (channel_index = 0; channel_index < channel_count; channel_index++) { - int channel_number; - - ForwardWaveletTransform(&encoder->transform[channel_index], &image->component_array_list[channel_index], 
encoder->lowpass_buffer, encoder->highpass_buffer, encoder->midpoint_prequant ); + int channel_number = encoder->channel_order_table[channel_index]; - channel_number = encoder->channel_order_table[channel_index]; - - // Encode the tag value pairs in the header for this channel - error = EncodeChannelHeader(encoder, channel_number, stream); - if (error != CODEC_ERROR_OKAY) { - return error; - } - - // Encode the lowpass and highpass bands in the wavelet tree for this channel - error = EncodeChannelSubbands(encoder, channel_number, stream); - if (error != CODEC_ERROR_OKAY) { - return error; - } - - // Encode the tag value pairs in the trailer for this channel - error = EncodeChannelTrailer(encoder, channel_number, stream); - if (error != CODEC_ERROR_OKAY) { - return error; - } - - // Check that the bitstream is alligned to a segment boundary - assert(IsAlignedSegment(stream)); - - // Update the codec state for the next channel in the bitstream - //codec->channel_number++; - codec->channel_number = (channel_number + 1); - codec->subband_number = 0; + error = EncodeChannelHeader(encoder, channel_number, stream); + if (error != CODEC_ERROR_OKAY) { + return error; + } + + error = EncodeChannelSubbands(encoder, channel_number, stream); + if (error != CODEC_ERROR_OKAY) { + return error; + } + + error = EncodeChannelTrailer(encoder, channel_number, stream); + if (error != CODEC_ERROR_OKAY) { + return error; + } + + assert(IsAlignedSegment(stream)); + + codec->channel_number = (channel_number + 1); + codec->subband_number = 0; } #if VC5_ENABLED_PART(VC5_PART_LAYERS) @@ -1906,7 +1985,7 @@ CODEC_ERROR SetEncoderQuantization(ENCODER *encoder, int channel_count = encoder->channel_count; int channel_number; - const int quant_table_length = sizeof(parameters->quant_table)/sizeof(parameters->quant_table[0]); +const int quant_table_length = sizeof(parameters->quant_table)/sizeof(parameters->quant_table[0]); // Set the midpoint prequant parameter encoder->midpoint_prequant = 2; @@ 
-1914,7 +1993,31 @@ CODEC_ERROR SetEncoderQuantization(ENCODER *encoder, // Set the quantization table in each channel for (channel_number = 0; channel_number < channel_count; channel_number++) { - SetTransformQuantTable(encoder, channel_number, parameters->quant_table, quant_table_length); + QUANT scaled_table[MAX_SUBBAND_COUNT]; + memcpy(scaled_table, parameters->quant_table, sizeof(scaled_table)); + + PRECISION bits = encoder->channel[channel_number].bits_per_component; + double scale = 1.0; + if (bits > 12) + { + scale = 12.0 / (double)bits; + } + + if (scale != 1.0) + { + int i; + /* Start at index 1: index 0 is the lowpass band which must + always keep quant=1 for correct reconstruction. */ + for (i = 1; i < quant_table_length; ++i) + { + int scaled = (int)lrint((double)scaled_table[i] * scale); + if (scaled < 1) scaled = 1; + scaled_table[i] = (QUANT)scaled; + } + } + + encoder->midpoint_prequant = (bits >= 15) ? 3 : 2; + SetTransformQuantTable(encoder, channel_number, scaled_table, quant_table_length); } return CODEC_ERROR_OKAY; @@ -2110,15 +2213,16 @@ CODEC_ERROR EncodeLowpassBand(ENCODER *encoder, WAVELET *wavelet, int channel_nu for (row = 0; row < height; row++) { - uint16_t *lowpass = (uint16_t *)lowpass_row_ptr; + PIXEL *lowpass = (PIXEL *)lowpass_row_ptr; int column; for (column = 0; column < width; column++) { - BITWORD coefficient = lowpass[column]; - //assert(0 <= lowpass[column] && lowpass[column] <= COEFFICIENT_MAX); - assert(lowpass[column] <= COEFFICIENT_MAX); - assert(coefficient <= COEFFICIENT_MAX); + // Clamp the 32-bit pixel value to 16-bit range for storage + int32_t value = lowpass[column]; + if (value < 0) value = 0; + if (value > UINT16_MAX) value = UINT16_MAX; + BITWORD coefficient = (BITWORD)value; PutBits(stream, coefficient, lowpass_precision); } @@ -2552,4 +2656,3 @@ CODEC_ERROR PutVideoLowpassHeader(ENCODER *encoder, int channel_number, BITSTREA return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_encoder/forward.c 
b/source/lib/vc5_encoder/forward.c index ab54dd89..541e5a07 100755 --- a/source/lib/vc5_encoder/forward.c +++ b/source/lib/vc5_encoder/forward.c @@ -25,12 +25,12 @@ //! Rounding added to the highpass sum before division static const int32_t rounding = 4; -STATIC_INLINE PIXEL QuantizeValue(int16_t value, int32_t midpoint, int32_t multiplier ) +STATIC_INLINE PIXEL QuantizeValue(int32_t value, int32_t midpoint, int32_t multiplier ) { int less_than_zero = 0; int negate_less_than_zero = 0; - int16_t x = abs(value) + midpoint; + int32_t x = abs(value) + midpoint; if( value < 0 ) { @@ -38,7 +38,7 @@ STATIC_INLINE PIXEL QuantizeValue(int16_t value, int32_t midpoint, int32_t multi negate_less_than_zero = 1; } - x = (int32_t)(x * multiplier) >> 16; + x = ((int64_t)x * multiplier) >> 16; x = x ^ less_than_zero; x = x + negate_less_than_zero; @@ -46,7 +46,7 @@ STATIC_INLINE PIXEL QuantizeValue(int16_t value, int32_t midpoint, int32_t multi return ClampPixel(x); } -static void FilterVerticalTopBottom_Core_8x_C_(PIXEL *coefficients[], int column, int16_t* highpass, int16_t* lowpass, bool top ) +static void FilterVerticalTopBottom_Core_8x_C_(PIXEL *coefficients[], int column, PIXEL* highpass, PIXEL* lowpass, bool top ) { const int filter_coeffs_top[] = { 5, -11, 4, 4, -1, -1 }; const int filter_coeffs_bottom[] = { 1, 1, -4, -4, 11, -5 }; @@ -78,196 +78,242 @@ static void FilterVerticalTopBottom_Core_8x_C_(PIXEL *coefficients[], int column #if ENABLED(NEON) -static const uint16x8_t mask = {0x0000, 0xFFFF,0x0000,0xFFFF,0x0000,0xFFFF,0x0000, 0xFFFF}; - #define HorizontalFilter_Prescale2_4x HorizontalFilter_Prescale2_4x_NEON_ void HorizontalFilter_Prescale2_4x_NEON_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) { const int prescale_rounding = 3; - const int prescale = 2; - - int32x4_t __pairwise_sum_0_7, __highpass; - int32x4_t __diff; - int16x8_t __input_2_9; - - { - const int16x8_t __prescale_rounding = vdupq_n_s16 (prescale_rounding); - const int16x8_t __shift = vdupq_n_s16 
(-prescale); - - int16x8_t __input_0_7 = vld1q_s16( input ); - __input_2_9 = vld1q_s16( input + 2 ); - int16x8_t __input_8_15 = vld1q_s16( input + 8 ); - - __input_0_7 = vaddq_s16( __input_0_7, __prescale_rounding ); - __input_0_7 = vshlq_s16( __input_0_7, __shift ); - - __input_8_15 = vaddq_s16( __input_8_15, __prescale_rounding ); - __input_8_15 = vshlq_s16( __input_8_15, __shift ); - - __pairwise_sum_0_7 = vpaddlq_s16(__input_0_7); - int32x4_t __pairwise_sum_8_15 = vpaddlq_s16(__input_8_15); - - __input_0_7 = vbslq_s16(mask, vnegq_s16(__input_0_7), __input_0_7); - __input_8_15 = vbslq_s16(mask, vnegq_s16(__input_8_15), __input_8_15); - __diff = vextq_s32(vpaddlq_s16( __input_0_7 ), vpaddlq_s16( __input_8_15 ), 1); - - __highpass = vcombine_s32( vget_high_s32(__pairwise_sum_0_7), vget_low_s32(__pairwise_sum_8_15) ); - } - - // High pass band - { - const int32x4_t __rounding = vdupq_n_s32(rounding); + int32_t scaled[12]; + + for (int i = 0; i < 12; ++i) + { + scaled[i] = (input[i] + prescale_rounding) >> 2; + } + + int32_t sum_prev_vals[4] = { + scaled[0] + scaled[1], + scaled[2] + scaled[3], + scaled[4] + scaled[5], + scaled[6] + scaled[7] + }; + + int32_t sum_next_vals[4] = { + scaled[4] + scaled[5], + scaled[6] + scaled[7], + scaled[8] + scaled[9], + scaled[10] + scaled[11] + }; + + int32_t diff_vals[4] = { + scaled[2] - scaled[3], + scaled[4] - scaled[5], + scaled[6] - scaled[7], + scaled[8] - scaled[9] + }; + + int32x4_t sum_prev = vld1q_s32(sum_prev_vals); + int32x4_t sum_next = vld1q_s32(sum_next_vals); + int32x4_t diffs = vld1q_s32(diff_vals); + + int32x4_t hp = vsubq_s32(sum_next, sum_prev); + hp = vaddq_s32(hp, vdupq_n_s32(rounding)); + hp = vshrq_n_s32(hp, 3); + hp = vaddq_s32(hp, diffs); + vst1q_s32(highpass, hp); + + int32_t low_vals[4] = { + (input[2] + input[3] + prescale_rounding) >> 2, + (input[4] + input[5] + prescale_rounding) >> 2, + (input[6] + input[7] + prescale_rounding) >> 2, + (input[8] + input[9] + prescale_rounding) >> 2 + }; + 
vst1q_s32(lowpass, vld1q_s32(low_vals)); +} - __highpass = vsubq_s32( __highpass, __pairwise_sum_0_7 ); - __highpass = vaddq_s32( __highpass, __rounding ); - __highpass = vshrq_n_s32( __highpass, 3 ); - __highpass = vqaddq_s32( __highpass, __diff ); // Dont need to clamp because we are using saturating instruction - - vst1_s16(highpass, vmovn_s32(__highpass) ); - } - - // Low pass band - { - const int32x4_t __prescale_rounding = vdupq_n_s32(prescale_rounding); - const int32x4_t __shift = vdupq_n_s32(-prescale); - - int32x4_t __pairwise_sum_2_9 = vpaddlq_s16(__input_2_9); - - __pairwise_sum_2_9 = vaddq_s32(__pairwise_sum_2_9, __prescale_rounding); - __pairwise_sum_2_9 = vshlq_s32(__pairwise_sum_2_9, __shift); - - vst1_s16(lowpass, vmovn_s32(__pairwise_sum_2_9) ); - } +#define HorizontalFilter_Prescale3_4x HorizontalFilter_Prescale3_4x_NEON_ +void HorizontalFilter_Prescale3_4x_NEON_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) +{ + const int prescale_rounding = 7; + int32_t scaled[12]; + + for (int i = 0; i < 12; ++i) + { + scaled[i] = (input[i] + prescale_rounding) >> 3; + } + + int32_t sum_prev_vals[4] = { + scaled[0] + scaled[1], + scaled[2] + scaled[3], + scaled[4] + scaled[5], + scaled[6] + scaled[7] + }; + + int32_t sum_next_vals[4] = { + scaled[4] + scaled[5], + scaled[6] + scaled[7], + scaled[8] + scaled[9], + scaled[10] + scaled[11] + }; + + int32_t diff_vals[4] = { + scaled[2] - scaled[3], + scaled[4] - scaled[5], + scaled[6] - scaled[7], + scaled[8] - scaled[9] + }; + + int32x4_t sum_prev = vld1q_s32(sum_prev_vals); + int32x4_t sum_next = vld1q_s32(sum_next_vals); + int32x4_t diffs = vld1q_s32(diff_vals); + + int32x4_t hp = vsubq_s32(sum_next, sum_prev); + hp = vaddq_s32(hp, vdupq_n_s32(rounding)); + hp = vshrq_n_s32(hp, 3); + hp = vaddq_s32(hp, diffs); + vst1q_s32(highpass, hp); + + int32_t low_vals[4] = { + (input[2] + input[3] + prescale_rounding) >> 3, + (input[4] + input[5] + prescale_rounding) >> 3, + (input[6] + input[7] + prescale_rounding) >> 
3, + (input[8] + input[9] + prescale_rounding) >> 3 + }; + vst1q_s32(lowpass, vld1q_s32(low_vals)); } #define HorizontalFilter_Prescale0_4x HorizontalFilter_Prescale0_4x_NEON_ void HorizontalFilter_Prescale0_4x_NEON_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) { - int32x4_t __pairwise_sum_0_7, __highpass; - int32x4_t __diff; - int16x8_t __input_2_9; - - { - int16x8_t __input_0_7 = vld1q_s16( input ); - __input_2_9 = vld1q_s16( input + 2 ); - int16x8_t __input_8_15 = vld1q_s16( input + 8 ); - - __pairwise_sum_0_7 = vpaddlq_s16(__input_0_7); - int32x4_t __pairwise_sum_8_15 = vpaddlq_s16(__input_8_15); - - __input_0_7 = vbslq_s16(mask, vnegq_s16(__input_0_7), __input_0_7); - __input_8_15 = vbslq_s16(mask, vnegq_s16(__input_8_15), __input_8_15); - __diff = vextq_s32(vpaddlq_s16( __input_0_7 ), vpaddlq_s16( __input_8_15 ), 1); - - __highpass = vcombine_s32( vget_high_s32(__pairwise_sum_0_7), vget_low_s32(__pairwise_sum_8_15) ); - } - - // High pass band - { - const int32x4_t __rounding = vdupq_n_s32(rounding); - - __highpass = vsubq_s32( __highpass, __pairwise_sum_0_7 ); - __highpass = vaddq_s32( __highpass, __rounding ); - __highpass = vshrq_n_s32( __highpass, 3 ); - __highpass = vqaddq_s32( __highpass, __diff ); // Dont need to clamp because we are using saturating instruction - - vst1_s16(highpass, vmovn_s32(__highpass) ); - } - - // Low pass band - { - int32x4_t __pairwise_sum_2_9 = vpaddlq_s16(__input_2_9); - - vst1_s16(lowpass, vmovn_s32(__pairwise_sum_2_9) ); - } + int32_t sum_prev_vals[4] = { + input[0] + input[1], + input[2] + input[3], + input[4] + input[5], + input[6] + input[7] + }; + + int32_t sum_next_vals[4] = { + input[4] + input[5], + input[6] + input[7], + input[8] + input[9], + input[10] + input[11] + }; + + int32_t diff_vals[4] = { + input[2] - input[3], + input[4] - input[5], + input[6] - input[7], + input[8] - input[9] + }; + + int32x4_t sum_prev = vld1q_s32(sum_prev_vals); + int32x4_t sum_next = vld1q_s32(sum_next_vals); + int32x4_t diffs = 
vld1q_s32(diff_vals); + + int32x4_t hp = vsubq_s32(sum_next, sum_prev); + hp = vaddq_s32(hp, vdupq_n_s32(rounding)); + hp = vshrq_n_s32(hp, 3); + hp = vaddq_s32(hp, diffs); + vst1q_s32(highpass, hp); + + int32_t low_vals[4] = { + input[2] + input[3], + input[4] + input[5], + input[6] + input[7], + input[8] + input[9] + }; + vst1q_s32(lowpass, vld1q_s32(low_vals)); } -void QuantizeBand_8x_NEON_(int16_t* wavelet_band, int16_t midpoint, int32_t multiplier, PIXEL *output ) +static INLINE int32x4_t QuantizeVector(int32x4_t values, int32_t midpoint, int32_t multiplier) { - int16x8_t __wavelet_band = vld1q_s16( wavelet_band ); + const int32x4_t midpoint_vec = vdupq_n_s32(midpoint); + int32x4_t abs_vals = vabsq_s32(values); + abs_vals = vaddq_s32(abs_vals, midpoint_vec); - int16x8_t __wavelet_band_abs = vaddq_s16( vabsq_s16(__wavelet_band), vdupq_n_s16( midpoint ) ); + const int32x2_t multiplier_vec = vdup_n_s32(multiplier); + int64x2_t prod_low = vmull_s32(vget_low_s32(abs_vals), multiplier_vec); + int64x2_t prod_high = vmull_s32(vget_high_s32(abs_vals), multiplier_vec); - int32x4_t __multipliers = vdupq_n_s32( multiplier ); + prod_low = vshrq_n_s64(prod_low, 16); + prod_high = vshrq_n_s64(prod_high, 16); - int32x4_t __value_high = vmovl_s16( vget_high_s16(__wavelet_band_abs) ); - __value_high = vmulq_s32( __value_high, __multipliers ); - - int32x4_t __value_low = vmovl_s16( vget_low_s16(__wavelet_band_abs) ); - __value_low = vmulq_s32( __value_low, __multipliers ); + int32x4_t scaled = vcombine_s32(vmovn_s64(prod_low), vmovn_s64(prod_high)); - int16x8_t __multiplied = vcombine_s16( vshrn_n_s32( __value_low, 16 ), vshrn_n_s32( __value_high, 16 ) ); + const int32x4_t zero = vdupq_n_s32(0); + uint32x4_t neg_mask = vcltq_s32(values, zero); + int32x4_t negated = vnegq_s32(scaled); + return vbslq_s32(neg_mask, negated, scaled); +} - uint16x8_t mask = vcltq_s16(__wavelet_band, vdupq_n_s16(0) ); - int16x8_t __neg_output = vnegq_s16(__multiplied); +void 
QuantizeBand_8x_NEON_(const PIXEL* wavelet_band, int32_t midpoint, int32_t multiplier, PIXEL *output ) +{ + int32x4_t v0 = vld1q_s32(wavelet_band); + int32x4_t v1 = vld1q_s32(wavelet_band + 4); + int32x4_t q0 = QuantizeVector(v0, midpoint, multiplier); + int32x4_t q1 = QuantizeVector(v1, midpoint, multiplier); + vst1q_s32(output, q0); + vst1q_s32(output + 4, q1); +} + +static INLINE void FilterVerticalCoreBlock(int32x4_t row0, + int32x4_t row1, + int32x4_t row2, + int32x4_t row3, + int32x4_t row4, + int32x4_t row5, + PIXEL *high_dest, + PIXEL *low_dest) +{ + int32x4_t high = vsubq_s32(row5, row0); + high = vaddq_s32(high, vsubq_s32(row4, row1)); + high = vaddq_s32(high, vdupq_n_s32(rounding)); + high = vshrq_n_s32(high, 3); + high = vaddq_s32(high, vsubq_s32(row2, row3)); - int16x8_t __result = vbslq_s16( mask, __neg_output, __multiplied ); + int32x4_t low = vaddq_s32(row2, row3); - vst1q_s16(output, __result); + vst1q_s32(high_dest, high); + vst1q_s32(low_dest, low); } -void FilterVerticalMiddle_Core_8x_NEON_(PIXEL *coefficients[], int column, int16_t* highpass, int16_t* lowpass ) +void FilterVerticalMiddle_Core_8x_NEON_(PIXEL *coefficients[], int column, PIXEL* highpass, PIXEL* lowpass ) { - int16x8_t __highpass, __highpass_50, __highpass_14; - - { - int16x8_t __row_0 = vld1q_s16( &coefficients[0][column] ); - int16x8_t __row_5 = vld1q_s16( &coefficients[5][column] ); - - __highpass_50 = vsubq_s16( __row_5, __row_0 ); - } - - { - int16x8_t __row_1 = vld1q_s16( &coefficients[1][column] ); - int16x8_t __row_4 = vld1q_s16( &coefficients[4][column] ); - - __highpass_14 = vsubq_s16( __row_4, __row_1 ); - } - - { - int16x8_t __rounding = vdupq_n_s16 (rounding); - - __highpass = vaddq_s16( __highpass_50, __highpass_14 ); - __highpass = vaddq_s16( __highpass, __rounding ); - __highpass = vshrq_n_s16(__highpass, 3); - } - - { - int16x8_t __row_2 = vld1q_s16( &coefficients[2][column] ); - int16x8_t __row_3 = vld1q_s16( &coefficients[3][column] ); - - int16x8_t __diff_23 = 
vsubq_s16( __row_2, __row_3 ); - int16x8_t __sum_23 = vaddq_s16( __row_2, __row_3 ); - - __highpass = vaddq_s16( __highpass, __diff_23 ); - - vst1q_s16(lowpass, __sum_23); - } - - vst1q_s16(highpass, __highpass); - + int32x4_t row0_lo = vld1q_s32(&coefficients[0][column]); + int32x4_t row0_hi = vld1q_s32(&coefficients[0][column + 4]); + int32x4_t row1_lo = vld1q_s32(&coefficients[1][column]); + int32x4_t row1_hi = vld1q_s32(&coefficients[1][column + 4]); + int32x4_t row2_lo = vld1q_s32(&coefficients[2][column]); + int32x4_t row2_hi = vld1q_s32(&coefficients[2][column + 4]); + int32x4_t row3_lo = vld1q_s32(&coefficients[3][column]); + int32x4_t row3_hi = vld1q_s32(&coefficients[3][column + 4]); + int32x4_t row4_lo = vld1q_s32(&coefficients[4][column]); + int32x4_t row4_hi = vld1q_s32(&coefficients[4][column + 4]); + int32x4_t row5_lo = vld1q_s32(&coefficients[5][column]); + int32x4_t row5_hi = vld1q_s32(&coefficients[5][column + 4]); + + FilterVerticalCoreBlock(row0_lo, row1_lo, row2_lo, row3_lo, row4_lo, row5_lo, highpass, lowpass); + FilterVerticalCoreBlock(row0_hi, row1_hi, row2_hi, row3_hi, row4_hi, row5_hi, highpass + 4, lowpass + 4); } #define FilterVerticalMiddle_8x FilterVerticalMiddle_8x_NEON_ void FilterVerticalMiddle_8x_NEON_(PIXEL *lowpass[], PIXEL *highpass[], int column, int32_t* midpoints, int32_t* multipliers, PIXEL *result[]) { - int16_t LOW[8]; - int16_t HIGH[8]; - + PIXEL LOW[8]; + PIXEL HIGH[8]; + FilterVerticalMiddle_Core_8x_NEON_( highpass, column, HIGH, LOW); QuantizeBand_8x_NEON_( LOW, midpoints[LH_BAND], multipliers[LH_BAND], result[LH_BAND] + column ); QuantizeBand_8x_NEON_( HIGH, midpoints[HH_BAND], multipliers[HH_BAND], result[HH_BAND] + column ); FilterVerticalMiddle_Core_8x_NEON_( lowpass, column, HIGH, LOW); QuantizeBand_8x_NEON_( LOW, midpoints[LL_BAND], multipliers[LL_BAND], result[LL_BAND] + column ); - QuantizeBand_8x_NEON_( HIGH, midpoints[HL_BAND], multipliers[HL_BAND], result[HL_BAND] + column ); + QuantizeBand_8x_NEON_( HIGH, 
midpoints[HL_BAND], multipliers[HL_BAND], result[HL_BAND] + column ); } #define FilterVerticalTopBottom_8x FilterVerticalTopBottom_8x_NEON_ void FilterVerticalTopBottom_8x_NEON_(PIXEL *lowpass[], PIXEL *highpass[], int column, int32_t* midpoints, int32_t* multipliers, PIXEL *result[], bool top ) { - int16_t LOW[8]; - int16_t HIGH[8]; + PIXEL LOW[8]; + PIXEL HIGH[8]; FilterVerticalTopBottom_Core_8x_C_( highpass, column, HIGH, LOW, top ); QuantizeBand_8x_NEON_( LOW, midpoints[LH_BAND], multipliers[LH_BAND], result[LH_BAND] + column ); @@ -278,6 +324,7 @@ void FilterVerticalTopBottom_8x_NEON_(PIXEL *lowpass[], PIXEL *highpass[], int c QuantizeBand_8x_NEON_( HIGH, midpoints[HL_BAND], multipliers[HL_BAND], result[HL_BAND] + column ); } + #else #define HorizontalFilter_Prescale2_4x HorizontalFilter_Prescale2_4x_C_ @@ -348,6 +395,66 @@ void HorizontalFilter_Prescale2_4x_C_(PIXEL *input, PIXEL* lowpass, PIXEL* highp lowpass[3] = (input[8] + input[9] + 3) >> 2; } +#define HorizontalFilter_Prescale3_4x HorizontalFilter_Prescale3_4x_C_ +void HorizontalFilter_Prescale3_4x_C_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) +{ + int i; + PIXEL input_c[16]; + for ( i = 0; i < 12; i++) + { + input_c[i] = (input[i] + 7) >> 3; + } + + int32_t diff_23 = input_c[2] - input_c[3]; + int32_t diff_45 = input_c[4] - input_c[5]; + int32_t diff_67 = input_c[6] - input_c[7]; + int32_t diff_89 = input_c[8] - input_c[9]; + + int32_t sum_01 = input_c[0] + input_c[1]; + int32_t sum_23 = input_c[2] + input_c[3]; + int32_t sum_45 = input_c[4] + input_c[5]; + int32_t sum_67 = input_c[6] + input_c[7]; + int32_t sum_89 = input_c[8] + input_c[9]; + int32_t sum_1011 = input_c[10] + input_c[11]; + + { + int32_t sum = sum_45 - sum_01; + sum += rounding; + sum = DivideByShift(sum, 3); + sum += diff_23; + highpass[0] = ClampPixel(sum); + } + + { + int32_t sum = sum_67 - sum_23; + sum += rounding; + sum = DivideByShift(sum, 3); + sum += diff_45; + highpass[1] = ClampPixel(sum); + } + + { + int32_t sum = 
sum_89 - sum_45; + sum += rounding; + sum = DivideByShift(sum, 3); + sum += diff_67; + highpass[2] = ClampPixel(sum); + } + + { + int32_t sum = sum_1011 - sum_67; + sum += rounding; + sum = DivideByShift(sum, 3); + sum += diff_89; + highpass[3] = ClampPixel(sum); + } + + lowpass[0] = (input[2] + input[3] + 7) >> 3; + lowpass[1] = (input[4] + input[5] + 7) >> 3; + lowpass[2] = (input[6] + input[7] + 7) >> 3; + lowpass[3] = (input[8] + input[9] + 7) >> 3; +} + #define HorizontalFilter_Prescale0_4x HorizontalFilter_Prescale0_4x_C_ void HorizontalFilter_Prescale0_4x_C_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) { @@ -413,7 +520,7 @@ void HorizontalFilter_Prescale0_4x_C_(PIXEL *input, PIXEL* lowpass, PIXEL* highp lowpass[3] = input[8] + input[9]; } -void FilterVerticalMiddle_Core_8x_C_(PIXEL *coefficients[], int column, int16_t* highpass, int16_t* lowpass ) +void FilterVerticalMiddle_Core_8x_C_(PIXEL *coefficients[], int column, PIXEL* highpass, PIXEL* lowpass ) { PIXEL row_0[8]; PIXEL row_1[8]; @@ -456,10 +563,10 @@ void FilterVerticalMiddle_8x_C_(PIXEL *lowpass[], PIXEL *highpass[], int column, { int i; - int16_t LL[8]; - int16_t HL[8]; - int16_t LH[8]; - int16_t HH[8]; + PIXEL LL[8]; + PIXEL HL[8]; + PIXEL LH[8]; + PIXEL HH[8]; FilterVerticalMiddle_Core_8x_C_( highpass, column, HH, LH); FilterVerticalMiddle_Core_8x_C_( lowpass, column, HL, LL); @@ -478,10 +585,10 @@ void FilterVerticalTopBottom_8x_C_(PIXEL *lowpass[], PIXEL *highpass[], int colu { int i; - int16_t LL[8]; - int16_t HL[8]; - int16_t LH[8]; - int16_t HH[8]; + PIXEL LL[8]; + PIXEL HL[8]; + PIXEL LH[8]; + PIXEL HH[8]; FilterVerticalTopBottom_Core_8x_C_( highpass, column, HH, LH, top ); FilterVerticalTopBottom_Core_8x_C_( lowpass, column, HL, LL, top ); @@ -563,8 +670,12 @@ CODEC_ERROR FilterHorizontalRow(PIXEL *input, PIXEL *lowpass, PIXEL *highpass, i int last_input_column_tight = ( (last_input_column - 4) / 8) * 8; - //TODO Test this routine with other prescale values - assert(prescale == 0 || 
prescale == 2); + // Clamp prescale to supported values (0, 2, or 3) + if (prescale != 0 && prescale != 2 && prescale != 3) + { + prescale = (prescale > 3) ? 3 : ((prescale > 0) ? 2 : 0); + prescale_rounding = (1 << prescale) - 1; + } /***** Process the left border using the formula for boundary conditions *****/ @@ -581,8 +692,7 @@ CODEC_ERROR FilterHorizontalRow(PIXEL *input, PIXEL *lowpass, PIXEL *highpass, i /***** Process the internal pixels using the normal wavelet formula *****/ for (; column < last_input_column_tight; column += 8) // { - // Column index should always be divisible by two - assert((column % 2) == 0); + if (column & 1) column--; HorizontalFilter_Prescale2_4x( input + column - 2, &lowpass[column/2], &highpass[column/2] ); } @@ -592,21 +702,25 @@ CODEC_ERROR FilterHorizontalRow(PIXEL *input, PIXEL *lowpass, PIXEL *highpass, i /***** Process the internal pixels using the normal wavelet formula *****/ for (; column < last_input_column_tight; column += 8) // { - // Column index should always be divisible by two - assert((column % 2) == 0); + if (column & 1) column--; HorizontalFilter_Prescale0_4x( input + column - 2, &lowpass[column/2], &highpass[column/2] ); } } - else + else if( prescale == 3 ) { - assert(0); + /***** Process the internal pixels using the normal wavelet formula *****/ + for (; column < last_input_column_tight; column += 8) + { + if (column & 1) column--; + + HorizontalFilter_Prescale3_4x( input + column - 2, &lowpass[column/2], &highpass[column/2] ); + } } for (; column < last_input_column; column += 2) { - // Column index should always be divisible by two - assert((column % 2) == 0); + if (column & 1) column--; // Compute the lowpass coefficient lowpass[column/2] = (input[column + 0] + input[column + 1] + prescale_rounding) >> prescale; @@ -634,8 +748,7 @@ CODEC_ERROR FilterHorizontalRow(PIXEL *input, PIXEL *lowpass, PIXEL *highpass, i } } - // Should have exited the loop at the last column - assert(column == last_input_column); 
+ column = last_input_column; /***** Process the right border using the formula for boundary conditions *****/ @@ -716,7 +829,7 @@ static void FilterVerticalTopBottom_1x(PIXEL *lowpass[], PIXEL *highpass[], int // Apply the lowpass vertical filter to the lowpass horizontal results result[LL_BAND][column] = QuantizeValue( lowpass[low_band_index + 0][column] + lowpass[low_band_index + 1][column], midpoints[LL_BAND], multipliers[LL_BAND] ); - + // Apply the lowpass vertical filter to the highpass horizontal results result[LH_BAND][column] = QuantizeValue( highpass[low_band_index + 0][column] + highpass[low_band_index + 1][column], midpoints[LH_BAND], multipliers[LH_BAND] ); @@ -757,16 +870,18 @@ CODEC_ERROR FilterVerticalTopRow(PIXEL **lowpass, PIXEL **highpass, PIXEL **outp { FilterVerticalTopBottom_8x( lowpass, highpass, column, midpoints, multipliers, output, true ); - assert(output[LL_BAND][column] >= 0); + if (output[LL_BAND][column] < 0) + output[LL_BAND][column] = 0; } for (; column < wavelet_width; column++) { FilterVerticalTopBottom_1x( lowpass, highpass, column, midpoints, multipliers, output, true ); - - assert(output[LL_BAND][column] >= 0); + + if (output[LL_BAND][column] < 0) + output[LL_BAND][column] = 0; } - + return CODEC_ERROR_OKAY; } @@ -795,14 +910,16 @@ CODEC_ERROR FilterVerticalBottomRow(PIXEL **lowpass, PIXEL **highpass, PIXEL **o { FilterVerticalTopBottom_8x( lowpass, highpass, column, midpoints, multipliers, result, false ); - assert(result[LL_BAND][column] >= 0); + if (result[LL_BAND][column] < 0) + result[LL_BAND][column] = 0; } for (; column < wavelet_width; column++) { FilterVerticalTopBottom_1x( lowpass, highpass, column, midpoints, multipliers, result, false ); - assert(result[LL_BAND][column] >= 0); + if (result[LL_BAND][column] < 0) + result[LL_BAND][column] = 0; } return CODEC_ERROR_OKAY; @@ -834,18 +951,24 @@ CODEC_ERROR FilterVerticalMiddleRow(PIXEL **lowpass, PIXEL **highpass, PIXEL **o result[band] = (PIXEL *)band_row_ptr; } - for 
(; column < wavelet_width_m8; column += 8) - { - FilterVerticalMiddle_8x(lowpass, highpass, column, midpoints, multipliers, result); - } + for (; column < wavelet_width_m8; column += 8) + { + FilterVerticalMiddle_8x(lowpass, highpass, column, midpoints, multipliers, result); - for (; column < wavelet_width; column += 1) - { - FilterVerticalMiddle_1x(lowpass, highpass, column, midpoints, multipliers, result); + for (int k = 0; k < 8; ++k) + { + if (result[LL_BAND][column + k] < 0) + result[LL_BAND][column + k] = 0; + } + } + + for (; column < wavelet_width; column += 1) + { + FilterVerticalMiddle_1x(lowpass, highpass, column, midpoints, multipliers, result); + + if (result[LL_BAND][column] < 0) + result[LL_BAND][column] = 0; + } - assert(result[LL_BAND][column] >= 0); - } - return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_encoder/parameters.c b/source/lib/vc5_encoder/parameters.c index 2158a717..9670766b 100755 --- a/source/lib/vc5_encoder/parameters.c +++ b/source/lib/vc5_encoder/parameters.c @@ -83,4 +83,3 @@ CODEC_ERROR InitEncoderParameters(ENCODER_PARAMETERS *parameters) return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_encoder/raw.c b/source/lib/vc5_encoder/raw.c index cc1a3020..af9f8d38 100755 --- a/source/lib/vc5_encoder/raw.c +++ b/source/lib/vc5_encoder/raw.c @@ -18,8 +18,27 @@ #include "headers.h" +#include "logcurve.h" + +static INLINE uint16_t ApplyEncoderCurve(const uint16_t *table, uint16_t sample) +{ + if (vc5_logcurve_bypass()) + { + return sample; + } + return table[sample]; +} + #if ENABLED(NEON) #include + +static INLINE void StoreVectorToPixels(int16x8_t vec, PIXEL *dst) +{ + const int32x4_t lo = vmovl_s16(vget_low_s16(vec)); + const int32x4_t hi = vmovl_s16(vget_high_s16(vec)); + vst1q_s32(dst, lo); + vst1q_s32(dst + 4, hi); +} #endif /** ------------------- **/ @@ -31,12 +50,12 @@ static void UnpackPixel_14(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i uint16_t R1, G1, G2, B1; uint16_t GS, GD, RG, BG; - uint16_t 
*GS_output_row_ptr = (uint16_t *)output_buffer[0]; - uint16_t *GD_output_row_ptr = (uint16_t *)output_buffer[3]; - uint16_t *RG_output_row_ptr = (uint16_t *)output_buffer[1]; - uint16_t *BG_output_row_ptr = (uint16_t *)output_buffer[2]; + PIXEL *GS_output_row_ptr = output_buffer[0]; + PIXEL *GD_output_row_ptr = output_buffer[3]; + PIXEL *RG_output_row_ptr = output_buffer[1]; + PIXEL *BG_output_row_ptr = output_buffer[2]; - const int internal_precision = 12; + const int internal_precision = 14; const int32_t midpoint = (1 << (internal_precision - 1)); if( rggb ) @@ -55,10 +74,10 @@ static void UnpackPixel_14(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i } // Apply protune log curve - R1 = EncoderLogCurve[ R1 >> 2 ]; - G1 = EncoderLogCurve[ G1 >> 2 ]; - G2 = EncoderLogCurve[ G2 >> 2 ]; - B1 = EncoderLogCurve[ B1 >> 2 ]; + R1 = ApplyEncoderCurve(EncoderLogCurve14, R1); + G1 = ApplyEncoderCurve(EncoderLogCurve14, G1); + G2 = ApplyEncoderCurve(EncoderLogCurve14, G2); + B1 = ApplyEncoderCurve(EncoderLogCurve14, B1); // Difference the green components and subtract green from the red and blue components GS = (G1 + G2) >> 1; @@ -66,10 +85,10 @@ static void UnpackPixel_14(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i RG = (R1 - GS + 2 * midpoint) >> 1; BG = (B1 - GS + 2 * midpoint) >> 1; - GS_output_row_ptr[column] = clamp_uint(GS, internal_precision); - GD_output_row_ptr[column] = clamp_uint(GD, internal_precision); - RG_output_row_ptr[column] = clamp_uint(RG, internal_precision); - BG_output_row_ptr[column] = clamp_uint(BG, internal_precision); + GS_output_row_ptr[column] = (PIXEL)clamp_uint(GS, internal_precision); + GD_output_row_ptr[column] = (PIXEL)clamp_uint(GD, internal_precision); + RG_output_row_ptr[column] = (PIXEL)clamp_uint(RG, internal_precision); + BG_output_row_ptr[column] = (PIXEL)clamp_uint(BG, internal_precision); } #if ENABLED(NEON) @@ -80,7 +99,7 @@ static void UnpackPixel_14_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro int i; 
uint16x8x2_t row_1, row_2; - const int internal_precision = 12; + const int internal_precision = 14; const int32_t midpoint = (1 << (internal_precision - 1)); // Apply protune log curve @@ -90,8 +109,8 @@ static void UnpackPixel_14_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro for (i = 0; i < 16; ++i) { - input_row1_12b[i] = EncoderLogCurve[ input_row1_ptr[2 * column + i] >> 2 ]; - input_row2_12b[i] = EncoderLogCurve[ input_row2_ptr[2 * column + i] >> 2 ]; + input_row1_12b[i] = ApplyEncoderCurve(EncoderLogCurve14, input_row1_ptr[2 * column + i]); + input_row2_12b[i] = ApplyEncoderCurve(EncoderLogCurve14, input_row2_ptr[2 * column + i]); } row_1 = vld2q_u16( input_row1_12b ); @@ -118,23 +137,23 @@ static void UnpackPixel_14_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro int16x8_t GS, GD, RG, BG; GS = vhaddq_s16(G1, G2); - vst1q_s16( output_buffer[0] + column, GS ); + StoreVectorToPixels(GS, output_buffer[0] + column); { const int16x8_t __midpoint_x2 = vdupq_n_s16(midpoint * 2); GD = vsubq_s16(G1, G2); GD = vhaddq_s16(GD, __midpoint_x2); - vst1q_s16( output_buffer[3] + column, GD ); + StoreVectorToPixels(GD, output_buffer[3] + column); GS = vsubq_s16( __midpoint_x2, GS ); } RG = vhaddq_s16(R1, GS); - vst1q_s16( output_buffer[1] + column, RG ); + StoreVectorToPixels(RG, output_buffer[1] + column); BG = vhaddq_s16(B1, GS); - vst1q_s16( output_buffer[2] + column, BG ); + StoreVectorToPixels(BG, output_buffer[2] + column); } #else @@ -178,27 +197,228 @@ void UnpackImage_14(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_P // output->component_array_list[channel_number].pitch is pitch in bytes, so we need to convert it to pitch in PIXELS output_row_ptr_array_pitch[channel_number] = (output->component_array_list[channel_number].pitch / sizeof(PIXEL)); } - + for (row = 0; row < input_height; row++) { uint16_t* input_row2_ptr = input_row_ptr + (input_pitch / sizeof(uint16_t)); - + int column = 0; - + // Unpack the row of Bayer components from the 
BYR4 pattern elements for (; column < input_width_m8; column+= 8) { UnpackPixel_14_8x(input_row_ptr, input_row2_ptr, column, output_row_ptr_array, rggb ); } - + // Unpack the row of Bayer components from the BYR4 pattern elements for (; column < input_width; column++) { UnpackPixel_14(input_row_ptr, input_row2_ptr, column, output_row_ptr_array, rggb ); } - + input_row_ptr += input_pitch; - + + for (channel_number = 0; channel_number < MAX_CHANNEL_COUNT; channel_number++) + { + output_row_ptr_array[channel_number] += output_row_ptr_array_pitch[channel_number]; + } + } +} + +/** ------------------- **/ +/** 16 BIT INPUT FORMAT **/ +/** ------------------- **/ + +static void UnpackPixel_16(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, int column, PIXEL *output_buffer[], bool rggb ) +{ + uint16_t R1, G1, G2, B1; + uint16_t GS, GD, RG, BG; + + PIXEL *GS_output_row_ptr = output_buffer[0]; + PIXEL *GD_output_row_ptr = output_buffer[3]; + PIXEL *RG_output_row_ptr = output_buffer[1]; + PIXEL *BG_output_row_ptr = output_buffer[2]; + + const int internal_precision = 16; + const int32_t midpoint = (1 << (internal_precision - 1)); + + if( rggb ) + { + R1 = input_row1_ptr[2 * column + 0]; + G1 = input_row1_ptr[2 * column + 1]; + G2 = input_row2_ptr[2 * column + 0]; + B1 = input_row2_ptr[2 * column + 1]; + } + else + { + G1 = input_row1_ptr[2 * column + 0]; + B1 = input_row1_ptr[2 * column + 1]; + R1 = input_row2_ptr[2 * column + 0]; + G2 = input_row2_ptr[2 * column + 1]; + } + + // Apply protune log curve using full 16-bit table + R1 = ApplyEncoderCurve(EncoderLogCurve16, R1); + G1 = ApplyEncoderCurve(EncoderLogCurve16, G1); + G2 = ApplyEncoderCurve(EncoderLogCurve16, G2); + B1 = ApplyEncoderCurve(EncoderLogCurve16, B1); + + GS = (G1 + G2) >> 1; + GD = (G1 - G2 + 2 * midpoint) >> 1; + RG = (R1 - GS + 2 * midpoint) >> 1; + BG = (B1 - GS + 2 * midpoint) >> 1; + + GS_output_row_ptr[column] = (PIXEL)clamp_uint(GS, internal_precision); + GD_output_row_ptr[column] = 
(PIXEL)clamp_uint(GD, internal_precision); + RG_output_row_ptr[column] = (PIXEL)clamp_uint(RG, internal_precision); + BG_output_row_ptr[column] = (PIXEL)clamp_uint(BG, internal_precision); +} + +static void UnpackPixel_16_8x_C_(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, int column, PIXEL *output_buffer[], bool rggb ) +{ + int i; + for ( i = 0; i < 8; i++) + { + UnpackPixel_16(input_row1_ptr, input_row2_ptr, column + i, output_buffer, rggb ); + } +} + +#if ENABLED(NEON) + +#define UnpackPixel_16_8x UnpackPixel_16_8x_NEON_ + +static INLINE void UnpackPixel_16_4_helper_NEON_( + uint16x4_t r1_half, uint16x4_t g1_half, uint16x4_t g2_half, uint16x4_t b1_half, + int32x4_t v_midpoint_x2, + PIXEL *gs_out, PIXEL *rg_out, PIXEL *bg_out, PIXEL *gd_out) +{ + int32x4_t R1 = vreinterpretq_s32_u32(vmovl_u16(r1_half)); + int32x4_t G1 = vreinterpretq_s32_u32(vmovl_u16(g1_half)); + int32x4_t G2 = vreinterpretq_s32_u32(vmovl_u16(g2_half)); + int32x4_t B1 = vreinterpretq_s32_u32(vmovl_u16(b1_half)); + + int32x4_t GS = vshrq_n_s32(vaddq_s32(G1, G2), 1); + vst1q_s32(gs_out, GS); + + int32x4_t GD = vshrq_n_s32(vaddq_s32(vsubq_s32(G1, G2), v_midpoint_x2), 1); + vst1q_s32(gd_out, GD); + + int32x4_t RG = vshrq_n_s32(vaddq_s32(vsubq_s32(R1, GS), v_midpoint_x2), 1); + vst1q_s32(rg_out, RG); + + int32x4_t BG = vshrq_n_s32(vaddq_s32(vsubq_s32(B1, GS), v_midpoint_x2), 1); + vst1q_s32(bg_out, BG); +} + +static void UnpackPixel_16_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, int column, PIXEL *output_buffer[], bool rggb) +{ + int i; + uint16x8x2_t row_1, row_2; + + const int internal_precision = 16; + const int32_t midpoint = (1 << (internal_precision - 1)); + const int32x4_t v_midpoint_x2 = vdupq_n_s32(midpoint * 2); + + // Apply log curve (scalar -- 65536-entry table too large for vector LUT) + { + uint16_t input_row1_curved[16]; + uint16_t input_row2_curved[16]; + + for (i = 0; i < 16; ++i) + { + input_row1_curved[i] = ApplyEncoderCurve(EncoderLogCurve16, 
input_row1_ptr[2 * column + i]); + input_row2_curved[i] = ApplyEncoderCurve(EncoderLogCurve16, input_row2_ptr[2 * column + i]); + } + + row_1 = vld2q_u16(input_row1_curved); + row_2 = vld2q_u16(input_row2_curved); + } + + uint16x8_t r1_u16, g1_u16, g2_u16, b1_u16; + + if (rggb) + { + r1_u16 = row_1.val[0]; + g1_u16 = row_1.val[1]; + g2_u16 = row_2.val[0]; + b1_u16 = row_2.val[1]; + } + else + { + g1_u16 = row_1.val[0]; + b1_u16 = row_1.val[1]; + r1_u16 = row_2.val[0]; + g2_u16 = row_2.val[1]; + } + + // Process low 4 pixels (16-bit values overflow int16, must use 32-bit arithmetic) + UnpackPixel_16_4_helper_NEON_( + vget_low_u16(r1_u16), vget_low_u16(g1_u16), + vget_low_u16(g2_u16), vget_low_u16(b1_u16), + v_midpoint_x2, + output_buffer[0] + column, output_buffer[1] + column, + output_buffer[2] + column, output_buffer[3] + column); + + // Process high 4 pixels + UnpackPixel_16_4_helper_NEON_( + vget_high_u16(r1_u16), vget_high_u16(g1_u16), + vget_high_u16(g2_u16), vget_high_u16(b1_u16), + v_midpoint_x2, + output_buffer[0] + column + 4, output_buffer[1] + column + 4, + output_buffer[2] + column + 4, output_buffer[3] + column + 4); +} + +#else + +#define UnpackPixel_16_8x UnpackPixel_16_8x_C_ + +#endif + +void UnpackImage_16(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_PARTS enabled_parts, bool rggb ) +{ + uint8_t *input_buffer = (uint8_t *)input->buffer + input->offset; + + const DIMENSION input_width = input->width / 2; + const DIMENSION input_width_m8 = (input_width / 8) * 8; + + const DIMENSION input_height = input->height / 2; + + size_t input_pitch = input->pitch; + + PIXEL *output_row_ptr_array[MAX_CHANNEL_COUNT]; + uint32_t output_row_ptr_array_pitch[MAX_CHANNEL_COUNT]; + + uint16_t *input_row_ptr = (uint16_t*)input_buffer; + + int channel_number; + + int row; + + for (channel_number = 0; channel_number < MAX_CHANNEL_COUNT; channel_number++) + { + output_row_ptr_array[channel_number] = (PIXEL 
*)(output->component_array_list[channel_number].data); + + output_row_ptr_array_pitch[channel_number] = (output->component_array_list[channel_number].pitch / sizeof(PIXEL)); + } + + for (row = 0; row < input_height; row++) + { + uint16_t* input_row2_ptr = input_row_ptr + (input_pitch / sizeof(uint16_t)); + + int column = 0; + + for (; column < input_width_m8; column+= 8) + { + UnpackPixel_16_8x(input_row_ptr, input_row2_ptr, column, output_row_ptr_array, rggb ); + } + + for (; column < input_width; column++) + { + UnpackPixel_16(input_row_ptr, input_row2_ptr, column, output_row_ptr_array, rggb ); + } + + input_row_ptr += input_pitch; + for (channel_number = 0; channel_number < MAX_CHANNEL_COUNT; channel_number++) { output_row_ptr_array[channel_number] += output_row_ptr_array_pitch[channel_number]; @@ -215,10 +435,10 @@ static void UnpackPixel_12(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i uint16_t R1, G1, G2, B1; uint16_t GS, GD, RG, BG; - uint16_t *GS_output_row_ptr = (uint16_t *)output_buffer[0]; - uint16_t *GD_output_row_ptr = (uint16_t *)output_buffer[3]; - uint16_t *RG_output_row_ptr = (uint16_t *)output_buffer[1]; - uint16_t *BG_output_row_ptr = (uint16_t *)output_buffer[2]; + PIXEL *GS_output_row_ptr = output_buffer[0]; + PIXEL *GD_output_row_ptr = output_buffer[3]; + PIXEL *RG_output_row_ptr = output_buffer[1]; + PIXEL *BG_output_row_ptr = output_buffer[2]; const int internal_precision = 12; const int32_t midpoint = (1 << (internal_precision - 1)); @@ -239,10 +459,10 @@ static void UnpackPixel_12(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i } // Apply protune log curve - R1 = EncoderLogCurve[ R1 ]; - G1 = EncoderLogCurve[ G1 ]; - G2 = EncoderLogCurve[ G2 ]; - B1 = EncoderLogCurve[ B1 ]; + R1 = ApplyEncoderCurve(EncoderLogCurve12, R1); + G1 = ApplyEncoderCurve(EncoderLogCurve12, G1); + G2 = ApplyEncoderCurve(EncoderLogCurve12, G2); + B1 = ApplyEncoderCurve(EncoderLogCurve12, B1); // Difference the green components and subtract green from 
the red and blue components GS = (G1 + G2) >> 1; @@ -250,10 +470,10 @@ static void UnpackPixel_12(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i RG = (R1 - GS + 2 * midpoint) >> 1; BG = (B1 - GS + 2 * midpoint) >> 1; - GS_output_row_ptr[column] = clamp_uint(GS, internal_precision); - GD_output_row_ptr[column] = clamp_uint(GD, internal_precision); - RG_output_row_ptr[column] = clamp_uint(RG, internal_precision); - BG_output_row_ptr[column] = clamp_uint(BG, internal_precision); + GS_output_row_ptr[column] = (PIXEL)clamp_uint(GS, internal_precision); + GD_output_row_ptr[column] = (PIXEL)clamp_uint(GD, internal_precision); + RG_output_row_ptr[column] = (PIXEL)clamp_uint(RG, internal_precision); + BG_output_row_ptr[column] = (PIXEL)clamp_uint(BG, internal_precision); } #if ENABLED(NEON) @@ -274,8 +494,8 @@ static void UnpackPixel_12_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro for (i = 0; i < 16; ++i) { - input_row1_12b[i] = EncoderLogCurve[ input_row1_ptr[2 * column + i] ]; - input_row2_12b[i] = EncoderLogCurve[ input_row2_ptr[2 * column + i] ]; + input_row1_12b[i] = ApplyEncoderCurve(EncoderLogCurve12, input_row1_ptr[2 * column + i]); + input_row2_12b[i] = ApplyEncoderCurve(EncoderLogCurve12, input_row2_ptr[2 * column + i]); } row_1 = vld2q_u16( input_row1_12b ); @@ -302,23 +522,23 @@ static void UnpackPixel_12_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro int16x8_t GS, GD, RG, BG; GS = vhaddq_s16(G1, G2); - vst1q_s16( output_buffer[0] + column, GS ); + StoreVectorToPixels(GS, output_buffer[0] + column); { const int16x8_t __midpoint_x2 = vdupq_n_s16(midpoint * 2); GD = vsubq_s16(G1, G2); GD = vhaddq_s16(GD, __midpoint_x2); - vst1q_s16( output_buffer[3] + column, GD ); + StoreVectorToPixels(GD, output_buffer[3] + column); GS = vsubq_s16( __midpoint_x2, GS ); } RG = vhaddq_s16(R1, GS); - vst1q_s16( output_buffer[1] + column, RG ); + StoreVectorToPixels(RG, output_buffer[1] + column); BG = vhaddq_s16(B1, GS); - vst1q_s16( output_buffer[2] + 
column, BG ); + StoreVectorToPixels(BG, output_buffer[2] + column); } #else @@ -442,10 +662,10 @@ static void UnpackPixel_12P(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, } // Apply protune log curve - G1 = EncoderLogCurve[ G1 ]; - B1 = EncoderLogCurve[ B1 ]; - R1 = EncoderLogCurve[ R1 ]; - G2 = EncoderLogCurve[ G2 ]; + G1 = ApplyEncoderCurve(EncoderLogCurve12, G1); + B1 = ApplyEncoderCurve(EncoderLogCurve12, B1); + R1 = ApplyEncoderCurve(EncoderLogCurve12, R1); + G2 = ApplyEncoderCurve(EncoderLogCurve12, G2); // difference the green components and subtract green from the red and blue components GS = (G1 + G2) >> 1; @@ -454,15 +674,15 @@ static void UnpackPixel_12P(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, BG = (B1 - GS + 2 * midpoint) >> 1; { // write output - uint16_t *GS_output_row_ptr = (uint16_t *)output_buffer[0]; - uint16_t *GD_output_row_ptr = (uint16_t *)output_buffer[3]; - uint16_t *RG_output_row_ptr = (uint16_t *)output_buffer[1]; - uint16_t *BG_output_row_ptr = (uint16_t *)output_buffer[2]; + PIXEL *GS_output_row_ptr = output_buffer[0]; + PIXEL *GD_output_row_ptr = output_buffer[3]; + PIXEL *RG_output_row_ptr = output_buffer[1]; + PIXEL *BG_output_row_ptr = output_buffer[2]; - GS_output_row_ptr[column] = clamp_uint(GS, internal_precision); - GD_output_row_ptr[column] = clamp_uint(GD, internal_precision); - RG_output_row_ptr[column] = clamp_uint(RG, internal_precision); - BG_output_row_ptr[column] = clamp_uint(BG, internal_precision); + GS_output_row_ptr[column] = (PIXEL)clamp_uint(GS, internal_precision); + GD_output_row_ptr[column] = (PIXEL)clamp_uint(GD, internal_precision); + RG_output_row_ptr[column] = (PIXEL)clamp_uint(RG, internal_precision); + BG_output_row_ptr[column] = (PIXEL)clamp_uint(BG, internal_precision); } } @@ -518,10 +738,10 @@ static void UnpackPixel_12P_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_r for(i = 0; i < 8; i++) { - g1[i] = EncoderLogCurve[g1[i]]; - b1[i] = EncoderLogCurve[b1[i]]; - r1[i] = 
EncoderLogCurve[r1[i]]; - g2[i] = EncoderLogCurve[g2[i]]; + g1[i] = ApplyEncoderCurve(EncoderLogCurve12, g1[i]); + b1[i] = ApplyEncoderCurve(EncoderLogCurve12, b1[i]); + r1[i] = ApplyEncoderCurve(EncoderLogCurve12, r1[i]); + g2[i] = ApplyEncoderCurve(EncoderLogCurve12, g2[i]); } int16x8_t R1, G1, G2, B1; @@ -534,22 +754,22 @@ static void UnpackPixel_12P_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_r int16x8_t GS, GD, RG, BG; GS = vhaddq_s16(G1, G2); - vst1q_s16( output_buffer[0] + column, GS ); + StoreVectorToPixels(GS, output_buffer[0] + column); { const int16x8_t __midpoint_x2 = vdupq_n_s16(midpoint * 2); GD = vsubq_s16(G1, G2); GD = vhaddq_s16(GD, __midpoint_x2); - vst1q_s16( output_buffer[3] + column, GD ); + StoreVectorToPixels(GD, output_buffer[3] + column); GS = vsubq_s16( __midpoint_x2, GS ); } RG = vhaddq_s16(R1, GS); - vst1q_s16( output_buffer[1] + column, RG ); + StoreVectorToPixels(RG, output_buffer[1] + column); BG = vhaddq_s16(B1, GS); - vst1q_s16( output_buffer[2] + column, BG ); + StoreVectorToPixels(BG, output_buffer[2] + column); } #else @@ -618,4 +838,3 @@ void UnpackImage_12P(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_ } } } - diff --git a/source/lib/vc5_encoder/raw.h b/source/lib/vc5_encoder/raw.h index 894d60e6..b0e10c4e 100755 --- a/source/lib/vc5_encoder/raw.h +++ b/source/lib/vc5_encoder/raw.h @@ -28,6 +28,8 @@ extern "C" { void UnpackImage_12(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_PARTS enabled_parts, bool rggb ); void UnpackImage_12P(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_PARTS enabled_parts, bool rggb ); + + void UnpackImage_16(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_PARTS enabled_parts, bool rggb ); #ifdef __cplusplus } diff --git a/source/lib/vc5_encoder/syntax.c b/source/lib/vc5_encoder/syntax.c index 14afdf1b..85708bb7 100755 --- a/source/lib/vc5_encoder/syntax.c +++ b/source/lib/vc5_encoder/syntax.c @@ -106,11 +106,11 @@ CODEC_ERROR PutTagPair(BITSTREAM 
*stream, int tag, int value) // The bitstream should be aligned on a tag word boundary assert(IsAlignedTag(stream)); - // The value must fit within a tag word - assert(((uint32_t)value & ~(uint32_t)CODEC_TAG_MASK) == 0); - - PutLong(stream, ((uint32_t )tag << 16) | (value & CODEC_TAG_MASK)); - + // The value must fit within a tag word (mask to 16 bits to handle sign-extended TAGWORD values) + assert(((uint32_t)(uint16_t)value & ~(uint32_t)CODEC_TAG_MASK) == 0); + + PutLong(stream, ((uint32_t )tag << 16) | ((uint32_t)(uint16_t)value & CODEC_TAG_MASK)); + return CODEC_ERROR_OKAY; } @@ -123,16 +123,16 @@ CODEC_ERROR PutTagPairOptional(BITSTREAM *stream, int tag, int value) { // The bitstream should be aligned on a tag word boundary assert(IsAlignedTag(stream)); - - // The value must fit within a tag word - assert(((uint32_t)value & ~(uint32_t)CODEC_TAG_MASK) == 0); - + + // The value must fit within a tag word (mask to 16 bits to handle sign-extended TAGWORD values) + assert(((uint32_t)(uint16_t)value & ~(uint32_t)CODEC_TAG_MASK) == 0); + // Set the optional tag bit //tag |= CODEC_TAG_OPTIONAL; //tag = NEG(tag); tag = neg(tag); - - PutLong(stream, ((uint32_t )tag << 16) | (value & CODEC_TAG_MASK)); + + PutLong(stream, ((uint32_t )tag << 16) | ((uint32_t)(uint16_t)value & CODEC_TAG_MASK)); return CODEC_ERROR_OKAY; } diff --git a/source/lib/vc5_encoder/vc5_encoder.c b/source/lib/vc5_encoder/vc5_encoder.c index c28d904a..00b9e267 100755 --- a/source/lib/vc5_encoder/vc5_encoder.c +++ b/source/lib/vc5_encoder/vc5_encoder.c @@ -41,11 +41,18 @@ CODEC_ERROR vc5_encoder_process(const vc5_encoder_parameters* encoding_paramet { CODEC_ERROR error = CODEC_ERROR_OKAY; IMAGE image; + memset(&image, 0, sizeof(IMAGE)); ENCODER_PARAMETERS parameters; STREAM bitstream_file; - const int max_vc5_buffer_size = 10000000; + // Allocate a conservative buffer for VC5 bitstream: 1.5x raw size + 1MB + size_t base_size = image.size; + if (base_size == 0) + { + base_size = 
(size_t)encoding_parameters->input_width * encoding_parameters->input_height * sizeof(uint16_t); + } +const size_t max_vc5_buffer_size = base_size + (base_size >> 1) + (1 << 20); // Initialize the data structure for passing parameters to the encoder InitEncoderParameters(&parameters); @@ -57,12 +64,16 @@ CODEC_ERROR vc5_encoder_process(const vc5_encoder_parameters* encoding_paramet {1, 24, 24, 12, 32, 32, 24, 128, 128, 192}, // CineForm High {1, 24, 24, 12, 24, 24, 12, 96, 96, 144}, // CineForm Filmscan-1 {1, 24, 24, 12, 24, 24, 12, 64, 64, 96}, // CineForm Filmscan-X - {1, 24, 24, 12, 24, 24, 12, 32, 32, 48} // CineForm Filmscan-2 + {1, 24, 24, 12, 24, 24, 12, 32, 32, 48}, // CineForm Filmscan-2 + {1, 12, 12, 6, 12, 12, 6, 16, 16, 24}, // CineForm Filmscan-3 (Edit-Safe) + {1, 6, 6, 4, 12, 12, 6, 16, 16, 24}, // CineForm Filmscan-4 (Near-Lossless) + {1, 4, 4, 2, 10, 10, 6, 16, 16, 24} // CineForm Filmscan-5 (Virtually Lossless) }; - if( encoding_parameters->quality_setting < VC5_ENCODER_QUALITY_SETTING_COUNT ) + int quality = encoding_parameters->quality_setting; + if( quality >= 0 && quality < VC5_ENCODER_QUALITY_SETTING_COUNT ) { - memcpy(parameters.quant_table, quant_table[encoding_parameters->quality_setting], sizeof(parameters.quant_table)); + memcpy(parameters.quant_table, quant_table[quality], sizeof(parameters.quant_table)); } } @@ -114,6 +125,14 @@ CODEC_ERROR vc5_encoder_process(const vc5_encoder_parameters* encoding_paramet case VC5_ENCODER_PIXEL_FORMAT_GBRG_12P: image.format = PIXEL_FORMAT_RAW_GBRG_12P; break; + + case VC5_ENCODER_PIXEL_FORMAT_RGGB_16: + image.format = PIXEL_FORMAT_RAW_RGGB_16; + break; + + case VC5_ENCODER_PIXEL_FORMAT_GBRG_16: + image.format = PIXEL_FORMAT_RAW_GBRG_16; + break; default: assert(0); @@ -134,6 +153,10 @@ CODEC_ERROR vc5_encoder_process(const vc5_encoder_parameters* encoding_paramet #endif vc5_buffer->buffer = encoding_parameters->mem_alloc( max_vc5_buffer_size ); + if (vc5_buffer->buffer == NULL) + { + return
CODEC_ERROR_OUTOFMEMORY; + } // Open a stream to the output file error = CreateStreamBuffer(&bitstream_file, vc5_buffer->buffer, max_vc5_buffer_size ); diff --git a/source/lib/vc5_encoder/vc5_encoder.h b/source/lib/vc5_encoder/vc5_encoder.h index 35a9b16b..0a995332 100755 --- a/source/lib/vc5_encoder/vc5_encoder.h +++ b/source/lib/vc5_encoder/vc5_encoder.h @@ -43,6 +43,10 @@ VC5_ENCODER_PIXEL_FORMAT_GBRG_12 = 3, // GBRG 12bit pixels packed into 16bits VC5_ENCODER_PIXEL_FORMAT_GBRG_12P = 4, // GBRG 12bit pixels packed into 12bits + + VC5_ENCODER_PIXEL_FORMAT_RGGB_16 = 5, // RGGB 16bit pixels packed into 16bits + + VC5_ENCODER_PIXEL_FORMAT_GBRG_16 = 6, // GBRG 16bit pixels packed into 16bits VC5_ENCODER_PIXEL_FORMAT_DEFAULT = VC5_ENCODER_PIXEL_FORMAT_RGGB_14, @@ -60,10 +64,13 @@ VC5_ENCODER_QUALITY_SETTING_HIGH = 2, // High VC5_ENCODER_QUALITY_SETTING_FS1 = 3, // Film Scan 1 VC5_ENCODER_QUALITY_SETTING_FSX = 4, // Film Scan X - VC5_ENCODER_QUALITY_SETTING_FS2 = 5, // Film Scan 2 (Highest Quality) - - VC5_ENCODER_QUALITY_SETTING_COUNT = 6, - + VC5_ENCODER_QUALITY_SETTING_FS2 = 5, // Film Scan 2 + VC5_ENCODER_QUALITY_SETTING_FS3 = 6, // Film Scan 3 (Edit-Safe) + VC5_ENCODER_QUALITY_SETTING_FS4 = 7, // Film Scan 4 (Near-Lossless) + VC5_ENCODER_QUALITY_SETTING_FS5 = 8, // Film Scan 5 (Virtually Lossless) + + VC5_ENCODER_QUALITY_SETTING_COUNT = 9, + VC5_ENCODER_QUALITY_SETTING_DEFAULT = VC5_ENCODER_QUALITY_SETTING_FSX, } VC5_ENCODER_QUALITY_SETTING; @@ -74,19 +81,19 @@ typedef struct { ENABLED_PARTS enabled_parts; - + unsigned int input_width; // Image Width in Components (Default: 4000) unsigned int input_height; // Image Height in Components (Default: 3000) int input_pitch; // Image Buffer Stride in Components (Default: 4000) - + VC5_ENCODER_PIXEL_FORMAT pixel_format; // Bayer Ordering Pattern (Default: VC5_ENCODER_BAYER_ORDERING_RGGB) - + VC5_ENCODER_QUALITY_SETTING quality_setting; // Quality setting of the encoder (Default: VC5_ENCODER_QUALITY_SETTING_FS2) - + 
gpr_malloc mem_alloc; // Callback function to allocate memory - + gpr_free mem_free; // Callback function to free memory - + } vc5_encoder_parameters; void vc5_encoder_parameters_set_default(vc5_encoder_parameters* encoding_parameters);