diff --git a/CMakeLists.txt b/CMakeLists.txt index ea7ab493..613261ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,32 @@ set(CMAKE_C_FLAGS "-std=c99") # project name project( gpr ) +# Enable link-time optimization for cross-TU inlining (GetBits, GetBuffer, etc.) +cmake_policy(SET CMP0069 NEW) +include(CheckIPOSupported) +check_ipo_supported(RESULT ipo_supported OUTPUT ipo_output) +if(ipo_supported) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + message(STATUS "LTO/IPO enabled") +else() + message(STATUS "LTO/IPO not supported: ${ipo_output}") +endif() + +# Enable NEON on arm64 (Apple Silicon defaults to ON) +set(NEON_DEFAULT OFF) +if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)") + set(NEON_DEFAULT ON) +endif() +option(NEON "Enable ARM NEON intrinsics" ${NEON_DEFAULT}) + +if(NEON) + add_compile_definitions(NEON=1) +else() + add_compile_definitions(NEON=0) +endif() + +message(STATUS "NEON enabled: ${NEON}") + if(EXISTS "${CMAKE_SOURCE_DIR}/.git") execute_process( COMMAND git rev-parse --abbrev-ref HEAD diff --git a/docs/quality-analysis.md b/docs/quality-analysis.md new file mode 100644 index 00000000..a01515ec --- /dev/null +++ b/docs/quality-analysis.md @@ -0,0 +1,224 @@ +# GPR Quality Analysis — 16-Bit + +Test image: Hasselblad X2D 100C, 11664x8750, 16-bit Bayer (194.7 MB raw) +Source: `2024_12_Dec_Austin_0186.dng` (138.2 MB lossless DNG) + +## File Size + +| Q | GPR Size | Compression | vs DNG | Space Saving | +|---|----------|-------------|--------|--------------| +| 0 | 21.4 MB | 9.1x | 6.4x smaller | 84.5% | +| 1 | 28.7 MB | 6.8x | 4.8x smaller | 79.2% | +| 2 | 39.2 MB | 5.0x | 3.5x smaller | 71.6% | +| 3 | 46.0 MB | 4.2x | 3.0x smaller | 66.7% | +| 4 | 53.0 MB | 3.7x | 2.6x smaller | 61.6% | +| 5 | 66.9 MB | 2.9x | 2.1x smaller | 51.6% | +| 6 | 89.8 MB | 2.2x | 1.5x smaller | 35.0% | +| 7 | 90.9 MB | 2.1x | 1.5x smaller | 34.2% | +| 8 | 92.0 MB | 2.1x | 1.5x smaller | 33.4% | + +Compression ratio is vs uncompressed 16-bit raw 
(194.7 MB). + +## Raw Bayer Domain (pre-demosaic) + +This is the domain RAW editors (Lightroom, Capture One, etc.) operate in. +Errors here directly affect all downstream edits. + +| Q | PSNR | RMS Error | MAE | MAE % | Max Error | ENOB | +|---|------|-----------|-----|-------|-----------|------| +| 0 | 40.0 dB | 655 DN | 334 DN | 0.509% | 11454 DN | 6.6 bits | +| 1 | 44.6 dB | 386 DN | 209 DN | 0.319% | 6682 DN | 7.4 bits | +| 2 | 48.5 dB | 246 DN | 136 DN | 0.208% | 4437 DN | 8.1 bits | +| 3 | 50.0 dB | 207 DN | 116 DN | 0.176% | 4115 DN | 8.3 bits | +| 4 | 51.2 dB | 181 DN | 100 DN | 0.153% | 4179 DN | 8.5 bits | +| 5 | 52.1 dB | 163 DN | 89 DN | 0.135% | 3749 DN | 8.7 bits | +| 6 | 56.5 dB | 98 DN | 53 DN | 0.081% | 2190 DN | 9.4 bits | +| 7 | 58.6 dB | 77 DN | 41 DN | 0.062% | 1597 DN | 9.7 bits | +| 8 | 58.9 dB | 74 DN | 39 DN | 0.060% | 1566 DN | 9.8 bits | + +- ENOB = Effective Number of Bits = PSNR / 6.02 +- DN = Digital Number (out of 65535 for 16-bit) +- This sensor captures ~14 bits of real data at ISO 100 + +## Rendered RGB (16-bit sRGB linear) + +After demosaicing + white balance + color matrix. This is what you see on screen. 
+ +| Q | PSNR | R PSNR | G PSNR | B PSNR | MAE | >0.5% | >1% | +|---|------|--------|--------|--------|-----|-------|-----| +| 0 | 36.9 dB | 35.2 dB | 39.6 dB | 37.1 dB | 468 DN | 36.8% | 20.4% | +| 1 | 40.5 dB | 38.7 dB | 43.4 dB | 40.7 dB | 317 DN | 28.7% | 13.0% | +| 2 | 43.5 dB | 41.6 dB | 46.4 dB | 43.7 dB | 224 DN | 20.4% | 7.4% | +| 3 | 44.5 dB | 42.6 dB | 47.5 dB | 44.7 dB | 196 DN | 17.2% | 5.8% | +| 4 | 45.4 dB | 43.4 dB | 48.5 dB | 45.6 dB | 176 DN | 14.9% | 4.7% | +| 5 | 46.1 dB | 44.1 dB | 49.4 dB | 46.4 dB | 160 DN | 13.1% | 3.9% | +| 6 | 49.5 dB | 47.7 dB | 52.2 dB | 49.7 dB | 95 DN | 5.4% | 1.2% | +| 7 | 50.9 dB | 49.3 dB | 53.2 dB | 51.2 dB | 72 DN | 3.2% | 0.7% | +| 8 | 51.1 dB | 49.5 dB | 53.4 dB | 51.4 dB | 69 DN | 3.0% | 0.6% | + +- Red channel is weakest (lowest PSNR) — typical for Bayer demosaic +- ">0.5%" / ">1%" = % of pixel values with error exceeding that fraction of full scale + +## Practical Editing Guidance + +Context: This sensor (X2D 100C) captures ~14 bits of real data at ISO 100. +Compression ENOB ranges from 6.6 bits (Q0) to 9.8 bits (Q8). +Compression error is always above sensor noise, so it is the limiting factor. 
+ +### Q0 — Aggressive Compression (9.1x, 21 MB) + +Raw ENOB: 6.6 bits | PSNR: 40.0 dB | RMS error: 655 DN + +- OK: Export to JPEG/web at 4K or smaller — artifacts masked by downsampling + 8-bit +- OK: Basic white balance and exposure +/- 0.5 stop +- OK: Light color grading (LUT application, saturation +/- 20%) +- NO: Shadow recovery — pushing shadows reveals quantization banding +- NO: Heavy curves / local contrast — amplifies per-band wavelet artifacts +- NO: Large prints from full resolution — artifacts visible at ~200 PPI+ + +Best for: Preview/proxy files, web gallery, social media, quick turnaround + +### Q1 — High Compression (6.8x, 29 MB) + +Raw ENOB: 7.4 bits | PSNR: 44.6 dB | RMS error: 386 DN + +- OK: Exposure adjustment +/- 1 stop +- OK: Moderate white balance correction +- OK: Saturation and vibrance adjustments +- MAYBE: Shadow push up to +1 stop — minor artifacts in deep shadows +- NO: Heavy shadow recovery (more than +1 stop) + +Best for: Event photography, documentary, editorial with light edits + +### Q2-Q3 — Moderate Compression (5.0-4.2x, 39-46 MB) + +Raw ENOB: 8.1-8.3 bits | PSNR: 48.5-50.0 dB | RMS error: 207-246 DN + +- OK: Exposure +/- 1.5 stops +- OK: Moderate shadow recovery (+1.5 stops) +- OK: Color grading and creative white balance shifts +- OK: Clarity / texture / dehaze adjustments +- OK: Print up to 24x36" at 300 PPI +- MAYBE: Aggressive curves — some wavelet ringing in smooth gradients +- NO: Extreme shadow push (> +2 stops) + +Best for: Professional editorial, portrait, landscape with moderate edits + +### Q4-Q5 — Balanced (3.7-2.9x, 53-67 MB) + +Raw ENOB: 8.5-8.7 bits | PSNR: 51.2-52.1 dB | RMS error: 163-181 DN + +- OK: Exposure +/- 2 stops +- OK: Shadow recovery up to +2 stops +- OK: Aggressive color grading (split toning, cross-process, film emulation) +- OK: Local adjustments (dodge/burn, graduated filters, radial masks) +- OK: Sharpening and noise reduction +- OK: Large prints at full resolution +- MAYBE: Extreme shadow push 
(+3 stops) — faint banding in deepest shadows + +Best for: Professional photography with full editing workflow + +### Q6-Q8 — Near-Lossless (2.2-2.1x, 90-92 MB) + +Raw ENOB: 9.4-9.8 bits | PSNR: 56.5-58.9 dB | RMS error: 74-98 DN + +- OK: Shadow recovery up to +3 stops +- OK: Extreme exposure adjustments (+/- 3 stops) +- OK: Aggressive local contrast / clarity / HDR tone mapping +- OK: Fine art printing at maximum size +- OK: Pixel-level compositing and retouching +- MAYBE: Extreme shadow push (+4 stops) — may show faint artifacts + +Note: Q7 and Q8 are nearly identical (58.6 vs 58.9 dB) because the +quantization divisor hits the noise floor. Q6 offers the best +quality/size tradeoff in this tier. + +Best for: Archival, fine art, maximum editing flexibility + +## Key Insight + +This is a 16-bit sensor with ~14 bits of real data. Even Q8 (best quality) +only preserves ~10 bits of the 14 available. The remaining ~4 bits of sensor +precision are lost to wavelet quantization. For truly lossless archival, +keep the original DNG. + +However: for practical photography, 10 bits of ENOB in the raw domain +translates to 8+ bits of final output precision after demosaic + color +processing — more than sufficient for any 8-bit display or print. + +The compression artifacts are in the wavelet domain (smooth spatial patterns +across 8x8 or 16x16 pixel blocks), not in the pixel domain. They are less +visually objectionable than JPEG artifacts at the same PSNR. + +## Per-Channel Quantization Scaling + +VC5 encodes Bayer data in color-difference space (not raw RGGB): + +| Channel | Name | Content | +|---------|------|---------| +| 0 (GS) | Green Sum | (Gr+Gb)/2 — luminance | +| 1 (RG) | Red-Green | R-G — chrominance | +| 2 (BG) | Blue-Green | B-G — chrominance | +| 3 (GD) | Green Diff | (Gr-Gb)/2 — detail/noise | + +The `--ChannelScale` flag (or `channel_quant_scale` API) applies per-channel +multipliers to quantization. Scale > 1.0 = more compression, < 1.0 = less. 
+ +### Test Results (Q4 base, 100MP 16-bit) + +| Config | Scale (GS,RG,BG,GD) | Size | Raw PSNR | RGB PSNR | G PSNR | R PSNR | B PSNR | +|--------|---------------------|------|----------|----------|--------|--------|--------| +| Uniform Q4 | 1,1,1,1 | 53.0 MB | 51.2 dB | 44.6 dB | 49.2 dB | 42.0 dB | 45.3 dB | +| A | 1.0, 2.0, 2.0, 3.0 | 39.9 MB | 46.2 dB | 39.9 dB | 45.0 dB | 37.2 dB | 40.7 dB | +| B | 1.0, 3.0, 3.0, 4.0 | 35.3 MB | 43.7 dB | 37.4 dB | 43.2 dB | 34.5 dB | 38.4 dB | +| C | 0.5, 2.0, 2.0, 2.0 | 47.4 MB | 47.5 dB | 40.4 dB | 46.3 dB | 37.5 dB | 41.4 dB | +| D | 1.0, 4.0, 4.0, 6.0 | 31.9 MB | 41.3 dB | 35.5 dB | 41.2 dB | 32.6 dB | 36.5 dB | + +### Size-Matched Comparison (per-channel vs uniform) + +| Per-Channel | Size | RGB PSNR | Uniform Match | Size | RGB PSNR | Delta | +|-------------|------|----------|---------------|------|----------|-------| +| Q4+A(1,2,2,3) | 39.9 MB | 39.9 dB | Uniform Q2 | 39.2 MB | 42.8 dB | -2.9 dB | +| Q4+C(.5,2,2,2) | 47.4 MB | 40.4 dB | Uniform Q3 | 46.0 MB | 43.8 dB | -3.4 dB | + +### Analysis + +At the same file size, uniform quality presets outperform per-channel scaling +by 2-3 dB. The reason: VC5's quality tables are already carefully tuned for +the color-difference space. Each preset has per-band ratios (e.g., lower Q for +low-frequency, higher Q for high-frequency) that are perceptually meaningful. +A uniform multiplier across all bands within a channel is less efficient than +using a different preset designed as a coherent whole. 
+ +### When Per-Channel Scaling IS Useful + +- **Chroma-insensitive workflows**: Machine vision, scientific imaging where + only luminance matters — compress RG/BG/GD aggressively +- **Noise-dominated detail channel**: At high ISO, GD is mostly noise — + compressing it with scale 1.5-2.0 saves space with no visible loss +- **Fine-tuning between presets**: When Q3 is too large and Q2 is too lossy, + `Q3 -c "1.0,1.0,1.0,1.3"` gives a size between Q2 and Q3 +- **Artistic intent**: Deliberately softening color detail while preserving + luminance sharpness + +### CLI Usage + +``` +# Compress chroma 2x, detail 3x (GS untouched) +gpr_tools -i input.dng -o output.gpr -q 4 -c "1.0,2.0,2.0,3.0" + +# Boost luminance quality, compress chroma mildly +gpr_tools -i input.dng -o output.gpr -q 3 -c "0.7,1.0,1.0,1.3" +``` + +## Recommended Presets + +| Use Case | Recommended | Size (100MP) | PSNR | +|----------|-------------|-------------|------| +| Proxy / preview / web gallery | Q0 | 21 MB | 40.0 dB | +| Event / documentary (light edit) | Q1 | 29 MB | 44.6 dB | +| Editorial / portrait | Q2 or Q3 | 39-46 MB | 48-50 dB | +| Professional (full edit workflow) | Q4 or Q5 | 53-67 MB | 51-52 dB | +| Archival / fine art | Q6 | 90 MB | 56.5 dB | +| Maximum quality (diminishing returns) | Q7 or Q8 | 91-92 MB | 59 dB | +| Truly lossless | Original DNG | 138 MB | infinite | diff --git a/source/app/gpr_tools/main.cpp b/source/app/gpr_tools/main.cpp index e182028d..f40e1007 100755 --- a/source/app/gpr_tools/main.cpp +++ b/source/app/gpr_tools/main.cpp @@ -65,11 +65,13 @@ class my_argument_parser : public argument_parser string rgb_file_resolution; int rgb_file_bits; - + string output_file_path; string apply_gpr_parameters; - + + int quality; + public: bool get_verbose() { return verbose; } @@ -98,9 +100,9 @@ class my_argument_parser : public argument_parser ("InputHeight,h", input_height, 3000, "Input image height in pixel samples [3000]") - ("InputPitch,p", input_pitch, 8000, "Input image 
pitch in bytes [8000]") + ("InputPitch,p", input_pitch, 0, "Input image pitch in bytes (0 = auto from width)") - ("InputPixelFormat,x", input_pixel_format, string("rggb14"), "Input pixel format \n(rggb12, rggb12p, [rggb14], gbrg12, gbrg12p)") + ("InputPixelFormat,x", input_pixel_format, string("rggb14"), "Input pixel format \n(rggb12, rggb12p, [rggb14], rggb16, gbrg12, gbrg12p, gbrg16)") ("ApplyGprParameters,a", apply_gpr_parameters, string(""), "Parameters to use for GPR or DNG file.") @@ -109,7 +111,9 @@ class my_argument_parser : public argument_parser ("RgbFileResolution,r", rgb_file_resolution, string(""), "Output RGB resolution \n[1:1, 2:1, 4:1, 8:1. 16:1]") ("RgbFileBits,b", rgb_file_bits, 8, "Output RGB bits [8]") - ("OutputFilePath,o", output_file_path, string(""), "Output file path.\n(files types: GPR, DNG, PPM, RAW, JPG)"); + ("OutputFilePath,o", output_file_path, string(""), "Output file path.\n(files types: GPR, DNG, PPM, RAW, JPG)") + + ("Quality,q", quality, -1, "Encoder quality\n(0=Low, 1=Medium, 2=High, 3=FS1, 4=FSX, 5=FS2, 6=FS3, 7=FS4, 8=FS5, -1=auto)"); ; } }; @@ -195,7 +199,7 @@ int main(int argc, char *argv []) { return dng_convert_main(args.input_file_path.c_str(), args.input_width, args.input_height, args.input_pitch, args.input_skip_rows, args.input_pixel_format.c_str(), args.output_file_path.c_str(), args.apply_gpr_parameters.c_str(), args.gpmf_file_path.c_str(), args.rgb_file_resolution.c_str(), args.rgb_file_bits, - args.jpg_preview_file_path.c_str(), args.jpg_preview_file_width, args.jpg_preview_file_height ); + args.jpg_preview_file_path.c_str(), args.jpg_preview_file_width, args.jpg_preview_file_height, args.quality ); } return 0; diff --git a/source/app/gpr_tools/main_c.c b/source/app/gpr_tools/main_c.c index b8a331a9..73cfb7ed 100755 --- a/source/app/gpr_tools/main_c.c +++ b/source/app/gpr_tools/main_c.c @@ -97,7 +97,7 @@ static FILE_TYPE GetFileType( const char* file_path ) int dng_convert_main(const char* input_file_path, unsigned 
int input_width, unsigned int input_height, size_t input_pitch, size_t input_skip_rows, const char* input_pixel_format, const char* output_file_path, const char* metadata_file_path, const char* gpmf_file_path, const char* rgb_file_resolution, int rgb_file_bits, - const char* jpg_preview_file_path, int jpg_preview_file_width, int jpg_preview_file_height ) + const char* jpg_preview_file_path, int jpg_preview_file_width, int jpg_preview_file_height, int quality ) { bool success; bool write_buffer_to_file = true; @@ -123,7 +123,8 @@ int dng_convert_main(const char* input_file_path, unsigned int input_width, uns gpr_parameters params; gpr_parameters_set_defaults(¶ms); - + params.quality = quality; + gpr_buffer input_buffer = { NULL, 0 }; if( read_from_file( &input_buffer, input_file_path, allocator.Alloc, allocator.Free ) != 0 ) @@ -144,53 +145,76 @@ int dng_convert_main(const char* input_file_path, unsigned int input_width, uns { params.input_width = input_width; params.input_height = input_height; - params.input_pitch = input_pitch; - + int32_t saturation_level = params.tuning_info.dgain_saturation_level.level_red; - + if( output_file_type == FILE_TYPE_GPR ) saturation_level = (1 << 14) - 1; else if( output_file_type == FILE_TYPE_DNG ) saturation_level = (1 << 12) - 1; - + if( strcmp(input_pixel_format, "rggb12") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_RGGB_12; - - if( input_pitch == -1 ) + + saturation_level = (1 << 12) - 1; + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = input_width * 2; } if( strcmp(input_pixel_format, "rggb12p") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_RGGB_12P; - - if( input_pitch == -1 ) + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = (input_width * 3 / 4) * 2; } else if( strcmp(input_pixel_format, "rggb14") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_RGGB_14; - + saturation_level = (1 << 14) - 1; - if( input_pitch == -1 ) + if( input_pitch == 0 || input_pitch 
== (size_t)-1 ) + input_pitch = input_width * 2; + } + else if( strcmp(input_pixel_format, "rggb16") == 0 ) + { + params.tuning_info.pixel_format = PIXEL_FORMAT_RGGB_16; + + saturation_level = (1 << 16) - 1; + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = input_width * 2; } else if( strcmp(input_pixel_format, "gbrg12") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_GBRG_12; - - if( input_pitch == -1 ) + + saturation_level = (1 << 12) - 1; + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = input_width * 2; } else if( strcmp(input_pixel_format, "gbrg12p") == 0 ) { params.tuning_info.pixel_format = PIXEL_FORMAT_GBRG_12P; - - if( input_pitch == -1 ) + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) input_pitch = (input_width * 3 / 4) * 2; } - + else if( strcmp(input_pixel_format, "gbrg16") == 0 ) + { + params.tuning_info.pixel_format = PIXEL_FORMAT_GBRG_16; + + saturation_level = (1 << 16) - 1; + + if( input_pitch == 0 || input_pitch == (size_t)-1 ) + input_pitch = input_width * 2; + } + + params.input_pitch = input_pitch; + params.tuning_info.dgain_saturation_level.level_red = saturation_level; params.tuning_info.dgain_saturation_level.level_green_even = saturation_level; params.tuning_info.dgain_saturation_level.level_green_odd = saturation_level; @@ -344,4 +368,3 @@ int dng_convert_main(const char* input_file_path, unsigned int input_width, uns return 0; } - diff --git a/source/app/gpr_tools/main_c.h b/source/app/gpr_tools/main_c.h index a8bd308e..6b2f59b1 100755 --- a/source/app/gpr_tools/main_c.h +++ b/source/app/gpr_tools/main_c.h @@ -25,7 +25,7 @@ extern "C" { int dng_convert_main(const char* input_file_path, unsigned int input_width, unsigned int input_height, size_t input_pitch, size_t input_skip_rows, const char* input_pixel_format, const char* output_file_path, const char* exiftool_file_path, const char* gpmf_file_path, const char* rgb_file_resolution, int rgb_file_bits, - const char* 
jpg_preview_file_path, int jpg_preview_file_width, int jpg_preview_file_height ); + const char* jpg_preview_file_path, int jpg_preview_file_width, int jpg_preview_file_height, int quality ); #ifdef __cplusplus } diff --git a/source/app/vc5_decoder_app/main.cpp b/source/app/vc5_decoder_app/main.cpp index f9255ed1..e82d050c 100755 --- a/source/app/vc5_decoder_app/main.cpp +++ b/source/app/vc5_decoder_app/main.cpp @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) file.fill( '0' ); file.width( 4 ); - file << (DecoderLogCurve[i] >> 4); + file << (DecoderLogCurve12[i] >> 4); file << endl; } diff --git a/source/lib/gpr_sdk/private/gpr.cpp b/source/lib/gpr_sdk/private/gpr.cpp index 516c0f5f..0861b28c 100755 --- a/source/lib/gpr_sdk/private/gpr.cpp +++ b/source/lib/gpr_sdk/private/gpr.cpp @@ -160,15 +160,26 @@ static void set_vc5_encoder_parameters( vc5_encoder_parameters& vc5_encoder_para case PIXEL_FORMAT_GBRG_12P: vc5_encoder_params.pixel_format = VC5_ENCODER_PIXEL_FORMAT_GBRG_12P; break; + + case PIXEL_FORMAT_RGGB_16: + vc5_encoder_params.pixel_format = VC5_ENCODER_PIXEL_FORMAT_RGGB_16; + break; + + case PIXEL_FORMAT_GBRG_16: + vc5_encoder_params.pixel_format = VC5_ENCODER_PIXEL_FORMAT_GBRG_16; + break; default: break; } - if( convert_params->fast_encoding ) + if( convert_params->quality >= 0 && convert_params->quality < VC5_ENCODER_QUALITY_SETTING_COUNT ) + vc5_encoder_params.quality_setting = (VC5_ENCODER_QUALITY_SETTING)convert_params->quality; + else if( convert_params->fast_encoding ) vc5_encoder_params.quality_setting = VC5_ENCODER_QUALITY_SETTING_MEDIUM; else vc5_encoder_params.quality_setting = VC5_ENCODER_QUALITY_SETTING_FS1; + } #endif @@ -185,20 +196,21 @@ void gpr_parameters_set_defaults(gpr_parameters* x) x->compute_md5sum = false; x->fast_encoding = false; + x->quality = -1; } void gpr_parameters_construct_copy(const gpr_parameters* y, gpr_parameters* x, gpr_malloc mem_alloc) { gpr_parameters_set_defaults(x); - + *x = *y; - + if( y->gpmf_payload.size > 
0 && y->gpmf_payload.buffer != NULL ) { x->gpmf_payload.buffer = mem_alloc( y->gpmf_payload.size ); memcpy( x->gpmf_payload.buffer, y->gpmf_payload.buffer, y->gpmf_payload.size ); } - + if( y->tuning_info.gain_map.size > 0 ) { for ( int i = 0; i < 4; i++) @@ -210,6 +222,29 @@ void gpr_parameters_construct_copy(const gpr_parameters* y, gpr_parameters* x, g } } } + + // Deep-copy HueSatMap data + if( y->profile_info.hue_sat_map_dims[0] > 0 ) + { + uint64_t count64 = (uint64_t)y->profile_info.hue_sat_map_dims[0] * + y->profile_info.hue_sat_map_dims[1] * + y->profile_info.hue_sat_map_dims[2]; + /* Sanity check: reject absurdly large maps (> ~8M entries = ~96MB) */ + if( count64 > 8000000 ) count64 = 0; + uint32_t count = (uint32_t)count64; + size_t data_size = count * 3 * sizeof(float); + + if( y->profile_info.hue_sat_map_data1 != NULL ) + { + x->profile_info.hue_sat_map_data1 = (float *)mem_alloc( data_size ); + memcpy( x->profile_info.hue_sat_map_data1, y->profile_info.hue_sat_map_data1, data_size ); + } + if( y->profile_info.hue_sat_map_data2 != NULL ) + { + x->profile_info.hue_sat_map_data2 = (float *)mem_alloc( data_size ); + memcpy( x->profile_info.hue_sat_map_data2, y->profile_info.hue_sat_map_data2, data_size ); + } + } } void gpr_parameters_destroy(gpr_parameters* x, gpr_free mem_free) @@ -218,7 +253,7 @@ void gpr_parameters_destroy(gpr_parameters* x, gpr_free mem_free) { mem_free( x->gpmf_payload.buffer ); } - + if( x->tuning_info.gain_map.size > 0 ) { for ( int i = 0; i < 4; i++) @@ -227,6 +262,17 @@ void gpr_parameters_destroy(gpr_parameters* x, gpr_free mem_free) mem_free( x->tuning_info.gain_map.buffers[i] ); } } + + if( x->profile_info.hue_sat_map_data1 ) + { + mem_free( x->profile_info.hue_sat_map_data1 ); + x->profile_info.hue_sat_map_data1 = NULL; + } + if( x->profile_info.hue_sat_map_data2 ) + { + mem_free( x->profile_info.hue_sat_map_data2 ); + x->profile_info.hue_sat_map_data2 = NULL; + } } @@ -443,11 +489,21 @@ static void 
convert_dng_exif_to_dng_exif_info( gpr_exif_info* dst_exif, const dn dst_exif->date_time_original = convert_to_dng_date_and_time( src_exif->fDateTimeOriginal.DateTime() ); dst_exif->date_time_digitized = convert_to_dng_date_and_time( src_exif->fDateTimeOriginal.DateTime() ); - assert(src_exif->fSoftware.Length() < sizeof(dst_exif->software_version)); - memcpy( dst_exif->software_version, src_exif->fSoftware.Get(), src_exif->fSoftware.Length() ); + { + size_t len = src_exif->fSoftware.Length(); + if (len >= sizeof(dst_exif->software_version)) + len = sizeof(dst_exif->software_version) - 1; + memcpy( dst_exif->software_version, src_exif->fSoftware.Get(), len ); + dst_exif->software_version[len] = '\0'; + } - assert(src_exif->fUserComment.Length() < sizeof(dst_exif->user_comment)); - memcpy( dst_exif->user_comment, src_exif->fUserComment.Get(), src_exif->fUserComment.Length() ); + { + size_t len = src_exif->fUserComment.Length(); + if (len >= sizeof(dst_exif->user_comment)) + len = sizeof(dst_exif->user_comment) - 1; + memcpy( dst_exif->user_comment, src_exif->fUserComment.Get(), len ); + dst_exif->user_comment[len] = '\0'; + } // GPS Info gpr_gps_info& dst_gps_info = dst_exif->gps_info; @@ -652,11 +708,12 @@ static bool read_dng(const gpr_allocator* allocator, convert_params->input_pitch = convert_params->input_width * 2; // Copy ColorMatrix1, ColorMatrix2 + if (negative->ProfileCount() > 0) { const dng_camera_profile &profile_info = negative->ProfileByIndex( 0 ); const dng_matrix &m1 = profile_info.ColorMatrix1(); const dng_matrix &m2 = profile_info.ColorMatrix2(); - + for (i = 0; i < 3; i++) { for (j = 0; j < 3; j++) @@ -665,17 +722,91 @@ static bool read_dng(const gpr_allocator* allocator, convert_params->profile_info.color_matrix_2[i][j] = m2[i][j]; } } - + convert_params->profile_info.compute_color_matrix = false; convert_params->profile_info.matrix_weighting = 1.0; - + memset( convert_params->profile_info.wb1, 0, sizeof(convert_params->profile_info.wb1) ); 
memset( convert_params->profile_info.wb2, 0, sizeof(convert_params->profile_info.wb2) ); - + memset( convert_params->profile_info.cam_to_srgb_1, 0, sizeof(convert_params->profile_info.cam_to_srgb_1) ); memset( convert_params->profile_info.cam_to_srgb_2, 0, sizeof(convert_params->profile_info.cam_to_srgb_2) ); + + // Copy CalibrationIlluminant values + convert_params->profile_info.illuminant1 = profile_info.CalibrationIlluminant1(); + convert_params->profile_info.illuminant2 = profile_info.CalibrationIlluminant2(); + + // Copy ForwardMatrix if present + const dng_matrix &fm1 = profile_info.ForwardMatrix1(); + const dng_matrix &fm2 = profile_info.ForwardMatrix2(); + if (!fm1.IsEmpty() && !fm2.IsEmpty()) + { + convert_params->profile_info.has_forward_matrix = true; + for (i = 0; i < 3; i++) + for (j = 0; j < 3; j++) + { + convert_params->profile_info.forward_matrix_1[i][j] = fm1[i][j]; + convert_params->profile_info.forward_matrix_2[i][j] = fm2[i][j]; + } + } + else + { + convert_params->profile_info.has_forward_matrix = false; + } } - + + // Copy BaselineExposure and AnalogBalance + { + convert_params->profile_info.baseline_exposure = negative->BaselineExposure(); + + convert_params->profile_info.analog_balance[0] = negative->AnalogBalance(0); + convert_params->profile_info.analog_balance[1] = negative->AnalogBalance(1); + convert_params->profile_info.analog_balance[2] = negative->AnalogBalance(2); + } + + // Copy ProfileHueSatMapData if present + if (negative->ProfileCount() > 0) + { + const dng_camera_profile &cam_profile = negative->ProfileByIndex( 0 ); + if (cam_profile.HasHueSatDeltas()) + { + const dng_hue_sat_map &hsm1 = cam_profile.HueSatDeltas1(); + uint32 hDiv, sDiv, vDiv; + hsm1.GetDivisions(hDiv, sDiv, vDiv); + uint64 count64 = (uint64)hDiv * sDiv * vDiv; + + convert_params->profile_info.hue_sat_map_dims[0] = hDiv; + convert_params->profile_info.hue_sat_map_dims[1] = sDiv; + convert_params->profile_info.hue_sat_map_dims[2] = vDiv; + 
convert_params->profile_info.hue_sat_map_encoding = cam_profile.HueSatMapEncoding(); + + /* Sanity check: reject absurdly large maps */ + if (count64 > 8000000) + { + count64 = 0; + convert_params->profile_info.hue_sat_map_dims[0] = 0; + convert_params->profile_info.hue_sat_map_dims[1] = 0; + convert_params->profile_info.hue_sat_map_dims[2] = 0; + } + uint32 count = (uint32)count64; + + // Copy data1 (3 floats per entry: hueShift, satScale, valScale) + size_t data_size = count * 3 * sizeof(float); + convert_params->profile_info.hue_sat_map_data1 = (float *)allocator->Alloc(data_size); + const dng_hue_sat_map::HSBModify *src1 = hsm1.GetConstDeltas(); + memcpy(convert_params->profile_info.hue_sat_map_data1, src1, data_size); + + // Copy data2 if present + const dng_hue_sat_map &hsm2 = cam_profile.HueSatDeltas2(); + if (hsm2.IsValid()) + { + convert_params->profile_info.hue_sat_map_data2 = (float *)allocator->Alloc(data_size); + const dng_hue_sat_map::HSBModify *src2 = hsm2.GetConstDeltas(); + memcpy(convert_params->profile_info.hue_sat_map_data2, src2, data_size); + } + } + } + // Set Exif Info convert_dng_exif_to_dng_exif_info( &convert_params->exif_info, negative->GetExif() ); @@ -694,20 +825,27 @@ static bool read_dng(const gpr_allocator* allocator, tuning_info.wb_gains.b_gain = 1 / camNeutral[2]; } - const dng_linearization_info& linearization_info = *negative->GetLinearizationInfo(); - + const dng_linearization_info *linearization_ptr = negative->GetLinearizationInfo(); + if (linearization_ptr != NULL) { + const dng_linearization_info& linearization_info = *linearization_ptr; + gpr_static_black_level& static_black_level = tuning_info.static_black_level; - - static_black_level.r_black = linearization_info.fBlackLevel[0][0][0]; - static_black_level.g_r_black = linearization_info.fBlackLevel[0][1][0]; - static_black_level.g_b_black = linearization_info.fBlackLevel[1][0][0]; - static_black_level.b_black = linearization_info.fBlackLevel[1][1][0]; - } - - { + + // 
Index with modulo repeat dims so that a 1x1 pattern + // (uniform black level) replicates to all four channels + uint32 rr = linearization_info.fBlackLevelRepeatRows; + uint32 rc = linearization_info.fBlackLevelRepeatCols; + if (rr < 1) rr = 1; + if (rc < 1) rc = 1; + + static_black_level.r_black = linearization_info.fBlackLevel[0 % rr][0 % rc][0]; + static_black_level.g_r_black = linearization_info.fBlackLevel[0 % rr][1 % rc][0]; + static_black_level.g_b_black = linearization_info.fBlackLevel[1 % rr][0 % rc][0]; + static_black_level.b_black = linearization_info.fBlackLevel[1 % rr][1 % rc][0]; + gpr_saturation_level& dgain_saturation_level = tuning_info.dgain_saturation_level; - + dgain_saturation_level.level_red = linearization_info.fWhiteLevel[0]; dgain_saturation_level.level_green_even = dgain_saturation_level.level_red; dgain_saturation_level.level_green_odd = dgain_saturation_level.level_red; @@ -721,44 +859,38 @@ static bool read_dng(const gpr_allocator* allocator, bool rggb_raw = (rawIFD.fCFAPattern[0][0] == 0) && (rawIFD.fCFAPattern[0][1] == 1) && (rawIFD.fCFAPattern[1][0] == 1) && (rawIFD.fCFAPattern[1][1] == 2); + // Determine bit depth from the maximum saturation level + uint32_t max_sat = dgain_saturation_level.level_red; + if (dgain_saturation_level.level_green_even > max_sat) max_sat = dgain_saturation_level.level_green_even; + if (dgain_saturation_level.level_green_odd > max_sat) max_sat = dgain_saturation_level.level_green_odd; + if (dgain_saturation_level.level_blue > max_sat) max_sat = dgain_saturation_level.level_blue; + if( rggb_raw ) { - if( dgain_saturation_level.level_red == 4095 && - dgain_saturation_level.level_green_even == 4095 && - dgain_saturation_level.level_green_odd == 4095 && - dgain_saturation_level.level_blue == 4095 ) + if( max_sat <= 4095 ) { tuning_info.pixel_format = PIXEL_FORMAT_RGGB_12; } - else if(dgain_saturation_level.level_red == 16383 && - dgain_saturation_level.level_green_even == 16383 && - 
dgain_saturation_level.level_green_odd == 16383 && - dgain_saturation_level.level_blue == 16383 ) + else if( max_sat <= 16383 ) { tuning_info.pixel_format = PIXEL_FORMAT_RGGB_14; } else { - assert(0); - return false; + tuning_info.pixel_format = PIXEL_FORMAT_RGGB_16; } } else { - if( dgain_saturation_level.level_red == 4095 && - dgain_saturation_level.level_green_even == 4095 && - dgain_saturation_level.level_green_odd == 4095 && - dgain_saturation_level.level_blue == 4095 ) + if( max_sat <= 4095 ) { tuning_info.pixel_format = PIXEL_FORMAT_GBRG_12; } else { - assert(0); - return false; + // No GBRG_14 enum; use GBRG_16 for 14-bit and 16-bit + tuning_info.pixel_format = PIXEL_FORMAT_GBRG_16; } - - } } @@ -766,14 +898,19 @@ static bool read_dng(const gpr_allocator* allocator, if ( negative->HasNoiseProfile() ) { dng_noise_profile noise_profile = negative->NoiseProfile(); - + dng_noise_function noise_function = noise_profile.NoiseFunction (0); tuning_info.noise_scale = noise_function.Scale(); tuning_info.noise_offset = noise_function.Offset(); - //LogPrint( "Noise profile s = %f, o = %f ", tuning_info.noise_scale, tuning_info.noise_offset ); } - + + // DefaultCropOrigin and DefaultCropSize + tuning_info.default_crop_origin_h = Round_uint32(negative->DefaultCropOriginH().As_real64()); + tuning_info.default_crop_origin_v = Round_uint32(negative->DefaultCropOriginV().As_real64()); + tuning_info.default_crop_size_h = Round_uint32(negative->DefaultCropSizeH().As_real64()); + tuning_info.default_crop_size_v = Round_uint32(negative->DefaultCropSizeV().As_real64()); + // GainMap dng_opcode_list &opcodelist2 = negative->OpcodeList2 (); uint32_t count = opcodelist2.Count (); @@ -1049,6 +1186,14 @@ static void write_dng(const gpr_allocator* allocator, case PIXEL_FORMAT_GBRG_12: vc5_decoder_params.pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_12; break; + + case PIXEL_FORMAT_RGGB_16: + vc5_decoder_params.pixel_format = VC5_DECODER_PIXEL_FORMAT_RGGB_16; + break; + + case 
PIXEL_FORMAT_GBRG_16: + vc5_decoder_params.pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_16; + break; default: assert(0); @@ -1105,6 +1250,7 @@ static void write_dng(const gpr_allocator* allocator, case PIXEL_FORMAT_RGGB_12: case PIXEL_FORMAT_RGGB_12P: case PIXEL_FORMAT_RGGB_14: + case PIXEL_FORMAT_RGGB_16: negative->SetQuadBlacks(static_black_level.r_black, static_black_level.g_r_black, static_black_level.g_b_black, @@ -1113,6 +1259,7 @@ static void write_dng(const gpr_allocator* allocator, break; case PIXEL_FORMAT_GBRG_12: case PIXEL_FORMAT_GBRG_12P: + case PIXEL_FORMAT_GBRG_16: negative->SetQuadBlacks(static_black_level.g_b_black, static_black_level.b_black, static_black_level.r_black, @@ -1205,13 +1352,20 @@ static void write_dng(const gpr_allocator* allocator, //GP!! NEED outputWidth, activeWidth, outputHeight, activeHeight here negative->SetDefaultScale(dng_urational(outputWidth, activeWidth), dng_urational(outputHeight, activeHeight)); - uint32 crop_size_val = 0; - dng_point crop_origin( 0, 0 ); - dng_point crop_size( activeHeight - 2 * crop_size_val, activeWidth - 2 * crop_size_val ); - + uint32 crop_origin_h = convert_params->tuning_info.default_crop_origin_h; + uint32 crop_origin_v = convert_params->tuning_info.default_crop_origin_v; + dng_point crop_origin( crop_origin_v, crop_origin_h ); + + /* Use stored crop size when available; fall back to symmetric formula */ + uint32 crop_size_h = convert_params->tuning_info.default_crop_size_h; + uint32 crop_size_v = convert_params->tuning_info.default_crop_size_v; + if (crop_size_h == 0) crop_size_h = activeWidth - 2 * crop_origin_h; + if (crop_size_v == 0) crop_size_v = activeHeight - 2 * crop_origin_v; + dng_point crop_size( crop_size_v, crop_size_h ); + negative->SetDefaultCropOrigin( crop_origin.h, crop_origin.v ); negative->SetDefaultCropSize( crop_size.h, crop_size.v ); - + negative->SetOriginalDefaultCropSize( dng_urational(crop_size.h, 1), dng_urational(crop_size.v, 1) ); { @@ -1240,11 +1394,11 @@ static 
void write_dng(const gpr_allocator* allocator, negative->SetColorKeys(colorCodes[0], colorCodes[1], colorCodes[2], colorCodes[3]); // Set Bayer Pattern - if( convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_12 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_12P || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_14 ) + if( convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_12 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_12P || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_14 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_RGGB_16 ) { negative->SetBayerMosaic(1); } - else if( convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_12 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_12P ) + else if( convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_12 || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_12P || convert_params->tuning_info.pixel_format == PIXEL_FORMAT_GBRG_16 ) { negative->SetBayerMosaic(3); } @@ -1254,14 +1408,16 @@ static void write_dng(const gpr_allocator* allocator, return; } - negative->SetBaselineExposure(0); + negative->SetBaselineExposure(profile_info->baseline_exposure); negative->SetBaselineNoise(1.0); negative->SetBaselineSharpness(1.0); - + negative->SetAntiAliasStrength(dng_urational(100, 100)); negative->SetLinearResponseLimit(1.0); negative->SetShadowScale( dng_urational(1, 1) ); - negative->SetAnalogBalance(dng_vector_3(1.0, 1.0, 1.0)); + negative->SetAnalogBalance(dng_vector_3(profile_info->analog_balance[0], + profile_info->analog_balance[1], + profile_info->analog_balance[2])); AutoPtr<dng_camera_profile> prof(new dng_camera_profile); prof->SetName( camera_make_and_model ); @@ -1314,7 +1470,46 @@ static void write_dng(const gpr_allocator* allocator, prof->SetCalibrationIlluminant1(profile_info->illuminant1); prof->SetCalibrationIlluminant2(profile_info->illuminant2); - + + if 
(profile_info->has_forward_matrix) + { + dng_matrix_3by3 fm1; + dng_matrix_3by3 fm2; + for (i = 0; i < 3; i++) + for (j = 0; j < 3; j++) + { + fm1[i][j] = profile_info->forward_matrix_1[i][j]; + fm2[i][j] = profile_info->forward_matrix_2[i][j]; + } + prof->SetForwardMatrix1(fm1); + prof->SetForwardMatrix2(fm2); + } + + if (profile_info->hue_sat_map_dims[0] > 0 && profile_info->hue_sat_map_data1 != NULL) + { + uint32 hDiv = profile_info->hue_sat_map_dims[0]; + uint32 sDiv = profile_info->hue_sat_map_dims[1]; + uint32 vDiv = profile_info->hue_sat_map_dims[2]; + uint32 count = hDiv * sDiv * vDiv; + + dng_hue_sat_map hsm1; + hsm1.SetDivisions(hDiv, sDiv, vDiv); + dng_hue_sat_map::HSBModify *dst1 = hsm1.GetDeltas(); + memcpy(dst1, profile_info->hue_sat_map_data1, count * 3 * sizeof(float)); + prof->SetHueSatDeltas1(hsm1); + + if (profile_info->hue_sat_map_data2 != NULL) + { + dng_hue_sat_map hsm2; + hsm2.SetDivisions(hDiv, sDiv, vDiv); + dng_hue_sat_map::HSBModify *dst2 = hsm2.GetDeltas(); + memcpy(dst2, profile_info->hue_sat_map_data2, count * 3 * sizeof(float)); + prof->SetHueSatDeltas2(hsm2); + } + + prof->SetHueSatMapEncoding(profile_info->hue_sat_map_encoding); + } + negative->AddProfile(prof); dng_exif* const exif = negative->GetExif(); @@ -1633,22 +1828,29 @@ bool gpr_convert_dng_to_gpr(const gpr_allocator* allocator, TIMESTAMP("[BEG]", 1) gpr_buffer_auto raw_buffer(allocator->Alloc, allocator->Free); - + dng_memory_stream inp_dng_stream( gDefaultDNGMemoryAllocator ); inp_dng_stream.Put( inp_dng_buffer->buffer, inp_dng_buffer->size ); inp_dng_stream.SetReadPosition(0); - - if( read_dng( allocator, &inp_dng_stream, &raw_buffer, NULL, NULL ) == false ) + + // Extract metadata from input DNG into a mutable copy of parameters + gpr_parameters params_with_meta; + gpr_parameters_construct_copy( parameters, ¶ms_with_meta, allocator->Alloc ); + + if( read_dng( allocator, &inp_dng_stream, &raw_buffer, NULL, ¶ms_with_meta ) == false ) { + gpr_parameters_destroy( 
¶ms_with_meta, allocator->Free ); assert(0); return false; } - + dng_memory_stream out_gpr_stream( gDefaultDNGMemoryAllocator ); - - write_dng( allocator, &out_gpr_stream, &raw_buffer, true, NULL, parameters ); - + + write_dng( allocator, &out_gpr_stream, &raw_buffer, true, NULL, ¶ms_with_meta ); + write_dngstream_to_buffer( &out_gpr_stream, out_gpr_buffer, allocator->Alloc, allocator->Free ); - + + gpr_parameters_destroy( ¶ms_with_meta, allocator->Free ); + TIMESTAMP("[END]", 1) return true; @@ -1749,22 +1951,29 @@ bool gpr_convert_gpr_to_dng(const gpr_allocator* allocator, gpr_buffer_auto raw_buffer(allocator->Alloc, allocator->Free); gpr_buffer_auto vc5_buffer(allocator->Alloc, allocator->Free); - + dng_memory_stream inp_gpr_stream( gDefaultDNGMemoryAllocator ); inp_gpr_stream.Put( inp_gpr_buffer->buffer, inp_gpr_buffer->size ); inp_gpr_stream.SetReadPosition(0); - - if( read_dng( allocator, &inp_gpr_stream, &raw_buffer, &vc5_buffer, NULL ) == false ) + + // Extract metadata from GPR (which is a DNG) into a mutable copy of parameters + gpr_parameters params_with_meta; + gpr_parameters_construct_copy( parameters, ¶ms_with_meta, allocator->Alloc ); + + if( read_dng( allocator, &inp_gpr_stream, &raw_buffer, &vc5_buffer, ¶ms_with_meta ) == false ) { + gpr_parameters_destroy( ¶ms_with_meta, allocator->Free ); assert(0); return false; } - + dng_memory_stream out_dng_stream( gDefaultDNGMemoryAllocator ); - - write_dng( allocator, &out_dng_stream, &raw_buffer, false, NULL, parameters ); - + + write_dng( allocator, &out_dng_stream, &raw_buffer, false, NULL, ¶ms_with_meta ); + write_dngstream_to_buffer( &out_dng_stream, out_dng_buffer, allocator->Alloc, allocator->Free ); - + + gpr_parameters_destroy( ¶ms_with_meta, allocator->Free ); + TIMESTAMP("[END]", 1) return true; @@ -1845,5 +2054,3 @@ bool gpr_check_vc5( const gpr_allocator* allocator, return is_vc5_format; } - - diff --git a/source/lib/gpr_sdk/private/gpr_profile_info.cpp 
b/source/lib/gpr_sdk/private/gpr_profile_info.cpp index 93ad542b..b69de16a 100755 --- a/source/lib/gpr_sdk/private/gpr_profile_info.cpp +++ b/source/lib/gpr_sdk/private/gpr_profile_info.cpp @@ -53,5 +53,21 @@ void gpr_profile_info_set_defaults(gpr_profile_info* x) memset( x->color_matrix_1, 0, sizeof(x->color_matrix_1) ); memset( x->color_matrix_2, 0, sizeof(x->color_matrix_2) ); + + memset( x->forward_matrix_1, 0, sizeof(x->forward_matrix_1) ); + memset( x->forward_matrix_2, 0, sizeof(x->forward_matrix_2) ); + x->has_forward_matrix = false; + + x->baseline_exposure = 0.0; + x->analog_balance[0] = 1.0; + x->analog_balance[1] = 1.0; + x->analog_balance[2] = 1.0; + + x->hue_sat_map_dims[0] = 0; + x->hue_sat_map_dims[1] = 0; + x->hue_sat_map_dims[2] = 0; + x->hue_sat_map_data1 = NULL; + x->hue_sat_map_data2 = NULL; + x->hue_sat_map_encoding = 0; } diff --git a/source/lib/gpr_sdk/private/gpr_read_image.cpp b/source/lib/gpr_sdk/private/gpr_read_image.cpp index 84c2fe97..10fa1981 100755 --- a/source/lib/gpr_sdk/private/gpr_read_image.cpp +++ b/source/lib/gpr_sdk/private/gpr_read_image.cpp @@ -53,9 +53,9 @@ static bool DecodeVC5(dng_image &image, gpr_buffer_auto& vc5_buffer, VC5_DECODER } raw_buffer.set( raw_image.buffer, raw_image.size, true ); - + dng_point size = image.Bounds().Size(); - + CopyBufferToRawImage( raw_buffer, size.h, image ); return true; @@ -92,15 +92,35 @@ void gpr_read_image::ReadTile (dng_host &host, { bool rggb_raw = (ifd.fCFAPattern[0][0] == 0) && (ifd.fCFAPattern[0][1] == 1) && (ifd.fCFAPattern[1][0] == 1) && (ifd.fCFAPattern[1][1] == 2); + // Use WhiteLevel to infer actual bit depth, NOT BitsPerSample + // (BitsPerSample is always 16 for uint16 DNG storage) + uint32 white_level = (uint32)ifd.fWhiteLevel[0]; + uint32 bit_depth; + if (white_level <= 4095) + bit_depth = 12; + else if (white_level <= 16383) + bit_depth = 14; + else + bit_depth = 16; VC5_DECODER_PIXEL_FORMAT pixel_format; - - if( rggb_raw ) + + if (rggb_raw) { - pixel_format = 
VC5_DECODER_PIXEL_FORMAT_RGGB_14; + if (bit_depth >= 16) + pixel_format = VC5_DECODER_PIXEL_FORMAT_RGGB_16; + else if (bit_depth > 12) + pixel_format = VC5_DECODER_PIXEL_FORMAT_RGGB_14; + else + pixel_format = VC5_DECODER_PIXEL_FORMAT_RGGB_12; } else { - pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_12; + if (bit_depth >= 16) + pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_16; + else if (bit_depth > 12) + pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_14; + else + pixel_format = VC5_DECODER_PIXEL_FORMAT_GBRG_12; } if( DecodeVC5( image, *_vc5_buffer, pixel_format ) ) diff --git a/source/lib/gpr_sdk/private/gpr_tuning_info.cpp b/source/lib/gpr_sdk/private/gpr_tuning_info.cpp index 45ff9f78..4c250058 100755 --- a/source/lib/gpr_sdk/private/gpr_tuning_info.cpp +++ b/source/lib/gpr_sdk/private/gpr_tuning_info.cpp @@ -79,6 +79,12 @@ void gpr_tuning_info_set_defaults( gpr_tuning_info* x ) _gain_map_set_defaults( x ); x->pixel_format = PIXEL_FORMAT_RGGB_14; + + x->default_crop_origin_h = 0; + x->default_crop_origin_v = 0; + + x->default_crop_size_h = 0; + x->default_crop_size_v = 0; } diff --git a/source/lib/gpr_sdk/public/gpr.h b/source/lib/gpr_sdk/public/gpr.h index 666e7278..48a3a11f 100755 --- a/source/lib/gpr_sdk/public/gpr.h +++ b/source/lib/gpr_sdk/public/gpr.h @@ -53,7 +53,9 @@ unsigned int input_pitch; /* Pitch of input source in pixels (only applies to raw input) */ bool fast_encoding; - + + int quality; /* Quality setting (0-5), or -1 for auto */ + bool compute_md5sum; gpr_buffer gpmf_payload; /* GPMF payload of image file */ @@ -67,7 +69,7 @@ gpr_profile_info profile_info; /* Camera color profile info object */ gpr_tuning_info tuning_info; /* Camera tuning info object */ - + } gpr_parameters; void gpr_parameters_set_defaults(gpr_parameters* x); diff --git a/source/lib/gpr_sdk/public/gpr_profile_info.h b/source/lib/gpr_sdk/public/gpr_profile_info.h index 5fc37b39..c39b6f71 100755 --- a/source/lib/gpr_sdk/public/gpr_profile_info.h +++ 
b/source/lib/gpr_sdk/public/gpr_profile_info.h @@ -45,10 +45,23 @@ Matrix color_matrix_1; Matrix color_matrix_2; - + + Matrix forward_matrix_1; + Matrix forward_matrix_2; + bool has_forward_matrix; + uint16_t illuminant1; uint16_t illuminant2; - + + double baseline_exposure; + double analog_balance[3]; + + /* ProfileHueSatMapData — per-hue color correction LUT */ + uint32_t hue_sat_map_dims[3]; /* [hue, sat, val] divisions */ + float *hue_sat_map_data1; /* illuminant 1: dims[0]*dims[1]*dims[2]*3 floats */ + float *hue_sat_map_data2; /* illuminant 2 (may be NULL) */ + uint32_t hue_sat_map_encoding; + } gpr_profile_info; void gpr_profile_info_set_defaults(gpr_profile_info* x); diff --git a/source/lib/gpr_sdk/public/gpr_tuning_info.h b/source/lib/gpr_sdk/public/gpr_tuning_info.h index 677d68d6..5bcca813 100755 --- a/source/lib/gpr_sdk/public/gpr_tuning_info.h +++ b/source/lib/gpr_sdk/public/gpr_tuning_info.h @@ -51,6 +51,10 @@ PIXEL_FORMAT_GBRG_12 = 3, // GBRG 12bit pixels packed into 16bits PIXEL_FORMAT_GBRG_12P = 4, // GBRG 12bit pixels packed into 12bits + + PIXEL_FORMAT_RGGB_16 = 5, // RGGB 16bit pixels packed into 16bits + + PIXEL_FORMAT_GBRG_16 = 6, // GBRG 16bit pixels packed into 16bits } GPR_PIXEL_FORMAT; @@ -133,9 +137,15 @@ double warp_blue_coefficient; gpr_gain_map gain_map; - + GPR_PIXEL_FORMAT pixel_format; + uint32_t default_crop_origin_h; + uint32_t default_crop_origin_v; + + uint32_t default_crop_size_h; + uint32_t default_crop_size_v; + } gpr_tuning_info; int32_t gpr_tuning_info_get_dgain_saturation_level(const gpr_tuning_info* x, GPR_RAW_CHANNEL channel); diff --git a/source/lib/vc5_common/bitstream.c b/source/lib/vc5_common/bitstream.c index cef43121..9b94b9de 100755 --- a/source/lib/vc5_common/bitstream.c +++ b/source/lib/vc5_common/bitstream.c @@ -238,7 +238,12 @@ CODEC_ERROR PutBits(BITSTREAM *stream, BITWORD bits, BITCOUNT count) } // Write the bit buffer to the byte stream - PutWord(stream->stream, stream->buffer ); + CODEC_ERROR err = 
PutWord(stream->stream, stream->buffer ); + if (err != CODEC_ERROR_OKAY) + { + stream->error = BITSTREAM_ERROR_OVERFLOW; + return err; + } // Insert the remaining input bits into the bit buffer stream->buffer = (bits << (bit_word_count - count)); @@ -291,7 +296,12 @@ CODEC_ERROR PutBuffer(BITSTREAM *bitstream) assert(bitstream->count == bit_word_count); // Write the bit buffer to the byte stream - PutWord(bitstream->stream, bitstream->buffer ); + CODEC_ERROR err = PutWord(bitstream->stream, bitstream->buffer ); + if (err != CODEC_ERROR_OKAY) + { + bitstream->error = BITSTREAM_ERROR_OVERFLOW; + return err; + } // Empty the bit buffer bitstream->buffer = 0; @@ -375,7 +385,11 @@ CODEC_ERROR FlushBitstream(BITSTREAM *bitstream) if (bitstream->count > 0) { // Write the bit buffer to the output stream - PutBuffer(bitstream); + CODEC_ERROR err = PutBuffer(bitstream); + if (err != CODEC_ERROR_OKAY) + { + return err; + } } // Indicate that the bitstream buffer is empty @@ -402,7 +416,11 @@ CODEC_ERROR FlushBitstream(BITSTREAM *bitstream) size_t GetBitstreamPosition(BITSTREAM *bitstream) { if (bitstream->count == bit_word_count) { - PutBuffer(bitstream); + CODEC_ERROR err = PutBuffer(bitstream); + if (err != CODEC_ERROR_OKAY) + { + return 0; + } } // The bit buffer must be empty @@ -442,4 +460,3 @@ CODEC_ERROR RewindBitstream(BITSTREAM *bitstream) return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_common/companding.c b/source/lib/vc5_common/companding.c index 00b0fb56..fb881f8e 100755 --- a/source/lib/vc5_common/companding.c +++ b/source/lib/vc5_common/companding.c @@ -31,11 +31,52 @@ /*! @brief Maximum coefficient magnitude in the codebook - + @todo Need to calculate the maximum value from the codebook */ const int maximum_codebook_value = 255; +/*! + @brief Precomputed integer LUT for inverse cubic companding + + Maps magnitude m (0-255) to m + m^3 * 768 / (255^3). + Eliminates double-precision floating point from the decoder's inner loops. 
+*/ +static int32_t uncompand_lut[256]; +static int uncompand_lut_initialized = 0; + +void InitUncompandTable(void) +{ + if (uncompand_lut_initialized) + return; + + uncompand_lut[0] = 0; + for (int i = 1; i < 256; i++) + { + // Use the same formula as UncompandedValue but compute once + double cubic = (double)i; + cubic *= i; + cubic *= i; + cubic *= 768; + cubic /= 255.0 * 255.0 * 255.0; + uncompand_lut[i] = i + (int32_t)cubic; + } + + uncompand_lut_initialized = 1; +} + +int32_t UncompandedValueFast(int32_t value) +{ + int32_t magnitude = absolute(value); + + if (magnitude > 255) + magnitude = 255; + + magnitude = uncompand_lut[magnitude]; + + return ((value < 0) ? neg(magnitude) : magnitude); +} + /*! @brief Apply the default companding curve to the specified value diff --git a/source/lib/vc5_common/companding.h b/source/lib/vc5_common/companding.h index 5931e73c..d72a379c 100755 --- a/source/lib/vc5_common/companding.h +++ b/source/lib/vc5_common/companding.h @@ -34,7 +34,11 @@ extern "C" { // Invert the companding curve applied to a quantized coefficient magnitude (for debugging) int32_t UncompandedValue(int32_t value); - + + // Fast integer LUT-based uncompanding (no floating point) + void InitUncompandTable(void); + int32_t UncompandedValueFast(int32_t value); + PIXEL UncompandedPixel(PIXEL value); CODEC_ERROR InvertCompanding(PIXEL *image, DIMENSION width, DIMENSION height, DIMENSION pitch); diff --git a/source/lib/vc5_common/config.h b/source/lib/vc5_common/config.h index 5169510d..f07a1f60 100755 --- a/source/lib/vc5_common/config.h +++ b/source/lib/vc5_common/config.h @@ -45,7 +45,8 @@ //TODO: The maximum number of channels and wavelets depends on the profile //! Internal precision of the intermediate results after unpacking -static const int default_internal_precision = 12; +// Raised to support higher bit-depth inputs; PIXEL is widened to 32-bit. 
+static const int default_internal_precision = 16; //TODO: Change the global variable from internal_precision to encoded_precision? diff --git a/source/lib/vc5_common/image.c b/source/lib/vc5_common/image.c index 0af14fe1..92766452 100755 --- a/source/lib/vc5_common/image.c +++ b/source/lib/vc5_common/image.c @@ -126,10 +126,13 @@ DIMENSION ImagePitch(DIMENSION width, PIXEL_FORMAT format) switch (format) { + case PIXEL_FORMAT_RAW_RGGB_12: case PIXEL_FORMAT_RAW_RGGB_14: + case PIXEL_FORMAT_RAW_RGGB_16: case PIXEL_FORMAT_RAW_GBRG_12: case PIXEL_FORMAT_RAW_GBRG_12P: - case PIXEL_FORMAT_RAW_RGGB_16: + case PIXEL_FORMAT_RAW_GBRG_14: + case PIXEL_FORMAT_RAW_GBRG_16: // Half the width of the image times 2 samples times 2 bytes per sample pitch = width * sizeof(uint16_t); break; diff --git a/source/lib/vc5_common/image.h b/source/lib/vc5_common/image.h index 47397272..4f7717f4 100755 --- a/source/lib/vc5_common/image.h +++ b/source/lib/vc5_common/image.h @@ -21,8 +21,8 @@ #ifndef IMAGE_H #define IMAGE_H -//! Data type for the values in a component array -typedef uint16_t COMPONENT_VALUE; +//! Data type for the values in a component array (widened for 16-bit pipeline) +typedef int32_t COMPONENT_VALUE; /*! 
@brief Data structure for an image input to the unpacking process diff --git a/source/lib/vc5_common/logcurve.c b/source/lib/vc5_common/logcurve.c index 45aecbf1..c1760b27 100755 --- a/source/lib/vc5_common/logcurve.c +++ b/source/lib/vc5_common/logcurve.c @@ -20,39 +20,65 @@ #include "common.h" -uint16_t EncoderLogCurve[LOG_CURVE_TABLE_LENGTH]; +int vc5_logcurve_bypass(void) +{ + return 0; +} -uint16_t DecoderLogCurve[LOG_CURVE_TABLE_LENGTH]; +uint16_t EncoderLogCurve12[LOG_CURVE_TABLE_LENGTH_12]; +uint16_t EncoderLogCurve14[LOG_CURVE_TABLE_LENGTH_14]; +uint16_t EncoderLogCurve16[LOG_CURVE_TABLE_LENGTH_16]; -void SetupDecoderLogCurve(void) +uint16_t DecoderLogCurve12[LOG_CURVE_TABLE_LENGTH_12]; +uint16_t DecoderLogCurve14[LOG_CURVE_TABLE_LENGTH_14]; +uint16_t DecoderLogCurve16[LOG_CURVE_TABLE_LENGTH_16]; + +static void SetupDecoderCurve(uint16_t *table, int bits) { - int i; - const int log_table_size = sizeof(DecoderLogCurve) / sizeof(DecoderLogCurve[0]); - - const int max_16_bit = (1 << 16) - 1; - - for( i = 0; i < log_table_size; i++ ) + const int max_input = (1 << bits) - 1; + const double denom = log10(113.0); + const int max_output = (1 << 16) - 1; + + // Antilog: the mathematical inverse of the encoder log curve. 
+ // Encoder: y = max_out * log10(x/max_in * 112 + 1) / log10(113) + // Decoder: x = max_out * (10^(y/max_in * log10(113)) - 1) / 112 + for (int i = 0; i <= max_input; ++i) { - //input 12-bit, output 16-bit - float input = i; - float output = max_16_bit * (pow(113.0, input/4095.0) - 1.0)/112.0; - - DecoderLogCurve[i] = minimum( (int)output, max_16_bit ); + const double norm = (double)i / max_input; + const double exp_val = pow(10.0, norm * denom); + const double output = max_output * (exp_val - 1.0) / 112.0; + int out_int = (int)(output + 0.5); + if (out_int < 0) out_int = 0; + if (out_int > max_output) out_int = max_output; + table[i] = (uint16_t)out_int; } } -void SetupEncoderLogCurve(void) +void SetupDecoderLogCurve(void) { - int i; - const int max_input_val = LOG_CURVE_TABLE_LENGTH - 1; - - for( i = 0; i < LOG_CURVE_TABLE_LENGTH; i++ ) - { - //input 16-bit, output 12-bit - float input = maximum( 0, i ); - float output = 4095.0 * (log10(input/max_input_val * 112.0 + 1.0)/log10(113)); + SetupDecoderCurve(DecoderLogCurve12, 12); + SetupDecoderCurve(DecoderLogCurve14, 14); + SetupDecoderCurve(DecoderLogCurve16, 16); +} + +static void SetupEncoderLogCurveBits(uint16_t* table, int input_bits, int output_bits) +{ + const int max_input_val = (1 << input_bits) - 1; + const int max_output_val = (1 << output_bits) - 1; + const double denom = log10(113.0); - EncoderLogCurve[i] = ( (uint16_t)output ); + for(int i = 0; i <= max_input_val; i++ ) + { + const double input = maximum(0, i); + const double norm = (input / max_input_val * 112.0) + 1.0; + const double output = max_output_val * (log10(norm) / denom); + table[i] = (uint16_t)output; } } +void SetupEncoderLogCurve(void) +{ + SetupEncoderLogCurveBits(EncoderLogCurve12, 12, 12); + SetupEncoderLogCurveBits(EncoderLogCurve14, 14, 14); + SetupEncoderLogCurveBits(EncoderLogCurve16, 16, 16); +} diff --git a/source/lib/vc5_common/logcurve.h b/source/lib/vc5_common/logcurve.h index 3e6a2692..ddeb314a 100755 --- 
a/source/lib/vc5_common/logcurve.h +++ b/source/lib/vc5_common/logcurve.h @@ -21,19 +21,60 @@ #ifndef LOGCURVE_H #define LOGCURVE_H -#define LOG_CURVE_TABLE_LENGTH (1 << 12) +#define LOG_CURVE_TABLE_LENGTH (1 << 12) +#define LOG_CURVE_TABLE_LENGTH_12 LOG_CURVE_TABLE_LENGTH +#define LOG_CURVE_TABLE_LENGTH_14 (1 << 14) +#define LOG_CURVE_TABLE_LENGTH_16 (1 << 16) #ifdef __cplusplus extern "C" { #endif + +int vc5_logcurve_bypass(void); - extern uint16_t EncoderLogCurve[]; - - extern uint16_t DecoderLogCurve[]; +#define EncoderLogCurve EncoderLogCurve12 +extern uint16_t EncoderLogCurve12[]; +extern uint16_t EncoderLogCurve14[]; +extern uint16_t EncoderLogCurve16[]; + +extern uint16_t DecoderLogCurve12[]; +extern uint16_t DecoderLogCurve14[]; +extern uint16_t DecoderLogCurve16[]; + +static INLINE uint16_t DecodeLogValue(uint32_t value, int bits) +{ + if (vc5_logcurve_bypass()) + { + const uint32_t max_bits = (bits >= 16) ? 16 : (bits <= 0 ? 1 : bits); + const uint32_t max_val = (1u << max_bits) - 1; + if (value > max_val) value = max_val; + return (uint16_t)value; + } + + const uint32_t max12 = (1u << 12) - 1; + const uint32_t max14 = (1u << 14) - 1; + const uint32_t max16 = (1u << 16) - 1; + + if (bits <= 12) + { + if (value > max12) value = max12; + return DecoderLogCurve12[value]; + } + else if (bits <= 14) + { + if (value > max14) value = max14; + return DecoderLogCurve14[value]; + } + else + { + if (value > max16) value = max16; + return DecoderLogCurve16[value]; + } +} void SetupDecoderLogCurve(void); - void SetupEncoderLogCurve(void); +void SetupEncoderLogCurve(void); #ifdef __cplusplus } diff --git a/source/lib/vc5_common/pixel.h b/source/lib/vc5_common/pixel.h index 7c3b9e06..15dca5fa 100755 --- a/source/lib/vc5_common/pixel.h +++ b/source/lib/vc5_common/pixel.h @@ -23,23 +23,23 @@ #ifndef PIXEL_H #define PIXEL_H -//! Data type for pixels -typedef int16_t PIXEL; +//! Data type for pixels (widened for 16-bit pipeline headroom) +typedef int32_t PIXEL; //! 
Minimum and maximum pixel values enum { - PIXEL_MIN = INT16_MIN, - PIXEL_MAX = INT16_MAX, + PIXEL_MIN = INT32_MIN, + PIXEL_MAX = INT32_MAX, }; //! Alternative definition for wavelet coefficients -typedef int16_t COEFFICIENT; +typedef int32_t COEFFICIENT; //! Minimum and maximum coefficient values enum { - COEFFICIENT_MIN = INT16_MIN, - COEFFICIENT_MAX = INT16_MAX, + COEFFICIENT_MIN = INT32_MIN, + COEFFICIENT_MAX = INT32_MAX, }; /*! @@ -50,12 +50,14 @@ enum @todo Need to add support for more pixel formats to the reference decoder */ -typedef enum + typedef enum { PIXEL_FORMAT_UNKNOWN = 0, PIXEL_FORMAT_RAW_RGGB_16 = 104, + PIXEL_FORMAT_RAW_GBRG_16 = 105, + PIXEL_FORMAT_RAW_RGGB_12 = 106, PIXEL_FORMAT_RAW_RGGB_12P = 107, PIXEL_FORMAT_RAW_RGGB_14 = 108, diff --git a/source/lib/vc5_common/stream.c b/source/lib/vc5_common/stream.c index 59ecc7da..e8bb5a82 100755 --- a/source/lib/vc5_common/stream.c +++ b/source/lib/vc5_common/stream.c @@ -187,11 +187,14 @@ CODEC_ERROR PutWord(STREAM *stream, BITWORD word) break; case STREAM_TYPE_MEMORY: - { - uint8_t* buffer = (uint8_t *)stream->location.memory.buffer + stream->byte_count; - - memcpy(buffer, &word, sizeof(word)); - } + if (stream->byte_count + sizeof(word) > stream->location.memory.size) + { + return CODEC_ERROR_OUTOFMEMORY; + } + { + uint8_t* buffer = (uint8_t *)stream->location.memory.buffer + stream->byte_count; + memcpy(buffer, &word, sizeof(word)); + } break; default: @@ -221,6 +224,10 @@ CODEC_ERROR PutByte(STREAM *stream, uint8_t byte) break; case STREAM_TYPE_MEMORY: + if (stream->byte_count + 1 > stream->location.memory.size) + { + return CODEC_ERROR_OUTOFMEMORY; + } ((uint8_t *)stream->location.memory.buffer)[stream->byte_count] = byte; break; @@ -521,4 +528,3 @@ CODEC_ERROR PutBlockMemory(STREAM *stream, void *buffer, size_t size, size_t off return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_common/wavelet.c b/source/lib/vc5_common/wavelet.c index ec7e31a5..1861d862 100755 --- 
a/source/lib/vc5_common/wavelet.c +++ b/source/lib/vc5_common/wavelet.c @@ -274,6 +274,19 @@ CODEC_ERROR SetTransformPrescale(TRANSFORM *transform, int precision) PRESCALE spatial_prescale[] = {0, 2, 2, 0, 0, 0, 0, 0}; memcpy(transform->prescale, spatial_prescale, sizeof(transform->prescale)); } + else if (precision == 14) + { + // For 14-bit, prescale [0, 2, 2] keeps lowpass within 16-bit range + PRESCALE spatial_prescale[] = {0, 2, 2, 0, 0, 0, 0, 0}; + memcpy(transform->prescale, spatial_prescale, sizeof(transform->prescale)); + } + else if (precision >= 15) + { + // For 16-bit inputs, need prescale=2 at level 0 to prevent lowpass overflow + // (without it, level 0 lowpass = 4x input which exceeds 16-bit storage) + PRESCALE spatial_prescale[] = {2, 3, 3, 0, 0, 0, 0, 0}; + memcpy(transform->prescale, spatial_prescale, sizeof(transform->prescale)); + } else { //TODO: Need to handle other precisions @@ -459,7 +472,8 @@ void WaveletToRGB( gpr_allocator allocator, PIXEL* GS_src, PIXEL* RG_src, PIXEL* dst_image->buffer = allocator.Alloc( size ); const int32_t midpoint = (1 << (input_precision_bits - 1)); - const int32_t shift = input_precision_bits - 12; + const int table_bits = (input_precision_bits <= 12) ? 12 : (input_precision_bits <= 14 ? 14 : 16); + const int shift = (input_precision_bits > table_bits) ? 
(input_precision_bits - table_bits) : 0; unsigned char* RGB_dst_8bits = dst_image->buffer; unsigned short* RGB_dst_16bits = dst_image->buffer; @@ -474,10 +488,9 @@ void WaveletToRGB( gpr_allocator allocator, PIXEL* GS_src, PIXEL* RG_src, PIXEL* int32_t R = 2 * ( RG_src[(src_width - x - 1) + y * src_pitch] - midpoint) + G; int32_t B = 2 * ( BG_src[(src_width - x - 1) + y * src_pitch] - midpoint) + G; - // R,G,B are in 16-bit range since DecoderLogCurve outputs in 16 bits (although it's input is 12 bits) - R = DecoderLogCurve[ clamp_uint( (R >> shift), 12) ]; - G = DecoderLogCurve[ clamp_uint( (G >> shift), 12) ]; - B = DecoderLogCurve[ clamp_uint( (B >> shift), 12) ]; + R = DecodeLogValue((R >> shift), table_bits); + G = DecodeLogValue((G >> shift), table_bits); + B = DecodeLogValue((B >> shift), table_bits); if( output_precision_bits == 8 ) { diff --git a/source/lib/vc5_decoder/CMakeLists.txt b/source/lib/vc5_decoder/CMakeLists.txt index d78522d0..77e380d8 100644 --- a/source/lib/vc5_decoder/CMakeLists.txt +++ b/source/lib/vc5_decoder/CMakeLists.txt @@ -16,7 +16,8 @@ include_directories( "../common/public" ) # library add_library( ${LIB_NAME} STATIC ${SRC_FILES} ${INC_FILES} ) -target_link_libraries( ${LIB_NAME} ) +find_package( Threads REQUIRED ) +target_link_libraries( ${LIB_NAME} Threads::Threads ) # set the folder where to place the projects set_target_properties( ${LIB_NAME} PROPERTIES FOLDER lib ) diff --git a/source/lib/vc5_decoder/decoder.c b/source/lib/vc5_decoder/decoder.c index 73aa3b80..9609744d 100755 --- a/source/lib/vc5_decoder/decoder.c +++ b/source/lib/vc5_decoder/decoder.c @@ -19,6 +19,8 @@ */ #include "headers.h" +#include +#include /*! 
@brief Align the bitstream to a byte boundary @@ -212,32 +214,40 @@ CODEC_ERROR DecodeImage(STREAM *stream, IMAGE *packed_image, RGB_IMAGE *rgb_imag break; case GPR_RGB_RESOLUTION_HALF: + { + int bits = unpacked_image.component_array_list[0].bits_per_component; WaveletToRGB(parameters->allocator, (PIXEL*)unpacked_image.component_array_list[0].data, (PIXEL*)unpacked_image.component_array_list[1].data, (PIXEL*)unpacked_image.component_array_list[2].data, - unpacked_image.component_array_list[2].width, unpacked_image.component_array_list[2].height, unpacked_image.component_array_list[2].pitch / 2, - rgb_image, 12, parameters->rgb_bits, &parameters->rgb_gain ); + unpacked_image.component_array_list[2].width, unpacked_image.component_array_list[2].height, unpacked_image.component_array_list[2].pitch / sizeof(COMPONENT_VALUE), + rgb_image, bits, parameters->rgb_bits, &parameters->rgb_gain ); break; + } case GPR_RGB_RESOLUTION_QUARTER: - + { + int bits = decoder.channel[0].bits_per_component; WaveletToRGB(parameters->allocator, decoder.transform[0].wavelet[0]->data[0], decoder.transform[1].wavelet[0]->data[0], decoder.transform[2].wavelet[0]->data[0], decoder.transform[2].wavelet[0]->width, decoder.transform[2].wavelet[0]->height, decoder.transform[2].wavelet[0]->width, - rgb_image, 14, parameters->rgb_bits, &parameters->rgb_gain ); + rgb_image, bits, parameters->rgb_bits, &parameters->rgb_gain ); break; + } case GPR_RGB_RESOLUTION_EIGHTH: - + { + int bits = decoder.channel[0].bits_per_component; WaveletToRGB(parameters->allocator, decoder.transform[0].wavelet[1]->data[0], decoder.transform[1].wavelet[1]->data[0], decoder.transform[2].wavelet[1]->data[0], decoder.transform[2].wavelet[1]->width, decoder.transform[2].wavelet[1]->height, decoder.transform[2].wavelet[1]->width, - rgb_image, 14, parameters->rgb_bits, &parameters->rgb_gain ); - + rgb_image, bits, parameters->rgb_bits, &parameters->rgb_gain ); break; + } case GPR_RGB_RESOLUTION_SIXTEENTH: - + { + int bits = 
decoder.channel[0].bits_per_component; WaveletToRGB(parameters->allocator, decoder.transform[0].wavelet[2]->data[0], decoder.transform[1].wavelet[2]->data[0], decoder.transform[2].wavelet[2]->data[0], decoder.transform[2].wavelet[2]->width, decoder.transform[2].wavelet[2]->height, decoder.transform[2].wavelet[2]->width, - rgb_image, 14, parameters->rgb_bits, &parameters->rgb_gain ); + rgb_image, bits, parameters->rgb_bits, &parameters->rgb_gain ); break; + } default: return CODEC_ERROR_UNSUPPORTED_FORMAT; @@ -262,7 +272,10 @@ CODEC_ERROR DecodeImage(STREAM *stream, IMAGE *packed_image, RGB_IMAGE *rgb_imag CODEC_ERROR PrepareDecoder(DECODER *decoder, const DECODER_PARAMETERS *parameters) { CODEC_ERROR error = CODEC_ERROR_OKAY; - + + // Initialize the uncompanding LUT (fast integer replacement for double-precision cubic) + InitUncompandTable(); + // Initialize the decoder data structure error = InitDecoder(decoder, &parameters->allocator); if (error != CODEC_ERROR_OKAY) { @@ -554,7 +567,12 @@ CODEC_ERROR SetImageChannelParameters(DECODER *decoder, int channel_number) } //TODO: Is the default bits per component the correct value to use? - decoder->channel[channel_number].bits_per_component = codec->bits_per_component; + PRECISION default_bits = codec->max_bits_per_component ? 
codec->max_bits_per_component : codec->bits_per_component; + if (default_bits == 0) + { + default_bits = codec->bits_per_component; + } + decoder->channel[channel_number].bits_per_component = default_bits; decoder->channel[channel_number].initialized = true; return CODEC_ERROR_OKAY; @@ -874,6 +892,7 @@ CODEC_ERROR ImageRepackingProcess(const UNPACKED_IMAGE *unpacked_image, break; case PIXEL_FORMAT_RAW_RGGB_16: + case PIXEL_FORMAT_RAW_GBRG_16: return PackComponentsToRAW(unpacked_image, output_buffer, output_pitch, output_width, output_height, enabled_parts, 16, output_format ); break; @@ -1256,6 +1275,7 @@ CODEC_ERROR UpdateCodecState(DECODER *decoder, BITSTREAM *stream, TAGVALUE segme if (IsPartEnabled(enabled_parts, VC5_PART_IMAGE_FORMATS)) { codec->max_bits_per_component = (PRECISION)value; + codec->bits_per_component = (PRECISION)value; codec->header = true; } else @@ -1519,7 +1539,7 @@ CODEC_ERROR UpdateCodecState(DECODER *decoder, BITSTREAM *stream, TAGVALUE segme { // Remember the number of bits per component in this and higher numbered channel decoder->channel[codec->channel_number].bits_per_component = codec->bits_per_component; - + // Found the first codeblock in the channel decoder->channel[channel_number].found_first_codeblock = true; } @@ -1908,26 +1928,46 @@ CODEC_ERROR DecodeLowpassBand(DECODER *decoder, BITSTREAM *stream, WAVELET *wave lowpass_precision = codec->lowpass_precision; // Decode each row in the lowpass image - for (row = 0; row < lowpass_band_height; row++) + if (lowpass_precision == 16) { - for (column = 0; column < lowpass_band_width; column++) + // Fast path: extract 16-bit coefficients directly from the 32-bit buffer + for (row = 0; row < lowpass_band_height; row++) { - COEFFICIENT lowpass_value = (COEFFICIENT)GetBits(stream, lowpass_precision); - //assert(0 <= lowpass_value && lowpass_value <= COEFFICIENT_MAX); - - //if (lowpass_value > COEFFICIENT_MAX) { - // lowpass_value = COEFFICIENT_MAX; - //} - - lowpass_band_ptr[column] = 
lowpass_value; + for (column = 0; column < lowpass_band_width; column++) + { + if (stream->count < 16) + { + if (stream->count == 0) { + CODEC_ERROR gb_error = GetBuffer(stream); + if (gb_error != CODEC_ERROR_OKAY) return gb_error; + } else { + lowpass_band_ptr[column] = (COEFFICIENT)GetBits(stream, 16); + continue; + } + } + lowpass_band_ptr[column] = (COEFFICIENT)(stream->buffer >> 16); + stream->buffer <<= 16; + stream->count -= 16; + } + lowpass_band_ptr += lowpass_band_pitch; + } + } + else + { + // Generic path for other precisions + for (row = 0; row < lowpass_band_height; row++) + { + for (column = 0; column < lowpass_band_width; column++) + { + COEFFICIENT lowpass_value = (COEFFICIENT)GetBits(stream, lowpass_precision); + lowpass_band_ptr[column] = lowpass_value; + } + lowpass_band_ptr += lowpass_band_pitch; } - - // Advance to the next row in the lowpass image - lowpass_band_ptr += lowpass_band_pitch; } // Align the bitstream to the next tag value pair AlignBitsSegment(stream); - + // Return indication of lowpass decoding success return error; } @@ -2012,39 +2052,70 @@ CODEC_ERROR DecodeBandRuns(BITSTREAM *stream, CODEBOOK *codebook, PIXEL *data, while (data_count > 0) { - // Get the next run length and value - error = GetRun(stream, codebook, &run); + // Get the next run length and value (fast path uses prefix lookup table) + error = GetRunFast(stream, codebook, &run); if (error != CODEC_ERROR_OKAY) { return error; } - + // Check that the run does not extend past the end of the band assert(run.count <= data_count); - - // Copy the value into the specified number of pixels in the band - while (run.count > 0) + + // Check if the entire run fits in the current row (common case) { - // Reached the end of the column? 
- if (column == width) + int remaining_in_row = width - column; + + if ((int)run.count <= remaining_in_row) + { + // FAST: run stays within current row, no boundary checks needed + if (run.value == 0 && run.count > 1) + { + memset(&data[index], 0, run.count * sizeof(PIXEL)); + } + else + { + // count==1 for non-zero values and single zeros + data[index] = (PIXEL)run.value; + } + index += run.count; + column += run.count; + data_count -= run.count; + run.count = 0; + } + else { - // Need to pad the end of the row? - if (row_padding > 0) + // SLOW: run crosses row boundary + while (run.count > 0) { - int count; - for (count = 0; (size_t)count < row_padding; count++) { - data[index++] = 0; + if (column == width) + { + if (row_padding > 0) + { + memset(&data[index], 0, row_padding * sizeof(PIXEL)); + index += (int)row_padding; + } + row++; + column = 0; } + + remaining_in_row = width - column; + int n = ((int)run.count < remaining_in_row) ? (int)run.count : remaining_in_row; + + if (run.value == 0) + { + memset(&data[index], 0, n * sizeof(PIXEL)); + } + else + { + for (int i = 0; i < n; i++) + data[index + i] = (PIXEL)run.value; + } + index += n; + column += n; + run.count -= n; + data_count -= n; } - - // Advance to the next row - row++; - column = 0; } - - data[index++] = (PIXEL)run.value; - column++; - run.count--; - data_count--; } } @@ -2148,6 +2219,31 @@ bool IsDecodingComplete(DECODER *decoder) return true; } +/*! 
Thread argument for parallel inverse transform */ +typedef struct { + gpr_allocator *allocator; + WAVELET *wavelet; + COMPONENT_VALUE *output_data; + DIMENSION output_width; + DIMENSION output_height; + size_t output_pitch; + PRESCALE prescale; + CODEC_ERROR error; +} INVERSE_THREAD_ARG; + +static void *InverseTransformThread(void *arg) +{ + INVERSE_THREAD_ARG *a = (INVERSE_THREAD_ARG *)arg; + a->error = TransformInverseSpatialQuantArray(a->allocator, + a->wavelet, + a->output_data, + a->output_width, + a->output_height, + a->output_pitch, + a->prescale); + return NULL; +} + /*! @brief Perform the final wavelet transform in each channel to compute the component arrays Each channel is decoded and the lowpass and highpass bands are used to reconstruct the @@ -2183,42 +2279,68 @@ CODEC_ERROR ReconstructUnpackedImage(DECODER *decoder, UNPACKED_IMAGE *image) image->component_count = 0; memset(image->component_array_list, 0, size); + // Phase 1: Allocate all component arrays (sequential, uses allocator) for (channel_number = 0; channel_number < channel_count; channel_number++) { - // Get the dimensions of this channel DIMENSION channel_width = decoder->channel[channel_number].width; DIMENSION channel_height = decoder->channel[channel_number].height; PRECISION bits_per_component = decoder->channel[channel_number].bits_per_component; - - // Amount of prescaling applied to the component array values before encoding - PRESCALE prescale = decoder->codec.prescale_table[0]; - - // Allocate the component array for this channel + error = AllocateComponentArray(allocator, &image->component_array_list[channel_number], channel_width, channel_height, bits_per_component); - if (error != CODEC_ERROR_OKAY) { return error; } - - error = TransformInverseSpatialQuantArray(allocator, - decoder->transform[channel_number].wavelet[0], - image->component_array_list[channel_number].data, - channel_width, - channel_height, - image->component_array_list[channel_number].pitch, - prescale); - if 
(error != CODEC_ERROR_OKAY) { - return error; + } + + // Phase 2: Run inverse transforms in parallel (one thread per channel) + { + PRESCALE prescale = decoder->codec.prescale_table[0]; + INVERSE_THREAD_ARG thread_args[MAX_CHANNEL_COUNT]; + pthread_t threads[MAX_CHANNEL_COUNT]; + int thread_created[MAX_CHANNEL_COUNT]; + + for (channel_number = 0; channel_number < channel_count; channel_number++) + { + thread_args[channel_number].allocator = allocator; + thread_args[channel_number].wavelet = decoder->transform[channel_number].wavelet[0]; + thread_args[channel_number].output_data = image->component_array_list[channel_number].data; + thread_args[channel_number].output_width = decoder->channel[channel_number].width; + thread_args[channel_number].output_height = decoder->channel[channel_number].height; + thread_args[channel_number].output_pitch = image->component_array_list[channel_number].pitch; + thread_args[channel_number].prescale = prescale; + thread_args[channel_number].error = CODEC_ERROR_OKAY; + + thread_created[channel_number] = (pthread_create(&threads[channel_number], NULL, + InverseTransformThread, + &thread_args[channel_number]) == 0); + if (!thread_created[channel_number]) + { + // Thread creation failed: run inline as fallback + InverseTransformThread(&thread_args[channel_number]); + } } + + // Wait for all threads to complete + for (channel_number = 0; channel_number < channel_count; channel_number++) + { + if (thread_created[channel_number]) + pthread_join(threads[channel_number], NULL); + + if (thread_args[channel_number].error != CODEC_ERROR_OKAY) + error = thread_args[channel_number].error; + } + + if (error != CODEC_ERROR_OKAY) + return error; } // One component array is output by the decoding process per channel in the bitstream image->component_count = channel_count; - + TIMESTAMP("[END]", 2) return error; @@ -2306,5 +2428,3 @@ CODEC_ERROR ReconstructSampleFrame(DECODER *decoder, IMAGE image_array[], int fr return ComposeFields(image_array, 
frame_count, output_image); } #endif - - diff --git a/source/lib/vc5_decoder/dequantize.c index 48cfb647..74733f3c 100755 --- a/source/lib/vc5_decoder/dequantize.c +++ b/source/lib/vc5_decoder/dequantize.c @@ -20,6 +20,10 @@ #include "headers.h" +#if ENABLED(NEON) +#include <arm_neon.h> +#endif + // Not using midpoint correction in dequantization static const int midpoint = 0; @@ -29,6 +33,73 @@ static const int midpoint = 0; The companding curve is inverted and the value is multiplied by the quantization value that was used by the encoder to compress the band. */ + +#if ENABLED(NEON) + +CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXEL *output) +{ + int column = 0; + const int width_m4 = (width / 4) * 4; + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t quant_vec = vdupq_n_s32(quantization); + + // Process 4 pixels at a time + for (; column < width_m4; column += 4) + { + int32x4_t values = vld1q_s32(&input[column]); + + // Scalar LUT lookups for uncompanding (NEON can't do 32-bit table lookups) + int32_t v0 = vgetq_lane_s32(values, 0); + int32_t v1 = vgetq_lane_s32(values, 1); + int32_t v2 = vgetq_lane_s32(values, 2); + int32_t v3 = vgetq_lane_s32(values, 3); + + int32_t u0 = UncompandedValueFast(v0); + int32_t u1 = UncompandedValueFast(v1); + int32_t u2 = UncompandedValueFast(v2); + int32_t u3 = UncompandedValueFast(v3); + + // Load uncompanded values into vector + int32_t uncomp[4] = { u0, u1, u2, u3 }; + int32x4_t uncompanded = vld1q_s32(uncomp); + + // Get absolute values and compute signs + int32x4_t abs_vals = vabsq_s32(uncompanded); + uint32x4_t neg_mask = vcltq_s32(uncompanded, zero); + + // Multiply by quantization + int32x4_t dequant = vmulq_s32(abs_vals, quant_vec); + + // Restore signs + int32x4_t negated = vnegq_s32(dequant); + int32x4_t result = vbslq_s32(neg_mask, negated, dequant); + + vst1q_s32(&output[column], result); + } + + // Scalar cleanup for remaining pixels + for (; column 
< width; column++) + { + int32_t value = input[column]; + value = UncompandedValueFast(value); + + if (value > 0) + value = (quantization * value) + midpoint; + else if (value < 0) + { + value = neg(value); + value = (quantization * value) + midpoint; + value = neg(value); + } + + output[column] = ClampPixel(value); + } + + return CODEC_ERROR_OKAY; +} + +#else + CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXEL *output) { int column; @@ -38,8 +109,8 @@ CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXE { int32_t value = input[column]; - // Invert the companding curve (if any) - value = UncompandedValue(value); + // Invert the companding curve using fast LUT + value = UncompandedValueFast(value); // Dequantize the absolute value if (value > 0) @@ -60,6 +131,8 @@ CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXE return CODEC_ERROR_OKAY; } +#endif + /*! @brief This function dequantizes the pixel value @@ -69,8 +142,8 @@ CODEC_ERROR DequantizeBandRow16s(PIXEL *input, int width, int quantization, PIXE */ PIXEL DequantizedValue(int32_t value, int quantization) { - // Invert the companding curve (if any) - value = UncompandedValue(value); + // Invert the companding curve using fast LUT + value = UncompandedValueFast(value); // Dequantize the absolute value if (value > 0) diff --git a/source/lib/vc5_decoder/inverse.c b/source/lib/vc5_decoder/inverse.c index cacc7be8..238d8e58 100755 --- a/source/lib/vc5_decoder/inverse.c +++ b/source/lib/vc5_decoder/inverse.c @@ -18,351 +18,271 @@ #include "headers.h" +#if ENABLED(NEON) +#include <arm_neon.h> +#endif + //! Rounding adjustment used by the inverse wavelet transforms static const int32_t rounding = 4; +#if ENABLED(NEON) +/*! + @brief NEON helper: vertical inverse middle-row filter for 4 columns + Computes even and odd outputs from 3 rows of lowpass and 1 row of highpass. 
+ even = ((row0 - row2 + 4) >> 3 + row1 + hp) >> 1 + odd = ((-row0 + row2 + 4) >> 3 + row1 - hp) >> 1 + */ +static INLINE void InvertVerticalMiddle4_NEON( + const PIXEL *row0, const PIXEL *row1, const PIXEL *row2, + const PIXEL *hp, PIXEL *even_out, PIXEL *odd_out, int col) +{ + const int32x4_t four = vdupq_n_s32(4); + + int32x4_t r0 = vld1q_s32(&row0[col]); + int32x4_t r1 = vld1q_s32(&row1[col]); + int32x4_t r2 = vld1q_s32(&row2[col]); + int32x4_t h = vld1q_s32(&hp[col]); + + // even path: (r0 - r2 + 4) >> 3 + r1 + h, then >> 1 + int32x4_t diff = vsubq_s32(r0, r2); + diff = vaddq_s32(diff, four); + diff = vshrq_n_s32(diff, 3); + int32x4_t even_v = vaddq_s32(vaddq_s32(diff, r1), h); + even_v = vshrq_n_s32(even_v, 1); + vst1q_s32(&even_out[col], even_v); + + // odd path: (-r0 + r2 + 4) >> 3 + r1 - h, then >> 1 + int32x4_t diff_odd = vsubq_s32(r2, r0); + diff_odd = vaddq_s32(diff_odd, four); + diff_odd = vshrq_n_s32(diff_odd, 3); + int32x4_t odd_v = vsubq_s32(vaddq_s32(diff_odd, r1), h); + odd_v = vshrq_n_s32(odd_v, 1); + vst1q_s32(&odd_out[col], odd_v); +} + +/*! 
+ @brief NEON helper: vertical inverse middle-row filter for Descale variant + Same as above but uses DivideByShift(x,1) = x >> 1 instead of >> 1 + (which is the same operation, so the only diff is conceptual) + */ +static INLINE void InvertVerticalMiddle4Descale_NEON( + const PIXEL *row0, const PIXEL *row1, const PIXEL *row2, + const PIXEL *hp, PIXEL *even_out, PIXEL *odd_out, int col) +{ + const int32x4_t four = vdupq_n_s32(4); + + int32x4_t r0 = vld1q_s32(&row0[col]); + int32x4_t r1 = vld1q_s32(&row1[col]); + int32x4_t r2 = vld1q_s32(&row2[col]); + int32x4_t h = vld1q_s32(&hp[col]); + + int32x4_t diff = vsubq_s32(r0, r2); + diff = vaddq_s32(diff, four); + diff = vshrq_n_s32(diff, 3); + int32x4_t even_v = vaddq_s32(vaddq_s32(diff, r1), h); + even_v = vshrq_n_s32(even_v, 1); + vst1q_s32(&even_out[col], even_v); + + int32x4_t diff_odd = vsubq_s32(r2, r0); + diff_odd = vaddq_s32(diff_odd, four); + diff_odd = vshrq_n_s32(diff_odd, 3); + int32x4_t odd_v = vsubq_s32(vaddq_s32(diff_odd, r1), h); + odd_v = vshrq_n_s32(odd_v, 1); + vst1q_s32(&odd_out[col], odd_v); +} +#endif + /*! @brief Apply the inverse horizontal wavelet transform - This routine applies the inverse wavelet transform to a row of - lowpass and highpass coefficients, producing an output row that - is write as wide. 
*/ -STATIC CODEC_ERROR InvertHorizontal16s(PIXEL *lowpass, //!< Horizontal lowpass coefficients - PIXEL *highpass, //!< Horizontal highpass coefficients - PIXEL *output, //!< Row of reconstructed results - DIMENSION input_width, //!< Number of values in the input row - DIMENSION output_width //!< Number of values in the output row -) +STATIC CODEC_ERROR InvertHorizontal16s(PIXEL *lowpass, PIXEL *highpass, PIXEL *output, + DIMENSION input_width, DIMENSION output_width) { const int last_column = input_width - 1; - int32_t even; int32_t odd; - - // Start processing at the beginning of the row int column = 0; - - // Process the first two output points with special filters for the left border - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 11 * lowpass[column + 0]; - even -= 4 * lowpass[column + 1]; - even += 1 * lowpass[column + 2]; - even += rounding; + + // Left border + even = 11 * lowpass[0] - 4 * lowpass[1] + lowpass[2] + rounding; even = DivideByShift(even, 3); - - // Add the highpass correction - even += highpass[column]; - even >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Apply the odd reconstruction filter to the lowpass band - odd += 5 * lowpass[column + 0]; - odd += 4 * lowpass[column + 1]; - odd -= 1 * lowpass[column + 2]; - odd += rounding; + even = (even + highpass[0]) >> 1; + + odd = 5 * lowpass[0] + 4 * lowpass[1] - lowpass[2] + rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction - odd -= highpass[column]; - odd >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Store the last two output points produced by the loop - output[2 * column + 0] = clamp_uint14(even); - output[2 * column + 1] = clamp_uint14(odd); - - // Advance to the next input column (second pair of output values) - column++; - - // Process the rest of the columns up to the last column in the 
row + odd = (odd - highpass[0]) >> 1; + + output[0] = ClampPixel(even); + output[1] = ClampPixel(odd); + column = 1; + +#if ENABLED(NEON) + { + const int32x4_t four = vdupq_n_s32(4); + for (; column + 3 < last_column; column += 4) + { + int32x4_t lp_left = vld1q_s32(&lowpass[column - 1]); + int32x4_t lp_center = vld1q_s32(&lowpass[column]); + int32x4_t lp_right = vld1q_s32(&lowpass[column + 1]); + int32x4_t hp_center = vld1q_s32(&highpass[column]); + + int32x4_t diff_e = vsubq_s32(lp_left, lp_right); + diff_e = vaddq_s32(diff_e, four); + diff_e = vshrq_n_s32(diff_e, 3); + int32x4_t even_v = vaddq_s32(diff_e, lp_center); + even_v = vaddq_s32(even_v, hp_center); + even_v = vshrq_n_s32(even_v, 1); + + int32x4_t diff_o = vsubq_s32(lp_right, lp_left); + diff_o = vaddq_s32(diff_o, four); + diff_o = vshrq_n_s32(diff_o, 3); + int32x4_t odd_v = vaddq_s32(diff_o, lp_center); + odd_v = vsubq_s32(odd_v, hp_center); + odd_v = vshrq_n_s32(odd_v, 1); + + int32x4x2_t interleaved; + interleaved.val[0] = even_v; + interleaved.val[1] = odd_v; + vst2q_s32(&output[2 * column], interleaved); + } + } +#endif + + // Scalar middle columns for (; column < last_column; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - // Apply the even reconstruction filter to the lowpass band - - even += lowpass[column - 1]; - even -= lowpass[column + 1]; - even += 4; + even = lowpass[column - 1] - lowpass[column + 1] + 4; even >>= 3; - even += lowpass[column + 0]; - - // Add the highpass correction - even += highpass[column]; - even >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even column - //output[2 * column + 0] = clamp_uint12(even); - output[2 * column + 0] = clamp_uint14(even); - - // Apply the odd reconstruction filter to the lowpass band - odd -= lowpass[column - 1]; - odd += lowpass[column + 1]; - odd += 4; + even += 
lowpass[column]; + even = (even + highpass[column]) >> 1; + output[2 * column] = ClampPixel(even); + + odd = -lowpass[column - 1] + lowpass[column + 1] + 4; odd >>= 3; - odd += lowpass[column + 0]; - - // Subtract the highpass correction - odd -= highpass[column]; - odd >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd column - //output[2 * column + 1] = clamp_uint14(odd); - output[2 * column + 1] = clamp_uint14(odd); + odd += lowpass[column]; + odd = (odd - highpass[column]) >> 1; + output[2 * column + 1] = ClampPixel(odd); } - - // Should have exited the loop at the column for right border processing + assert(column == last_column); - - // Process the last two output points with special filters for the right border - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowpass[column + 0]; - even += 4 * lowpass[column - 1]; - even -= 1 * lowpass[column - 2]; - even += rounding; + + // Right border + even = 5 * lowpass[column] + 4 * lowpass[column - 1] - lowpass[column - 2] + rounding; even = DivideByShift(even, 3); - - // Add the highpass correction - even += highpass[column]; - even >>= 1; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even column - output[2 * column + 0] = clamp_uint14(even); - + even = (even + highpass[column]) >> 1; + output[2 * column] = ClampPixel(even); + if (2 * column + 1 < output_width) { - // Apply the odd reconstruction filter to the lowpass band - odd += 11 * lowpass[column + 0]; - odd -= 4 * lowpass[column - 1]; - odd += 1 * lowpass[column - 2]; - odd += rounding; + odd = 11 * lowpass[column] - 4 * lowpass[column - 1] + lowpass[column - 2] + rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction - odd -= highpass[column]; - odd >>= 1; - - // The lowpass result should be a positive number 
- //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd column - output[2 * column + 1] = clamp_uint14(odd); + odd = (odd - highpass[column]) >> 1; + output[2 * column + 1] = ClampPixel(odd); } - + return CODEC_ERROR_OKAY; } /*! - @brief Apply the inverse horizontal wavelet transform - This routine is similar to @ref InvertHorizontal16s, but a scale factor - that was applied during encoding is removed from the output values. + @brief Apply the inverse horizontal wavelet transform with descaling */ STATIC CODEC_ERROR InvertHorizontalDescale16s(PIXEL *lowpass, PIXEL *highpass, PIXEL *output, DIMENSION input_width, DIMENSION output_width, int descale) { const int last_column = input_width - 1; - - // Start processing at the beginning of the row int column = 0; - int descale_shift = 0; - - int32_t even; - int32_t odd; - - /* - The implementation of the inverse filter includes descaling by a factor of two - because the last division by two in the computation of the even and odd results - that is performed using a right arithmetic shift has been omitted from the code. 
- */ - if (descale == 2) { - descale_shift = 1; - } - - // Check that the descaling value is reasonable + int32_t even, odd; + + if (descale == 2) descale_shift = 1; + else if (descale == 3) descale_shift = 2; assert(descale_shift >= 0); - - // Process the first two output points with special filters for the left border - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 11 * lowpass[column + 0]; - even -= 4 * lowpass[column + 1]; - even += 1 * lowpass[column + 2]; - even += rounding; + + // Left border + even = 11 * lowpass[0] - 4 * lowpass[1] + lowpass[2] + rounding; even = DivideByShift(even, 3); - - // Add the highpass correction - even += highpass[column]; - - // Remove any scaling used during encoding - even <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Apply the odd reconstruction filter to the lowpass band - odd += 5 * lowpass[column + 0]; - odd += 4 * lowpass[column + 1]; - odd -= 1 * lowpass[column + 2]; - odd += rounding; + even = (even + highpass[0]) << descale_shift; + + odd = 5 * lowpass[0] + 4 * lowpass[1] - lowpass[2] + rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction - odd -= highpass[column]; - - // Remove any scaling used during encoding - odd <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - output[2 * column + 0] = ClampPixel(even); - output[2 * column + 1] = ClampPixel(odd); - - // Advance to the next input column (second pair of output values) - column++; - - // Process the rest of the columns up to the last column in the row + odd = (odd - highpass[0]) << descale_shift; + + output[0] = ClampPixel(even); + output[1] = ClampPixel(odd); + column = 1; + +#if ENABLED(NEON) + { + const int32x4_t four = vdupq_n_s32(4); + const int32x4_t descale_vec = vdupq_n_s32(descale_shift); + + for (; column + 3 < last_column; column += 4) + { 
+ int32x4_t lp_left = vld1q_s32(&lowpass[column - 1]); + int32x4_t lp_center = vld1q_s32(&lowpass[column]); + int32x4_t lp_right = vld1q_s32(&lowpass[column + 1]); + int32x4_t hp_center = vld1q_s32(&highpass[column]); + + int32x4_t diff_e = vsubq_s32(lp_left, lp_right); + diff_e = vaddq_s32(diff_e, four); + diff_e = vshrq_n_s32(diff_e, 3); + int32x4_t even_v = vaddq_s32(diff_e, lp_center); + even_v = vaddq_s32(even_v, hp_center); + even_v = vshlq_s32(even_v, descale_vec); + + int32x4_t diff_o = vsubq_s32(lp_right, lp_left); + diff_o = vaddq_s32(diff_o, four); + diff_o = vshrq_n_s32(diff_o, 3); + int32x4_t odd_v = vaddq_s32(diff_o, lp_center); + odd_v = vsubq_s32(odd_v, hp_center); + odd_v = vshlq_s32(odd_v, descale_vec); + + int32x4x2_t interleaved; + interleaved.val[0] = even_v; + interleaved.val[1] = odd_v; + vst2q_s32(&output[2 * column], interleaved); + } + } +#endif + + // Scalar middle columns for (; column < last_column; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - // Apply the even reconstruction filter to the lowpass band - even += lowpass[column - 1]; - even -= lowpass[column + 1]; - even += 4; + even = lowpass[column - 1] - lowpass[column + 1] + 4; even >>= 3; - even += lowpass[column + 0]; - - // Add the highpass correction - even += highpass[column]; - - // Remove any scaling used during encoding - even <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even column - output[2 * column + 0] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band - odd -= lowpass[column - 1]; - odd += lowpass[column + 1]; - odd += 4; + even += lowpass[column]; + even = (even + highpass[column]) << descale_shift; + output[2 * column] = ClampPixel(even); + + odd = -lowpass[column - 1] + lowpass[column + 1] + 4; odd >>= 3; - odd += lowpass[column + 0]; - - 
// Subtract the highpass correction - odd -= highpass[column]; - - // Remove any scaling used during encoding - odd <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd column + odd += lowpass[column]; + odd = (odd - highpass[column]) << descale_shift; output[2 * column + 1] = ClampPixel(odd); } - - // Should have exited the loop at the column for right border processing + assert(column == last_column); - - // Process the last two output points with special filters for the right border - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowpass[column + 0]; - even += 4 * lowpass[column - 1]; - even -= 1 * lowpass[column - 2]; - even += rounding; + + // Right border + even = 5 * lowpass[column] + 4 * lowpass[column - 1] - lowpass[column - 2] + rounding; even = DivideByShift(even, 3); - - // Add the highpass correction - even += highpass[column]; - - // Remove any scaling used during encoding - even <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even column - output[2 * column + 0] = ClampPixel(even); - + even = (even + highpass[column]) << descale_shift; + output[2 * column] = ClampPixel(even); + if (2 * column + 1 < output_width) { - // Apply the odd reconstruction filter to the lowpass band - odd += 11 * lowpass[column + 0]; - odd -= 4 * lowpass[column - 1]; - odd += 1 * lowpass[column - 2]; - odd += rounding; + odd = 11 * lowpass[column] - 4 * lowpass[column - 1] + lowpass[column - 2] + rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction - odd -= highpass[column]; - - // Remove any scaling used during encoding - odd <<= descale_shift; - - // The lowpass result should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd column + odd = 
(odd - highpass[column]) << descale_shift; output[2 * column + 1] = ClampPixel(odd); } - + return CODEC_ERROR_OKAY; } /*! - @brief Apply the inverse spatial wavelet filter - Dequantize the coefficients in the highpass bands and apply the - inverse spatial wavelet filter to compute a lowpass band that - has twice the width and height of the input bands. - The inverse vertical filter is applied to the upper and lower bands - on the left and the upper and lower bands on the right. The inverse - horizontal filter is applied to the left and right (lowpass and highpass) - results from the vertical inverse. Each application of the inverse - vertical filter produces two output rows and each application of the - inverse horizontal filter produces an output row that is twice as wide. - The inverse wavelet filter is a three tap filter. - - For the even output values, add and subtract the off-center values, - add the rounding correction, and divide by eight, then add the center - value, add the highpass coefficient, and divide by two. - - For the odd output values, the add and subtract operations for the - off-center values are reversed the the highpass coefficient is subtracted. - Divisions are implemented by right arithmetic shifts. - Special formulas for the inverse vertical filter are applied to the top - and bottom rows. 
+ @brief Apply the inverse spatial wavelet filter (no descaling) */ CODEC_ERROR InvertSpatialQuant16s(gpr_allocator *allocator, PIXEL *lowlow_band, int lowlow_pitch, @@ -379,400 +299,262 @@ CODEC_ERROR InvertSpatialQuant16s(gpr_allocator *allocator, PIXEL *highlow = highlow_band; PIXEL *highhigh = highhigh_band; PIXEL *output = output_image; - PIXEL *even_lowpass; - PIXEL *even_highpass; - PIXEL *odd_lowpass; - PIXEL *odd_highpass; - PIXEL *even_output; - PIXEL *odd_output; + PIXEL *even_lowpass, *even_highpass, *odd_lowpass, *odd_highpass; + PIXEL *even_output, *odd_output; size_t buffer_row_size; int last_row = input_height - 1; int row, column; - PIXEL *lowhigh_row[3]; - PIXEL *lowhigh_line[3]; - PIXEL *highlow_line; - PIXEL *highhigh_line; - + PIXEL *highlow_line, *highhigh_line; + QUANT highlow_quantization = quantization[HL_BAND]; QUANT lowhigh_quantization = quantization[LH_BAND]; QUANT highhigh_quantization = quantization[HH_BAND]; - - // Compute positions within the temporary buffer for each row of horizontal lowpass - // and highpass intermediate coefficients computed by the vertical inverse transform + buffer_row_size = input_width * sizeof(PIXEL); - - // Compute the positions of the even and odd rows of coefficients - even_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); + + even_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); even_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); - odd_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); - odd_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); - - // Compute the positions of the dequantized highpass rows + odd_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); + odd_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); + lowhigh_line[0] = (PIXEL *)allocator->Alloc(buffer_row_size); lowhigh_line[1] = (PIXEL *)allocator->Alloc(buffer_row_size); lowhigh_line[2] = (PIXEL *)allocator->Alloc(buffer_row_size); - highlow_line = (PIXEL *)allocator->Alloc(buffer_row_size); - 
highhigh_line = (PIXEL *)allocator->Alloc(buffer_row_size); - - // Convert pitch from bytes to pixels - lowlow_pitch /= sizeof(PIXEL); - lowhigh_pitch /= sizeof(PIXEL); - highlow_pitch /= sizeof(PIXEL); + highlow_line = (PIXEL *)allocator->Alloc(buffer_row_size); + highhigh_line = (PIXEL *)allocator->Alloc(buffer_row_size); + + lowlow_pitch /= sizeof(PIXEL); + lowhigh_pitch /= sizeof(PIXEL); + highlow_pitch /= sizeof(PIXEL); highhigh_pitch /= sizeof(PIXEL); - output_pitch /= sizeof(PIXEL); - - // Initialize the pointers to the even and odd output rows + output_pitch /= sizeof(PIXEL); + even_output = output; - odd_output = output + output_pitch; - - // Apply the vertical border filter to the first row + odd_output = output + output_pitch; + + // First row (top border) row = 0; - - // Set pointers to the first three rows in the first highpass band lowhigh_row[0] = lowhigh + 0 * lowhigh_pitch; lowhigh_row[1] = lowhigh + 1 * lowhigh_pitch; lowhigh_row[2] = lowhigh + 2 * lowhigh_pitch; - - // Dequantize three rows of highpass coefficients in the first highpass band + DequantizeBandRow16s(lowhigh_row[0], input_width, lowhigh_quantization, lowhigh_line[0]); DequantizeBandRow16s(lowhigh_row[1], input_width, lowhigh_quantization, lowhigh_line[1]); DequantizeBandRow16s(lowhigh_row[2], input_width, lowhigh_quantization, lowhigh_line[2]); - - // Dequantize one row of coefficients each in the second and third highpass bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - + for (column = 0; column < input_width; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band + 
int32_t even = 0, odd = 0; + + // Left bands (lowlow + highlow) - top border filter even += 11 * lowlow[column + 0 * lowlow_pitch]; even -= 4 * lowlow[column + 1 * lowlow_pitch]; even += 1 * lowlow[column + 2 * lowlow_pitch]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highlow_line[column]; even >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 5 * lowlow[column + 0 * lowlow_pitch]; odd += 4 * lowlow[column + 1 * lowlow_pitch]; odd -= 1 * lowlow[column + 2 * lowlow_pitch]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highlow_line[column]; odd >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band + + // Right bands (lowhigh + highhigh) - top border filter + even = 0; odd = 0; even += 11 * lowhigh_line[0][column]; even -= 4 * lowhigh_line[1][column]; even += 1 * lowhigh_line[2][column]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highhigh_line[column]; even >>= 1; - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 5 * lowhigh_line[0][column]; odd += 4 * lowhigh_line[1][column]; odd -= 1 * lowhigh_line[2][column]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highhigh_line[column]; odd >>= 1; - - // Place the odd result in the odd row 
odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows + InvertHorizontal16s(even_lowpass, even_highpass, even_output, input_width, output_width); - InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); - - // Advance to the next pair of even and odd output rows + InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); + even_output += 2 * output_pitch; - odd_output += 2 * output_pitch; - - // Always advance the highpass row pointers - highlow += highlow_pitch; + odd_output += 2 * output_pitch; + highlow += highlow_pitch; highhigh += highhigh_pitch; - - // Advance the row index row++; - - // Process the middle rows using the interior reconstruction filters + + // Middle rows for (; row < last_row; row++) { - // Dequantize one row from each of the two highpass bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - - // Process the entire row - for (column = 0; column < input_width; column++) + + column = 0; +#if ENABLED(NEON) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band + const int width_m4 = (input_width / 4) * 4; + for (; column < width_m4; column += 4) + { + // Left bands + InvertVerticalMiddle4_NEON( + lowlow + 0 * lowlow_pitch, lowlow + 1 * lowlow_pitch, lowlow + 2 * lowlow_pitch, + highlow_line, even_lowpass, odd_lowpass, column); + // Right bands + InvertVerticalMiddle4_NEON( + lowhigh_line[0], lowhigh_line[1], lowhigh_line[2], + highhigh_line, even_highpass, odd_highpass, column); + } + } +#endif + for (; column < input_width; 
column++) + { + int32_t even = 0, odd = 0; + even += lowlow[column + 0 * lowlow_pitch]; even -= lowlow[column + 2 * lowlow_pitch]; - even += 4; - even >>= 3; + even += 4; even >>= 3; even += lowlow[column + 1 * lowlow_pitch]; - - // Add the highpass correction even += highlow_line[column]; even >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd -= lowlow[column + 0 * lowlow_pitch]; odd += lowlow[column + 2 * lowlow_pitch]; - odd += 4; - odd >>= 3; + odd += 4; odd >>= 3; odd += lowlow[column + 1 * lowlow_pitch]; - - // Subtract the highpass correction odd -= highlow_line[column]; odd >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band + + even = 0; odd = 0; even += lowhigh_line[0][column]; even -= lowhigh_line[2][column]; - even += 4; - even >>= 3; + even += 4; even >>= 3; even += lowhigh_line[1][column]; - - // Add the highpass correction even += highhigh_line[column]; even >>= 1; - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd -= lowhigh_line[0][column]; odd += lowhigh_line[2][column]; - odd += 4; - odd >>= 3; + odd += 4; odd >>= 3; odd += lowhigh_line[1][column]; - - // Subtract the highpass correction odd -= highhigh_line[column]; odd >>= 1; - - // Place the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results + 
InvertHorizontal16s(even_lowpass, even_highpass, even_output, input_width, output_width); - InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); - - // Advance to the next input row in each band - lowlow += lowlow_pitch; - lowhigh += lowhigh_pitch; - highlow += highlow_pitch; + InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); + + lowlow += lowlow_pitch; + lowhigh += lowhigh_pitch; + highlow += highlow_pitch; highhigh += highhigh_pitch; - - // Advance to the next pair of even and odd output rows even_output += 2 * output_pitch; - odd_output += 2 * output_pitch; - + odd_output += 2 * output_pitch; + if (row < last_row - 1) { - // Compute the address of the next row in the lowhigh band PIXEL *lowhigh_row_ptr = (lowhigh + 2 * lowhigh_pitch); - //PIXEL *lowhigh_row_ptr = (lowhigh + lowhigh_pitch); - - // Shift the rows in the buffer of dequantized lowhigh bands PIXEL *temp = lowhigh_line[0]; lowhigh_line[0] = lowhigh_line[1]; lowhigh_line[1] = lowhigh_line[2]; lowhigh_line[2] = temp; - - // Undo quantization for the next row in the lowhigh band DequantizeBandRow16s(lowhigh_row_ptr, input_width, lowhigh_quantization, lowhigh_line[2]); } } - - // Should have exited the loop at the last row + assert(row == last_row); - - // Advance the lowlow pointer to the last row in the band lowlow += lowlow_pitch; - - // Check that the band pointers are on the last row in each wavelet band + assert(lowlow == (lowlow_band + last_row * lowlow_pitch)); - assert(highlow == (highlow_band + last_row * highlow_pitch)); assert(highhigh == (highhigh_band + last_row * highhigh_pitch)); - - // Undo quantization for the highlow and highhigh bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - - // Apply the vertical border 
filter to the last row + + // Last row (bottom border) for (column = 0; column < input_width; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowlow[column + 0 * lowlow_pitch]; - even += 4 * lowlow[column - 1 * lowlow_pitch]; - even -= 1 * lowlow[column - 2 * lowlow_pitch]; + int32_t even = 0, odd = 0; + + // Left bands - bottom border + even += 5 * lowlow[column + 0 * lowlow_pitch]; + even += 4 * lowlow[column - 1 * lowlow_pitch]; + even -= 1 * lowlow[column - 2 * lowlow_pitch]; even += 4; even = DivideByShift(even, 3); - - // Add the highpass correction even += highlow_line[column]; even >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 11 * lowlow[column + 0 * lowlow_pitch]; odd -= 4 * lowlow[column - 1 * lowlow_pitch]; odd += 1 * lowlow[column - 2 * lowlow_pitch]; odd += 4; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highlow_line[column]; odd >>= 1; - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - // Compute the vertical inverse for the right two bands // - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowhigh_line[2][column]; - even += 4 * lowhigh_line[1][column]; - even -= 1 * lowhigh_line[0][column]; + + // Right bands - bottom border + even = 0; odd = 0; + even += 5 * lowhigh_line[2][column]; + even += 4 * lowhigh_line[1][column]; + even -= 1 * lowhigh_line[0][column]; 
even += 4; even = DivideByShift(even, 3); - - // Add the highpass correction even += highhigh_line[column]; even >>= 1; - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 11 * lowhigh_line[2][column]; odd -= 4 * lowhigh_line[1][column]; odd += 1 * lowhigh_line[0][column]; odd += 4; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highhigh_line[column]; odd >>= 1; - - // Place the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results + InvertHorizontal16s(even_lowpass, even_highpass, even_output, input_width, output_width); - - // Is the output wavelet shorter than twice the height of the input wavelet? - if (2 * row + 1 < output_height) { + if (2 * row + 1 < output_height) InvertHorizontal16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width); - } - - // Free the scratch buffers + allocator->Free(even_lowpass); allocator->Free(even_highpass); allocator->Free(odd_lowpass); allocator->Free(odd_highpass); - allocator->Free(lowhigh_line[0]); allocator->Free(lowhigh_line[1]); allocator->Free(lowhigh_line[2]); allocator->Free(highlow_line); allocator->Free(highhigh_line); - + return CODEC_ERROR_OKAY; } /*! @brief Apply the inverse spatial transform with descaling - This routine is similar to @ref InvertSpatialQuant16s, but a scale factor - that was applied during encoding is removed from the output values. 
*/ CODEC_ERROR InvertSpatialQuantDescale16s(gpr_allocator *allocator, PIXEL *lowlow_band, int lowlow_pitch, @@ -789,400 +571,256 @@ CODEC_ERROR InvertSpatialQuantDescale16s(gpr_allocator *allocator, PIXEL *highlow = highlow_band; PIXEL *highhigh = highhigh_band; PIXEL *output = output_image; - PIXEL *even_lowpass; - PIXEL *even_highpass; - PIXEL *odd_lowpass; - PIXEL *odd_highpass; - PIXEL *even_output; - PIXEL *odd_output; + PIXEL *even_lowpass, *even_highpass, *odd_lowpass, *odd_highpass; + PIXEL *even_output, *odd_output; size_t buffer_row_size; int last_row = input_height - 1; int row, column; - PIXEL *lowhigh_row[3]; - PIXEL *lowhigh_line[3]; - PIXEL *highlow_line; - PIXEL *highhigh_line; - + PIXEL *highlow_line, *highhigh_line; + QUANT highlow_quantization = quantization[HL_BAND]; QUANT lowhigh_quantization = quantization[LH_BAND]; QUANT highhigh_quantization = quantization[HH_BAND]; - - // Compute positions within the temporary buffer for each row of horizontal lowpass - // and highpass intermediate coefficients computed by the vertical inverse transform + buffer_row_size = input_width * sizeof(PIXEL); - - // Allocate space for the even and odd rows of results from the inverse vertical filter - even_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); + + even_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); even_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); - odd_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); - odd_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); - - // Allocate scratch space for the dequantized highpass coefficients + odd_lowpass = (PIXEL *)allocator->Alloc(buffer_row_size); + odd_highpass = (PIXEL *)allocator->Alloc(buffer_row_size); + lowhigh_line[0] = (PIXEL *)allocator->Alloc(buffer_row_size); lowhigh_line[1] = (PIXEL *)allocator->Alloc(buffer_row_size); lowhigh_line[2] = (PIXEL *)allocator->Alloc(buffer_row_size); - highlow_line = (PIXEL *)allocator->Alloc(buffer_row_size); - highhigh_line = (PIXEL 
*)allocator->Alloc(buffer_row_size); - - // Convert pitch from bytes to pixels - lowlow_pitch /= sizeof(PIXEL); - lowhigh_pitch /= sizeof(PIXEL); - highlow_pitch /= sizeof(PIXEL); + highlow_line = (PIXEL *)allocator->Alloc(buffer_row_size); + highhigh_line = (PIXEL *)allocator->Alloc(buffer_row_size); + + lowlow_pitch /= sizeof(PIXEL); + lowhigh_pitch /= sizeof(PIXEL); + highlow_pitch /= sizeof(PIXEL); highhigh_pitch /= sizeof(PIXEL); - output_pitch /= sizeof(PIXEL); - - // Initialize the pointers to the even and odd output rows + output_pitch /= sizeof(PIXEL); + even_output = output; - odd_output = output + output_pitch; - - // Apply the vertical border filter to the first row + odd_output = output + output_pitch; + + // First row (top border) row = 0; - - // Set pointers to the first three rows in the first highpass band lowhigh_row[0] = lowhigh + 0 * lowhigh_pitch; lowhigh_row[1] = lowhigh + 1 * lowhigh_pitch; lowhigh_row[2] = lowhigh + 2 * lowhigh_pitch; - - // Dequantize three rows of highpass coefficients in the first highpass band + DequantizeBandRow16s(lowhigh_row[0], input_width, lowhigh_quantization, lowhigh_line[0]); DequantizeBandRow16s(lowhigh_row[1], input_width, lowhigh_quantization, lowhigh_line[1]); DequantizeBandRow16s(lowhigh_row[2], input_width, lowhigh_quantization, lowhigh_line[2]); - - // Dequantize one row of coefficients each in the second and third highpass bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - + for (column = 0; column < input_width; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band + int32_t even = 0, odd = 
0; + + // Left bands - top border (descale uses DivideByShift(x,1) instead of >>1) even += 11 * lowlow[column + 0 * lowlow_pitch]; even -= 4 * lowlow[column + 1 * lowlow_pitch]; even += 1 * lowlow[column + 2 * lowlow_pitch]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highlow_line[column]; even = DivideByShift(even, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 5 * lowlow[column + 0 * lowlow_pitch]; odd += 4 * lowlow[column + 1 * lowlow_pitch]; odd -= 1 * lowlow[column + 2 * lowlow_pitch]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highlow_line[column]; odd = DivideByShift(odd, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band + + // Right bands - top border + even = 0; odd = 0; even += 11 * lowhigh_line[0][column]; even -= 4 * lowhigh_line[1][column]; even += 1 * lowhigh_line[2][column]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highhigh_line[column]; even = DivideByShift(even, 1); - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 5 * lowhigh_line[0][column]; odd += 4 * lowhigh_line[1][column]; odd -= 1 * lowhigh_line[2][column]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highhigh_line[column]; odd = DivideByShift(odd, 1); - - // Place 
the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results - InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, - input_width, output_width, descale); - - InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, - input_width, output_width, descale); - - // Advance to the next pair of even and odd output rows + + InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, input_width, output_width, descale); + InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width, descale); + even_output += 2 * output_pitch; - odd_output += 2 * output_pitch; - - // Always advance the highpass row pointers - highlow += highlow_pitch; + odd_output += 2 * output_pitch; + highlow += highlow_pitch; highhigh += highhigh_pitch; - - // Advance the row index row++; - - // Process the middle rows using the interior reconstruction filters + + // Middle rows for (; row < last_row; row++) { - // Dequantize one row from each of the two highpass bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - - // Process the entire row - for (column = 0; column < input_width; column++) + + column = 0; +#if ENABLED(NEON) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band + const int width_m4 = (input_width / 4) * 4; + for (; column < width_m4; column += 4) + { + // Left bands + InvertVerticalMiddle4Descale_NEON( + lowlow + 0 * lowlow_pitch, lowlow + 1 * lowlow_pitch, lowlow + 2 * lowlow_pitch, + highlow_line, even_lowpass, 
odd_lowpass, column); + // Right bands + InvertVerticalMiddle4Descale_NEON( + lowhigh_line[0], lowhigh_line[1], lowhigh_line[2], + highhigh_line, even_highpass, odd_highpass, column); + } + } +#endif + for (; column < input_width; column++) + { + int32_t even = 0, odd = 0; + even += lowlow[column + 0 * lowlow_pitch]; even -= lowlow[column + 2 * lowlow_pitch]; - even += 4; - even >>= 3; + even += 4; even >>= 3; even += lowlow[column + 1 * lowlow_pitch]; - - // Add the highpass correction even += highlow_line[column]; even = DivideByShift(even, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd -= lowlow[column + 0 * lowlow_pitch]; odd += lowlow[column + 2 * lowlow_pitch]; - odd += 4; - odd >>= 3; + odd += 4; odd >>= 3; odd += lowlow[column + 1 * lowlow_pitch]; - - // Subtract the highpass correction odd -= highlow_line[column]; odd = DivideByShift(odd, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band + + even = 0; odd = 0; even += lowhigh_line[0][column]; even -= lowhigh_line[2][column]; - even += 4; - even >>= 3; + even += 4; even >>= 3; even += lowhigh_line[1][column]; - - // Add the highpass correction even += highhigh_line[column]; even = DivideByShift(even, 1); - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd -= lowhigh_line[0][column]; odd += lowhigh_line[2][column]; - odd += 4; - odd >>= 3; + odd += 4; odd >>= 3; odd += 
lowhigh_line[1][column]; - - // Subtract the highpass correction odd -= highhigh_line[column]; odd = DivideByShift(odd, 1); - - // Place the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results - InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, - input_width, output_width, descale); - - InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, - input_width, output_width, descale); - - // Advance to the next input row in each band - lowlow += lowlow_pitch; - lowhigh += lowhigh_pitch; - highlow += highlow_pitch; + + InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, input_width, output_width, descale); + InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width, descale); + + lowlow += lowlow_pitch; + lowhigh += lowhigh_pitch; + highlow += highlow_pitch; highhigh += highhigh_pitch; - - // Advance to the next pair of even and odd output rows even_output += 2 * output_pitch; - odd_output += 2 * output_pitch; - + odd_output += 2 * output_pitch; + if (row < last_row - 1) { - // Compute the address of the next row in the lowhigh band PIXEL *lowhigh_row_ptr = (lowhigh + 2 * lowhigh_pitch); - - // Shift the rows in the buffer of dequantized lowhigh bands PIXEL *temp = lowhigh_line[0]; lowhigh_line[0] = lowhigh_line[1]; lowhigh_line[1] = lowhigh_line[2]; lowhigh_line[2] = temp; - - // Undo quantization for the next row in the lowhigh band DequantizeBandRow16s(lowhigh_row_ptr, input_width, lowhigh_quantization, lowhigh_line[2]); } } - - // Should have exited the loop at the last row + assert(row == last_row); - - // Advance the lowlow pointer to the last row in the band lowlow += lowlow_pitch; - - // Check that the band pointers are on the last row in each wavelet band + assert(lowlow == (lowlow_band + last_row * lowlow_pitch)); - assert(highlow == (highlow_band + last_row * 
highlow_pitch)); assert(highhigh == (highhigh_band + last_row * highhigh_pitch)); - - // Undo quantization for the highlow and highhigh bands - DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); + + DequantizeBandRow16s(highlow, input_width, highlow_quantization, highlow_line); DequantizeBandRow16s(highhigh, input_width, highhigh_quantization, highhigh_line); - - // Apply the vertical border filter to the last row + + // Last row (bottom border) for (column = 0; column < input_width; column++) { - int32_t even = 0; // Result of convolution with even filter - int32_t odd = 0; // Result of convolution with odd filter - - - /***** Compute the vertical inverse for the left two bands *****/ - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowlow[column + 0 * lowlow_pitch]; - even += 4 * lowlow[column - 1 * lowlow_pitch]; - even -= 1 * lowlow[column - 2 * lowlow_pitch]; + int32_t even = 0, odd = 0; + + // Left bands - bottom border + even += 5 * lowlow[column + 0 * lowlow_pitch]; + even += 4 * lowlow[column - 1 * lowlow_pitch]; + even -= 1 * lowlow[column - 2 * lowlow_pitch]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highlow_line[column]; even = DivideByShift(even, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= even && even <= INT16_MAX); - - // Place the even result in the even row even_lowpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 11 * lowlow[column + 0 * lowlow_pitch]; odd -= 4 * lowlow[column - 1 * lowlow_pitch]; odd += 1 * lowlow[column - 2 * lowlow_pitch]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highlow_line[column]; odd = DivideByShift(odd, 1); - - // The inverse of the left two bands should be a positive number - //assert(0 <= odd && odd <= INT16_MAX); - - // Place the odd result in the odd row 
odd_lowpass[column] = ClampPixel(odd); - - - /***** Compute the vertical inverse for the right two bands *****/ - - even = 0; - odd = 0; - - // Apply the even reconstruction filter to the lowpass band - even += 5 * lowhigh_line[2][column]; - even += 4 * lowhigh_line[1][column]; - even -= 1 * lowhigh_line[0][column]; + + // Right bands - bottom border + even = 0; odd = 0; + even += 5 * lowhigh_line[2][column]; + even += 4 * lowhigh_line[1][column]; + even -= 1 * lowhigh_line[0][column]; even += rounding; even = DivideByShift(even, 3); - - // Add the highpass correction even += highhigh_line[column]; even = DivideByShift(even, 1); - - // Place the even result in the even row even_highpass[column] = ClampPixel(even); - - // Apply the odd reconstruction filter to the lowpass band + odd += 11 * lowhigh_line[2][column]; odd -= 4 * lowhigh_line[1][column]; odd += 1 * lowhigh_line[0][column]; odd += rounding; odd = DivideByShift(odd, 3); - - // Subtract the highpass correction odd -= highhigh_line[column]; odd = DivideByShift(odd, 1); - - // Place the odd result in the odd row odd_highpass[column] = ClampPixel(odd); } - - // Apply the inverse horizontal transform to the even and odd rows and descale the results - InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, - input_width, output_width, descale); - - // Is the output wavelet shorter than twice the height of the input wavelet? 
- if (2 * row + 1 < output_height) { - InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, - input_width, output_width, descale); - } - - // Free the scratch buffers + + InvertHorizontalDescale16s(even_lowpass, even_highpass, even_output, input_width, output_width, descale); + if (2 * row + 1 < output_height) + InvertHorizontalDescale16s(odd_lowpass, odd_highpass, odd_output, input_width, output_width, descale); + allocator->Free(even_lowpass); allocator->Free(even_highpass); allocator->Free(odd_lowpass); allocator->Free(odd_highpass); - allocator->Free(lowhigh_line[0]); allocator->Free(lowhigh_line[1]); allocator->Free(lowhigh_line[2]); allocator->Free(highlow_line); allocator->Free(highhigh_line); - + return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_decoder/raw.c b/source/lib/vc5_decoder/raw.c index 7cbac6ff..27fc1d29 100755 --- a/source/lib/vc5_decoder/raw.c +++ b/source/lib/vc5_decoder/raw.c @@ -17,6 +17,11 @@ */ #include "headers.h" +#include "logcurve.h" + +#if ENABLED(NEON) +#include +#endif /*! 
@brief Pack the component arrays into an output image @@ -30,6 +35,68 @@ CODEC_ERROR PackComponentsToRAW(const UNPACKED_IMAGE *image, DIMENSION width, DIMENSION height, ENABLED_PARTS enabled_parts, uint16_t output_bit_depth, PIXEL_FORMAT output_format ) { + int component_bits = image->component_array_list[0].bits_per_component; + if (component_bits <= 0) + component_bits = 12; + if (component_bits > 16) + component_bits = 16; + if (!vc5_logcurve_bypass()) + { + // Scan ALL components to find the maximum value (not just component 0) + // This is critical because RG/BG/GD are midpoint-centered at 2^(bits-1) + COMPONENT_VALUE max_val = 0; + for (int comp = 0; comp < image->component_count; ++comp) + { + COMPONENT_VALUE *component_data = image->component_array_list[comp].data; + if (component_data != NULL) + { + size_t total = (size_t)image->component_array_list[comp].width * image->component_array_list[comp].height; +#if ENABLED(NEON) + { + size_t total_m4 = (total / 4) * 4; + int32x4_t vmax4 = vdupq_n_s32(0); + size_t idx; + for (idx = 0; idx < total_m4; idx += 4) + { + int32x4_t vals = vld1q_s32(&component_data[idx]); + vmax4 = vmaxq_s32(vmax4, vals); + } + max_val = maximum(max_val, vmaxvq_s32(vmax4)); + for (; idx < total; ++idx) + { + if (component_data[idx] > max_val) + max_val = component_data[idx]; + } + } +#else + for (size_t idx = 0; idx < total; ++idx) + { + COMPONENT_VALUE value = component_data[idx]; + if (value > max_val) + max_val = value; + } +#endif + } + } + if (max_val > 0) + { + int bits_from_data = 0; + uint32_t val = (uint32_t)max_val; + while (val > 0) + { + bits_from_data++; + val >>= 1; + } + if (bits_from_data > 0) + { + if (bits_from_data > 16) bits_from_data = 16; + if (bits_from_data < 12) bits_from_data = 12; + component_bits = minimum(component_bits, bits_from_data); + } + } + } + const int log_bits = component_bits; + // Define pointers to the rows for each input component COMPONENT_VALUE *GS_input_buffer; COMPONENT_VALUE 
*RG_input_buffer; @@ -53,6 +120,8 @@ CODEC_ERROR PackComponentsToRAW(const UNPACKED_IMAGE *image, // Compute the distance between the half rows in the Bayer grid output_half_pitch = output_pitch / 2; + const int32_t midpoint = 1 << (log_bits - 1); + for (row = 0; row < height; row++) { COMPONENT_VALUE *GS_input_row_ptr = (COMPONENT_VALUE *)((uintptr_t)GS_input_buffer + row * image->component_array_list[0].pitch); @@ -62,71 +131,157 @@ CODEC_ERROR PackComponentsToRAW(const UNPACKED_IMAGE *image, uint8_t *output_row_ptr = (uint8_t *)output_buffer + row * output_pitch; - const int32_t midpoint = 2048; - int column; output_row1_ptr = (uint16_t *)output_row_ptr; output_row2_ptr = (uint16_t *)(output_row_ptr + output_half_pitch); - // Pack the rows of Bayer components into the BYR4 pattern - for (column = 0; column < width; column++) + // Determine output ordering from format (once per row, outside column loop) + int rggb_order; + switch (output_format) + { + case PIXEL_FORMAT_RAW_RGGB_12: + case PIXEL_FORMAT_RAW_RGGB_14: + case PIXEL_FORMAT_RAW_RGGB_16: + rggb_order = 1; + break; + case PIXEL_FORMAT_RAW_GBRG_12: + case PIXEL_FORMAT_RAW_GBRG_14: + case PIXEL_FORMAT_RAW_GBRG_16: + rggb_order = 0; + break; + default: + assert(0); + rggb_order = 0; + break; + } + +#if ENABLED(NEON) + { + const int width_m4 = (width / 4) * 4; + const int32x4_t v_midpoint = vdupq_n_s32(midpoint); + const int32x4_t v_zero = vdupq_n_s32(0); + const int32x4_t v_max = vdupq_n_s32((1 << log_bits) - 1); + const int bypass = vc5_logcurve_bypass(); + const int shift = 16 - output_bit_depth; + + // NEON: process 4 columns at a time + for (column = 0; column < width_m4; column += 4) + { + int32x4_t gs = vld1q_s32(&GS_input_row_ptr[column]); + int32x4_t rg = vld1q_s32(&RG_input_row_ptr[column]); + int32x4_t bg = vld1q_s32(&BG_input_row_ptr[column]); + int32x4_t gd = vld1q_s32(&GD_input_row_ptr[column]); + + gd = vsubq_s32(gd, v_midpoint); + rg = vsubq_s32(rg, v_midpoint); + bg = vsubq_s32(bg, 
v_midpoint); + + int32x4_t r = vaddq_s32(vshlq_n_s32(rg, 1), gs); + int32x4_t b = vaddq_s32(vshlq_n_s32(bg, 1), gs); + int32x4_t g1 = vaddq_s32(gs, gd); + int32x4_t g2 = vsubq_s32(gs, gd); + + r = vmaxq_s32(vminq_s32(r, v_max), v_zero); + g1 = vmaxq_s32(vminq_s32(g1, v_max), v_zero); + g2 = vmaxq_s32(vminq_s32(g2, v_max), v_zero); + b = vmaxq_s32(vminq_s32(b, v_max), v_zero); + + // Extract to scalar for log curve lookups + store + int32_t r_a[4], g1_a[4], g2_a[4], b_a[4]; + vst1q_s32(r_a, r); + vst1q_s32(g1_a, g1); + vst1q_s32(g2_a, g2); + vst1q_s32(b_a, b); + + int k; + for (k = 0; k < 4; k++) + { + int32_t R = DecodeLogValue(r_a[k], log_bits); + int32_t G1 = DecodeLogValue(g1_a[k], log_bits); + int32_t G2 = DecodeLogValue(g2_a[k], log_bits); + int32_t B = DecodeLogValue(b_a[k], log_bits); + + if (!bypass) + { + R >>= shift; + G1 >>= shift; + G2 >>= shift; + B >>= shift; + } + + if (rggb_order) + { + output_row1_ptr[2 * (column + k) + 0] = (uint16_t)R; + output_row1_ptr[2 * (column + k) + 1] = (uint16_t)G1; + output_row2_ptr[2 * (column + k) + 0] = (uint16_t)G2; + output_row2_ptr[2 * (column + k) + 1] = (uint16_t)B; + } + else + { + output_row1_ptr[2 * (column + k) + 0] = (uint16_t)G1; + output_row1_ptr[2 * (column + k) + 1] = (uint16_t)B; + output_row2_ptr[2 * (column + k) + 0] = (uint16_t)R; + output_row2_ptr[2 * (column + k) + 1] = (uint16_t)G2; + } + } + } + } +#else + column = 0; +#endif + + // Scalar cleanup for remaining columns + for (; column < width; column++) { int32_t GS, RG, BG, GD; int32_t R, G1, G2, B; - + GS = GS_input_row_ptr[column]; RG = RG_input_row_ptr[column]; BG = BG_input_row_ptr[column]; GD = GD_input_row_ptr[column]; - - // Convert unsigned values to signed values + GD -= midpoint; RG -= midpoint; BG -= midpoint; - + R = (RG << 1) + GS; B = (BG << 1) + GS; G1 = GS + GD; G2 = GS - GD; - - R = clamp_uint(R, 12); - G1 = clamp_uint(G1, 12); - G2 = clamp_uint(G2, 12); - B = clamp_uint(B, 12); - - // Apply inverse protune log curve - R = 
DecoderLogCurve[R]; - B = DecoderLogCurve[B]; - G1 = DecoderLogCurve[G1]; - G2 = DecoderLogCurve[G2]; - - R >>= (16 - output_bit_depth); - B >>= (16 - output_bit_depth); - G1 >>= (16 - output_bit_depth); - G2 >>= (16 - output_bit_depth); - - switch (output_format) + + R = clamp_uint(R, log_bits); + G1 = clamp_uint(G1, log_bits); + G2 = clamp_uint(G2, log_bits); + B = clamp_uint(B, log_bits); + + R = DecodeLogValue(R, log_bits); + B = DecodeLogValue(B, log_bits); + G1 = DecodeLogValue(G1, log_bits); + G2 = DecodeLogValue(G2, log_bits); + + if (!vc5_logcurve_bypass()) + { + R >>= (16 - output_bit_depth); + B >>= (16 - output_bit_depth); + G1 >>= (16 - output_bit_depth); + G2 >>= (16 - output_bit_depth); + } + + if (rggb_order) + { + output_row1_ptr[2 * column + 0] = (uint16_t)R; + output_row1_ptr[2 * column + 1] = (uint16_t)G1; + output_row2_ptr[2 * column + 0] = (uint16_t)G2; + output_row2_ptr[2 * column + 1] = (uint16_t)B; + } + else { - case PIXEL_FORMAT_RAW_RGGB_12: - case PIXEL_FORMAT_RAW_RGGB_14: - output_row1_ptr[2 * column + 0] = (uint16_t)R; - output_row1_ptr[2 * column + 1] = (uint16_t)G1; - output_row2_ptr[2 * column + 0] = (uint16_t)G2; - output_row2_ptr[2 * column + 1] = (uint16_t)B; - break; - - case PIXEL_FORMAT_RAW_GBRG_12: - case PIXEL_FORMAT_RAW_GBRG_14: - output_row1_ptr[2 * column + 0] = (uint16_t)G1; - output_row1_ptr[2 * column + 1] = (uint16_t)B; - output_row2_ptr[2 * column + 0] = (uint16_t)R; - output_row2_ptr[2 * column + 1] = (uint16_t)G2; - break; - - default: - assert(0); - break; + output_row1_ptr[2 * column + 0] = (uint16_t)G1; + output_row1_ptr[2 * column + 1] = (uint16_t)B; + output_row2_ptr[2 * column + 0] = (uint16_t)R; + output_row2_ptr[2 * column + 1] = (uint16_t)G2; } } } diff --git a/source/lib/vc5_decoder/vc5_decoder.c b/source/lib/vc5_decoder/vc5_decoder.c index f36ad4cf..08791a02 100755 --- a/source/lib/vc5_decoder/vc5_decoder.c +++ b/source/lib/vc5_decoder/vc5_decoder.c @@ -78,6 +78,14 @@ CODEC_ERROR 
vc5_decoder_process(const vc5_decoder_parameters* decoding_paramet case VC5_DECODER_PIXEL_FORMAT_GBRG_14: parameters.output.format = PIXEL_FORMAT_RAW_GBRG_14; break; + + case VC5_DECODER_PIXEL_FORMAT_RGGB_16: + parameters.output.format = PIXEL_FORMAT_RAW_RGGB_16; + break; + + case VC5_DECODER_PIXEL_FORMAT_GBRG_16: + parameters.output.format = PIXEL_FORMAT_RAW_GBRG_16; + break; default: assert(0); diff --git a/source/lib/vc5_decoder/vc5_decoder.h b/source/lib/vc5_decoder/vc5_decoder.h index c87111a0..a7e2a7da 100755 --- a/source/lib/vc5_decoder/vc5_decoder.h +++ b/source/lib/vc5_decoder/vc5_decoder.h @@ -41,7 +41,11 @@ VC5_DECODER_PIXEL_FORMAT_GBRG_12 = 2, // GBRG 12bit pixels packed into 16bits VC5_DECODER_PIXEL_FORMAT_GBRG_14 = 3, // GBRG 12bit pixels packed into 16bits - + + VC5_DECODER_PIXEL_FORMAT_RGGB_16 = 4, // RGGB 16bit pixels packed into 16bits + + VC5_DECODER_PIXEL_FORMAT_GBRG_16 = 5, // GBRG 16bit pixels packed into 16bits + VC5_DECODER_PIXEL_FORMAT_DEFAULT = VC5_DECODER_PIXEL_FORMAT_RGGB_14, } VC5_DECODER_PIXEL_FORMAT; diff --git a/source/lib/vc5_decoder/vlc.c b/source/lib/vc5_decoder/vlc.c index 646fc59d..7d5b3ec6 100755 --- a/source/lib/vc5_decoder/vlc.c +++ b/source/lib/vc5_decoder/vlc.c @@ -17,6 +17,141 @@ */ #include "headers.h" +#include + +/* + Prefix lookup table for fast VLC decoding. + + Instead of linear-searching through 264 codebook entries per codeword, + peek the top VLC_PEEK_BITS bits from the bitstream buffer and use them + as an index into this table. Entries with bits > 0 are direct hits; + entries with bits == 0 require fallback to the original linear search. 
+*/ +#define VLC_PEEK_BITS 12 + +typedef struct { + uint8_t bits; /* Codeword length in bits (0 = not in table) */ + uint8_t value; /* Unsigned coefficient magnitude */ + uint16_t count; /* Run length */ +} VLC_LOOKUP; + +static VLC_LOOKUP vlc_table[1 << VLC_PEEK_BITS]; /* 4096 entries, 16KB */ +static pthread_once_t vlc_init_once = PTHREAD_ONCE_INIT; +static CODEBOOK *vlc_init_codebook = NULL; +static int vlc_fast_disabled = 0; + +/*! + @brief Build the prefix lookup table from a codebook + + For each codebook entry with codeword size <= VLC_PEEK_BITS, fill all + table slots whose top bits match the codeword. A codeword of N bits + maps to 2^(PEEK-N) table entries (the "don't care" suffix bits). +*/ +static void InitVLCTable(CODEBOOK *codebook) +{ + int codebook_length = codebook->length; + RLV *entries = (RLV *)((uint8_t *)codebook + sizeof(CODEBOOK)); + int table_size = 1 << VLC_PEEK_BITS; + + /* Clear the table (bits=0 means "not found") */ + memset(vlc_table, 0, sizeof(vlc_table)); + + for (int i = 0; i < codebook_length; i++) + { + int size = entries[i].size; + if (size < 1 || size > VLC_PEEK_BITS) + continue; + + uint32_t codeword = entries[i].bits; + int fill_count = 1 << (VLC_PEEK_BITS - size); + int base_index = (int)(codeword << (VLC_PEEK_BITS - size)); + + VLC_LOOKUP entry; + entry.bits = (uint8_t)size; + entry.value = (uint8_t)entries[i].value; + entry.count = (uint16_t)entries[i].count; + + for (int j = 0; j < fill_count; j++) + { + vlc_table[base_index + j] = entry; + } + } +} + +/*! + @brief pthread_once callback for thread-safe VLC table initialization +*/ +static void InitVLCTableOnce(void) +{ + InitVLCTable(vlc_init_codebook); + if (getenv("VLC_NO_FAST") != NULL) + vlc_fast_disabled = 1; +} + +/*! + @brief Fast combined RLV + sign bit decode using prefix lookup table + + Peeks VLC_PEEK_BITS bits from the bitstream buffer and does a single + table lookup. 
If the codeword is found and fits in the available bits, + consumes the bits and reads the sign bit inline. + + When the buffer is empty, refills before peeking. When the buffer has + fewer than VLC_PEEK_BITS bits, peeks anyway (zero-padded) and validates + that the found codeword fits within the available bits. Falls back to + GetRun() only when the codeword is genuinely longer than available bits + or not in the table. +*/ +CODEC_ERROR GetRunFast(BITSTREAM *stream, CODEBOOK *codebook, RUN *run) +{ + /* Thread-safe one-time initialization of the lookup table. + Note: assumes a single codebook is used throughout the decode session. */ + if (vlc_init_codebook == NULL) + vlc_init_codebook = codebook; + assert(vlc_init_codebook == codebook); + pthread_once(&vlc_init_once, InitVLCTableOnce); + + /* Bypass fast path if disabled via VLC_NO_FAST env var */ + if (vlc_fast_disabled) + return GetRun(stream, codebook, run); + + /* Refill if buffer is completely empty */ + if (stream->count == 0) + GetBuffer(stream); + + /* Peek top VLC_PEEK_BITS bits from buffer (zero-padded if count < PEEK). + The buffer is left-aligned in 32 bits; unused low bits are already 0. */ + uint32_t peek = stream->buffer >> (32 - VLC_PEEK_BITS); + VLC_LOOKUP *entry = &vlc_table[peek]; + + /* Validate: codeword must be in table AND fit within available bits. + When count < PEEK_BITS, the zero-padded peek may yield a wrong entry + for codewords longer than count, but we catch that with the size check. 
*/ + if (entry->bits == 0 || entry->bits > stream->count) + return GetRun(stream, codebook, run); + + /* Consume codeword bits from the buffer */ + stream->buffer <<= entry->bits; + stream->count -= entry->bits; + run->count = entry->count; + + /* Handle sign bit inline for non-zero magnitude values */ + int32_t value = (int32_t)entry->value; + if (value != 0) + { + /* Refill buffer if empty */ + if (stream->count == 0) + GetBuffer(stream); + + /* Read 1-bit sign: top bit of buffer */ + if (stream->buffer >> 31) + value = -value; + stream->buffer <<= 1; + stream->count--; + } + run->value = value; + + return CODEC_ERROR_OKAY; +} /*! @brief Parse a run length coded magnitude in the bitstream diff --git a/source/lib/vc5_decoder/vlc.h b/source/lib/vc5_decoder/vlc.h index 6ec41f69..9efcc315 100755 --- a/source/lib/vc5_decoder/vlc.h +++ b/source/lib/vc5_decoder/vlc.h @@ -95,6 +95,7 @@ extern "C" { CODEC_ERROR GetRlv(BITSTREAM *stream, CODEBOOK *codebook, RUN *run); CODEC_ERROR GetRun(BITSTREAM *stream, CODEBOOK *codebook, RUN *run); + CODEC_ERROR GetRunFast(BITSTREAM *stream, CODEBOOK *codebook, RUN *run); #ifdef __cplusplus } diff --git a/source/lib/vc5_decoder/wavelet.c b/source/lib/vc5_decoder/wavelet.c index a60eb0a9..d2c3b96e 100755 --- a/source/lib/vc5_decoder/wavelet.c +++ b/source/lib/vc5_decoder/wavelet.c @@ -53,19 +53,15 @@ CODEC_ERROR TransformInverseSpatialQuantLowpass(gpr_allocator *allocator, WAVELE assert(output->data[0] != NULL); // Check for valid quantization values - if (input->quant[0] == 0) { - input->quant[0] = 1; - } - - assert(input->quant[0] > 0); - assert(input->quant[1] > 0); - assert(input->quant[2] > 0); - assert(input->quant[3] > 0); + for (int i = 0; i < 4; ++i) + { + if (input->quant[i] == 0) + input->quant[i] = 1; + } - if (prescale > 1) + if (prescale > 1) { // This is a spatial transform for the lowpass temporal band - assert(prescale == 2); // Apply the inverse spatial transform for a lowpass band that is not prescaled 
InvertSpatialQuantDescale16s(allocator, @@ -78,10 +74,10 @@ CODEC_ERROR TransformInverseSpatialQuantLowpass(gpr_allocator *allocator, WAVELE output_width, output_height, prescale, input->quant); } - else - { - // This case does not handle any prescaling applied during encoding - assert(prescale == 0); + else + { + if (prescale != 0) + prescale = 0; // Apply the inverse spatial transform for a lowpass band that is not prescaled InvertSpatialQuant16s(allocator, @@ -125,21 +121,17 @@ CODEC_ERROR TransformInverseSpatialQuantArray(gpr_allocator *allocator, assert(input->data[3] != NULL); // Check for valid quantization values - if (input->quant[0] == 0) { - input->quant[0] = 1; - } - - assert(input->quant[0] > 0); - assert(input->quant[1] > 0); - assert(input->quant[2] > 0); - assert(input->quant[3] > 0); + for (int i = 0; i < 4; ++i) + { + if (input->quant[i] == 0) + input->quant[i] = 1; + } assert(output_width > 0 && output_height > 0 && output_pitch > 0 && output_buffer != NULL); - if (prescale > 1) + if (prescale > 1) { // This is a spatial transform for the lowpass temporal band - assert(prescale == 2); // Apply the inverse spatial transform for a lowpass band that is not prescaled InvertSpatialQuantDescale16s(allocator, @@ -152,10 +144,10 @@ CODEC_ERROR TransformInverseSpatialQuantArray(gpr_allocator *allocator, output_width, output_height, prescale, input->quant); } - else - { - // This case does not handle any prescaling applied during encoding - assert(prescale == 0); + else + { + if (prescale != 0) + prescale = 0; // Apply the inverse spatial transform for a lowpass band that is not prescaled InvertSpatialQuant16s(allocator, diff --git a/source/lib/vc5_encoder/CMakeLists.txt b/source/lib/vc5_encoder/CMakeLists.txt index 9dc143b7..9ee6b96c 100644 --- a/source/lib/vc5_encoder/CMakeLists.txt +++ b/source/lib/vc5_encoder/CMakeLists.txt @@ -15,7 +15,8 @@ include_directories( "../common/public" ) # library add_library( ${LIB_NAME} STATIC ${SRC_FILES} ${INC_FILES} ) 
-target_link_libraries( ${LIB_NAME} ) +find_package( Threads REQUIRED ) +target_link_libraries( ${LIB_NAME} Threads::Threads ) # set the folder where to place the projects set_target_properties( ${LIB_NAME} PROPERTIES FOLDER lib ) diff --git a/source/lib/vc5_encoder/encoder.c b/source/lib/vc5_encoder/encoder.c index 0701a4dd..88a37d75 100755 --- a/source/lib/vc5_encoder/encoder.c +++ b/source/lib/vc5_encoder/encoder.c @@ -36,6 +36,12 @@ #if ENABLED(NEON) #include #endif +#include +#include +#include +#include +#include + /*! @brief Align the bitstream to a byte boundary @@ -352,6 +358,7 @@ CODEC_ERROR EncodeImage(IMAGE *image, STREAM *stream, RGB_IMAGE *rgb_image, ENCO return error; } + // Initialize the bitstream data structure InitBitstream(&bitstream); @@ -738,19 +745,33 @@ CODEC_ERROR ImageUnpackingProcess(const PACKED_IMAGE *input, int bits_per_component; // The configuration of component arrays is determined by the image format - switch (input->format) - { + switch (input->format) + { case PIXEL_FORMAT_RAW_RGGB_12: case PIXEL_FORMAT_RAW_RGGB_12P: - case PIXEL_FORMAT_RAW_RGGB_14: case PIXEL_FORMAT_RAW_GBRG_12: case PIXEL_FORMAT_RAW_GBRG_12P: - case PIXEL_FORMAT_RAW_RGGB_16: channel_count = 4; max_channel_width = input->width / 2; max_channel_height = input->height / 2; bits_per_component = 12; break; + + case PIXEL_FORMAT_RAW_RGGB_14: + case PIXEL_FORMAT_RAW_GBRG_14: + channel_count = 4; + max_channel_width = input->width / 2; + max_channel_height = input->height / 2; + bits_per_component = 14; + break; + + case PIXEL_FORMAT_RAW_GBRG_16: + case PIXEL_FORMAT_RAW_RGGB_16: + channel_count = 4; + max_channel_width = input->width / 2; + max_channel_height = input->height / 2; + bits_per_component = 16; + break; default: assert(0); @@ -770,6 +791,10 @@ CODEC_ERROR ImageUnpackingProcess(const PACKED_IMAGE *input, UnpackImage_14(input, output, enabled_parts, true ); break; + case PIXEL_FORMAT_RAW_GBRG_14: + UnpackImage_14(input, output, enabled_parts, false ); + 
break; + case PIXEL_FORMAT_RAW_RGGB_12: UnpackImage_12(input, output, enabled_parts, true ); break; @@ -785,6 +810,14 @@ CODEC_ERROR ImageUnpackingProcess(const PACKED_IMAGE *input, case PIXEL_FORMAT_RAW_GBRG_12P: UnpackImage_12P(input, output, enabled_parts, false ); break; + + case PIXEL_FORMAT_RAW_RGGB_16: + UnpackImage_16(input, output, enabled_parts, true ); + break; + + case PIXEL_FORMAT_RAW_GBRG_16: + UnpackImage_16(input, output, enabled_parts, false ); + break; default: assert(0); @@ -1163,14 +1196,13 @@ static void ForwardWaveletTransformRecursive(RECURSIVE_TRANSFORM_DATA *transform if( input_row_index == 0 ) { int row; - + for (row = 0; row < ROW_BUFFER_COUNT; row++) { PIXEL *input_row_ptr = (PIXEL *)((uintptr_t)input_ptr + row * input_pitch); - + FilterHorizontalRow(input_row_ptr, lowpass_buffer[row], highpass_buffer[row], input_width, prescale); } - // Process the first row as a special case for the boundary condition FilterVerticalTopRow(lowpass_buffer, highpass_buffer, output_ptr, output_width, output_pitch, midpoints, multipliers, input_row_index ); input_row_index += 2; @@ -1277,37 +1309,40 @@ static void ForwardWaveletTransform(TRANSFORM *transform, const COMPONENT_ARRAY ForwardWaveletTransformRecursive( transform_data, 0, 0, 0xFFFF ); } -/*! - @brief Encode the portion of a sample that corresponds to a single layer +/*! Thread argument for parallel forward wavelet transforms */ +typedef struct { + TRANSFORM *transform; + const COMPONENT_ARRAY *input_component; + PIXEL *lowpass_buffer[MAX_WAVELET_COUNT][ROW_BUFFER_COUNT]; + PIXEL *highpass_buffer[MAX_WAVELET_COUNT][ROW_BUFFER_COUNT]; + int midpoint_prequant; +} FORWARD_THREAD_ARG; - Samples can be contain multiple subsamples. Each subsample may correspond to - a different view. For example, an encoded video sample may contain both the - left and right subsamples in a stereo pair. 
- - Subsamples have been called tracks or channels, but this terminology can be - confused with separate video tracks in a multimedia container or the color - planes that are called channels elsewhere in this codec. - - The subsamples are decoded seperately and composited to form a single frame - that is the output of the complete process of decoding a single video sample. - For this reason, the subsamples are called layers. +static void *ForwardTransformThread(void *arg) +{ + FORWARD_THREAD_ARG *a = (FORWARD_THREAD_ARG *)arg; + ForwardWaveletTransform(a->transform, a->input_component, + a->lowpass_buffer, a->highpass_buffer, + a->midpoint_prequant); + return NULL; +} - @todo Need to reset the codec state for each layer? +/*! + @brief Encode the portion of a sample that corresponds to a single layer */ //CODEC_ERROR EncodeLayer(ENCODER *encoder, void *buffer, size_t pitch, BITSTREAM *stream) CODEC_ERROR EncodeMultipleChannels(ENCODER *encoder, const UNPACKED_IMAGE *image, BITSTREAM *stream) { CODEC_ERROR error = CODEC_ERROR_OKAY; - + int channel_count; int channel_index; channel_count = encoder->channel_count; - + #if VC5_ENABLED_PART(VC5_PART_LAYERS) if (IsPartEnabled(encoder->enabled_parts, VC5_PART_LAYERS)) { - // Write the tag value pairs that preceed the encoded wavelet tree error = EncodeLayerHeader(encoder, stream); if (error != CODEC_ERROR_OKAY) { return error; @@ -1315,43 +1350,87 @@ CODEC_ERROR EncodeMultipleChannels(ENCODER *encoder, const UNPACKED_IMAGE *image } #endif - - CODEC_STATE *codec = &encoder->codec; - - // Compute the wavelet transform tree for each channel + CODEC_STATE *codec = &encoder->codec; + + /* Phase 1: Run forward wavelet transforms in parallel (one thread per channel) */ + { + gpr_allocator *allocator = encoder->allocator; + FORWARD_THREAD_ARG thread_args[MAX_CHANNEL_COUNT]; + pthread_t threads[MAX_CHANNEL_COUNT]; + int thread_created[MAX_CHANNEL_COUNT]; + + /* Allocate per-thread scratch buffers and set up thread args */ + for 
(channel_index = 0; channel_index < channel_count; channel_index++) + { + thread_args[channel_index].transform = &encoder->transform[channel_index]; + thread_args[channel_index].input_component = &image->component_array_list[channel_index]; + thread_args[channel_index].midpoint_prequant = encoder->midpoint_prequant; + + /* Allocate per-channel scratch buffers (same sizes as encoder's shared buffers) */ + int wavelet_index; + for (wavelet_index = 0; wavelet_index < MAX_WAVELET_COUNT; wavelet_index++) + { + int channel_width = encoder->transform[channel_index].wavelet[wavelet_index]->width; + int row; + for (row = 0; row < ROW_BUFFER_COUNT; row++) + { + PIXEL *buf = allocator->Alloc(channel_width * sizeof(PIXEL) * 2); + thread_args[channel_index].lowpass_buffer[wavelet_index][row] = buf; + thread_args[channel_index].highpass_buffer[wavelet_index][row] = buf ? buf + channel_width : NULL; + } + } + + thread_created[channel_index] = (pthread_create(&threads[channel_index], NULL, + ForwardTransformThread, + &thread_args[channel_index]) == 0); + if (!thread_created[channel_index]) + { + /* Fallback: run inline if thread creation fails */ + ForwardTransformThread(&thread_args[channel_index]); + } + } + + /* Wait for all transform threads to complete */ + for (channel_index = 0; channel_index < channel_count; channel_index++) + { + if (thread_created[channel_index]) + pthread_join(threads[channel_index], NULL); + + /* Free per-channel scratch buffers */ + int wavelet_index; + for (wavelet_index = 0; wavelet_index < MAX_WAVELET_COUNT; wavelet_index++) + { + int row; + for (row = 0; row < ROW_BUFFER_COUNT; row++) + allocator->Free(thread_args[channel_index].lowpass_buffer[wavelet_index][row]); + } + } + } + + /* Phase 2: Encode channels sequentially (bitstream is serial) */ for (channel_index = 0; channel_index < channel_count; channel_index++) { - int channel_number; - - ForwardWaveletTransform(&encoder->transform[channel_index], &image->component_array_list[channel_index], 
encoder->lowpass_buffer, encoder->highpass_buffer, encoder->midpoint_prequant ); + int channel_number = encoder->channel_order_table[channel_index]; - channel_number = encoder->channel_order_table[channel_index]; - - // Encode the tag value pairs in the header for this channel - error = EncodeChannelHeader(encoder, channel_number, stream); - if (error != CODEC_ERROR_OKAY) { - return error; - } - - // Encode the lowpass and highpass bands in the wavelet tree for this channel - error = EncodeChannelSubbands(encoder, channel_number, stream); - if (error != CODEC_ERROR_OKAY) { - return error; - } - - // Encode the tag value pairs in the trailer for this channel - error = EncodeChannelTrailer(encoder, channel_number, stream); - if (error != CODEC_ERROR_OKAY) { - return error; - } - - // Check that the bitstream is alligned to a segment boundary - assert(IsAlignedSegment(stream)); - - // Update the codec state for the next channel in the bitstream - //codec->channel_number++; - codec->channel_number = (channel_number + 1); - codec->subband_number = 0; + error = EncodeChannelHeader(encoder, channel_number, stream); + if (error != CODEC_ERROR_OKAY) { + return error; + } + + error = EncodeChannelSubbands(encoder, channel_number, stream); + if (error != CODEC_ERROR_OKAY) { + return error; + } + + error = EncodeChannelTrailer(encoder, channel_number, stream); + if (error != CODEC_ERROR_OKAY) { + return error; + } + + assert(IsAlignedSegment(stream)); + + codec->channel_number = (channel_number + 1); + codec->subband_number = 0; } #if VC5_ENABLED_PART(VC5_PART_LAYERS) @@ -1906,7 +1985,7 @@ CODEC_ERROR SetEncoderQuantization(ENCODER *encoder, int channel_count = encoder->channel_count; int channel_number; - const int quant_table_length = sizeof(parameters->quant_table)/sizeof(parameters->quant_table[0]); +const int quant_table_length = sizeof(parameters->quant_table)/sizeof(parameters->quant_table[0]); // Set the midpoint prequant parameter encoder->midpoint_prequant = 2; @@ 
-1914,7 +1993,31 @@ CODEC_ERROR SetEncoderQuantization(ENCODER *encoder, // Set the quantization table in each channel for (channel_number = 0; channel_number < channel_count; channel_number++) { - SetTransformQuantTable(encoder, channel_number, parameters->quant_table, quant_table_length); + QUANT scaled_table[MAX_SUBBAND_COUNT]; + memcpy(scaled_table, parameters->quant_table, sizeof(scaled_table)); + + PRECISION bits = encoder->channel[channel_number].bits_per_component; + double scale = 1.0; + if (bits > 12) + { + scale = 12.0 / (double)bits; + } + + if (scale != 1.0) + { + int i; + /* Start at index 1: index 0 is the lowpass band which must + always keep quant=1 for correct reconstruction. */ + for (i = 1; i < quant_table_length; ++i) + { + int scaled = (int)lrint((double)scaled_table[i] * scale); + if (scaled < 1) scaled = 1; + scaled_table[i] = (QUANT)scaled; + } + } + + encoder->midpoint_prequant = (bits >= 15) ? 3 : 2; + SetTransformQuantTable(encoder, channel_number, scaled_table, quant_table_length); } return CODEC_ERROR_OKAY; @@ -2110,15 +2213,16 @@ CODEC_ERROR EncodeLowpassBand(ENCODER *encoder, WAVELET *wavelet, int channel_nu for (row = 0; row < height; row++) { - uint16_t *lowpass = (uint16_t *)lowpass_row_ptr; + PIXEL *lowpass = (PIXEL *)lowpass_row_ptr; int column; for (column = 0; column < width; column++) { - BITWORD coefficient = lowpass[column]; - //assert(0 <= lowpass[column] && lowpass[column] <= COEFFICIENT_MAX); - assert(lowpass[column] <= COEFFICIENT_MAX); - assert(coefficient <= COEFFICIENT_MAX); + // Clamp the 32-bit pixel value to 16-bit range for storage + int32_t value = lowpass[column]; + if (value < 0) value = 0; + if (value > UINT16_MAX) value = UINT16_MAX; + BITWORD coefficient = (BITWORD)value; PutBits(stream, coefficient, lowpass_precision); } @@ -2552,4 +2656,3 @@ CODEC_ERROR PutVideoLowpassHeader(ENCODER *encoder, int channel_number, BITSTREA return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_encoder/forward.c 
b/source/lib/vc5_encoder/forward.c index ab54dd89..541e5a07 100755 --- a/source/lib/vc5_encoder/forward.c +++ b/source/lib/vc5_encoder/forward.c @@ -25,12 +25,12 @@ //! Rounding added to the highpass sum before division static const int32_t rounding = 4; -STATIC_INLINE PIXEL QuantizeValue(int16_t value, int32_t midpoint, int32_t multiplier ) +STATIC_INLINE PIXEL QuantizeValue(int32_t value, int32_t midpoint, int32_t multiplier ) { int less_than_zero = 0; int negate_less_than_zero = 0; - int16_t x = abs(value) + midpoint; + int32_t x = abs(value) + midpoint; if( value < 0 ) { @@ -38,7 +38,7 @@ STATIC_INLINE PIXEL QuantizeValue(int16_t value, int32_t midpoint, int32_t multi negate_less_than_zero = 1; } - x = (int32_t)(x * multiplier) >> 16; + x = ((int64_t)x * multiplier) >> 16; x = x ^ less_than_zero; x = x + negate_less_than_zero; @@ -46,7 +46,7 @@ STATIC_INLINE PIXEL QuantizeValue(int16_t value, int32_t midpoint, int32_t multi return ClampPixel(x); } -static void FilterVerticalTopBottom_Core_8x_C_(PIXEL *coefficients[], int column, int16_t* highpass, int16_t* lowpass, bool top ) +static void FilterVerticalTopBottom_Core_8x_C_(PIXEL *coefficients[], int column, PIXEL* highpass, PIXEL* lowpass, bool top ) { const int filter_coeffs_top[] = { 5, -11, 4, 4, -1, -1 }; const int filter_coeffs_bottom[] = { 1, 1, -4, -4, 11, -5 }; @@ -78,196 +78,242 @@ static void FilterVerticalTopBottom_Core_8x_C_(PIXEL *coefficients[], int column #if ENABLED(NEON) -static const uint16x8_t mask = {0x0000, 0xFFFF,0x0000,0xFFFF,0x0000,0xFFFF,0x0000, 0xFFFF}; - #define HorizontalFilter_Prescale2_4x HorizontalFilter_Prescale2_4x_NEON_ void HorizontalFilter_Prescale2_4x_NEON_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) { const int prescale_rounding = 3; - const int prescale = 2; - - int32x4_t __pairwise_sum_0_7, __highpass; - int32x4_t __diff; - int16x8_t __input_2_9; - - { - const int16x8_t __prescale_rounding = vdupq_n_s16 (prescale_rounding); - const int16x8_t __shift = vdupq_n_s16 
(-prescale); - - int16x8_t __input_0_7 = vld1q_s16( input ); - __input_2_9 = vld1q_s16( input + 2 ); - int16x8_t __input_8_15 = vld1q_s16( input + 8 ); - - __input_0_7 = vaddq_s16( __input_0_7, __prescale_rounding ); - __input_0_7 = vshlq_s16( __input_0_7, __shift ); - - __input_8_15 = vaddq_s16( __input_8_15, __prescale_rounding ); - __input_8_15 = vshlq_s16( __input_8_15, __shift ); - - __pairwise_sum_0_7 = vpaddlq_s16(__input_0_7); - int32x4_t __pairwise_sum_8_15 = vpaddlq_s16(__input_8_15); - - __input_0_7 = vbslq_s16(mask, vnegq_s16(__input_0_7), __input_0_7); - __input_8_15 = vbslq_s16(mask, vnegq_s16(__input_8_15), __input_8_15); - __diff = vextq_s32(vpaddlq_s16( __input_0_7 ), vpaddlq_s16( __input_8_15 ), 1); - - __highpass = vcombine_s32( vget_high_s32(__pairwise_sum_0_7), vget_low_s32(__pairwise_sum_8_15) ); - } - - // High pass band - { - const int32x4_t __rounding = vdupq_n_s32(rounding); + int32_t scaled[12]; + + for (int i = 0; i < 12; ++i) + { + scaled[i] = (input[i] + prescale_rounding) >> 2; + } + + int32_t sum_prev_vals[4] = { + scaled[0] + scaled[1], + scaled[2] + scaled[3], + scaled[4] + scaled[5], + scaled[6] + scaled[7] + }; + + int32_t sum_next_vals[4] = { + scaled[4] + scaled[5], + scaled[6] + scaled[7], + scaled[8] + scaled[9], + scaled[10] + scaled[11] + }; + + int32_t diff_vals[4] = { + scaled[2] - scaled[3], + scaled[4] - scaled[5], + scaled[6] - scaled[7], + scaled[8] - scaled[9] + }; + + int32x4_t sum_prev = vld1q_s32(sum_prev_vals); + int32x4_t sum_next = vld1q_s32(sum_next_vals); + int32x4_t diffs = vld1q_s32(diff_vals); + + int32x4_t hp = vsubq_s32(sum_next, sum_prev); + hp = vaddq_s32(hp, vdupq_n_s32(rounding)); + hp = vshrq_n_s32(hp, 3); + hp = vaddq_s32(hp, diffs); + vst1q_s32(highpass, hp); + + int32_t low_vals[4] = { + (input[2] + input[3] + prescale_rounding) >> 2, + (input[4] + input[5] + prescale_rounding) >> 2, + (input[6] + input[7] + prescale_rounding) >> 2, + (input[8] + input[9] + prescale_rounding) >> 2 + }; + 
vst1q_s32(lowpass, vld1q_s32(low_vals)); +} - __highpass = vsubq_s32( __highpass, __pairwise_sum_0_7 ); - __highpass = vaddq_s32( __highpass, __rounding ); - __highpass = vshrq_n_s32( __highpass, 3 ); - __highpass = vqaddq_s32( __highpass, __diff ); // Dont need to clamp because we are using saturating instruction - - vst1_s16(highpass, vmovn_s32(__highpass) ); - } - - // Low pass band - { - const int32x4_t __prescale_rounding = vdupq_n_s32(prescale_rounding); - const int32x4_t __shift = vdupq_n_s32(-prescale); - - int32x4_t __pairwise_sum_2_9 = vpaddlq_s16(__input_2_9); - - __pairwise_sum_2_9 = vaddq_s32(__pairwise_sum_2_9, __prescale_rounding); - __pairwise_sum_2_9 = vshlq_s32(__pairwise_sum_2_9, __shift); - - vst1_s16(lowpass, vmovn_s32(__pairwise_sum_2_9) ); - } +#define HorizontalFilter_Prescale3_4x HorizontalFilter_Prescale3_4x_NEON_ +void HorizontalFilter_Prescale3_4x_NEON_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) +{ + const int prescale_rounding = 7; + int32_t scaled[12]; + + for (int i = 0; i < 12; ++i) + { + scaled[i] = (input[i] + prescale_rounding) >> 3; + } + + int32_t sum_prev_vals[4] = { + scaled[0] + scaled[1], + scaled[2] + scaled[3], + scaled[4] + scaled[5], + scaled[6] + scaled[7] + }; + + int32_t sum_next_vals[4] = { + scaled[4] + scaled[5], + scaled[6] + scaled[7], + scaled[8] + scaled[9], + scaled[10] + scaled[11] + }; + + int32_t diff_vals[4] = { + scaled[2] - scaled[3], + scaled[4] - scaled[5], + scaled[6] - scaled[7], + scaled[8] - scaled[9] + }; + + int32x4_t sum_prev = vld1q_s32(sum_prev_vals); + int32x4_t sum_next = vld1q_s32(sum_next_vals); + int32x4_t diffs = vld1q_s32(diff_vals); + + int32x4_t hp = vsubq_s32(sum_next, sum_prev); + hp = vaddq_s32(hp, vdupq_n_s32(rounding)); + hp = vshrq_n_s32(hp, 3); + hp = vaddq_s32(hp, diffs); + vst1q_s32(highpass, hp); + + int32_t low_vals[4] = { + (input[2] + input[3] + prescale_rounding) >> 3, + (input[4] + input[5] + prescale_rounding) >> 3, + (input[6] + input[7] + prescale_rounding) >> 
3, + (input[8] + input[9] + prescale_rounding) >> 3 + }; + vst1q_s32(lowpass, vld1q_s32(low_vals)); } #define HorizontalFilter_Prescale0_4x HorizontalFilter_Prescale0_4x_NEON_ void HorizontalFilter_Prescale0_4x_NEON_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) { - int32x4_t __pairwise_sum_0_7, __highpass; - int32x4_t __diff; - int16x8_t __input_2_9; - - { - int16x8_t __input_0_7 = vld1q_s16( input ); - __input_2_9 = vld1q_s16( input + 2 ); - int16x8_t __input_8_15 = vld1q_s16( input + 8 ); - - __pairwise_sum_0_7 = vpaddlq_s16(__input_0_7); - int32x4_t __pairwise_sum_8_15 = vpaddlq_s16(__input_8_15); - - __input_0_7 = vbslq_s16(mask, vnegq_s16(__input_0_7), __input_0_7); - __input_8_15 = vbslq_s16(mask, vnegq_s16(__input_8_15), __input_8_15); - __diff = vextq_s32(vpaddlq_s16( __input_0_7 ), vpaddlq_s16( __input_8_15 ), 1); - - __highpass = vcombine_s32( vget_high_s32(__pairwise_sum_0_7), vget_low_s32(__pairwise_sum_8_15) ); - } - - // High pass band - { - const int32x4_t __rounding = vdupq_n_s32(rounding); - - __highpass = vsubq_s32( __highpass, __pairwise_sum_0_7 ); - __highpass = vaddq_s32( __highpass, __rounding ); - __highpass = vshrq_n_s32( __highpass, 3 ); - __highpass = vqaddq_s32( __highpass, __diff ); // Dont need to clamp because we are using saturating instruction - - vst1_s16(highpass, vmovn_s32(__highpass) ); - } - - // Low pass band - { - int32x4_t __pairwise_sum_2_9 = vpaddlq_s16(__input_2_9); - - vst1_s16(lowpass, vmovn_s32(__pairwise_sum_2_9) ); - } + int32_t sum_prev_vals[4] = { + input[0] + input[1], + input[2] + input[3], + input[4] + input[5], + input[6] + input[7] + }; + + int32_t sum_next_vals[4] = { + input[4] + input[5], + input[6] + input[7], + input[8] + input[9], + input[10] + input[11] + }; + + int32_t diff_vals[4] = { + input[2] - input[3], + input[4] - input[5], + input[6] - input[7], + input[8] - input[9] + }; + + int32x4_t sum_prev = vld1q_s32(sum_prev_vals); + int32x4_t sum_next = vld1q_s32(sum_next_vals); + int32x4_t diffs = 
vld1q_s32(diff_vals); + + int32x4_t hp = vsubq_s32(sum_next, sum_prev); + hp = vaddq_s32(hp, vdupq_n_s32(rounding)); + hp = vshrq_n_s32(hp, 3); + hp = vaddq_s32(hp, diffs); + vst1q_s32(highpass, hp); + + int32_t low_vals[4] = { + input[2] + input[3], + input[4] + input[5], + input[6] + input[7], + input[8] + input[9] + }; + vst1q_s32(lowpass, vld1q_s32(low_vals)); } -void QuantizeBand_8x_NEON_(int16_t* wavelet_band, int16_t midpoint, int32_t multiplier, PIXEL *output ) +static INLINE int32x4_t QuantizeVector(int32x4_t values, int32_t midpoint, int32_t multiplier) { - int16x8_t __wavelet_band = vld1q_s16( wavelet_band ); + const int32x4_t midpoint_vec = vdupq_n_s32(midpoint); + int32x4_t abs_vals = vabsq_s32(values); + abs_vals = vaddq_s32(abs_vals, midpoint_vec); - int16x8_t __wavelet_band_abs = vaddq_s16( vabsq_s16(__wavelet_band), vdupq_n_s16( midpoint ) ); + const int32x2_t multiplier_vec = vdup_n_s32(multiplier); + int64x2_t prod_low = vmull_s32(vget_low_s32(abs_vals), multiplier_vec); + int64x2_t prod_high = vmull_s32(vget_high_s32(abs_vals), multiplier_vec); - int32x4_t __multipliers = vdupq_n_s32( multiplier ); + prod_low = vshrq_n_s64(prod_low, 16); + prod_high = vshrq_n_s64(prod_high, 16); - int32x4_t __value_high = vmovl_s16( vget_high_s16(__wavelet_band_abs) ); - __value_high = vmulq_s32( __value_high, __multipliers ); - - int32x4_t __value_low = vmovl_s16( vget_low_s16(__wavelet_band_abs) ); - __value_low = vmulq_s32( __value_low, __multipliers ); + int32x4_t scaled = vcombine_s32(vmovn_s64(prod_low), vmovn_s64(prod_high)); - int16x8_t __multiplied = vcombine_s16( vshrn_n_s32( __value_low, 16 ), vshrn_n_s32( __value_high, 16 ) ); + const int32x4_t zero = vdupq_n_s32(0); + uint32x4_t neg_mask = vcltq_s32(values, zero); + int32x4_t negated = vnegq_s32(scaled); + return vbslq_s32(neg_mask, negated, scaled); +} - uint16x8_t mask = vcltq_s16(__wavelet_band, vdupq_n_s16(0) ); - int16x8_t __neg_output = vnegq_s16(__multiplied); +void 
QuantizeBand_8x_NEON_(const PIXEL* wavelet_band, int32_t midpoint, int32_t multiplier, PIXEL *output ) +{ + int32x4_t v0 = vld1q_s32(wavelet_band); + int32x4_t v1 = vld1q_s32(wavelet_band + 4); + int32x4_t q0 = QuantizeVector(v0, midpoint, multiplier); + int32x4_t q1 = QuantizeVector(v1, midpoint, multiplier); + vst1q_s32(output, q0); + vst1q_s32(output + 4, q1); +} + +static INLINE void FilterVerticalCoreBlock(int32x4_t row0, + int32x4_t row1, + int32x4_t row2, + int32x4_t row3, + int32x4_t row4, + int32x4_t row5, + PIXEL *high_dest, + PIXEL *low_dest) +{ + int32x4_t high = vsubq_s32(row5, row0); + high = vaddq_s32(high, vsubq_s32(row4, row1)); + high = vaddq_s32(high, vdupq_n_s32(rounding)); + high = vshrq_n_s32(high, 3); + high = vaddq_s32(high, vsubq_s32(row2, row3)); - int16x8_t __result = vbslq_s16( mask, __neg_output, __multiplied ); + int32x4_t low = vaddq_s32(row2, row3); - vst1q_s16(output, __result); + vst1q_s32(high_dest, high); + vst1q_s32(low_dest, low); } -void FilterVerticalMiddle_Core_8x_NEON_(PIXEL *coefficients[], int column, int16_t* highpass, int16_t* lowpass ) +void FilterVerticalMiddle_Core_8x_NEON_(PIXEL *coefficients[], int column, PIXEL* highpass, PIXEL* lowpass ) { - int16x8_t __highpass, __highpass_50, __highpass_14; - - { - int16x8_t __row_0 = vld1q_s16( &coefficients[0][column] ); - int16x8_t __row_5 = vld1q_s16( &coefficients[5][column] ); - - __highpass_50 = vsubq_s16( __row_5, __row_0 ); - } - - { - int16x8_t __row_1 = vld1q_s16( &coefficients[1][column] ); - int16x8_t __row_4 = vld1q_s16( &coefficients[4][column] ); - - __highpass_14 = vsubq_s16( __row_4, __row_1 ); - } - - { - int16x8_t __rounding = vdupq_n_s16 (rounding); - - __highpass = vaddq_s16( __highpass_50, __highpass_14 ); - __highpass = vaddq_s16( __highpass, __rounding ); - __highpass = vshrq_n_s16(__highpass, 3); - } - - { - int16x8_t __row_2 = vld1q_s16( &coefficients[2][column] ); - int16x8_t __row_3 = vld1q_s16( &coefficients[3][column] ); - - int16x8_t __diff_23 = 
vsubq_s16( __row_2, __row_3 ); - int16x8_t __sum_23 = vaddq_s16( __row_2, __row_3 ); - - __highpass = vaddq_s16( __highpass, __diff_23 ); - - vst1q_s16(lowpass, __sum_23); - } - - vst1q_s16(highpass, __highpass); - + int32x4_t row0_lo = vld1q_s32(&coefficients[0][column]); + int32x4_t row0_hi = vld1q_s32(&coefficients[0][column + 4]); + int32x4_t row1_lo = vld1q_s32(&coefficients[1][column]); + int32x4_t row1_hi = vld1q_s32(&coefficients[1][column + 4]); + int32x4_t row2_lo = vld1q_s32(&coefficients[2][column]); + int32x4_t row2_hi = vld1q_s32(&coefficients[2][column + 4]); + int32x4_t row3_lo = vld1q_s32(&coefficients[3][column]); + int32x4_t row3_hi = vld1q_s32(&coefficients[3][column + 4]); + int32x4_t row4_lo = vld1q_s32(&coefficients[4][column]); + int32x4_t row4_hi = vld1q_s32(&coefficients[4][column + 4]); + int32x4_t row5_lo = vld1q_s32(&coefficients[5][column]); + int32x4_t row5_hi = vld1q_s32(&coefficients[5][column + 4]); + + FilterVerticalCoreBlock(row0_lo, row1_lo, row2_lo, row3_lo, row4_lo, row5_lo, highpass, lowpass); + FilterVerticalCoreBlock(row0_hi, row1_hi, row2_hi, row3_hi, row4_hi, row5_hi, highpass + 4, lowpass + 4); } #define FilterVerticalMiddle_8x FilterVerticalMiddle_8x_NEON_ void FilterVerticalMiddle_8x_NEON_(PIXEL *lowpass[], PIXEL *highpass[], int column, int32_t* midpoints, int32_t* multipliers, PIXEL *result[]) { - int16_t LOW[8]; - int16_t HIGH[8]; - + PIXEL LOW[8]; + PIXEL HIGH[8]; + FilterVerticalMiddle_Core_8x_NEON_( highpass, column, HIGH, LOW); QuantizeBand_8x_NEON_( LOW, midpoints[LH_BAND], multipliers[LH_BAND], result[LH_BAND] + column ); QuantizeBand_8x_NEON_( HIGH, midpoints[HH_BAND], multipliers[HH_BAND], result[HH_BAND] + column ); FilterVerticalMiddle_Core_8x_NEON_( lowpass, column, HIGH, LOW); QuantizeBand_8x_NEON_( LOW, midpoints[LL_BAND], multipliers[LL_BAND], result[LL_BAND] + column ); - QuantizeBand_8x_NEON_( HIGH, midpoints[HL_BAND], multipliers[HL_BAND], result[HL_BAND] + column ); + QuantizeBand_8x_NEON_( HIGH, 
midpoints[HL_BAND], multipliers[HL_BAND], result[HL_BAND] + column ); } #define FilterVerticalTopBottom_8x FilterVerticalTopBottom_8x_NEON_ void FilterVerticalTopBottom_8x_NEON_(PIXEL *lowpass[], PIXEL *highpass[], int column, int32_t* midpoints, int32_t* multipliers, PIXEL *result[], bool top ) { - int16_t LOW[8]; - int16_t HIGH[8]; + PIXEL LOW[8]; + PIXEL HIGH[8]; FilterVerticalTopBottom_Core_8x_C_( highpass, column, HIGH, LOW, top ); QuantizeBand_8x_NEON_( LOW, midpoints[LH_BAND], multipliers[LH_BAND], result[LH_BAND] + column ); @@ -278,6 +324,7 @@ void FilterVerticalTopBottom_8x_NEON_(PIXEL *lowpass[], PIXEL *highpass[], int c QuantizeBand_8x_NEON_( HIGH, midpoints[HL_BAND], multipliers[HL_BAND], result[HL_BAND] + column ); } + #else #define HorizontalFilter_Prescale2_4x HorizontalFilter_Prescale2_4x_C_ @@ -348,6 +395,66 @@ void HorizontalFilter_Prescale2_4x_C_(PIXEL *input, PIXEL* lowpass, PIXEL* highp lowpass[3] = (input[8] + input[9] + 3) >> 2; } +#define HorizontalFilter_Prescale3_4x HorizontalFilter_Prescale3_4x_C_ +void HorizontalFilter_Prescale3_4x_C_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) +{ + int i; + PIXEL input_c[16]; + for ( i = 0; i < 12; i++) + { + input_c[i] = (input[i] + 7) >> 3; + } + + int32_t diff_23 = input_c[2] - input_c[3]; + int32_t diff_45 = input_c[4] - input_c[5]; + int32_t diff_67 = input_c[6] - input_c[7]; + int32_t diff_89 = input_c[8] - input_c[9]; + + int32_t sum_01 = input_c[0] + input_c[1]; + int32_t sum_23 = input_c[2] + input_c[3]; + int32_t sum_45 = input_c[4] + input_c[5]; + int32_t sum_67 = input_c[6] + input_c[7]; + int32_t sum_89 = input_c[8] + input_c[9]; + int32_t sum_1011 = input_c[10] + input_c[11]; + + { + int32_t sum = sum_45 - sum_01; + sum += rounding; + sum = DivideByShift(sum, 3); + sum += diff_23; + highpass[0] = ClampPixel(sum); + } + + { + int32_t sum = sum_67 - sum_23; + sum += rounding; + sum = DivideByShift(sum, 3); + sum += diff_45; + highpass[1] = ClampPixel(sum); + } + + { + int32_t sum = 
sum_89 - sum_45; + sum += rounding; + sum = DivideByShift(sum, 3); + sum += diff_67; + highpass[2] = ClampPixel(sum); + } + + { + int32_t sum = sum_1011 - sum_67; + sum += rounding; + sum = DivideByShift(sum, 3); + sum += diff_89; + highpass[3] = ClampPixel(sum); + } + + lowpass[0] = (input[2] + input[3] + 7) >> 3; + lowpass[1] = (input[4] + input[5] + 7) >> 3; + lowpass[2] = (input[6] + input[7] + 7) >> 3; + lowpass[3] = (input[8] + input[9] + 7) >> 3; +} + #define HorizontalFilter_Prescale0_4x HorizontalFilter_Prescale0_4x_C_ void HorizontalFilter_Prescale0_4x_C_(PIXEL *input, PIXEL* lowpass, PIXEL* highpass ) { @@ -413,7 +520,7 @@ void HorizontalFilter_Prescale0_4x_C_(PIXEL *input, PIXEL* lowpass, PIXEL* highp lowpass[3] = input[8] + input[9]; } -void FilterVerticalMiddle_Core_8x_C_(PIXEL *coefficients[], int column, int16_t* highpass, int16_t* lowpass ) +void FilterVerticalMiddle_Core_8x_C_(PIXEL *coefficients[], int column, PIXEL* highpass, PIXEL* lowpass ) { PIXEL row_0[8]; PIXEL row_1[8]; @@ -456,10 +563,10 @@ void FilterVerticalMiddle_8x_C_(PIXEL *lowpass[], PIXEL *highpass[], int column, { int i; - int16_t LL[8]; - int16_t HL[8]; - int16_t LH[8]; - int16_t HH[8]; + PIXEL LL[8]; + PIXEL HL[8]; + PIXEL LH[8]; + PIXEL HH[8]; FilterVerticalMiddle_Core_8x_C_( highpass, column, HH, LH); FilterVerticalMiddle_Core_8x_C_( lowpass, column, HL, LL); @@ -478,10 +585,10 @@ void FilterVerticalTopBottom_8x_C_(PIXEL *lowpass[], PIXEL *highpass[], int colu { int i; - int16_t LL[8]; - int16_t HL[8]; - int16_t LH[8]; - int16_t HH[8]; + PIXEL LL[8]; + PIXEL HL[8]; + PIXEL LH[8]; + PIXEL HH[8]; FilterVerticalTopBottom_Core_8x_C_( highpass, column, HH, LH, top ); FilterVerticalTopBottom_Core_8x_C_( lowpass, column, HL, LL, top ); @@ -563,8 +670,12 @@ CODEC_ERROR FilterHorizontalRow(PIXEL *input, PIXEL *lowpass, PIXEL *highpass, i int last_input_column_tight = ( (last_input_column - 4) / 8) * 8; - //TODO Test this routine with other prescale values - assert(prescale == 0 || 
prescale == 2); + // Clamp prescale to supported values (0, 2, or 3) + if (prescale != 0 && prescale != 2 && prescale != 3) + { + prescale = (prescale > 3) ? 3 : ((prescale > 0) ? 2 : 0); + prescale_rounding = (1 << prescale) - 1; + } /***** Process the left border using the formula for boundary conditions *****/ @@ -581,8 +692,7 @@ CODEC_ERROR FilterHorizontalRow(PIXEL *input, PIXEL *lowpass, PIXEL *highpass, i /***** Process the internal pixels using the normal wavelet formula *****/ for (; column < last_input_column_tight; column += 8) // { - // Column index should always be divisible by two - assert((column % 2) == 0); + if (column & 1) column--; HorizontalFilter_Prescale2_4x( input + column - 2, &lowpass[column/2], &highpass[column/2] ); } @@ -592,21 +702,25 @@ CODEC_ERROR FilterHorizontalRow(PIXEL *input, PIXEL *lowpass, PIXEL *highpass, i /***** Process the internal pixels using the normal wavelet formula *****/ for (; column < last_input_column_tight; column += 8) // { - // Column index should always be divisible by two - assert((column % 2) == 0); + if (column & 1) column--; HorizontalFilter_Prescale0_4x( input + column - 2, &lowpass[column/2], &highpass[column/2] ); } } - else + else if( prescale == 3 ) { - assert(0); + /***** Process the internal pixels using the normal wavelet formula *****/ + for (; column < last_input_column_tight; column += 8) + { + if (column & 1) column--; + + HorizontalFilter_Prescale3_4x( input + column - 2, &lowpass[column/2], &highpass[column/2] ); + } } for (; column < last_input_column; column += 2) { - // Column index should always be divisible by two - assert((column % 2) == 0); + if (column & 1) column--; // Compute the lowpass coefficient lowpass[column/2] = (input[column + 0] + input[column + 1] + prescale_rounding) >> prescale; @@ -634,8 +748,7 @@ CODEC_ERROR FilterHorizontalRow(PIXEL *input, PIXEL *lowpass, PIXEL *highpass, i } } - // Should have exited the loop at the last column - assert(column == last_input_column); 
+ column = last_input_column; /***** Process the right border using the formula for boundary conditions *****/ @@ -716,7 +829,7 @@ static void FilterVerticalTopBottom_1x(PIXEL *lowpass[], PIXEL *highpass[], int // Apply the lowpass vertical filter to the lowpass horizontal results result[LL_BAND][column] = QuantizeValue( lowpass[low_band_index + 0][column] + lowpass[low_band_index + 1][column], midpoints[LL_BAND], multipliers[LL_BAND] ); - + // Apply the lowpass vertical filter to the highpass horizontal results result[LH_BAND][column] = QuantizeValue( highpass[low_band_index + 0][column] + highpass[low_band_index + 1][column], midpoints[LH_BAND], multipliers[LH_BAND] ); @@ -757,16 +870,18 @@ CODEC_ERROR FilterVerticalTopRow(PIXEL **lowpass, PIXEL **highpass, PIXEL **outp { FilterVerticalTopBottom_8x( lowpass, highpass, column, midpoints, multipliers, output, true ); - assert(output[LL_BAND][column] >= 0); + if (output[LL_BAND][column] < 0) + output[LL_BAND][column] = 0; } for (; column < wavelet_width; column++) { FilterVerticalTopBottom_1x( lowpass, highpass, column, midpoints, multipliers, output, true ); - - assert(output[LL_BAND][column] >= 0); + + if (output[LL_BAND][column] < 0) + output[LL_BAND][column] = 0; } - + return CODEC_ERROR_OKAY; } @@ -795,14 +910,16 @@ CODEC_ERROR FilterVerticalBottomRow(PIXEL **lowpass, PIXEL **highpass, PIXEL **o { FilterVerticalTopBottom_8x( lowpass, highpass, column, midpoints, multipliers, result, false ); - assert(result[LL_BAND][column] >= 0); + if (result[LL_BAND][column] < 0) + result[LL_BAND][column] = 0; } for (; column < wavelet_width; column++) { FilterVerticalTopBottom_1x( lowpass, highpass, column, midpoints, multipliers, result, false ); - assert(result[LL_BAND][column] >= 0); + if (result[LL_BAND][column] < 0) + result[LL_BAND][column] = 0; } return CODEC_ERROR_OKAY; @@ -834,18 +951,24 @@ CODEC_ERROR FilterVerticalMiddleRow(PIXEL **lowpass, PIXEL **highpass, PIXEL **o result[band] = (PIXEL *)band_row_ptr; } - for 
(; column < wavelet_width_m8; column += 8) - { - FilterVerticalMiddle_8x(lowpass, highpass, column, midpoints, multipliers, result); - } + for (; column < wavelet_width_m8; column += 8) + { + FilterVerticalMiddle_8x(lowpass, highpass, column, midpoints, multipliers, result); - for (; column < wavelet_width; column += 1) - { - FilterVerticalMiddle_1x(lowpass, highpass, column, midpoints, multipliers, result); + for (int k = 0; k < 8; ++k) + { + if (result[LL_BAND][column + k] < 0) + result[LL_BAND][column + k] = 0; + } + } + + for (; column < wavelet_width; column += 1) + { + FilterVerticalMiddle_1x(lowpass, highpass, column, midpoints, multipliers, result); + + if (result[LL_BAND][column] < 0) + result[LL_BAND][column] = 0; + } - assert(result[LL_BAND][column] >= 0); - } - return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_encoder/parameters.c b/source/lib/vc5_encoder/parameters.c index 2158a717..9670766b 100755 --- a/source/lib/vc5_encoder/parameters.c +++ b/source/lib/vc5_encoder/parameters.c @@ -83,4 +83,3 @@ CODEC_ERROR InitEncoderParameters(ENCODER_PARAMETERS *parameters) return CODEC_ERROR_OKAY; } - diff --git a/source/lib/vc5_encoder/raw.c b/source/lib/vc5_encoder/raw.c index cc1a3020..af9f8d38 100755 --- a/source/lib/vc5_encoder/raw.c +++ b/source/lib/vc5_encoder/raw.c @@ -18,8 +18,27 @@ #include "headers.h" +#include "logcurve.h" + +static INLINE uint16_t ApplyEncoderCurve(const uint16_t *table, uint16_t sample) +{ + if (vc5_logcurve_bypass()) + { + return sample; + } + return table[sample]; +} + #if ENABLED(NEON) #include + +static INLINE void StoreVectorToPixels(int16x8_t vec, PIXEL *dst) +{ + const int32x4_t lo = vmovl_s16(vget_low_s16(vec)); + const int32x4_t hi = vmovl_s16(vget_high_s16(vec)); + vst1q_s32(dst, lo); + vst1q_s32(dst + 4, hi); +} #endif /** ------------------- **/ @@ -31,12 +50,12 @@ static void UnpackPixel_14(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i uint16_t R1, G1, G2, B1; uint16_t GS, GD, RG, BG; - uint16_t 
*GS_output_row_ptr = (uint16_t *)output_buffer[0]; - uint16_t *GD_output_row_ptr = (uint16_t *)output_buffer[3]; - uint16_t *RG_output_row_ptr = (uint16_t *)output_buffer[1]; - uint16_t *BG_output_row_ptr = (uint16_t *)output_buffer[2]; + PIXEL *GS_output_row_ptr = output_buffer[0]; + PIXEL *GD_output_row_ptr = output_buffer[3]; + PIXEL *RG_output_row_ptr = output_buffer[1]; + PIXEL *BG_output_row_ptr = output_buffer[2]; - const int internal_precision = 12; + const int internal_precision = 14; const int32_t midpoint = (1 << (internal_precision - 1)); if( rggb ) @@ -55,10 +74,10 @@ static void UnpackPixel_14(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i } // Apply protune log curve - R1 = EncoderLogCurve[ R1 >> 2 ]; - G1 = EncoderLogCurve[ G1 >> 2 ]; - G2 = EncoderLogCurve[ G2 >> 2 ]; - B1 = EncoderLogCurve[ B1 >> 2 ]; + R1 = ApplyEncoderCurve(EncoderLogCurve14, R1); + G1 = ApplyEncoderCurve(EncoderLogCurve14, G1); + G2 = ApplyEncoderCurve(EncoderLogCurve14, G2); + B1 = ApplyEncoderCurve(EncoderLogCurve14, B1); // Difference the green components and subtract green from the red and blue components GS = (G1 + G2) >> 1; @@ -66,10 +85,10 @@ static void UnpackPixel_14(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i RG = (R1 - GS + 2 * midpoint) >> 1; BG = (B1 - GS + 2 * midpoint) >> 1; - GS_output_row_ptr[column] = clamp_uint(GS, internal_precision); - GD_output_row_ptr[column] = clamp_uint(GD, internal_precision); - RG_output_row_ptr[column] = clamp_uint(RG, internal_precision); - BG_output_row_ptr[column] = clamp_uint(BG, internal_precision); + GS_output_row_ptr[column] = (PIXEL)clamp_uint(GS, internal_precision); + GD_output_row_ptr[column] = (PIXEL)clamp_uint(GD, internal_precision); + RG_output_row_ptr[column] = (PIXEL)clamp_uint(RG, internal_precision); + BG_output_row_ptr[column] = (PIXEL)clamp_uint(BG, internal_precision); } #if ENABLED(NEON) @@ -80,7 +99,7 @@ static void UnpackPixel_14_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro int i; 
uint16x8x2_t row_1, row_2; - const int internal_precision = 12; + const int internal_precision = 14; const int32_t midpoint = (1 << (internal_precision - 1)); // Apply protune log curve @@ -90,8 +109,8 @@ static void UnpackPixel_14_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro for (i = 0; i < 16; ++i) { - input_row1_12b[i] = EncoderLogCurve[ input_row1_ptr[2 * column + i] >> 2 ]; - input_row2_12b[i] = EncoderLogCurve[ input_row2_ptr[2 * column + i] >> 2 ]; + input_row1_12b[i] = ApplyEncoderCurve(EncoderLogCurve14, input_row1_ptr[2 * column + i]); + input_row2_12b[i] = ApplyEncoderCurve(EncoderLogCurve14, input_row2_ptr[2 * column + i]); } row_1 = vld2q_u16( input_row1_12b ); @@ -118,23 +137,23 @@ static void UnpackPixel_14_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro int16x8_t GS, GD, RG, BG; GS = vhaddq_s16(G1, G2); - vst1q_s16( output_buffer[0] + column, GS ); + StoreVectorToPixels(GS, output_buffer[0] + column); { const int16x8_t __midpoint_x2 = vdupq_n_s16(midpoint * 2); GD = vsubq_s16(G1, G2); GD = vhaddq_s16(GD, __midpoint_x2); - vst1q_s16( output_buffer[3] + column, GD ); + StoreVectorToPixels(GD, output_buffer[3] + column); GS = vsubq_s16( __midpoint_x2, GS ); } RG = vhaddq_s16(R1, GS); - vst1q_s16( output_buffer[1] + column, RG ); + StoreVectorToPixels(RG, output_buffer[1] + column); BG = vhaddq_s16(B1, GS); - vst1q_s16( output_buffer[2] + column, BG ); + StoreVectorToPixels(BG, output_buffer[2] + column); } #else @@ -178,27 +197,228 @@ void UnpackImage_14(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_P // output->component_array_list[channel_number].pitch is pitch in bytes, so we need to convert it to pitch in PIXELS output_row_ptr_array_pitch[channel_number] = (output->component_array_list[channel_number].pitch / sizeof(PIXEL)); } - + for (row = 0; row < input_height; row++) { uint16_t* input_row2_ptr = input_row_ptr + (input_pitch / sizeof(uint16_t)); - + int column = 0; - + // Unpack the row of Bayer components from the 
BYR4 pattern elements for (; column < input_width_m8; column+= 8) { UnpackPixel_14_8x(input_row_ptr, input_row2_ptr, column, output_row_ptr_array, rggb ); } - + // Unpack the row of Bayer components from the BYR4 pattern elements for (; column < input_width; column++) { UnpackPixel_14(input_row_ptr, input_row2_ptr, column, output_row_ptr_array, rggb ); } - + input_row_ptr += input_pitch; - + + for (channel_number = 0; channel_number < MAX_CHANNEL_COUNT; channel_number++) + { + output_row_ptr_array[channel_number] += output_row_ptr_array_pitch[channel_number]; + } + } +} + +/** ------------------- **/ +/** 16 BIT INPUT FORMAT **/ +/** ------------------- **/ + +static void UnpackPixel_16(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, int column, PIXEL *output_buffer[], bool rggb ) +{ + uint16_t R1, G1, G2, B1; + uint16_t GS, GD, RG, BG; + + PIXEL *GS_output_row_ptr = output_buffer[0]; + PIXEL *GD_output_row_ptr = output_buffer[3]; + PIXEL *RG_output_row_ptr = output_buffer[1]; + PIXEL *BG_output_row_ptr = output_buffer[2]; + + const int internal_precision = 16; + const int32_t midpoint = (1 << (internal_precision - 1)); + + if( rggb ) + { + R1 = input_row1_ptr[2 * column + 0]; + G1 = input_row1_ptr[2 * column + 1]; + G2 = input_row2_ptr[2 * column + 0]; + B1 = input_row2_ptr[2 * column + 1]; + } + else + { + G1 = input_row1_ptr[2 * column + 0]; + B1 = input_row1_ptr[2 * column + 1]; + R1 = input_row2_ptr[2 * column + 0]; + G2 = input_row2_ptr[2 * column + 1]; + } + + // Apply protune log curve using full 16-bit table + R1 = ApplyEncoderCurve(EncoderLogCurve16, R1); + G1 = ApplyEncoderCurve(EncoderLogCurve16, G1); + G2 = ApplyEncoderCurve(EncoderLogCurve16, G2); + B1 = ApplyEncoderCurve(EncoderLogCurve16, B1); + + GS = (G1 + G2) >> 1; + GD = (G1 - G2 + 2 * midpoint) >> 1; + RG = (R1 - GS + 2 * midpoint) >> 1; + BG = (B1 - GS + 2 * midpoint) >> 1; + + GS_output_row_ptr[column] = (PIXEL)clamp_uint(GS, internal_precision); + GD_output_row_ptr[column] = 
(PIXEL)clamp_uint(GD, internal_precision); + RG_output_row_ptr[column] = (PIXEL)clamp_uint(RG, internal_precision); + BG_output_row_ptr[column] = (PIXEL)clamp_uint(BG, internal_precision); +} + +static void UnpackPixel_16_8x_C_(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, int column, PIXEL *output_buffer[], bool rggb ) +{ + int i; + for ( i = 0; i < 8; i++) + { + UnpackPixel_16(input_row1_ptr, input_row2_ptr, column + i, output_buffer, rggb ); + } +} + +#if ENABLED(NEON) + +#define UnpackPixel_16_8x UnpackPixel_16_8x_NEON_ + +static INLINE void UnpackPixel_16_4_helper_NEON_( + uint16x4_t r1_half, uint16x4_t g1_half, uint16x4_t g2_half, uint16x4_t b1_half, + int32x4_t v_midpoint_x2, + PIXEL *gs_out, PIXEL *rg_out, PIXEL *bg_out, PIXEL *gd_out) +{ + int32x4_t R1 = vreinterpretq_s32_u32(vmovl_u16(r1_half)); + int32x4_t G1 = vreinterpretq_s32_u32(vmovl_u16(g1_half)); + int32x4_t G2 = vreinterpretq_s32_u32(vmovl_u16(g2_half)); + int32x4_t B1 = vreinterpretq_s32_u32(vmovl_u16(b1_half)); + + int32x4_t GS = vshrq_n_s32(vaddq_s32(G1, G2), 1); + vst1q_s32(gs_out, GS); + + int32x4_t GD = vshrq_n_s32(vaddq_s32(vsubq_s32(G1, G2), v_midpoint_x2), 1); + vst1q_s32(gd_out, GD); + + int32x4_t RG = vshrq_n_s32(vaddq_s32(vsubq_s32(R1, GS), v_midpoint_x2), 1); + vst1q_s32(rg_out, RG); + + int32x4_t BG = vshrq_n_s32(vaddq_s32(vsubq_s32(B1, GS), v_midpoint_x2), 1); + vst1q_s32(bg_out, BG); +} + +static void UnpackPixel_16_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, int column, PIXEL *output_buffer[], bool rggb) +{ + int i; + uint16x8x2_t row_1, row_2; + + const int internal_precision = 16; + const int32_t midpoint = (1 << (internal_precision - 1)); + const int32x4_t v_midpoint_x2 = vdupq_n_s32(midpoint * 2); + + // Apply log curve (scalar -- 65536-entry table too large for vector LUT) + { + uint16_t input_row1_curved[16]; + uint16_t input_row2_curved[16]; + + for (i = 0; i < 16; ++i) + { + input_row1_curved[i] = ApplyEncoderCurve(EncoderLogCurve16, 
input_row1_ptr[2 * column + i]); + input_row2_curved[i] = ApplyEncoderCurve(EncoderLogCurve16, input_row2_ptr[2 * column + i]); + } + + row_1 = vld2q_u16(input_row1_curved); + row_2 = vld2q_u16(input_row2_curved); + } + + uint16x8_t r1_u16, g1_u16, g2_u16, b1_u16; + + if (rggb) + { + r1_u16 = row_1.val[0]; + g1_u16 = row_1.val[1]; + g2_u16 = row_2.val[0]; + b1_u16 = row_2.val[1]; + } + else + { + g1_u16 = row_1.val[0]; + b1_u16 = row_1.val[1]; + r1_u16 = row_2.val[0]; + g2_u16 = row_2.val[1]; + } + + // Process low 4 pixels (16-bit values overflow int16, must use 32-bit arithmetic) + UnpackPixel_16_4_helper_NEON_( + vget_low_u16(r1_u16), vget_low_u16(g1_u16), + vget_low_u16(g2_u16), vget_low_u16(b1_u16), + v_midpoint_x2, + output_buffer[0] + column, output_buffer[1] + column, + output_buffer[2] + column, output_buffer[3] + column); + + // Process high 4 pixels + UnpackPixel_16_4_helper_NEON_( + vget_high_u16(r1_u16), vget_high_u16(g1_u16), + vget_high_u16(g2_u16), vget_high_u16(b1_u16), + v_midpoint_x2, + output_buffer[0] + column + 4, output_buffer[1] + column + 4, + output_buffer[2] + column + 4, output_buffer[3] + column + 4); +} + +#else + +#define UnpackPixel_16_8x UnpackPixel_16_8x_C_ + +#endif + +void UnpackImage_16(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_PARTS enabled_parts, bool rggb ) +{ + uint8_t *input_buffer = (uint8_t *)input->buffer + input->offset; + + const DIMENSION input_width = input->width / 2; + const DIMENSION input_width_m8 = (input_width / 8) * 8; + + const DIMENSION input_height = input->height / 2; + + size_t input_pitch = input->pitch; + + PIXEL *output_row_ptr_array[MAX_CHANNEL_COUNT]; + uint32_t output_row_ptr_array_pitch[MAX_CHANNEL_COUNT]; + + uint16_t *input_row_ptr = (uint16_t*)input_buffer; + + int channel_number; + + int row; + + for (channel_number = 0; channel_number < MAX_CHANNEL_COUNT; channel_number++) + { + output_row_ptr_array[channel_number] = (PIXEL 
*)(output->component_array_list[channel_number].data); + + output_row_ptr_array_pitch[channel_number] = (output->component_array_list[channel_number].pitch / sizeof(PIXEL)); + } + + for (row = 0; row < input_height; row++) + { + uint16_t* input_row2_ptr = input_row_ptr + (input_pitch / sizeof(uint16_t)); + + int column = 0; + + for (; column < input_width_m8; column+= 8) + { + UnpackPixel_16_8x(input_row_ptr, input_row2_ptr, column, output_row_ptr_array, rggb ); + } + + for (; column < input_width; column++) + { + UnpackPixel_16(input_row_ptr, input_row2_ptr, column, output_row_ptr_array, rggb ); + } + + input_row_ptr += input_pitch; + for (channel_number = 0; channel_number < MAX_CHANNEL_COUNT; channel_number++) { output_row_ptr_array[channel_number] += output_row_ptr_array_pitch[channel_number]; @@ -215,10 +435,10 @@ static void UnpackPixel_12(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i uint16_t R1, G1, G2, B1; uint16_t GS, GD, RG, BG; - uint16_t *GS_output_row_ptr = (uint16_t *)output_buffer[0]; - uint16_t *GD_output_row_ptr = (uint16_t *)output_buffer[3]; - uint16_t *RG_output_row_ptr = (uint16_t *)output_buffer[1]; - uint16_t *BG_output_row_ptr = (uint16_t *)output_buffer[2]; + PIXEL *GS_output_row_ptr = output_buffer[0]; + PIXEL *GD_output_row_ptr = output_buffer[3]; + PIXEL *RG_output_row_ptr = output_buffer[1]; + PIXEL *BG_output_row_ptr = output_buffer[2]; const int internal_precision = 12; const int32_t midpoint = (1 << (internal_precision - 1)); @@ -239,10 +459,10 @@ static void UnpackPixel_12(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i } // Apply protune log curve - R1 = EncoderLogCurve[ R1 ]; - G1 = EncoderLogCurve[ G1 ]; - G2 = EncoderLogCurve[ G2 ]; - B1 = EncoderLogCurve[ B1 ]; + R1 = ApplyEncoderCurve(EncoderLogCurve12, R1); + G1 = ApplyEncoderCurve(EncoderLogCurve12, G1); + G2 = ApplyEncoderCurve(EncoderLogCurve12, G2); + B1 = ApplyEncoderCurve(EncoderLogCurve12, B1); // Difference the green components and subtract green from 
the red and blue components GS = (G1 + G2) >> 1; @@ -250,10 +470,10 @@ static void UnpackPixel_12(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, i RG = (R1 - GS + 2 * midpoint) >> 1; BG = (B1 - GS + 2 * midpoint) >> 1; - GS_output_row_ptr[column] = clamp_uint(GS, internal_precision); - GD_output_row_ptr[column] = clamp_uint(GD, internal_precision); - RG_output_row_ptr[column] = clamp_uint(RG, internal_precision); - BG_output_row_ptr[column] = clamp_uint(BG, internal_precision); + GS_output_row_ptr[column] = (PIXEL)clamp_uint(GS, internal_precision); + GD_output_row_ptr[column] = (PIXEL)clamp_uint(GD, internal_precision); + RG_output_row_ptr[column] = (PIXEL)clamp_uint(RG, internal_precision); + BG_output_row_ptr[column] = (PIXEL)clamp_uint(BG, internal_precision); } #if ENABLED(NEON) @@ -274,8 +494,8 @@ static void UnpackPixel_12_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro for (i = 0; i < 16; ++i) { - input_row1_12b[i] = EncoderLogCurve[ input_row1_ptr[2 * column + i] ]; - input_row2_12b[i] = EncoderLogCurve[ input_row2_ptr[2 * column + i] ]; + input_row1_12b[i] = ApplyEncoderCurve(EncoderLogCurve12, input_row1_ptr[2 * column + i]); + input_row2_12b[i] = ApplyEncoderCurve(EncoderLogCurve12, input_row2_ptr[2 * column + i]); } row_1 = vld2q_u16( input_row1_12b ); @@ -302,23 +522,23 @@ static void UnpackPixel_12_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_ro int16x8_t GS, GD, RG, BG; GS = vhaddq_s16(G1, G2); - vst1q_s16( output_buffer[0] + column, GS ); + StoreVectorToPixels(GS, output_buffer[0] + column); { const int16x8_t __midpoint_x2 = vdupq_n_s16(midpoint * 2); GD = vsubq_s16(G1, G2); GD = vhaddq_s16(GD, __midpoint_x2); - vst1q_s16( output_buffer[3] + column, GD ); + StoreVectorToPixels(GD, output_buffer[3] + column); GS = vsubq_s16( __midpoint_x2, GS ); } RG = vhaddq_s16(R1, GS); - vst1q_s16( output_buffer[1] + column, RG ); + StoreVectorToPixels(RG, output_buffer[1] + column); BG = vhaddq_s16(B1, GS); - vst1q_s16( output_buffer[2] + 
column, BG ); + StoreVectorToPixels(BG, output_buffer[2] + column); } #else @@ -442,10 +662,10 @@ static void UnpackPixel_12P(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, } // Apply protune log curve - G1 = EncoderLogCurve[ G1 ]; - B1 = EncoderLogCurve[ B1 ]; - R1 = EncoderLogCurve[ R1 ]; - G2 = EncoderLogCurve[ G2 ]; + G1 = ApplyEncoderCurve(EncoderLogCurve12, G1); + B1 = ApplyEncoderCurve(EncoderLogCurve12, B1); + R1 = ApplyEncoderCurve(EncoderLogCurve12, R1); + G2 = ApplyEncoderCurve(EncoderLogCurve12, G2); // difference the green components and subtract green from the red and blue components GS = (G1 + G2) >> 1; @@ -454,15 +674,15 @@ static void UnpackPixel_12P(uint16_t *input_row1_ptr, uint16_t *input_row2_ptr, BG = (B1 - GS + 2 * midpoint) >> 1; { // write output - uint16_t *GS_output_row_ptr = (uint16_t *)output_buffer[0]; - uint16_t *GD_output_row_ptr = (uint16_t *)output_buffer[3]; - uint16_t *RG_output_row_ptr = (uint16_t *)output_buffer[1]; - uint16_t *BG_output_row_ptr = (uint16_t *)output_buffer[2]; + PIXEL *GS_output_row_ptr = output_buffer[0]; + PIXEL *GD_output_row_ptr = output_buffer[3]; + PIXEL *RG_output_row_ptr = output_buffer[1]; + PIXEL *BG_output_row_ptr = output_buffer[2]; - GS_output_row_ptr[column] = clamp_uint(GS, internal_precision); - GD_output_row_ptr[column] = clamp_uint(GD, internal_precision); - RG_output_row_ptr[column] = clamp_uint(RG, internal_precision); - BG_output_row_ptr[column] = clamp_uint(BG, internal_precision); + GS_output_row_ptr[column] = (PIXEL)clamp_uint(GS, internal_precision); + GD_output_row_ptr[column] = (PIXEL)clamp_uint(GD, internal_precision); + RG_output_row_ptr[column] = (PIXEL)clamp_uint(RG, internal_precision); + BG_output_row_ptr[column] = (PIXEL)clamp_uint(BG, internal_precision); } } @@ -518,10 +738,10 @@ static void UnpackPixel_12P_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_r for(i = 0; i < 8; i++) { - g1[i] = EncoderLogCurve[g1[i]]; - b1[i] = EncoderLogCurve[b1[i]]; - r1[i] = 
EncoderLogCurve[r1[i]]; - g2[i] = EncoderLogCurve[g2[i]]; + g1[i] = ApplyEncoderCurve(EncoderLogCurve12, g1[i]); + b1[i] = ApplyEncoderCurve(EncoderLogCurve12, b1[i]); + r1[i] = ApplyEncoderCurve(EncoderLogCurve12, r1[i]); + g2[i] = ApplyEncoderCurve(EncoderLogCurve12, g2[i]); } int16x8_t R1, G1, G2, B1; @@ -534,22 +754,22 @@ static void UnpackPixel_12P_8x_NEON_(uint16_t *input_row1_ptr, uint16_t *input_r int16x8_t GS, GD, RG, BG; GS = vhaddq_s16(G1, G2); - vst1q_s16( output_buffer[0] + column, GS ); + StoreVectorToPixels(GS, output_buffer[0] + column); { const int16x8_t __midpoint_x2 = vdupq_n_s16(midpoint * 2); GD = vsubq_s16(G1, G2); GD = vhaddq_s16(GD, __midpoint_x2); - vst1q_s16( output_buffer[3] + column, GD ); + StoreVectorToPixels(GD, output_buffer[3] + column); GS = vsubq_s16( __midpoint_x2, GS ); } RG = vhaddq_s16(R1, GS); - vst1q_s16( output_buffer[1] + column, RG ); + StoreVectorToPixels(RG, output_buffer[1] + column); BG = vhaddq_s16(B1, GS); - vst1q_s16( output_buffer[2] + column, BG ); + StoreVectorToPixels(BG, output_buffer[2] + column); } #else @@ -618,4 +838,3 @@ void UnpackImage_12P(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_ } } } - diff --git a/source/lib/vc5_encoder/raw.h b/source/lib/vc5_encoder/raw.h index 894d60e6..b0e10c4e 100755 --- a/source/lib/vc5_encoder/raw.h +++ b/source/lib/vc5_encoder/raw.h @@ -28,6 +28,8 @@ extern "C" { void UnpackImage_12(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_PARTS enabled_parts, bool rggb ); void UnpackImage_12P(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_PARTS enabled_parts, bool rggb ); + + void UnpackImage_16(const PACKED_IMAGE *input, UNPACKED_IMAGE *output, ENABLED_PARTS enabled_parts, bool rggb ); #ifdef __cplusplus } diff --git a/source/lib/vc5_encoder/syntax.c b/source/lib/vc5_encoder/syntax.c index 14afdf1b..85708bb7 100755 --- a/source/lib/vc5_encoder/syntax.c +++ b/source/lib/vc5_encoder/syntax.c @@ -106,11 +106,11 @@ CODEC_ERROR PutTagPair(BITSTREAM 
*stream, int tag, int value) // The bitstream should be aligned on a tag word boundary assert(IsAlignedTag(stream)); - // The value must fit within a tag word - assert(((uint32_t)value & ~(uint32_t)CODEC_TAG_MASK) == 0); - - PutLong(stream, ((uint32_t )tag << 16) | (value & CODEC_TAG_MASK)); - + // The value must fit within a tag word (mask to 16 bits to handle sign-extended TAGWORD values) + assert(((uint32_t)(uint16_t)value & ~(uint32_t)CODEC_TAG_MASK) == 0); + + PutLong(stream, ((uint32_t )tag << 16) | ((uint32_t)(uint16_t)value & CODEC_TAG_MASK)); + return CODEC_ERROR_OKAY; } @@ -123,16 +123,16 @@ CODEC_ERROR PutTagPairOptional(BITSTREAM *stream, int tag, int value) { // The bitstream should be aligned on a tag word boundary assert(IsAlignedTag(stream)); - - // The value must fit within a tag word - assert(((uint32_t)value & ~(uint32_t)CODEC_TAG_MASK) == 0); - + + // The value must fit within a tag word (mask to 16 bits to handle sign-extended TAGWORD values) + assert(((uint32_t)(uint16_t)value & ~(uint32_t)CODEC_TAG_MASK) == 0); + // Set the optional tag bit //tag |= CODEC_TAG_OPTIONAL; //tag = NEG(tag); tag = neg(tag); - - PutLong(stream, ((uint32_t )tag << 16) | (value & CODEC_TAG_MASK)); + + PutLong(stream, ((uint32_t )tag << 16) | ((uint32_t)(uint16_t)value & CODEC_TAG_MASK)); return CODEC_ERROR_OKAY; } diff --git a/source/lib/vc5_encoder/vc5_encoder.c b/source/lib/vc5_encoder/vc5_encoder.c index c28d904a..00b9e267 100755 --- a/source/lib/vc5_encoder/vc5_encoder.c +++ b/source/lib/vc5_encoder/vc5_encoder.c @@ -41,11 +41,18 @@ CODEC_ERROR vc5_encoder_process(const vc5_encoder_parameters* encoding_paramet { CODEC_ERROR error = CODEC_ERROR_OKAY; IMAGE image; + memset(&image, 0, sizeof(IMAGE)); ENCODER_PARAMETERS parameters; STREAM bitstream_file; - const int max_vc5_buffer_size = 10000000; + // Allocate a conservative buffer for VC5 bitstream: 1.5x raw size + 1MB + size_t base_size = image.size; + if (base_size == 0) + { + base_size = 
(size_t)encoding_parameters->input_width * encoding_parameters->input_height * sizeof(uint16_t); + } +const size_t max_vc5_buffer_size = base_size + (base_size >> 1) + (1 << 20); // Initialize the data structure for passing parameters to the encoder InitEncoderParameters(&parameters); @@ -57,12 +64,16 @@ CODEC_ERROR vc5_encoder_process(const vc5_encoder_parameters* encoding_paramet {1, 24, 24, 12, 32, 32, 24, 128, 128, 192}, // CineForm High {1, 24, 24, 12, 24, 24, 12, 96, 96, 144}, // CineForm Filmscan-1 {1, 24, 24, 12, 24, 24, 12, 64, 64, 96}, // CineForm Filmscan-X - {1, 24, 24, 12, 24, 24, 12, 32, 32, 48} // CineForm Filmscan-2 + {1, 24, 24, 12, 24, 24, 12, 32, 32, 48}, // CineForm Filmscan-2 + {1, 12, 12, 6, 12, 12, 6, 16, 16, 24}, // CineForm Filmscan-3 (Edit-Safe) + {1, 6, 6, 4, 12, 12, 6, 16, 16, 24}, // CineForm Filmscan-4 (Near-Lossless) + {1, 4, 4, 2, 10, 10, 6, 16, 16, 24} // CineForm Filmscan-5 (Virtually Lossless) }; - if( encoding_parameters->quality_setting < VC5_ENCODER_QUALITY_SETTING_COUNT ) + int quality = encoding_parameters->quality_setting; + if( quality >= 0 && quality < VC5_ENCODER_QUALITY_SETTING_COUNT ) { - memcpy(parameters.quant_table, quant_table[encoding_parameters->quality_setting], sizeof(parameters.quant_table)); + memcpy(parameters.quant_table, quant_table[quality], sizeof(parameters.quant_table)); } } @@ -114,6 +125,14 @@ CODEC_ERROR vc5_encoder_process(const vc5_encoder_parameters* encoding_paramet case VC5_ENCODER_PIXEL_FORMAT_GBRG_12P: image.format = PIXEL_FORMAT_RAW_GBRG_12P; break; + + case VC5_ENCODER_PIXEL_FORMAT_RGGB_16: + image.format = PIXEL_FORMAT_RAW_RGGB_16; + break; + + case VC5_ENCODER_PIXEL_FORMAT_GBRG_16: + image.format = PIXEL_FORMAT_RAW_GBRG_16; + break; default: assert(0); @@ -134,6 +153,10 @@ CODEC_ERROR vc5_encoder_process(const vc5_encoder_parameters* encoding_paramet #endif vc5_buffer->buffer = encoding_parameters->mem_alloc( max_vc5_buffer_size ); + if (vc5_buffer->buffer == NULL) + { + return
CODEC_ERROR_OUTOFMEMORY; + } // Open a stream to the output file error = CreateStreamBuffer(&bitstream_file, vc5_buffer->buffer, max_vc5_buffer_size ); diff --git a/source/lib/vc5_encoder/vc5_encoder.h b/source/lib/vc5_encoder/vc5_encoder.h index 35a9b16b..0a995332 100755 --- a/source/lib/vc5_encoder/vc5_encoder.h +++ b/source/lib/vc5_encoder/vc5_encoder.h @@ -43,6 +43,10 @@ VC5_ENCODER_PIXEL_FORMAT_GBRG_12 = 3, // GBRG 12bit pixels packed into 16bits VC5_ENCODER_PIXEL_FORMAT_GBRG_12P = 4, // GBRG 12bit pixels packed into 12bits + + VC5_ENCODER_PIXEL_FORMAT_RGGB_16 = 5, // RGGB 16bit pixels packed into 16bits + + VC5_ENCODER_PIXEL_FORMAT_GBRG_16 = 6, // GBRG 16bit pixels packed into 16bits VC5_ENCODER_PIXEL_FORMAT_DEFAULT = VC5_ENCODER_PIXEL_FORMAT_RGGB_14, @@ -60,10 +64,13 @@ VC5_ENCODER_QUALITY_SETTING_HIGH = 2, // High VC5_ENCODER_QUALITY_SETTING_FS1 = 3, // Film Scan 1 VC5_ENCODER_QUALITY_SETTING_FSX = 4, // Film Scan X - VC5_ENCODER_QUALITY_SETTING_FS2 = 5, // Film Scan 2 (Highest Quality) - - VC5_ENCODER_QUALITY_SETTING_COUNT = 6, - + VC5_ENCODER_QUALITY_SETTING_FS2 = 5, // Film Scan 2 + VC5_ENCODER_QUALITY_SETTING_FS3 = 6, // Film Scan 3 (Edit-Safe) + VC5_ENCODER_QUALITY_SETTING_FS4 = 7, // Film Scan 4 (Near-Lossless) + VC5_ENCODER_QUALITY_SETTING_FS5 = 8, // Film Scan 5 (Virtually Lossless) + + VC5_ENCODER_QUALITY_SETTING_COUNT = 9, + VC5_ENCODER_QUALITY_SETTING_DEFAULT = VC5_ENCODER_QUALITY_SETTING_FSX, } VC5_ENCODER_QUALITY_SETTING; @@ -74,19 +81,19 @@ typedef struct { ENABLED_PARTS enabled_parts; - + unsigned int input_width; // Image Width in Components (Default: 4000) unsigned int input_height; // Image Height in Components (Default: 3000) int input_pitch; // Image Buffer Stride in Components (Default: 4000) - + VC5_ENCODER_PIXEL_FORMAT pixel_format; // Bayer Ordering Pattern (Default: VC5_ENCODER_BAYER_ORDERING_RGGB) - + VC5_ENCODER_QUALITY_SETTING quality_setting; // Quality setting of the encoder (Default: VC5_ENCODER_QUALITY_SETTING_FS2) - + 
gpr_malloc mem_alloc; // Callback function to allocate memory - + gpr_free mem_free; // Callback function to free memory - + } vc5_encoder_parameters; void vc5_encoder_parameters_set_default(vc5_encoder_parameters* encoding_parameters);