From bb206caff85c6a34b13c983f249183f69234c0fa Mon Sep 17 00:00:00 2001 From: jinyaoguo <384606266@qq.com> Date: Sat, 15 Nov 2025 01:14:43 -0500 Subject: [PATCH] Add OOB seed files --- contrib/largeNbDicts/largeNbDicts.c | 1085 +++ contrib/match_finders/zstd_edist.c | 558 ++ contrib/pzstd/Pzstd.cpp | 626 ++ contrib/seekable_format/zstdseek_decompress.c | 600 ++ doc/educational_decoder/zstd_decompress.c | 2323 +++++ lib/common/fse.h | 625 ++ lib/common/threading.c | 182 + lib/compress/fse_compress.c | 625 ++ lib/compress/hist.c | 191 + lib/compress/huf_compress.c | 1465 +++ lib/compress/zstd_compress.c | 7851 +++++++++++++++++ lib/compress/zstd_compress_sequences.c | 442 + lib/compress/zstd_double_fast.c | 778 ++ lib/compress/zstd_lazy.c | 2199 +++++ lib/compress/zstd_ldm.c | 745 ++ lib/compress/zstd_opt.c | 1580 ++++ lib/compress/zstdmt_compress.c | 1923 ++++ lib/decompress/huf_decompress.c | 1945 ++++ lib/decompress/zstd_decompress.c | 2410 +++++ lib/decompress/zstd_decompress_block.c | 2217 +++++ lib/deprecated/zbuff_compress.c | 167 + lib/dictBuilder/divsufsort.c | 1913 ++++ lib/dictBuilder/fastcover.c | 765 ++ lib/dictBuilder/zdict.c | 1137 +++ lib/legacy/zstd_v02.c | 3465 ++++++++ lib/legacy/zstd_v03.c | 3105 +++++++ lib/legacy/zstd_v04.c | 3598 ++++++++ lib/legacy/zstd_v05.c | 4005 +++++++++ lib/legacy/zstd_v06.c | 4110 +++++++++ lib/legacy/zstd_v07.c | 4490 ++++++++++ programs/benchzstd.c | 1270 +++ programs/dibio.c | 440 + programs/fileio.c | 3473 ++++++++ programs/fileio_asyncio.c | 663 ++ programs/lorem.c | 285 + programs/util.c | 1730 ++++ programs/zstdcli.c | 1668 ++++ 37 files changed, 66654 insertions(+) create mode 100644 contrib/largeNbDicts/largeNbDicts.c create mode 100644 contrib/match_finders/zstd_edist.c create mode 100644 contrib/pzstd/Pzstd.cpp create mode 100644 contrib/seekable_format/zstdseek_decompress.c create mode 100644 doc/educational_decoder/zstd_decompress.c create mode 100644 lib/common/fse.h create mode 100644 lib/common/threading.c create mode 100644 lib/compress/fse_compress.c create mode 100644 lib/compress/hist.c create mode 100644 lib/compress/huf_compress.c create mode 100644 lib/compress/zstd_compress.c create mode 100644 lib/compress/zstd_compress_sequences.c create mode 100644 lib/compress/zstd_double_fast.c create mode 100644 lib/compress/zstd_lazy.c create mode 100644 lib/compress/zstd_ldm.c create mode 100644 lib/compress/zstd_opt.c create mode 100644 lib/compress/zstdmt_compress.c create mode 100644 lib/decompress/huf_decompress.c create mode 100644 lib/decompress/zstd_decompress.c create mode 100644 lib/decompress/zstd_decompress_block.c create mode 100644 lib/deprecated/zbuff_compress.c create mode 100644 lib/dictBuilder/divsufsort.c create mode 100644 lib/dictBuilder/fastcover.c create mode 100644 lib/dictBuilder/zdict.c create mode 100644 lib/legacy/zstd_v02.c create mode 100644 lib/legacy/zstd_v03.c create mode 100644 lib/legacy/zstd_v04.c create mode 100644 lib/legacy/zstd_v05.c create mode 100644 lib/legacy/zstd_v06.c create mode 100644 lib/legacy/zstd_v07.c create mode 100644 programs/benchzstd.c create mode 100644 programs/dibio.c create mode 100644 programs/fileio.c create mode 100644 programs/fileio_asyncio.c create mode 100644 programs/lorem.c create mode 100644 programs/util.c create mode 100644 programs/zstdcli.c diff --git a/contrib/largeNbDicts/largeNbDicts.c b/contrib/largeNbDicts/largeNbDicts.c new file mode 100644 index 0000000..6502f12 --- /dev/null +++ b/contrib/largeNbDicts/largeNbDicts.c @@ -0,0 +1,1085 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* largeNbDicts + * This is a benchmark test tool + * dedicated to the specific case of dictionary decompression + * using a very large nb of dictionaries + * thus suffering latency from lots of cache misses. + * It's created in a bid to investigate performance and find optimizations. */ + + +/*--- Dependencies ---*/ + +#include /* size_t */ +#include /* malloc, free, abort, qsort*/ +#include /* fprintf */ +#include /* UINT_MAX */ +#include /* assert */ + +#include "util.h" +#include "benchfn.h" +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" +#include "zdict.h" + + +/*--- Constants --- */ + +#define KB *(1<<10) +#define MB *(1<<20) + +#define BLOCKSIZE_DEFAULT 0 /* no slicing into blocks */ +#define DICTSIZE (4 KB) +#define CLEVEL_DEFAULT 3 +#define DICT_LOAD_METHOD ZSTD_dlm_byCopy + +#define BENCH_TIME_DEFAULT_S 6 +#define RUN_TIME_DEFAULT_MS 1000 +#define BENCH_TIME_DEFAULT_MS (BENCH_TIME_DEFAULT_S * RUN_TIME_DEFAULT_MS) + +#define DISPLAY_LEVEL_DEFAULT 3 + +#define BENCH_SIZE_MAX (1200 MB) + + +/*--- Macros ---*/ + +#define CONTROL(c) { if (!(c)) abort(); } +#undef MIN +#define MIN(a,b) ((a) < (b) ? (a) : (b)) + + +/*--- Display Macros ---*/ + +#define DISPLAY(...) fprintf(stdout, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) { if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } } +static int g_displayLevel = DISPLAY_LEVEL_DEFAULT; /* 0 : no display, 1: errors, 2 : + result + interaction + warnings, 3 : + progression, 4 : + information */ + + +/*--- buffer_t ---*/ + +typedef struct { + void* ptr; + size_t size; + size_t capacity; +} buffer_t; + +static const buffer_t kBuffNull = { NULL, 0, 0 }; + +/* @return : kBuffNull if any error */ +static buffer_t createBuffer(size_t capacity) +{ + assert(capacity > 0); + void* const ptr = malloc(capacity); + if (ptr==NULL) return kBuffNull; + + buffer_t buffer; + buffer.ptr = ptr; + buffer.capacity = capacity; + buffer.size = 0; + return buffer; +} + +static void freeBuffer(buffer_t buff) +{ + free(buff.ptr); +} + + +static void fillBuffer_fromHandle(buffer_t* buff, FILE* f) +{ + size_t const readSize = fread(buff->ptr, 1, buff->capacity, f); + buff->size = readSize; +} + + +/* @return : kBuffNull if any error */ +static buffer_t createBuffer_fromFile(const char* fileName) +{ + U64 const fileSize = UTIL_getFileSize(fileName); + size_t const bufferSize = (size_t) fileSize; + + if (fileSize == UTIL_FILESIZE_UNKNOWN) return kBuffNull; + assert((U64)bufferSize == fileSize); /* check overflow */ + + { FILE* const f = fopen(fileName, "rb"); + if (f == NULL) return kBuffNull; + + buffer_t buff = createBuffer(bufferSize); + CONTROL(buff.ptr != NULL); + + fillBuffer_fromHandle(&buff, f); + CONTROL(buff.size == buff.capacity); + + fclose(f); /* do nothing specific if fclose() fails */ + return buff; + } +} + + +/* @return : kBuffNull if any error */ +static buffer_t +createDictionaryBuffer(const char* dictionaryName, + const void* srcBuffer, + const size_t* srcBlockSizes, size_t nbBlocks, + size_t requestedDictSize) +{ + if (dictionaryName) { + DISPLAYLEVEL(3, "loading dictionary %s \n", dictionaryName); + return createBuffer_fromFile(dictionaryName); /* note : result might be kBuffNull */ + + } else { + + DISPLAYLEVEL(3, "creating dictionary, of target size %u bytes \n", + (unsigned)requestedDictSize); + void* const dictBuffer = malloc(requestedDictSize); + CONTROL(dictBuffer != NULL); + + assert(nbBlocks <= UINT_MAX); + size_t const dictSize = ZDICT_trainFromBuffer(dictBuffer, requestedDictSize, + srcBuffer, + srcBlockSizes, (unsigned)nbBlocks); + CONTROL(!ZSTD_isError(dictSize)); + + buffer_t result; + result.ptr = dictBuffer; + result.capacity = requestedDictSize; + result.size = dictSize; + return result; + } +} + +/*! BMK_loadFiles() : + * Loads `buffer`, with content from files listed within `fileNamesTable`. + * Fills `buffer` entirely. + * @return : 0 on success, !=0 on error */ +static int loadFiles(void* buffer, size_t bufferSize, + size_t* fileSizes, + const char* const * fileNamesTable, unsigned nbFiles) +{ + size_t pos = 0, totalSize = 0; + + for (unsigned n=0; n 0); + void* const srcBuffer = malloc(loadedSize); + assert(srcBuffer != NULL); + + assert(nbFiles > 0); + size_t* const fileSizes = (size_t*)calloc(nbFiles, sizeof(*fileSizes)); + assert(fileSizes != NULL); + + /* Load input buffer */ + int const errorCode = loadFiles(srcBuffer, loadedSize, + fileSizes, + fileNamesTable, nbFiles); + assert(errorCode == 0); + + void** sliceTable = (void**)malloc(nbFiles * sizeof(*sliceTable)); + assert(sliceTable != NULL); + + char* const ptr = (char*)srcBuffer; + size_t pos = 0; + unsigned fileNb = 0; + for ( ; (pos < loadedSize) && (fileNb < nbFiles); fileNb++) { + sliceTable[fileNb] = ptr + pos; + pos += fileSizes[fileNb]; + } + assert(pos == loadedSize); + assert(fileNb == nbFiles); + + + buffer_t buffer; + buffer.ptr = srcBuffer; + buffer.capacity = loadedSize; + buffer.size = loadedSize; + + slice_collection_t slices; + slices.slicePtrs = sliceTable; + slices.capacities = fileSizes; + slices.nbSlices = nbFiles; + + buffer_collection_t bc; + bc.buffer = buffer; + bc.slices = slices; + return bc; +} + + + + +/*--- ddict_collection_t ---*/ + +typedef struct { + ZSTD_DDict** ddicts; + size_t nbDDict; +} ddict_collection_t; + +typedef struct { + ZSTD_CDict** cdicts; + size_t nbCDict; +} cdict_collection_t; + +static const cdict_collection_t kNullCDictCollection = { NULL, 0 }; + +static void freeCDictCollection(cdict_collection_t cdictc) +{ + for (size_t dictNb=0; dictNb < cdictc.nbCDict; dictNb++) { + ZSTD_freeCDict(cdictc.cdicts[dictNb]); + } + free(cdictc.cdicts); +} + +/* returns .buffers=NULL if operation fails */ +static cdict_collection_t createCDictCollection(const void* dictBuffer, size_t dictSize, size_t nbCDict, ZSTD_dictContentType_e dictContentType, ZSTD_CCtx_params* cctxParams) +{ + ZSTD_CDict** const cdicts = malloc(nbCDict * sizeof(ZSTD_CDict*)); + if (cdicts==NULL) return kNullCDictCollection; + for (size_t dictNb=0; dictNb < nbCDict; dictNb++) { + cdicts[dictNb] = ZSTD_createCDict_advanced2(dictBuffer, dictSize, DICT_LOAD_METHOD, dictContentType, cctxParams, ZSTD_defaultCMem); + CONTROL(cdicts[dictNb] != NULL); + } + cdict_collection_t cdictc; + cdictc.cdicts = cdicts; + cdictc.nbCDict = nbCDict; + return cdictc; +} + +static const ddict_collection_t kNullDDictCollection = { NULL, 0 }; + +static void freeDDictCollection(ddict_collection_t ddictc) +{ + for (size_t dictNb=0; dictNb < ddictc.nbDDict; dictNb++) { + ZSTD_freeDDict(ddictc.ddicts[dictNb]); + } + free(ddictc.ddicts); +} + +/* returns .buffers=NULL if operation fails */ +static ddict_collection_t createDDictCollection(const void* dictBuffer, size_t dictSize, size_t nbDDict) +{ + ZSTD_DDict** const ddicts = malloc(nbDDict * sizeof(ZSTD_DDict*)); + assert(ddicts != NULL); + if (ddicts==NULL) return kNullDDictCollection; + for (size_t dictNb=0; dictNb < nbDDict; dictNb++) { + ddicts[dictNb] = ZSTD_createDDict(dictBuffer, dictSize); + assert(ddicts[dictNb] != NULL); + } + ddict_collection_t ddictc; + ddictc.ddicts = ddicts; + ddictc.nbDDict = nbDDict; + return ddictc; +} + + +/* mess with addresses, so that linear scanning dictionaries != linear address scanning */ +void shuffleCDictionaries(cdict_collection_t dicts) +{ + size_t const nbDicts = dicts.nbCDict; + for (size_t r=0; rcctx, ci->dictionaries.cdicts[ci->dictNb]); + ZSTD_compress2(ci->cctx, + dst, srcSize, + src, srcSize); + + ci->dictNb = ci->dictNb + 1; + if (ci->dictNb >= ci->nbDicts) ci->dictNb = 0; + + return srcSize; +} + +/* benched function */ +size_t decompress(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* payload) +{ + decompressInstructions* const di = (decompressInstructions*) payload; + + size_t const result = ZSTD_decompress_usingDDict(di->dctx, + dst, dstCapacity, + src, srcSize, + di->dictionaries.ddicts[di->dictNb]); + + di->dictNb = di->dictNb + 1; + if (di->dictNb >= di->nbDicts) di->dictNb = 0; + + return result; +} + +typedef enum { + fastest = 0, + median = 1, +} metricAggregatePref_e; + +/* compareFunction() : + * Sort input in decreasing order when used with qsort() */ +int compareFunction(const void *a, const void *b) +{ + double x = *(const double *)a; + double y = *(const double *)b; + if (x < y) + return 1; + else if (x > y) + return -1; + return 0; +} + +double aggregateData(double *data, size_t size, + metricAggregatePref_e metricAggregatePref) +{ + qsort(data, size, sizeof(*data), compareFunction); + if (metricAggregatePref == fastest) + return data[0]; + else /* median */ + return (data[(size - 1) / 2] + data[size / 2]) / 2; +} + +static int benchMem(slice_collection_t dstBlocks, slice_collection_t srcBlocks, + ddict_collection_t ddictionaries, + cdict_collection_t cdictionaries, unsigned nbRounds, + int benchCompression, const char *exeName, + ZSTD_CCtx_params *cctxParams, + metricAggregatePref_e metricAggregatePref) +{ + assert(dstBlocks.nbSlices == srcBlocks.nbSlices); + if (benchCompression) assert(cctxParams); + + unsigned const ms_per_round = RUN_TIME_DEFAULT_MS; + unsigned const total_time_ms = nbRounds * ms_per_round; + + double *const speedPerRound = (double *)malloc(nbRounds * sizeof(double)); + + BMK_timedFnState_t* const benchState = + BMK_createTimedFnState(total_time_ms, ms_per_round); + + decompressInstructions di = createDecompressInstructions(ddictionaries); + compressInstructions ci = + createCompressInstructions(cdictionaries, cctxParams); + void* payload = benchCompression ? (void*)&ci : (void*)&di; + BMK_benchParams_t const bp = { + .benchFn = benchCompression ? compress : decompress, + .benchPayload = payload, + .initFn = NULL, + .initPayload = NULL, + .errorFn = ZSTD_isError, + .blockCount = dstBlocks.nbSlices, + .srcBuffers = (const void* const*) srcBlocks.slicePtrs, + .srcSizes = srcBlocks.capacities, + .dstBuffers = dstBlocks.slicePtrs, + .dstCapacities = dstBlocks.capacities, + .blockResults = NULL + }; + + size_t roundNb = 0; + for (;;) { + BMK_runOutcome_t const outcome = BMK_benchTimedFn(benchState, bp); + CONTROL(BMK_isSuccessful_runOutcome(outcome)); + + BMK_runTime_t const result = BMK_extract_runTime(outcome); + double const dTime_ns = result.nanoSecPerRun; + double const dTime_sec = (double)dTime_ns / 1000000000; + size_t const srcSize = result.sumOfReturn; + double const speed_MBps = (double)srcSize / dTime_sec / (1 MB); + speedPerRound[roundNb] = speed_MBps; + if (benchCompression) + DISPLAY("Compression Speed : %.1f MB/s \r", speed_MBps); + else + DISPLAY("Decompression Speed : %.1f MB/s \r", speed_MBps); + + fflush(stdout); + if (BMK_isCompleted_TimedFn(benchState)) break; + roundNb++; + } + DISPLAY("\n"); + /* BMK_benchTimedFn may not run exactly nbRounds iterations */ + double speedAggregated = + aggregateData(speedPerRound, roundNb + 1, metricAggregatePref); + if (metricAggregatePref == fastest) + DISPLAY("Fastest Speed : %.1f MB/s \n", speedAggregated); + else + DISPLAY("Median Speed : %.1f MB/s \n", speedAggregated); + + char* csvFileName = malloc(strlen(exeName) + 5); + strcpy(csvFileName, exeName); + strcat(csvFileName, ".csv"); + FILE* csvFile = fopen(csvFileName, "r"); + if (!csvFile) { + csvFile = fopen(csvFileName, "wt"); + assert(csvFile); + fprintf(csvFile, "%s\n", exeName); + /* Print table headers */ + fprintf( + csvFile, + "Compression/Decompression,Level,nbDicts,dictAttachPref,metricAggregatePref,Speed\n"); + } else { + fclose(csvFile); + csvFile = fopen(csvFileName, "at"); + assert(csvFile); + } + + int cLevel = -1; + int dictAttachPref = -1; + if (benchCompression) { + ZSTD_CCtxParams_getParameter(cctxParams, ZSTD_c_compressionLevel, + &cLevel); + ZSTD_CCtxParams_getParameter(cctxParams, ZSTD_c_forceAttachDict, + &dictAttachPref); + } + fprintf(csvFile, "%s,%d,%ld,%d,%d,%.1f\n", + benchCompression ? "Compression" : "Decompression", cLevel, + benchCompression ? ci.nbDicts : di.nbDicts, dictAttachPref, + metricAggregatePref, speedAggregated); + fclose(csvFile); + free(csvFileName); + + freeDecompressInstructions(di); + freeCompressInstructions(ci); + BMK_freeTimedFnState(benchState); + + return 0; /* success */ +} + + +/*! bench() : + * fileName : file to load for benchmarking purpose + * dictionary : optional (can be NULL), file to load as dictionary, + * if none provided : will be calculated on the fly by the program. + * @return : 0 is success, 1+ otherwise */ +int bench(const char **fileNameTable, unsigned nbFiles, const char *dictionary, + size_t blockSize, int clevel, unsigned nbDictMax, unsigned nbBlocks, + unsigned nbRounds, int benchCompression, + ZSTD_dictContentType_e dictContentType, ZSTD_CCtx_params *cctxParams, + const char *exeName, metricAggregatePref_e metricAggregatePref) +{ + int result = 0; + + DISPLAYLEVEL(3, "loading %u files... \n", nbFiles); + buffer_collection_t const srcs = createBufferCollection_fromFiles(fileNameTable, nbFiles); + CONTROL(srcs.buffer.ptr != NULL); + buffer_t srcBuffer = srcs.buffer; + size_t const srcSize = srcBuffer.size; + DISPLAYLEVEL(3, "created src buffer of size %.1f MB \n", + (double)srcSize / (1 MB)); + + slice_collection_t const srcSlices = splitSlices(srcs.slices, blockSize, nbBlocks); + nbBlocks = (unsigned)(srcSlices.nbSlices); + DISPLAYLEVEL(3, "split input into %u blocks ", nbBlocks); + if (blockSize) + DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize); + DISPLAYLEVEL(3, "\n"); + size_t const totalSrcSlicesSize = sliceCollection_totalCapacity(srcSlices); + + + size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities)); + CONTROL(dstCapacities != NULL); + size_t dstBufferCapacity = 0; + for (size_t bnb=0; bnb='0') && (**stringPtr <='9')) { + unsigned const max = (((unsigned)(-1)) / 10) - 1; + assert(result <= max); /* check overflow */ + result *= 10, result += (unsigned)**stringPtr - '0', (*stringPtr)++ ; + } + if ((**stringPtr=='K') || (**stringPtr=='M')) { + unsigned const maxK = ((unsigned)(-1)) >> 10; + assert(result <= maxK); /* check overflow */ + result <<= 10; + if (**stringPtr=='M') { + assert(result <= maxK); /* check overflow */ + result <<= 10; + } + (*stringPtr)++; /* skip `K` or `M` */ + if (**stringPtr=='i') (*stringPtr)++; + if (**stringPtr=='B') (*stringPtr)++; + } + return result; +} + +/** longCommandWArg() : + * check if *stringPtr is the same as longCommand. + * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. + * @return 0 and doesn't modify *stringPtr otherwise. + */ +static int longCommandWArg(const char** stringPtr, const char* longCommand) +{ + size_t const comSize = strlen(longCommand); + int const result = !strncmp(*stringPtr, longCommand, comSize); + if (result) *stringPtr += comSize; + return result; +} + + +int usage(const char* exeName) +{ + DISPLAY (" \n"); + DISPLAY (" %s [Options] filename(s) \n", exeName); + DISPLAY (" \n"); + DISPLAY ("Options : \n"); + DISPLAY ("-z : benchmark compression (default) \n"); + DISPLAY ("-d : benchmark decompression \n"); + DISPLAY ("-r : recursively load all files in subdirectories (default: off) \n"); + DISPLAY ("-B# : split input into blocks of size # (default: no split) \n"); + DISPLAY ("-# : use compression level # (default: %u) \n", CLEVEL_DEFAULT); + DISPLAY ("-D # : use # as a dictionary (default: create one) \n"); + DISPLAY ("-i# : nb benchmark rounds (default: %u) \n", BENCH_TIME_DEFAULT_S); + DISPLAY ("-p# : print speed for all rounds 0=fastest 1=median (default: 0) \n"); + DISPLAY ("--nbBlocks=#: use # blocks for bench (default: one per file) \n"); + DISPLAY ("--nbDicts=# : create # dictionaries for bench (default: one per block) \n"); + DISPLAY ("-h : help (this text) \n"); + DISPLAY (" \n"); + DISPLAY ("Advanced Options (see zstd.h for documentation) : \n"); + DISPLAY ("--dedicated-dict-search\n"); + DISPLAY ("--dict-content-type=#\n"); + DISPLAY ("--dict-attach-pref=#\n"); + return 0; +} + +int bad_usage(const char* exeName) +{ + DISPLAY (" bad usage : \n"); + usage(exeName); + return 1; +} + +int main (int argc, const char** argv) +{ + int recursiveMode = 0; + int benchCompression = 1; + int dedicatedDictSearch = 0; + unsigned nbRounds = BENCH_TIME_DEFAULT_S; + const char* const exeName = argv[0]; + + if (argc < 2) return bad_usage(exeName); + + const char** nameTable = (const char**)malloc((size_t)argc * sizeof(const char*)); + assert(nameTable != NULL); + unsigned nameIdx = 0; + + const char* dictionary = NULL; + int cLevel = CLEVEL_DEFAULT; + size_t blockSize = BLOCKSIZE_DEFAULT; + unsigned nbDicts = 0; /* determine nbDicts automatically: 1 dictionary per block */ + unsigned nbBlocks = 0; /* determine nbBlocks automatically, from source and blockSize */ + ZSTD_dictContentType_e dictContentType = ZSTD_dct_auto; + ZSTD_dictAttachPref_e dictAttachPref = ZSTD_dictDefaultAttach; + ZSTD_ParamSwitch_e prefetchCDictTables = ZSTD_ps_auto; + metricAggregatePref_e metricAggregatePref = fastest; + + for (int argNb = 1; argNb < argc ; argNb++) { + const char* argument = argv[argNb]; + if (!strcmp(argument, "-h")) { free(nameTable); return usage(exeName); } + if (!strcmp(argument, "-d")) { benchCompression = 0; continue; } + if (!strcmp(argument, "-z")) { benchCompression = 1; continue; } + if (!strcmp(argument, "-r")) { recursiveMode = 1; continue; } + if (!strcmp(argument, "-D")) { argNb++; assert(argNb < argc); dictionary = argv[argNb]; continue; } + if (longCommandWArg(&argument, "-i")) { nbRounds = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "-p")) { metricAggregatePref = (int)readU32FromChar(&argument); continue;} + if (longCommandWArg(&argument, "--dictionary=")) { dictionary = argument; continue; } + if (longCommandWArg(&argument, "-B")) { blockSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "--blockSize=")) { blockSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "--nbDicts=")) { nbDicts = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "--nbBlocks=")) { nbBlocks = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "--clevel=")) { cLevel = (int)readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "--dedicated-dict-search")) { dedicatedDictSearch = 1; continue; } + if (longCommandWArg(&argument, "--dict-content-type=")) { dictContentType = (int)readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "--dict-attach-pref=")) { dictAttachPref = (int)readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "--prefetch-cdict-tables=")) { prefetchCDictTables = (int)readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "-")) { cLevel = (int)readU32FromChar(&argument); continue; } + /* anything that's not a command is a filename */ + nameTable[nameIdx++] = argument; + } + + FileNamesTable* filenameTable; + + if (recursiveMode) { +#ifndef UTIL_HAS_CREATEFILELIST + assert(0); /* missing capability, do not run */ +#endif + filenameTable = UTIL_createExpandedFNT(nameTable, nameIdx, 1 /* follow_links */); + } else { + filenameTable = UTIL_assembleFileNamesTable(nameTable, nameIdx, NULL); + nameTable = NULL; /* UTIL_createFileNamesTable() takes ownership of nameTable */ + } + + ZSTD_CCtx_params* cctxParams = ZSTD_createCCtxParams(); + ZSTD_CCtxParams_init(cctxParams, cLevel); + ZSTD_CCtxParams_setParameter(cctxParams, ZSTD_c_enableDedicatedDictSearch, dedicatedDictSearch); + ZSTD_CCtxParams_setParameter(cctxParams, ZSTD_c_nbWorkers, 0); + ZSTD_CCtxParams_setParameter(cctxParams, ZSTD_c_forceAttachDict, dictAttachPref); + ZSTD_CCtxParams_setParameter(cctxParams, ZSTD_c_prefetchCDictTables, prefetchCDictTables); + + int result = + bench(filenameTable->fileNames, (unsigned)filenameTable->tableSize, + dictionary, blockSize, cLevel, nbDicts, nbBlocks, nbRounds, + benchCompression, dictContentType, cctxParams, exeName, + metricAggregatePref); + + UTIL_freeFileNamesTable(filenameTable); + free(nameTable); + ZSTD_freeCCtxParams(cctxParams); + + return result; +} diff --git a/contrib/match_finders/zstd_edist.c b/contrib/match_finders/zstd_edist.c new file mode 100644 index 0000000..d685cdd --- /dev/null +++ b/contrib/match_finders/zstd_edist.c @@ -0,0 +1,558 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/*-************************************* +* Dependencies +***************************************/ + +/* Currently relies on qsort when combining contiguous matches. This can probably + * be avoided but would require changes to the algorithm. The qsort is far from + * the bottleneck in this algorithm even for medium sized files so it's probably + * not worth trying to address */ +#include +#include + +#include "zstd_edist.h" +#include "mem.h" + +/*-************************************* +* Constants +***************************************/ + +/* Just a sential for the entries of the diagonal matrix */ +#define ZSTD_EDIST_DIAG_MAX (S32)(1 << 30) + +/* How large should a snake be to be considered a 'big' snake. + * For an explanation of what a 'snake' is with respect to the + * edit distance matrix, see the linked paper in zstd_edist.h */ +#define ZSTD_EDIST_SNAKE_THRESH 20 + +/* After how many iterations should we start to use the heuristic + * based on 'big' snakes */ +#define ZSTD_EDIST_SNAKE_ITER_THRESH 200 + +/* After how many iterations should be just give up and take + * the best available edit script for this round */ +#define ZSTD_EDIST_EXPENSIVE_THRESH 1024 + +/*-************************************* +* Structures +***************************************/ + +typedef struct { + U32 dictIdx; + U32 srcIdx; + U32 matchLength; +} ZSTD_eDist_match; + +typedef struct { + const BYTE* dict; + const BYTE* src; + size_t dictSize; + size_t srcSize; + S32* forwardDiag; /* Entries of the forward diagonal stored here */ + S32* backwardDiag; /* Entries of the backward diagonal stored here. + * Note: this buffer and the 'forwardDiag' buffer + * are contiguous. See the ZSTD_eDist_genSequences */ + ZSTD_eDist_match* matches; /* Accumulate matches of length 1 in this buffer. + * In a subsequence post-processing step, we combine + * contiguous matches. */ + U32 nbMatches; +} ZSTD_eDist_state; + +typedef struct { + S32 dictMid; /* The mid diagonal for the dictionary */ + S32 srcMid; /* The mid diagonal for the source */ + int lowUseHeuristics; /* Should we use heuristics for the low part */ + int highUseHeuristics; /* Should we use heuristics for the high part */ +} ZSTD_eDist_partition; + +/*-************************************* +* Internal +***************************************/ + +static void ZSTD_eDist_diag(ZSTD_eDist_state* state, + ZSTD_eDist_partition* partition, + S32 dictLow, S32 dictHigh, S32 srcLow, + S32 srcHigh, int useHeuristics) +{ + S32* const forwardDiag = state->forwardDiag; + S32* const backwardDiag = state->backwardDiag; + const BYTE* const dict = state->dict; + const BYTE* const src = state->src; + + S32 const diagMin = dictLow - srcHigh; + S32 const diagMax = dictHigh - srcLow; + S32 const forwardMid = dictLow - srcLow; + S32 const backwardMid = dictHigh - srcHigh; + + S32 forwardMin = forwardMid; + S32 forwardMax = forwardMid; + S32 backwardMin = backwardMid; + S32 backwardMax = backwardMid; + int odd = (forwardMid - backwardMid) & 1; + U32 iterations; + + forwardDiag[forwardMid] = dictLow; + backwardDiag[backwardMid] = dictHigh; + + /* Main loop for updating diag entries. Unless useHeuristics is + * set to false, this loop will run until it finds the minimal + * edit script */ + for (iterations = 1;;iterations++) { + S32 diag; + int bigSnake = 0; + + if (forwardMin > diagMin) { + forwardMin--; + forwardDiag[forwardMin - 1] = -1; + } else { + forwardMin++; + } + + if (forwardMax < diagMax) { + forwardMax++; + forwardDiag[forwardMax + 1] = -1; + } else { + forwardMax--; + } + + for (diag = forwardMax; diag >= forwardMin; diag -= 2) { + S32 dictIdx; + S32 srcIdx; + S32 low = forwardDiag[diag - 1]; + S32 high = forwardDiag[diag + 1]; + S32 dictIdx0 = low < high ? high : low + 1; + + for (dictIdx = dictIdx0, srcIdx = dictIdx0 - diag; + dictIdx < dictHigh && srcIdx < srcHigh && dict[dictIdx] == src[srcIdx]; + dictIdx++, srcIdx++) continue; + + if (dictIdx - dictIdx0 > ZSTD_EDIST_SNAKE_THRESH) + bigSnake = 1; + + forwardDiag[diag] = dictIdx; + + if (odd && backwardMin <= diag && diag <= backwardMax && backwardDiag[diag] <= dictIdx) { + partition->dictMid = dictIdx; + partition->srcMid = srcIdx; + partition->lowUseHeuristics = 0; + partition->highUseHeuristics = 0; + return; + } + } + + if (backwardMin > diagMin) { + backwardMin--; + backwardDiag[backwardMin - 1] = ZSTD_EDIST_DIAG_MAX; + } else { + backwardMin++; + } + + if (backwardMax < diagMax) { + backwardMax++; + backwardDiag[backwardMax + 1] = ZSTD_EDIST_DIAG_MAX; + } else { + backwardMax--; + } + + + for (diag = backwardMax; diag >= backwardMin; diag -= 2) { + S32 dictIdx; + S32 srcIdx; + S32 low = backwardDiag[diag - 1]; + S32 high = backwardDiag[diag + 1]; + S32 dictIdx0 = low < high ? low : high - 1; + + for (dictIdx = dictIdx0, srcIdx = dictIdx0 - diag; + dictLow < dictIdx && srcLow < srcIdx && dict[dictIdx - 1] == src[srcIdx - 1]; + dictIdx--, srcIdx--) continue; + + if (dictIdx0 - dictIdx > ZSTD_EDIST_SNAKE_THRESH) + bigSnake = 1; + + backwardDiag[diag] = dictIdx; + + if (!odd && forwardMin <= diag && diag <= forwardMax && dictIdx <= forwardDiag[diag]) { + partition->dictMid = dictIdx; + partition->srcMid = srcIdx; + partition->lowUseHeuristics = 0; + partition->highUseHeuristics = 0; + return; + } + } + + if (!useHeuristics) + continue; + + /* Everything under this point is a heuristic. Using these will + * substantially speed up the match finding. In some cases, taking + * the total match finding time from several minutes to seconds. + * Of course, the caveat is that the edit script found may no longer + * be optimal */ + + /* Big snake heuristic */ + if (iterations > ZSTD_EDIST_SNAKE_ITER_THRESH && bigSnake) { + { + S32 best = 0; + + for (diag = forwardMax; diag >= forwardMin; diag -= 2) { + S32 diagDiag = diag - forwardMid; + S32 dictIdx = forwardDiag[diag]; + S32 srcIdx = dictIdx - diag; + S32 v = (dictIdx - dictLow) * 2 - diagDiag; + + if (v > 12 * (iterations + (diagDiag < 0 ? -diagDiag : diagDiag))) { + if (v > best + && dictLow + ZSTD_EDIST_SNAKE_THRESH <= dictIdx && dictIdx <= dictHigh + && srcLow + ZSTD_EDIST_SNAKE_THRESH <= srcIdx && srcIdx <= srcHigh) { + S32 k; + for (k = 1; dict[dictIdx - k] == src[srcIdx - k]; k++) { + if (k == ZSTD_EDIST_SNAKE_THRESH) { + best = v; + partition->dictMid = dictIdx; + partition->srcMid = srcIdx; + break; + } + } + } + } + } + + if (best > 0) { + partition->lowUseHeuristics = 0; + partition->highUseHeuristics = 1; + return; + } + } + + { + S32 best = 0; + + for (diag = backwardMax; diag >= backwardMin; diag -= 2) { + S32 diagDiag = diag - backwardMid; + S32 dictIdx = backwardDiag[diag]; + S32 srcIdx = dictIdx - diag; + S32 v = (dictHigh - dictIdx) * 2 + diagDiag; + + if (v > 12 * (iterations + (diagDiag < 0 ? -diagDiag : diagDiag))) { + if (v > best + && dictLow < dictIdx && dictIdx <= dictHigh - ZSTD_EDIST_SNAKE_THRESH + && srcLow < srcIdx && srcIdx <= srcHigh - ZSTD_EDIST_SNAKE_THRESH) { + int k; + for (k = 0; dict[dictIdx + k] == src[srcIdx + k]; k++) { + if (k == ZSTD_EDIST_SNAKE_THRESH - 1) { + best = v; + partition->dictMid = dictIdx; + partition->srcMid = srcIdx; + break; + } + } + } + } + } + + if (best > 0) { + partition->lowUseHeuristics = 1; + partition->highUseHeuristics = 0; + return; + } + } + } + + /* More general 'too expensive' heuristic */ + if (iterations >= ZSTD_EDIST_EXPENSIVE_THRESH) { + S32 forwardDictSrcBest; + S32 forwardDictBest = 0; + S32 backwardDictSrcBest; + S32 backwardDictBest = 0; + + forwardDictSrcBest = -1; + for (diag = forwardMax; diag >= forwardMin; diag -= 2) { + S32 dictIdx = MIN(forwardDiag[diag], dictHigh); + S32 srcIdx = dictIdx - diag; + + if (srcHigh < srcIdx) { + dictIdx = srcHigh + diag; + srcIdx = srcHigh; + } + + if (forwardDictSrcBest < dictIdx + srcIdx) { + forwardDictSrcBest = dictIdx + srcIdx; + forwardDictBest = dictIdx; + } + } + + backwardDictSrcBest = ZSTD_EDIST_DIAG_MAX; + for (diag = backwardMax; diag >= backwardMin; diag -= 2) { + S32 dictIdx = MAX(dictLow, backwardDiag[diag]); + S32 srcIdx = dictIdx - diag; + + if (srcIdx < srcLow) { + dictIdx = srcLow + diag; + srcIdx = srcLow; + } + + if (dictIdx + srcIdx < backwardDictSrcBest) { + backwardDictSrcBest = dictIdx + srcIdx; + backwardDictBest = dictIdx; + } + } + + if ((dictHigh + srcHigh) - backwardDictSrcBest < forwardDictSrcBest - (dictLow + srcLow)) { + partition->dictMid = forwardDictBest; + partition->srcMid = forwardDictSrcBest - forwardDictBest; + partition->lowUseHeuristics = 0; + partition->highUseHeuristics = 1; + } else { + partition->dictMid = backwardDictBest; + partition->srcMid = backwardDictSrcBest - backwardDictBest; + partition->lowUseHeuristics = 1; + partition->highUseHeuristics = 0; + } + return; + } + } +} + +static void ZSTD_eDist_insertMatch(ZSTD_eDist_state* state, + S32 const dictIdx, S32 const srcIdx) +{ + state->matches[state->nbMatches].dictIdx = dictIdx; + state->matches[state->nbMatches].srcIdx = srcIdx; + state->matches[state->nbMatches].matchLength = 1; + state->nbMatches++; +} + +static int ZSTD_eDist_compare(ZSTD_eDist_state* state, + S32 dictLow, S32 dictHigh, S32 srcLow, + S32 srcHigh, int useHeuristics) +{ + const BYTE* const dict = state->dict; + const BYTE* const src = state->src; + + /* Found matches while traversing from the low end */ + while (dictLow < dictHigh && srcLow < srcHigh && dict[dictLow] == src[srcLow]) { + ZSTD_eDist_insertMatch(state, dictLow, srcLow); + dictLow++; + srcLow++; + } + + /* Found matches while traversing from the high end */ + while (dictLow < dictHigh && srcLow < srcHigh && dict[dictHigh - 1] == src[srcHigh - 1]) { + ZSTD_eDist_insertMatch(state, dictHigh - 1, srcHigh - 1); + dictHigh--; + srcHigh--; + } + + /* If the low and high end end up touching. If we wanted to make + * note of the differences like most diffing algorithms do, we would + * do so here. In our case, we're only concerned with matches + * Note: if you wanted to find the edit distance of the algorithm, + * you could just accumulate the cost for an insertion/deletion + * below. */ + if (dictLow == dictHigh) { + while (srcLow < srcHigh) { + /* Reaching this point means inserting src[srcLow] into + * the current position of dict */ + srcLow++; + } + } else if (srcLow == srcHigh) { + while (dictLow < dictHigh) { + /* Reaching this point means deleting dict[dictLow] from + * the current position of dict */ + dictLow++; + } + } else { + ZSTD_eDist_partition partition; + partition.dictMid = 0; + partition.srcMid = 0; + ZSTD_eDist_diag(state, &partition, dictLow, dictHigh, + srcLow, srcHigh, useHeuristics); + if (ZSTD_eDist_compare(state, dictLow, partition.dictMid, + srcLow, partition.srcMid, partition.lowUseHeuristics)) + return 1; + if (ZSTD_eDist_compare(state, partition.dictMid, dictHigh, + partition.srcMid, srcHigh, partition.highUseHeuristics)) + return 1; + } + + return 0; +} + +static int ZSTD_eDist_matchComp(const void* p, const void* q) +{ + S32 const l = ((ZSTD_eDist_match*)p)->srcIdx; + S32 const r = ((ZSTD_eDist_match*)q)->srcIdx; + return (l - r); +} + +/* The matches from the approach above will all be of the form + * (dictIdx, srcIdx, 1). This method combines contiguous matches + * of length MINMATCH or greater. Matches less than MINMATCH + * are discarded */ +static void ZSTD_eDist_combineMatches(ZSTD_eDist_state* state) +{ + /* Create a new buffer to put the combined matches into + * and memcpy to state->matches after */ + ZSTD_eDist_match* combinedMatches = + ZSTD_malloc(state->nbMatches * sizeof(ZSTD_eDist_match), + ZSTD_defaultCMem); + + U32 nbCombinedMatches = 1; + size_t i; + + /* Make sure that the srcIdx and dictIdx are in sorted order. + * The combination step won't work otherwise */ + qsort(state->matches, state->nbMatches, sizeof(ZSTD_eDist_match), ZSTD_eDist_matchComp); + + memcpy(combinedMatches, state->matches, sizeof(ZSTD_eDist_match)); + for (i = 1; i < state->nbMatches; i++) { + ZSTD_eDist_match const match = state->matches[i]; + ZSTD_eDist_match const combinedMatch = + combinedMatches[nbCombinedMatches - 1]; + if (combinedMatch.srcIdx + combinedMatch.matchLength == match.srcIdx && + combinedMatch.dictIdx + combinedMatch.matchLength == match.dictIdx) { + combinedMatches[nbCombinedMatches - 1].matchLength++; + } else { + /* Discard matches that are less than MINMATCH */ + if (combinedMatches[nbCombinedMatches - 1].matchLength < MINMATCH) { + nbCombinedMatches--; + } + + memcpy(combinedMatches + nbCombinedMatches, + state->matches + i, sizeof(ZSTD_eDist_match)); + nbCombinedMatches++; + } + } + memcpy(state->matches, combinedMatches, nbCombinedMatches * sizeof(ZSTD_eDist_match)); + state->nbMatches = nbCombinedMatches; + ZSTD_free(combinedMatches, ZSTD_defaultCMem); +} + +static size_t ZSTD_eDist_convertMatchesToSequences(ZSTD_Sequence* sequences, + ZSTD_eDist_state* state) +{ + const ZSTD_eDist_match* matches = state->matches; + size_t const nbMatches = state->nbMatches; + size_t const dictSize = state->dictSize; + size_t nbSequences = 0; + size_t i; + for (i = 0; i < nbMatches; i++) { + ZSTD_eDist_match const match = matches[i]; + U32 const litLength = !i ? match.srcIdx : + match.srcIdx - (matches[i - 1].srcIdx + matches[i - 1].matchLength); + U32 const offset = (match.srcIdx + dictSize) - match.dictIdx; + U32 const matchLength = match.matchLength; + sequences[nbSequences].offset = offset; + sequences[nbSequences].litLength = litLength; + sequences[nbSequences].matchLength = matchLength; + nbSequences++; + } + return nbSequences; +} + +/*-************************************* +* Internal utils +***************************************/ + +static size_t ZSTD_eDist_hamingDist(const BYTE* const a, + const BYTE* const b, size_t n) +{ + size_t i; + size_t dist = 0; + for (i = 0; i < n; i++) + dist += a[i] != b[i]; + return dist; +} + +/* This is a pretty naive recursive implementation that should only + * be used for quick tests obviously. Don't try and run this on a + * GB file or something. There are faster implementations. Use those + * if you need to run it for large files. */ +static size_t ZSTD_eDist_levenshteinDist(const BYTE* const s, + size_t const sn, const BYTE* const t, + size_t const tn) +{ + size_t a, b, c; + + if (!sn) + return tn; + if (!tn) + return sn; + + if (s[sn - 1] == t[tn - 1]) + return ZSTD_eDist_levenshteinDist( + s, sn - 1, t, tn - 1); + + a = ZSTD_eDist_levenshteinDist(s, sn - 1, t, tn - 1); + b = ZSTD_eDist_levenshteinDist(s, sn, t, tn - 1); + c = ZSTD_eDist_levenshteinDist(s, sn - 1, t, tn); + + if (a > b) + a = b; + if (a > c) + a = c; + + return a + 1; +} + +static void ZSTD_eDist_validateMatches(ZSTD_eDist_match* matches, + size_t const nbMatches, const BYTE* const dict, + size_t const dictSize, const BYTE* const src, + size_t const srcSize) +{ + size_t i; + for (i = 0; i < nbMatches; i++) { + ZSTD_eDist_match match = matches[i]; + U32 const dictIdx = match.dictIdx; + U32 const srcIdx = match.srcIdx; + U32 const matchLength = match.matchLength; + + assert(dictIdx + matchLength < dictSize); + assert(srcIdx + matchLength < srcSize); + assert(!memcmp(dict + dictIdx, src + srcIdx, matchLength)); + } +} + +/*-************************************* +* API +***************************************/ + +size_t ZSTD_eDist_genSequences(ZSTD_Sequence* sequences, + const void* dict, size_t dictSize, + const void* src, size_t srcSize, + int useHeuristics) +{ + size_t const nbDiags = dictSize + srcSize + 3; + S32* buffer = ZSTD_malloc(nbDiags * 2 * sizeof(S32), ZSTD_defaultCMem); + ZSTD_eDist_state state; + size_t nbSequences = 0; + + state.dict = (const BYTE*)dict; + state.src = (const BYTE*)src; + state.dictSize = dictSize; + state.srcSize = srcSize; + state.forwardDiag = buffer; + state.backwardDiag = buffer + nbDiags; + state.forwardDiag += srcSize + 1; + state.backwardDiag += srcSize + 1; + state.matches = ZSTD_malloc(srcSize * sizeof(ZSTD_eDist_match), ZSTD_defaultCMem); + state.nbMatches = 0; + + ZSTD_eDist_compare(&state, 0, dictSize, 0, srcSize, 1); + ZSTD_eDist_combineMatches(&state); + nbSequences = ZSTD_eDist_convertMatchesToSequences(sequences, &state); + + ZSTD_free(buffer, ZSTD_defaultCMem); + ZSTD_free(state.matches, ZSTD_defaultCMem); + + return nbSequences; +} diff --git a/contrib/pzstd/Pzstd.cpp b/contrib/pzstd/Pzstd.cpp new file mode 100644 index 0000000..048a006 --- /dev/null +++ b/contrib/pzstd/Pzstd.cpp @@ -0,0 +1,626 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ +#include "platform.h" /* Large Files support, SET_BINARY_MODE */ +#include "Pzstd.h" +#include "SkippableFrame.h" +#include "utils/FileSystem.h" +#include "utils/Portability.h" +#include "utils/Range.h" +#include "utils/ScopeGuard.h" +#include "utils/ThreadPool.h" +#include "utils/WorkQueue.h" + +#include +#include +#include +#include +#include +#include +#include + + +namespace pzstd { + +namespace { +#ifdef _WIN32 +const std::string nullOutput = "nul"; +#else +const std::string nullOutput = "/dev/null"; +#endif +} + +using std::size_t; + +static std::uintmax_t fileSizeOrZero(const std::string &file) { + if (file == "-") { + return 0; + } + std::error_code ec; + auto size = file_size(file, ec); + if (ec) { + size = 0; + } + return size; +} + +static std::uint64_t handleOneInput(const Options &options, + const std::string &inputFile, + FILE* inputFd, + const std::string &outputFile, + FILE* outputFd, + SharedState& state) { + auto inputSize = fileSizeOrZero(inputFile); + // WorkQueue outlives ThreadPool so in the case of error we are certain + // we don't accidentally try to call push() on it after it is destroyed + WorkQueue> outs{options.numThreads + 1}; + std::uint64_t bytesRead; + std::uint64_t bytesWritten; + { + // Initialize the (de)compression thread pool with numThreads + ThreadPool executor(options.numThreads); + // Run the reader thread on an extra thread + ThreadPool readExecutor(1); + if (!options.decompress) { + // Add a job that reads the input and starts all the compression jobs + readExecutor.add( + [&state, &outs, &executor, inputFd, inputSize, &options, &bytesRead] { + bytesRead = asyncCompressChunks( + state, + outs, + executor, + inputFd, + inputSize, + options.numThreads, + options.determineParameters()); + }); + // Start writing + bytesWritten = writeFile(state, outs, outputFd, options.decompress); + } else { + // Add a job that reads the input and starts all the decompression jobs + readExecutor.add([&state, &outs, &executor, inputFd, &bytesRead] { + bytesRead = asyncDecompressFrames(state, outs, executor, inputFd); + }); + // Start writing + bytesWritten = writeFile(state, outs, outputFd, options.decompress); + } + } + if (!state.errorHolder.hasError()) { + std::string inputFileName = inputFile == "-" ? "stdin" : inputFile; + std::string outputFileName = outputFile == "-" ? "stdout" : outputFile; + if (!options.decompress) { + double ratio = static_cast(bytesWritten) / + static_cast(bytesRead + !bytesRead); + state.log(kLogInfo, "%-20s :%6.2f%% (%6" PRIu64 " => %6" PRIu64 + " bytes, %s)\n", + inputFileName.c_str(), ratio * 100, bytesRead, bytesWritten, + outputFileName.c_str()); + } else { + state.log(kLogInfo, "%-20s: %" PRIu64 " bytes \n", + inputFileName.c_str(),bytesWritten); + } + } + return bytesWritten; +} + +static FILE *openInputFile(const std::string &inputFile, + ErrorHolder &errorHolder) { + if (inputFile == "-") { + SET_BINARY_MODE(stdin); + return stdin; + } + // Check if input file is a directory + { + std::error_code ec; + if (is_directory(inputFile, ec)) { + errorHolder.setError("Output file is a directory -- ignored"); + return nullptr; + } + } + auto inputFd = std::fopen(inputFile.c_str(), "rb"); + if (!errorHolder.check(inputFd != nullptr, "Failed to open input file")) { + return nullptr; + } + return inputFd; +} + +static FILE *openOutputFile(const Options &options, + const std::string &outputFile, + SharedState& state) { + if (outputFile == "-") { + SET_BINARY_MODE(stdout); + return stdout; + } + // Check if the output file exists and then open it + if (!options.overwrite && outputFile != nullOutput) { + auto outputFd = std::fopen(outputFile.c_str(), "rb"); + if (outputFd != nullptr) { + std::fclose(outputFd); + if (!state.log.logsAt(kLogInfo)) { + state.errorHolder.setError("Output file exists"); + return nullptr; + } + state.log( + kLogInfo, + "pzstd: %s already exists; do you wish to overwrite (y/n) ? ", + outputFile.c_str()); + int c = getchar(); + if (c != 'y' && c != 'Y') { + state.errorHolder.setError("Not overwritten"); + return nullptr; + } + } + } + auto outputFd = std::fopen(outputFile.c_str(), "wb"); + if (!state.errorHolder.check( + outputFd != nullptr, "Failed to open output file")) { + return nullptr; + } + return outputFd; +} + +int pzstdMain(const Options &options) { + int returnCode = 0; + SharedState state(options); + for (const auto& input : options.inputFiles) { + // Setup the shared state + auto printErrorGuard = makeScopeGuard([&] { + if (state.errorHolder.hasError()) { + returnCode = 1; + state.log(kLogError, "pzstd: %s: %s.\n", input.c_str(), + state.errorHolder.getError().c_str()); + } + }); + // Open the input file + auto inputFd = openInputFile(input, state.errorHolder); + if (inputFd == nullptr) { + continue; + } + auto closeInputGuard = makeScopeGuard([&] { std::fclose(inputFd); }); + // Open the output file + auto outputFile = options.getOutputFile(input); + if (!state.errorHolder.check(outputFile != "", + "Input file does not have extension .zst")) { + continue; + } + auto outputFd = openOutputFile(options, outputFile, state); + if (outputFd == nullptr) { + continue; + } + auto closeOutputGuard = makeScopeGuard([&] { std::fclose(outputFd); }); + // (de)compress the file + handleOneInput(options, input, inputFd, outputFile, outputFd, state); + if (state.errorHolder.hasError()) { + continue; + } + // Delete the input file if necessary + if (!options.keepSource) { + // Be sure that we are done and have written everything before we delete + if (!state.errorHolder.check(std::fclose(inputFd) == 0, + "Failed to close input file")) { + continue; + } + closeInputGuard.dismiss(); + if (!state.errorHolder.check(std::fclose(outputFd) == 0, + "Failed to close output file")) { + continue; + } + closeOutputGuard.dismiss(); + if (std::remove(input.c_str()) != 0) { + state.errorHolder.setError("Failed to remove input file"); + continue; + } + } + } + // Returns 1 if any of the files failed to (de)compress. + return returnCode; +} + +/// Construct a `ZSTD_inBuffer` that points to the data in `buffer`. +static ZSTD_inBuffer makeZstdInBuffer(const Buffer& buffer) { + return ZSTD_inBuffer{buffer.data(), buffer.size(), 0}; +} + +/** + * Advance `buffer` and `inBuffer` by the amount of data read, as indicated by + * `inBuffer.pos`. + */ +void advance(Buffer& buffer, ZSTD_inBuffer& inBuffer) { + auto pos = inBuffer.pos; + inBuffer.src = static_cast(inBuffer.src) + pos; + inBuffer.size -= pos; + inBuffer.pos = 0; + return buffer.advance(pos); +} + +/// Construct a `ZSTD_outBuffer` that points to the data in `buffer`. +static ZSTD_outBuffer makeZstdOutBuffer(Buffer& buffer) { + return ZSTD_outBuffer{buffer.data(), buffer.size(), 0}; +} + +/** + * Split `buffer` and advance `outBuffer` by the amount of data written, as + * indicated by `outBuffer.pos`. + */ +Buffer split(Buffer& buffer, ZSTD_outBuffer& outBuffer) { + auto pos = outBuffer.pos; + outBuffer.dst = static_cast(outBuffer.dst) + pos; + outBuffer.size -= pos; + outBuffer.pos = 0; + return buffer.splitAt(pos); +} + +/** + * Stream chunks of input from `in`, compress it, and stream it out to `out`. + * + * @param state The shared state + * @param in Queue that we `pop()` input buffers from + * @param out Queue that we `push()` compressed output buffers to + * @param maxInputSize An upper bound on the size of the input + */ +static void compress( + SharedState& state, + std::shared_ptr in, + std::shared_ptr out, + size_t maxInputSize) { + auto& errorHolder = state.errorHolder; + auto guard = makeScopeGuard([&] { + in->finish(); + out->finish(); + }); + // Initialize the CCtx + auto ctx = state.cStreamPool->get(); + if (!errorHolder.check(ctx != nullptr, "Failed to allocate ZSTD_CStream")) { + return; + } + { + auto err = ZSTD_CCtx_reset(ctx.get(), ZSTD_reset_session_only); + if (!errorHolder.check(!ZSTD_isError(err), ZSTD_getErrorName(err))) { + return; + } + } + + // Allocate space for the result + auto outBuffer = Buffer(ZSTD_compressBound(maxInputSize)); + auto zstdOutBuffer = makeZstdOutBuffer(outBuffer); + { + Buffer inBuffer; + // Read a buffer in from the input queue + while (in->pop(inBuffer) && !errorHolder.hasError()) { + auto zstdInBuffer = makeZstdInBuffer(inBuffer); + // Compress the whole buffer and send it to the output queue + while (!inBuffer.empty() && !errorHolder.hasError()) { + if (!errorHolder.check( + !outBuffer.empty(), "ZSTD_compressBound() was too small")) { + return; + } + // Compress + auto err = + ZSTD_compressStream(ctx.get(), &zstdOutBuffer, &zstdInBuffer); + if (!errorHolder.check(!ZSTD_isError(err), ZSTD_getErrorName(err))) { + return; + } + // Split the compressed data off outBuffer and pass to the output queue + out->push(split(outBuffer, zstdOutBuffer)); + // Forget about the data we already compressed + advance(inBuffer, zstdInBuffer); + } + } + } + // Write the epilog + size_t bytesLeft; + do { + if (!errorHolder.check( + !outBuffer.empty(), "ZSTD_compressBound() was too small")) { + return; + } + bytesLeft = ZSTD_endStream(ctx.get(), &zstdOutBuffer); + if (!errorHolder.check( + !ZSTD_isError(bytesLeft), ZSTD_getErrorName(bytesLeft))) { + return; + } + out->push(split(outBuffer, zstdOutBuffer)); + } while (bytesLeft != 0 && !errorHolder.hasError()); +} + +/** + * Calculates how large each independently compressed frame should be. + * + * @param size The size of the source if known, 0 otherwise + * @param numThreads The number of threads available to run compression jobs on + * @param params The zstd parameters to be used for compression + */ +static size_t calculateStep( + std::uintmax_t size, + size_t numThreads, + const ZSTD_parameters ¶ms) { + (void)size; + (void)numThreads; + // Not validated to work correctly for window logs > 23. + // It will definitely fail if windowLog + 2 is >= 4GB because + // the skippable frame can only store sizes up to 4GB. + assert(params.cParams.windowLog <= 23); + return size_t{1} << (params.cParams.windowLog + 2); +} + +namespace { +enum class FileStatus { Continue, Done, Error }; +/// Determines the status of the file descriptor `fd`. +FileStatus fileStatus(FILE* fd) { + if (std::feof(fd)) { + return FileStatus::Done; + } else if (std::ferror(fd)) { + return FileStatus::Error; + } + return FileStatus::Continue; +} +} // anonymous namespace + +/** + * Reads `size` data in chunks of `chunkSize` and puts it into `queue`. + * Will read less if an error or EOF occurs. + * Returns the status of the file after all of the reads have occurred. + */ +static FileStatus +readData(BufferWorkQueue& queue, size_t chunkSize, size_t size, FILE* fd, + std::uint64_t *totalBytesRead) { + Buffer buffer(size); + while (!buffer.empty()) { + auto bytesRead = + std::fread(buffer.data(), 1, std::min(chunkSize, buffer.size()), fd); + *totalBytesRead += bytesRead; + queue.push(buffer.splitAt(bytesRead)); + auto status = fileStatus(fd); + if (status != FileStatus::Continue) { + return status; + } + } + return FileStatus::Continue; +} + +std::uint64_t asyncCompressChunks( + SharedState& state, + WorkQueue>& chunks, + ThreadPool& executor, + FILE* fd, + std::uintmax_t size, + size_t numThreads, + ZSTD_parameters params) { + auto chunksGuard = makeScopeGuard([&] { chunks.finish(); }); + std::uint64_t bytesRead = 0; + + // Break the input up into chunks of size `step` and compress each chunk + // independently. + size_t step = calculateStep(size, numThreads, params); + state.log(kLogDebug, "Chosen frame size: %zu\n", step); + auto status = FileStatus::Continue; + while (status == FileStatus::Continue && !state.errorHolder.hasError()) { + // Make a new input queue that we will put the chunk's input data into. + auto in = std::make_shared(); + auto inGuard = makeScopeGuard([&] { in->finish(); }); + // Make a new output queue that compress will put the compressed data into. + auto out = std::make_shared(); + // Start compression in the thread pool + executor.add([&state, in, out, step] { + return compress( + state, std::move(in), std::move(out), step); + }); + // Pass the output queue to the writer thread. + chunks.push(std::move(out)); + state.log(kLogVerbose, "%s\n", "Starting a new frame"); + // Fill the input queue for the compression job we just started + status = readData(*in, ZSTD_CStreamInSize(), step, fd, &bytesRead); + } + state.errorHolder.check(status != FileStatus::Error, "Error reading input"); + return bytesRead; +} + +/** + * Decompress a frame, whose data is streamed into `in`, and stream the output + * to `out`. + * + * @param state The shared state + * @param in Queue that we `pop()` input buffers from. It contains + * exactly one compressed frame. + * @param out Queue that we `push()` decompressed output buffers to + */ +static void decompress( + SharedState& state, + std::shared_ptr in, + std::shared_ptr out) { + auto& errorHolder = state.errorHolder; + auto guard = makeScopeGuard([&] { + in->finish(); + out->finish(); + }); + // Initialize the DCtx + auto ctx = state.dStreamPool->get(); + if (!errorHolder.check(ctx != nullptr, "Failed to allocate ZSTD_DStream")) { + return; + } + { + auto err = ZSTD_DCtx_reset(ctx.get(), ZSTD_reset_session_only); + if (!errorHolder.check(!ZSTD_isError(err), ZSTD_getErrorName(err))) { + return; + } + } + + const size_t outSize = ZSTD_DStreamOutSize(); + Buffer inBuffer; + size_t returnCode = 0; + // Read a buffer in from the input queue + while (in->pop(inBuffer) && !errorHolder.hasError()) { + auto zstdInBuffer = makeZstdInBuffer(inBuffer); + // Decompress the whole buffer and send it to the output queue + while (!inBuffer.empty() && !errorHolder.hasError()) { + // Allocate a buffer with at least outSize bytes. + Buffer outBuffer(outSize); + auto zstdOutBuffer = makeZstdOutBuffer(outBuffer); + // Decompress + returnCode = + ZSTD_decompressStream(ctx.get(), &zstdOutBuffer, &zstdInBuffer); + if (!errorHolder.check( + !ZSTD_isError(returnCode), ZSTD_getErrorName(returnCode))) { + return; + } + // Pass the buffer with the decompressed data to the output queue + out->push(split(outBuffer, zstdOutBuffer)); + // Advance past the input we already read + advance(inBuffer, zstdInBuffer); + if (returnCode == 0) { + // The frame is over, prepare to (maybe) start a new frame + ZSTD_initDStream(ctx.get()); + } + } + } + if (!errorHolder.check(returnCode <= 1, "Incomplete block")) { + return; + } + // We've given ZSTD_decompressStream all of our data, but there may still + // be data to read. + while (returnCode == 1) { + // Allocate a buffer with at least outSize bytes. + Buffer outBuffer(outSize); + auto zstdOutBuffer = makeZstdOutBuffer(outBuffer); + // Pass in no input. + ZSTD_inBuffer zstdInBuffer{nullptr, 0, 0}; + // Decompress + returnCode = + ZSTD_decompressStream(ctx.get(), &zstdOutBuffer, &zstdInBuffer); + if (!errorHolder.check( + !ZSTD_isError(returnCode), ZSTD_getErrorName(returnCode))) { + return; + } + // Pass the buffer with the decompressed data to the output queue + out->push(split(outBuffer, zstdOutBuffer)); + } +} + +std::uint64_t asyncDecompressFrames( + SharedState& state, + WorkQueue>& frames, + ThreadPool& executor, + FILE* fd) { + auto framesGuard = makeScopeGuard([&] { frames.finish(); }); + std::uint64_t totalBytesRead = 0; + + // Split the source up into its component frames. + // If we find our recognized skippable frame we know the next frames size + // which means that we can decompress each standard frame in independently. + // Otherwise, we will decompress using only one decompression task. + const size_t chunkSize = ZSTD_DStreamInSize(); + auto status = FileStatus::Continue; + while (status == FileStatus::Continue && !state.errorHolder.hasError()) { + // Make a new input queue that we will put the frames's bytes into. + auto in = std::make_shared(); + auto inGuard = makeScopeGuard([&] { in->finish(); }); + // Make a output queue that decompress will put the decompressed data into + auto out = std::make_shared(); + + size_t frameSize; + { + // Calculate the size of the next frame. + // frameSize is 0 if the frame info can't be decoded. + Buffer buffer(SkippableFrame::kSize); + auto bytesRead = std::fread(buffer.data(), 1, buffer.size(), fd); + totalBytesRead += bytesRead; + status = fileStatus(fd); + if (bytesRead == 0 && status != FileStatus::Continue) { + break; + } + buffer.subtract(buffer.size() - bytesRead); + frameSize = SkippableFrame::tryRead(buffer.range()); + in->push(std::move(buffer)); + } + if (frameSize == 0) { + // We hit a non SkippableFrame, so this will be the last job. + // Make sure that we don't use too much memory + in->setMaxSize(64); + out->setMaxSize(64); + } + // Start decompression in the thread pool + executor.add([&state, in, out] { + return decompress(state, std::move(in), std::move(out)); + }); + // Pass the output queue to the writer thread + frames.push(std::move(out)); + if (frameSize == 0) { + // We hit a non SkippableFrame ==> not compressed by pzstd or corrupted + // Pass the rest of the source to this decompression task + state.log(kLogVerbose, "%s\n", + "Input not in pzstd format, falling back to serial decompression"); + while (status == FileStatus::Continue && !state.errorHolder.hasError()) { + status = readData(*in, chunkSize, chunkSize, fd, &totalBytesRead); + } + break; + } + state.log(kLogVerbose, "Decompressing a frame of size %zu", frameSize); + // Fill the input queue for the decompression job we just started + status = readData(*in, chunkSize, frameSize, fd, &totalBytesRead); + } + state.errorHolder.check(status != FileStatus::Error, "Error reading input"); + return totalBytesRead; +} + +/// Write `data` to `fd`, returns true iff success. +static bool writeData(ByteRange data, FILE* fd) { + while (!data.empty()) { + data.advance(std::fwrite(data.begin(), 1, data.size(), fd)); + if (std::ferror(fd)) { + return false; + } + } + return true; +} + +std::uint64_t writeFile( + SharedState& state, + WorkQueue>& outs, + FILE* outputFd, + bool decompress) { + auto& errorHolder = state.errorHolder; + auto outsFinishGuard = makeScopeGuard([&outs] { outs.finish(); }); + auto lineClearGuard = makeScopeGuard([&state] { + state.log.clear(kLogInfo); + }); + std::uint64_t bytesWritten = 0; + std::shared_ptr out; + // Grab the output queue for each decompression job (in order). + while (outs.pop(out)) { + auto outFinishGuard = makeScopeGuard([&out] { out->finish(); }); + if (errorHolder.hasError()) { + continue; + } + if (!decompress) { + // If we are compressing and want to write skippable frames we can't + // start writing before compression is done because we need to know the + // compressed size. + // Wait for the compressed size to be available and write skippable frame + assert(uint64_t(out->size()) < uint64_t(1) << 32); + SkippableFrame frame(uint32_t(out->size())); + if (!writeData(frame.data(), outputFd)) { + errorHolder.setError("Failed to write output"); + return bytesWritten; + } + bytesWritten += frame.kSize; + } + // For each chunk of the frame: Pop it from the queue and write it + Buffer buffer; + while (out->pop(buffer) && !errorHolder.hasError()) { + if (!writeData(buffer.range(), outputFd)) { + errorHolder.setError("Failed to write output"); + return bytesWritten; + } + bytesWritten += buffer.size(); + state.log.update(kLogInfo, "Written: %u MB ", + static_cast(bytesWritten >> 20)); + } + } + return bytesWritten; +} +} diff --git a/contrib/seekable_format/zstdseek_decompress.c b/contrib/seekable_format/zstdseek_decompress.c new file mode 100644 index 0000000..ab9088a --- /dev/null +++ b/contrib/seekable_format/zstdseek_decompress.c @@ -0,0 +1,600 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* ********************************************************* +* Turn on Large Files support (>4GB) for 32-bit Linux/Unix +***********************************************************/ +#if !defined(__64BIT__) || defined(__MINGW32__) /* No point defining Large file for 64 bit but MinGW-w64 requires it */ +# if !defined(_FILE_OFFSET_BITS) +# define _FILE_OFFSET_BITS 64 /* turn off_t into a 64-bit type for ftello, fseeko */ +# endif +# if !defined(_LARGEFILE_SOURCE) /* obsolete macro, replaced with _FILE_OFFSET_BITS */ +# define _LARGEFILE_SOURCE 1 /* Large File Support extension (LFS) - fseeko, ftello */ +# endif +# if defined(_AIX) || defined(__hpux) +# define _LARGE_FILES /* Large file support on 32-bits AIX and HP-UX */ +# endif +#endif + +/* ************************************************************ +* Detect POSIX version +* PLATFORM_POSIX_VERSION = 0 for non-Unix e.g. Windows +* PLATFORM_POSIX_VERSION = 1 for Unix-like but non-POSIX +* PLATFORM_POSIX_VERSION > 1 is equal to found _POSIX_VERSION +* Value of PLATFORM_POSIX_VERSION can be forced on command line +***************************************************************/ +#ifndef PLATFORM_POSIX_VERSION + +# if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \ + || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) /* BSD distros */ + /* exception rule : force posix version to 200112L, + * note: it's better to use unistd.h's _POSIX_VERSION whenever possible */ +# define PLATFORM_POSIX_VERSION 200112L + +/* try to determine posix version through official unistd.h's _POSIX_VERSION (https://pubs.opengroup.org/onlinepubs/7908799/xsh/unistd.h.html). + * note : there is no simple way to know in advance if is present or not on target system, + * Posix specification mandates its presence and its content, but target system must respect this spec. + * It's necessary to _not_ #include whenever target OS is not unix-like + * otherwise it will block preprocessing stage. + * The following list of build macros tries to "guess" if target OS is likely unix-like, and therefore can #include + */ +# elif !defined(_WIN32) \ + && ( defined(__unix__) || defined(__unix) \ + || defined(__midipix__) || defined(__VMS) || defined(__HAIKU__) ) + +# if defined(__linux__) || defined(__linux) || defined(__CYGWIN__) +# ifndef _POSIX_C_SOURCE +# define _POSIX_C_SOURCE 200809L /* feature test macro : https://www.gnu.org/software/libc/manual/html_node/Feature-Test-Macros.html */ +# endif +# endif +# include /* declares _POSIX_VERSION */ +# if defined(_POSIX_VERSION) /* POSIX compliant */ +# define PLATFORM_POSIX_VERSION _POSIX_VERSION +# else +# define PLATFORM_POSIX_VERSION 1 +# endif + +# ifdef __UCLIBC__ +# ifndef __USE_MISC +# define __USE_MISC /* enable st_mtim on uclibc */ +# endif +# endif + +# else /* non-unix target platform (like Windows) */ +# define PLATFORM_POSIX_VERSION 0 +# endif + +#endif /* PLATFORM_POSIX_VERSION */ + + +/* ************************************************************ +* Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW +***************************************************************/ +#if defined(LIBC_NO_FSEEKO) +/* Some older libc implementations don't include these functions (e.g. Bionic < 24) */ +# define LONG_SEEK fseek +#elif defined(_MSC_VER) && _MSC_VER >= 1400 +# define LONG_SEEK _fseeki64 +#elif !defined(__64BIT__) && (PLATFORM_POSIX_VERSION >= 200112L) /* No point defining Large file for 64 bit */ +# define LONG_SEEK fseeko +#elif defined(__MINGW32__) && !defined(__STRICT_ANSI__) && !defined(__NO_MINGW_LFS) && defined(__MSVCRT__) +# define LONG_SEEK fseeko64 +#elif defined(_WIN32) && !defined(__DJGPP__) +# include + static int LONG_SEEK(FILE* file, __int64 offset, int origin) { + LARGE_INTEGER off; + DWORD method; + off.QuadPart = offset; + if (origin == SEEK_END) + method = FILE_END; + else if (origin == SEEK_CUR) + method = FILE_CURRENT; + else + method = FILE_BEGIN; + + if (SetFilePointerEx((HANDLE) _get_osfhandle(_fileno(file)), off, NULL, method)) + return 0; + else + return -1; + } +#else +# define LONG_SEEK fseek +#endif + +#include /* malloc, free */ +#include /* FILE* */ +#include /* UNIT_MAX */ +#include + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash.h" + +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" +#include "zstd_errors.h" +#include "mem.h" +#include "zstd_seekable.h" + +#undef ERROR +#define ERROR(name) ((size_t)-ZSTD_error_##name) + +#define CHECK_IO(f) { int const errcod = (f); if (errcod < 0) return ERROR(seekableIO); } + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define ZSTD_SEEKABLE_NO_OUTPUT_PROGRESS_MAX 16 + +/* Special-case callbacks for FILE* and in-memory modes, so that we can treat + * them the same way as the advanced API */ +static int ZSTD_seekable_read_FILE(void* opaque, void* buffer, size_t n) +{ + size_t const result = fread(buffer, 1, n, (FILE*)opaque); + if (result != n) { + return -1; + } + return 0; +} + +static int ZSTD_seekable_seek_FILE(void* opaque, long long offset, int origin) +{ + int const ret = LONG_SEEK((FILE*)opaque, offset, origin); + if (ret) return ret; + return fflush((FILE*)opaque); +} + +typedef struct { + const void *ptr; + size_t size; + size_t pos; +} buffWrapper_t; + +static int ZSTD_seekable_read_buff(void* opaque, void* buffer, size_t n) +{ + buffWrapper_t* const buff = (buffWrapper_t*)opaque; + assert(buff != NULL); + if (buff->pos + n > buff->size) return -1; + memcpy(buffer, (const BYTE*)buff->ptr + buff->pos, n); + buff->pos += n; + return 0; +} + +static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin) +{ + buffWrapper_t* const buff = (buffWrapper_t*) opaque; + unsigned long long newOffset; + assert(buff != NULL); + switch (origin) { + case SEEK_SET: + assert(offset >= 0); + newOffset = (unsigned long long)offset; + break; + case SEEK_CUR: + newOffset = (unsigned long long)((long long)buff->pos + offset); + break; + case SEEK_END: + newOffset = (unsigned long long)((long long)buff->size + offset); + break; + default: + assert(0); /* not possible */ + } + if (newOffset > buff->size) { + return -1; + } + buff->pos = newOffset; + return 0; +} + +typedef struct { + U64 cOffset; + U64 dOffset; + U32 checksum; +} seekEntry_t; + +struct ZSTD_seekTable_s { + seekEntry_t* entries; + size_t tableLen; + + int checksumFlag; +}; + +#define SEEKABLE_BUFF_SIZE ZSTD_BLOCKSIZE_MAX + +struct ZSTD_seekable_s { + ZSTD_DStream* dstream; + ZSTD_seekTable seekTable; + ZSTD_seekable_customFile src; + + U64 decompressedOffset; + U32 curFrame; + + BYTE inBuff[SEEKABLE_BUFF_SIZE]; /* need to do our own input buffering */ + BYTE outBuff[SEEKABLE_BUFF_SIZE]; /* so we can efficiently decompress the + starts of chunks before we get to the + desired section */ + ZSTD_inBuffer in; /* maintain continuity across ZSTD_seekable_decompress operations */ + buffWrapper_t buffWrapper; /* for `src.opaque` in in-memory mode */ + + XXH64_state_t xxhState; +}; + +ZSTD_seekable* ZSTD_seekable_create(void) +{ + ZSTD_seekable* const zs = (ZSTD_seekable*)malloc(sizeof(ZSTD_seekable)); + if (zs == NULL) return NULL; + + /* also initializes stage to zsds_init */ + memset(zs, 0, sizeof(*zs)); + + zs->dstream = ZSTD_createDStream(); + if (zs->dstream == NULL) { + free(zs); + return NULL; + } + + return zs; +} + +size_t ZSTD_seekable_free(ZSTD_seekable* zs) +{ + if (zs == NULL) return 0; /* support free on null */ + ZSTD_freeDStream(zs->dstream); + free(zs->seekTable.entries); + free(zs); + return 0; +} + +ZSTD_seekTable* ZSTD_seekTable_create_fromSeekable(const ZSTD_seekable* zs) +{ + assert(zs != NULL); + if (zs->seekTable.entries == NULL) return NULL; + ZSTD_seekTable* const st = (ZSTD_seekTable*)malloc(sizeof(ZSTD_seekTable)); + if (st==NULL) return NULL; + + st->checksumFlag = zs->seekTable.checksumFlag; + st->tableLen = zs->seekTable.tableLen; + + /* Allocate an extra entry at the end to match logic of initial allocation */ + size_t const entriesSize = sizeof(seekEntry_t) * (zs->seekTable.tableLen + 1); + seekEntry_t* const entries = (seekEntry_t*)malloc(entriesSize); + if (entries==NULL) { + free(st); + return NULL; + } + + memcpy(entries, zs->seekTable.entries, entriesSize); + st->entries = entries; + return st; +} + +size_t ZSTD_seekTable_free(ZSTD_seekTable* st) +{ + if (st == NULL) return 0; /* support free on null */ + free(st->entries); + free(st); + return 0; +} + +/** ZSTD_seekable_offsetToFrameIndex() : + * Performs a binary search to find the last frame with a decompressed offset + * <= pos + * @return : the frame's index */ +unsigned ZSTD_seekable_offsetToFrameIndex(const ZSTD_seekable* zs, unsigned long long pos) +{ + return ZSTD_seekTable_offsetToFrameIndex(&zs->seekTable, pos); +} + +unsigned ZSTD_seekTable_offsetToFrameIndex(const ZSTD_seekTable* st, unsigned long long pos) +{ + U32 lo = 0; + U32 hi = (U32)st->tableLen; + assert(st->tableLen <= UINT_MAX); + + if (pos >= st->entries[st->tableLen].dOffset) { + return (unsigned)st->tableLen; + } + + while (lo + 1 < hi) { + U32 const mid = lo + ((hi - lo) >> 1); + if (st->entries[mid].dOffset <= pos) { + lo = mid; + } else { + hi = mid; + } + } + return lo; +} + +unsigned ZSTD_seekable_getNumFrames(const ZSTD_seekable* zs) +{ + return ZSTD_seekTable_getNumFrames(&zs->seekTable); +} + +unsigned ZSTD_seekTable_getNumFrames(const ZSTD_seekTable* st) +{ + assert(st->tableLen <= UINT_MAX); + return (unsigned)st->tableLen; +} + +unsigned long long ZSTD_seekable_getFrameCompressedOffset(const ZSTD_seekable* zs, unsigned frameIndex) +{ + return ZSTD_seekTable_getFrameCompressedOffset(&zs->seekTable, frameIndex); +} + +unsigned long long ZSTD_seekTable_getFrameCompressedOffset(const ZSTD_seekTable* st, unsigned frameIndex) +{ + if (frameIndex >= st->tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE; + return st->entries[frameIndex].cOffset; +} + +unsigned long long ZSTD_seekable_getFrameDecompressedOffset(const ZSTD_seekable* zs, unsigned frameIndex) +{ + return ZSTD_seekTable_getFrameDecompressedOffset(&zs->seekTable, frameIndex); +} + +unsigned long long ZSTD_seekTable_getFrameDecompressedOffset(const ZSTD_seekTable* st, unsigned frameIndex) +{ + if (frameIndex >= st->tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE; + return st->entries[frameIndex].dOffset; +} + +size_t ZSTD_seekable_getFrameCompressedSize(const ZSTD_seekable* zs, unsigned frameIndex) +{ + return ZSTD_seekTable_getFrameCompressedSize(&zs->seekTable, frameIndex); +} + +size_t ZSTD_seekTable_getFrameCompressedSize(const ZSTD_seekTable* st, unsigned frameIndex) +{ + if (frameIndex >= st->tableLen) return ERROR(frameIndex_tooLarge); + return st->entries[frameIndex + 1].cOffset - + st->entries[frameIndex].cOffset; +} + +size_t ZSTD_seekable_getFrameDecompressedSize(const ZSTD_seekable* zs, unsigned frameIndex) +{ + return ZSTD_seekTable_getFrameDecompressedSize(&zs->seekTable, frameIndex); +} + +size_t ZSTD_seekTable_getFrameDecompressedSize(const ZSTD_seekTable* st, unsigned frameIndex) +{ + if (frameIndex > st->tableLen) return ERROR(frameIndex_tooLarge); + return st->entries[frameIndex + 1].dOffset - + st->entries[frameIndex].dOffset; +} + +static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs) +{ + int checksumFlag; + ZSTD_seekable_customFile src = zs->src; + /* read the footer, fixed size */ + CHECK_IO(src.seek(src.opaque, -(int)ZSTD_seekTableFooterSize, SEEK_END)); + CHECK_IO(src.read(src.opaque, zs->inBuff, ZSTD_seekTableFooterSize)); + + if (MEM_readLE32(zs->inBuff + 5) != ZSTD_SEEKABLE_MAGICNUMBER) { + return ERROR(prefix_unknown); + } + + { BYTE const sfd = zs->inBuff[4]; + checksumFlag = sfd >> 7; + + /* check reserved bits */ + if ((sfd >> 2) & 0x1f) { + return ERROR(corruption_detected); + } } + + { U32 const numFrames = MEM_readLE32(zs->inBuff); + U32 const sizePerEntry = 8 + (checksumFlag?4:0); + U32 const tableSize = sizePerEntry * numFrames; + U32 const frameSize = tableSize + ZSTD_seekTableFooterSize + ZSTD_SKIPPABLEHEADERSIZE; + + U32 remaining = frameSize - ZSTD_seekTableFooterSize; /* don't need to re-read footer */ + { U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE); + CHECK_IO(src.seek(src.opaque, -(S64)frameSize, SEEK_END)); + CHECK_IO(src.read(src.opaque, zs->inBuff, toRead)); + remaining -= toRead; + } + + if (MEM_readLE32(zs->inBuff) != (ZSTD_MAGIC_SKIPPABLE_START | 0xE)) { + return ERROR(prefix_unknown); + } + if (MEM_readLE32(zs->inBuff+4) + ZSTD_SKIPPABLEHEADERSIZE != frameSize) { + return ERROR(prefix_unknown); + } + + { /* Allocate an extra entry at the end so that we can do size + * computations on the last element without special case */ + seekEntry_t* const entries = (seekEntry_t*)malloc(sizeof(seekEntry_t) * (numFrames + 1)); + + U32 idx = 0; + U32 pos = 8; + + U64 cOffset = 0; + U64 dOffset = 0; + + if (entries == NULL) return ERROR(memory_allocation); + + /* compute cumulative positions */ + for (; idx < numFrames; idx++) { + if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) { + U32 const offset = SEEKABLE_BUFF_SIZE - pos; + U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE - offset); + memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */ + CHECK_IO(src.read(src.opaque, zs->inBuff+offset, toRead)); + remaining -= toRead; + pos = 0; + } + entries[idx].cOffset = cOffset; + entries[idx].dOffset = dOffset; + + cOffset += MEM_readLE32(zs->inBuff + pos); + pos += 4; + dOffset += MEM_readLE32(zs->inBuff + pos); + pos += 4; + if (checksumFlag) { + entries[idx].checksum = MEM_readLE32(zs->inBuff + pos); + pos += 4; + } + } + entries[numFrames].cOffset = cOffset; + entries[numFrames].dOffset = dOffset; + + zs->seekTable.entries = entries; + zs->seekTable.tableLen = numFrames; + zs->seekTable.checksumFlag = checksumFlag; + return 0; + } + } +} + +size_t ZSTD_seekable_initBuff(ZSTD_seekable* zs, const void* src, size_t srcSize) +{ + zs->buffWrapper = (buffWrapper_t){src, srcSize, 0}; + { ZSTD_seekable_customFile srcFile = {&zs->buffWrapper, + &ZSTD_seekable_read_buff, + &ZSTD_seekable_seek_buff}; + return ZSTD_seekable_initAdvanced(zs, srcFile); } +} + +size_t ZSTD_seekable_initFile(ZSTD_seekable* zs, FILE* src) +{ + ZSTD_seekable_customFile srcFile = {src, &ZSTD_seekable_read_FILE, + &ZSTD_seekable_seek_FILE}; + return ZSTD_seekable_initAdvanced(zs, srcFile); +} + +size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile src) +{ + zs->src = src; + + { const size_t seekTableInit = ZSTD_seekable_loadSeekTable(zs); + if (ZSTD_isError(seekTableInit)) return seekTableInit; } + + zs->decompressedOffset = (U64)-1; + zs->curFrame = (U32)-1; + + { const size_t dstreamInit = ZSTD_initDStream(zs->dstream); + if (ZSTD_isError(dstreamInit)) return dstreamInit; } + return 0; +} + +size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t len, unsigned long long offset) +{ + unsigned long long const eos = zs->seekTable.entries[zs->seekTable.tableLen].dOffset; + if (offset + len > eos) { + len = eos - offset; + } + + U32 targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, offset); + U32 noOutputProgressCount = 0; + size_t srcBytesRead = 0; + do { + /* check if we can continue from a previous decompress job */ + if (targetFrame != zs->curFrame || offset < zs->decompressedOffset) { + zs->decompressedOffset = zs->seekTable.entries[targetFrame].dOffset; + zs->curFrame = targetFrame; + + assert(zs->seekTable.entries[targetFrame].cOffset < LLONG_MAX); + CHECK_IO(zs->src.seek(zs->src.opaque, + (long long)zs->seekTable.entries[targetFrame].cOffset, + SEEK_SET)); + zs->in = (ZSTD_inBuffer){zs->inBuff, 0, 0}; + XXH64_reset(&zs->xxhState, 0); + ZSTD_DCtx_reset(zs->dstream, ZSTD_reset_session_only); + if (zs->buffWrapper.size && srcBytesRead > zs->buffWrapper.size) { + return ERROR(seekableIO); + } + } + + while (zs->decompressedOffset < offset + len) { + size_t toRead; + ZSTD_outBuffer outTmp; + size_t prevOutPos; + size_t prevInPos; + size_t forwardProgress; + if (zs->decompressedOffset < offset) { + /* dummy decompressions until we get to the target offset */ + outTmp = (ZSTD_outBuffer){zs->outBuff, (size_t) (MIN(SEEKABLE_BUFF_SIZE, offset - zs->decompressedOffset)), 0}; + } else { + outTmp = (ZSTD_outBuffer){dst, len, (size_t) (zs->decompressedOffset - offset)}; + } + + prevOutPos = outTmp.pos; + prevInPos = zs->in.pos; + toRead = ZSTD_decompressStream(zs->dstream, &outTmp, &zs->in); + if (ZSTD_isError(toRead)) { + return toRead; + } + + if (zs->seekTable.checksumFlag) { + XXH64_update(&zs->xxhState, (BYTE*)outTmp.dst + prevOutPos, + outTmp.pos - prevOutPos); + } + forwardProgress = outTmp.pos - prevOutPos; + if (forwardProgress == 0) { + if (noOutputProgressCount++ > ZSTD_SEEKABLE_NO_OUTPUT_PROGRESS_MAX) { + return ERROR(seekableIO); + } + } else { + noOutputProgressCount = 0; + } + zs->decompressedOffset += forwardProgress; + srcBytesRead += zs->in.pos - prevInPos; + + if (toRead == 0) { + /* frame complete */ + + /* verify checksum */ + if (zs->seekTable.checksumFlag && + (XXH64_digest(&zs->xxhState) & 0xFFFFFFFFU) != + zs->seekTable.entries[targetFrame].checksum) { + return ERROR(corruption_detected); + } + + if (zs->decompressedOffset < offset + len) { + /* go back to the start and force a reset of the stream */ + targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, zs->decompressedOffset); + /* in this case it will fail later with corruption_detected, since last block does not have checksum */ + assert(targetFrame != zs->seekTable.tableLen); + } + break; + } + + /* read in more data if we're done with this buffer */ + if (zs->in.pos == zs->in.size) { + toRead = MIN(toRead, SEEKABLE_BUFF_SIZE); + CHECK_IO(zs->src.read(zs->src.opaque, zs->inBuff, toRead)); + zs->in.size = toRead; + zs->in.pos = 0; + } + } /* while (zs->decompressedOffset < offset + len) */ + } while (zs->decompressedOffset != offset + len); + + return len; +} + +size_t ZSTD_seekable_decompressFrame(ZSTD_seekable* zs, void* dst, size_t dstSize, unsigned frameIndex) +{ + if (frameIndex >= zs->seekTable.tableLen) { + return ERROR(frameIndex_tooLarge); + } + + { size_t const decompressedSize = + zs->seekTable.entries[frameIndex + 1].dOffset - + zs->seekTable.entries[frameIndex].dOffset; + if (dstSize < decompressedSize) { + return ERROR(dstSize_tooSmall); + } + return ZSTD_seekable_decompress( + zs, dst, decompressedSize, + zs->seekTable.entries[frameIndex].dOffset); + } +} diff --git a/doc/educational_decoder/zstd_decompress.c b/doc/educational_decoder/zstd_decompress.c new file mode 100644 index 0000000..839e085 --- /dev/null +++ b/doc/educational_decoder/zstd_decompress.c @@ -0,0 +1,2323 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/// Zstandard educational decoder implementation +/// See https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md + +#include // uint8_t, etc. +#include // malloc, free, exit +#include // fprintf +#include // memset, memcpy +#include "zstd_decompress.h" + + +/******* IMPORTANT CONSTANTS *********************************************/ + +// Zstandard frame +// "Magic_Number +// 4 Bytes, little-endian format. Value : 0xFD2FB528" +#define ZSTD_MAGIC_NUMBER 0xFD2FB528U + +// The size of `Block_Content` is limited by `Block_Maximum_Size`, +#define ZSTD_BLOCK_SIZE_MAX ((size_t)128 * 1024) + +// literal blocks can't be larger than their block +#define MAX_LITERALS_SIZE ZSTD_BLOCK_SIZE_MAX + + +/******* UTILITY MACROS AND TYPES *********************************************/ +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#if defined(ZDEC_NO_MESSAGE) +#define MESSAGE(...) +#else +#define MESSAGE(...) fprintf(stderr, "" __VA_ARGS__) +#endif + +/// This decoder calls exit(1) when it encounters an error, however a production +/// library should propagate error codes +#define ERROR(s) \ + do { \ + MESSAGE("Error: %s\n", s); \ + exit(1); \ + } while (0) +#define INP_SIZE() \ + ERROR("Input buffer smaller than it should be or input is " \ + "corrupted") +#define OUT_SIZE() ERROR("Output buffer too small for output") +#define CORRUPTION() ERROR("Corruption detected while decompressing") +#define BAD_ALLOC() ERROR("Memory allocation error") +#define IMPOSSIBLE() ERROR("An impossibility has occurred") + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t i8; +typedef int16_t i16; +typedef int32_t i32; +typedef int64_t i64; +/******* END UTILITY MACROS AND TYPES *****************************************/ + +/******* IMPLEMENTATION PRIMITIVE PROTOTYPES **********************************/ +/// The implementations for these functions can be found at the bottom of this +/// file. They implement low-level functionality needed for the higher level +/// decompression functions. + +/*** IO STREAM OPERATIONS *************/ + +/// ostream_t/istream_t are used to wrap the pointers/length data passed into +/// ZSTD_decompress, so that all IO operations are safely bounds checked +/// They are written/read forward, and reads are treated as little-endian +/// They should be used opaquely to ensure safety +typedef struct { + u8 *ptr; + size_t len; +} ostream_t; + +typedef struct { + const u8 *ptr; + size_t len; + + // Input often reads a few bits at a time, so maintain an internal offset + int bit_offset; +} istream_t; + +/// The following two functions are the only ones that allow the istream to be +/// non-byte aligned + +/// Reads `num` bits from a bitstream, and updates the internal offset +static inline u64 IO_read_bits(istream_t *const in, const int num_bits); +/// Backs-up the stream by `num` bits so they can be read again +static inline void IO_rewind_bits(istream_t *const in, const int num_bits); +/// If the remaining bits in a byte will be unused, advance to the end of the +/// byte +static inline void IO_align_stream(istream_t *const in); + +/// Write the given byte into the output stream +static inline void IO_write_byte(ostream_t *const out, u8 symb); + +/// Returns the number of bytes left to be read in this stream. The stream must +/// be byte aligned. +static inline size_t IO_istream_len(const istream_t *const in); + +/// Advances the stream by `len` bytes, and returns a pointer to the chunk that +/// was skipped. The stream must be byte aligned. +static inline const u8 *IO_get_read_ptr(istream_t *const in, size_t len); +/// Advances the stream by `len` bytes, and returns a pointer to the chunk that +/// was skipped so it can be written to. +static inline u8 *IO_get_write_ptr(ostream_t *const out, size_t len); + +/// Advance the inner state by `len` bytes. The stream must be byte aligned. +static inline void IO_advance_input(istream_t *const in, size_t len); + +/// Returns an `ostream_t` constructed from the given pointer and length. +static inline ostream_t IO_make_ostream(u8 *out, size_t len); +/// Returns an `istream_t` constructed from the given pointer and length. +static inline istream_t IO_make_istream(const u8 *in, size_t len); + +/// Returns an `istream_t` with the same base as `in`, and length `len`. +/// Then, advance `in` to account for the consumed bytes. +/// `in` must be byte aligned. +static inline istream_t IO_make_sub_istream(istream_t *const in, size_t len); +/*** END IO STREAM OPERATIONS *********/ + +/*** BITSTREAM OPERATIONS *************/ +/// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits, +/// and return them interpreted as a little-endian unsigned integer. +static inline u64 read_bits_LE(const u8 *src, const int num_bits, + const size_t offset); + +/// Read bits from the end of a HUF or FSE bitstream. `offset` is in bits, so +/// it updates `offset` to `offset - bits`, and then reads `bits` bits from +/// `src + offset`. If the offset becomes negative, the extra bits at the +/// bottom are filled in with `0` bits instead of reading from before `src`. +static inline u64 STREAM_read_bits(const u8 *src, const int bits, + i64 *const offset); +/*** END BITSTREAM OPERATIONS *********/ + +/*** BIT COUNTING OPERATIONS **********/ +/// Returns the index of the highest set bit in `num`, or `-1` if `num == 0` +static inline int highest_set_bit(const u64 num); +/*** END BIT COUNTING OPERATIONS ******/ + +/*** HUFFMAN PRIMITIVES ***************/ +// Table decode method uses exponential memory, so we need to limit depth +#define HUF_MAX_BITS (16) + +// Limit the maximum number of symbols to 256 so we can store a symbol in a byte +#define HUF_MAX_SYMBS (256) + +/// Structure containing all tables necessary for efficient Huffman decoding +typedef struct { + u8 *symbols; + u8 *num_bits; + int max_bits; +} HUF_dtable; + +/// Decode a single symbol and read in enough bits to refresh the state +static inline u8 HUF_decode_symbol(const HUF_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset); +/// Read in a full state's worth of bits to initialize it +static inline void HUF_init_state(const HUF_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset); + +/// Decompresses a single Huffman stream, returns the number of bytes decoded. +/// `src_len` must be the exact length of the Huffman-coded block. +static size_t HUF_decompress_1stream(const HUF_dtable *const dtable, + ostream_t *const out, istream_t *const in); +/// Same as previous but decodes 4 streams, formatted as in the Zstandard +/// specification. +/// `src_len` must be the exact length of the Huffman-coded block. +static size_t HUF_decompress_4stream(const HUF_dtable *const dtable, + ostream_t *const out, istream_t *const in); + +/// Initialize a Huffman decoding table using the table of bit counts provided +static void HUF_init_dtable(HUF_dtable *const table, const u8 *const bits, + const int num_symbs); +/// Initialize a Huffman decoding table using the table of weights provided +/// Weights follow the definition provided in the Zstandard specification +static void HUF_init_dtable_usingweights(HUF_dtable *const table, + const u8 *const weights, + const int num_symbs); + +/// Free the malloc'ed parts of a decoding table +static void HUF_free_dtable(HUF_dtable *const dtable); +/*** END HUFFMAN PRIMITIVES ***********/ + +/*** FSE PRIMITIVES *******************/ +/// For more description of FSE see +/// https://github.com/Cyan4973/FiniteStateEntropy/ + +// FSE table decoding uses exponential memory, so limit the maximum accuracy +#define FSE_MAX_ACCURACY_LOG (15) +// Limit the maximum number of symbols so they can be stored in a single byte +#define FSE_MAX_SYMBS (256) + +/// The tables needed to decode FSE encoded streams +typedef struct { + u8 *symbols; + u8 *num_bits; + u16 *new_state_base; + int accuracy_log; +} FSE_dtable; + +/// Return the symbol for the current state +static inline u8 FSE_peek_symbol(const FSE_dtable *const dtable, + const u16 state); +/// Read the number of bits necessary to update state, update, and shift offset +/// back to reflect the bits read +static inline void FSE_update_state(const FSE_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset); + +/// Combine peek and update: decode a symbol and update the state +static inline u8 FSE_decode_symbol(const FSE_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset); + +/// Read bits from the stream to initialize the state and shift offset back +static inline void FSE_init_state(const FSE_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset); + +/// Decompress two interleaved bitstreams (e.g. compressed Huffman weights) +/// using an FSE decoding table. `src_len` must be the exact length of the +/// block. +static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable, + ostream_t *const out, + istream_t *const in); + +/// Initialize a decoding table using normalized frequencies. +static void FSE_init_dtable(FSE_dtable *const dtable, + const i16 *const norm_freqs, const int num_symbs, + const int accuracy_log); + +/// Decode an FSE header as defined in the Zstandard format specification and +/// use the decoded frequencies to initialize a decoding table. +static void FSE_decode_header(FSE_dtable *const dtable, istream_t *const in, + const int max_accuracy_log); + +/// Initialize an FSE table that will always return the same symbol and consume +/// 0 bits per symbol, to be used for RLE mode in sequence commands +static void FSE_init_dtable_rle(FSE_dtable *const dtable, const u8 symb); + +/// Free the malloc'ed parts of a decoding table +static void FSE_free_dtable(FSE_dtable *const dtable); +/*** END FSE PRIMITIVES ***************/ + +/******* END IMPLEMENTATION PRIMITIVE PROTOTYPES ******************************/ + +/******* ZSTD HELPER STRUCTS AND PROTOTYPES ***********************************/ + +/// A small structure that can be reused in various places that need to access +/// frame header information +typedef struct { + // The size of window that we need to be able to contiguously store for + // references + size_t window_size; + // The total output size of this compressed frame + size_t frame_content_size; + + // The dictionary id if this frame uses one + u32 dictionary_id; + + // Whether or not the content of this frame has a checksum + int content_checksum_flag; + // Whether or not the output for this frame is in a single segment + int single_segment_flag; +} frame_header_t; + +/// The context needed to decode blocks in a frame +typedef struct { + frame_header_t header; + + // The total amount of data available for backreferences, to determine if an + // offset too large to be correct + size_t current_total_output; + + const u8 *dict_content; + size_t dict_content_len; + + // Entropy encoding tables so they can be repeated by future blocks instead + // of retransmitting + HUF_dtable literals_dtable; + FSE_dtable ll_dtable; + FSE_dtable ml_dtable; + FSE_dtable of_dtable; + + // The last 3 offsets for the special "repeat offsets". + u64 previous_offsets[3]; +} frame_context_t; + +/// The decoded contents of a dictionary so that it doesn't have to be repeated +/// for each frame that uses it +struct dictionary_s { + // Entropy tables + HUF_dtable literals_dtable; + FSE_dtable ll_dtable; + FSE_dtable ml_dtable; + FSE_dtable of_dtable; + + // Raw content for backreferences + u8 *content; + size_t content_size; + + // Offset history to prepopulate the frame's history + u64 previous_offsets[3]; + + u32 dictionary_id; +}; + +/// A tuple containing the parts necessary to decode and execute a ZSTD sequence +/// command +typedef struct { + u32 literal_length; + u32 match_length; + u32 offset; +} sequence_command_t; + +/// The decoder works top-down, starting at the high level like Zstd frames, and +/// working down to lower more technical levels such as blocks, literals, and +/// sequences. The high-level functions roughly follow the outline of the +/// format specification: +/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md + +/// Before the implementation of each high-level function declared here, the +/// prototypes for their helper functions are defined and explained + +/// Decode a single Zstd frame, or error if the input is not a valid frame. +/// Accepts a dict argument, which may be NULL indicating no dictionary. +/// See +/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame-concatenation +static void decode_frame(ostream_t *const out, istream_t *const in, + const dictionary_t *const dict); + +// Decode data in a compressed block +static void decompress_block(frame_context_t *const ctx, ostream_t *const out, + istream_t *const in); + +// Decode the literals section of a block +static size_t decode_literals(frame_context_t *const ctx, istream_t *const in, + u8 **const literals); + +// Decode the sequences part of a block +static size_t decode_sequences(frame_context_t *const ctx, istream_t *const in, + sequence_command_t **const sequences); + +// Execute the decoded sequences on the literals block +static void execute_sequences(frame_context_t *const ctx, ostream_t *const out, + const u8 *const literals, + const size_t literals_len, + const sequence_command_t *const sequences, + const size_t num_sequences); + +// Copies literals and returns the total literal length that was copied +static u32 copy_literals(const size_t seq, istream_t *litstream, + ostream_t *const out); + +// Given an offset code from a sequence command (either an actual offset value +// or an index for previous offset), computes the correct offset and updates +// the offset history +static size_t compute_offset(sequence_command_t seq, u64 *const offset_hist); + +// Given an offset, match length, and total output, as well as the frame +// context for the dictionary, determines if the dictionary is used and +// executes the copy operation +static void execute_match_copy(frame_context_t *const ctx, size_t offset, + size_t match_length, size_t total_output, + ostream_t *const out); + +/******* END ZSTD HELPER STRUCTS AND PROTOTYPES *******************************/ + +size_t ZSTD_decompress(void *const dst, const size_t dst_len, + const void *const src, const size_t src_len) { + dictionary_t* const uninit_dict = create_dictionary(); + size_t const decomp_size = ZSTD_decompress_with_dict(dst, dst_len, src, + src_len, uninit_dict); + free_dictionary(uninit_dict); + return decomp_size; +} + +size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len, + const void *const src, const size_t src_len, + dictionary_t* parsed_dict) { + + istream_t in = IO_make_istream(src, src_len); + ostream_t out = IO_make_ostream(dst, dst_len); + + // "A content compressed by Zstandard is transformed into a Zstandard frame. + // Multiple frames can be appended into a single file or stream. A frame is + // totally independent, has a defined beginning and end, and a set of + // parameters which tells the decoder how to decompress it." + + /* this decoder assumes decompression of a single frame */ + decode_frame(&out, &in, parsed_dict); + + return (size_t)(out.ptr - (u8 *)dst); +} + +/******* FRAME DECODING ******************************************************/ + +static void decode_data_frame(ostream_t *const out, istream_t *const in, + const dictionary_t *const dict); +static void init_frame_context(frame_context_t *const context, + istream_t *const in, + const dictionary_t *const dict); +static void free_frame_context(frame_context_t *const context); +static void parse_frame_header(frame_header_t *const header, + istream_t *const in); +static void frame_context_apply_dict(frame_context_t *const ctx, + const dictionary_t *const dict); + +static void decompress_data(frame_context_t *const ctx, ostream_t *const out, + istream_t *const in); + +static void decode_frame(ostream_t *const out, istream_t *const in, + const dictionary_t *const dict) { + const u32 magic_number = (u32)IO_read_bits(in, 32); + if (magic_number == ZSTD_MAGIC_NUMBER) { + // ZSTD frame + decode_data_frame(out, in, dict); + + return; + } + + // not a real frame or a skippable frame + ERROR("Tried to decode non-ZSTD frame"); +} + +/// Decode a frame that contains compressed data. Not all frames do as there +/// are skippable frames. +/// See +/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#general-structure-of-zstandard-frame-format +static void decode_data_frame(ostream_t *const out, istream_t *const in, + const dictionary_t *const dict) { + frame_context_t ctx; + + // Initialize the context that needs to be carried from block to block + init_frame_context(&ctx, in, dict); + + if (ctx.header.frame_content_size != 0 && + ctx.header.frame_content_size > out->len) { + OUT_SIZE(); + } + + decompress_data(&ctx, out, in); + + free_frame_context(&ctx); +} + +/// Takes the information provided in the header and dictionary, and initializes +/// the context for this frame +static void init_frame_context(frame_context_t *const context, + istream_t *const in, + const dictionary_t *const dict) { + // Most fields in context are correct when initialized to 0 + memset(context, 0, sizeof(frame_context_t)); + + // Parse data from the frame header + parse_frame_header(&context->header, in); + + // Set up the offset history for the repeat offset commands + context->previous_offsets[0] = 1; + context->previous_offsets[1] = 4; + context->previous_offsets[2] = 8; + + // Apply details from the dict if it exists + frame_context_apply_dict(context, dict); +} + +static void free_frame_context(frame_context_t *const context) { + HUF_free_dtable(&context->literals_dtable); + + FSE_free_dtable(&context->ll_dtable); + FSE_free_dtable(&context->ml_dtable); + FSE_free_dtable(&context->of_dtable); + + memset(context, 0, sizeof(frame_context_t)); +} + +static void parse_frame_header(frame_header_t *const header, + istream_t *const in) { + // "The first header's byte is called the Frame_Header_Descriptor. It tells + // which other fields are present. Decoding this byte is enough to tell the + // size of Frame_Header. + // + // Bit number Field name + // 7-6 Frame_Content_Size_flag + // 5 Single_Segment_flag + // 4 Unused_bit + // 3 Reserved_bit + // 2 Content_Checksum_flag + // 1-0 Dictionary_ID_flag" + const u8 descriptor = (u8)IO_read_bits(in, 8); + + // decode frame header descriptor into flags + const u8 frame_content_size_flag = descriptor >> 6; + const u8 single_segment_flag = (descriptor >> 5) & 1; + const u8 reserved_bit = (descriptor >> 3) & 1; + const u8 content_checksum_flag = (descriptor >> 2) & 1; + const u8 dictionary_id_flag = descriptor & 3; + + if (reserved_bit != 0) { + CORRUPTION(); + } + + header->single_segment_flag = single_segment_flag; + header->content_checksum_flag = content_checksum_flag; + + // decode window size + if (!single_segment_flag) { + // "Provides guarantees on maximum back-reference distance that will be + // used within compressed data. This information is important for + // decoders to allocate enough memory. + // + // Bit numbers 7-3 2-0 + // Field name Exponent Mantissa" + u8 window_descriptor = (u8)IO_read_bits(in, 8); + u8 exponent = window_descriptor >> 3; + u8 mantissa = window_descriptor & 7; + + // Use the algorithm from the specification to compute window size + // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor + size_t window_base = (size_t)1 << (10 + exponent); + size_t window_add = (window_base / 8) * mantissa; + header->window_size = window_base + window_add; + } + + // decode dictionary id if it exists + if (dictionary_id_flag) { + // "This is a variable size field, which contains the ID of the + // dictionary required to properly decode the frame. Note that this + // field is optional. When it's not present, it's up to the caller to + // make sure it uses the correct dictionary. Format is little-endian." + const int bytes_array[] = {0, 1, 2, 4}; + const int bytes = bytes_array[dictionary_id_flag]; + + header->dictionary_id = (u32)IO_read_bits(in, bytes * 8); + } else { + header->dictionary_id = 0; + } + + // decode frame content size if it exists + if (single_segment_flag || frame_content_size_flag) { + // "This is the original (uncompressed) size. This information is + // optional. The Field_Size is provided according to value of + // Frame_Content_Size_flag. The Field_Size can be equal to 0 (not + // present), 1, 2, 4 or 8 bytes. Format is little-endian." + // + // if frame_content_size_flag == 0 but single_segment_flag is set, we + // still have a 1 byte field + const int bytes_array[] = {1, 2, 4, 8}; + const int bytes = bytes_array[frame_content_size_flag]; + + header->frame_content_size = IO_read_bits(in, bytes * 8); + if (bytes == 2) { + // "When Field_Size is 2, the offset of 256 is added." + header->frame_content_size += 256; + } + } else { + header->frame_content_size = 0; + } + + if (single_segment_flag) { + // "The Window_Descriptor byte is optional. It is absent when + // Single_Segment_flag is set. In this case, the maximum back-reference + // distance is the content size itself, which can be any value from 1 to + // 2^64-1 bytes (16 EB)." + header->window_size = header->frame_content_size; + } +} + +/// Decompress the data from a frame block by block +static void decompress_data(frame_context_t *const ctx, ostream_t *const out, + istream_t *const in) { + // "A frame encapsulates one or multiple blocks. Each block can be + // compressed or not, and has a guaranteed maximum content size, which + // depends on frame parameters. Unlike frames, each block depends on + // previous blocks for proper decoding. However, each block can be + // decompressed without waiting for its successor, allowing streaming + // operations." + int last_block = 0; + do { + // "Last_Block + // + // The lowest bit signals if this block is the last one. Frame ends + // right after this block. + // + // Block_Type and Block_Size + // + // The next 2 bits represent the Block_Type, while the remaining 21 bits + // represent the Block_Size. Format is little-endian." + last_block = (int)IO_read_bits(in, 1); + const int block_type = (int)IO_read_bits(in, 2); + const size_t block_len = IO_read_bits(in, 21); + + switch (block_type) { + case 0: { + // "Raw_Block - this is an uncompressed block. Block_Size is the + // number of bytes to read and copy." + const u8 *const read_ptr = IO_get_read_ptr(in, block_len); + u8 *const write_ptr = IO_get_write_ptr(out, block_len); + + // Copy the raw data into the output + memcpy(write_ptr, read_ptr, block_len); + + ctx->current_total_output += block_len; + break; + } + case 1: { + // "RLE_Block - this is a single byte, repeated N times. In which + // case, Block_Size is the size to regenerate, while the + // "compressed" block is just 1 byte (the byte to repeat)." + const u8 *const read_ptr = IO_get_read_ptr(in, 1); + u8 *const write_ptr = IO_get_write_ptr(out, block_len); + + // Copy `block_len` copies of `read_ptr[0]` to the output + memset(write_ptr, read_ptr[0], block_len); + + ctx->current_total_output += block_len; + break; + } + case 2: { + // "Compressed_Block - this is a Zstandard compressed block, + // detailed in another section of this specification. Block_Size is + // the compressed size. + + // Create a sub-stream for the block + istream_t block_stream = IO_make_sub_istream(in, block_len); + decompress_block(ctx, out, &block_stream); + break; + } + case 3: + // "Reserved - this is not a block. This value cannot be used with + // current version of this specification." + CORRUPTION(); + break; + default: + IMPOSSIBLE(); + } + } while (!last_block); + + if (ctx->header.content_checksum_flag) { + // This program does not support checking the checksum, so skip over it + // if it's present + IO_advance_input(in, 4); + } +} +/******* END FRAME DECODING ***************************************************/ + +/******* BLOCK DECOMPRESSION **************************************************/ +static void decompress_block(frame_context_t *const ctx, ostream_t *const out, + istream_t *const in) { + // "A compressed block consists of 2 sections : + // + // Literals_Section + // Sequences_Section" + + + // Part 1: decode the literals block + u8 *literals = NULL; + const size_t literals_size = decode_literals(ctx, in, &literals); + + // Part 2: decode the sequences block + sequence_command_t *sequences = NULL; + const size_t num_sequences = + decode_sequences(ctx, in, &sequences); + + // Part 3: combine literals and sequence commands to generate output + execute_sequences(ctx, out, literals, literals_size, sequences, + num_sequences); + free(literals); + free(sequences); +} +/******* END BLOCK DECOMPRESSION **********************************************/ + +/******* LITERALS DECODING ****************************************************/ +static size_t decode_literals_simple(istream_t *const in, u8 **const literals, + const int block_type, + const int size_format); +static size_t decode_literals_compressed(frame_context_t *const ctx, + istream_t *const in, + u8 **const literals, + const int block_type, + const int size_format); +static void decode_huf_table(HUF_dtable *const dtable, istream_t *const in); +static void fse_decode_hufweights(ostream_t *weights, istream_t *const in, + int *const num_symbs); + +static size_t decode_literals(frame_context_t *const ctx, istream_t *const in, + u8 **const literals) { + // "Literals can be stored uncompressed or compressed using Huffman prefix + // codes. When compressed, an optional tree description can be present, + // followed by 1 or 4 streams." + // + // "Literals_Section_Header + // + // Header is in charge of describing how literals are packed. It's a + // byte-aligned variable-size bitfield, ranging from 1 to 5 bytes, using + // little-endian convention." + // + // "Literals_Block_Type + // + // This field uses 2 lowest bits of first byte, describing 4 different block + // types" + // + // size_format takes between 1 and 2 bits + int block_type = (int)IO_read_bits(in, 2); + int size_format = (int)IO_read_bits(in, 2); + + if (block_type <= 1) { + // Raw or RLE literals block + return decode_literals_simple(in, literals, block_type, + size_format); + } else { + // Huffman compressed literals + return decode_literals_compressed(ctx, in, literals, block_type, + size_format); + } +} + +/// Decodes literals blocks in raw or RLE form +static size_t decode_literals_simple(istream_t *const in, u8 **const literals, + const int block_type, + const int size_format) { + size_t size; + switch (size_format) { + // These cases are in the form ?0 + // In this case, the ? bit is actually part of the size field + case 0: + case 2: + // "Size_Format uses 1 bit. Regenerated_Size uses 5 bits (0-31)." + IO_rewind_bits(in, 1); + size = IO_read_bits(in, 5); + break; + case 1: + // "Size_Format uses 2 bits. Regenerated_Size uses 12 bits (0-4095)." + size = IO_read_bits(in, 12); + break; + case 3: + // "Size_Format uses 2 bits. Regenerated_Size uses 20 bits (0-1048575)." + size = IO_read_bits(in, 20); + break; + default: + // Size format is in range 0-3 + IMPOSSIBLE(); + } + + if (size > MAX_LITERALS_SIZE) { + CORRUPTION(); + } + + *literals = malloc(size); + if (!*literals) { + BAD_ALLOC(); + } + + switch (block_type) { + case 0: { + // "Raw_Literals_Block - Literals are stored uncompressed." + const u8 *const read_ptr = IO_get_read_ptr(in, size); + memcpy(*literals, read_ptr, size); + break; + } + case 1: { + // "RLE_Literals_Block - Literals consist of a single byte value repeated N times." + const u8 *const read_ptr = IO_get_read_ptr(in, 1); + memset(*literals, read_ptr[0], size); + break; + } + default: + IMPOSSIBLE(); + } + + return size; +} + +/// Decodes Huffman compressed literals +static size_t decode_literals_compressed(frame_context_t *const ctx, + istream_t *const in, + u8 **const literals, + const int block_type, + const int size_format) { + size_t regenerated_size, compressed_size; + // Only size_format=0 has 1 stream, so default to 4 + int num_streams = 4; + switch (size_format) { + case 0: + // "A single stream. Both Compressed_Size and Regenerated_Size use 10 + // bits (0-1023)." + num_streams = 1; + // Fall through as it has the same size format + /* fallthrough */ + case 1: + // "4 streams. Both Compressed_Size and Regenerated_Size use 10 bits + // (0-1023)." + regenerated_size = IO_read_bits(in, 10); + compressed_size = IO_read_bits(in, 10); + break; + case 2: + // "4 streams. Both Compressed_Size and Regenerated_Size use 14 bits + // (0-16383)." + regenerated_size = IO_read_bits(in, 14); + compressed_size = IO_read_bits(in, 14); + break; + case 3: + // "4 streams. Both Compressed_Size and Regenerated_Size use 18 bits + // (0-262143)." + regenerated_size = IO_read_bits(in, 18); + compressed_size = IO_read_bits(in, 18); + break; + default: + // Impossible + IMPOSSIBLE(); + } + if (regenerated_size > MAX_LITERALS_SIZE) { + CORRUPTION(); + } + + *literals = malloc(regenerated_size); + if (!*literals) { + BAD_ALLOC(); + } + + ostream_t lit_stream = IO_make_ostream(*literals, regenerated_size); + istream_t huf_stream = IO_make_sub_istream(in, compressed_size); + + if (block_type == 2) { + // Decode the provided Huffman table + // "This section is only present when Literals_Block_Type type is + // Compressed_Literals_Block (2)." + + HUF_free_dtable(&ctx->literals_dtable); + decode_huf_table(&ctx->literals_dtable, &huf_stream); + } else { + // If the previous Huffman table is being repeated, ensure it exists + if (!ctx->literals_dtable.symbols) { + CORRUPTION(); + } + } + + size_t symbols_decoded; + if (num_streams == 1) { + symbols_decoded = HUF_decompress_1stream(&ctx->literals_dtable, &lit_stream, &huf_stream); + } else { + symbols_decoded = HUF_decompress_4stream(&ctx->literals_dtable, &lit_stream, &huf_stream); + } + + if (symbols_decoded != regenerated_size) { + CORRUPTION(); + } + + return regenerated_size; +} + +// Decode the Huffman table description +static void decode_huf_table(HUF_dtable *const dtable, istream_t *const in) { + // "All literal values from zero (included) to last present one (excluded) + // are represented by Weight with values from 0 to Max_Number_of_Bits." + + // "This is a single byte value (0-255), which describes how to decode the list of weights." + const u8 header = IO_read_bits(in, 8); + + u8 weights[HUF_MAX_SYMBS]; + memset(weights, 0, sizeof(weights)); + + int num_symbs; + + if (header >= 128) { + // "This is a direct representation, where each Weight is written + // directly as a 4 bits field (0-15). The full representation occupies + // ((Number_of_Symbols+1)/2) bytes, meaning it uses a last full byte + // even if Number_of_Symbols is odd. Number_of_Symbols = headerByte - + // 127" + num_symbs = header - 127; + const size_t bytes = (num_symbs + 1) / 2; + + const u8 *const weight_src = IO_get_read_ptr(in, bytes); + + for (int i = 0; i < num_symbs; i++) { + // "They are encoded forward, 2 + // weights to a byte with the first weight taking the top four bits + // and the second taking the bottom four (e.g. the following + // operations could be used to read the weights: Weight[0] = + // (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf), etc.)." + if (i % 2 == 0) { + weights[i] = weight_src[i / 2] >> 4; + } else { + weights[i] = weight_src[i / 2] & 0xf; + } + } + } else { + // The weights are FSE encoded, decode them before we can construct the + // table + istream_t fse_stream = IO_make_sub_istream(in, header); + ostream_t weight_stream = IO_make_ostream(weights, HUF_MAX_SYMBS); + fse_decode_hufweights(&weight_stream, &fse_stream, &num_symbs); + } + + // Construct the table using the decoded weights + HUF_init_dtable_usingweights(dtable, weights, num_symbs); +} + +static void fse_decode_hufweights(ostream_t *weights, istream_t *const in, + int *const num_symbs) { + const int MAX_ACCURACY_LOG = 7; + + FSE_dtable dtable; + + // "An FSE bitstream starts by a header, describing probabilities + // distribution. It will create a Decoding Table. For a list of Huffman + // weights, maximum accuracy is 7 bits." + FSE_decode_header(&dtable, in, MAX_ACCURACY_LOG); + + // Decode the weights + *num_symbs = FSE_decompress_interleaved2(&dtable, weights, in); + + FSE_free_dtable(&dtable); +} +/******* END LITERALS DECODING ************************************************/ + +/******* SEQUENCE DECODING ****************************************************/ +/// The combination of FSE states needed to decode sequences +typedef struct { + FSE_dtable ll_table; + FSE_dtable of_table; + FSE_dtable ml_table; + + u16 ll_state; + u16 of_state; + u16 ml_state; +} sequence_states_t; + +/// Different modes to signal to decode_seq_tables what to do +typedef enum { + seq_literal_length = 0, + seq_offset = 1, + seq_match_length = 2, +} seq_part_t; + +typedef enum { + seq_predefined = 0, + seq_rle = 1, + seq_fse = 2, + seq_repeat = 3, +} seq_mode_t; + +/// The predefined FSE distribution tables for `seq_predefined` mode +static const i16 SEQ_LITERAL_LENGTH_DEFAULT_DIST[36] = { + 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, -1, -1, -1, -1}; +static const i16 SEQ_OFFSET_DEFAULT_DIST[29] = { + 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1}; +static const i16 SEQ_MATCH_LENGTH_DEFAULT_DIST[53] = { + 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1}; + +/// The sequence decoding baseline and number of additional bits to read/add +/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#the-codes-for-literals-lengths-match-lengths-and-offsets +static const u32 SEQ_LITERAL_LENGTH_BASELINES[36] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 18, 20, 22, 24, 28, 32, 40, + 48, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; +static const u8 SEQ_LITERAL_LENGTH_EXTRA_BITS[36] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + +static const u32 SEQ_MATCH_LENGTH_BASELINES[53] = { + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, + 99, 131, 259, 515, 1027, 2051, 4099, 8195, 16387, 32771, 65539}; +static const u8 SEQ_MATCH_LENGTH_EXTRA_BITS[53] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + +/// Offset decoding is simpler so we just need a maximum code value +static const u8 SEQ_MAX_CODES[3] = {35, (u8)-1, 52}; + +static void decompress_sequences(frame_context_t *const ctx, + istream_t *const in, + sequence_command_t *const sequences, + const size_t num_sequences); +static sequence_command_t decode_sequence(sequence_states_t *const state, + const u8 *const src, + i64 *const offset, + int lastSequence); +static void decode_seq_table(FSE_dtable *const table, istream_t *const in, + const seq_part_t type, const seq_mode_t mode); + +static size_t decode_sequences(frame_context_t *const ctx, istream_t *in, + sequence_command_t **const sequences) { + // "A compressed block is a succession of sequences . A sequence is a + // literal copy command, followed by a match copy command. A literal copy + // command specifies a length. It is the number of bytes to be copied (or + // extracted) from the literal section. A match copy command specifies an + // offset and a length. The offset gives the position to copy from, which + // can be within a previous block." + + size_t num_sequences; + + // "Number_of_Sequences + // + // This is a variable size field using between 1 and 3 bytes. Let's call its + // first byte byte0." + u8 header = IO_read_bits(in, 8); + if (header < 128) { + // "Number_of_Sequences = byte0 . Uses 1 byte." + num_sequences = header; + } else if (header < 255) { + // "Number_of_Sequences = ((byte0-128) << 8) + byte1 . Uses 2 bytes." + num_sequences = ((header - 128) << 8) + IO_read_bits(in, 8); + } else { + // "Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00 . Uses 3 bytes." + num_sequences = IO_read_bits(in, 16) + 0x7F00; + } + + if (num_sequences == 0) { + // "There are no sequences. The sequence section stops there." + *sequences = NULL; + return 0; + } + + *sequences = malloc(num_sequences * sizeof(sequence_command_t)); + if (!*sequences) { + BAD_ALLOC(); + } + + decompress_sequences(ctx, in, *sequences, num_sequences); + return num_sequences; +} + +/// Decompress the FSE encoded sequence commands +static void decompress_sequences(frame_context_t *const ctx, istream_t *in, + sequence_command_t *const sequences, + const size_t num_sequences) { + // "The Sequences_Section regroup all symbols required to decode commands. + // There are 3 symbol types : literals lengths, offsets and match lengths. + // They are encoded together, interleaved, in a single bitstream." + + // "Symbol compression modes + // + // This is a single byte, defining the compression mode of each symbol + // type." + // + // Bit number : Field name + // 7-6 : Literals_Lengths_Mode + // 5-4 : Offsets_Mode + // 3-2 : Match_Lengths_Mode + // 1-0 : Reserved + u8 compression_modes = IO_read_bits(in, 8); + + if ((compression_modes & 3) != 0) { + // Reserved bits set + CORRUPTION(); + } + + // "Following the header, up to 3 distribution tables can be described. When + // present, they are in this order : + // + // Literals lengths + // Offsets + // Match Lengths" + // Update the tables we have stored in the context + decode_seq_table(&ctx->ll_dtable, in, seq_literal_length, + (compression_modes >> 6) & 3); + + decode_seq_table(&ctx->of_dtable, in, seq_offset, + (compression_modes >> 4) & 3); + + decode_seq_table(&ctx->ml_dtable, in, seq_match_length, + (compression_modes >> 2) & 3); + + + sequence_states_t states; + + // Initialize the decoding tables + { + states.ll_table = ctx->ll_dtable; + states.of_table = ctx->of_dtable; + states.ml_table = ctx->ml_dtable; + } + + const size_t len = IO_istream_len(in); + const u8 *const src = IO_get_read_ptr(in, len); + + // "After writing the last bit containing information, the compressor writes + // a single 1-bit and then fills the byte with 0-7 0 bits of padding." + const int padding = 8 - highest_set_bit(src[len - 1]); + // The offset starts at the end because FSE streams are read backwards + i64 bit_offset = (i64)(len * 8 - (size_t)padding); + + // "The bitstream starts with initial state values, each using the required + // number of bits in their respective accuracy, decoded previously from + // their normalized distribution. + // + // It starts by Literals_Length_State, followed by Offset_State, and finally + // Match_Length_State." + FSE_init_state(&states.ll_table, &states.ll_state, src, &bit_offset); + FSE_init_state(&states.of_table, &states.of_state, src, &bit_offset); + FSE_init_state(&states.ml_table, &states.ml_state, src, &bit_offset); + + for (size_t i = 0; i < num_sequences; i++) { + // Decode sequences one by one + sequences[i] = decode_sequence(&states, src, &bit_offset, i==num_sequences-1); + } + + if (bit_offset != 0) { + CORRUPTION(); + } +} + +// Decode a single sequence and update the state +static sequence_command_t decode_sequence(sequence_states_t *const states, + const u8 *const src, + i64 *const offset, + int lastSequence) { + // "Each symbol is a code in its own context, which specifies Baseline and + // Number_of_Bits to add. Codes are FSE compressed, and interleaved with raw + // additional bits in the same bitstream." + + // Decode symbols, but don't update states + const u8 of_code = FSE_peek_symbol(&states->of_table, states->of_state); + const u8 ll_code = FSE_peek_symbol(&states->ll_table, states->ll_state); + const u8 ml_code = FSE_peek_symbol(&states->ml_table, states->ml_state); + + // Offset doesn't need a max value as it's not decoded using a table + if (ll_code > SEQ_MAX_CODES[seq_literal_length] || + ml_code > SEQ_MAX_CODES[seq_match_length]) { + CORRUPTION(); + } + + // Read the interleaved bits + sequence_command_t seq; + // "Decoding starts by reading the Number_of_Bits required to decode Offset. + // It then does the same for Match_Length, and then for Literals_Length." + seq.offset = ((u32)1 << of_code) + STREAM_read_bits(src, of_code, offset); + + seq.match_length = + SEQ_MATCH_LENGTH_BASELINES[ml_code] + + STREAM_read_bits(src, SEQ_MATCH_LENGTH_EXTRA_BITS[ml_code], offset); + + seq.literal_length = + SEQ_LITERAL_LENGTH_BASELINES[ll_code] + + STREAM_read_bits(src, SEQ_LITERAL_LENGTH_EXTRA_BITS[ll_code], offset); + + // "If it is not the last sequence in the block, the next operation is to + // update states. Using the rules pre-calculated in the decoding tables, + // Literals_Length_State is updated, followed by Match_Length_State, and + // then Offset_State." + // If the stream is complete don't read bits to update state + if (!lastSequence) { + FSE_update_state(&states->ll_table, &states->ll_state, src, offset); + FSE_update_state(&states->ml_table, &states->ml_state, src, offset); + FSE_update_state(&states->of_table, &states->of_state, src, offset); + } + + return seq; +} + +/// Given a sequence part and table mode, decode the FSE distribution +/// Errors if the mode is `seq_repeat` without a pre-existing table in `table` +static void decode_seq_table(FSE_dtable *const table, istream_t *const in, + const seq_part_t type, const seq_mode_t mode) { + // Constant arrays indexed by seq_part_t + const i16 *const default_distributions[] = {SEQ_LITERAL_LENGTH_DEFAULT_DIST, + SEQ_OFFSET_DEFAULT_DIST, + SEQ_MATCH_LENGTH_DEFAULT_DIST}; + const size_t default_distribution_lengths[] = {36, 29, 53}; + const size_t default_distribution_accuracies[] = {6, 5, 6}; + + const size_t max_accuracies[] = {9, 8, 9}; + + if (mode != seq_repeat) { + // Free old one before overwriting + FSE_free_dtable(table); + } + + switch (mode) { + case seq_predefined: { + // "Predefined_Mode : uses a predefined distribution table." + const i16 *distribution = default_distributions[type]; + const size_t symbs = default_distribution_lengths[type]; + const size_t accuracy_log = default_distribution_accuracies[type]; + + FSE_init_dtable(table, distribution, symbs, accuracy_log); + break; + } + case seq_rle: { + // "RLE_Mode : it's a single code, repeated Number_of_Sequences times." + const u8 symb = IO_get_read_ptr(in, 1)[0]; + FSE_init_dtable_rle(table, symb); + break; + } + case seq_fse: { + // "FSE_Compressed_Mode : standard FSE compression. A distribution table + // will be present " + FSE_decode_header(table, in, max_accuracies[type]); + break; + } + case seq_repeat: + // "Repeat_Mode : reuse distribution table from previous compressed + // block." + // Nothing to do here, table will be unchanged + if (!table->symbols) { + // This mode is invalid if we don't already have a table + CORRUPTION(); + } + break; + default: + // Impossible, as mode is from 0-3 + IMPOSSIBLE(); + break; + } + +} +/******* END SEQUENCE DECODING ************************************************/ + +/******* SEQUENCE EXECUTION ***************************************************/ +static void execute_sequences(frame_context_t *const ctx, ostream_t *const out, + const u8 *const literals, + const size_t literals_len, + const sequence_command_t *const sequences, + const size_t num_sequences) { + istream_t litstream = IO_make_istream(literals, literals_len); + + u64 *const offset_hist = ctx->previous_offsets; + size_t total_output = ctx->current_total_output; + + for (size_t i = 0; i < num_sequences; i++) { + const sequence_command_t seq = sequences[i]; + { + const u32 literals_size = copy_literals(seq.literal_length, &litstream, out); + total_output += literals_size; + } + + size_t const offset = compute_offset(seq, offset_hist); + + size_t const match_length = seq.match_length; + + execute_match_copy(ctx, offset, match_length, total_output, out); + + total_output += match_length; + } + + // Copy any leftover literals + { + size_t len = IO_istream_len(&litstream); + copy_literals(len, &litstream, out); + total_output += len; + } + + ctx->current_total_output = total_output; +} + +static u32 copy_literals(const size_t literal_length, istream_t *litstream, + ostream_t *const out) { + // If the sequence asks for more literals than are left, the + // sequence must be corrupted + if (literal_length > IO_istream_len(litstream)) { + CORRUPTION(); + } + + u8 *const write_ptr = IO_get_write_ptr(out, literal_length); + const u8 *const read_ptr = + IO_get_read_ptr(litstream, literal_length); + // Copy literals to output + memcpy(write_ptr, read_ptr, literal_length); + + return literal_length; +} + +static size_t compute_offset(sequence_command_t seq, u64 *const offset_hist) { + size_t offset; + // Offsets are special, we need to handle the repeat offsets + if (seq.offset <= 3) { + // "The first 3 values define a repeated offset and we will call + // them Repeated_Offset1, Repeated_Offset2, and Repeated_Offset3. + // They are sorted in recency order, with Repeated_Offset1 meaning + // 'most recent one'". + + // Use 0 indexing for the array + u32 idx = seq.offset - 1; + if (seq.literal_length == 0) { + // "There is an exception though, when current sequence's + // literals length is 0. In this case, repeated offsets are + // shifted by one, so Repeated_Offset1 becomes Repeated_Offset2, + // Repeated_Offset2 becomes Repeated_Offset3, and + // Repeated_Offset3 becomes Repeated_Offset1 - 1_byte." + idx++; + } + + if (idx == 0) { + offset = offset_hist[0]; + } else { + // If idx == 3 then literal length was 0 and the offset was 3, + // as per the exception listed above + offset = idx < 3 ? offset_hist[idx] : offset_hist[0] - 1; + + // If idx == 1 we don't need to modify offset_hist[2], since + // we're using the second-most recent code + if (idx > 1) { + offset_hist[2] = offset_hist[1]; + } + offset_hist[1] = offset_hist[0]; + offset_hist[0] = offset; + } + } else { + // When it's not a repeat offset: + // "if (Offset_Value > 3) offset = Offset_Value - 3;" + offset = seq.offset - 3; + + // Shift back history + offset_hist[2] = offset_hist[1]; + offset_hist[1] = offset_hist[0]; + offset_hist[0] = offset; + } + return offset; +} + +static void execute_match_copy(frame_context_t *const ctx, size_t offset, + size_t match_length, size_t total_output, + ostream_t *const out) { + u8 *write_ptr = IO_get_write_ptr(out, match_length); + if (total_output <= ctx->header.window_size) { + // In this case offset might go back into the dictionary + if (offset > total_output + ctx->dict_content_len) { + // The offset goes beyond even the dictionary + CORRUPTION(); + } + + if (offset > total_output) { + // "The rest of the dictionary is its content. The content act + // as a "past" in front of data to compress or decompress, so it + // can be referenced in sequence commands." + const size_t dict_copy = + MIN(offset - total_output, match_length); + const size_t dict_offset = + ctx->dict_content_len - (offset - total_output); + + memcpy(write_ptr, ctx->dict_content + dict_offset, dict_copy); + write_ptr += dict_copy; + match_length -= dict_copy; + } + } else if (offset > ctx->header.window_size) { + CORRUPTION(); + } + + // We must copy byte by byte because the match length might be larger + // than the offset + // ex: if the output so far was "abc", a command with offset=3 and + // match_length=6 would produce "abcabcabc" as the new output + for (size_t j = 0; j < match_length; j++) { + *write_ptr = *(write_ptr - offset); + write_ptr++; + } +} +/******* END SEQUENCE EXECUTION ***********************************************/ + +/******* OUTPUT SIZE COUNTING *************************************************/ +/// Get the decompressed size of an input stream so memory can be allocated in +/// advance. +/// This implementation assumes `src` points to a single ZSTD-compressed frame +size_t ZSTD_get_decompressed_size(const void *src, const size_t src_len) { + istream_t in = IO_make_istream(src, src_len); + + // get decompressed size from ZSTD frame header + { + const u32 magic_number = (u32)IO_read_bits(&in, 32); + + if (magic_number == ZSTD_MAGIC_NUMBER) { + // ZSTD frame + frame_header_t header; + parse_frame_header(&header, &in); + + if (header.frame_content_size == 0 && !header.single_segment_flag) { + // Content size not provided, we can't tell + return (size_t)-1; + } + + return header.frame_content_size; + } else { + // not a real frame or skippable frame + ERROR("ZSTD frame magic number did not match"); + } + } +} +/******* END OUTPUT SIZE COUNTING *********************************************/ + +/******* DICTIONARY PARSING ***************************************************/ +dictionary_t* create_dictionary(void) { + dictionary_t* const dict = calloc(1, sizeof(dictionary_t)); + if (!dict) { + BAD_ALLOC(); + } + return dict; +} + +/// Free an allocated dictionary +void free_dictionary(dictionary_t *const dict) { + HUF_free_dtable(&dict->literals_dtable); + FSE_free_dtable(&dict->ll_dtable); + FSE_free_dtable(&dict->of_dtable); + FSE_free_dtable(&dict->ml_dtable); + + free(dict->content); + + memset(dict, 0, sizeof(dictionary_t)); + + free(dict); +} + + +#if !defined(ZDEC_NO_DICTIONARY) +#define DICT_SIZE_ERROR() ERROR("Dictionary size cannot be less than 8 bytes") +#define NULL_SRC() ERROR("Tried to create dictionary with pointer to null src"); + +static void init_dictionary_content(dictionary_t *const dict, + istream_t *const in); + +void parse_dictionary(dictionary_t *const dict, const void *src, + size_t src_len) { + const u8 *byte_src = (const u8 *)src; + memset(dict, 0, sizeof(dictionary_t)); + if (src == NULL) { /* cannot initialize dictionary with null src */ + NULL_SRC(); + } + if (src_len < 8) { + DICT_SIZE_ERROR(); + } + + istream_t in = IO_make_istream(byte_src, src_len); + + const u32 magic_number = IO_read_bits(&in, 32); + if (magic_number != 0xEC30A437) { + // raw content dict + IO_rewind_bits(&in, 32); + init_dictionary_content(dict, &in); + return; + } + + dict->dictionary_id = IO_read_bits(&in, 32); + + // "Entropy_Tables : following the same format as the tables in compressed + // blocks. They are stored in following order : Huffman tables for literals, + // FSE table for offsets, FSE table for match lengths, and FSE table for + // literals lengths. It's finally followed by 3 offset values, populating + // recent offsets (instead of using {1,4,8}), stored in order, 4-bytes + // little-endian each, for a total of 12 bytes. Each recent offset must have + // a value < dictionary size." + decode_huf_table(&dict->literals_dtable, &in); + decode_seq_table(&dict->of_dtable, &in, seq_offset, seq_fse); + decode_seq_table(&dict->ml_dtable, &in, seq_match_length, seq_fse); + decode_seq_table(&dict->ll_dtable, &in, seq_literal_length, seq_fse); + + // Read in the previous offset history + dict->previous_offsets[0] = IO_read_bits(&in, 32); + dict->previous_offsets[1] = IO_read_bits(&in, 32); + dict->previous_offsets[2] = IO_read_bits(&in, 32); + + // Ensure the provided offsets aren't too large + // "Each recent offset must have a value < dictionary size." + for (int i = 0; i < 3; i++) { + if (dict->previous_offsets[i] > src_len) { + ERROR("Dictionary corrupted"); + } + } + + // "Content : The rest of the dictionary is its content. The content act as + // a "past" in front of data to compress or decompress, so it can be + // referenced in sequence commands." + init_dictionary_content(dict, &in); +} + +static void init_dictionary_content(dictionary_t *const dict, + istream_t *const in) { + // Copy in the content + dict->content_size = IO_istream_len(in); + dict->content = malloc(dict->content_size); + if (!dict->content) { + BAD_ALLOC(); + } + + const u8 *const content = IO_get_read_ptr(in, dict->content_size); + + memcpy(dict->content, content, dict->content_size); +} + +static void HUF_copy_dtable(HUF_dtable *const dst, + const HUF_dtable *const src) { + if (src->max_bits == 0) { + memset(dst, 0, sizeof(HUF_dtable)); + return; + } + + const size_t size = (size_t)1 << src->max_bits; + dst->max_bits = src->max_bits; + + dst->symbols = malloc(size); + dst->num_bits = malloc(size); + if (!dst->symbols || !dst->num_bits) { + BAD_ALLOC(); + } + + memcpy(dst->symbols, src->symbols, size); + memcpy(dst->num_bits, src->num_bits, size); +} + +static void FSE_copy_dtable(FSE_dtable *const dst, const FSE_dtable *const src) { + if (src->accuracy_log == 0) { + memset(dst, 0, sizeof(FSE_dtable)); + return; + } + + size_t size = (size_t)1 << src->accuracy_log; + dst->accuracy_log = src->accuracy_log; + + dst->symbols = malloc(size); + dst->num_bits = malloc(size); + dst->new_state_base = malloc(size * sizeof(u16)); + if (!dst->symbols || !dst->num_bits || !dst->new_state_base) { + BAD_ALLOC(); + } + + memcpy(dst->symbols, src->symbols, size); + memcpy(dst->num_bits, src->num_bits, size); + memcpy(dst->new_state_base, src->new_state_base, size * sizeof(u16)); +} + +/// A dictionary acts as initializing values for the frame context before +/// decompression, so we implement it by applying it's predetermined +/// tables and content to the context before beginning decompression +static void frame_context_apply_dict(frame_context_t *const ctx, + const dictionary_t *const dict) { + // If the content pointer is NULL then it must be an empty dict + if (!dict || !dict->content) + return; + + // If the requested dictionary_id is non-zero, the correct dictionary must + // be present + if (ctx->header.dictionary_id != 0 && + ctx->header.dictionary_id != dict->dictionary_id) { + ERROR("Wrong dictionary provided"); + } + + // Copy the dict content to the context for references during sequence + // execution + ctx->dict_content = dict->content; + ctx->dict_content_len = dict->content_size; + + // If it's a formatted dict copy the precomputed tables in so they can + // be used in the table repeat modes + if (dict->dictionary_id != 0) { + // Deep copy the entropy tables so they can be freed independently of + // the dictionary struct + HUF_copy_dtable(&ctx->literals_dtable, &dict->literals_dtable); + FSE_copy_dtable(&ctx->ll_dtable, &dict->ll_dtable); + FSE_copy_dtable(&ctx->of_dtable, &dict->of_dtable); + FSE_copy_dtable(&ctx->ml_dtable, &dict->ml_dtable); + + // Copy the repeated offsets + memcpy(ctx->previous_offsets, dict->previous_offsets, + sizeof(ctx->previous_offsets)); + } +} + +#else // ZDEC_NO_DICTIONARY is defined + +static void frame_context_apply_dict(frame_context_t *const ctx, + const dictionary_t *const dict) { + (void)ctx; + if (dict && dict->content) ERROR("dictionary not supported"); +} + +#endif +/******* END DICTIONARY PARSING ***********************************************/ + +/******* IO STREAM OPERATIONS *************************************************/ + +/// Reads `num` bits from a bitstream, and updates the internal offset +static inline u64 IO_read_bits(istream_t *const in, const int num_bits) { + if (num_bits > 64 || num_bits <= 0) { + ERROR("Attempt to read an invalid number of bits"); + } + + const size_t bytes = (num_bits + in->bit_offset + 7) / 8; + const size_t full_bytes = (num_bits + in->bit_offset) / 8; + if (bytes > in->len) { + INP_SIZE(); + } + + const u64 result = read_bits_LE(in->ptr, num_bits, in->bit_offset); + + in->bit_offset = (num_bits + in->bit_offset) % 8; + in->ptr += full_bytes; + in->len -= full_bytes; + + return result; +} + +/// If a non-zero number of bits have been read from the current byte, advance +/// the offset to the next byte +static inline void IO_rewind_bits(istream_t *const in, int num_bits) { + if (num_bits < 0) { + ERROR("Attempting to rewind stream by a negative number of bits"); + } + + // move the offset back by `num_bits` bits + const int new_offset = in->bit_offset - num_bits; + // determine the number of whole bytes we have to rewind, rounding up to an + // integer number (e.g. if `new_offset == -5`, `bytes == 1`) + const i64 bytes = -(new_offset - 7) / 8; + + in->ptr -= bytes; + in->len += bytes; + // make sure the resulting `bit_offset` is positive, as mod in C does not + // convert numbers from negative to positive (e.g. -22 % 8 == -6) + in->bit_offset = ((new_offset % 8) + 8) % 8; +} + +/// If the remaining bits in a byte will be unused, advance to the end of the +/// byte +static inline void IO_align_stream(istream_t *const in) { + if (in->bit_offset != 0) { + if (in->len == 0) { + INP_SIZE(); + } + in->ptr++; + in->len--; + in->bit_offset = 0; + } +} + +/// Write the given byte into the output stream +static inline void IO_write_byte(ostream_t *const out, u8 symb) { + if (out->len == 0) { + OUT_SIZE(); + } + + out->ptr[0] = symb; + out->ptr++; + out->len--; +} + +/// Returns the number of bytes left to be read in this stream. The stream must +/// be byte aligned. +static inline size_t IO_istream_len(const istream_t *const in) { + return in->len; +} + +/// Returns a pointer where `len` bytes can be read, and advances the internal +/// state. The stream must be byte aligned. +static inline const u8 *IO_get_read_ptr(istream_t *const in, size_t len) { + if (len > in->len) { + INP_SIZE(); + } + if (in->bit_offset != 0) { + ERROR("Attempting to operate on a non-byte aligned stream"); + } + const u8 *const ptr = in->ptr; + in->ptr += len; + in->len -= len; + + return ptr; +} +/// Returns a pointer to write `len` bytes to, and advances the internal state +static inline u8 *IO_get_write_ptr(ostream_t *const out, size_t len) { + if (len > out->len) { + OUT_SIZE(); + } + u8 *const ptr = out->ptr; + out->ptr += len; + out->len -= len; + + return ptr; +} + +/// Advance the inner state by `len` bytes +static inline void IO_advance_input(istream_t *const in, size_t len) { + if (len > in->len) { + INP_SIZE(); + } + if (in->bit_offset != 0) { + ERROR("Attempting to operate on a non-byte aligned stream"); + } + + in->ptr += len; + in->len -= len; +} + +/// Returns an `ostream_t` constructed from the given pointer and length +static inline ostream_t IO_make_ostream(u8 *out, size_t len) { + return (ostream_t) { out, len }; +} + +/// Returns an `istream_t` constructed from the given pointer and length +static inline istream_t IO_make_istream(const u8 *in, size_t len) { + return (istream_t) { in, len, 0 }; +} + +/// Returns an `istream_t` with the same base as `in`, and length `len` +/// Then, advance `in` to account for the consumed bytes +/// `in` must be byte aligned +static inline istream_t IO_make_sub_istream(istream_t *const in, size_t len) { + // Consume `len` bytes of the parent stream + const u8 *const ptr = IO_get_read_ptr(in, len); + + // Make a substream using the pointer to those `len` bytes + return IO_make_istream(ptr, len); +} +/******* END IO STREAM OPERATIONS *********************************************/ + +/******* BITSTREAM OPERATIONS *************************************************/ +/// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits +static inline u64 read_bits_LE(const u8 *src, const int num_bits, + const size_t offset) { + if (num_bits > 64) { + ERROR("Attempt to read an invalid number of bits"); + } + + // Skip over bytes that aren't in range + src += offset / 8; + size_t bit_offset = offset % 8; + u64 res = 0; + + int shift = 0; + int left = num_bits; + while (left > 0) { + u64 mask = left >= 8 ? 0xff : (((u64)1 << left) - 1); + // Read the next byte, shift it to account for the offset, and then mask + // out the top part if we don't need all the bits + res += (((u64)*src++ >> bit_offset) & mask) << shift; + shift += 8 - bit_offset; + left -= 8 - bit_offset; + bit_offset = 0; + } + + return res; +} + +/// Read bits from the end of a HUF or FSE bitstream. `offset` is in bits, so +/// it updates `offset` to `offset - bits`, and then reads `bits` bits from +/// `src + offset`. If the offset becomes negative, the extra bits at the +/// bottom are filled in with `0` bits instead of reading from before `src`. +static inline u64 STREAM_read_bits(const u8 *const src, const int bits, + i64 *const offset) { + *offset = *offset - bits; + size_t actual_off = *offset; + size_t actual_bits = bits; + // Don't actually read bits from before the start of src, so if `*offset < + // 0` fix actual_off and actual_bits to reflect the quantity to read + if (*offset < 0) { + actual_bits += *offset; + actual_off = 0; + } + u64 res = read_bits_LE(src, actual_bits, actual_off); + + if (*offset < 0) { + // Fill in the bottom "overflowed" bits with 0's + res = -*offset >= 64 ? 0 : (res << -*offset); + } + return res; +} +/******* END BITSTREAM OPERATIONS *********************************************/ + +/******* BIT COUNTING OPERATIONS **********************************************/ +/// Returns `x`, where `2^x` is the largest power of 2 less than or equal to +/// `num`, or `-1` if `num == 0`. +static inline int highest_set_bit(const u64 num) { + for (int i = 63; i >= 0; i--) { + if (((u64)1 << i) <= num) { + return i; + } + } + return -1; +} +/******* END BIT COUNTING OPERATIONS ******************************************/ + +/******* HUFFMAN PRIMITIVES ***************************************************/ +static inline u8 HUF_decode_symbol(const HUF_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset) { + // Look up the symbol and number of bits to read + const u8 symb = dtable->symbols[*state]; + const u8 bits = dtable->num_bits[*state]; + const u16 rest = STREAM_read_bits(src, bits, offset); + // Shift `bits` bits out of the state, keeping the low order bits that + // weren't necessary to determine this symbol. Then add in the new bits + // read from the stream. + *state = ((*state << bits) + rest) & (((u16)1 << dtable->max_bits) - 1); + + return symb; +} + +static inline void HUF_init_state(const HUF_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset) { + // Read in a full `dtable->max_bits` bits to initialize the state + const u8 bits = dtable->max_bits; + *state = STREAM_read_bits(src, bits, offset); +} + +static size_t HUF_decompress_1stream(const HUF_dtable *const dtable, + ostream_t *const out, + istream_t *const in) { + const size_t len = IO_istream_len(in); + if (len == 0) { + INP_SIZE(); + } + const u8 *const src = IO_get_read_ptr(in, len); + + // "Each bitstream must be read backward, that is starting from the end down + // to the beginning. Therefore it's necessary to know the size of each + // bitstream. + // + // It's also necessary to know exactly which bit is the latest. This is + // detected by a final bit flag : the highest bit of latest byte is a + // final-bit-flag. Consequently, a last byte of 0 is not possible. And the + // final-bit-flag itself is not part of the useful bitstream. Hence, the + // last byte contains between 0 and 7 useful bits." + const int padding = 8 - highest_set_bit(src[len - 1]); + + // Offset starts at the end because HUF streams are read backwards + i64 bit_offset = len * 8 - padding; + u16 state; + + HUF_init_state(dtable, &state, src, &bit_offset); + + size_t symbols_written = 0; + while (bit_offset > -dtable->max_bits) { + // Iterate over the stream, decoding one symbol at a time + IO_write_byte(out, HUF_decode_symbol(dtable, &state, src, &bit_offset)); + symbols_written++; + } + // "The process continues up to reading the required number of symbols per + // stream. If a bitstream is not entirely and exactly consumed, hence + // reaching exactly its beginning position with all bits consumed, the + // decoding process is considered faulty." + + // When all symbols have been decoded, the final state value shouldn't have + // any data from the stream, so it should have "read" dtable->max_bits from + // before the start of `src` + // Therefore `offset`, the edge to start reading new bits at, should be + // dtable->max_bits before the start of the stream + if (bit_offset != -dtable->max_bits) { + CORRUPTION(); + } + + return symbols_written; +} + +static size_t HUF_decompress_4stream(const HUF_dtable *const dtable, + ostream_t *const out, istream_t *const in) { + // "Compressed size is provided explicitly : in the 4-streams variant, + // bitstreams are preceded by 3 unsigned little-endian 16-bits values. Each + // value represents the compressed size of one stream, in order. The last + // stream size is deducted from total compressed size and from previously + // decoded stream sizes" + const size_t csize1 = IO_read_bits(in, 16); + const size_t csize2 = IO_read_bits(in, 16); + const size_t csize3 = IO_read_bits(in, 16); + + istream_t in1 = IO_make_sub_istream(in, csize1); + istream_t in2 = IO_make_sub_istream(in, csize2); + istream_t in3 = IO_make_sub_istream(in, csize3); + istream_t in4 = IO_make_sub_istream(in, IO_istream_len(in)); + + size_t total_output = 0; + // Decode each stream independently for simplicity + // If we wanted to we could decode all 4 at the same time for speed, + // utilizing more execution units + total_output += HUF_decompress_1stream(dtable, out, &in1); + total_output += HUF_decompress_1stream(dtable, out, &in2); + total_output += HUF_decompress_1stream(dtable, out, &in3); + total_output += HUF_decompress_1stream(dtable, out, &in4); + + return total_output; +} + +/// Initializes a Huffman table using canonical Huffman codes +/// For more explanation on canonical Huffman codes see +/// https://www.cs.scranton.edu/~mccloske/courses/cmps340/huff_canonical_dec2015.html +/// Codes within a level are allocated in symbol order (i.e. smaller symbols get +/// earlier codes) +static void HUF_init_dtable(HUF_dtable *const table, const u8 *const bits, + const int num_symbs) { + memset(table, 0, sizeof(HUF_dtable)); + if (num_symbs > HUF_MAX_SYMBS) { + ERROR("Too many symbols for Huffman"); + } + + u8 max_bits = 0; + u16 rank_count[HUF_MAX_BITS + 1]; + memset(rank_count, 0, sizeof(rank_count)); + + // Count the number of symbols for each number of bits, and determine the + // depth of the tree + for (int i = 0; i < num_symbs; i++) { + if (bits[i] > HUF_MAX_BITS) { + ERROR("Huffman table depth too large"); + } + max_bits = MAX(max_bits, bits[i]); + rank_count[bits[i]]++; + } + + const size_t table_size = 1 << max_bits; + table->max_bits = max_bits; + table->symbols = malloc(table_size); + table->num_bits = malloc(table_size); + + if (!table->symbols || !table->num_bits) { + free(table->symbols); + free(table->num_bits); + BAD_ALLOC(); + } + + // "Symbols are sorted by Weight. Within same Weight, symbols keep natural + // order. Symbols with a Weight of zero are removed. Then, starting from + // lowest weight, prefix codes are distributed in order." + + u32 rank_idx[HUF_MAX_BITS + 1]; + // Initialize the starting codes for each rank (number of bits) + rank_idx[max_bits] = 0; + for (int i = max_bits; i >= 1; i--) { + rank_idx[i - 1] = rank_idx[i] + rank_count[i] * (1 << (max_bits - i)); + // The entire range takes the same number of bits so we can memset it + memset(&table->num_bits[rank_idx[i]], i, rank_idx[i - 1] - rank_idx[i]); + } + + if (rank_idx[0] != table_size) { + CORRUPTION(); + } + + // Allocate codes and fill in the table + for (int i = 0; i < num_symbs; i++) { + if (bits[i] != 0) { + // Allocate a code for this symbol and set its range in the table + const u16 code = rank_idx[bits[i]]; + // Since the code doesn't care about the bottom `max_bits - bits[i]` + // bits of state, it gets a range that spans all possible values of + // the lower bits + const u16 len = 1 << (max_bits - bits[i]); + memset(&table->symbols[code], i, len); + rank_idx[bits[i]] += len; + } + } +} + +static void HUF_init_dtable_usingweights(HUF_dtable *const table, + const u8 *const weights, + const int num_symbs) { + // +1 because the last weight is not transmitted in the header + if (num_symbs + 1 > HUF_MAX_SYMBS) { + ERROR("Too many symbols for Huffman"); + } + + u8 bits[HUF_MAX_SYMBS]; + + u64 weight_sum = 0; + for (int i = 0; i < num_symbs; i++) { + // Weights are in the same range as bit count + if (weights[i] > HUF_MAX_BITS) { + CORRUPTION(); + } + weight_sum += weights[i] > 0 ? (u64)1 << (weights[i] - 1) : 0; + } + + // Find the first power of 2 larger than the sum + const int max_bits = highest_set_bit(weight_sum) + 1; + const u64 left_over = ((u64)1 << max_bits) - weight_sum; + // If the left over isn't a power of 2, the weights are invalid + if (left_over & (left_over - 1)) { + CORRUPTION(); + } + + // left_over is used to find the last weight as it's not transmitted + // by inverting 2^(weight - 1) we can determine the value of last_weight + const int last_weight = highest_set_bit(left_over) + 1; + + for (int i = 0; i < num_symbs; i++) { + // "Number_of_Bits = Number_of_Bits ? Max_Number_of_Bits + 1 - Weight : 0" + bits[i] = weights[i] > 0 ? (max_bits + 1 - weights[i]) : 0; + } + bits[num_symbs] = + max_bits + 1 - last_weight; // Last weight is always non-zero + + HUF_init_dtable(table, bits, num_symbs + 1); +} + +static void HUF_free_dtable(HUF_dtable *const dtable) { + free(dtable->symbols); + free(dtable->num_bits); + memset(dtable, 0, sizeof(HUF_dtable)); +} +/******* END HUFFMAN PRIMITIVES ***********************************************/ + +/******* FSE PRIMITIVES *******************************************************/ +/// For more description of FSE see +/// https://github.com/Cyan4973/FiniteStateEntropy/ + +/// Allow a symbol to be decoded without updating state +static inline u8 FSE_peek_symbol(const FSE_dtable *const dtable, + const u16 state) { + return dtable->symbols[state]; +} + +/// Consumes bits from the input and uses the current state to determine the +/// next state +static inline void FSE_update_state(const FSE_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset) { + const u8 bits = dtable->num_bits[*state]; + const u16 rest = STREAM_read_bits(src, bits, offset); + *state = dtable->new_state_base[*state] + rest; +} + +/// Decodes a single FSE symbol and updates the offset +static inline u8 FSE_decode_symbol(const FSE_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset) { + const u8 symb = FSE_peek_symbol(dtable, *state); + FSE_update_state(dtable, state, src, offset); + return symb; +} + +static inline void FSE_init_state(const FSE_dtable *const dtable, + u16 *const state, const u8 *const src, + i64 *const offset) { + // Read in a full `accuracy_log` bits to initialize the state + const u8 bits = dtable->accuracy_log; + *state = STREAM_read_bits(src, bits, offset); +} + +static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable, + ostream_t *const out, + istream_t *const in) { + const size_t len = IO_istream_len(in); + if (len == 0) { + INP_SIZE(); + } + const u8 *const src = IO_get_read_ptr(in, len); + + // "Each bitstream must be read backward, that is starting from the end down + // to the beginning. Therefore it's necessary to know the size of each + // bitstream. + // + // It's also necessary to know exactly which bit is the latest. This is + // detected by a final bit flag : the highest bit of latest byte is a + // final-bit-flag. Consequently, a last byte of 0 is not possible. And the + // final-bit-flag itself is not part of the useful bitstream. Hence, the + // last byte contains between 0 and 7 useful bits." + const int padding = 8 - highest_set_bit(src[len - 1]); + i64 offset = len * 8 - padding; + + u16 state1, state2; + // "The first state (State1) encodes the even indexed symbols, and the + // second (State2) encodes the odd indexes. State1 is initialized first, and + // then State2, and they take turns decoding a single symbol and updating + // their state." + FSE_init_state(dtable, &state1, src, &offset); + FSE_init_state(dtable, &state2, src, &offset); + + // Decode until we overflow the stream + // Since we decode in reverse order, overflowing the stream is offset going + // negative + size_t symbols_written = 0; + while (1) { + // "The number of symbols to decode is determined by tracking bitStream + // overflow condition: If updating state after decoding a symbol would + // require more bits than remain in the stream, it is assumed the extra + // bits are 0. Then, the symbols for each of the final states are + // decoded and the process is complete." + IO_write_byte(out, FSE_decode_symbol(dtable, &state1, src, &offset)); + symbols_written++; + if (offset < 0) { + // There's still a symbol to decode in state2 + IO_write_byte(out, FSE_peek_symbol(dtable, state2)); + symbols_written++; + break; + } + + IO_write_byte(out, FSE_decode_symbol(dtable, &state2, src, &offset)); + symbols_written++; + if (offset < 0) { + // There's still a symbol to decode in state1 + IO_write_byte(out, FSE_peek_symbol(dtable, state1)); + symbols_written++; + break; + } + } + + return symbols_written; +} + +static void FSE_init_dtable(FSE_dtable *const dtable, + const i16 *const norm_freqs, const int num_symbs, + const int accuracy_log) { + if (accuracy_log > FSE_MAX_ACCURACY_LOG) { + ERROR("FSE accuracy too large"); + } + if (num_symbs > FSE_MAX_SYMBS) { + ERROR("Too many symbols for FSE"); + } + + dtable->accuracy_log = accuracy_log; + + const size_t size = (size_t)1 << accuracy_log; + dtable->symbols = malloc(size * sizeof(u8)); + dtable->num_bits = malloc(size * sizeof(u8)); + dtable->new_state_base = malloc(size * sizeof(u16)); + + if (!dtable->symbols || !dtable->num_bits || !dtable->new_state_base) { + BAD_ALLOC(); + } + + // Used to determine how many bits need to be read for each state, + // and where the destination range should start + // Needs to be u16 because max value is 2 * max number of symbols, + // which can be larger than a byte can store + u16 state_desc[FSE_MAX_SYMBS]; + + // "Symbols are scanned in their natural order for "less than 1" + // probabilities. Symbols with this probability are being attributed a + // single cell, starting from the end of the table. These symbols define a + // full state reset, reading Accuracy_Log bits." + int high_threshold = size; + for (int s = 0; s < num_symbs; s++) { + // Scan for low probability symbols to put at the top + if (norm_freqs[s] == -1) { + dtable->symbols[--high_threshold] = s; + state_desc[s] = 1; + } + } + + // "All remaining symbols are sorted in their natural order. Starting from + // symbol 0 and table position 0, each symbol gets attributed as many cells + // as its probability. Cell allocation is spread, not linear." + // Place the rest in the table + const u16 step = (size >> 1) + (size >> 3) + 3; + const u16 mask = size - 1; + u16 pos = 0; + for (int s = 0; s < num_symbs; s++) { + if (norm_freqs[s] <= 0) { + continue; + } + + state_desc[s] = norm_freqs[s]; + + for (int i = 0; i < norm_freqs[s]; i++) { + // Give `norm_freqs[s]` states to symbol s + dtable->symbols[pos] = s; + // "A position is skipped if already occupied, typically by a "less + // than 1" probability symbol." + do { + pos = (pos + step) & mask; + } while (pos >= + high_threshold); + // Note: no other collision checking is necessary as `step` is + // coprime to `size`, so the cycle will visit each position exactly + // once + } + } + if (pos != 0) { + CORRUPTION(); + } + + // Now we can fill baseline and num bits + for (size_t i = 0; i < size; i++) { + u8 symbol = dtable->symbols[i]; + u16 next_state_desc = state_desc[symbol]++; + // Fills in the table appropriately, next_state_desc increases by symbol + // over time, decreasing number of bits + dtable->num_bits[i] = (u8)(accuracy_log - highest_set_bit(next_state_desc)); + // Baseline increases until the bit threshold is passed, at which point + // it resets to 0 + dtable->new_state_base[i] = + ((u16)next_state_desc << dtable->num_bits[i]) - size; + } +} + +/// Decode an FSE header as defined in the Zstandard format specification and +/// use the decoded frequencies to initialize a decoding table. +static void FSE_decode_header(FSE_dtable *const dtable, istream_t *const in, + const int max_accuracy_log) { + // "An FSE distribution table describes the probabilities of all symbols + // from 0 to the last present one (included) on a normalized scale of 1 << + // Accuracy_Log . + // + // It's a bitstream which is read forward, in little-endian fashion. It's + // not necessary to know its exact size, since it will be discovered and + // reported by the decoding process. + if (max_accuracy_log > FSE_MAX_ACCURACY_LOG) { + ERROR("FSE accuracy too large"); + } + + // The bitstream starts by reporting on which scale it operates. + // Accuracy_Log = low4bits + 5. Note that maximum Accuracy_Log for literal + // and match lengths is 9, and for offsets is 8. Higher values are + // considered errors." + const int accuracy_log = 5 + IO_read_bits(in, 4); + if (accuracy_log > max_accuracy_log) { + ERROR("FSE accuracy too large"); + } + + // "Then follows each symbol value, from 0 to last present one. The number + // of bits used by each field is variable. It depends on : + // + // Remaining probabilities + 1 : example : Presuming an Accuracy_Log of 8, + // and presuming 100 probabilities points have already been distributed, the + // decoder may read any value from 0 to 255 - 100 + 1 == 156 (inclusive). + // Therefore, it must read log2sup(156) == 8 bits. + // + // Value decoded : small values use 1 less bit : example : Presuming values + // from 0 to 156 (inclusive) are possible, 255-156 = 99 values are remaining + // in an 8-bits field. They are used this way : first 99 values (hence from + // 0 to 98) use only 7 bits, values from 99 to 156 use 8 bits. " + + i32 remaining = 1 << accuracy_log; + i16 frequencies[FSE_MAX_SYMBS]; + + int symb = 0; + while (remaining > 0 && symb < FSE_MAX_SYMBS) { + // Log of the number of possible values we could read + int bits = highest_set_bit(remaining + 1) + 1; + + u16 val = IO_read_bits(in, bits); + + // Try to mask out the lower bits to see if it qualifies for the "small + // value" threshold + const u16 lower_mask = ((u16)1 << (bits - 1)) - 1; + const u16 threshold = ((u16)1 << bits) - 1 - (remaining + 1); + + if ((val & lower_mask) < threshold) { + IO_rewind_bits(in, 1); + val = val & lower_mask; + } else if (val > lower_mask) { + val = val - threshold; + } + + // "Probability is obtained from Value decoded by following formula : + // Proba = value - 1" + const i16 proba = (i16)val - 1; + + // "It means value 0 becomes negative probability -1. -1 is a special + // probability, which means "less than 1". Its effect on distribution + // table is described in next paragraph. For the purpose of calculating + // cumulated distribution, it counts as one." + remaining -= proba < 0 ? -proba : proba; + + frequencies[symb] = proba; + symb++; + + // "When a symbol has a probability of zero, it is followed by a 2-bits + // repeat flag. This repeat flag tells how many probabilities of zeroes + // follow the current one. It provides a number ranging from 0 to 3. If + // it is a 3, another 2-bits repeat flag follows, and so on." + if (proba == 0) { + // Read the next two bits to see how many more 0s + int repeat = IO_read_bits(in, 2); + + while (1) { + for (int i = 0; i < repeat && symb < FSE_MAX_SYMBS; i++) { + frequencies[symb++] = 0; + } + if (repeat == 3) { + repeat = IO_read_bits(in, 2); + } else { + break; + } + } + } + } + IO_align_stream(in); + + // "When last symbol reaches cumulated total of 1 << Accuracy_Log, decoding + // is complete. If the last symbol makes cumulated total go above 1 << + // Accuracy_Log, distribution is considered corrupted." + if (remaining != 0 || symb >= FSE_MAX_SYMBS) { + CORRUPTION(); + } + + // Initialize the decoding table using the determined weights + FSE_init_dtable(dtable, frequencies, symb, accuracy_log); +} + +static void FSE_init_dtable_rle(FSE_dtable *const dtable, const u8 symb) { + dtable->symbols = malloc(sizeof(u8)); + dtable->num_bits = malloc(sizeof(u8)); + dtable->new_state_base = malloc(sizeof(u16)); + + if (!dtable->symbols || !dtable->num_bits || !dtable->new_state_base) { + BAD_ALLOC(); + } + + // This setup will always have a state of 0, always return symbol `symb`, + // and never consume any bits + dtable->symbols[0] = symb; + dtable->num_bits[0] = 0; + dtable->new_state_base[0] = 0; + dtable->accuracy_log = 0; +} + +static void FSE_free_dtable(FSE_dtable *const dtable) { + free(dtable->symbols); + free(dtable->num_bits); + free(dtable->new_state_base); + memset(dtable, 0, sizeof(FSE_dtable)); +} +/******* END FSE PRIMITIVES ***************************************************/ diff --git a/lib/common/fse.h b/lib/common/fse.h new file mode 100644 index 0000000..b6c2a3e --- /dev/null +++ b/lib/common/fse.h @@ -0,0 +1,625 @@ +/* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ +#ifndef FSE_H +#define FSE_H + + +/*-***************************************** +* Dependencies +******************************************/ +#include "zstd_deps.h" /* size_t, ptrdiff_t */ + +/*-***************************************** +* FSE_PUBLIC_API : control library symbols visibility +******************************************/ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define FSE_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define FSE_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define FSE_PUBLIC_API +#endif + +/*------ Version ------*/ +#define FSE_VERSION_MAJOR 0 +#define FSE_VERSION_MINOR 9 +#define FSE_VERSION_RELEASE 0 + +#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE +#define FSE_QUOTE(str) #str +#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str) +#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION) + +#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE) +FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */ + + +/*-***************************************** +* Tool functions +******************************************/ +FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */ + +/* Error Management */ +FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */ +FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +/*-***************************************** +* FSE detailed API +******************************************/ +/*! +FSE_compress() does the following: +1. count symbol occurrence from source[] into table count[] (see hist.h) +2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog) +3. save normalized counters to memory buffer using writeNCount() +4. build encoding table 'CTable' from normalized counters +5. encode the data stream using encoding table 'CTable' + +FSE_decompress() does the following: +1. read normalized counters with readNCount() +2. build decoding table 'DTable' from normalized counters +3. decode the data stream using decoding table 'DTable' + +The following API allows targeting specific sub-functions for advanced tasks. +For example, it's possible to compress several blocks using the same 'CTable', +or to save and provide normalized distribution using external method. +*/ + +/* *** COMPRESSION *** */ + +/*! FSE_optimalTableLog(): + dynamically downsize 'tableLog' when conditions are met. + It saves CPU time, by using smaller tables, while preserving or even improving compression ratio. + @return : recommended tableLog (necessarily <= 'maxTableLog') */ +FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); + +/*! FSE_normalizeCount(): + normalize counts so that sum(count[]) == Power_of_2 (2^tableLog) + 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1). + useLowProbCount is a boolean parameter which trades off compressed size for + faster header decoding. When it is set to 1, the compressed data will be slightly + smaller. And when it is set to 0, FSE_readNCount() and FSE_buildDTable() will be + faster. If you are compressing a small amount of data (< 2 KB) then useLowProbCount=0 + is a good default, since header deserialization makes a big speed difference. + Otherwise, useLowProbCount=1 is a good default, since the speed difference is small. + @return : tableLog, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, + const unsigned* count, size_t srcSize, unsigned maxSymbolValue, unsigned useLowProbCount); + +/*! FSE_NCountWriteBound(): + Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'. + Typically useful for allocation purpose. */ +FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_writeNCount(): + Compactly save 'normalizedCounter' into 'buffer'. + @return : size of the compressed table, + or an errorCode, which can be tested using FSE_isError(). */ +FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + const short* normalizedCounter, + unsigned maxSymbolValue, unsigned tableLog); + +/*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ +typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ + +/*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). + @return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_compress_usingCTable(): + Compress `src` using `ct` into `dst` which must be already allocated. + @return : size of compressed data (<= `dstCapacity`), + or 0 if compressed data could not fit into `dst`, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct); + +/*! +Tutorial : +---------- +The first step is to count all symbols. FSE_count() does this job very fast. +Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells. +'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0] +maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value) +FSE_count() will return the number of occurrence of the most frequent symbol. +This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). + +The next step is to normalize the frequencies. +FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'. +It also guarantees a minimum of 1 to any Symbol with frequency >= 1. +You can use 'tableLog'==0 to mean "use default tableLog value". +If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(), +which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default"). + +The result of FSE_normalizeCount() will be saved into a table, +called 'normalizedCounter', which is a table of signed short. +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells. +The return value is tableLog if everything proceeded as expected. +It is 0 if there is a single symbol within distribution. +If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()). + +'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount(). +'buffer' must be already allocated. +For guaranteed success, buffer size must be at least FSE_headerBound(). +The result of the function is the number of bytes written into 'buffer'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small). + +'normalizedCounter' can then be used to create the compression table 'CTable'. +The space required by 'CTable' must be already allocated, using FSE_createCTable(). +You can then use FSE_buildCTable() to fill 'CTable'. +If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()). + +'CTable' can then be used to compress 'src', with FSE_compress_usingCTable(). +Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize' +The function returns the size of compressed data (without header), necessarily <= `dstCapacity`. +If it returns '0', compressed data could not fit into 'dst'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). +*/ + + +/* *** DECOMPRESSION *** */ + +/*! FSE_readNCount(): + Read compactly saved 'normalizedCounter' from 'rBuffer'. + @return : size read from 'rBuffer', + or an errorCode, which can be tested using FSE_isError(). + maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */ +FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize); + +/*! FSE_readNCount_bmi2(): + * Same as FSE_readNCount() but pass bmi2=1 when your CPU supports BMI2 and 0 otherwise. + */ +FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ + +/*! +Tutorial : +---------- +(Note : these functions only decompress FSE-compressed blocks. + If block is uncompressed, use memcpy() instead + If block is a single repeated byte, use memset() instead ) + +The first step is to obtain the normalized frequencies of symbols. +This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount(). +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short. +In practice, that means it's necessary to know 'maxSymbolValue' beforehand, +or size the table to handle worst case situations (typically 256). +FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'. +The result of FSE_readNCount() is the number of bytes read from 'rBuffer'. +Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that. +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'. +This is performed by the function FSE_buildDTable(). +The space required by 'FSE_DTable' must be already allocated using FSE_createDTable(). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable(). +`cSrcSize` must be strictly correct, otherwise decompression will fail. +FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small) +*/ + +#endif /* FSE_H */ + + +#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY) +#define FSE_H_FSE_STATIC_LINKING_ONLY +#include "bitstream.h" + +/* ***************************************** +* Static allocation +*******************************************/ +/* FSE buffer bounds */ +#define FSE_NCOUNTBOUND 512 +#define FSE_BLOCKBOUND(size) ((size) + ((size)>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */) +#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */ +#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<((maxTableLog)-1)) + (((maxSymbolValue)+1)*2)) +#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1<<(maxTableLog))) + +/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */ +#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue) (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable)) +#define FSE_DTABLE_SIZE(maxTableLog) (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable)) + + +/* ***************************************** + * FSE advanced API + ***************************************** */ + +unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); +/**< same as FSE_optimalTableLog(), which used `minus==2` */ + +size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); +/**< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +/* FSE_buildCTable_wksp() : + * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). + * `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`. + * See FSE_buildCTable_wksp() for breakdown of workspace usage. + */ +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* additional 8 bytes for potential table overwrite */) +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)) +size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + +#define FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) (sizeof(short) * (maxSymbolValue + 1) + (1ULL << maxTableLog) + 8) +#define FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ((FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) + sizeof(unsigned) - 1) / sizeof(unsigned)) +FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +/**< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) +#define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. + * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + +typedef enum { + FSE_repeat_none, /**< Cannot use the previous table */ + FSE_repeat_check, /**< Can use the previous table but it must be checked */ + FSE_repeat_valid /**< Can use the previous table and it is assumed to be valid */ + } FSE_repeat; + +/* ***************************************** +* FSE symbol compression API +*******************************************/ +/*! + This API consists of small unitary functions, which highly benefit from being inlined. + Hence their body are included in next section. +*/ +typedef struct { + ptrdiff_t value; + const void* stateTable; + const void* symbolTT; + unsigned stateLog; +} FSE_CState_t; + +static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct); + +static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol); + +static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr); + +/**< +These functions are inner components of FSE_compress_usingCTable(). +They allow the creation of custom streams, mixing multiple tables and bit sources. + +A key property to keep in mind is that encoding and decoding are done **in reverse direction**. +So the first symbol you will encode is the last you will decode, like a LIFO stack. + +You will need a few variables to track your CStream. They are : + +FSE_CTable ct; // Provided by FSE_buildCTable() +BIT_CStream_t bitStream; // bitStream tracking structure +FSE_CState_t state; // State tracking structure (can have several) + + +The first thing to do is to init bitStream and state. + size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize); + FSE_initCState(&state, ct); + +Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError(); +You can then encode your input data, byte after byte. +FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time. +Remember decoding will be done in reverse direction. + FSE_encodeByte(&bitStream, &state, symbol); + +At any time, you can also add any bit sequence. +Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders + BIT_addBits(&bitStream, bitField, nbBits); + +The above methods don't commit data to memory, they just store it into local register, for speed. +Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). +Writing data to memory is a manual operation, performed by the flushBits function. + BIT_flushBits(&bitStream); + +Your last FSE encoding operation shall be to flush your last state value(s). + FSE_flushState(&bitStream, &state); + +Finally, you must close the bitStream. +The function returns the size of CStream in bytes. +If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible) +If there is an error, it returns an errorCode (which can be tested using FSE_isError()). + size_t size = BIT_closeCStream(&bitStream); +*/ + + +/* ***************************************** +* FSE symbol decompression API +*******************************************/ +typedef struct { + size_t state; + const void* table; /* precise table may vary, depending on U16 */ +} FSE_DState_t; + + +static void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt); + +static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD); + +static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr); + +/**< +Let's now decompose FSE_decompress_usingDTable() into its unitary components. +You will decode FSE-encoded symbols from the bitStream, +and also any other bitFields you put in, **in reverse order**. + +You will need a few variables to track your bitStream. They are : + +BIT_DStream_t DStream; // Stream context +FSE_DState_t DState; // State context. Multiple ones are possible +FSE_DTable* DTablePtr; // Decoding table, provided by FSE_buildDTable() + +The first thing to do is to init the bitStream. + errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize); + +You should then retrieve your initial state(s) +(in reverse flushing order if you have several ones) : + errorCode = FSE_initDState(&DState, &DStream, DTablePtr); + +You can then decode your data, symbol after symbol. +For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'. +Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out). + unsigned char symbol = FSE_decodeSymbol(&DState, &DStream); + +You can retrieve any bitfield you eventually stored into the bitStream (in reverse order) +Note : maximum allowed nbBits is 25, for 32-bits compatibility + size_t bitField = BIT_readBits(&DStream, nbBits); + +All above operations only read from local register (which size depends on size_t). +Refueling the register from memory is manually performed by the reload method. + endSignal = FSE_reloadDStream(&DStream); + +BIT_reloadDStream() result tells if there is still some more data to read from DStream. +BIT_DStream_unfinished : there is still some data left into the DStream. +BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled. +BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed. +BIT_DStream_tooFar : Dstream went too far. Decompression result is corrupted. + +When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop, +to properly detect the exact end of stream. +After each decoded symbol, check if DStream is fully consumed using this simple test : + BIT_reloadDStream(&DStream) >= BIT_DStream_completed + +When it's done, verify decompression is fully completed, by checking both DStream and the relevant states. +Checking if DStream has reached its end is performed by : + BIT_endOfDStream(&DStream); +Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible. + FSE_endOfDState(&DState); +*/ + + +/* ***************************************** +* FSE unsafe API +*******************************************/ +static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD); +/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */ + + +/* ***************************************** +* Implementation of inlined functions +*******************************************/ +typedef struct { + int deltaFindState; + U32 deltaNbBits; +} FSE_symbolCompressionTransform; /* total 8 bytes */ + +MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct) +{ + const void* ptr = ct; + const U16* u16ptr = (const U16*) ptr; + const U32 tableLog = MEM_read16(ptr); + statePtr->value = (ptrdiff_t)1<stateTable = u16ptr+2; + statePtr->symbolTT = ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1); + statePtr->stateLog = tableLog; +} + + +/*! FSE_initCState2() : +* Same as FSE_initCState(), but the first symbol to include (which will be the last to be read) +* uses the smallest state value possible, saving the cost of this symbol */ +MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol) +{ + FSE_initCState(statePtr, ct); + { const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* stateTable = (const U16*)(statePtr->stateTable); + U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16); + statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits; + statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } +} + +MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol) +{ + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); + BIT_addBits(bitC, (BitContainerType)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; +} + +MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) +{ + BIT_addBits(bitC, (BitContainerType)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); +} + + +/* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. + * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16; +} + +/* FSE_bitCost() : + * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16; + U32 const threshold = (minNbBits+1) << 16; + assert(tableLog < 16); + assert(accuracyLog < 31-tableLog); /* ensure enough room for renormalization double shift */ + { U32 const tableSize = 1 << tableLog; + U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize); + U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog; /* linear interpolation (very approximate) */ + U32 const bitMultiplier = 1 << accuracyLog; + assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold); + assert(normalizedDeltaFromThreshold <= bitMultiplier); + return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold; + } +} + + +/* ====== Decompression ====== */ + +typedef struct { + U16 tableLog; + U16 fastMode; +} FSE_DTableHeader; /* sizeof U32 */ + +typedef struct +{ + unsigned short newState; + unsigned char symbol; + unsigned char nbBits; +} FSE_decode_t; /* size == U32 */ + +MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt) +{ + const void* ptr = dt; + const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + return DInfo.symbol; +} + +MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.newState + lowBits; +} + +MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + BYTE const symbol = DInfo.symbol; + size_t const lowBits = BIT_readBits(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +/*! FSE_decodeSymbolFast() : + unsafe, only works if no symbol has a probability > 50% */ +MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + BYTE const symbol = DInfo.symbol; + size_t const lowBits = BIT_readBitsFast(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) +{ + return DStatePtr->state == 0; +} + + + +#ifndef FSE_COMMONDEFS_ONLY + +/* ************************************************************** +* Tuning parameters +****************************************************************/ +/*!MEMORY_USAGE : +* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +* Increasing memory usage improves compression ratio +* Reduced memory usage can improve speed, due to cache effect +* Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ +#ifndef FSE_MAX_MEMORY_USAGE +# define FSE_MAX_MEMORY_USAGE 14 +#endif +#ifndef FSE_DEFAULT_MEMORY_USAGE +# define FSE_DEFAULT_MEMORY_USAGE 13 +#endif +#if (FSE_DEFAULT_MEMORY_USAGE > FSE_MAX_MEMORY_USAGE) +# error "FSE_DEFAULT_MEMORY_USAGE must be <= FSE_MAX_MEMORY_USAGE" +#endif + +/*!FSE_MAX_SYMBOL_VALUE : +* Maximum symbol value authorized. +* Required for proper stack allocation */ +#ifndef FSE_MAX_SYMBOL_VALUE +# define FSE_MAX_SYMBOL_VALUE 255 +#endif + +/* ************************************************************** +* template functions type & suffix +****************************************************************/ +#define FSE_FUNCTION_TYPE BYTE +#define FSE_FUNCTION_EXTENSION +#define FSE_DECODE_TYPE FSE_decode_t + + +#endif /* !FSE_COMMONDEFS_ONLY */ + + +/* *************************************************************** +* Constants +*****************************************************************/ +#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE-2) +#define FSE_MAX_TABLESIZE (1U< FSE_TABLELOG_ABSOLUTE_MAX +# error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported" +#endif + +#define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3) + +#endif /* FSE_STATIC_LINKING_ONLY */ diff --git a/lib/common/threading.c b/lib/common/threading.c new file mode 100644 index 0000000..25bb8b9 --- /dev/null +++ b/lib/common/threading.c @@ -0,0 +1,182 @@ +/** + * Copyright (c) 2016 Tino Reichardt + * All rights reserved. + * + * You can contact the author at: + * - zstdmt source repository: https://github.com/mcmilk/zstdmt + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/** + * This file will hold wrapper for systems, which do not support pthreads + */ + +#include "threading.h" + +/* create fake symbol to avoid empty translation unit warning */ +int g_ZSTD_threading_useless_symbol; + +#if defined(ZSTD_MULTITHREAD) && defined(_WIN32) + +/** + * Windows minimalist Pthread Wrapper + */ + + +/* === Dependencies === */ +#include +#include + + +/* === Implementation === */ + +typedef struct { + void* (*start_routine)(void*); + void* arg; + int initialized; + ZSTD_pthread_cond_t initialized_cond; + ZSTD_pthread_mutex_t initialized_mutex; +} ZSTD_thread_params_t; + +static unsigned __stdcall worker(void *arg) +{ + void* (*start_routine)(void*); + void* thread_arg; + + /* Initialized thread_arg and start_routine and signal main thread that we don't need it + * to wait any longer. + */ + { + ZSTD_thread_params_t* thread_param = (ZSTD_thread_params_t*)arg; + thread_arg = thread_param->arg; + start_routine = thread_param->start_routine; + + /* Signal main thread that we are running and do not depend on its memory anymore */ + ZSTD_pthread_mutex_lock(&thread_param->initialized_mutex); + thread_param->initialized = 1; + ZSTD_pthread_cond_signal(&thread_param->initialized_cond); + ZSTD_pthread_mutex_unlock(&thread_param->initialized_mutex); + } + + start_routine(thread_arg); + + return 0; +} + +int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused, + void* (*start_routine) (void*), void* arg) +{ + ZSTD_thread_params_t thread_param; + (void)unused; + + if (thread==NULL) return -1; + *thread = NULL; + + thread_param.start_routine = start_routine; + thread_param.arg = arg; + thread_param.initialized = 0; + + /* Setup thread initialization synchronization */ + if(ZSTD_pthread_cond_init(&thread_param.initialized_cond, NULL)) { + /* Should never happen on Windows */ + return -1; + } + if(ZSTD_pthread_mutex_init(&thread_param.initialized_mutex, NULL)) { + /* Should never happen on Windows */ + ZSTD_pthread_cond_destroy(&thread_param.initialized_cond); + return -1; + } + + /* Spawn thread */ + *thread = (HANDLE)_beginthreadex(NULL, 0, worker, &thread_param, 0, NULL); + if (*thread==NULL) { + ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex); + ZSTD_pthread_cond_destroy(&thread_param.initialized_cond); + return errno; + } + + /* Wait for thread to be initialized */ + ZSTD_pthread_mutex_lock(&thread_param.initialized_mutex); + while(!thread_param.initialized) { + ZSTD_pthread_cond_wait(&thread_param.initialized_cond, &thread_param.initialized_mutex); + } + ZSTD_pthread_mutex_unlock(&thread_param.initialized_mutex); + ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex); + ZSTD_pthread_cond_destroy(&thread_param.initialized_cond); + + return 0; +} + +int ZSTD_pthread_join(ZSTD_pthread_t thread) +{ + DWORD result; + + if (!thread) return 0; + + result = WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); + + switch (result) { + case WAIT_OBJECT_0: + return 0; + case WAIT_ABANDONED: + return EINVAL; + default: + return GetLastError(); + } +} + +#endif /* ZSTD_MULTITHREAD */ + +#if defined(ZSTD_MULTITHREAD) && DEBUGLEVEL >= 1 && !defined(_WIN32) + +#define ZSTD_DEPS_NEED_MALLOC +#include "zstd_deps.h" + +int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr) +{ + assert(mutex != NULL); + *mutex = (pthread_mutex_t*)ZSTD_malloc(sizeof(pthread_mutex_t)); + if (!*mutex) + return 1; + return pthread_mutex_init(*mutex, attr); +} + +int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex) +{ + assert(mutex != NULL); + if (!*mutex) + return 0; + { + int const ret = pthread_mutex_destroy(*mutex); + ZSTD_free(*mutex); + return ret; + } +} + +int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr) +{ + assert(cond != NULL); + *cond = (pthread_cond_t*)ZSTD_malloc(sizeof(pthread_cond_t)); + if (!*cond) + return 1; + return pthread_cond_init(*cond, attr); +} + +int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond) +{ + assert(cond != NULL); + if (!*cond) + return 0; + { + int const ret = pthread_cond_destroy(*cond); + ZSTD_free(*cond); + return ret; + } +} + +#endif diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c new file mode 100644 index 0000000..1ce3cf1 --- /dev/null +++ b/lib/compress/fse_compress.c @@ -0,0 +1,625 @@ +/* ****************************************************************** + * FSE : Finite State Entropy encoder + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* ************************************************************** +* Includes +****************************************************************/ +#include "../common/compiler.h" +#include "../common/mem.h" /* U32, U16, etc. */ +#include "../common/debug.h" /* assert, DEBUGLOG */ +#include "hist.h" /* HIST_count_wksp */ +#include "../common/bitstream.h" +#define FSE_STATIC_LINKING_ONLY +#include "../common/fse.h" +#include "../common/error_private.h" +#define ZSTD_DEPS_NEED_MALLOC +#define ZSTD_DEPS_NEED_MATH64 +#include "../common/zstd_deps.h" /* ZSTD_memset */ +#include "../common/bits.h" /* ZSTD_highbit32 */ + + +/* ************************************************************** +* Error Management +****************************************************************/ +#define FSE_isError ERR_isError + + +/* ************************************************************** +* Templates +****************************************************************/ +/* + designed to be included + for type-specific functions (template emulation in C) + Objective is to write these functions only once, for improved maintenance +*/ + +/* safety checks */ +#ifndef FSE_FUNCTION_EXTENSION +# error "FSE_FUNCTION_EXTENSION must be defined" +#endif +#ifndef FSE_FUNCTION_TYPE +# error "FSE_FUNCTION_TYPE must be defined" +#endif + +/* Function names */ +#define FSE_CAT(X,Y) X##Y +#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) +#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + + +/* Function templates */ + +/* FSE_buildCTable_wksp() : + * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). + * wkspSize should be sized to handle worst case situation, which is `1<>1 : 1) ; + FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); + U32 const step = FSE_TABLESTEP(tableSize); + U32 const maxSV1 = maxSymbolValue+1; + + U16* cumul = (U16*)workSpace; /* size = maxSV1 */ + FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1)); /* size = tableSize */ + + U32 highThreshold = tableSize-1; + + assert(((size_t)workSpace & 1) == 0); /* Must be 2 bytes-aligned */ + if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge); + /* CTable header */ + tableU16[-2] = (U16) tableLog; + tableU16[-1] = (U16) maxSymbolValue; + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : + * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ + #endif + + /* symbol start positions */ + { U32 u; + cumul[0] = 0; + for (u=1; u <= maxSV1; u++) { + if (normalizedCounter[u-1]==-1) { /* Low proba symbol */ + cumul[u] = cumul[u-1] + 1; + tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1); + } else { + assert(normalizedCounter[u-1] >= 0); + cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1]; + assert(cumul[u] >= cumul[u-1]); /* no overflow */ + } } + cumul[maxSV1] = (U16)(tableSize+1); + } + + /* Spread symbols */ + if (highThreshold == tableSize - 1) { + /* Case for no low prob count symbols. Lay down 8 bytes at a time + * to reduce branch misses since we are operating on a small block + */ + BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */ + { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s=0); + pos += (size_t)n; + } + } + /* Spread symbols across the table. Lack of lowprob symbols means that + * we don't need variable sized inner loop, so we can unroll the loop and + * reduce branch misses. + */ + { size_t position = 0; + size_t s; + size_t const unroll = 2; /* Experimentally determined optimal unroll */ + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ + for (s = 0; s < (size_t)tableSize; s += unroll) { + size_t u; + for (u = 0; u < unroll; ++u) { + size_t const uPosition = (position + (u * step)) & tableMask; + tableSymbol[uPosition] = spread[s + u]; + } + position = (position + (unroll * step)) & tableMask; + } + assert(position == 0); /* Must have initialized all positions */ + } + } else { + U32 position = 0; + U32 symbol; + for (symbol=0; symbol highThreshold) + position = (position + step) & tableMask; /* Low proba area */ + } } + assert(position==0); /* Must have initialized all positions */ + } + + /* Build table */ + { U32 u; for (u=0; u 1); + { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); + total += (unsigned)normalizedCounter[s]; + } } } } + +#if 0 /* debug : symbol costs */ + DEBUGLOG(5, "\n --- table statistics : "); + { U32 symbol; + for (symbol=0; symbol<=maxSymbolValue; symbol++) { + DEBUGLOG(5, "%3u: w=%3i, maxBits=%u, fracBits=%.2f", + symbol, normalizedCounter[symbol], + FSE_getMaxNbBits(symbolTT, symbol), + (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256); + } } +#endif + + return 0; +} + + + +#ifndef FSE_COMMONDEFS_ONLY + +/*-************************************************************** +* FSE NCount encoding +****************************************************************/ +size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) +{ + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) + + 1 /* round up to whole nb bytes */ + + 2 /* additional two bytes for bitstream flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ +} + +static size_t +FSE_writeNCount_generic (void* header, size_t headerBufferSize, + const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, + unsigned writeIsSafe) +{ + BYTE* const ostart = (BYTE*) header; + BYTE* out = ostart; + BYTE* const oend = ostart + headerBufferSize; + int nbBits; + const int tableSize = 1 << tableLog; + int remaining; + int threshold; + U32 bitStream = 0; + int bitCount = 0; + unsigned symbol = 0; + unsigned const alphabetSize = maxSymbolValue + 1; + int previousIs0 = 0; + + /* Table Size */ + bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount; + bitCount += 4; + + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; + nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { + unsigned start = symbol; + while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++; + if (symbol == alphabetSize) break; /* incorrect distribution */ + while (symbol >= start+24) { + start+=24; + bitStream += 0xFFFFU << bitCount; + if ((!writeIsSafe) && (out > oend-2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE) bitStream; + out[1] = (BYTE)(bitStream>>8); + out+=2; + bitStream>>=16; + } + while (symbol >= start+3) { + start+=3; + bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; + bitCount += 2; + if (bitCount>16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream>>8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } } + { int count = normalizedCounter[symbol++]; + int const max = (2*threshold-1) - remaining; + remaining -= count < 0 ? -count : count; + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ + bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>=1; } + } + if (bitCount>16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream>>8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } } + + if (remaining != 1) + return ERROR(GENERIC); /* incorrect normalized distribution */ + assert(symbol <= alphabetSize); + + /* flush remaining bitStream */ + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream>>8); + out+= (bitCount+7) /8; + + assert(out >= ostart); + return (size_t)(out-ostart); +} + + +size_t FSE_writeNCount (void* buffer, size_t bufferSize, + const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +{ + if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported */ + if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported */ + + if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog)) + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0); + + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */); +} + + +/*-************************************************************** +* FSE Compression Code +****************************************************************/ + +/* provides the minimum logSize to safely represent a distribution */ +static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) +{ + U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; + U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +} + +unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) +{ + U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; + if (maxBitsSrc < tableLog) tableLog = maxBitsSrc; /* Accuracy can be reduced */ + if (minBits > tableLog) tableLog = minBits; /* Need a minimum to safely represent all symbol values */ + if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG; + if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG; + return tableLog; +} + +unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +{ + return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2); +} + +/* Secondary normalization method. + To be used when primary method fails. */ + +static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue, short lowProbCount) +{ + short const NOT_YET_ASSIGNED = -2; + U32 s; + U32 distributed = 0; + U32 ToDistribute; + + /* Init */ + U32 const lowThreshold = (U32)(total >> tableLog); + U32 lowOne = (U32)((total * 3) >> (tableLog + 1)); + + for (s=0; s<=maxSymbolValue; s++) { + if (count[s] == 0) { + norm[s]=0; + continue; + } + if (count[s] <= lowThreshold) { + norm[s] = lowProbCount; + distributed++; + total -= count[s]; + continue; + } + if (count[s] <= lowOne) { + norm[s] = 1; + distributed++; + total -= count[s]; + continue; + } + + norm[s]=NOT_YET_ASSIGNED; + } + ToDistribute = (1 << tableLog) - distributed; + + if (ToDistribute == 0) + return 0; + + if ((total / ToDistribute) > lowOne) { + /* risk of rounding to zero */ + lowOne = (U32)((total * 3) / (ToDistribute * 2)); + for (s=0; s<=maxSymbolValue; s++) { + if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) { + norm[s] = 1; + distributed++; + total -= count[s]; + continue; + } } + ToDistribute = (1 << tableLog) - distributed; + } + + if (distributed == maxSymbolValue+1) { + /* all values are pretty poor; + probably incompressible data (should have already been detected); + find max, then give all remaining points to max */ + U32 maxV = 0, maxC = 0; + for (s=0; s<=maxSymbolValue; s++) + if (count[s] > maxC) { maxV=s; maxC=count[s]; } + norm[maxV] += (short)ToDistribute; + return 0; + } + + if (total == 0) { + /* all of the symbols were low enough for the lowOne or lowThreshold */ + for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1)) + if (norm[s] > 0) { ToDistribute--; norm[s]++; } + return 0; + } + + { U64 const vStepLog = 62 - tableLog; + U64 const mid = (1ULL << (vStepLog-1)) - 1; + U64 const rStep = ZSTD_div64((((U64)1<> vStepLog); + U32 const sEnd = (U32)(end >> vStepLog); + U32 const weight = sEnd - sStart; + if (weight < 1) + return ERROR(GENERIC); + norm[s] = (short)weight; + tmpTotal = end; + } } } + + return 0; +} + +size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + const unsigned* count, size_t total, + unsigned maxSymbolValue, unsigned useLowProbCount) +{ + /* Sanity checks */ + if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG; + if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported size */ + if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported size */ + if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */ + + { static U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 }; + short const lowProbCount = useLowProbCount ? -1 : 1; + U64 const scale = 62 - tableLog; + U64 const step = ZSTD_div64((U64)1<<62, (U32)total); /* <== here, one division ! */ + U64 const vStep = 1ULL<<(scale-20); + int stillToDistribute = 1<> tableLog); + + for (s=0; s<=maxSymbolValue; s++) { + if (count[s] == total) return 0; /* rle special case */ + if (count[s] == 0) { normalizedCounter[s]=0; continue; } + if (count[s] <= lowThreshold) { + normalizedCounter[s] = lowProbCount; + stillToDistribute--; + } else { + short proba = (short)((count[s]*step) >> scale); + if (proba<8) { + U64 restToBeat = vStep * rtbTable[proba]; + proba += (count[s]*step) - ((U64)proba< restToBeat; + } + if (proba > largestP) { largestP=proba; largest=s; } + normalizedCounter[s] = proba; + stillToDistribute -= proba; + } } + if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) { + /* corner case, need another normalization method */ + size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue, lowProbCount); + if (FSE_isError(errorCode)) return errorCode; + } + else normalizedCounter[largest] += (short)stillToDistribute; + } + +#if 0 + { /* Print Table (debug) */ + U32 s; + U32 nTotal = 0; + for (s=0; s<=maxSymbolValue; s++) + RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]); + for (s=0; s<=maxSymbolValue; s++) + nTotal += abs(normalizedCounter[s]); + if (nTotal != (1U< FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) { /* test bit 2 */ + FSE_encodeSymbol(&bitC, &CState2, *--ip); + FSE_encodeSymbol(&bitC, &CState1, *--ip); + FSE_FLUSHBITS(&bitC); + } + + /* 2 or 4 encoding per loop */ + while ( ip>istart ) { + + FSE_encodeSymbol(&bitC, &CState2, *--ip); + + if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */ + FSE_FLUSHBITS(&bitC); + + FSE_encodeSymbol(&bitC, &CState1, *--ip); + + if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) { /* this test must be static */ + FSE_encodeSymbol(&bitC, &CState2, *--ip); + FSE_encodeSymbol(&bitC, &CState1, *--ip); + } + + FSE_FLUSHBITS(&bitC); + } + + FSE_flushCState(&bitC, &CState2); + FSE_flushCState(&bitC, &CState1); + return BIT_closeCStream(&bitC); +} + +size_t FSE_compress_usingCTable (void* dst, size_t dstSize, + const void* src, size_t srcSize, + const FSE_CTable* ct) +{ + unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize)); + + if (fast) + return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1); + else + return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0); +} + + +size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); } + +#endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/compress/hist.c b/lib/compress/hist.c new file mode 100644 index 0000000..4ccf9a9 --- /dev/null +++ b/lib/compress/hist.c @@ -0,0 +1,191 @@ +/* ****************************************************************** + * hist : Histogram functions + * part of Finite State Entropy project + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* --- dependencies --- */ +#include "../common/mem.h" /* U32, BYTE, etc. */ +#include "../common/debug.h" /* assert, DEBUGLOG */ +#include "../common/error_private.h" /* ERROR */ +#include "hist.h" + + +/* --- Error management --- */ +unsigned HIST_isError(size_t code) { return ERR_isError(code); } + +/*-************************************************************** + * Histogram functions + ****************************************************************/ +void HIST_add(unsigned* count, const void* src, size_t srcSize) +{ + const BYTE* ip = (const BYTE*)src; + const BYTE* const end = ip + srcSize; + + while (ip largestCount) largestCount = count[s]; + } + + return largestCount; +} + +typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e; + +/* HIST_count_parallel_wksp() : + * store histogram into 4 intermediate tables, recombined at the end. + * this design makes better use of OoO cpus, + * and is noticeably faster when some values are heavily repeated. + * But it needs some additional workspace for intermediate tables. + * `workSpace` must be a U32 table of size >= HIST_WKSP_SIZE_U32. + * @return : largest histogram frequency, + * or an error code (notably when histogram's alphabet is larger than *maxSymbolValuePtr) */ +static size_t HIST_count_parallel_wksp( + unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + HIST_checkInput_e check, + U32* const workSpace) +{ + const BYTE* ip = (const BYTE*)source; + const BYTE* const iend = ip+sourceSize; + size_t const countSize = (*maxSymbolValuePtr + 1) * sizeof(*count); + unsigned max=0; + U32* const Counting1 = workSpace; + U32* const Counting2 = Counting1 + 256; + U32* const Counting3 = Counting2 + 256; + U32* const Counting4 = Counting3 + 256; + + /* safety checks */ + assert(*maxSymbolValuePtr <= 255); + if (!sourceSize) { + ZSTD_memset(count, 0, countSize); + *maxSymbolValuePtr = 0; + return 0; + } + ZSTD_memset(workSpace, 0, 4*256*sizeof(unsigned)); + + /* by stripes of 16 bytes */ + { U32 cached = MEM_read32(ip); ip += 4; + while (ip < iend-15) { + U32 c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + c = cached; cached = MEM_read32(ip); ip += 4; + Counting1[(BYTE) c ]++; + Counting2[(BYTE)(c>>8) ]++; + Counting3[(BYTE)(c>>16)]++; + Counting4[ c>>24 ]++; + } + ip-=4; + } + + /* finish last symbols */ + while (ip max) max = Counting1[s]; + } } + + { unsigned maxSymbolValue = 255; + while (!Counting1[maxSymbolValue]) maxSymbolValue--; + if (check && maxSymbolValue > *maxSymbolValuePtr) return ERROR(maxSymbolValue_tooSmall); + *maxSymbolValuePtr = maxSymbolValue; + ZSTD_memmove(count, Counting1, countSize); /* in case count & Counting1 are overlapping */ + } + return (size_t)max; +} + +/* HIST_countFast_wksp() : + * Same as HIST_countFast(), but using an externally provided scratch buffer. + * `workSpace` is a writable buffer which must be 4-bytes aligned, + * `workSpaceSize` must be >= HIST_WKSP_SIZE + */ +size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + void* workSpace, size_t workSpaceSize) +{ + if (sourceSize < 1500) /* heuristic threshold */ + return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize); + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall); + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace); +} + +/* HIST_count_wksp() : + * Same as HIST_count(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */ +size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + void* workSpace, size_t workSpaceSize) +{ + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall); + if (*maxSymbolValuePtr < 255) + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace); + *maxSymbolValuePtr = 255; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize); +} + +#ifndef ZSTD_NO_UNUSED_FUNCTIONS +/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters)); +} + +size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters)); +} +#endif diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c new file mode 100644 index 0000000..6f2194e --- /dev/null +++ b/lib/compress/huf_compress.c @@ -0,0 +1,1465 @@ +/* ****************************************************************** + * Huffman encoder, part of New Generation Entropy library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* ************************************************************** +* Compiler specifics +****************************************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + + +/* ************************************************************** +* Includes +****************************************************************/ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */ +#include "../common/compiler.h" +#include "../common/bitstream.h" +#include "hist.h" +#define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */ +#include "../common/fse.h" /* header compression */ +#include "../common/huf.h" +#include "../common/error_private.h" +#include "../common/bits.h" /* ZSTD_highbit32 */ + + +/* ************************************************************** +* Error Management +****************************************************************/ +#define HUF_isError ERR_isError +#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */ + + +/* ************************************************************** +* Required declarations +****************************************************************/ +typedef struct nodeElt_s { + U32 count; + U16 parent; + BYTE byte; + BYTE nbBits; +} nodeElt; + + +/* ************************************************************** +* Debug Traces +****************************************************************/ + +#if DEBUGLEVEL >= 2 + +static size_t showU32(const U32* arr, size_t size) +{ + size_t u; + for (u=0; u= add) { + assert(add < align); + assert(((size_t)aligned & mask) == 0); + *workspaceSizePtr -= add; + return aligned; + } else { + *workspaceSizePtr = 0; + return NULL; + } +} + + +/* HUF_compressWeights() : + * Same as FSE_compress(), but dedicated to huff0's weights compression. + * The use case needs much less stack memory. + * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX. + */ +#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6 + +typedef struct { + FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)]; + U32 scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)]; + unsigned count[HUF_TABLELOG_MAX+1]; + S16 norm[HUF_TABLELOG_MAX+1]; +} HUF_CompressWeightsWksp; + +static size_t +HUF_compressWeights(void* dst, size_t dstSize, + const void* weightTable, size_t wtSize, + void* workspace, size_t workspaceSize) +{ + BYTE* const ostart = (BYTE*) dst; + BYTE* op = ostart; + BYTE* const oend = ostart + dstSize; + + unsigned maxSymbolValue = HUF_TABLELOG_MAX; + U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; + HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); + + if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC); + + /* init conditions */ + if (wtSize <= 1) return 0; /* Not compressible */ + + /* Scan input and build symbol stats */ + { unsigned const maxCount = HIST_count_simple(wksp->count, &maxSymbolValue, weightTable, wtSize); /* never fails */ + if (maxCount == wtSize) return 1; /* only a single symbol in src : rle */ + if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */ + } + + tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue); + CHECK_F( FSE_normalizeCount(wksp->norm, tableLog, wksp->count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) ); + + /* Write table description header */ + { CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), wksp->norm, maxSymbolValue, tableLog) ); + op += hSize; + } + + /* Compress */ + CHECK_F( FSE_buildCTable_wksp(wksp->CTable, wksp->norm, maxSymbolValue, tableLog, wksp->scratchBuffer, sizeof(wksp->scratchBuffer)) ); + { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, wksp->CTable) ); + if (cSize == 0) return 0; /* not enough space for compressed data */ + op += cSize; + } + + return (size_t)(op-ostart); +} + +static size_t HUF_getNbBits(HUF_CElt elt) +{ + return elt & 0xFF; +} + +static size_t HUF_getNbBitsFast(HUF_CElt elt) +{ + return elt; +} + +static size_t HUF_getValue(HUF_CElt elt) +{ + return elt & ~(size_t)0xFF; +} + +static size_t HUF_getValueFast(HUF_CElt elt) +{ + return elt; +} + +static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) +{ + assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX); + *elt = nbBits; +} + +static void HUF_setValue(HUF_CElt* elt, size_t value) +{ + size_t const nbBits = HUF_getNbBits(*elt); + if (nbBits > 0) { + assert((value >> nbBits) == 0); + *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits); + } +} + +HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable) +{ + HUF_CTableHeader header; + ZSTD_memcpy(&header, ctable, sizeof(header)); + return header; +} + +static void HUF_writeCTableHeader(HUF_CElt* ctable, U32 tableLog, U32 maxSymbolValue) +{ + HUF_CTableHeader header; + HUF_STATIC_ASSERT(sizeof(ctable[0]) == sizeof(header)); + ZSTD_memset(&header, 0, sizeof(header)); + assert(tableLog < 256); + header.tableLog = (BYTE)tableLog; + assert(maxSymbolValue < 256); + header.maxSymbolValue = (BYTE)maxSymbolValue; + ZSTD_memcpy(ctable, &header, sizeof(header)); +} + +typedef struct { + HUF_CompressWeightsWksp wksp; + BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */ + BYTE huffWeight[HUF_SYMBOLVALUE_MAX]; +} HUF_WriteCTableWksp; + +size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, + void* workspace, size_t workspaceSize) +{ + HUF_CElt const* const ct = CTable + 1; + BYTE* op = (BYTE*)dst; + U32 n; + HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); + + HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp)); + + assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); + assert(HUF_readCTableHeader(CTable).tableLog == huffLog); + + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); + + /* convert to weight */ + wksp->bitsToWeight[0] = 0; + for (n=1; nbitsToWeight[n] = (BYTE)(huffLog + 1 - n); + for (n=0; nhuffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])]; + + /* attempt weights compression by FSE */ + if (maxDstSize < 1) return ERROR(dstSize_tooSmall); + { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) ); + if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */ + op[0] = (BYTE)hSize; + return hSize+1; + } } + + /* write raw values as 4-bits (max : 15) */ + if (maxSymbolValue > (256-128)) return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */ + if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */ + op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1)); + wksp->huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */ + for (n=0; nhuffWeight[n] << 4) + wksp->huffWeight[n+1]); + return ((maxSymbolValue+1)/2) + 1; +} + + +size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) +{ + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; /* init not required, even though some static analyzer may complain */ + U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */ + U32 tableLog = 0; + U32 nbSymbols = 0; + HUF_CElt* const ct = CTable + 1; + + /* get symbol weights */ + CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize)); + *hasZeroWeights = (rankVal[0] > 0); + + /* check result */ + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + + *maxSymbolValuePtr = nbSymbols - 1; + + HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; + for (n=1; n<=tableLog; n++) { + U32 curr = nextRankStart; + nextRankStart += (rankVal[n] << (n-1)); + rankVal[n] = curr; + } } + + /* fill nbBits */ + { U32 n; for (n=0; nn=tableLog+1 */ + U16 valPerRank[HUF_TABLELOG_MAX+2] = {0}; + { U32 n; for (n=0; n0; n--) { /* start at n=tablelog <-> w=1 */ + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } } + /* assign value within rank, symbol order */ + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) + return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); +} + + +/** + * HUF_setMaxHeight(): + * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * + * It attempts to convert all nodes with nbBits > @targetNbBits + * to employ @targetNbBits instead. Then it adjusts the tree + * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, + * where largestBits is the return value (expected <= targetNbBits). + * + * @param huffNode The Huffman tree modified in place to enforce targetNbBits. + * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. + * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will + * respect targetNbBits. + * @return The maximum number of bits of the Huffman tree after adjustment. + */ +static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) +{ + const U32 largestBits = huffNode[lastNonNull].nbBits; + /* early exit : no elt > targetNbBits, so the tree is already valid. */ + if (largestBits <= targetNbBits) return largestBits; + + DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; + const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + + /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ + while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); + huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } + /* n stops at huffNode[n].nbBits <= targetNbBits */ + assert(huffNode[n].nbBits <= targetNbBits); + /* n end at index of smallest symbol using < targetNbBits */ + while (huffNode[n].nbBits == targetNbBits) --n; + + /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ + assert(((U32)totalCost & (baseCost - 1)) == 0); + totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ + { U32 const noSymbol = 0xF0F0F0F0; + U32 rankLast[HUF_TABLELOG_MAX+2]; + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); + { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; + currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ + rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ + U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + assert(nBitsToDecrease <= HUF_TABLELOG_MAX+1); + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; + if (highPos == noSymbol) continue; + /* Decrease highPos if no symbols of lowPos or if it is + * not cheaper to remove 2 lowPos than highPos. + */ + if (lowPos == noSymbol) break; + { U32 const highTotal = huffNode[highPos].count; + U32 const lowTotal = 2 * huffNode[lowPos].count; + if (highTotal <= lowTotal) break; + } } + /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */ + assert(rankLast[nBitsToDecrease] != noSymbol || nBitsToDecrease == 1); + /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ + while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) + nBitsToDecrease++; + assert(rankLast[nBitsToDecrease] != noSymbol); + /* Increase the number of bits to gain back half the rank cost. */ + totalCost -= 1 << (nBitsToDecrease-1); + huffNode[rankLast[nBitsToDecrease]].nbBits++; + + /* Fix up the new rank. + * If the new rank was empty, this symbol is now its smallest. + * Otherwise, this symbol will be the largest in the new rank so no adjustment. + */ + if (rankLast[nBitsToDecrease-1] == noSymbol) + rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; + /* Fix up the old rank. + * If the symbol was at position 0, meaning it was the highest weight symbol in the tree, + * it must be the only symbol in its rank, so the old rank now has no symbols. + * Otherwise, since the Huffman nodes are sorted by count, the previous position is now + * the smallest node in the rank. If the previous position belongs to a different rank, + * then the rank is now empty. + */ + if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */ + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; + if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ + + /* If we've removed too much weight, then we have to add it back. + * To avoid overshooting again, we only adjust the smallest rank. + * We take the largest nodes from the lowest rank 0 and move them + * to rank 1. There's guaranteed to be enough rank 0 symbols because + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ + /* special case : no rank 1 symbol (using targetNbBits-1); + * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { + while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); + totalCost++; + continue; + } + huffNode[ rankLast[1] + 1 ].nbBits--; + rankLast[1]++; + totalCost ++; + } + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + + return targetNbBits; +} + +typedef struct { + U16 base; + U16 curr; +} rankPos; + +typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + +/* Number of buckets available for HUF_sort() */ +#define RANK_POSITION_TABLE_SIZE 192 + +typedef struct { + huffNodeTable huffNodeTbl; + rankPos rankPosition[RANK_POSITION_TABLE_SIZE]; +} HUF_buildCTable_wksp_tables; + +/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing. + * Strategy is to use as many buckets as possible for representing distinct + * counts while using the remainder to represent all "large" counts. + * + * To satisfy this requirement for 192 buckets, we can do the following: + * Let buckets 0-166 represent distinct counts of [0, 166] + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ +#define RANK_POSITION_MAX_COUNT_LOG 32 +#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) +#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + +/* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. + */ +static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count + : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; +} + +/* Helper swap function for HUF_quickSortPartition() */ +static void HUF_swapNodes(nodeElt* a, nodeElt* b) { + nodeElt tmp = *a; + *a = *b; + *b = tmp; +} + +/* Returns 0 if the huffNode array is not sorted by descending count */ +MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) { + U32 i; + for (i = 1; i < maxSymbolValue1; ++i) { + if (huffNode[i].count > huffNode[i-1].count) { + return 0; + } + } + return 1; +} + +/* Insertion sort by descending order */ +HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) { + int i; + int const size = high-low+1; + huffNode += low; + for (i = 1; i < size; ++i) { + nodeElt const key = huffNode[i]; + int j = i - 1; + while (j >= 0 && huffNode[j].count < key.count) { + huffNode[j + 1] = huffNode[j]; + j--; + } + huffNode[j + 1] = key; + } +} + +/* Pivot helper function for quicksort. */ +static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) { + /* Simply select rightmost element as pivot. "Better" selectors like + * median-of-three don't experimentally appear to have any benefit. + */ + U32 const pivot = arr[high].count; + int i = low - 1; + int j = low; + for ( ; j < high; j++) { + if (arr[j].count > pivot) { + i++; + HUF_swapNodes(&arr[i], &arr[j]); + } + } + HUF_swapNodes(&arr[i + 1], &arr[high]); + return i + 1; +} + +/* Classic quicksort by descending with partially iterative calls + * to reduce worst case callstack size. + */ +static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { + int const kInsertionSortThreshold = 8; + if (high - low < kInsertionSortThreshold) { + HUF_insertionSort(arr, low, high); + return; + } + while (low < high) { + int const idx = HUF_quickSortPartition(arr, low, high); + if (idx - low < high - idx) { + HUF_simpleQuickSort(arr, low, idx - 1); + low = idx + 1; + } else { + HUF_simpleQuickSort(arr, idx + 1, high); + high = idx - 1; + } + } +} + +/** + * HUF_sort(): + * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order. + * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket. + * + * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled. + * Must have (maxSymbolValue + 1) entries. + * @param[in] count Histogram of the symbols. + * @param[in] maxSymbolValue Maximum symbol value. + * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries. + */ +static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) { + U32 n; + U32 const maxSymbolValue1 = maxSymbolValue+1; + + /* Compute base and set curr to base. + * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1. + * See HUF_getIndex to see bucketing strategy. + * We attribute each symbol to lowerRank's base value, because we want to know where + * each rank begins in the output, so for rank R we want to count ranks R+1 and above. + */ + ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); + for (n = 0; n < maxSymbolValue1; ++n) { + U32 lowerRank = HUF_getIndex(count[n]); + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); + rankPosition[lowerRank].base++; + } + + assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0); + /* Set up the rankPosition table */ + for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { + rankPosition[n-1].base += rankPosition[n].base; + rankPosition[n-1].curr = rankPosition[n-1].base; + } + + /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */ + for (n = 0; n < maxSymbolValue1; ++n) { + U32 const c = count[n]; + U32 const r = HUF_getIndex(c) + 1; + U32 const pos = rankPosition[r].curr++; + assert(pos < maxSymbolValue1); + huffNode[pos].count = c; + huffNode[pos].byte = (BYTE)n; + } + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { + int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1); + } + } + + assert(HUF_isSorted(huffNode, maxSymbolValue1)); +} + + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). + */ +#define STARTNODE (HUF_SYMBOLVALUE_MAX+1) + +/* HUF_buildTree(): + * Takes the huffNode array sorted by HUF_sort() and builds an unlimited-depth Huffman tree. + * + * @param huffNode The array sorted by HUF_sort(). Builds the Huffman tree in this array. + * @param maxSymbolValue The maximum symbol value. + * @return The smallest node in the Huffman tree (by count). + */ +static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) +{ + nodeElt* const huffNode0 = huffNode - 1; + int nonNullRank; + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; + DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; + lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb; + huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count; + huffNode[lowS].parent = huffNode[lowS-1].parent = (U16)nodeNb; + nodeNb++; lowS-=2; + for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30); + huffNode0[0].count = (U32)(1U<<31); /* fake entry, strong barrier */ + + /* create parents */ + while (nodeNb <= nodeRoot) { + int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count; + huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb; + nodeNb++; + } + + /* distribute weights (unlimited tree height) */ + huffNode[nodeRoot].nbBits = 0; + for (n=nodeRoot-1; n>=STARTNODE; n--) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + + DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); + + return nonNullRank; +} + +/** + * HUF_buildCTableFromTree(): + * Build the CTable given the Huffman tree in huffNode. + * + * @param[out] CTable The output Huffman CTable. + * @param huffNode The Huffman tree. + * @param nonNullRank The last and smallest node in the Huffman tree. + * @param maxSymbolValue The maximum symbol value. + * @param maxNbBits The exact maximum number of bits used in the Huffman tree. + */ +static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) +{ + HUF_CElt* const ct = CTable + 1; + /* fill result into ctable (val, nbBits) */ + int n; + U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0}; + U16 valPerRank[HUF_TABLELOG_MAX+1] = {0}; + int const alphabetSize = (int)(maxSymbolValue + 1); + for (n=0; n<=nonNullRank; n++) + nbPerRank[huffNode[n].nbBits]++; + /* determine starting value per rank */ + { U16 min = 0; + for (n=(int)maxNbBits; n>0; n--) { + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } } + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + + HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); + + DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); + + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) + return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) + return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); + DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + + /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + + HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits); + + return maxNbBits; +} + +size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) +{ + HUF_CElt const* ct = CTable + 1; + size_t nbBits = 0; + int s; + for (s = 0; s <= (int)maxSymbolValue; ++s) { + nbBits += HUF_getNbBits(ct[s]) * count[s]; + } + return nbBits >> 3; +} + +int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + HUF_CTableHeader header = HUF_readCTableHeader(CTable); + HUF_CElt const* ct = CTable + 1; + int bad = 0; + int s; + + assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); + + if (header.maxSymbolValue < maxSymbolValue) + return 0; + + for (s = 0; s <= (int)maxSymbolValue; ++s) { + bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); + } + return !bad; +} + +size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } + +/** HUF_CStream_t: + * Huffman uses its own BIT_CStream_t implementation. + * There are three major differences from BIT_CStream_t: + * 1. HUF_addBits() takes a HUF_CElt (size_t) which is + * the pair (nbBits, value) in the format: + * format: + * - Bits [0, 4) = nbBits + * - Bits [4, 64 - nbBits) = 0 + * - Bits [64 - nbBits, 64) = value + * 2. The bitContainer is built from the upper bits and + * right shifted. E.g. to add a new value of N bits + * you right shift the bitContainer by N, then or in + * the new value into the N upper bits. + * 3. The bitstream has two bit containers. You can add + * bits to the second container and merge them into + * the first container. + */ + +#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) + +typedef struct { + size_t bitContainer[2]; + size_t bitPos[2]; + + BYTE* startPtr; + BYTE* ptr; + BYTE* endPtr; +} HUF_CStream_t; + +/**! HUF_initCStream(): + * Initializes the bitstream. + * @returns 0 or an error code. + */ +static size_t HUF_initCStream(HUF_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + ZSTD_memset(bitC, 0, sizeof(*bitC)); + bitC->startPtr = (BYTE*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]); + if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall); + return 0; +} + +/*! HUF_addBits(): + * Adds the symbol stored in HUF_CElt elt to the bitstream. + * + * @param elt The element we're adding. This is a (nbBits, value) pair. + * See the HUF_CStream_t docs for the format. + * @param idx Insert into the bitstream at this idx. + * @param kFast This is a template parameter. If the bitstream is guaranteed + * to have at least 4 unused bits after this call it may be 1, + * otherwise it must be 0. HUF_addBits() is faster when fast is set. + */ +FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast) +{ + assert(idx <= 1); + assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX); + /* This is efficient on x86-64 with BMI2 because shrx + * only reads the low 6 bits of the register. The compiler + * knows this and elides the mask. When fast is set, + * every operation can use the same value loaded from elt. + */ + bitC->bitContainer[idx] >>= HUF_getNbBits(elt); + bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt); + /* We only read the low 8 bits of bitC->bitPos[idx] so it + * doesn't matter that the high bits have noise from the value. + */ + bitC->bitPos[idx] += HUF_getNbBitsFast(elt); + assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + /* The last 4-bits of elt are dirty if fast is set, + * so we must not be overwriting bits that have already been + * inserted into the bit container. + */ +#if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); + size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); + /* We didn't overwrite any bits in the bit container. */ + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + (void)dirtyBits; + } +#endif +} + +FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) +{ + bitC->bitContainer[1] = 0; + bitC->bitPos[1] = 0; +} + +/*! HUF_mergeIndex1() : + * Merges the bit container @ index 1 into the bit container @ index 0 + * and zeros the bit container @ index 1. + */ +FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) +{ + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); + bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF); + bitC->bitContainer[0] |= bitC->bitContainer[1]; + bitC->bitPos[0] += bitC->bitPos[1]; + assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER); +} + +/*! HUF_flushBits() : +* Flushes the bits in the bit container @ index 0. +* +* @post bitPos will be < 8. +* @param kFast If kFast is set then we must know a-priori that +* the bit container will not overflow. +*/ +FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) +{ + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ + size_t const nbBits = bitC->bitPos[0] & 0xFF; + size_t const nbBytes = nbBits >> 3; + /* The top nbBits bits of bitContainer are the ones we need. */ + size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits); + /* Mask bitPos to account for the bytes we consumed. */ + bitC->bitPos[0] &= 7; + assert(nbBits > 0); + assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitContainer); + bitC->ptr += nbBytes; + assert(!kFast || bitC->ptr <= bitC->endPtr); + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + /* bitContainer doesn't need to be modified because the leftover + * bits are already the top bitPos bits. And we don't care about + * noise in the lower values. + */ +} + +/*! HUF_endMark() + * @returns The Huffman stream end mark: A 1-bit value = 1. + */ +static HUF_CElt HUF_endMark(void) +{ + HUF_CElt endMark; + HUF_setNbBits(&endMark, 1); + HUF_setValue(&endMark, 1); + return endMark; +} + +/*! HUF_closeCStream() : + * @return Size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +static size_t HUF_closeCStream(HUF_CStream_t* bitC) +{ + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); + HUF_flushBits(bitC, /* kFast */ 0); + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } +} + +FORCE_INLINE_TEMPLATE void +HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast) +{ + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); +} + +FORCE_INLINE_TEMPLATE void +HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, + const BYTE* ip, size_t srcSize, + const HUF_CElt* ct, + int kUnroll, int kFastFlush, int kLastFast) +{ + /* Join to kUnroll */ + int n = (int)srcSize; + int rem = n % kUnroll; + if (rem > 0) { + for (; rem > 0; --rem) { + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); + } + HUF_flushBits(bitC, kFastFlush); + } + assert(n % kUnroll == 0); + + /* Join to 2 * kUnroll */ + if (n % (2 * kUnroll)) { + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); + HUF_flushBits(bitC, kFastFlush); + n -= kUnroll; + } + assert(n % (2 * kUnroll) == 0); + + for (; n>0; n-= 2 * kUnroll) { + /* Encode kUnroll symbols into the bitstream @ index 0. */ + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast); + HUF_flushBits(bitC, kFastFlush); + /* Encode kUnroll symbols into the bitstream @ index 1. + * This allows us to start filling the bit container + * without any data dependencies. + */ + HUF_zeroIndex1(bitC); + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast); + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ + HUF_mergeIndex1(bitC); + HUF_flushBits(bitC, kFastFlush); + } + assert(n == 0); + +} + +/** + * Returns a tight upper bound on the output space needed by Huffman + * with 8 bytes buffer to handle over-writes. If the output is at least + * this large we don't need to do bounds checks during Huffman encoding. + */ +static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) +{ + return ((srcSize * tableLog) >> 3) + 8; +} + + +FORCE_INLINE_TEMPLATE size_t +HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ + { BYTE* op = ostart; + size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0); + else { + if (MEM_32bits()) { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: ZSTD_FALLTHROUGH; + case 9: ZSTD_FALLTHROUGH; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 7: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } else { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 9: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 7: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 6: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } + } + assert(bitC.ptr <= bitC.endPtr); + + return HUF_closeCStream(&bitC); +} + +#if DYNAMIC_BMI2 + +static BMI2_TARGET_ATTRIBUTE size_t +HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int flags) +{ + if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +} + +#else + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int flags) +{ + (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +#endif + +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); +} + +static size_t +HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, int flags) +{ + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */ + if (srcSize < 12) return 0; /* no saving possible : too small input */ + op += 6; /* jumpTable */ + + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } + + return (size_t)(op-ostart); +} + +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); +} + +typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + +static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, + HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) +{ + size_t const cSize = (nbStreams==HUF_singleStream) ? + HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : + HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; + /* check compressibility */ + assert(op >= ostart); + if ((size_t)(op-ostart) >= srcSize-1) { return 0; } + return (size_t)(op-ostart); +} + +typedef struct { + unsigned count[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; + union { + HUF_buildCTable_wksp_tables buildCTable_wksp; + HUF_WriteCTableWksp writeCTable_wksp; + U32 hist_wksp[HIST_WKSP_SIZE_U32]; + } wksps; +} HUF_compress_tables_t; + +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + +unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) +{ + unsigned cardinality = 0; + unsigned i; + + for (i = 0; i < maxSymbolValue + 1; i++) { + if (count[i] != 0) cardinality += 1; + } + + return cardinality; +} + +unsigned HUF_minTableLog(unsigned symbolCardinality) +{ + U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; + return minBitsSymbols; +} + +unsigned HUF_optimalTableLog( + unsigned maxTableLog, + size_t srcSize, + unsigned maxSymbolValue, + void* workSpace, size_t wkspSize, + HUF_CElt* table, + const unsigned* count, + int flags) +{ + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); + + if (!(flags & HUF_flags_optimalDepth)) { + /* cheap evaluation, based on FSE */ + return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); + } + + { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); + size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); + size_t hSize, newSize; + const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); + const unsigned minTableLog = HUF_minTableLog(symbolCardinality); + size_t optSize = ((size_t) ~0) - 1; + unsigned optLog = maxTableLog, optLogGuess; + + DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); + + /* Search until size increases */ + for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { + DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); + + { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); + if (ERR_isError(maxBits)) continue; + + if (maxBits < optLogGuess && optLogGuess > minTableLog) break; + + hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); + } + + if (ERR_isError(hSize)) continue; + + newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; + + if (newSize > optSize + 1) { + break; + } + + if (newSize < optSize) { + optSize = newSize; + optLog = optLogGuess; + } + } + assert(optLog <= HUF_TABLELOG_MAX); + return optLog; + } +} + +/* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +static size_t +HUF_compress_internal (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, + HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) +{ + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); + if (!srcSize) return 0; /* Uncompressed */ + if (!dstSize) return 0; /* cannot fit anything within dst budget */ + if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ + if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); + if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX; + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ + if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); + if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; + DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; + } + { unsigned maxSymbolValueEnd = maxSymbolValue; + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestEnd; + } + if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } + + /* Scan input and build symbol stats */ + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) ); + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } + DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat + && *repeat == HUF_repeat_check + && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) { + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ + if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; + DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ + { CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog, + &table->wksps.writeCTable_wksp, sizeof(table->wksps.writeCTable_wksp)) ); + /* Check if using previous huffman table is beneficial */ + if (repeat && *repeat != HUF_repeat_none) { + size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue); + size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue); + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, flags); + } } + + /* Use the new huffman table */ + if (hSize + 12ul >= srcSize) { return 0; } + op += hSize; + if (repeat) { *repeat = HUF_repeat_none; } + if (oldHufTable) + ZSTD_memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */ + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, table->CTable, flags); +} + +size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, + HUF_CElt* hufTable, HUF_repeat* repeat, int flags) +{ + DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, + repeat, flags); +} + +/* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly + * reuse an existing huffman compression table */ +size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, + HUF_CElt* hufTable, HUF_repeat* repeat, int flags) +{ + DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, + hufTable, repeat, flags); +} diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c new file mode 100644 index 0000000..c8f6b28 --- /dev/null +++ b/lib/compress/zstd_compress.c @@ -0,0 +1,7851 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/*-************************************* +* Dependencies +***************************************/ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ +#include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ +#include "../common/mem.h" +#include "../common/error_private.h" +#include "hist.h" /* HIST_countFast_wksp */ +#define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ +#include "../common/fse.h" +#include "../common/huf.h" +#include "zstd_compress_internal.h" +#include "zstd_compress_sequences.h" +#include "zstd_compress_literals.h" +#include "zstd_fast.h" +#include "zstd_double_fast.h" +#include "zstd_lazy.h" +#include "zstd_opt.h" +#include "zstd_ldm.h" +#include "zstd_compress_superblock.h" +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + +/* *************************************************************** +* Tuning parameters +*****************************************************************/ +/*! + * COMPRESS_HEAPMODE : + * Select how default decompression function ZSTD_compress() allocates its context, + * on stack (0, default), or into heap (1). + * Note that functions with explicit context such as ZSTD_compressCCtx() are unaffected. + */ +#ifndef ZSTD_COMPRESS_HEAPMODE +# define ZSTD_COMPRESS_HEAPMODE 0 +#endif + +/*! + * ZSTD_HASHLOG3_MAX : + * Maximum size of the hash table dedicated to find 3-bytes matches, + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. + * The actual (selected) size of the hash table is then stored in ZSTD_MatchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ +#ifndef ZSTD_HASHLOG3_MAX +# define ZSTD_HASHLOG3_MAX 17 +#endif + +/*-************************************* +* Helper functions +***************************************/ +/* ZSTD_compressBound() + * Note that the result from this function is only valid for + * the one-pass compression functions. + * When employing the streaming mode, + * if flushes are frequently altering the size of blocks, + * the overhead from block headers can make the compressed data larger + * than the return value of ZSTD_compressBound(). + */ +size_t ZSTD_compressBound(size_t srcSize) { + size_t const r = ZSTD_COMPRESSBOUND(srcSize); + if (r==0) return ERROR(srcSize_wrong); + return r; +} + + +/*-************************************* +* Context memory management +***************************************/ +struct ZSTD_CDict_s { + const void* dictContent; + size_t dictContentSize; + ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ + U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + ZSTD_cwksp workspace; + ZSTD_MatchState_t matchState; + ZSTD_compressedBlockState_t cBlockState; + ZSTD_customMem customMem; + U32 dictID; + int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ + ZSTD_ParamSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. + */ +}; /* typedef'd to ZSTD_CDict within "zstd.h" */ + +ZSTD_CCtx* ZSTD_createCCtx(void) +{ + return ZSTD_createCCtx_advanced(ZSTD_defaultCMem); +} + +static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) +{ + assert(cctx != NULL); + ZSTD_memset(cctx, 0, sizeof(*cctx)); + cctx->customMem = memManager; + cctx->bmi2 = ZSTD_cpuSupportsBmi2(); + { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); + assert(!ZSTD_isError(err)); + (void)err; + } +} + +ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) +{ + ZSTD_STATIC_ASSERT(zcss_init==0); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1)); + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_customMalloc(sizeof(ZSTD_CCtx), customMem); + if (!cctx) return NULL; + ZSTD_initCCtx(cctx, customMem); + return cctx; + } +} + +ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) +{ + ZSTD_cwksp ws; + ZSTD_CCtx* cctx; + if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */ + if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */ + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc); + + cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx)); + if (cctx == NULL) return NULL; + + ZSTD_memset(cctx, 0, sizeof(ZSTD_CCtx)); + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; + + /* statically sized space. tmpWorkspace never moves (but prev/next block swap places) */ + if (!ZSTD_cwksp_check_available(&cctx->workspace, TMP_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); + cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; +} + +/** + * Clears and frees all of the dictionaries in the CCtx. + */ +static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx) +{ + ZSTD_customFree(cctx->localDict.dictBuffer, cctx->customMem); + ZSTD_freeCDict(cctx->localDict.cdict); + ZSTD_memset(&cctx->localDict, 0, sizeof(cctx->localDict)); + ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); + cctx->cdict = NULL; +} + +static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict) +{ + size_t const bufferSize = dict.dictBuffer != NULL ? dict.dictSize : 0; + size_t const cdictSize = ZSTD_sizeof_CDict(dict.cdict); + return bufferSize + cdictSize; +} + +static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) +{ + assert(cctx != NULL); + assert(cctx->staticSize == 0); + ZSTD_clearAllDicts(cctx); +#ifdef ZSTD_MULTITHREAD + ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL; +#endif + ZSTD_cwksp_free(&cctx->workspace, cctx->customMem); +} + +size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) +{ + DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); + { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); + if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; +} + + +static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + return ZSTDMT_sizeof_CCtx(cctx->mtctx); +#else + (void)cctx; + return 0; +#endif +} + + +size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return 0; /* support sizeof on NULL */ + /* cctx may be in the workspace */ + return (cctx->workspace.workspace == cctx ? 0 : sizeof(*cctx)) + + ZSTD_cwksp_sizeof(&cctx->workspace) + + ZSTD_sizeof_localDict(cctx->localDict) + + ZSTD_sizeof_mtctx(cctx); +} + +size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) +{ + return ZSTD_sizeof_CCtx(zcs); /* same object */ +} + +/* private API call, for dictBuilder only */ +const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + +/* Returns true if the strategy supports using a row based matchfinder */ +static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2); +} + +/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. + */ +static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); +} + +/* Returns row matchfinder usage given an initial mode and cParams */ +static ZSTD_ParamSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +#ifdef ZSTD_LINUX_KERNEL + /* The Linux Kernel does not use SIMD, and 128KB is a very common size, e.g. in BtrFS. + * The row match finder is slower for this size without SIMD, so disable it. + */ + const unsigned kWindowLogLowerBound = 17; +#else + const unsigned kWindowLogLowerBound = 14; +#endif + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; + if (cParams->windowLog > kWindowLogLowerBound) mode = ZSTD_ps_enable; + return mode; +} + +/* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +static ZSTD_ParamSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; +} + +/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ +static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + const ZSTD_ParamSwitch_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. + * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder. + */ + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); +} + +/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). + * Returns ZSTD_ps_disable otherwise. + */ +static ZSTD_ParamSwitch_e ZSTD_resolveEnableLdm(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; +} + +static int ZSTD_resolveExternalSequenceValidation(int mode) { + return mode; +} + +/* Resolves maxBlockSize to the default if no value is present. */ +static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { + if (maxBlockSize == 0) { + return ZSTD_BLOCKSIZE_MAX; + } else { + return maxBlockSize; + } +} + +static ZSTD_ParamSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_ParamSwitch_e value, int cLevel) { + if (value != ZSTD_ps_auto) return value; + if (cLevel < 10) { + return ZSTD_ps_disable; + } else { + return ZSTD_ps_enable; + } +} + +/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. + * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ +static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { + return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; +} + +static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params cctxParams; + /* should not matter, as all cParams are presumed properly defined */ + ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT); + cctxParams.cParams = cParams; + + /* Adjust advanced params according to cParams */ + cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams); + if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) { + ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams); + assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); + assert(cctxParams.ldmParams.hashRateLog < 32); + } + cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); + cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); + cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, + cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; +} + +static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced( + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params* params; + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + params = (ZSTD_CCtx_params*)ZSTD_customCalloc( + sizeof(ZSTD_CCtx_params), customMem); + if (!params) { return NULL; } + ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT); + params->customMem = customMem; + return params; +} + +ZSTD_CCtx_params* ZSTD_createCCtxParams(void) +{ + return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem); +} + +size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params) +{ + if (params == NULL) { return 0; } + ZSTD_customFree(params, params->customMem); + return 0; +} + +size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params) +{ + return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT); +} + +size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) { + RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); + cctxParams->compressionLevel = compressionLevel; + cctxParams->fParams.contentSizeFlag = 1; + return 0; +} + +#define ZSTD_NO_CLEVEL 0 + +/** + * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +static void +ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, + const ZSTD_parameters* params, + int compressionLevel) +{ + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); + cctxParams->cParams = params->cParams; + cctxParams->fParams = params->fParams; + /* Should not matter, as all cParams are presumed properly defined. + * But, set it for tracing anyway. + */ + cctxParams->compressionLevel = compressionLevel; + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); + cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); + cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); + cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm); +} + +size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) +{ + RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + ZSTD_CCtxParams_init_internal(cctxParams, ¶ms, ZSTD_NO_CLEVEL); + return 0; +} + +/** + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. + * @param params Validated zstd parameters. + */ +static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +{ + assert(!ZSTD_checkCParams(params->cParams)); + cctxParams->cParams = params->cParams; + cctxParams->fParams = params->fParams; + /* Should not matter, as all cParams are presumed properly defined. + * But, set it for tracing anyway. + */ + cctxParams->compressionLevel = ZSTD_NO_CLEVEL; +} + +ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) +{ + ZSTD_bounds bounds = { 0, 0, 0 }; + + switch(param) + { + case ZSTD_c_compressionLevel: + bounds.lowerBound = ZSTD_minCLevel(); + bounds.upperBound = ZSTD_maxCLevel(); + return bounds; + + case ZSTD_c_windowLog: + bounds.lowerBound = ZSTD_WINDOWLOG_MIN; + bounds.upperBound = ZSTD_WINDOWLOG_MAX; + return bounds; + + case ZSTD_c_hashLog: + bounds.lowerBound = ZSTD_HASHLOG_MIN; + bounds.upperBound = ZSTD_HASHLOG_MAX; + return bounds; + + case ZSTD_c_chainLog: + bounds.lowerBound = ZSTD_CHAINLOG_MIN; + bounds.upperBound = ZSTD_CHAINLOG_MAX; + return bounds; + + case ZSTD_c_searchLog: + bounds.lowerBound = ZSTD_SEARCHLOG_MIN; + bounds.upperBound = ZSTD_SEARCHLOG_MAX; + return bounds; + + case ZSTD_c_minMatch: + bounds.lowerBound = ZSTD_MINMATCH_MIN; + bounds.upperBound = ZSTD_MINMATCH_MAX; + return bounds; + + case ZSTD_c_targetLength: + bounds.lowerBound = ZSTD_TARGETLENGTH_MIN; + bounds.upperBound = ZSTD_TARGETLENGTH_MAX; + return bounds; + + case ZSTD_c_strategy: + bounds.lowerBound = ZSTD_STRATEGY_MIN; + bounds.upperBound = ZSTD_STRATEGY_MAX; + return bounds; + + case ZSTD_c_contentSizeFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_checksumFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_dictIDFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_nbWorkers: + bounds.lowerBound = 0; +#ifdef ZSTD_MULTITHREAD + bounds.upperBound = ZSTDMT_NBWORKERS_MAX; +#else + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_jobSize: + bounds.lowerBound = 0; +#ifdef ZSTD_MULTITHREAD + bounds.upperBound = ZSTDMT_JOBSIZE_MAX; +#else + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_overlapLog: +#ifdef ZSTD_MULTITHREAD + bounds.lowerBound = ZSTD_OVERLAPLOG_MIN; + bounds.upperBound = ZSTD_OVERLAPLOG_MAX; +#else + bounds.lowerBound = 0; + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_enableDedicatedDictSearch: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_enableLongDistanceMatching: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: + bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN; + bounds.upperBound = ZSTD_LDM_HASHLOG_MAX; + return bounds; + + case ZSTD_c_ldmMinMatch: + bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN; + bounds.upperBound = ZSTD_LDM_MINMATCH_MAX; + return bounds; + + case ZSTD_c_ldmBucketSizeLog: + bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN; + bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX; + return bounds; + + case ZSTD_c_ldmHashRateLog: + bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN; + bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX; + return bounds; + + /* experimental parameters */ + case ZSTD_c_rsyncable: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_forceMaxWindow : + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_format: + ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless); + bounds.lowerBound = ZSTD_f_zstd1; + bounds.upperBound = ZSTD_f_zstd1_magicless; /* note : how to ensure at compile time that this is the highest value enum ? */ + return bounds; + + case ZSTD_c_forceAttachDict: + ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceLoad); + bounds.lowerBound = ZSTD_dictDefaultAttach; + bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? */ + return bounds; + + case ZSTD_c_literalCompressionMode: + ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable); + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_targetCBlockSize: + bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN; + bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX; + return bounds; + + case ZSTD_c_srcSizeHint: + bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN; + bounds.upperBound = ZSTD_SRCSIZEHINT_MAX; + return bounds; + + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + bounds.lowerBound = (int)ZSTD_bm_buffered; + bounds.upperBound = (int)ZSTD_bm_stable; + return bounds; + + case ZSTD_c_blockDelimiters: + bounds.lowerBound = (int)ZSTD_sf_noBlockDelimiters; + bounds.upperBound = (int)ZSTD_sf_explicitBlockDelimiters; + return bounds; + + case ZSTD_c_validateSequences: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_splitAfterSequences: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_blockSplitterLevel: + bounds.lowerBound = 0; + bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX; + return bounds; + + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_deterministicRefPrefix: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_prefetchCDictTables: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_enableSeqProducerFallback: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_maxBlockSize: + bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; + bounds.upperBound = ZSTD_BLOCKSIZE_MAX; + return bounds; + + case ZSTD_c_repcodeResolution: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; + } +} + +/* ZSTD_cParam_clampBounds: + * Clamps the value into the bounded range. + */ +static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) +{ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); + if (ZSTD_isError(bounds.error)) return bounds.error; + if (*value < bounds.lowerBound) *value = bounds.lowerBound; + if (*value > bounds.upperBound) *value = bounds.upperBound; + return 0; +} + +#define BOUNDCHECK(cParam, val) \ + do { \ + RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ + parameter_outOfBound, "Param out of bounds"); \ + } while (0) + + +static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +{ + switch(param) + { + case ZSTD_c_compressionLevel: + case ZSTD_c_hashLog: + case ZSTD_c_chainLog: + case ZSTD_c_searchLog: + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: + case ZSTD_c_blockSplitterLevel: + return 1; + + case ZSTD_c_format: + case ZSTD_c_windowLog: + case ZSTD_c_contentSizeFlag: + case ZSTD_c_checksumFlag: + case ZSTD_c_dictIDFlag: + case ZSTD_c_forceMaxWindow : + case ZSTD_c_nbWorkers: + case ZSTD_c_jobSize: + case ZSTD_c_overlapLog: + case ZSTD_c_rsyncable: + case ZSTD_c_enableDedicatedDictSearch: + case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_ldmHashLog: + case ZSTD_c_ldmMinMatch: + case ZSTD_c_ldmBucketSizeLog: + case ZSTD_c_ldmHashRateLog: + case ZSTD_c_forceAttachDict: + case ZSTD_c_literalCompressionMode: + case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: + case ZSTD_c_splitAfterSequences: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableSeqProducerFallback: + case ZSTD_c_maxBlockSize: + case ZSTD_c_repcodeResolution: + default: + return 0; + } +} + +size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value); + if (cctx->streamStage != zcss_init) { + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { + RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) + { + case ZSTD_c_nbWorkers: + RETURN_ERROR_IF((value!=0) && cctx->staticSize, parameter_unsupported, + "MT not compatible with static alloc"); + break; + + case ZSTD_c_compressionLevel: + case ZSTD_c_windowLog: + case ZSTD_c_hashLog: + case ZSTD_c_chainLog: + case ZSTD_c_searchLog: + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: + case ZSTD_c_ldmHashRateLog: + case ZSTD_c_format: + case ZSTD_c_contentSizeFlag: + case ZSTD_c_checksumFlag: + case ZSTD_c_dictIDFlag: + case ZSTD_c_forceMaxWindow: + case ZSTD_c_forceAttachDict: + case ZSTD_c_literalCompressionMode: + case ZSTD_c_jobSize: + case ZSTD_c_overlapLog: + case ZSTD_c_rsyncable: + case ZSTD_c_enableDedicatedDictSearch: + case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_ldmHashLog: + case ZSTD_c_ldmMinMatch: + case ZSTD_c_ldmBucketSizeLog: + case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: + case ZSTD_c_splitAfterSequences: + case ZSTD_c_blockSplitterLevel: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableSeqProducerFallback: + case ZSTD_c_maxBlockSize: + case ZSTD_c_repcodeResolution: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + ZSTD_cParameter param, int value) +{ + DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value); + switch(param) + { + case ZSTD_c_format : + BOUNDCHECK(ZSTD_c_format, value); + CCtxParams->format = (ZSTD_format_e)value; + return (size_t)CCtxParams->format; + + case ZSTD_c_compressionLevel : { + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + if (value == 0) + CCtxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ + else + CCtxParams->compressionLevel = value; + if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel; + return 0; /* return type (size_t) cannot represent negative values */ + } + + case ZSTD_c_windowLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_windowLog, value); + CCtxParams->cParams.windowLog = (U32)value; + return CCtxParams->cParams.windowLog; + + case ZSTD_c_hashLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_hashLog, value); + CCtxParams->cParams.hashLog = (U32)value; + return CCtxParams->cParams.hashLog; + + case ZSTD_c_chainLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_chainLog, value); + CCtxParams->cParams.chainLog = (U32)value; + return CCtxParams->cParams.chainLog; + + case ZSTD_c_searchLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_searchLog, value); + CCtxParams->cParams.searchLog = (U32)value; + return (size_t)value; + + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); + CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); + CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_strategy, value); + CCtxParams->cParams.strategy = (ZSTD_strategy)value; + return (size_t)CCtxParams->cParams.strategy; + + case ZSTD_c_contentSizeFlag : + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; + return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; + return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); + CCtxParams->fParams.noDictIDFlag = !value; + return !CCtxParams->fParams.noDictIDFlag; + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); + return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; + BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_ParamSwitch_e lcm = (ZSTD_ParamSwitch_e)value; + BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } + + case ZSTD_c_nbWorkers : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + CCtxParams->nbWorkers = value; + return (size_t)(CCtxParams->nbWorkers); +#endif + + case ZSTD_c_jobSize : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + /* Adjust to the minimum non-default value. */ + if (value != 0 && value < ZSTDMT_JOBSIZE_MIN) + value = ZSTDMT_JOBSIZE_MIN; + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + assert(value >= 0); + CCtxParams->jobSize = (size_t)value; + return CCtxParams->jobSize; +#endif + + case ZSTD_c_overlapLog : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); + CCtxParams->overlapLog = value; + return (size_t)CCtxParams->overlapLog; +#endif + + case ZSTD_c_rsyncable : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); + CCtxParams->rsyncable = value; + return (size_t)CCtxParams->rsyncable; +#endif + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); + return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : + BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_ParamSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); + CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); + CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); + CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); + CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : + if (value!=0) { /* 0 ==> default */ + value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); + } + CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; + return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); + CCtxParams->inBufferMode = (ZSTD_bufferMode_e)value; + return CCtxParams->inBufferMode; + + case ZSTD_c_stableOutBuffer: + BOUNDCHECK(ZSTD_c_stableOutBuffer, value); + CCtxParams->outBufferMode = (ZSTD_bufferMode_e)value; + return CCtxParams->outBufferMode; + + case ZSTD_c_blockDelimiters: + BOUNDCHECK(ZSTD_c_blockDelimiters, value); + CCtxParams->blockDelimiters = (ZSTD_SequenceFormat_e)value; + return CCtxParams->blockDelimiters; + + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; + return (size_t)CCtxParams->validateSequences; + + case ZSTD_c_splitAfterSequences: + BOUNDCHECK(ZSTD_c_splitAfterSequences, value); + CCtxParams->postBlockSplitter = (ZSTD_ParamSwitch_e)value; + return CCtxParams->postBlockSplitter; + + case ZSTD_c_blockSplitterLevel: + BOUNDCHECK(ZSTD_c_blockSplitterLevel, value); + CCtxParams->preBlockSplitter_level = value; + return (size_t)CCtxParams->preBlockSplitter_level; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); + CCtxParams->useRowMatchFinder = (ZSTD_ParamSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; + return (size_t)CCtxParams->deterministicRefPrefix; + + case ZSTD_c_prefetchCDictTables: + BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); + CCtxParams->prefetchCDictTables = (ZSTD_ParamSwitch_e)value; + return CCtxParams->prefetchCDictTables; + + case ZSTD_c_enableSeqProducerFallback: + BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); + CCtxParams->enableMatchFinderFallback = value; + return (size_t)CCtxParams->enableMatchFinderFallback; + + case ZSTD_c_maxBlockSize: + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_maxBlockSize, value); + assert(value>=0); + CCtxParams->maxBlockSize = (size_t)value; + return CCtxParams->maxBlockSize; + + case ZSTD_c_repcodeResolution: + BOUNDCHECK(ZSTD_c_repcodeResolution, value); + CCtxParams->searchForExternalRepcodes = (ZSTD_ParamSwitch_e)value; + return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +} + +size_t ZSTD_CCtx_getParameter(ZSTD_CCtx const* cctx, ZSTD_cParameter param, int* value) +{ + return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParams_getParameter( + ZSTD_CCtx_params const* CCtxParams, ZSTD_cParameter param, int* value) +{ + switch(param) + { + case ZSTD_c_format : + *value = (int)CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; + break; + case ZSTD_c_windowLog : + *value = (int)CCtxParams->cParams.windowLog; + break; + case ZSTD_c_hashLog : + *value = (int)CCtxParams->cParams.hashLog; + break; + case ZSTD_c_chainLog : + *value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : + *value = (int)CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : + *value = (int)CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : + *value = (int)CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : + *value = (int)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; + break; + case ZSTD_c_checksumFlag : + *value = CCtxParams->fParams.checksumFlag; + break; + case ZSTD_c_dictIDFlag : + *value = !CCtxParams->fParams.noDictIDFlag; + break; + case ZSTD_c_forceMaxWindow : + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : + *value = (int)CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : + *value = (int)CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : +#ifndef ZSTD_MULTITHREAD + assert(CCtxParams->nbWorkers == 0); +#endif + *value = CCtxParams->nbWorkers; + break; + case ZSTD_c_jobSize : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + assert(CCtxParams->jobSize <= INT_MAX); + *value = (int)CCtxParams->jobSize; + break; +#endif + case ZSTD_c_overlapLog : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + *value = CCtxParams->overlapLog; + break; +#endif + case ZSTD_c_rsyncable : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + *value = CCtxParams->rsyncable; + break; +#endif + case ZSTD_c_enableDedicatedDictSearch : + *value = CCtxParams->enableDedicatedDictSearch; + break; + case ZSTD_c_enableLongDistanceMatching : + *value = (int)CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : + *value = (int)CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : + *value = (int)CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : + *value = (int)CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : + *value = (int)CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; + break; + case ZSTD_c_srcSizeHint : + *value = (int)CCtxParams->srcSizeHint; + break; + case ZSTD_c_stableInBuffer : + *value = (int)CCtxParams->inBufferMode; + break; + case ZSTD_c_stableOutBuffer : + *value = (int)CCtxParams->outBufferMode; + break; + case ZSTD_c_blockDelimiters : + *value = (int)CCtxParams->blockDelimiters; + break; + case ZSTD_c_validateSequences : + *value = (int)CCtxParams->validateSequences; + break; + case ZSTD_c_splitAfterSequences : + *value = (int)CCtxParams->postBlockSplitter; + break; + case ZSTD_c_blockSplitterLevel : + *value = CCtxParams->preBlockSplitter_level; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; + break; + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; + case ZSTD_c_prefetchCDictTables: + *value = (int)CCtxParams->prefetchCDictTables; + break; + case ZSTD_c_enableSeqProducerFallback: + *value = CCtxParams->enableMatchFinderFallback; + break; + case ZSTD_c_maxBlockSize: + *value = (int)CCtxParams->maxBlockSize; + break; + case ZSTD_c_repcodeResolution: + *value = (int)CCtxParams->searchForExternalRepcodes; + break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +} + +/** ZSTD_CCtx_setParametersUsingCCtxParams() : + * just applies `params` into `cctx` + * no action is performed, parameters are merely stored. + * If ZSTDMT is enabled, parameters are pushed to cctx->mtctx. + * This is possible even if a compression is ongoing. + * In which case, new parameters will be applied on the fly, starting with next compression job. + */ +size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams"); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "The context is in the wrong stage!"); + RETURN_ERROR_IF(cctx->cdict, stage_wrong, + "Can't override parameters with cdict attached (some must " + "be inherited from the cdict)."); + + cctx->requestedParams = *params; + return 0; +} + +size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) +{ + ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); + DEBUGLOG(4, "ZSTD_CCtx_setCParams"); + /* only update if all parameters are valid */ + FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, (int)cparams.windowLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, (int)cparams.chainLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, (int)cparams.hashLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, (int)cparams.searchLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, (int)cparams.minMatch), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, (int)cparams.targetLength), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, (int)cparams.strategy), ""); + return 0; +} + +size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) +{ + ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); + DEBUGLOG(4, "ZSTD_CCtx_setFParams"); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); + return 0; +} + +size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParams"); + /* First check cParams, because we want to update all or none. */ + FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + /* Next set fParams, because this could fail if the cctx isn't in init stage. */ + FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); + /* Finally set cParams, which should succeed. */ + FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); + return 0; +} + +size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; + return 0; +} + +static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams( + int const compressionLevel, + size_t const dictSize); +static int ZSTD_dedicatedDictSearch_isSupported( + const ZSTD_compressionParameters* cParams); +static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + +/** + * Initializes the local dictionary using requested parameters. + * NOTE: Initialization does not employ the pledged src size, + * because the dictionary may be used for multiple compressions. + */ +static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) +{ + ZSTD_localDict* const dl = &cctx->localDict; + if (dl->dict == NULL) { + /* No local dictionary. */ + assert(dl->dictBuffer == NULL); + assert(dl->cdict == NULL); + assert(dl->dictSize == 0); + return 0; + } + if (dl->cdict != NULL) { + /* Local dictionary already initialized. */ + assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); + assert(cctx->cdict == NULL); + assert(cctx->prefixDict.dict == NULL); + + dl->cdict = ZSTD_createCDict_advanced2( + dl->dict, + dl->dictSize, + ZSTD_dlm_byRef, + dl->dictContentType, + &cctx->requestedParams, + cctx->customMem); + RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed"); + cctx->cdict = dl->cdict; + return 0; +} + +size_t ZSTD_CCtx_loadDictionary_advanced( + ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't load a dictionary when cctx is not in init stage."); + ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ + if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { + /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); + RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, + "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); + cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ + cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; + return 0; +} + +size_t ZSTD_CCtx_loadDictionary_byReference( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); +} + + +size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a dict when ctx not in init stage."); + /* Free the existing local cdict (if any) to save memory. */ + ZSTD_clearAllDicts(cctx); + cctx->cdict = cdict; + return 0; +} + +size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a pool when ctx not in init stage."); + cctx->pool = pool; + return 0; +} + +size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + +size_t ZSTD_CCtx_refPrefix_advanced( + ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a prefix when ctx not in init stage."); + ZSTD_clearAllDicts(cctx); + if (prefix != NULL && prefixSize > 0) { + cctx->prefixDict.dict = prefix; + cctx->prefixDict.dictSize = prefixSize; + cctx->prefixDict.dictContentType = dictContentType; + } + return 0; +} + +/*! ZSTD_CCtx_reset() : + * Also dumps dictionary */ +size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) +{ + if ( (reset == ZSTD_reset_session_only) + || (reset == ZSTD_reset_session_and_parameters) ) { + cctx->streamStage = zcss_init; + cctx->pledgedSrcSizePlusOne = 0; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } + return 0; +} + + +/** ZSTD_checkCParams() : + control CParam values remain within authorized range. + @return : 0, or an error code if one value is beyond authorized range */ +size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) +{ + BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog); + BOUNDCHECK(ZSTD_c_chainLog, (int)cParams.chainLog); + BOUNDCHECK(ZSTD_c_hashLog, (int)cParams.hashLog); + BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog); + BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch); + BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength); + BOUNDCHECK(ZSTD_c_strategy, (int)cParams.strategy); + return 0; +} + +/** ZSTD_clampCParams() : + * make CParam values within valid range. + * @return : valid CParams */ +static ZSTD_compressionParameters +ZSTD_clampCParams(ZSTD_compressionParameters cParams) +{ +# define CLAMP_TYPE(cParam, val, type) \ + do { \ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ + if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \ + } while (0) +# define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); + CLAMP(ZSTD_c_hashLog, cParams.hashLog); + CLAMP(ZSTD_c_searchLog, cParams.searchLog); + CLAMP(ZSTD_c_minMatch, cParams.minMatch); + CLAMP(ZSTD_c_targetLength,cParams.targetLength); + CLAMP_TYPE(ZSTD_c_strategy,cParams.strategy, ZSTD_strategy); + return cParams; +} + +/** ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) +{ + U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2); + return hashLog - btScale; +} + +/** ZSTD_dictAndWindowLog() : + * Returns an adjusted window log that is large enough to fit the source and the dictionary. + * The zstd format says that the entire dictionary is valid if one byte of the dictionary + * is within the window. So the hashLog and chainLog should be large enough to reference both + * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing + * the hashLog and windowLog. + * NOTE: srcSize must not be ZSTD_CONTENTSIZE_UNKNOWN. + */ +static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) +{ + const U64 maxWindowSize = 1ULL << ZSTD_WINDOWLOG_MAX; + /* No dictionary ==> No change */ + if (dictSize == 0) { + return windowLog; + } + assert(windowLog <= ZSTD_WINDOWLOG_MAX); + assert(srcSize != ZSTD_CONTENTSIZE_UNKNOWN); /* Handled in ZSTD_adjustCParams_internal() */ + { + U64 const windowSize = 1ULL << windowLog; + U64 const dictAndWindowSize = dictSize + windowSize; + /* If the window size is already large enough to fit both the source and the dictionary + * then just use the window size. Otherwise adjust so that it fits the dictionary and + * the window. + */ + if (windowSize >= dictSize + srcSize) { + return windowLog; /* Window size large enough already */ + } else if (dictAndWindowSize >= maxWindowSize) { + return ZSTD_WINDOWLOG_MAX; /* Larger than max window log */ + } else { + return ZSTD_highbit32((U32)dictAndWindowSize - 1) + 1; + } + } +} + +/** ZSTD_adjustCParams_internal() : + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. + * `mode` is the mode for parameter adjustment. See docs for `ZSTD_CParamMode_e`. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ +static ZSTD_compressionParameters +ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, + ZSTD_CParamMode_e mode, + ZSTD_ParamSwitch_e useRowMatchFinder) +{ + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + + /* Cascade the selected strategy down to the next-highest one built into + * this binary. */ +#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btultra2) { + cPar.strategy = ZSTD_btultra; + } + if (cPar.strategy == ZSTD_btultra) { + cPar.strategy = ZSTD_btopt; + } +#endif +#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btopt) { + cPar.strategy = ZSTD_btlazy2; + } +#endif +#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btlazy2) { + cPar.strategy = ZSTD_lazy2; + } +#endif +#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_lazy2) { + cPar.strategy = ZSTD_lazy; + } +#endif +#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_lazy) { + cPar.strategy = ZSTD_greedy; + } +#endif +#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_greedy) { + cPar.strategy = ZSTD_dfast; + } +#endif +#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_dfast) { + cPar.strategy = ZSTD_fast; + cPar.targetLength = 0; + } +#endif + + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: + /* If we don't know the source size, don't make any + * assumptions about it. We will already have selected + * smaller parameters if a dictionary is in use. + */ + break; + case ZSTD_cpm_createCDict: + /* Assume a small source size when creating a dictionary + * with an unknown source size. + */ + if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) + srcSize = minSrcSize; + break; + case ZSTD_cpm_attachDict: + /* Dictionary has its own dedicated parameters which have + * already been selected. We are selecting parameters + * for only the source. + */ + dictSize = 0; + break; + default: + assert(0); + break; + } + + /* resize windowLog if input is small enough, to use less memory */ + if ( (srcSize <= maxWindowResize) + && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : + ZSTD_highbit32(tSize-1) + 1; + if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; + } + if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); + U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); + if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1; + if (cycleLog > dictAndWindowLog) + cPar.chainLog -= (cycleLog - dictAndWindowLog); + } + + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + + /* We can't use more than 32 bits of hash in total, so that means that we require: + * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 + */ + if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { + U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; + if (cPar.hashLog > maxShortCacheHashLog) { + cPar.hashLog = maxShortCacheHashLog; + } + if (cPar.chainLog > maxShortCacheHashLog) { + cPar.chainLog = maxShortCacheHashLog; + } + } + + + /* At this point, we aren't 100% sure if we are using the row match finder. + * Unless it is explicitly disabled, conservatively assume that it is enabled. + * In this case it will only be disabled for small sources, so shrinking the + * hash log a little bit shouldn't result in any ratio loss. + */ + if (useRowMatchFinder == ZSTD_ps_auto) + useRowMatchFinder = ZSTD_ps_enable; + + /* We can't hash more than 32-bits in total. So that means that we require: + * (hashLog - rowLog + 8) <= 32 + */ + if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { + /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); + U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; + U32 const maxHashLog = maxRowHashLog + rowLog; + assert(cPar.hashLog >= rowLog); + if (cPar.hashLog > maxHashLog) { + cPar.hashLog = maxHashLog; + } + } + + return cPar; +} + +ZSTD_compressionParameters +ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize) +{ + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); +} + +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); +static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + +static void ZSTD_overrideCParams( + ZSTD_compressionParameters* cParams, + const ZSTD_compressionParameters* overrides) +{ + if (overrides->windowLog) cParams->windowLog = overrides->windowLog; + if (overrides->hashLog) cParams->hashLog = overrides->hashLog; + if (overrides->chainLog) cParams->chainLog = overrides->chainLog; + if (overrides->searchLog) cParams->searchLog = overrides->searchLog; + if (overrides->minMatch) cParams->minMatch = overrides->minMatch; + if (overrides->targetLength) cParams->targetLength = overrides->targetLength; + if (overrides->strategy) cParams->strategy = overrides->strategy; +} + +ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) +{ + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { + assert(CCtxParams->srcSizeHint>=0); + srcSizeHint = (U64)CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ + return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); +} + +static size_t +ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const ZSTD_ParamSwitch_e useRowMatchFinder, + const int enableDedicatedDictSearch, + const U32 forCCtx) +{ + /* chain table size should be 0 for fast or row-hash strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) + ? ((size_t)1 << cParams->chainLog) + : 0; + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; + size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; + /* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't + * surrounded by redzones in ASAN. */ + size_t const tableSpace = chainSize * sizeof(U32) + + hSize * sizeof(U32) + + h3Size * sizeof(U32); + size_t const optPotentialSpace = + ZSTD_cwksp_aligned64_alloc_size((MaxML+1) * sizeof(U32)) + + ZSTD_cwksp_aligned64_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_aligned64_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_aligned64_alloc_size((1<strategy, useRowMatchFinder) + ? ZSTD_cwksp_aligned64_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace + : 0; + size_t const slackSpace = ZSTD_cwksp_slack_space_required(); + + /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */ + ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4); + assert(useRowMatchFinder != ZSTD_ps_auto); + + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", + (U32)chainSize, (U32)hSize, (U32)h3Size); + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; +} + +/* Helper function for calculating memory requirements. + * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ +static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { + U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; + return blockSize / divider; +} + +static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, + const int isStatic, + const ZSTD_ParamSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, + const U64 pledgedSrcSize, + int useSequenceProducer, + size_t maxBlockSize) +{ + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); + size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned64_alloc_size(maxNbSeq * sizeof(SeqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); + size_t const tmpWorkSpace = ZSTD_cwksp_alloc_size(TMP_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); + + size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); + size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? + ZSTD_cwksp_aligned64_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + + + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + + ZSTD_cwksp_alloc_size(buffOutSize); + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + size_t const externalSeqSpace = useSequenceProducer + ? ZSTD_cwksp_aligned64_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) + : 0; + + size_t const neededSpace = + cctxSpace + + tmpWorkSpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + + bufferSpace + + externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +} + +size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) +{ + ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, + &cParams); + + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + /* estimateCCtxSize is for one-shot compression. So no buffers should + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); +} + +size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder = ZSTD_ps_disable; + noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + initialParams.useRowMatchFinder = ZSTD_ps_enable; + rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + } +} + +static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) +{ + int tier = 0; + size_t largestSize = 0; + static const unsigned long long srcSizeTiers[4] = {16 KB, 128 KB, 256 KB, ZSTD_CONTENTSIZE_UNKNOWN}; + for (; tier < 4; ++tier) { + /* Choose the set of cParams for a given level across all srcSizes that give the largest cctxSize */ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeTiers[tier], 0, ZSTD_cpm_noAttachDict); + largestSize = MAX(ZSTD_estimateCCtxSize_usingCParams(cParams), largestSize); + } + return largestSize; +} + +size_t ZSTD_estimateCCtxSize(int compressionLevel) +{ + int level; + size_t memBudget = 0; + for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { + /* Ensure monotonically increasing memory usage as compression level increases */ + size_t const newMB = ZSTD_estimateCCtxSize_internal(level); + if (newMB > memBudget) memBudget = newMB; + } + return memBudget; +} + +size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) +{ + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? ((size_t)1 << cParams.windowLog) + blockSize + : 0; + size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; + ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ¶ms->cParams); + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, + ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } +} + +size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder = ZSTD_ps_disable; + noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + initialParams.useRowMatchFinder = ZSTD_ps_enable; + rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + } +} + +static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + return ZSTD_estimateCStreamSize_usingCParams(cParams); +} + +size_t ZSTD_estimateCStreamSize(int compressionLevel) +{ + int level; + size_t memBudget = 0; + for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { + size_t const newMB = ZSTD_estimateCStreamSize_internal(level); + if (newMB > memBudget) memBudget = newMB; + } + return memBudget; +} + +/* ZSTD_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads (non-blocking mode). + */ +ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_getFrameProgression(cctx->mtctx); + } +#endif + { ZSTD_frameProgression fp; + size_t const buffered = (cctx->inBuff == NULL) ? 0 : + cctx->inBuffPos - cctx->inToCompress; + if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress); + assert(buffered <= ZSTD_BLOCKSIZE_MAX); + fp.ingested = cctx->consumedSrcSize + buffered; + fp.consumed = cctx->consumedSrcSize; + fp.produced = cctx->producedCSize; + fp.flushed = cctx->producedCSize; /* simplified; some data might still be left within streaming output buffer */ + fp.currentJobID = 0; + fp.nbActiveWorkers = 0; + return fp; +} } + +/*! ZSTD_toFlushNow() + * Only useful for multithreading scenarios currently (nbWorkers >= 1). + */ +size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_toFlushNow(cctx->mtctx); + } +#endif + (void)cctx; + return 0; /* over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */ +} + +static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1, + ZSTD_compressionParameters cParams2) +{ + (void)cParams1; + (void)cParams2; + assert(cParams1.windowLog == cParams2.windowLog); + assert(cParams1.chainLog == cParams2.chainLog); + assert(cParams1.hashLog == cParams2.hashLog); + assert(cParams1.searchLog == cParams2.searchLog); + assert(cParams1.minMatch == cParams2.minMatch); + assert(cParams1.targetLength == cParams2.targetLength); + assert(cParams1.strategy == cParams2.strategy); +} + +void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) +{ + int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + bs->rep[i] = repStartValue[i]; + bs->entropy.huf.repeatMode = HUF_repeat_none; + bs->entropy.fse.offcode_repeatMode = FSE_repeat_none; + bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none; + bs->entropy.fse.litlength_repeatMode = FSE_repeat_none; +} + +/*! ZSTD_invalidateMatchState() + * Invalidate all the matches in the match finder tables. + * Requires nextSrc and base to be set (can be NULL). + */ +static void ZSTD_invalidateMatchState(ZSTD_MatchState_t* ms) +{ + ZSTD_window_clear(&ms->window); + + ms->nextToUpdate = ms->window.dictLimit; + ms->loadedDictEnd = 0; + ms->opt.litLengthSum = 0; /* force reset of btopt stats */ + ms->dictMatchState = NULL; +} + +/** + * Controls, for this matchState reset, whether the tables need to be cleared / + * prepared for the coming compression (ZSTDcrp_makeClean), or whether the + * tables can be left unclean (ZSTDcrp_leaveDirty), because we know that a + * subsequent operation will overwrite the table space anyways (e.g., copying + * the matchState contents in from a CDict). + */ +typedef enum { + ZSTDcrp_makeClean, + ZSTDcrp_leaveDirty +} ZSTD_compResetPolicy_e; + +/** + * Controls, for this matchState reset, whether indexing can continue where it + * left off (ZSTDirp_continue), or whether it needs to be restarted from zero + * (ZSTDirp_reset). + */ +typedef enum { + ZSTDirp_continue, + ZSTDirp_reset +} ZSTD_indexResetPolicy_e; + +typedef enum { + ZSTD_resetTarget_CDict, + ZSTD_resetTarget_CCtx +} ZSTD_resetTarget_e; + +/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ +static U64 ZSTD_bitmix(U64 val, U64 len) { + val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); + val *= 0x9FB21C651E98DF25ULL; + val ^= (val >> 35) + len ; + val *= 0x9FB21C651E98DF25ULL; + return val ^ (val >> 28); +} + +/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ +static void ZSTD_advanceHashSalt(ZSTD_MatchState_t* ms) { + ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); +} + +static size_t +ZSTD_reset_matchState(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + const ZSTD_compressionParameters* cParams, + const ZSTD_ParamSwitch_e useRowMatchFinder, + const ZSTD_compResetPolicy_e crp, + const ZSTD_indexResetPolicy_e forceResetIndex, + const ZSTD_resetTarget_e forWho) +{ + /* disable chain table allocation for fast or row-based strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, + ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict)) + ? ((size_t)1 << cParams->chainLog) + : 0; + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; + size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; + + DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset); + assert(useRowMatchFinder != ZSTD_ps_auto); + if (forceResetIndex == ZSTDirp_reset) { + ZSTD_window_init(&ms->window); + ZSTD_cwksp_mark_tables_dirty(ws); + } + + ms->hashLog3 = hashLog3; + ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + + assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */ + + ZSTD_cwksp_clear_tables(ws); + + DEBUGLOG(5, "reserving table space"); + /* table Space */ + ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32)); + ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32)); + ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32)); + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, + "failed a workspace allocation in ZSTD_reset_matchState"); + + DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_leaveDirty); + if (crp!=ZSTDcrp_leaveDirty) { + /* reset tables only */ + ZSTD_cwksp_clean_tables(ws); + } + + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + /* Row match finder needs an additional table of hashes ("tags") */ + size_t const tagTableSize = hSize; + /* We want to generate a new salt in case we reset a Cctx, but we always want to use + * 0 when we reset a Cdict */ + if(forWho == ZSTD_resetTarget_CCtx) { + ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); + ZSTD_advanceHashSalt(ms); + } else { + /* When we are not salting we want to always memset the memory */ + ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned64(ws, tagTableSize); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); + assert(cParams->hashLog >= rowLog); + ms->rowHashLog = cParams->hashLog - rowLog; + } + } + + /* opt parser space */ + if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { + DEBUGLOG(4, "reserving optimal parser space"); + ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxLL+1) * sizeof(unsigned)); + ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxML+1) * sizeof(unsigned)); + ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxOff+1) * sizeof(unsigned)); + ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); + } + + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, + "failed a workspace allocation in ZSTD_reset_matchState"); + return 0; +} + +/* ZSTD_indexTooCloseToMax() : + * minor optimization : prefer memset() rather than reduceIndex() + * which is measurably slow in some circumstances (reported for Visual Studio). + * Works when re-using a context for a lot of smallish inputs : + * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN, + * memset() will be triggered before reduceIndex(). + */ +#define ZSTD_INDEXOVERFLOW_MARGIN (16 MB) +static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) +{ + return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN); +} + +/** ZSTD_dictTooBig(): + * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in + * one go generically. So we ensure that in that case we reset the tables to zero, + * so that we can load as much of the dictionary as possible. + */ +static int ZSTD_dictTooBig(size_t const loadedDictSize) +{ + return loadedDictSize > ZSTD_CHUNKSIZE_MAX; +} + +/*! ZSTD_resetCCtx_internal() : + * @param loadedDictSize The size of the dictionary to be loaded + * into the context, if any. If no dictionary is used, or the + * dictionary is being attached / copied, then pass 0. + * note : `params` are assumed fully validated at this stage. + */ +static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + ZSTD_CCtx_params const* params, + U64 const pledgedSrcSize, + size_t const loadedDictSize, + ZSTD_compResetPolicy_e const crp, + ZSTD_buffered_policy_e const zbuff) +{ + ZSTD_cwksp* const ws = &zc->workspace; + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d", + (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + + zc->isFirstBlock = 1; + + /* Set applied params early so we can modify them for LDM, + * and point params at the applied params. + */ + zc->appliedParams = *params; + params = &zc->appliedParams; + + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->postBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); + assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); + assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog); + assert(params->ldmParams.hashRateLog < 32); + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); + size_t const blockSize = MIN(params->maxBlockSize, windowSize); + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; + size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered) + ? windowSize + blockSize + : 0; + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize); + + int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window); + int const dictTooBig = ZSTD_dictTooBig(loadedDictSize); + ZSTD_indexResetPolicy_e needsIndexReset = + (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue; + + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, + buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + + if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); + + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); + int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + + if (resizeWorkspace) { + DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", + ZSTD_cwksp_sizeof(ws) >> 10, + neededSpace >> 10); + + RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize"); + + needsIndexReset = ZSTDirp_reset; + + ZSTD_cwksp_free(ws, zc->customMem); + FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), ""); + + DEBUGLOG(5, "reserving object space"); + /* Statically sized space. + * tmpWorkspace never moves, + * though prev/next block swap places */ + assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t))); + zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); + zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); + zc->tmpWorkspace = ZSTD_cwksp_reserve_object(ws, TMP_WORKSPACE_SIZE); + RETURN_ERROR_IF(zc->tmpWorkspace == NULL, memory_allocation, "couldn't allocate tmpWorkspace"); + zc->tmpWkspSize = TMP_WORKSPACE_SIZE; + } } + + ZSTD_cwksp_clear(ws); + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; + zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; + if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN) + zc->appliedParams.fParams.contentSizeFlag = 0; + DEBUGLOG(4, "pledged content size : %u ; flag : %u", + (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); + zc->blockSizeMax = blockSize; + + XXH64_reset(&zc->xxhState, 0); + zc->stage = ZSTDcs_init; + zc->dictID = 0; + zc->dictContentSize = 0; + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &zc->blockState.matchState, + ws, + ¶ms->cParams, + params->useRowMatchFinder, + crp, + needsIndexReset, + ZSTD_resetTarget_CCtx), ""); + + zc->seqStore.sequencesStart = (SeqDef*)ZSTD_cwksp_reserve_aligned64(ws, maxNbSeq * sizeof(SeqDef)); + + /* ldm hash table */ + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* TODO: avoid memset? */ + size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; + zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned64(ws, ldmHSize * sizeof(ldmEntry_t)); + ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); + zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned64(ws, maxNbLdmSeq * sizeof(rawSeq)); + zc->maxNbLdmSequences = maxNbLdmSeq; + + ZSTD_window_init(&zc->ldmState.window); + zc->ldmState.loadedDictEnd = 0; + } + + /* reserve space for block-level external sequences */ + if (ZSTD_hasExtSeqProd(params)) { + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + zc->extSeqBufCapacity = maxNbExternalSeq; + zc->extSeqBuf = + (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned64(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); + } + + /* buffers */ + + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); + zc->outBuffSize = buffOutSize; + zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); + + /* ldm bucketOffsets table */ + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* TODO: avoid memset? */ + size_t const numBuckets = + ((size_t)1) << (params->ldmParams.hashLog - + params->ldmParams.bucketSizeLog); + zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); + ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); + } + + /* sequences storage */ + ZSTD_referenceExternalSequences(zc, NULL, 0); + zc->seqStore.maxNbSeq = maxNbSeq; + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + + return 0; + } +} + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! */ +void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) { + int i; + for (i=0; iblockState.prevCBlock->rep[i] = 0; + assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); +} + +/* These are the approximate sizes for each strategy past which copying the + * dictionary tables into the working context is faster than using them + * in-place. + */ +static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX+1] = { + 8 KB, /* unused */ + 8 KB, /* ZSTD_fast */ + 16 KB, /* ZSTD_dfast */ + 32 KB, /* ZSTD_greedy */ + 32 KB, /* ZSTD_lazy */ + 32 KB, /* ZSTD_lazy2 */ + 32 KB, /* ZSTD_btlazy2 */ + 32 KB, /* ZSTD_btopt */ + 8 KB, /* ZSTD_btultra */ + 8 KB /* ZSTD_btultra2 */ +}; + +static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + U64 pledgedSrcSize) +{ + size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy]; + int const dedicatedDictSearch = cdict->matchState.dedicatedDictSearch; + return dedicatedDictSearch + || ( ( pledgedSrcSize <= cutoff + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || params->attachDictPref == ZSTD_dictForceAttach ) + && params->attachDictPref != ZSTD_dictForceCopy + && !params->forceWindow ); /* dictMatchState isn't correctly + * handled in _enforceMaxDist */ +} + +static size_t +ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu", + (unsigned long long)pledgedSrcSize); + { + ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams; + unsigned const windowLog = params.cParams.windowLog; + assert(windowLog != 0); + /* Resize working context table params for input only, since the dict + * has its own tables. */ + /* pledgedSrcSize == 0 means 0! */ + + if (cdict->matchState.dedicatedDictSearch) { + ZSTD_dedicatedDictSearch_revertCParams(&adjusted_cdict_cParams); + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, + cdict->dictContentSize, ZSTD_cpm_attachDict, + params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_makeClean, zbuff), ""); + assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy); + } + + { const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc + - cdict->matchState.window.base); + const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit; + if (cdictLen == 0) { + /* don't even attach dictionaries with no contents */ + DEBUGLOG(4, "skipping attaching empty dictionary"); + } else { + DEBUGLOG(4, "attaching dictionary into context"); + cctx->blockState.matchState.dictMatchState = &cdict->matchState; + + /* prep working match state so dict matches never have negative indices + * when they are translated to the working context's index space. */ + if (cctx->blockState.matchState.window.dictLimit < cdictEnd) { + cctx->blockState.matchState.window.nextSrc = + cctx->blockState.matchState.window.base + cdictEnd; + ZSTD_window_clear(&cctx->blockState.matchState.window); + } + /* loadedDictEnd is expressed within the referential of the active context */ + cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit; + } } + + cctx->dictID = cdict->dictID; + cctx->dictContentSize = cdict->dictContentSize; + + /* copy block state */ + ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + + return 0; +} + +static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, + ZSTD_compressionParameters const* cParams) { + if (ZSTD_CDictIndicesAreTagged(cParams)){ + /* Remove tags from the CDict table if they are present. + * See docs on "short cache" in zstd_compress_internal.h for context. */ + size_t i; + for (i = 0; i < tableSize; i++) { + U32 const taggedIndex = src[i]; + U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; + dst[i] = index; + } + } else { + ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); + } +} + +static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; + + assert(!cdict->matchState.dedicatedDictSearch); + DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu", + (unsigned long long)pledgedSrcSize); + + { unsigned const windowLog = params.cParams.windowLog; + assert(windowLog != 0); + /* Copy only compression parameters related to tables. */ + params.cParams = *cdict_cParams; + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff), ""); + assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); + assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); + assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog); + } + + ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); + assert(params.useRowMatchFinder != ZSTD_ps_auto); + + /* copy tables */ + { size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */) + ? ((size_t)1 << cdict_cParams->chainLog) + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + + ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, + cdict->matchState.hashTable, + hSize, cdict_cParams); + + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { + ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, + cdict->matchState.chainTable, + chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { + size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, + cdict->matchState.tagTable, + tagTableSize); + cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + + /* Zero the hashTable3, since the cdict never fills it */ + assert(cctx->blockState.matchState.hashLog3 <= 31); + { U32 const h3log = cctx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + assert(cdict->matchState.hashLog3 == 0); + ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); + } + + ZSTD_cwksp_mark_tables_clean(&cctx->workspace); + + /* copy dictionary offsets */ + { ZSTD_MatchState_t const* srcMatchState = &cdict->matchState; + ZSTD_MatchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; + } + + cctx->dictID = cdict->dictID; + cctx->dictContentSize = cdict->dictContentSize; + + /* copy block state */ + ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + + return 0; +} + +/* We have a choice between copying the dictionary context into the working + * context, or referencing the dictionary context from the working context + * in-place. We decide here which strategy to use. */ +static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + + DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", + (unsigned)pledgedSrcSize); + + if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) { + return ZSTD_resetCCtx_byAttachingCDict( + cctx, cdict, *params, pledgedSrcSize, zbuff); + } else { + return ZSTD_resetCCtx_byCopyingCDict( + cctx, cdict, *params, pledgedSrcSize, zbuff); + } +} + +/*! ZSTD_copyCCtx_internal() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). + * The "context", in this case, refers to the hash and chain tables, + * entropy tables, and dictionary references. + * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx. + * @return : 0, or an error code */ +static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + const ZSTD_CCtx* srcCCtx, + ZSTD_frameParameters fParams, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, + "Can't copy a ctx that's not in init stage."); + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); + ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); + { ZSTD_CCtx_params params = dstCCtx->requestedParams; + /* Copy only compression parameters related to tables. */ + params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; + params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; + params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); + assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); + assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); + assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog); + assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog); + assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3); + } + + ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); + + /* copy tables */ + { size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy, + srcCCtx->appliedParams.useRowMatchFinder, + 0 /* forDDSDict */) + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; + U32 const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, + srcCCtx->blockState.matchState.hashTable, + hSize * sizeof(U32)); + ZSTD_memcpy(dstCCtx->blockState.matchState.chainTable, + srcCCtx->blockState.matchState.chainTable, + chainSize * sizeof(U32)); + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable3, + srcCCtx->blockState.matchState.hashTable3, + h3Size * sizeof(U32)); + } + + ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace); + + /* copy dictionary offsets */ + { + const ZSTD_MatchState_t* srcMatchState = &srcCCtx->blockState.matchState; + ZSTD_MatchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; + } + dstCCtx->dictID = srcCCtx->dictID; + dstCCtx->dictContentSize = srcCCtx->dictContentSize; + + /* copy block state */ + ZSTD_memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); + + return 0; +} + +/*! ZSTD_copyCCtx() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). + * pledgedSrcSize==0 means "unknown". +* @return : 0, or an error code */ +size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) +{ + ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + ZSTD_buffered_policy_e const zbuff = srcCCtx->bufferedPolicy; + ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1); + if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; + fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN); + + return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, + fParams, pledgedSrcSize, + zbuff); +} + + +#define ZSTD_ROWSIZE 16 +/*! ZSTD_reduceTable() : + * reduce table indexes by `reducerValue`, or squash to zero. + * PreserveMark preserves "unsorted mark" for btlazy2 strategy. + * It must be set to a clear 0/1 value, to remove branch during inlining. + * Presume table size is a multiple of ZSTD_ROWSIZE + * to help auto-vectorization */ +FORCE_INLINE_TEMPLATE void +ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark) +{ + int const nbRows = (int)size / ZSTD_ROWSIZE; + int cellNb = 0; + int rowNb; + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ + assert(size < (1U<<31)); /* can be cast to int */ + +#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the table reuse logic is sound, and that we don't + * access table space that we haven't cleaned, we re-"poison" the table + * space every time we mark it dirty. + * + * This function however is intended to operate on those dirty tables and + * re-clean them. So when this function is used correctly, we can unpoison + * the memory it operated on. This introduces a blind spot though, since + * if we now try to operate on __actually__ poisoned memory, we will not + * detect that. */ + __msan_unpoison(table, size * sizeof(U32)); +#endif + + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { + int column; + for (column=0; columncParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); + } + + if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) { + U32 const chainSize = (U32)1 << params->cParams.chainLog; + if (params->cParams.strategy == ZSTD_btlazy2) + ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); + else + ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue); + } + + if (ms->hashLog3) { + U32 const h3Size = (U32)1 << ms->hashLog3; + ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue); + } +} + + +/*-******************************************************* +* Block entropic compression +*********************************************************/ + +/* See doc/zstd_compression_format.md for detailed format description */ + +int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr) +{ + const SeqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; + BYTE* const ofCodeTable = seqStorePtr->ofCode; + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; + int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); + if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) + longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; + return longOffsets; +} + +/* ZSTD_useTargetCBlockSize(): + * Returns if target compressed block size param is being used. + * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize. + * Returns 1 if true, 0 otherwise. */ +static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) +{ + DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize); + return (cctxParams->targetCBlockSize != 0); +} + +/* ZSTD_blockSplitterEnabled(): + * Returns if block splitting param is being used + * If used, compression will do best effort to split a block in order to improve compression ratio. + * At the time this function is called, the parameter must be finalized. + * Returns 1 if true, 0 otherwise. */ +static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) +{ + DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter); + assert(cctxParams->postBlockSplitter != ZSTD_ps_auto); + return (cctxParams->postBlockSplitter == ZSTD_ps_enable); +} + +/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types + * and size of the sequences statistics + */ +typedef struct { + U32 LLtype; + U32 Offtype; + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ + int longOffsets; +} ZSTD_symbolEncodingTypeStats_t; + +/* ZSTD_buildSequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field. + * Modifies `nextEntropy` to have the appropriate values as a side effect. + * nbSeq must be greater than 0. + * + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildSequencesStatistics( + const SeqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorkspace, + void* entropyWorkspace, size_t entropyWkspSize) +{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + ZSTD_symbolEncodingTypeStats_t stats; + + stats.lastCountSize = 0; + /* convert length/distances into codes */ + stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ + { unsigned max = MaxLL; + size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->litlengthCTable, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_LitLength, LLFSELog, (SymbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, + sizeof(prevEntropy->litlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); + stats.size = countSize; + return stats; + } + if (stats.LLtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + /* build CTable for Offsets */ + { unsigned max = MaxOff; + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_DefaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->offcodeCTable, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_OffsetBits, OffFSELog, (SymbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, + sizeof(prevEntropy->offcodeCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); + stats.size = countSize; + return stats; + } + if (stats.Offtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + /* build CTable for MatchLengths */ + { unsigned max = MaxML; + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->matchlengthCTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_MatchLength, MLFSELog, (SymbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, + sizeof(prevEntropy->matchlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); + stats.size = countSize; + return stats; + } + if (stats.MLtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + stats.size = (size_t)(op-ostart); + return stats; +} + +/* ZSTD_entropyCompressSeqStore_internal(): + * compresses both literals and sequences + * Returns compressed size of block, or a zstd error. + */ +#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 +MEM_STATIC size_t +ZSTD_entropyCompressSeqStore_internal( + void* dst, size_t dstCapacity, + const void* literals, size_t litSize, + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* entropyWorkspace, size_t entropyWkspSize, + const int bmi2) +{ + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const SeqDef* const sequences = seqStorePtr->sequencesStart; + const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; + int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + + DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + int const suspectUncompressible = (numSequences == 0) || (litSize / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + + size_t const cSize = ZSTD_compressLiterals( + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, + &prevEntropy->huf, &nextEntropy->huf, + cctxParams->cParams.strategy, + ZSTD_literalsCompressionIsDisabled(cctxParams), + suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; + } + + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, "Can't fit seq hdr in output buf!"); + if (nbSeq < 128) { + *op++ = (BYTE)nbSeq; + } else if (nbSeq < LONGNBSEQ) { + op[0] = (BYTE)((nbSeq>>8) + 0x80); + op[1] = (BYTE)nbSeq; + op+=2; + } else { + op[0]=0xFF; + MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)); + op+=3; + } + assert(op <= oend); + if (nbSeq==0) { + /* Copy the old tables over as if we repeated them */ + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } + { BYTE* const seqHead = op++; + /* build stats for sequences */ + const ZSTD_symbolEncodingTypeStats_t stats = + ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; + longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( + op, (size_t)(oend - op), + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, + longOffsets, bmi2); + FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); + op += bitstreamSize; + assert(op <= oend); + /* zstd versions <= 1.3.4 mistakenly report corruption when + * FSE_readNCount() receives a buffer < 4 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1146. + * This can happen when the last set_compressed table present is 2 + * bytes and the bitstream is only one byte. + * In this exceedingly rare case, we will simply emit an uncompressed + * block, since it isn't worth optimizing. + */ + if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { + /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(lastCountSize + bitstreamSize == 3); + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " + "emitting an uncompressed block."); + return 0; + } + } + + DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart)); + return (size_t)(op - ostart); +} + +static size_t +ZSTD_entropyCompressSeqStore_wExtLitBuffer( + void* dst, size_t dstCapacity, + const void* literals, size_t litSize, + size_t blockSize, + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) +{ + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + dst, dstCapacity, + literals, litSize, + seqStorePtr, prevEntropy, nextEntropy, cctxParams, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. + */ + if ((cSize == ERROR(dstSize_tooSmall)) & (blockSize <= dstCapacity)) { + DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ + } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = blockSize - ZSTD_minGain(blockSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } + DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); + /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. + * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. + */ + assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; +} + +static size_t +ZSTD_entropyCompressSeqStore( + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + size_t srcSize, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) +{ + return ZSTD_entropyCompressSeqStore_wExtLitBuffer( + dst, dstCapacity, + seqStorePtr->litStart, (size_t)(seqStorePtr->lit - seqStorePtr->litStart), + srcSize, + seqStorePtr, + prevEntropy, nextEntropy, + cctxParams, + entropyWorkspace, entropyWkspSize, + bmi2); +} + +/* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) +{ + static const ZSTD_BlockCompressor_f blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, + ZSTD_COMPRESSBLOCK_DOUBLEFAST, + ZSTD_COMPRESSBLOCK_GREEDY, + ZSTD_COMPRESSBLOCK_LAZY, + ZSTD_COMPRESSBLOCK_LAZY2, + ZSTD_COMPRESSBLOCK_BTLAZY2, + ZSTD_COMPRESSBLOCK_BTOPT, + ZSTD_COMPRESSBLOCK_BTULTRA, + ZSTD_COMPRESSBLOCK_BTULTRA2 + }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, + ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, + ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, + ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, + ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, + ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, + ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, + ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, + ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT + }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, + ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE + }, + { NULL /* default for 0 */, + NULL, + NULL, + ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, + ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, + ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, + NULL } + }; + ZSTD_BlockCompressor_f selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + DEBUGLOG(5, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_BlockCompressor_f rowBasedBlockCompressors[4][3] = { + { + ZSTD_COMPRESSBLOCK_GREEDY_ROW, + ZSTD_COMPRESSBLOCK_LAZY_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, + ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, + ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, + ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW + } + }; + DEBUGLOG(5, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { + selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + } + assert(selectedCompressor != NULL); + return selectedCompressor; +} + +static void ZSTD_storeLastLiterals(SeqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) +{ + ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; +} + +void ZSTD_resetSeqStore(SeqStore_t* ssPtr) +{ + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthType = ZSTD_llt_none; +} + +/* ZSTD_postProcessSequenceProducerResult() : + * Validates and post-processes sequences obtained through the external matchfinder API: + * - Checks whether nbExternalSeqs represents an error condition. + * - Appends a block delimiter to outSeqs if one is not already present. + * See zstd.h for context regarding block delimiters. + * Returns the number of sequences after post-processing, or an error code. */ +static size_t ZSTD_postProcessSequenceProducerResult( + ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize +) { + RETURN_ERROR_IF( + nbExternalSeqs > outSeqsCapacity, + sequenceProducer_failed, + "External sequence producer returned error code %lu", + (unsigned long)nbExternalSeqs + ); + + RETURN_ERROR_IF( + nbExternalSeqs == 0 && srcSize > 0, + sequenceProducer_failed, + "Got zero sequences from external sequence producer for a non-empty src buffer!" + ); + + if (srcSize == 0) { + ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); + return 1; + } + + { + ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; + + /* We can return early if lastSeq is already a block delimiter. */ + if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { + return nbExternalSeqs; + } + + /* This error condition is only possible if the external matchfinder + * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ + RETURN_ERROR_IF( + nbExternalSeqs == outSeqsCapacity, + sequenceProducer_failed, + "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" + ); + + /* lastSeq is not a block delimiter, so we need to append one. */ + ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); + return nbExternalSeqs + 1; + } +} + +/* ZSTD_fastSequenceLengthSum() : + * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. + * Similar to another function in zstd_compress.c (determine_blockSize), + * except it doesn't check for a block delimiter to end summation. + * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). + * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ +static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { + size_t matchLenSum, litLenSum, i; + matchLenSum = 0; + litLenSum = 0; + for (i = 0; i < seqBufSize; i++) { + litLenSum += seqBuf[i].litLength; + matchLenSum += seqBuf[i].matchLength; + } + return litLenSum + matchLenSum; +} + +/** + * Function to validate sequences produced by a block compressor. + */ +static void ZSTD_validateSeqStore(const SeqStore_t* seqStore, const ZSTD_compressionParameters* cParams) +{ +#if DEBUGLEVEL >= 1 + const SeqDef* seq = seqStore->sequencesStart; + const SeqDef* const seqEnd = seqStore->sequences; + size_t const matchLenLowerBound = cParams->minMatch == 3 ? 3 : 4; + for (; seq < seqEnd; ++seq) { + const ZSTD_SequenceLength seqLength = ZSTD_getSequenceLength(seqStore, seq); + assert(seqLength.matchLength >= matchLenLowerBound); + (void)seqLength; + (void)matchLenLowerBound; + } +#else + (void)seqStore; + (void)cParams; +#endif +} + +static size_t +ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, + ZSTD_SequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, + ZSTD_ParamSwitch_e externalRepSearch); + +typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_BuildSeqStore_e; + +static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +{ + ZSTD_MatchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. We need to revisit and change this logic to be more consistent */ + if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { + ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch); + } + return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */ + } + ZSTD_resetSeqStore(&(zc->seqStore)); + /* required for optimal parser to read stats from dictionary */ + ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy; + /* tell the optimal parser how we expect to compress literals */ + ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode; + /* a gap between an attached dict and the current window is not safe, + * they must remain adjacent, + * and when that stops being the case, the dict must be unset */ + assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit); + + /* limited update after a very long match */ + { const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 curr = (U32)(istart-base); + if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */ + if (curr > ms->nextToUpdate + 384) + ms->nextToUpdate = curr - MIN(192, (U32)(curr - ms->nextToUpdate - 384)); + } + + /* select and store sequences */ + { ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms); + size_t lastLLSize; + { int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); + + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( + ZSTD_hasExtSeqProd(&zc->appliedParams), + parameter_combination_unsupported, + "Long-distance matching with external sequence producer enabled is not currently supported." + ); + + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + RawSeqStore_t ldmSeqStore = kNullRawSeqStore; + + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( + ZSTD_hasExtSeqProd(&zc->appliedParams), + parameter_combination_unsupported, + "Long-distance matching with external sequence producer enabled is not currently supported." + ); + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ + FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore, + &zc->appliedParams.ldmParams, + src, srcSize), ""); + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&ldmSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); + } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { + assert( + zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) + ); + assert(zc->appliedParams.extSeqProdFunc != NULL); + + { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; + + size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( + zc->appliedParams.extSeqProdState, + zc->extSeqBuf, + zc->extSeqBufCapacity, + src, srcSize, + NULL, 0, /* dict and dictSize, currently not supported */ + zc->appliedParams.compressionLevel, + windowSize + ); + + size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( + zc->extSeqBuf, + nbExternalSeqs, + zc->extSeqBufCapacity, + srcSize + ); + + /* Return early if there is no error, since we don't need to worry about last literals */ + if (!ZSTD_isError(nbPostProcessedSeqs)) { + ZSTD_SequencePosition seqPos = {0,0,0}; + size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); + RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); + FORWARD_IF_ERROR( + ZSTD_transferSequences_wBlockDelim( + zc, &seqPos, + zc->extSeqBuf, nbPostProcessedSeqs, + src, srcSize, + zc->appliedParams.searchForExternalRepcodes + ), + "Failed to copy external sequences to seqStore!" + ); + ms->ldmSeqStore = NULL; + DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); + return ZSTDbss_compress; + } + + /* Propagate the error if fallback is disabled */ + if (!zc->appliedParams.enableMatchFinderFallback) { + return nbPostProcessedSeqs; + } + + /* Fallback to software matchfinder */ + { ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor( + zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); + ms->ldmSeqStore = NULL; + DEBUGLOG( + 5, + "External sequence producer returned error code %lu. Falling back to internal parser.", + (unsigned long)nbExternalSeqs + ); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } } + } else { /* not long range mode and no external matchfinder */ + ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( + zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } + ZSTD_validateSeqStore(&zc->seqStore, &zc->appliedParams.cParams); + return ZSTDbss_compress; +} + +static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const SeqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) +{ + const SeqDef* inSeqs = seqStore->sequencesStart; + const size_t nbInSequences = (size_t)(seqStore->sequences - inSeqs); + const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + + ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; + const size_t nbOutSequences = nbInSequences + 1; + size_t nbOutLiterals = 0; + Repcodes_t repcodes; + size_t i; + + /* Bounds check that we have enough space for every input sequence + * and the block delimiter + */ + assert(seqCollector->seqIndex <= seqCollector->maxSequences); + RETURN_ERROR_IF( + nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), + dstSize_tooSmall, + "Not enough space to copy sequences"); + + ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); + for (i = 0; i < nbInSequences; ++i) { + U32 rawOffset; + outSeqs[i].litLength = inSeqs[i].litLength; + outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + + /* Handle the possible single length >= 64K + * There can only be one because we add MINMATCH to every match length, + * and blocks are at most 128K. + */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; + } else if (seqStore->longLengthType == ZSTD_llt_matchLength) { + outSeqs[i].matchLength += 0x10000; + } + } + + /* Determine the raw offset given the offBase, which may be a repcode. */ + if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { + const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); + assert(repcode > 0); + outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { + rawOffset = repcodes.rep[repcode - 1]; + } else { + if (repcode == 3) { + assert(repcodes.rep[0] > 1); + rawOffset = repcodes.rep[0] - 1; + } else { + rawOffset = repcodes.rep[repcode]; + } + } + } else { + rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; + + /* Update repcode history for the sequence */ + ZSTD_updateRep(repcodes.rep, + inSeqs[i].offBase, + inSeqs[i].litLength == 0); + + nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ + assert(nbInLiterals >= nbOutLiterals); + { + const size_t lastLLSize = nbInLiterals - nbOutLiterals; + outSeqs[nbInSequences].litLength = (U32)lastLLSize; + outSeqs[nbInSequences].matchLength = 0; + outSeqs[nbInSequences].offset = 0; + assert(nbOutSequences == nbInSequences + 1); + } + seqCollector->seqIndex += nbOutSequences; + assert(seqCollector->seqIndex <= seqCollector->maxSequences); + + return 0; +} + +size_t ZSTD_sequenceBound(size_t srcSize) { + const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; + const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; + return maxNbSeq + maxNbDelims; +} + +size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) +{ + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst; /* Make C90 happy. */ + SeqCollector seqCollector; + { + int targetCBlockSize; + FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); + RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); + } + { + int nbWorkers; + FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); + RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); + } + + dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + + seqCollector.collectSequences = 1; + seqCollector.seqStart = outSeqs; + seqCollector.seqIndex = 0; + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + + { + const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); + ZSTD_customFree(dst, ZSTD_defaultCMem); + FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); + } + assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; +} + +size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize) { + size_t in = 0; + size_t out = 0; + for (; in < seqsSize; ++in) { + if (sequences[in].offset == 0 && sequences[in].matchLength == 0) { + if (in != seqsSize - 1) { + sequences[in+1].litLength += sequences[in].litLength; + } + } else { + sequences[out] = sequences[in]; + ++out; + } + } + return out; +} + +/* Unrolled loop to read four size_ts of input at a time. Returns 1 if is RLE, 0 if not. */ +static int ZSTD_isRLE(const BYTE* src, size_t length) { + const BYTE* ip = src; + const BYTE value = ip[0]; + const size_t valueST = (size_t)((U64)value * 0x0101010101010101ULL); + const size_t unrollSize = sizeof(size_t) * 4; + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { + size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; + } } } + return 1; +} + +/* Returns true if the given block may be RLE. + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +static int ZSTD_maybeRLE(SeqStore_t const* seqStore) +{ + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); + + return nbSeqs < 4 && nbLits < 10; +} + +static void +ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) +{ + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; + bs->nextCBlock = tmp; +} + +/* Writes the block header */ +static void +writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) +{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + DEBUGLOG(5, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); +} + +/** ZSTD_buildBlockEntropyStats_literals() : + * Builds entropy for the literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace + * @return : size of huffman description table, or an error code + */ +static size_t +ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_hufCTablesMetadata_t* hufMetadata, + const int literalsCompressionIsDisabled, + void* workspace, size_t wkspSize, + int hufFlags) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; + const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (literalsCompressionIsDisabled) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType = set_basic; + return 0; + } + + /* small ? don't even attempt compression (speed opt) */ +#ifndef COMPRESS_LITERALS_SIZE_MIN +# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ +#endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; + } } + + /* Scan input and build symbol stats */ + { size_t const largest = + HIST_count_wksp (countWksp, &maxSymbolValue, + (const BYTE*)src, srcSize, + workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { + /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { + /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; + } } + + /* Validate the previous Huffman table */ + if (repeat == HUF_repeat_check + && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); + assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; + } + { /* Build and write the CTable */ + size_t const newCSize = HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize = HUF_writeCTable_wksp( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + /* Check against repeating the previous CTable */ + if (repeat != HUF_repeat_none) { + size_t const oldCSize = HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); + if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_repeat; + return 0; + } } + if (newCSize + hSize >= srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); + hufMetadata->hType = set_compressed; + nextHuf->repeatMode = HUF_repeat_check; + return hSize; + } +} + + +/* ZSTD_buildDummySequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic, + * and updates nextEntropy to the appropriate repeatMode. + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) +{ + ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; + return stats; +} + +/** ZSTD_buildBlockEntropyStats_sequences() : + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. + * @return : size of fse tables or error code */ +static size_t +ZSTD_buildBlockEntropyStats_sequences( + const SeqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize) +{ + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; + unsigned* countWorkspace = (unsigned*)workspace; + unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1); + size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace); + ZSTD_symbolEncodingTypeStats_t stats; + + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq); + stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + prevEntropy, nextEntropy, op, oend, + strategy, countWorkspace, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + fseMetadata->llType = (SymbolEncodingType_e) stats.LLtype; + fseMetadata->ofType = (SymbolEncodingType_e) stats.Offtype; + fseMetadata->mlType = (SymbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; +} + + +/** ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE + * @return : 0 on success, or an error code + * Note : also employed in superblock + */ +size_t ZSTD_buildBlockEntropyStats( + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); + int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; + + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), + workspace, wkspSize, hufFlags); + + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, + &prevEntropy->fse, &nextEntropy->fse, + cctxParams, + &entropyMetadata->fseMetadata, + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed"); + return 0; +} + +/* Returns the size estimate for the literals section (header + content) of a block */ +static size_t +ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB); + U32 singleStream = litSize < 256; + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType == set_rle) return 1; + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + if (!singleStream) cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */ + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +static size_t +ZSTD_estimateBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxCode, + const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + /* We selected this encoding type, so it must be valid. */ + assert(max <= defaultMax); + (void)defaultMax; + cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type == set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { + return nbSeq * 10; + } + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits >> 3; +} + +/* Returns the size estimate for the sequences section (header + content) of a block */ +static size_t +ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, + fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, + fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, + fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +/* Returns the size estimate for a given stream of literals, of, ll, ml */ +static size_t +ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) +{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; +} + +/* Builds entropy statistics and uses them for blocksize estimation. + * + * @return: estimated compressed size of the seqStore, or a zstd error. + */ +static size_t +ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(SeqStore_t* seqStore, ZSTD_CCtx* zc) +{ + ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, + zc->tmpWorkspace, zc->tmpWkspSize), ""); + return ZSTD_estimateBlockSize( + seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), + &zc->blockState.nextCBlock->entropy, + entropyMetadata, + zc->tmpWorkspace, zc->tmpWkspSize, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); +} + +/* Returns literals bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreLiteralsBytes(const SeqStore_t* const seqStore) +{ + size_t literalsBytes = 0; + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + SeqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; + } } + return literalsBytes; +} + +/* Returns match bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreMatchBytes(const SeqStore_t* const seqStore) +{ + size_t matchBytes = 0; + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + SeqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; + } } + return matchBytes; +} + +/* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. + */ +static void ZSTD_deriveSeqStoreChunk(SeqStore_t* resultSeqStore, + const SeqStore_t* originalSeqStore, + size_t startIdx, size_t endIdx) +{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ + if (originalSeqStore->longLengthType != ZSTD_llt_none) { + if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) { + resultSeqStore->longLengthType = ZSTD_llt_none; + } else { + resultSeqStore->longLengthPos -= (U32)startIdx; + } + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ + assert(resultSeqStore->lit == originalSeqStore->lit); + } else { + size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; + resultSeqStore->ofCode += startIdx; +} + +/** + * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. + * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ +static U32 +ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) +{ + U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ + assert(OFFBASE_IS_REPCODE(offBase)); + if (adjustedRepCode == ZSTD_REP_NUM) { + assert(ll0); + /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 + * This is only valid if it results in a valid offset value, aka > 0. + * Note : it may happen that `rep[0]==1` in exceptional circumstances. + * In which case this function will return 0, which is an invalid offset. + * It's not an issue though, since this value will be + * compared and discarded within ZSTD_seqStore_resolveOffCodes(). + */ + return rep[0] - 1; + } + return rep[adjustedRepCode]; +} + +/** + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise + * due to emission of RLE/raw blocks that disturb the offset history, + * and replaces any repcodes within the seqStore that may be invalid. + * + * dRepcodes are updated as would be on the decompression side. + * cRepcodes are updated exactly in accordance with the seqStore. + * + * Note : this function assumes seq->offBase respects the following numbering scheme : + * 0 : invalid + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +static void +ZSTD_seqStore_resolveOffCodes(Repcodes_t* const dRepcodes, Repcodes_t* const cRepcodes, + const SeqStore_t* const seqStore, U32 const nbSeq) +{ + U32 idx = 0; + U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + SeqDef* const seq = seqStore->sequencesStart + idx; + U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); + U32 const offBase = seq->offBase; + assert(offBase > 0); + if (OFFBASE_IS_REPCODE(offBase)) { + U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); + U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { + seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ + ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); + ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } +} + +/* ZSTD_compressSeqStore_singleBlock(): + * Compresses a seqStore into a block with a block header, into the buffer dst. + * + * Returns the total size of that block (including header) or a ZSTD error code. + */ +static size_t +ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, + const SeqStore_t* const seqStore, + Repcodes_t* const dRep, Repcodes_t* const cRep, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) +{ + const U32 rleMaxLength = 25; + BYTE* op = (BYTE*)dst; + const BYTE* ip = (const BYTE*)src; + size_t cSize; + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ + Repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit"); + cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, + zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + + if (!zc->isFirstBlock && + cSeqsSize < rleMaxLength && + ZSTD_isRLE((BYTE const*)src, srcSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + cSeqsSize = 1; + } + + /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { + FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); + DEBUGLOG(5, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); + DEBUGLOG(5, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; + DEBUGLOG(5, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +/* Struct to keep track of where we are in our recursive calls. */ +typedef struct { + U32* splitLocations; /* Array of split indices */ + size_t idx; /* The current index within splitLocations being worked on */ +} seqStoreSplits; + +#define MIN_SEQUENCES_BLOCK_SPLITTING 300 + +/* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. + * If advantageous to split, then we recurse down the two sub-blocks. + * If not, or if an error occurred in estimation, then we do not recurse. + * + * Note: The recursion depth is capped by a heuristic minimum number of sequences, + * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * + * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. + * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ +static void +ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const SeqStore_t* origSeqStore) +{ + SeqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; + SeqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; + SeqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + + DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); + DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { + DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; + ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore); + } +} + +/* Base recursive function. + * Populates a table with intra-block partition indices that can improve compression ratio. + * + * @return: number of splits made (which equals the size of the partition table - 1). + */ +static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) +{ + seqStoreSplits splits; + splits.splitLocations = partitions; + splits.idx = 0; + if (nbSeq <= 4) { + DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } + ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); + splits.splitLocations[splits.idx] = nbSeq; + DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1); + return splits.idx; +} + +/* ZSTD_compressBlock_splitBlock(): + * Attempts to split a given block into multiple blocks to improve compression ratio. + * + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ +static size_t +ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t blockSize, + U32 lastBlock, U32 nbSeq) +{ + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; + U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ + SeqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; + SeqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; + size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two + * separate repcode histories that simulate repcode history on compression and decompression side, + * and use the histories to determine whether we must replace a particular repcode with its raw offset. + * + * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed + * or RLE. This allows us to retrieve the offset value that an invalid repcode references within + * a nocompress/RLE block. + * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use + * the replacement offset value rather than the original repcode to update the repcode history. + * dRep also will be the final repcode history sent to the next block. + * + * See ZSTD_seqStore_resolveOffCodes() for more details. + */ + Repcodes_t dRep; + Repcodes_t cRep; + ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(SeqStore_t)); + + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { + size_t cSizeSingleBlock = + ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, + &dRep, &cRep, + op, dstCapacity, + ip, blockSize, + lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); + assert(zc->blockSizeMax <= ZSTD_BLOCKSIZE_MAX); + assert(cSizeSingleBlock <= zc->blockSizeMax + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + + size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ + srcBytes += blockSize - srcBytesTotal; + lastBlockEntireSrc = lastBlock; + } else { + ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); + } + + cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, + &dRep, &cRep, + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); + DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", + ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; + op += cSizeChunk; + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; + assert(cSizeChunk <= zc->blockSizeMax + ZSTD_blockHeaderSize); + } + /* cRep and dRep may have diverged during the compression. + * If so, we use the dRep repcodes for the next block. + */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(Repcodes_t)); + return cSize; +} + +static size_t +ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) +{ + U32 nbSeq; + size_t cSize; + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock"); + assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); + cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); + } + + cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq); + FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); + return cSize; +} + +static size_t +ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) +{ + /* This is an estimated upper bound for the length of an rle block. + * This isn't the actual upper bound. + * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); + cSize = 0; + goto out; + } + } + + if (zc->seqCollector.collectSequences) { + FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + /* encode sequences and literals */ + cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + dst, dstCapacity, + srcSize, + zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + cSize < rleMaxLength && + ZSTD_isRLE(ip, srcSize)) + { + cSize = 1; + op[0] = ip[0]; + } + +out: + if (!ZSTD_isError(cSize) && cSize > 1) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + } + /* We check that dictionaries have offset codes available for the first + * block. After the first block, the offcode table might not have large + * enough codes to represent the offsets in the data. + */ + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const size_t bss, U32 lastBlock) +{ + DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()"); + if (bss == ZSTDbss_compress) { + if (/* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + ZSTD_maybeRLE(&zc->seqStore) && + ZSTD_isRLE((BYTE const*)src, srcSize)) + { + return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock); + } + /* Attempt superblock compression. + * + * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the + * standard ZSTD_compressBound(). This is a problem, because even if we have + * space now, taking an extra byte now could cause us to run out of space later + * and violate ZSTD_compressBound(). + * + * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize. + * + * In order to respect ZSTD_compressBound() we must attempt to emit a raw + * uncompressed block in these cases: + * * cSize == 0: Return code for an uncompressed block. + * * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize). + * ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of + * output space. + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ + { size_t const cSize = + ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { + size_t const maxCSize = + srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return cSize; + } + } + } + } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. + * The decoder will be able to stream this block since it is uncompressed. + */ + return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); +} + +static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock) +{ + size_t cSize = 0; + const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + + cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed"); + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static void ZSTD_overflowCorrectIfNeeded(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, + void const* iend) +{ + U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const maxDist = (U32)1 << params->cParams.windowLog; + if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) { + U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); + ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + ZSTD_cwksp_mark_tables_dirty(ws); + ZSTD_reduceIndex(ms, params, correction); + ZSTD_cwksp_mark_tables_clean(ws); + if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; + else ms->nextToUpdate -= correction; + /* invalidate dictionaries on overflow correction */ + ms->loadedDictEnd = 0; + ms->dictMatchState = NULL; + } +} + +#include "zstd_preSplit.h" + +static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings) +{ + /* split level based on compression strategy, from `fast` to `btultra2` */ + static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 }; + /* note: conservatively only split full blocks (128 KB) currently. + * While it's possible to go lower, let's keep it simple for a first implementation. + * Besides, benefits of splitting are reduced when blocks are already small. + */ + if (srcSize < 128 KB || blockSizeMax < 128 KB) + return MIN(srcSize, blockSizeMax); + /* do not split incompressible data though: + * require verified savings to allow pre-splitting. + * Note: as a consequence, the first full block is not split. + */ + if (savings < 3) { + DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings); + return 128 KB; + } + /* apply @splitLevel, or use default value (which depends on @strat). + * note that splitting heuristic is still conditioned by @savings >= 3, + * so the first block will not reach this code path */ + if (splitLevel == 1) return 128 KB; + if (splitLevel == 0) { + assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2); + splitLevel = splitLevels[strat]; + } else { + assert(2 <= splitLevel && splitLevel <= 6); + splitLevel -= 2; + } + return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize); +} + +/*! ZSTD_compress_frameChunk() : +* Compress a chunk of data into one or multiple blocks. +* All blocks will be terminated, all input will be consumed. +* Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. +* Frame is supposed already started (header already produced) +* @return : compressed size, or an error code +*/ +static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastFrameChunk) +{ + size_t blockSizeMax = cctx->blockSizeMax; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; + S64 savings = (S64)cctx->consumedSrcSize - (S64)cctx->producedCSize; + + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + + DEBUGLOG(5, "ZSTD_compress_frameChunk (srcSize=%u, blockSizeMax=%u)", (unsigned)srcSize, (unsigned)blockSizeMax); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + XXH64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { + ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; + size_t const blockSize = ZSTD_optimalBlockSize(cctx, + ip, remaining, + blockSizeMax, + cctx->appliedParams.preBlockSplitter_level, + cctx->appliedParams.cParams.strategy, + savings); + U32 const lastBlock = lastFrameChunk & (blockSize == remaining); + assert(blockSize <= remaining); + + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. We need to revisit and change this logic to be more consistent */ + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); + ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + + /* Ensure hash/chain table insertion resumes no sooner than lowlimit */ + if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; + + { size_t cSize; + if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); + assert(cSize > 0); + assert(cSize <= blockSize + ZSTD_blockHeaderSize); + } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed"); + assert(cSize > 0 || cctx->seqCollector.collectSequences == 1); + } else { + cSize = ZSTD_compressBlock_internal(cctx, + op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, + ip, blockSize, 1 /* frame */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed"); + + if (cSize == 0) { /* block is not compressible */ + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + } else { + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } + } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + /* @savings is employed to ensure that splitting doesn't worsen expansion of incompressible data. + * Without splitting, the maximum expansion is 3 bytes per full block. + * An adversarial input could attempt to fudge the split detector, + * and make it split incompressible data, resulting in more block headers. + * Note that, since ZSTD_COMPRESSBOUND() assumes a worst case scenario of 1KB per block, + * and the splitter never creates blocks that small (current lower limit is 8 KB), + * there is already no risk to expand beyond ZSTD_COMPRESSBOUND() limit. + * But if the goal is to not expand by more than 3-bytes per 128 KB full block, + * then yes, it becomes possible to make the block splitter oversplit incompressible data. + * Using @savings, we enforce an even more conservative condition, + * requiring the presence of enough savings (at least 3 bytes) to authorize splitting, + * otherwise only full blocks are used. + * But being conservative is fine, + * since splitting barely compressible blocks is not fruitful anyway */ + savings += (S64)blockSize - (S64)cSize; + + ip += blockSize; + assert(remaining >= blockSize); + remaining -= blockSize; + op += cSize; + assert(dstCapacity >= cSize); + dstCapacity -= cSize; + cctx->isFirstBlock = 0; + DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u", + (unsigned)cSize); + } } + + if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending; + return (size_t)(op-ostart); +} + + +static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, + const ZSTD_CCtx_params* params, + U64 pledgedSrcSize, U32 dictID) +{ + BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; + U32 const windowSize = (U32)1 << params->cParams.windowLog; + U32 const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize); + BYTE const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3); + U32 const fcsCode = params->fParams.contentSizeFlag ? + (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0; /* 0-3 */ + BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) ); + size_t pos=0; + + assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)); + RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall, + "dst buf is too small to fit worst-case frame header size."); + DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u", + !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode); + if (params->format == ZSTD_f_zstd1) { + MEM_writeLE32(dst, ZSTD_MAGICNUMBER); + pos = 4; + } + op[pos++] = frameHeaderDescriptionByte; + if (!singleSegment) op[pos++] = windowLogByte; + switch(dictIDSizeCode) + { + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; + case 0 : break; + case 1 : op[pos] = (BYTE)(dictID); pos++; break; + case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break; + case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break; + } + switch(fcsCode) + { + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; + case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break; + case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break; + case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break; + case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break; + } + return pos; +} + +/* ZSTD_writeSkippableFrame_advanced() : + * Writes out a skippable frame with the specified magic number variant (16 are supported), + * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data. + * + * Returns the total number of bytes written, or a ZSTD error code. + */ +size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, unsigned magicVariant) { + BYTE* op = (BYTE*)dst; + RETURN_ERROR_IF(dstCapacity < srcSize + ZSTD_SKIPPABLEHEADERSIZE /* Skippable frame overhead */, + dstSize_tooSmall, "Not enough room for skippable frame"); + RETURN_ERROR_IF(srcSize > (unsigned)0xFFFFFFFF, srcSize_wrong, "Src size too large for skippable frame"); + RETURN_ERROR_IF(magicVariant > 15, parameter_outOfBound, "Skippable frame magic number variant not supported"); + + MEM_writeLE32(op, (U32)(ZSTD_MAGIC_SKIPPABLE_START + magicVariant)); + MEM_writeLE32(op+4, (U32)srcSize); + ZSTD_memcpy(op+8, src, srcSize); + return srcSize + ZSTD_SKIPPABLEHEADERSIZE; +} + +/* ZSTD_writeLastEmptyBlock() : + * output an empty Block with end-of-frame mark to complete a frame + * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) + * or an error code if `dstCapacity` is too small (stage == ZSTDcs_init); + assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +} + + +static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 frame, U32 lastFrameChunk) +{ + ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; + size_t fhSize = 0; + + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", + cctx->stage, (unsigned)srcSize); + RETURN_ERROR_IF(cctx->stage==ZSTDcs_created, stage_wrong, + "missing init (ZSTD_compressBegin)"); + + if (frame && (cctx->stage==ZSTDcs_init)) { + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, + cctx->pledgedSrcSizePlusOne-1, cctx->dictID); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + assert(fhSize <= dstCapacity); + dstCapacity -= fhSize; + dst = (char*)dst + fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + if (!srcSize) return fhSize; /* do not generate an empty block if no input */ + + if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) { + ms->forceNonContiguous = 0; + ms->nextToUpdate = ms->window.dictLimit; + } + if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0); + } + + if (!frame) { + /* overflow check and correction for block mode */ + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, + src, (BYTE const*)src + srcSize); + } + + DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSizeMax); + { size_t const cSize = frame ? + ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); + FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed"); + cctx->consumedSrcSize += srcSize; + cctx->producedCSize += (cSize + fhSize); + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); + RETURN_ERROR_IF( + cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne, + srcSize_wrong, + "error : pledgedSrcSize = %u, while realSrcSize >= %u", + (unsigned)cctx->pledgedSrcSizePlusOne-1, + (unsigned)cctx->consumedSrcSize); + } + return cSize + fhSize; + } +} + +size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); +} + +/* NOTE: Must just wrap ZSTD_compressContinue_public() */ +size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); +} + +static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) +{ + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); + return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); +} + +/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ +size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) +{ + return ZSTD_getBlockSize_deprecated(cctx); +} + +/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ +size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); + { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); +} + +/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ +size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); +} + +/*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +static size_t +ZSTD_loadDictionaryContent(ZSTD_MatchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + + /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + + { /* Ensure large dictionaries can't cause index overflow */ + + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ + U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; + + int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); + if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { + /* Some dictionary matchfinders in zstd use "short cache", + * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each + * CDict hashtable entry as a tag rather than as part of an index. + * When short cache is used, we need to truncate the dictionary + * so that its indices don't overlap with the tag. */ + U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; + maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); + assert(!loadLdmDict); + } + + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } + } + + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* We must have cleared our windows when our source is this large. */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); + } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); + + DEBUGLOG(4, "ZSTD_loadDictionaryContent: useRowMatchFinder=%d", (int)params->useRowMatchFinder); + + if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + DEBUGLOG(4, "ZSTD_loadDictionaryContent: Trigger loadLdmDict"); + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); + DEBUGLOG(4, "ZSTD_loadDictionaryContent: ZSTD_ldm_fillHashTable completes"); + } + + /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ + { U32 maxDictSize = 1U << MIN(MAX(params->cParams.hashLog + 3, params->cParams.chainLog + 1), 31); + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } + } + + ms->nextToUpdate = (U32)(ip - ms->window.base); + ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + ms->forceNonContiguous = params->deterministicRefPrefix; + + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); + ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE); + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { + size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); + } else { + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + DEBUGLOG(4, "Fill %u bytes into the Binary Tree", (unsigned)srcSize); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif + break; + + default: + assert(0); /* not possible : not a valid strategy id */ + } + + ms->nextToUpdate = (U32)(iend - ms->window.base); + return 0; +} + + +/* Dictionaries that assign zero probability to symbols that show up causes problems + * when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check + * and only dictionaries with 100% valid symbols can be assumed valid. + */ +static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) +{ + U32 s; + if (dictMaxSymbolValue < maxSymbolValue) { + return FSE_repeat_check; + } + for (s = 0; s <= maxSymbolValue; ++s) { + if (normalizedCounter[s] == 0) { + return FSE_repeat_check; + } + } + return FSE_repeat_valid; +} + +size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + const void* const dict, size_t dictSize) +{ + short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff; + const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */ + const BYTE* const dictEnd = dictPtr + dictSize; + dictPtr += 8; + bs->entropy.huf.repeatMode = HUF_repeat_check; + + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, + (size_t)(dictEnd-dictPtr), &hasZeroWeights); + + /* We only set the loaded table as valid if it contains all non-zero + * weights. Otherwise, we set it to check */ + if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + + { unsigned offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* fill all offset symbols to avoid garbage at end of table */ + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.offcodeCTable, + offcodeNCount, MaxOff, offcodeLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ + dictPtr += offcodeHeaderSize; + } + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.matchlengthCTable, + matchlengthNCount, matchlengthMaxValue, matchlengthLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML); + dictPtr += matchlengthHeaderSize; + } + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.litlengthCTable, + litlengthNCount, litlengthMaxValue, litlengthLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL); + dictPtr += litlengthHeaderSize; + } + + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); + bs->rep[0] = MEM_readLE32(dictPtr+0); + bs->rep[1] = MEM_readLE32(dictPtr+4); + bs->rep[2] = MEM_readLE32(dictPtr+8); + dictPtr += 12; + + { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + U32 offcodeMax = MaxOff; + if (dictContentSize <= ((U32)-1) - 128 KB) { + U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ + offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ + } + /* All offset values <= dictContentSize + 128 KB must be representable for a valid table */ + bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)); + + /* All repCodes must be <= dictContentSize and != 0 */ + { U32 u; + for (u=0; u<3; u++) { + RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, ""); + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + + return (size_t)(dictPtr - (const BYTE*)dict); +} + +/* Dictionary format : + * See : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#dictionary-format + */ +/*! ZSTD_loadZstdDictionary() : + * @return : dictID, or an error code + * assumptions : magic number supposed already checked + * dictSize supposed >= 8 + */ +static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp, + void* workspace) +{ + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; + size_t dictID; + size_t eSize; + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= 8); + assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); + + dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ ); + eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize); + FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed"); + dictPtr += eSize; + + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( + ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; +} + +/** ZSTD_compress_insertDictionary() : +* @return : dictID, or an error code */ +static size_t +ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_MatchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + const ZSTD_CCtx_params* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp, + void* workspace) +{ + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); + if ((dict==NULL) || (dictSize<8)) { + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + return 0; + } + + ZSTD_reset_compressedBlockState(bs); + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) + return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( + ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ + } + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( + bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); +} + +#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) +#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + +/*! ZSTD_compressBegin_internal() : + * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ +static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize; +#if ZSTD_TRACE + cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? ZSTD_trace_compress_begin(cctx) : 0; +#endif + DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + if ( (cdict) + && (cdict->dictContentSize > 0) + && ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF + || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || cdict->compressionLevel == 0) + && (params->attachDictPref != ZSTD_dictForceLoad) ) { + return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); + } + + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + dictContentSize, + ZSTDcrp_makeClean, zbuff) , ""); + { size_t const dictID = cdict ? + ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, + ZSTD_tfp_forCCtx, cctx->tmpWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, + dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; + cctx->dictContentSize = dictContentSize; + } + return 0; +} + +size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog); + /* compression parameters verification and optimization */ + FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) , ""); + return ZSTD_compressBegin_internal(cctx, + dict, dictSize, dictContentType, dtlm, + cdict, + params, pledgedSrcSize, + ZSTDb_not_buffered); +} + +/*! ZSTD_compressBegin_advanced() : +* @return : 0, or an error code */ +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + ZSTD_CCtx_params cctxParams; + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, ZSTD_NO_CLEVEL); + return ZSTD_compressBegin_advanced_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, + NULL /*cdict*/, + &cctxParams, pledgedSrcSize); +} + +static size_t +ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_CCtx_params cctxParams; + { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); +} + +size_t +ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +{ + return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); +} + +size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) +{ + return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); +} + + +/*! ZSTD_writeEpilogue() : +* Ends a frame. +* @return : nb of bytes written into dst (or an error code) */ +static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { + size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; + ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); + RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); + MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } + + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); + DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum); + MEM_writeLE32(op, checksum); + op += 4; + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ + return (size_t)(op-ostart); +} + +void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +{ +#if ZSTD_TRACE + if (cctx->traceCtx && ZSTD_trace_compress_end != NULL) { + int const streaming = cctx->inBuffSize > 0 || cctx->outBuffSize > 0 || cctx->appliedParams.nbWorkers > 0; + ZSTD_Trace trace; + ZSTD_memset(&trace, 0, sizeof(trace)); + trace.version = ZSTD_VERSION_NUMBER; + trace.streaming = streaming; + trace.dictionaryID = cctx->dictID; + trace.dictionarySize = cctx->dictContentSize; + trace.uncompressedSize = cctx->consumedSrcSize; + trace.compressedSize = cctx->producedCSize + extraCSize; + trace.params = &cctx->appliedParams; + trace.cctx = cctx; + ZSTD_trace_compress_end(cctx->traceCtx, &trace); + } + cctx->traceCtx = 0; +#else + (void)cctx; + (void)extraCSize; +#endif +} + +size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, + dst, dstCapacity, src, srcSize, + 1 /* frame mode */, 1 /* last chunk */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed"); + endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize); + FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed"); + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); + DEBUGLOG(4, "end of frame : controlling src size"); + RETURN_ERROR_IF( + cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1, + srcSize_wrong, + "error : pledgedSrcSize = %u, while realSrcSize = %u", + (unsigned)cctx->pledgedSrcSizePlusOne-1, + (unsigned)cctx->consumedSrcSize); + } + ZSTD_CCtx_trace(cctx, endResult); + return cSize + endResult; +} + +/* NOTE: Must just wrap ZSTD_compressEnd_public() */ +size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); +} + +size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params) +{ + DEBUGLOG(4, "ZSTD_compress_advanced"); + FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, ZSTD_NO_CLEVEL); + return ZSTD_compress_advanced_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + &cctx->simpleApiParams); +} + +/* Internal */ +size_t ZSTD_compress_advanced_internal( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + const ZSTD_CCtx_params* params) +{ + DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize); + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); +} + +size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel) +{ + { + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict); + assert(params.fParams.contentSizeFlag == 1); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); + } + DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams); +} + +size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize); + assert(cctx != NULL); + return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel); +} + +size_t ZSTD_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + size_t result; +#if ZSTD_COMPRESS_HEAPMODE + ZSTD_CCtx* cctx = ZSTD_createCCtx(); + RETURN_ERROR_IF(!cctx, memory_allocation, "ZSTD_createCCtx failed"); + result = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); + ZSTD_freeCCtx(cctx); +#else + ZSTD_CCtx ctxBody; + ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem); + result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel); + ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */ +#endif + return result; +} + + +/* ===== Dictionary API ===== */ + +/*! ZSTD_estimateCDictSize_advanced() : + * Estimate amount of memory that will be needed to create a dictionary with following arguments */ +size_t ZSTD_estimateCDictSize_advanced( + size_t dictSize, ZSTD_compressionParameters cParams, + ZSTD_dictLoadMethod_e dictLoadMethod) +{ + DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); + return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small + * in case we are using DDS with row-hash. */ + + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), + /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); +} + +size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy); +} + +size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; /* support sizeof on NULL */ + DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict)); + /* cdict may be in the workspace */ + return (cdict->workspace.workspace == cdict ? 0 : sizeof(*cdict)) + + ZSTD_cwksp_sizeof(&cdict->workspace); +} + +static size_t ZSTD_initCDict_internal( + ZSTD_CDict* cdict, + const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_CCtx_params params) +{ + DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType); + assert(!ZSTD_checkCParams(params.cParams)); + cdict->matchState.cParams = params.cParams; + cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch; + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { + cdict->dictContent = dictBuffer; + } else { + void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*))); + RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!"); + cdict->dictContent = internalBuffer; + ZSTD_memcpy(internalBuffer, dictBuffer, dictSize); + } + cdict->dictContentSize = dictSize; + cdict->dictContentType = dictContentType; + + cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE); + + + /* Reset the state to no dictionary */ + ZSTD_reset_compressedBlockState(&cdict->cBlockState); + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &cdict->matchState, + &cdict->workspace, + ¶ms.cParams, + params.useRowMatchFinder, + ZSTDcrp_makeClean, + ZSTDirp_reset, + ZSTD_resetTarget_CDict), ""); + /* (Maybe) load the dictionary + * Skips loading the dictionary if it is < 8 bytes. + */ + { params.compressionLevel = ZSTD_CLEVEL_DEFAULT; + params.fParams.contentSizeFlag = 1; + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, + dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; + } + } + + return 0; +} + +static ZSTD_CDict* +ZSTD_createCDict_advanced_internal(size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_compressionParameters cParams, + ZSTD_ParamSwitch_e useRowMatchFinder, + int enableDedicatedDictSearch, + ZSTD_customMem customMem) +{ + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + DEBUGLOG(3, "ZSTD_createCDict_advanced_internal (dictSize=%u)", (unsigned)dictSize); + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); + void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); + ZSTD_cwksp ws; + ZSTD_CDict* cdict; + + if (!workspace) { + ZSTD_customFree(workspace, customMem); + return NULL; + } + + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_dynamic_alloc); + + cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); + assert(cdict != NULL); + ZSTD_cwksp_move(&cdict->workspace, &ws); + cdict->customMem = customMem; + cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */ + cdict->useRowMatchFinder = useRowMatchFinder; + return cdict; + } +} + +ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params cctxParams; + ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); + DEBUGLOG(3, "ZSTD_createCDict_advanced, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + ZSTD_CCtxParams_init(&cctxParams, 0); + cctxParams.cParams = cParams; + cctxParams.customMem = customMem; + return ZSTD_createCDict_advanced2( + dictBuffer, dictSize, + dictLoadMethod, dictContentType, + &cctxParams, customMem); +} + +ZSTD_CDict* ZSTD_createCDict_advanced2( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CCtx_params* originalCctxParams, + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params cctxParams = *originalCctxParams; + ZSTD_compressionParameters cParams; + ZSTD_CDict* cdict; + + DEBUGLOG(3, "ZSTD_createCDict_advanced2, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + if (cctxParams.enableDedicatedDictSearch) { + cParams = ZSTD_dedicatedDictSearch_getCParams( + cctxParams.compressionLevel, dictSize); + ZSTD_overrideCParams(&cParams, &cctxParams.cParams); + } else { + cParams = ZSTD_getCParamsFromCCtxParams( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + + if (!ZSTD_dedicatedDictSearch_isSupported(&cParams)) { + /* Fall back to non-DDSS params */ + cctxParams.enableDedicatedDictSearch = 0; + cParams = ZSTD_getCParamsFromCCtxParams( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + + DEBUGLOG(3, "ZSTD_createCDict_advanced2: DedicatedDictSearch=%u", cctxParams.enableDedicatedDictSearch); + cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + + cdict = ZSTD_createCDict_advanced_internal(dictSize, + dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); + + if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { + ZSTD_freeCDict(cdict); + return NULL; + } + + return cdict; +} + +ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byCopy, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); + if (cdict) + cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel; + return cdict; +} + +ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byRef, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); + if (cdict) + cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel; + return cdict; +} + +size_t ZSTD_freeCDict(ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; /* support free on NULL */ + { ZSTD_customMem const cMem = cdict->customMem; + int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict); + ZSTD_cwksp_free(&cdict->workspace, cMem); + if (!cdictInWorkspace) { + ZSTD_customFree(cdict, cMem); + } + return 0; + } +} + +/*! ZSTD_initStaticCDict_advanced() : + * Generate a digested dictionary in provided memory area. + * workspace: The memory area to emplace the dictionary into. + * Provided pointer must 8-bytes aligned. + * It must outlive dictionary usage. + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level + * into its relevant cParams. + * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. + */ +const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) +{ + ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + matchStateSize; + ZSTD_CDict* cdict; + ZSTD_CCtx_params params; + + DEBUGLOG(4, "ZSTD_initStaticCDict (dictSize==%u)", (unsigned)dictSize); + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { + ZSTD_cwksp ws; + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc); + cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); + if (cdict == NULL) return NULL; + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + + if (workspaceSize < neededSize) return NULL; + + ZSTD_CCtxParams_init(¶ms, 0); + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; + cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + params) )) + return NULL; + + return cdict; +} + +ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) +{ + assert(cdict != NULL); + return cdict->matchState.cParams; +} + +/*! ZSTD_getDictID_fromCDict() : + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; + return cdict->dictID; +} + +/* ZSTD_compressBegin_usingCDict_internal() : + * Implementation of various ZSTD_compressBegin_usingCDict* functions. + */ +static size_t ZSTD_compressBegin_usingCDict_internal( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + ZSTD_CCtx_params cctxParams; + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); + RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); + /* Initialize the cctxParams from the cdict */ + { + ZSTD_parameters params; + params.fParams = fParams; + params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF + || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || cdict->compressionLevel == 0 ) ? + ZSTD_getCParamsFromCDict(cdict) + : ZSTD_getCParams(cdict->compressionLevel, + pledgedSrcSize, + cdict->dictContentSize); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, cdict->compressionLevel); + } + /* Increase window log to fit the entire dictionary and source if the + * source size is known. Limit the increase to 19, which is the + * window log for compression level 1 with the largest source size. + */ + if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); + U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; + cctxParams.cParams.windowLog = MAX(cctxParams.cParams.windowLog, limitedSrcLog); + } + return ZSTD_compressBegin_internal(cctx, + NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, + cdict, + &cctxParams, pledgedSrcSize, + ZSTDb_not_buffered); +} + + +/* ZSTD_compressBegin_usingCDict_advanced() : + * This function is DEPRECATED. + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize); +} + +/* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); +} + +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); +} + +/*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); +} + +/*! ZSTD_compress_usingCDict_advanced(): + * This function is DEPRECATED. + */ +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary. + * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. + * Note that compression parameters are decided at CDict creation time + * while frame parameters are hardcoded */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} + + + +/* ****************************************************************** +* Streaming +********************************************************************/ + +ZSTD_CStream* ZSTD_createCStream(void) +{ + DEBUGLOG(3, "ZSTD_createCStream"); + return ZSTD_createCStream_advanced(ZSTD_defaultCMem); +} + +ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticCCtx(workspace, workspaceSize); +} + +ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem) +{ /* CStream and CCtx are now same object */ + return ZSTD_createCCtx_advanced(customMem); +} + +size_t ZSTD_freeCStream(ZSTD_CStream* zcs) +{ + return ZSTD_freeCCtx(zcs); /* same object */ +} + + + +/*====== Initialization ======*/ + +size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_CStreamOutSize(void) +{ + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; +} + +static ZSTD_CParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) +{ + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; + else + return ZSTD_cpm_noAttachDict; +} + +/* ZSTD_resetCStream(): + * pledgedSrcSize == 0 means "unknown" */ +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) +{ + /* temporary : 0 interpreted as "unknown" during transition period. + * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. + * 0 will be interpreted as "empty" in the future. + */ + U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + return 0; +} + +/*! ZSTD_initCStream_internal() : + * Note : for lib/compress only. Used by zstdmt_compress.c. + * Assumption 1 : params are valid + * Assumption 2 : either dict, or cdict, is defined, not both */ +size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_internal"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + zcs->requestedParams = *params; + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + if (dict) { + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + } else { + /* Dictionary is cleared if !cdict */ + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + } + return 0; +} + +/* ZSTD_initCStream_usingCDict_advanced() : + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */ +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + zcs->requestedParams.fParams = fParams; + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + return 0; +} + +/* note : cdict must outlive compression session */ +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + return 0; +} + + +/* ZSTD_initCStream_advanced() : + * pledgedSrcSize must be exact. + * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. + * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */ +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pss) +{ + /* for compatibility with older programs relying on this behavior. + * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. + * This line will be removed in the future. + */ + U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_initCStream_advanced"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + ZSTD_CCtxParams_setZstdParams(&zcs->requestedParams, ¶ms); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + return 0; +} + +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingDict"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + return 0; +} + +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) +{ + /* temporary : 0 interpreted as "unknown" during transition period. + * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. + * 0 will be interpreted as "empty" in the future. + */ + U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_initCStream_srcSize"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + return 0; +} + +size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_initCStream"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + return 0; +} + +/*====== Compression ======*/ + +static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) +{ + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + return cctx->blockSizeMax - cctx->stableIn_notConsumed; + } + assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); + { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; + if (hintInSize==0) hintInSize = cctx->blockSizeMax; + return hintInSize; + } +} + +/** ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants + * @return : hint size for next input to complete ongoing block */ +static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) +{ + const char* const istart = (assert(input != NULL), (const char*)input->src); + const char* const iend = (istart != NULL) ? istart + input->size : istart; + const char* ip = (istart != NULL) ? istart + input->pos : istart; + char* const ostart = (assert(output != NULL), (char*)output->dst); + char* const oend = (ostart != NULL) ? ostart + output->size : ostart; + char* op = (ostart != NULL) ? ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ + DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); + assert(zcs != NULL); + if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { + assert(input->pos >= zcs->stableIn_notConsumed); + input->pos -= zcs->stableIn_notConsumed; + if (ip) ip -= zcs->stableIn_notConsumed; + zcs->stableIn_notConsumed = 0; + } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); + } + if (zcs->appliedParams.outBufferMode == ZSTD_bm_buffered) { + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } + if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); + if (output->dst == NULL) assert(output->size == 0); + assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { + switch(zcs->streamStage) + { + case zcss_init: + RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!"); + + case zcss_load: + if ( (flushMode == ZSTD_e_end) + && ( (size_t)(oend-op) >= ZSTD_compressBound((size_t)(iend-ip)) /* Enough output space */ + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ + size_t const cSize = ZSTD_compressEnd_public(zcs, + op, (size_t)(oend-op), + ip, (size_t)(iend-ip)); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); + ip = iend; + op += cSize; + zcs->frameEnded = 1; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + someMoreWork = 0; break; + } + /* complete loading into inBuffer in buffered mode */ + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, (size_t)(iend-ip)); + zcs->inBuffPos += loaded; + if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (zcs->inBuffPos == zcs->inToCompress) ) { + /* empty */ + someMoreWork = 0; break; + } + } else { + assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + if ( (flushMode == ZSTD_e_continue) + && ( (size_t)(iend - ip) < zcs->blockSizeMax) ) { + /* can't compress a full block : stop here */ + zcs->stableIn_notConsumed = (size_t)(iend - ip); + ip = iend; /* pretend to have consumed input */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (ip == iend) ) { + /* empty */ + someMoreWork = 0; break; + } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); + { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); + void* cDst; + size_t cSize; + size_t oSize = (size_t)(oend-op); + size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress + : MIN((size_t)(iend - ip), zcs->blockSizeMax); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else + cDst = zcs->outBuff, oSize = zcs->outBuffSize; + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? + ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : + ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ + zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSizeMax; + if (zcs->inBuffTarget > zcs->inBuffSize) + zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSizeMax; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; + } else { /* !inputBuffered, hence ZSTD_bm_stable */ + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? + ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : + ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ + if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed directly in outBuffer"); + someMoreWork = 0; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + } + break; + } + zcs->outBuffContentSize = cSize; + zcs->outBuffFlushedSize = 0; + zcs->streamStage = zcss_flush; /* pass-through to flush stage */ + } + ZSTD_FALLTHROUGH; + case zcss_flush: + DEBUGLOG(5, "flush stage"); + assert(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered); + { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op), + zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u", + (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed); + if (flushed) + op += flushed; + zcs->outBuffFlushedSize += flushed; + if (toFlush!=flushed) { + /* flush not fully completed, presumably because dst is too small */ + assert(op==oend); + someMoreWork = 0; + break; + } + zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed on flush"); + someMoreWork = 0; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + break; + } + zcs->streamStage = zcss_load; + break; + } + + default: /* impossible */ + assert(0); + } + } + + input->pos = (size_t)(ip - istart); + output->pos = (size_t)(op - ostart); + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); +} + +static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers >= 1) { + assert(cctx->mtctx != NULL); + return ZSTDMT_nextInputSizeHint(cctx->mtctx); + } +#endif + return ZSTD_nextInputSizeHint(cctx); + +} + +size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , ""); + return ZSTD_nextInputSizeHint_MTorST(zcs); +} + +/* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +static void +ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) +{ + DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + cctx->expectedOutBufferSize = output->size - output->pos; + } +} + +/* Validate that the input/output buffers match the expectations set by + * ZSTD_setBufferExpectations. + */ +static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + ZSTD_outBuffer const* output, + ZSTD_inBuffer const* input, + ZSTD_EndDirective endOp) +{ + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; + if (expect.src != input->src || expect.pos != input->pos) + RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } + (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) + RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; +} + +/* + * If @endOp == ZSTD_e_end, @inSize becomes pledgedSrcSize. + * Otherwise, it's ignored. + * @return: 0 on success, or a ZSTD_error code otherwise. + */ +static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, + size_t inSize) +{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ + ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ + assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ + if (cctx->cdict && !cctx->localDict.cdict) { + /* Let the cdict's compression level take priority over the requested params. + * But do not take the cdict's compression level if the "cdict" is actually a localDict + * generated from ZSTD_initLocalDict(). + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_CCtx_init_compressStream2 : transparent init stage"); + if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ + + { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_CParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } + + params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); + params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); + params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); + params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + +#ifdef ZSTD_MULTITHREAD + /* If external matchfinder is enabled, make sure to fail before checking job size (for consistency) */ + RETURN_ERROR_IF( + ZSTD_hasExtSeqProd(¶ms) && params.nbWorkers >= 1, + parameter_combination_unsupported, + "External sequence producer isn't supported with nbWorkers >= 1" + ); + + if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { + params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ + } + if (params.nbWorkers > 0) { +# if ZSTD_TRACE + cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? ZSTD_trace_compress_begin(cctx) : 0; +# endif + /* mt context creation */ + if (cctx->mtctx == NULL) { + DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u", + params.nbWorkers); + cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem, cctx->pool); + RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!"); + } + /* mt compression */ + DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers); + FORWARD_IF_ERROR( ZSTDMT_initCStream_internal( + cctx->mtctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, + cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , ""); + cctx->dictID = cctx->cdict ? cctx->cdict->dictID : 0; + cctx->dictContentSize = cctx->cdict ? cctx->cdict->dictContentSize : prefixDict.dictSize; + cctx->consumedSrcSize = 0; + cctx->producedCSize = 0; + cctx->streamStage = zcss_load; + cctx->appliedParams = params; + } else +#endif /* ZSTD_MULTITHREAD */ + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, ZSTD_dtlm_fast, + cctx->cdict, + ¶ms, pledgedSrcSize, + ZSTDb_buffered) , ""); + assert(cctx->appliedParams.nbWorkers == 0); + cctx->inToCompress = 0; + cctx->inBuffPos = 0; + if (cctx->appliedParams.inBufferMode == ZSTD_bm_buffered) { + /* for small input: avoid automatic flush on reaching end of block, since + * it would require to add a 3-bytes null block to end frame + */ + cctx->inBuffTarget = cctx->blockSizeMax + (cctx->blockSizeMax == pledgedSrcSize); + } else { + cctx->inBuffTarget = 0; + } + cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; + cctx->streamStage = zcss_load; + cctx->frameEnded = 0; + } + return 0; +} + +/* @return provides a minimum amount of data remaining to be flushed from internal buffers + */ +size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp); + /* check conditions */ + RETURN_ERROR_IF(output->pos > output->size, dstSize_tooSmall, "invalid output buffer"); + RETURN_ERROR_IF(input->pos > input->size, srcSize_wrong, "invalid input buffer"); + RETURN_ERROR_IF((U32)endOp > (U32)ZSTD_e_end, parameter_outOfBound, "invalid endDirective"); + assert(cctx != NULL); + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { + size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ + size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; + if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ + && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ + && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ + if (cctx->stableIn_notConsumed) { /* not the first time */ + /* check stable source guarantees */ + RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); + RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); + } + /* pretend input was consumed, to give a sense forward progress */ + input->pos = input->size; + /* save stable inBuffer, for later control, and flush/end */ + cctx->expectedInBuffer = *input; + /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ + cctx->stableIn_notConsumed += inputSize; + /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ + return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ + } + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); + ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + + FORWARD_IF_ERROR(ZSTD_checkBufferStability(cctx, output, input, endOp), "invalid buffers"); + /* compression stage */ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + size_t flushMin; + if (cctx->cParamsChanged) { + ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams); + cctx->cParamsChanged = 0; + } + if (cctx->stableIn_notConsumed) { + assert(cctx->appliedParams.inBufferMode == ZSTD_bm_stable); + /* some early data was skipped - make it available for consumption */ + assert(input->pos >= cctx->stableIn_notConsumed); + input->pos -= cctx->stableIn_notConsumed; + cctx->stableIn_notConsumed = 0; + } + for (;;) { + size_t const ipos = input->pos; + size_t const opos = output->pos; + flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp); + cctx->consumedSrcSize += (U64)(input->pos - ipos); + cctx->producedCSize += (U64)(output->pos - opos); + if ( ZSTD_isError(flushMin) + || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */ + if (flushMin == 0) + ZSTD_CCtx_trace(cctx, 0); + ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); + } + FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed"); + + if (endOp == ZSTD_e_continue) { + /* We only require some progress with ZSTD_e_continue, not maximal progress. + * We're done if we've consumed or produced any bytes, or either buffer is + * full. + */ + if (input->pos != ipos || output->pos != opos || input->pos == input->size || output->pos == output->size) + break; + } else { + assert(endOp == ZSTD_e_flush || endOp == ZSTD_e_end); + /* We require maximal progress. We're done when the flush is complete or the + * output buffer is full. + */ + if (flushMin == 0 || output->pos == output->size) + break; + } + } + DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic"); + /* Either we don't require maximum forward progress, we've finished the + * flush, or we are out of output space. + */ + assert(endOp == ZSTD_e_continue || flushMin == 0 || output->pos == output->size); + ZSTD_setBufferExpectations(cctx, output, input); + return flushMin; + } +#endif /* ZSTD_MULTITHREAD */ + FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , ""); + DEBUGLOG(5, "completed ZSTD_compressStream2"); + ZSTD_setBufferExpectations(cctx, output, input); + return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */ +} + +size_t ZSTD_compressStream2_simpleArgs ( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) +{ + ZSTD_outBuffer output; + ZSTD_inBuffer input; + output.dst = dst; + output.size = dstCapacity; + output.pos = *dstPos; + input.src = src; + input.size = srcSize; + input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ + { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; + } +} + +size_t ZSTD_compress2(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + ZSTD_bufferMode_e const originalInBufferMode = cctx->requestedParams.inBufferMode; + ZSTD_bufferMode_e const originalOutBufferMode = cctx->requestedParams.outBufferMode; + DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize); + ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); + /* Enable stable input/output buffers. */ + cctx->requestedParams.inBufferMode = ZSTD_bm_stable; + cctx->requestedParams.outBufferMode = ZSTD_bm_stable; + { size_t oPos = 0; + size_t iPos = 0; + size_t const result = ZSTD_compressStream2_simpleArgs(cctx, + dst, dstCapacity, &oPos, + src, srcSize, &iPos, + ZSTD_e_end); + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; + + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); + RETURN_ERROR(dstSize_tooSmall, ""); + } + assert(iPos == srcSize); /* all input is expected consumed */ + return oPos; + } +} + +/* ZSTD_validateSequence() : + * @offBase : must use the format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ +static size_t +ZSTD_validateSequence(U32 offBase, U32 matchLength, U32 minMatch, + size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) +{ + U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; + size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4; + RETURN_ERROR_IF(offBase > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); + /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ + RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; +} + +/* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) +{ + U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { + offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { + offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { + offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { + offBase = REPCODE3_TO_OFFBASE; + } + return offBase; +} + +/* This function scans through an array of ZSTD_Sequence, + * storing the sequences it reads, until it reaches a block delimiter. + * Note that the block delimiter includes the last literals of the block. + * @blockSize must be == sum(sequence_lengths). + * @returns @blockSize on success, and a ZSTD_error otherwise. + */ +static size_t +ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, + ZSTD_SequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, + ZSTD_ParamSwitch_e externalRepSearch) +{ + U32 idx = seqPos->idx; + U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + Repcodes_t updatedRepcodes; + U32 dictSize; + + DEBUGLOG(5, "ZSTD_transferSequences_wBlockDelim (blockSize = %zu)", blockSize); + + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { + dictSize = (U32)cctx->prefixDict.dictSize; + } else { + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; + U32 const matchLength = inSeqs[idx].matchLength; + U32 offBase; + + if (externalRepSearch == ZSTD_ps_disable) { + offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); + } else { + U32 const ll0 = (litLength == 0); + offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; + FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, + seqPos->posInSrc, + cctx->appliedParams.cParams.windowLog, dictSize, + ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } + RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } + RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found."); + + /* If we skipped repcode search while parsing, we need to update repcodes now */ + assert(externalRepSearch != ZSTD_ps_auto); + assert(idx >= startIdx); + if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { + U32* const rep = updatedRepcodes.rep; + U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ + + if (lastSeqIdx >= startIdx + 2) { + rep[2] = inSeqs[lastSeqIdx - 2].offset; + rep[1] = inSeqs[lastSeqIdx - 1].offset; + rep[0] = inSeqs[lastSeqIdx].offset; + } else if (lastSeqIdx == startIdx + 1) { + rep[2] = rep[0]; + rep[1] = inSeqs[lastSeqIdx - 1].offset; + rep[0] = inSeqs[lastSeqIdx].offset; + } else { + assert(lastSeqIdx == startIdx); + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = inSeqs[lastSeqIdx].offset; + } + } + + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + if (inSeqs[idx].litLength) { + DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, inSeqs[idx].litLength); + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } + RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return blockSize; +} + +/* + * This function attempts to scan through @blockSize bytes in @src + * represented by the sequences in @inSeqs, + * storing any (partial) sequences. + * + * Occasionally, we may want to reduce the actual number of bytes consumed from @src + * to avoid splitting a match, notably if it would produce a match smaller than MINMATCH. + * + * @returns the number of bytes consumed from @src, necessarily <= @blockSize. + * Otherwise, it may return a ZSTD error if something went wrong. + */ +static size_t +ZSTD_transferSequences_noDelim(ZSTD_CCtx* cctx, + ZSTD_SequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, + ZSTD_ParamSwitch_e externalRepSearch) +{ + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; + U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; + size_t dictSize; + const BYTE* const istart = (const BYTE*)(src); + const BYTE* ip = istart; + const BYTE* iend = istart + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ + Repcodes_t updatedRepcodes; + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + + /* TODO(embg) support fast parsing mode in noBlockDelim mode */ + (void)externalRepSearch; + + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { + dictSize = cctx->prefixDict.dictSize; + } else { + dictSize = 0; + } + DEBUGLOG(5, "ZSTD_transferSequences_noDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { + const ZSTD_Sequence currSeq = inSeqs[idx]; + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; + U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { + if (startPosInSequence >= litLength) { + startPosInSequence -= litLength; + litLength = 0; + matchLength -= startPosInSequence; + } else { + litLength -= startPosInSequence; + } + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. So, we have to split the sequence */ + DEBUGLOG(6, "Require a split: diff: %u, idx: %u PIS: %u", + currSeq.litLength + currSeq.matchLength - endPosInSequence, idx, endPosInSequence); + if (endPosInSequence > litLength) { + U32 firstHalfMatchLength; + litLength = startPosInSequence >= litLength ? 0 : litLength - startPosInSequence; + firstHalfMatchLength = endPosInSequence - startPosInSequence - litLength; + if (matchLength > blockSize && firstHalfMatchLength >= cctx->appliedParams.cParams.minMatch) { + /* Only ever split the match if it is larger than the block size */ + U32 secondHalfMatchLength = currSeq.matchLength + currSeq.litLength - endPosInSequence; + if (secondHalfMatchLength < cctx->appliedParams.cParams.minMatch) { + /* Move the endPosInSequence backward so that it creates match of minMatch length */ + endPosInSequence -= cctx->appliedParams.cParams.minMatch - secondHalfMatchLength; + bytesAdjustment = cctx->appliedParams.cParams.minMatch - secondHalfMatchLength; + firstHalfMatchLength -= bytesAdjustment; + } + matchLength = firstHalfMatchLength; + /* Flag that we split the last match - after storing the sequence, exit the loop, + but keep the value of endPosInSequence */ + finalMatchSplit = 1; + } else { + /* Move the position in sequence backwards so that we don't split match, and break to store + * the last literals. We use the original currSeq.litLength as a marker for where endPosInSequence + * should go. We prefer to do this whenever it is not necessary to split the match, or if doing so + * would cause the first half of the match to be too small + */ + bytesAdjustment = endPosInSequence - currSeq.litLength; + endPosInSequence = currSeq.litLength; + break; + } + } else { + /* This sequence ends inside the literals, break to store the last literals */ + break; + } + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); + offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; + FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, + cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + if (!finalMatchSplit) + idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); + seqPos->idx = idx; + seqPos->posInSequence = endPosInSequence; + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + iend -= bytesAdjustment; + if (ip != iend) { + /* Store any last literals */ + U32 const lastLLSize = (U32)(iend - ip); + assert(ip <= iend); + DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize); + seqPos->posInSrc += lastLLSize; + } + + return (size_t)(iend-istart); +} + +/* @seqPos represents a position within @inSeqs, + * it is read and updated by this function, + * once the goal to produce a block of size @blockSize is reached. + * @return: nb of bytes consumed from @src, necessarily <= @blockSize. + */ +typedef size_t (*ZSTD_SequenceCopier_f)(ZSTD_CCtx* cctx, + ZSTD_SequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, + ZSTD_ParamSwitch_e externalRepSearch); + +static ZSTD_SequenceCopier_f ZSTD_selectSequenceCopier(ZSTD_SequenceFormat_e mode) +{ + assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, (int)mode)); + if (mode == ZSTD_sf_explicitBlockDelimiters) { + return ZSTD_transferSequences_wBlockDelim; + } + assert(mode == ZSTD_sf_noBlockDelimiters); + return ZSTD_transferSequences_noDelim; +} + +/* Discover the size of next block by searching for the delimiter. + * Note that a block delimiter **must** exist in this mode, + * otherwise it's an input error. + * The block size retrieved will be later compared to ensure it remains within bounds */ +static size_t +blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) +{ + int end = 0; + size_t blockSize = 0; + size_t spos = seqPos.idx; + DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); + assert(spos <= inSeqsSize); + while (spos < inSeqsSize) { + end = (inSeqs[spos].offset == 0); + blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; + if (end) { + if (inSeqs[spos].matchLength != 0) + RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); + break; + } + spos++; + } + if (!end) + RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); + return blockSize; +} + +static size_t determine_blockSize(ZSTD_SequenceFormat_e mode, + size_t blockSize, size_t remaining, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + ZSTD_SequencePosition seqPos) +{ + DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); + if (mode == ZSTD_sf_noBlockDelimiters) { + /* Note: more a "target" block size */ + return MIN(remaining, blockSize); + } + assert(mode == ZSTD_sf_explicitBlockDelimiters); + { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); + FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); + if (explicitBlockSize > blockSize) + RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); + if (explicitBlockSize > remaining) + RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); + return explicitBlockSize; + } +} + +/* Compress all provided sequences, block-by-block. + * + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error. + */ +static size_t +ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ + size_t cSize = 0; + size_t remaining = srcSize; + ZSTD_SequencePosition seqPos = {0, 0, 0}; + + const BYTE* ip = (BYTE const*)src; + BYTE* op = (BYTE*)dst; + ZSTD_SequenceCopier_f const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + + DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); + /* Special case: empty frame */ + if (remaining == 0) { + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "No room for empty frame block header"); + MEM_writeLE32(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + cSize += ZSTD_blockHeaderSize; + } + + while (remaining) { + size_t compressedSeqsSize; + size_t cBlockSize; + size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, + cctx->blockSizeMax, remaining, + inSeqs, inSeqsSize, seqPos); + U32 const lastBlock = (blockSize == remaining); + FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); + assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); + + blockSize = sequenceCopier(cctx, + &seqPos, inSeqs, inSeqsSize, + ip, blockSize, + cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(blockSize, "Bad sequence copy"); + + /* If blocks are too small, emit as a nocompress block */ + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. We need to revisit and change this logic to be more consistent */ + if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); + DEBUGLOG(5, "Block too small (%zu): data remains uncompressed: cSize=%zu", blockSize, cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; + remaining -= blockSize; + dstCapacity -= cBlockSize; + continue; + } + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + blockSize, + cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); + DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && + ZSTD_isRLE(ip, blockSize)) { + /* Note: don't emit the first block as RLE even if it qualifies because + * doing so will cause the decoder (cli <= v1.4.3 only) to throw an (invalid) error + * "should consume all input error." + */ + compressedSeqsSize = 1; + } + + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); + DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); + if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + /* Write block header into beginning of block*/ + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; + DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; + + if (lastBlock) { + break; + } else { + ip += blockSize; + op += cBlockSize; + remaining -= blockSize; + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } + DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + + DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; +} + +size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ + BYTE* op = (BYTE*)dst; + size_t cSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ + DEBUGLOG(4, "ZSTD_compressSequences (nbSeqs=%zu,dstCapacity=%zu)", inSeqsSize, dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + + /* Begin writing output, starting with frame header */ + { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, + &cctx->appliedParams, srcSize, cctx->dictID); + op += frameHeaderSize; + assert(frameHeaderSize <= dstCapacity); + dstCapacity -= frameHeaderSize; + cSize += frameHeaderSize; + } + if (cctx->appliedParams.fParams.checksumFlag && srcSize) { + XXH64_update(&cctx->xxhState, src, srcSize); + } + + /* Now generate compressed blocks */ + { size_t const cBlocksSize = ZSTD_compressSequences_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + src, srcSize); + FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); + cSize += cBlocksSize; + assert(cBlocksSize <= dstCapacity); + dstCapacity -= cBlocksSize; + } + + /* Complete with frame checksum, if needed */ + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); + DEBUGLOG(4, "Write checksum : %08X", (unsigned)checksum); + MEM_writeLE32((char*)dst + cSize, checksum); + cSize += 4; + } + + DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; +} + + +#if defined(__AVX2__) + +#include /* AVX2 intrinsics */ + +/* + * Convert 2 sequences per iteration, using AVX2 intrinsics: + * - offset -> offBase = offset + 2 + * - litLength -> (U16) litLength + * - matchLength -> (U16)(matchLength - 3) + * - rep is ignored + * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). + * + * At the end, instead of extracting two __m128i, + * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, + * then store the lower 16 bytes in one go. + * + * @returns 0 on succes, with no long length detected + * @returns > 0 if there is one long length (> 65535), + * indicating the position, and type. + */ +static size_t convertSequences_noRepcodes( + SeqDef* dstSeqs, + const ZSTD_Sequence* inSeqs, + size_t nbSequences) +{ + /* + * addition: + * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) + */ + const __m256i addition = _mm256_setr_epi32( + ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ + ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ + ); + + /* limit: check if there is a long length */ + const __m256i limit = _mm256_set1_epi32(65535); + + /* + * shuffle mask for byte-level rearrangement in each 128-bit half: + * + * Input layout (after addition) per 128-bit half: + * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] + * We only need: + * offBase (4 bytes) = offset+2 + * litLength (2 bytes) = low 2 bytes of litLength + * mlBase (2 bytes) = low 2 bytes of (matchLength) + * => Bytes [0..3, 4..5, 8..9], zero the rest. + */ + const __m256i mask = _mm256_setr_epi8( + /* For the lower 128 bits => sequence i */ + 0, 1, 2, 3, /* offset+2 */ + 4, 5, /* litLength (16 bits) */ + 8, 9, /* matchLength (16 bits) */ + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, + + /* For the upper 128 bits => sequence i+1 */ + 16,17,18,19, /* offset+2 */ + 20,21, /* litLength */ + 24,25, /* matchLength */ + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 + ); + + /* + * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). + * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. + * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1. + */ +#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ + + size_t longLen = 0, i = 0; + + /* AVX permutation depends on the specific definition of target structures */ + ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); + ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); + + /* Process 2 sequences per loop iteration */ + for (; i + 1 < nbSequences; i += 2) { + /* Load 2 ZSTD_Sequence (32 bytes) */ + __m256i vin = _mm256_loadu_si256((const __m256i*)(const void*)&inSeqs[i]); + + /* Add {2, 0, -3, 0} in each 128-bit half */ + __m256i vadd = _mm256_add_epi32(vin, addition); + + /* Check for long length */ + __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ + int ll_res = _mm256_movemask_epi8(ll_cmp); + + /* Shuffle bytes so each half gives us the 8 bytes we need */ + __m256i vshf = _mm256_shuffle_epi8(vadd, mask); + /* + * Now: + * Lane0 = seq0's 8 bytes + * Lane1 = 0 + * Lane2 = seq1's 8 bytes + * Lane3 = 0 + */ + + /* Permute 64-bit lanes => move Lane2 down into Lane1. */ + __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); + /* + * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. + * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. + */ + + /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ + _mm_storeu_si128((__m128i *)(void*)&dstSeqs[i], _mm256_castsi256_si128(vperm)); + /* + * This writes out 16 bytes total: + * - offset 0..7 => seq0 (offBase, litLength, mlBase) + * - offset 8..15 => seq1 (offBase, litLength, mlBase) + */ + + /* check (unlikely) long lengths > 65535 + * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] + * => combined mask = 0x0FF00FF0 + */ + if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { + /* long length detected: let's figure out which one*/ + if (inSeqs[i].matchLength > 65535+MINMATCH) { + assert(longLen == 0); + longLen = i + 1; + } + if (inSeqs[i].litLength > 65535) { + assert(longLen == 0); + longLen = i + nbSequences + 1; + } + if (inSeqs[i+1].matchLength > 65535+MINMATCH) { + assert(longLen == 0); + longLen = i + 1 + 1; + } + if (inSeqs[i+1].litLength > 65535) { + assert(longLen == 0); + longLen = i + 1 + nbSequences + 1; + } + } + } + + /* Handle leftover if @nbSequences is odd */ + if (i < nbSequences) { + /* process last sequence */ + assert(i == nbSequences - 1); + dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); + dstSeqs[i].litLength = (U16)inSeqs[i].litLength; + dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + /* check (unlikely) long lengths > 65535 */ + if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { + assert(longLen == 0); + longLen = i + 1; + } + if (UNLIKELY(inSeqs[i].litLength > 65535)) { + assert(longLen == 0); + longLen = i + nbSequences + 1; + } + } + + return longLen; +} + +/* the vector implementation could also be ported to SSSE3, + * but since this implementation is targeting modern systems (>= Sapphire Rapid), + * it's not useful to develop and maintain code for older pre-AVX2 platforms */ + +#else /* no AVX2 */ + +static size_t convertSequences_noRepcodes( + SeqDef* dstSeqs, + const ZSTD_Sequence* inSeqs, + size_t nbSequences) +{ + size_t longLen = 0; + size_t n; + for (n=0; n 65535 */ + if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { + assert(longLen == 0); + longLen = n + 1; + } + if (UNLIKELY(inSeqs[n].litLength > 65535)) { + assert(longLen == 0); + longLen = n + nbSequences + 1; + } + } + return longLen; +} + +#endif + +/* + * Precondition: Sequences must end on an explicit Block Delimiter + * @return: 0 on success, or an error code. + * Note: Sequence validation functionality has been disabled (removed). + * This is helpful to generate a lean main pipeline, improving performance. + * It may be re-inserted later. + */ +size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences, + int repcodeResolution) +{ + Repcodes_t updatedRepcodes; + size_t seqNb = 0; + + DEBUGLOG(5, "ZSTD_convertBlockSequences (nbSequences = %zu)", nbSequences); + + RETURN_ERROR_IF(nbSequences >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); + + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + + /* check end condition */ + assert(nbSequences >= 1); + assert(inSeqs[nbSequences-1].matchLength == 0); + assert(inSeqs[nbSequences-1].offset == 0); + + /* Convert Sequences from public format to internal format */ + if (!repcodeResolution) { + size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); + cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; + if (longl) { + DEBUGLOG(5, "long length"); + assert(cctx->seqStore.longLengthType == ZSTD_llt_none); + if (longl <= nbSequences-1) { + DEBUGLOG(5, "long match length detected at pos %zu", longl-1); + cctx->seqStore.longLengthType = ZSTD_llt_matchLength; + cctx->seqStore.longLengthPos = (U32)(longl-1); + } else { + DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); + assert(longl <= 2* (nbSequences-1)); + cctx->seqStore.longLengthType = ZSTD_llt_literalLength; + cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); + } + } + } else { + for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { + U32 const litLength = inSeqs[seqNb].litLength; + U32 const matchLength = inSeqs[seqNb].matchLength; + U32 const ll0 = (litLength == 0); + U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); + + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + } + + /* If we skipped repcode search while parsing, we need to update repcodes now */ + if (!repcodeResolution && nbSequences > 1) { + U32* const rep = updatedRepcodes.rep; + + if (nbSequences >= 4) { + U32 lastSeqIdx = (U32)nbSequences - 2; /* index of last full sequence */ + rep[2] = inSeqs[lastSeqIdx - 2].offset; + rep[1] = inSeqs[lastSeqIdx - 1].offset; + rep[0] = inSeqs[lastSeqIdx].offset; + } else if (nbSequences == 3) { + rep[2] = rep[0]; + rep[1] = inSeqs[0].offset; + rep[0] = inSeqs[1].offset; + } else { + assert(nbSequences == 2); + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = inSeqs[0].offset; + } + } + + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + return 0; +} + +#if defined(ZSTD_ARCH_X86_AVX2) + +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) +{ + size_t i; + __m256i const zeroVec = _mm256_setzero_si256(); + __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ + ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ + size_t mSum = 0, lSum = 0; + ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); + + /* Process 2 structs (32 bytes) at a time */ + for (i = 0; i + 2 <= nbSeqs; i += 2) { + /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ + __m256i data = _mm256_loadu_si256((const __m256i*)(const void*)&seqs[i]); + /* check end of block signal */ + __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); + int cmp_res = _mm256_movemask_epi8(cmp); + /* indices for match lengths correspond to bits [8..11], [24..27] + * => combined mask = 0x0F000F00 */ + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); + if (cmp_res & 0x0F000F00) break; + /* Accumulate in sumVec */ + sumVec = _mm256_add_epi32(sumVec, data); + } + + /* Horizontal reduction */ + _mm256_store_si256((__m256i*)tmp, sumVec); + lSum = tmp[1] + tmp[5]; + mSum = tmp[2] + tmp[6]; + + /* Handle the leftover */ + for (; i < nbSeqs; i++) { + lSum += seqs[i].litLength; + mSum += seqs[i].matchLength; + if (seqs[i].matchLength == 0) break; /* end of block */ + } + + if (i==nbSeqs) { + /* reaching end of sequences: end of block signal was not present */ + BlockSummary bs; + bs.nbSequences = ERROR(externalSequences_invalid); + return bs; + } + { BlockSummary bs; + bs.nbSequences = i+1; + bs.blockSize = lSum + mSum; + bs.litSize = lSum; + return bs; + } +} + +#else + +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) +{ + size_t totalMatchSize = 0; + size_t litSize = 0; + size_t n; + assert(seqs); + for (n=0; nappliedParams.searchForExternalRepcodes == ZSTD_ps_enable); + assert(cctx->appliedParams.searchForExternalRepcodes != ZSTD_ps_auto); + + DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals_internal: nbSeqs=%zu, litSize=%zu", nbSequences, litSize); + RETURN_ERROR_IF(nbSequences == 0, externalSequences_invalid, "Requires at least 1 end-of-block"); + + /* Special case: empty frame */ + if ((nbSequences == 1) && (inSeqs[0].litLength == 0)) { + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); + RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "No room for empty frame block header"); + MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + cSize += ZSTD_blockHeaderSize; + } + + while (nbSequences) { + size_t compressedSeqsSize, cBlockSize, conversionStatus; + BlockSummary const block = ZSTD_get1BlockSummary(inSeqs, nbSequences); + U32 const lastBlock = (block.nbSequences == nbSequences); + FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); + assert(block.nbSequences <= nbSequences); + RETURN_ERROR_IF(block.litSize > litSize, externalSequences_invalid, "discrepancy: Sequences require more literals than present in buffer"); + ZSTD_resetSeqStore(&cctx->seqStore); + + conversionStatus = ZSTD_convertBlockSequences(cctx, + inSeqs, block.nbSequences, + repcodeResolution); + FORWARD_IF_ERROR(conversionStatus, "Bad sequence conversion"); + inSeqs += block.nbSequences; + nbSequences -= block.nbSequences; + remaining -= block.blockSize; + + /* Note: when blockSize is very small, other variant send it uncompressed. + * Here, we still send the sequences, because we don't have the original source to send it uncompressed. + * One could imagine in theory reproducing the source from the sequences, + * but that's complex and costly memory intensive, and goes against the objectives of this variant. */ + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + + compressedSeqsSize = ZSTD_entropyCompressSeqStore_internal( + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + literals, block.litSize, + &cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); + /* note: the spec forbids for any compressed block to be larger than maximum block size */ + if (compressedSeqsSize > cctx->blockSizeMax) compressedSeqsSize = 0; + DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + litSize -= block.litSize; + literals = (const char*)literals + block.litSize; + + /* Note: difficult to check source for RLE block when only Literals are provided, + * but it could be considered from analyzing the sequence directly */ + + if (compressedSeqsSize == 0) { + /* Sending uncompressed blocks is out of reach, because the source is not provided. + * In theory, one could use the sequences to regenerate the source, like a decompressor, + * but it's complex, and memory hungry, killing the purpose of this variant. + * Current outcome: generate an error code. + */ + RETURN_ERROR(cannotProduce_uncompressedBlock, "ZSTD_compressSequencesAndLiterals cannot generate an uncompressed block"); + } else { + U32 cBlockHeader; + assert(compressedSeqsSize > 1); /* no RLE */ + /* Error checking and repcodes update */ + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); + if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + /* Write block header into beginning of block*/ + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; + DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; + op += cBlockSize; + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + + if (lastBlock) { + assert(nbSequences == 0); + break; + } + } + + RETURN_ERROR_IF(litSize != 0, externalSequences_invalid, "literals must be entirely and exactly consumed"); + RETURN_ERROR_IF(remaining != 0, externalSequences_invalid, "Sequences must represent a total of exactly srcSize=%zu", srcSize); + DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; +} + +size_t +ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* literals, size_t litSize, size_t litCapacity, + size_t decompressedSize) +{ + BYTE* op = (BYTE*)dst; + size_t cSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ + DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + if (litCapacity < litSize) { + RETURN_ERROR(workSpace_tooSmall, "literals buffer is not large enough: must be at least 8 bytes larger than litSize (risk of read out-of-bound)"); + } + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, decompressedSize), "CCtx initialization failed"); + + if (cctx->appliedParams.blockDelimiters == ZSTD_sf_noBlockDelimiters) { + RETURN_ERROR(frameParameter_unsupported, "This mode is only compatible with explicit delimiters"); + } + if (cctx->appliedParams.validateSequences) { + RETURN_ERROR(parameter_unsupported, "This mode is not compatible with Sequence validation"); + } + if (cctx->appliedParams.fParams.checksumFlag) { + RETURN_ERROR(frameParameter_unsupported, "this mode is not compatible with frame checksum"); + } + + /* Begin writing output, starting with frame header */ + { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, + &cctx->appliedParams, decompressedSize, cctx->dictID); + op += frameHeaderSize; + assert(frameHeaderSize <= dstCapacity); + dstCapacity -= frameHeaderSize; + cSize += frameHeaderSize; + } + + /* Now generate compressed blocks */ + { size_t const cBlocksSize = ZSTD_compressSequencesAndLiterals_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + literals, litSize, decompressedSize); + FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); + cSize += cBlocksSize; + assert(cBlocksSize <= dstCapacity); + dstCapacity -= cBlocksSize; + } + + DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; +} + +/*====== Finalize ======*/ + +static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) +{ + const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; + const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + return stableInput ? zcs->expectedInBuffer : nullInput; +} + +/*! ZSTD_flushStream() : + * @return : amount of data remaining to flush */ +size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) +{ + ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); +} + +size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) +{ + ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); + FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; + size_t const checksumSize = (size_t)(zcs->frameEnded ? 0 : zcs->appliedParams.fParams.checksumFlag * 4); + size_t const toFlush = remainingToFlush + lastBlockSize + checksumSize; + DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush); + return toFlush; + } +} + + +/*-===== Pre-defined compression levels =====-*/ +#include "clevels.h" + +int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } +int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } +int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } + +static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize, ZSTD_cpm_createCDict); + switch (cParams.strategy) { + case ZSTD_fast: + case ZSTD_dfast: + break; + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + cParams.hashLog += ZSTD_LAZY_DDSS_BUCKET_LOG; + break; + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + } + return cParams; +} + +static int ZSTD_dedicatedDictSearch_isSupported( + ZSTD_compressionParameters const* cParams) +{ + return (cParams->strategy >= ZSTD_greedy) + && (cParams->strategy <= ZSTD_lazy2) + && (cParams->hashLog > cParams->chainLog) + && (cParams->chainLog <= 24); +} + +/** + * Reverses the adjustment applied to cparams when enabling dedicated dict + * search. This is used to recover the params set to be used in the working + * context. (Otherwise, those tables would also grow.) + */ +static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams) { + switch (cParams->strategy) { + case ZSTD_fast: + case ZSTD_dfast: + break; + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG; + if (cParams->hashLog < ZSTD_HASHLOG_MIN) { + cParams->hashLog = ZSTD_HASHLOG_MIN; + } + break; + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + } +} + +static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) +{ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: + case ZSTD_cpm_createCDict: + break; + case ZSTD_cpm_attachDict: + dictSize = 0; + break; + default: + assert(0); + break; + } + { int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN; + size_t const addedSize = unknown && dictSize > 0 ? 500 : 0; + return unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize; + } +} + +/*! ZSTD_getCParams_internal() : + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. + * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_CParamMode_e`. */ +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) +{ + U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); + int row; + DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel); + + /* row */ + if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ + else if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */ + else if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL; + else row = compressionLevel; + + { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; + DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy); + /* acceleration factor */ + if (compressionLevel < 0) { + int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel); + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ + return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } +} + +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Size values are optional, provide 0 if not known or unused */ +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) +{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); +} + +/*! ZSTD_getParams() : + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +static ZSTD_parameters +ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) +{ + ZSTD_parameters params; + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); + DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); + ZSTD_memset(¶ms, 0, sizeof(params)); + params.cParams = cParams; + params.fParams.contentSizeFlag = 1; + return params; +} + +/*! ZSTD_getParams() : + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) +{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); +} + +void ZSTD_registerSequenceProducer( + ZSTD_CCtx* zc, + void* extSeqProdState, + ZSTD_sequenceProducer_F extSeqProdFunc) +{ + assert(zc != NULL); + ZSTD_CCtxParams_registerSequenceProducer( + &zc->requestedParams, extSeqProdState, extSeqProdFunc + ); +} + +void ZSTD_CCtxParams_registerSequenceProducer( + ZSTD_CCtx_params* params, + void* extSeqProdState, + ZSTD_sequenceProducer_F extSeqProdFunc) +{ + assert(params != NULL); + if (extSeqProdFunc != NULL) { + params->extSeqProdFunc = extSeqProdFunc; + params->extSeqProdState = extSeqProdState; + } else { + params->extSeqProdFunc = NULL; + params->extSeqProdState = NULL; + } +} diff --git a/lib/compress/zstd_compress_sequences.c b/lib/compress/zstd_compress_sequences.c new file mode 100644 index 0000000..7beb9da --- /dev/null +++ b/lib/compress/zstd_compress_sequences.c @@ -0,0 +1,442 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +#include "zstd_compress_sequences.h" + +/** + * -log2(x / 256) lookup table for x in [0, 256). + * If x == 0: Return 0 + * Else: Return floor(-log2(x / 256) * 256) + */ +static unsigned const kInverseProbabilityLog256[256] = { + 0, 2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162, + 1130, 1100, 1073, 1047, 1024, 1001, 980, 960, 941, 923, 906, 889, + 874, 859, 844, 830, 817, 804, 791, 779, 768, 756, 745, 734, + 724, 714, 704, 694, 685, 676, 667, 658, 650, 642, 633, 626, + 618, 610, 603, 595, 588, 581, 574, 567, 561, 554, 548, 542, + 535, 529, 523, 517, 512, 506, 500, 495, 489, 484, 478, 473, + 468, 463, 458, 453, 448, 443, 438, 434, 429, 424, 420, 415, + 411, 407, 402, 398, 394, 390, 386, 382, 377, 373, 370, 366, + 362, 358, 354, 350, 347, 343, 339, 336, 332, 329, 325, 322, + 318, 315, 311, 308, 305, 302, 298, 295, 292, 289, 286, 282, + 279, 276, 273, 270, 267, 264, 261, 258, 256, 253, 250, 247, + 244, 241, 239, 236, 233, 230, 228, 225, 222, 220, 217, 215, + 212, 209, 207, 204, 202, 199, 197, 194, 192, 190, 187, 185, + 182, 180, 178, 175, 173, 171, 168, 166, 164, 162, 159, 157, + 155, 153, 151, 149, 146, 144, 142, 140, 138, 136, 134, 132, + 130, 128, 126, 123, 121, 119, 117, 115, 114, 112, 110, 108, + 106, 104, 102, 100, 98, 96, 94, 93, 91, 89, 87, 85, + 83, 82, 80, 78, 76, 74, 73, 71, 69, 67, 66, 64, + 62, 61, 59, 57, 55, 54, 52, 50, 49, 47, 46, 44, + 42, 41, 39, 37, 36, 34, 33, 31, 30, 28, 26, 25, + 23, 22, 20, 19, 17, 16, 14, 13, 11, 10, 8, 7, + 5, 4, 2, 1, +}; + +static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) { + void const* ptr = ctable; + U16 const* u16ptr = (U16 const*)ptr; + U32 const maxSymbolValue = MEM_read16(u16ptr + 1); + return maxSymbolValue; +} + +/** + * Returns true if we should use ncount=-1 else we should + * use ncount=1 for low probability symbols instead. + */ +static unsigned ZSTD_useLowProbCount(size_t const nbSeq) +{ + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on + * compressibility. + */ + return nbSeq >= 2048; +} + +/** + * Returns the cost in bytes of encoding the normalized count header. + * Returns an error if any of the helper functions return an error. + */ +static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max, + size_t const nbSeq, unsigned const FSELog) +{ + BYTE wksp[FSE_NCOUNTBOUND]; + S16 norm[MaxSeq + 1]; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max, ZSTD_useLowProbCount(nbSeq)), ""); + return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog); +} + +/** + * Returns the cost in bits of encoding the distribution described by count + * using the entropy bound. + */ +static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total) +{ + unsigned cost = 0; + unsigned s; + + assert(total > 0); + for (s = 0; s <= max; ++s) { + unsigned norm = (unsigned)((256 * count[s]) / total); + if (count[s] != 0 && norm == 0) + norm = 1; + assert(count[s] < total); + cost += count[s] * kInverseProbabilityLog256[norm]; + } + return cost >> 8; +} + +/** + * Returns the cost in bits of encoding the distribution in count using ctable. + * Returns an error if ctable cannot represent all the symbols in count. + */ +size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, + unsigned const* count, + unsigned const max) +{ + unsigned const kAccuracyLog = 8; + size_t cost = 0; + unsigned s; + FSE_CState_t cstate; + FSE_initCState(&cstate, ctable); + if (ZSTD_getFSEMaxSymbolValue(ctable) < max) { + DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u", + ZSTD_getFSEMaxSymbolValue(ctable), max); + return ERROR(GENERIC); + } + for (s = 0; s <= max; ++s) { + unsigned const tableLog = cstate.stateLog; + unsigned const badCost = (tableLog + 1) << kAccuracyLog; + unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog); + if (count[s] == 0) + continue; + if (bitCost >= badCost) { + DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s); + return ERROR(GENERIC); + } + cost += (size_t)count[s] * bitCost; + } + return cost >> kAccuracyLog; +} + +/** + * Returns the cost in bits of encoding the distribution in count using the + * table described by norm. The max symbol support by norm is assumed >= max. + * norm must be valid for every symbol with non-zero probability in count. + */ +size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max) +{ + unsigned const shift = 8 - accuracyLog; + size_t cost = 0; + unsigned s; + assert(accuracyLog <= 8); + for (s = 0; s <= max; ++s) { + unsigned const normAcc = (norm[s] != -1) ? (unsigned)norm[s] : 1; + unsigned const norm256 = normAcc << shift; + assert(norm256 > 0); + assert(norm256 < 256); + cost += count[s] * kInverseProbabilityLog256[norm256]; + } + return cost >> 8; +} + +SymbolEncodingType_e +ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, + ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) +{ + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { + /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ + DEBUGLOG(5, "Selected set_basic"); + return set_basic; + } + DEBUGLOG(5, "Selected set_rle"); + return set_rle; + } + if (strategy < ZSTD_lazy) { + if (isDefaultAllowed) { + size_t const staticFse_nbSeq_max = 1000; + size_t const mult = 10 - strategy; + size_t const baseLog = 3; + size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */ + assert(defaultNormLog >= 5 && defaultNormLog <= 6); /* xx_DEFAULTNORMLOG */ + assert(mult <= 9 && mult >= 7); + if ( (*repeatMode == FSE_repeat_valid) + && (nbSeq < staticFse_nbSeq_max) ) { + DEBUGLOG(5, "Selected set_repeat"); + return set_repeat; + } + if ( (nbSeq < dynamicFse_nbSeq_min) + || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) { + DEBUGLOG(5, "Selected set_basic"); + /* The format allows default tables to be repeated, but it isn't useful. + * When using simple heuristics to select encoding type, we don't want + * to confuse these tables with dictionaries. When running more careful + * analysis, we don't need to waste time checking both repeating tables + * and default tables. + */ + *repeatMode = FSE_repeat_none; + return set_basic; + } + } + } else { + size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC); + size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC); + size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog); + size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq); + + if (isDefaultAllowed) { + assert(!ZSTD_isError(basicCost)); + assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost))); + } + assert(!ZSTD_isError(NCountCost)); + assert(compressedCost < ERROR(maxCode)); + DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u", + (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost); + if (basicCost <= repeatCost && basicCost <= compressedCost) { + DEBUGLOG(5, "Selected set_basic"); + assert(isDefaultAllowed); + *repeatMode = FSE_repeat_none; + return set_basic; + } + if (repeatCost <= compressedCost) { + DEBUGLOG(5, "Selected set_repeat"); + assert(!ZSTD_isError(repeatCost)); + return set_repeat; + } + assert(compressedCost < basicCost && compressedCost < repeatCost); + } + DEBUGLOG(5, "Selected set_compressed"); + *repeatMode = FSE_repeat_check; + return set_compressed; +} + +typedef struct { + S16 norm[MaxSeq + 1]; + U32 wksp[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(MaxSeq, MaxFSELog)]; +} ZSTD_BuildCTableWksp; + +size_t +ZSTD_buildCTable(void* dst, size_t dstCapacity, + FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, + const FSE_CTable* prevCTable, size_t prevCTableSize, + void* entropyWorkspace, size_t entropyWorkspaceSize) +{ + BYTE* op = (BYTE*)dst; + const BYTE* const oend = op + dstCapacity; + DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity); + + switch (type) { + case set_rle: + FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), ""); + RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space"); + *op = codeTable[0]; + return 1; + case set_repeat: + ZSTD_memcpy(nextCTable, prevCTable, prevCTableSize); + return 0; + case set_basic: + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */ + return 0; + case set_compressed: { + ZSTD_BuildCTableWksp* wksp = (ZSTD_BuildCTableWksp*)entropyWorkspace; + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + if (count[codeTable[nbSeq-1]] > 1) { + count[codeTable[nbSeq-1]]--; + nbSeq_1--; + } + assert(nbSeq_1 > 1); + assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp)); + (void)entropyWorkspaceSize; + FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); + assert(oend >= op); + { size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog); /* overflow protected */ + FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed"); + return NCountSize; + } + } + default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach"); + } +} + +FORCE_INLINE_TEMPLATE size_t +ZSTD_encodeSequences_body( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + SeqDef const* sequences, size_t nbSeq, int longOffsets) +{ + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; + FSE_CState_t stateOffsetBits; + FSE_CState_t stateLitLength; + + RETURN_ERROR_IF( + ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)), + dstSize_tooSmall, "not enough space remaining"); + DEBUGLOG(6, "available space for bitstream : %i (dstCapacity=%u)", + (int)(blockStream.endPtr - blockStream.startPtr), + (unsigned)dstCapacity); + + /* first symbols */ + FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); + FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); + FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); + BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); + if (MEM_32bits()) BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]); + if (MEM_32bits()) BIT_flushBits(&blockStream); + if (longOffsets) { + U32 const ofBits = ofCodeTable[nbSeq-1]; + unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits); + BIT_flushBits(&blockStream); + } + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits, + ofBits - extraBits); + } else { + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]); + } + BIT_flushBits(&blockStream); + + { size_t n; + for (n=nbSeq-2 ; n= 64-7-(LLFSELog+MLFSELog+OffFSELog))) + BIT_flushBits(&blockStream); /* (7)*/ + BIT_addBits(&blockStream, sequences[n].litLength, llBits); + if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[n].mlBase, mlBits); + if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream); + if (longOffsets) { + unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[n].offBase, extraBits); + BIT_flushBits(&blockStream); /* (7)*/ + } + BIT_addBits(&blockStream, sequences[n].offBase >> extraBits, + ofBits - extraBits); /* 31 */ + } else { + BIT_addBits(&blockStream, sequences[n].offBase, ofBits); /* 31 */ + } + BIT_flushBits(&blockStream); /* (7)*/ + DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr)); + } } + + DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog); + FSE_flushCState(&blockStream, &stateMatchLength); + DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog); + FSE_flushCState(&blockStream, &stateOffsetBits); + DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog); + FSE_flushCState(&blockStream, &stateLitLength); + + { size_t const streamSize = BIT_closeCStream(&blockStream); + RETURN_ERROR_IF(streamSize==0, dstSize_tooSmall, "not enough space"); + return streamSize; + } +} + +static size_t +ZSTD_encodeSequences_default( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + SeqDef const* sequences, size_t nbSeq, int longOffsets) +{ + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} + + +#if DYNAMIC_BMI2 + +static BMI2_TARGET_ATTRIBUTE size_t +ZSTD_encodeSequences_bmi2( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + SeqDef const* sequences, size_t nbSeq, int longOffsets) +{ + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} + +#endif + +size_t ZSTD_encodeSequences( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) +{ + DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); +#if DYNAMIC_BMI2 + if (bmi2) { + return ZSTD_encodeSequences_bmi2(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); + } +#endif + (void)bmi2; + return ZSTD_encodeSequences_default(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c new file mode 100644 index 0000000..1a266e7 --- /dev/null +++ b/lib/compress/zstd_double_fast.c @@ -0,0 +1,778 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "zstd_compress_internal.h" +#include "zstd_double_fast.h" + +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillDoubleHashTableForCDict(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashLarge = ms->hashTable; + U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + U32 const mls = cParams->minMatch; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. + */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) { + ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); + } + if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { + ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); + } + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; + } } +} + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillDoubleHashTableForCCtx(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashLarge = ms->hashTable; + U32 const hBitsL = cParams->hashLog; + U32 const mls = cParams->minMatch; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. + */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) + hashSmall[smHash] = curr + i; + if (i == 0 || hashLarge[lgHash] == 0) + hashLarge[lgHash] = curr + i; + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; + } } +} + +void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + if (tfp == ZSTD_tfp_forCDict) { + ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); + } else { + ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); + } +} + + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mode */ + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixLowest = base + prefixLowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; + U32 curr; + + /* how many positions to search before increasing step size */ + const size_t kStepIncr = 1 << kSearchStrength; + /* the position at which to increment the step size if no match is found */ + const BYTE* nextStep; + size_t step; /* the current step size */ + + size_t hl0; /* the long hash at ip */ + size_t hl1; /* the long hash at ip1 */ + + U32 idxl0; /* the long match index for ip */ + U32 idxl1; /* the long match index for ip1 */ + + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ + const BYTE* matchs0_safe; /* matchs0 or safe address */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ + /* Array of ~random data, should have low probability of matching data + * we load from here instead of from tables, if matchl0/matchl1 are + * invalid indices. Used to avoid unpredictable branches. */ + const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + + /* init */ + ip += ((ip - prefixLowest) == 0); + { + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ + while (1) { + step = 1; + nextStep = ip + kStepIncr; + ip1 = ip + step; + + if (ip1 > ilimit) { + goto _cleanup; + } + + hl0 = ZSTD_hashPtr(ip, hBitsL, 8); + idxl0 = hashLong[hl0]; + matchl0 = base + idxl0; + + /* Inner Loop: one iteration per search / position */ + do { + const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls); + const U32 idxs0 = hashSmall[hs0]; + curr = (U32)(ip-base); + matchs0 = base + idxs0; + + hashLong[hl0] = hashSmall[hs0] = curr; /* update hash tables */ + + /* check noDict repcode */ + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + + /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. + * However expression below complies into conditional move. Since + * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex + * if there is a match, all branches become predictable. */ + { const BYTE* const matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); + + /* check prefix long match */ + if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; + } } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + + /* Same optimization as matchl0 above */ + matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); + + /* check prefix short match */ + if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { + goto _search_next_long; + } + + if (ip1 >= nextStep) { + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + step++; + nextStep += kStepIncr; + } + ip = ip1; + ip1 += step; + + hl0 = hl1; + idxl0 = idxl1; + matchl0 = matchl1; + #if defined(__aarch64__) + PREFETCH_L1(ip+256); + #endif + } while (ip1 <= ilimit); + +_cleanup: + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_search_next_long: + + /* short match found: let's check for a longer one */ + mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; + offset = (U32)(ip - matchs0); + + /* check long match at +1 position */ + if ((idxl1 > prefixLowestIndex) && (MEM_read64(matchl1) == MEM_read64(ip1))) { + size_t const l1len = ZSTD_count(ip1+8, matchl1+8, iend) + 8; + if (l1len > mLength) { + /* use the long match instead */ + ip = ip1; + mLength = l1len; + offset = (U32)(ip-matchl1); + matchs0 = matchl1; + } + } + + while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* complete backward */ + + /* fall-through */ + +_match_found: /* requires ip, offset, mLength */ + offset_2 = offset_1; + offset_1 = offset; + + if (step < 4) { + /* It is unsafe to write this value back to the hashtable when ip1 is + * greater than or equal to the new ip we will have after we're done + * processing this match. Rather than perform that test directly + * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler + * more predictable test. The minmatch even if we take a short match is + * 4 bytes, so as long as step, the distance between ip and ip1 + * (initially) is less than 4, we know ip1 < new ip. */ + hashLong[hl1] = (U32)(ip1 - base); + } + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + +_match_stored: + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } + } + } +} + + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mode */ + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixLowest = base + prefixLowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + + const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; + const U32 dictStartIndex = dms->window.dictLimit; + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); + const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); + + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + + if (ms->prefetchCDictTables) { + size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); + size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); + PREFETCH_AREA(dictHashLong, hashTableBytes); + PREFETCH_AREA(dictHashSmall, chainTableBytes); + } + + /* init */ + ip += (dictAndPrefixLength == 0); + + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); + size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); + size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); + U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; + U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; + int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); + int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; + const BYTE* matchLong = base + matchIndexL; + const BYTE* match = base + matchIndexS; + const U32 repIndex = curr + 1 - offset_1; + const BYTE* repMatch = (repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ + + /* check repcode */ + if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + if ((matchIndexL >= prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + /* check prefix long match */ + mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; + offset = (U32)(ip-matchLong); + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } else if (dictTagsMatchL) { + /* check dictMatchState long match */ + U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + + if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) { + mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8; + offset = (U32)(curr - dictMatchIndexL - dictIndexDelta); + while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */ + goto _match_found; + } } + + if (matchIndexS > prefixLowestIndex) { + /* short match candidate */ + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } + } else if (dictTagsMatchS) { + /* check dictMatchState short match */ + U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + + if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } } + + ip += ((ip-anchor) >> kSearchStrength) + 1; +#if defined(__aarch64__) + PREFETCH_L1(ip+256); +#endif + continue; + +_search_next_long: + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; + U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; + int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + + /* check prefix long +1 match */ + if ((matchIndexL3 >= prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1))) { + mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; + ip++; + offset = (U32)(ip-matchL3); + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } else if (dictTagsMatchL3) { + /* check dict long +1 match */ + U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { + mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8; + ip++; + offset = (U32)(curr + 1 - dictMatchIndexL3 - dictIndexDelta); + while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */ + goto _match_found; + } } } + + /* if no long +1 match, explore the short match we found */ + if (matchIndexS < prefixLowestIndex) { + mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4; + offset = (U32)(curr - matchIndexS); + while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } else { + mLength = ZSTD_count(ip+4, match+4, iend) + 4; + offset = (U32)(ip - match); + while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + +_match_found: + offset_2 = offset_1; + offset_1 = offset; + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + +_match_stored: + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; + if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } + } + } /* while (ip < ilimit) */ + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + +#define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ + } + +ZSTD_GEN_DFAST_FN(noDict, 4) +ZSTD_GEN_DFAST_FN(noDict, 5) +ZSTD_GEN_DFAST_FN(noDict, 6) +ZSTD_GEN_DFAST_FN(noDict, 7) + +ZSTD_GEN_DFAST_FN(dictMatchState, 4) +ZSTD_GEN_DFAST_FN(dictMatchState, 5) +ZSTD_GEN_DFAST_FN(dictMatchState, 6) +ZSTD_GEN_DFAST_FN(dictMatchState, 7) + + +size_t ZSTD_compressBlock_doubleFast( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const U32 mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize); + } +} + + +size_t ZSTD_compressBlock_doubleFast_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const U32 mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize); + } +} + + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + U32 const hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); + const U32 dictStartIndex = lowLimit; + const U32 dictLimit = ms->window.dictLimit; + const U32 prefixStartIndex = (dictLimit > lowLimit) ? dictLimit : lowLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dictBase + prefixStartIndex; + U32 offset_1=rep[0], offset_2=rep[1]; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize); + + /* if extDict is invalidated due to maxDistance, switch to "regular" variant */ + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize); + + /* Search Loop */ + while (ip < ilimit) { /* < instead of <=, because (ip+1) */ + const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls); + const U32 matchIndex = hashSmall[hSmall]; + const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; + const BYTE* match = matchBase + matchIndex; + + const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8); + const U32 matchLongIndex = hashLong[hLong]; + const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base; + const BYTE* matchLong = matchLongBase + matchLongIndex; + + const U32 curr = (U32)(ip-base); + const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */ + const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + size_t mLength; + hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ + + if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart; + U32 offset; + mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8; + offset = curr - matchLongIndex; + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + U32 const matchIndex3 = hashLong[h3]; + const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base; + const BYTE* match3 = match3Base + matchIndex3; + U32 offset; + hashLong[h3] = curr + 1; + if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { + const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8; + ip++; + offset = curr+1 - matchIndex3; + while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ + } else { + const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; + offset = curr - matchIndex; + while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; + continue; + } } + + /* move to next sequence start */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + & (offset_2 <= current2 - dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } } + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + +ZSTD_GEN_DFAST_FN(extDict, 4) +ZSTD_GEN_DFAST_FN(extDict, 5) +ZSTD_GEN_DFAST_FN(extDict, 6) +ZSTD_GEN_DFAST_FN(extDict, 7) + +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } +} + +#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c new file mode 100644 index 0000000..272ebe0 --- /dev/null +++ b/lib/compress/zstd_lazy.c @@ -0,0 +1,2199 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "zstd_compress_internal.h" +#include "zstd_lazy.h" +#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ + +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + +#define kLazySkippingStep 8 + + +/*-************************************* +* Binary Tree search +***************************************/ + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_updateDUBT(ZSTD_MatchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + if (idx != target) + DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)", + idx, target, ms->window.dictLimit); + assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */ + (void)iend; + + assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */ + for ( ; idx < target ; idx++) { + size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */ + U32 const matchIndex = hashTable[h]; + + U32* const nextCandidatePtr = bt + 2*(idx&btMask); + U32* const sortMarkPtr = nextCandidatePtr + 1; + + DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx); + hashTable[h] = idx; /* Update Hash Table */ + *nextCandidatePtr = matchIndex; /* update BT like a chain */ + *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK; + } + ms->nextToUpdate = target; +} + + +/** ZSTD_insertDUBT1() : + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr; + const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = smallerPtr + 1; + U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */ + U32 dummy32; /* to be nullified at the end */ + U32 const windowValid = ms->window.lowLimit; + U32 const maxDistance = 1U << cParams->windowLog; + U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid; + + + DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)", + curr, dictLimit, windowLow); + assert(curr >= btLow); + assert(ip < iend); /* condition for ZSTD_count */ + + for (; nbCompares && (matchIndex > windowLow); --nbCompares) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < curr); + /* note : all candidates are now supposed sorted, + * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK + * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */ + + if ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit) /* both in current segment*/ + || (curr < dictLimit) /* both in extDict */) { + const BYTE* const mBase = ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit)) ? + base : dictBase; + assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */ + || (curr < dictLimit) ); + match = mBase + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* preparation for next read of match[matchLength] */ + } + + DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ", + curr, matchIndex, (U32)matchLength); + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u", + matchIndex, btLow, nextPtr[1]); + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u", + matchIndex, btLow, nextPtr[0]); + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; +} + + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + size_t bestLength, + U32 nbCompares, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_MatchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 dictMatchIndex = dictHashTable[h]; + + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + U32 const curr = (U32)(ip-base); + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictEnd = dms->window.nextSrc; + U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base); + U32 const dictLowLimit = dms->window.lowLimit; + U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit; + + U32* const dictBt = dms->chainTable; + U32 const btLog = dmsCParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask; + + size_t commonLengthSmaller=0, commonLengthLarger=0; + + (void)dictMode; + assert(dictMode == ZSTD_dictMatchState); + + for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) { + U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dictBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (dictMatchIndex+matchLength >= dictHighLimit) + match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } + } + + if (bestLength >= MINMATCH) { + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } + return bestLength; + +} + + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + + const BYTE* const base = ms->window.base; + U32 const curr = (U32)(ip-base); + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog); + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 const btLow = (btMask >= curr) ? 0 : curr - btMask; + U32 const unsortLimit = MAX(btLow, windowLow); + + U32* nextCandidate = bt + 2*(matchIndex&btMask); + U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1; + U32 nbCompares = 1U << cParams->searchLog; + U32 nbCandidates = nbCompares; + U32 previousCandidate = 0; + + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr); + assert(ip <= iend-8); /* required for h calculation */ + assert(dictMode != ZSTD_dedicatedDictSearch); + + /* reach end of unsorted candidates list */ + while ( (matchIndex > unsortLimit) + && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK) + && (nbCandidates > 1) ) { + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted", + matchIndex); + *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */ + previousCandidate = matchIndex; + matchIndex = *nextCandidate; + nextCandidate = bt + 2*(matchIndex&btMask); + unsortedMark = bt + 2*(matchIndex&btMask) + 1; + nbCandidates --; + } + + /* nullify last candidate if it's still unsorted + * simplification, detrimental to compression ratio, beneficial for speed */ + if ( (matchIndex > unsortLimit) + && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) { + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u", + matchIndex); + *nextCandidate = *unsortedMark = 0; + } + + /* batch sort stacked candidates */ + matchIndex = previousCandidate; + while (matchIndex) { /* will end on matchIndex == 0 */ + U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1; + U32 const nextCandidateIdx = *nextCandidateIdxPtr; + ZSTD_insertDUBT1(ms, matchIndex, iend, + nbCandidates, unsortLimit, dictMode); + matchIndex = nextCandidateIdx; + nbCandidates++; + } + + /* find longest match */ + { size_t commonLengthSmaller = 0, commonLengthLarger = 0; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = bt + 2*(curr&btMask) + 1; + U32 matchEndIdx = curr + 8 + 1; + U32 dummy32; /* to be nullified at the end */ + size_t bestLength = 0; + + matchIndex = hashTable[h]; + hashTable[h] = curr; /* Update Hash Table */ + + for (; nbCompares && (matchIndex > windowLow); --nbCompares) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match; + + if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) { + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) + bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any + * further in this loop, make sure we + * skip checking in the dictionary. */ + } + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, + offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +} + + +/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) +{ + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); + return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); +} + +/*********************************** +* Dedicated dict search +***********************************/ + +void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip) +{ + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32* const hashTable = ms->hashTable; + U32* const chainTable = ms->chainTable; + U32 const chainSize = 1 << ms->cParams.chainLog; + U32 idx = ms->nextToUpdate; + U32 const minChain = chainSize < target - idx ? target - chainSize : idx; + U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 const cacheSize = bucketSize - 1; + U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; + U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts; + + /* We know the hashtable is oversized by a factor of `bucketSize`. + * We are going to temporarily pretend `bucketSize == 1`, keeping only a + * single entry. We will use the rest of the space to construct a temporary + * chaintable. + */ + U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; + U32* const tmpHashTable = hashTable; + U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); + U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; + U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx; + U32 hashIdx; + + assert(ms->cParams.chainLog <= 24); + assert(ms->cParams.hashLog > ms->cParams.chainLog); + assert(idx != 0); + assert(tmpMinChain <= minChain); + + /* fill conventional hash table and conventional chain table */ + for ( ; idx < target; idx++) { + U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch); + if (idx >= tmpMinChain) { + tmpChainTable[idx - tmpMinChain] = hashTable[h]; + } + tmpHashTable[h] = idx; + } + + /* sort chains into ddss chain table */ + { + U32 chainPos = 0; + for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) { + U32 count; + U32 countBeyondMinChain = 0; + U32 i = tmpHashTable[hashIdx]; + for (count = 0; i >= tmpMinChain && count < cacheSize; count++) { + /* skip through the chain to the first position that won't be + * in the hash cache bucket */ + if (i < minChain) { + countBeyondMinChain++; + } + i = tmpChainTable[i - tmpMinChain]; + } + if (count == cacheSize) { + for (count = 0; count < chainLimit;) { + if (i < minChain) { + if (!i || ++countBeyondMinChain > cacheSize) { + /* only allow pulling `cacheSize` number of entries + * into the cache or chainTable beyond `minChain`, + * to replace the entries pulled out of the + * chainTable into the cache. This lets us reach + * back further without increasing the total number + * of entries in the chainTable, guaranteeing the + * DDSS chain table will fit into the space + * allocated for the regular one. */ + break; + } + } + chainTable[chainPos++] = i; + count++; + if (i < tmpMinChain) { + break; + } + i = tmpChainTable[i - tmpMinChain]; + } + } else { + count = 0; + } + if (count) { + tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count; + } else { + tmpHashTable[hashIdx] = 0; + } + } + assert(chainPos <= chainSize); /* I believe this is guaranteed... */ + } + + /* move chain pointers into the last entry of each hash bucket */ + for (hashIdx = (1 << hashLog); hashIdx; ) { + U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 const chainPackedPointer = tmpHashTable[hashIdx]; + U32 i; + for (i = 0; i < cacheSize; i++) { + hashTable[bucketIdx + i] = 0; + } + hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer; + } + + /* fill the buckets of the hash table */ + for (idx = ms->nextToUpdate; idx < target; idx++) { + U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch) + << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 i; + /* Shift hash cache down 1. */ + for (i = cacheSize - 1; i; i--) + hashTable[h + i] = hashTable[h + i - 1]; + hashTable[h] = idx; + } + + ms->nextToUpdate = target; +} + +/* Returns the longest match length found in the dedicated dict search structure. + * If none are longer than the argument ml, then ml will be returned. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, + const ZSTD_MatchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { + const U32 ddsLowestIndex = dms->window.dictLimit; + const BYTE* const ddsBase = dms->window.base; + const BYTE* const ddsEnd = dms->window.nextSrc; + const U32 ddsSize = (U32)(ddsEnd - ddsBase); + const U32 ddsIndexDelta = dictLimit - ddsSize; + const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); + const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1; + U32 ddsAttempt; + U32 matchIndex; + + for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { + PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); + } + + { + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 const chainIndex = chainPackedPointer >> 8; + + PREFETCH_L1(&dms->chainTable[chainIndex]); + } + + for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; + match = ddsBase + matchIndex; + + if (!matchIndex) { + return ml; + } + + /* guaranteed by table construction */ + (void)ddsLowestIndex; + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; + } + } + } + + { + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 chainIndex = chainPackedPointer >> 8; + U32 const chainLength = chainPackedPointer & 0xFF; + U32 const chainAttempts = nbAttempts - ddsAttempt; + U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; + U32 chainAttempt; + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { + PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); + } + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->chainTable[chainIndex]; + match = ddsBase + matchIndex; + + /* guaranteed by table construction */ + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } + } + return ml; +} + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_MatchState_t* ms, + const ZSTD_compressionParameters* const cParams, + const BYTE* ip, U32 const mls, U32 const lazySkipping) +{ + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; + U32* const chainTable = ms->chainTable; + const U32 chainMask = (1 << cParams->chainLog) - 1; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + while(idx < target) { /* catch up */ + size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; + /* Stop inserting every position when in the lazy skipping mode. */ + if (lazySkipping) + break; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); +} + +/* inlining is important to hardwire a hot branch (template emulation) */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_HcFindBestMatch( + ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const chainTable = ms->chainTable; + const U32 chainSize = (1 << cParams->chainLog); + const U32 chainMask = chainSize-1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 curr = (U32)(ip-base); + const U32 maxDistance = 1U << cParams->windowLog; + const U32 lowestValid = ms->window.lowLimit; + const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + const U32 isDictionary = (ms->loadedDictEnd != 0); + const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; + const U32 minChain = curr > chainSize ? curr - chainSize : 0; + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + + const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch + ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch + ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + + U32 matchIndex; + + if (dictMode == ZSTD_dedicatedDictSearch) { + const U32* entry = &dms->hashTable[ddsIdx]; + PREFETCH_L1(entry); + } + + /* HC4 match finder */ + matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ + /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ + if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= minChain) break; + matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); + } + + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ + if (dictMode == ZSTD_dedicatedDictSearch) { + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); + } else if (dictMode == ZSTD_dictMatchState) { + const U32* const dmsChainTable = dms->chainTable; + const U32 dmsChainSize = (1 << dms->cParams.chainLog); + const U32 dmsChainMask = dmsChainSize - 1; + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0; + + matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)]; + + for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= dmsMinChain) break; + + matchIndex = dmsChainTable[matchIndex & dmsChainMask]; + } + } + + return ml; +} + +/* ********************************* +* (SIMD) Row-based matchfinder +***********************************/ +/* Constants for row-based hash */ +#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) + +typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */ + +/* ZSTD_VecMask_next(): + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + return ZSTD_countTrailingZeros64(val); +} + +/* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" + * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { + U32 next = (*tagRow-1) & rowMask; + next += (next == 0) ? rowMask : 0; /* skip first position */ + *tagRow = (BYTE)next; + return next; +} + +/* ZSTD_isAligned(): + * Checks that a pointer is aligned to "align" bytes which must be a power of 2. + */ +MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + assert((align & (align - 1)) == 0); + return (((size_t)ptr) & (align - 1)) == 0; +} + +/* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); + /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ + } + PREFETCH_L1(tagTable + relRow); + if (rowLog == 6) { + PREFETCH_L1(tagTable + relRow + 32); + } + assert(rowLog == 4 || rowLog == 5 || rowLog == 6); + assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */ + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */ +} + +/* ZSTD_row_fillHashCache(): + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) +{ + U32 const* const hashTable = ms->hashTable; + BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { + U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; + } + + DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1], + ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], + ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]); +} + +/* ZSTD_row_nextCachedHash(): + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, + BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, + U32 const rowLog, U32 const mls, + U64 const hashSalt) +{ + U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; + cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; + return hash; + } +} + +/* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms, + U32 updateStartIdx, U32 const updateEndIdx, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) +{ + U32* const hashTable = ms->hashTable; + BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { + U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) + : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; + BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + + assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); + tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } +} + +/* ZSTD_row_update_internal(): + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) +{ + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + const U32 kSkipThreshold = 384; + const U32 kMaxMatchStartPositionsToUpdate = 96; + const U32 kMaxMatchEndPositionsToUpdate = 32; + + if (useCache) { + /* Only skip positions when using hash cache, i.e. + * if we are loading a dict, don't skip anything. + * If we decide to skip, then we only update a set number + * of positions at the beginning and end of the match. + */ + if (UNLIKELY(target - idx > kSkipThreshold)) { + U32 const bound = idx + kMaxMatchStartPositionsToUpdate; + ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache); + idx = target - kMaxMatchEndPositionsToUpdate; + ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); + } + } + assert(target >= idx); + ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache); + ms->nextToUpdate = target; +} + +/* ZSTD_row_update(): + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) { + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask = (1u << rowLog) - 1; + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); +} + +/* Returns the mask width of bits group of which will be set to 1. Given not all + * architectures have easy movemask instruction, this helps to iterate over + * groups of bits easier and faster. + */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) +{ + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + (void)rowEntries; +#if defined(ZSTD_ARCH_ARM_NEON) + /* NEON path only works for little endian */ + if (!MEM_isLittleEndian()) { + return 1; + } + if (rowEntries == 16) { + return 4; + } + if (rowEntries == 32) { + return 2; + } + if (rowEntries == 64) { + return 1; + } +#endif + return 1; +} + +#if defined(ZSTD_ARCH_X86_SSE2) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) +{ + const __m128i comparisonMask = _mm_set1_epi8((char)tag); + int matches[4] = {0}; + int i; + assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); + for (i=0; i> chunkSize; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= (chunk * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } else { /* big endian: reverse bits during extraction */ + const size_t msb = xFF ^ (xFF >> 1); + const size_t extractMagic = (msb / 0x1FF) | msb; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } + matches = ~matches; + if (rowEntries == 16) { + return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { + return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { + return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } +#endif +} + +/* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: + * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index. + * - The hash is salted by a value that changes on every context reset, so when the same table is used + * we will avoid collisions that would otherwise slow us down by introducing phantom matches. + * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. + * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can + * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes + * per row). + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. + * - Insert the tag into the equivalent row and position in the tagTable. + */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_RowFindBestMatch( + ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) +{ + U32* const hashTable = ms->hashTable; + BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 curr = (U32)(ip-base); + const U32 maxDistance = 1U << cParams->windowLog; + const U32 lowestValid = ms->window.lowLimit; + const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + const U32 isDictionary = (ms->loadedDictEnd != 0); + const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ + const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); + const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; + U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_MatchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; + U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ + U32 dmsTag = 0; + U32* dmsRow = NULL; + BYTE* dmsTagRow = NULL; + + if (dictMode == ZSTD_dedicatedDictSearch) { + const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; + { /* Prefetch DDS hashtable entry */ + ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG; + PREFETCH_L1(&dms->hashTable[ddsIdx]); + } + ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0; + } + + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; + BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; + dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow); + dmsRow = dmsHashTable + dmsRelRow; + ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); + } + + /* Update the hashTable and tagTable up to (but not including) ip */ + if (!ms->lazySkipping) { + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); + hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); + } else { + /* Stop inserting every position when in the lazy skipping mode. + * The hash cache is also not kept up to date in this mode. + */ + hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + ms->nextToUpdate = curr; + } + ms->hashSaltEntropy += hash; /* collect salt entropy */ + + { /* Get the hash for ip, compute the appropriate row */ + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); + U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ + for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; + if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + PREFETCH_L1(base + matchIndex); + } else { + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; + --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex < curr); + assert(matchIndex >= lowLimit); + + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ + /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ + if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } + } + + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ + if (dictMode == ZSTD_dedicatedDictSearch) { + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); + } else if (dictMode == ZSTD_dictMatchState) { + /* TODO: Measure and potentially add prefetching to DMS */ + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + + { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; + if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; + --nbAttempts; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex >= dmsLowestIndex); + assert(matchIndex < curr); + + { const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + } + + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } + } + } + return ml; +} + + +/** + * Generate search functions templated on (dictMode, mls, rowLog). + * These functions are outlined for code size & compilation time. + * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a + * bunch of constants from the ZSTD_MatchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. + * + * TODO: Move the match re-winding into searchMax. This improves compression + * ratio, and unlocks further simplifications with the next TODO. + * + * TODO: Try moving the repcode search into searchMax. After the re-winding + * and repcode search are in searchMax, there is no more logic in the match + * finder loop that requires knowledge about the dictMode. So we should be + * able to avoid force inlining it, and we can join the extDict loop with + * the single segment loop. It should go in searchMax instead of its own + * function to avoid having multiple virtual function calls per search. + */ + +#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls +#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls +#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog + +#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE + +#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ + ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ + } \ + +#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ + ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ + } \ + +#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ + ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ + return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ + } \ + +#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ + X(dictMode, mls, 4) \ + X(dictMode, mls, 5) \ + X(dictMode, mls, 6) + +#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) + +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ + X(dictMode, 4) \ + X(dictMode, 5) \ + X(dictMode, 6) + +#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \ + X(__VA_ARGS__, noDict) \ + X(__VA_ARGS__, extDict) \ + X(__VA_ARGS__, dictMatchState) \ + X(__VA_ARGS__, dedicatedDictSearch) + +/* Generate row search fns for each combination of (dictMode, mls, rowLog) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) +/* Generate binary Tree search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) +/* Generate hash chain search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) + +typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; + +#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + case rowLog: \ + return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); + +#define ZSTD_SWITCH_MLS(X, dictMode) \ + switch (mls) { \ + ZSTD_FOR_EACH_MLS(X, dictMode) \ + } + +#define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ + case mls: \ + switch (rowLog) { \ + ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ + } \ + ZSTD_UNREACHABLE; \ + break; + +#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ + switch (searchMethod) { \ + case search_hashChain: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ + break; \ + case search_binaryTree: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ + break; \ + case search_rowHash: \ + ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ + break; \ + } \ + ZSTD_UNREACHABLE; + +/** + * Searches for the longest match at @p ip. + * Dispatches to the correct implementation function based on the + * (searchMethod, dictMode, mls, rowLog). We use switch statements + * here instead of using an indirect function call through a function + * pointer because after Spectre and Meltdown mitigations, indirect + * function calls can be very costly, especially in the kernel. + * + * NOTE: dictMode and searchMethod should be templated, so those switch + * statements should be optimized out. Only the mls & rowLog switches + * should be left. + * + * @param ms The match state. + * @param ip The position to search at. + * @param iend The end of the input data. + * @param[out] offsetPtr Stores the match offset into this pointer. + * @param mls The minimum search length, in the range [4, 6]. + * @param rowLog The row log (if applicable), in the range [4, 6]. + * @param searchMethod The search method to use (templated). + * @param dictMode The dictMode (templated). + * + * @returns The length of the longest match found, or < mls if no match is found. + * If a match is found its offset is stored in @p offsetPtr. + */ +FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + ZSTD_MatchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, + U32 const mls, + U32 const rowLog, + searchMethod_e const searchMethod, + ZSTD_dictMode_e const dictMode) +{ + if (dictMode == ZSTD_noDict) { + ZSTD_SWITCH_SEARCH_METHOD(noDict) + } else if (dictMode == ZSTD_extDict) { + ZSTD_SWITCH_SEARCH_METHOD(extDict) + } else if (dictMode == ZSTD_dictMatchState) { + ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) + } else if (dictMode == ZSTD_dedicatedDictSearch) { + ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) + } + ZSTD_UNREACHABLE; + return 0; +} + +/* ******************************* +* Common parser - lazy strategy +*********************************/ + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_lazy_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth, + ZSTD_dictMode_e const dictMode) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; + const BYTE* const base = ms->window.base; + const U32 prefixLowestIndex = ms->window.dictLimit; + const BYTE* const prefixLowest = base + prefixLowestIndex; + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + + U32 offset_1 = rep[0], offset_2 = rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; + const int isDxS = isDMS || isDDS; + const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; + const BYTE* const dictBase = isDxS ? dms->window.base : NULL; + const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; + const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL; + const U32 dictIndexDelta = isDxS ? + prefixLowestIndex - (U32)(dictEnd - dictBase) : + 0; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); + ip += (dictAndPrefixLength == 0); + if (dictMode == ZSTD_noDict) { + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; + if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + } + + /* Reset the lazy skipping state */ + ms->lazySkipping = 0; + + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +#if defined(__GNUC__) && defined(__x86_64__) + /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the + * code alignment is perturbed. To fix the instability align the loop on 32-bytes. + */ + __asm__(".p2align 5"); +#endif + while (ip < ilimit) { + size_t matchLength=0; + size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + + /* check repCode */ + if (isDxS) { + const U32 repIndex = (U32)(ip - base) + 1 - offset_1; + const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch) + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + if (depth==0) goto _storeSequence; + } + } + if ( dictMode == ZSTD_noDict + && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { + matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + if (depth==0) goto _storeSequence; + } + + /* first search (depth 0) */ + { size_t offbaseFound = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) + matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { + size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; + ip += step; + /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. + * In this mode we stop inserting every position into our tables, and only insert + * positions that we search, which is one in step positions. + * The exact cutoff is flexible, I've just chosen a number that is reasonably high, + * so we minimize the compression ratio loss in "normal" scenarios. This mode gets + * triggered once we've gone 2KB without finding any matches. + */ + ms->lazySkipping = step > kLazySkippingStep; + continue; + } + + /* let's try to find a better solution */ + if (depth>=1) + while (ip0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } + { size_t ofbCandidate=999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + + /* let's find an even better one */ + if ((depth==2) && (ip0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } + { size_t ofbCandidate=999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* NOTE: + * Pay attention that `start[-value]` can lead to strange undefined behavior + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ + if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { + while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) + && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { + U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } + offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ +_storeSequence: + { size_t const litLength = (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } + if (ms->lazySkipping) { + /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + ms->lazySkipping = 0; + } + + /* check immediate repcode */ + if (isDxS) { + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex = current2 - offset_2; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase - dictIndexDelta + repIndex : + base + repIndex; + if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; + } + break; + } + } + + if (dictMode == ZSTD_noDict) { + while ( ((ip <= ilimit) & (offset_2>0)) + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} +#endif /* build exclusions */ + + +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_greedy( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_greedy_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); +} +#endif + +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); +} +#endif + +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); +} +#endif + +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; + const BYTE* const base = ms->window.base; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const dictStart = dictBase + ms->window.lowLimit; + const U32 windowLog = ms->cParams.windowLog; + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + + U32 offset_1 = rep[0], offset_2 = rep[1]; + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + + /* Reset the lazy skipping state */ + ms->lazySkipping = 0; + + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +#if defined(__GNUC__) && defined(__x86_64__) + /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the + * code alignment is perturbed. To fix the instability align the loop on 32-bytes. + */ + __asm__(".p2align 5"); +#endif + while (ip < ilimit) { + size_t matchLength=0; + size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + + /* check repCode */ + { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog); + const U32 repIndex = (U32)(curr+1 - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ + if (MEM_read32(ip+1) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4; + if (depth==0) goto _storeSequence; + } } + + /* first search (depth 0) */ + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) + matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { + size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); + ip += step + 1; /* jump faster over incompressible sections */ + /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. + * In this mode we stop inserting every position into our tables, and only insert + * positions that we search, which is one in step positions. + * The exact cutoff is flexible, I've just chosen a number that is reasonably high, + * so we minimize the compression ratio loss in "normal" scenarios. This mode gets + * triggered once we've gone 2KB without finding any matches. + */ + ms->lazySkipping = step > kLazySkippingStep; + continue; + } + + /* let's try to find a better solution */ + if (depth>=1) + while (ip repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) + matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + + /* let's find an even better one */ + if ((depth==2) && (ip repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) + matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ + if (OFFBASE_IS_OFFSET(offBase)) { + U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ +_storeSequence: + { size_t const litLength = (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } + if (ms->lazySkipping) { + /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + ms->lazySkipping = 0; + } + + /* check immediate repcode */ + while (ip <= ilimit) { + const U32 repCurrent = (U32)(ip-base); + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog); + const U32 repIndex = repCurrent - offset_2; + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } + break; + } } + + /* Save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} +#endif /* build exclusions */ + +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); +} + +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); +} +#endif + +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); +} + +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); +} +#endif + +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); +} + +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); +} +#endif diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c new file mode 100644 index 0000000..070551c --- /dev/null +++ b/lib/compress/zstd_ldm.c @@ -0,0 +1,745 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "zstd_ldm.h" + +#include "../common/debug.h" +#include "../common/xxhash.h" +#include "zstd_fast.h" /* ZSTD_fillHashTable() */ +#include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ +#include "zstd_ldm_geartab.h" + +#define LDM_BUCKET_SIZE_LOG 4 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_RLOG 7 + +typedef struct { + U64 rolling; + U64 stopMask; +} ldmRollingHashState_t; + +/** ZSTD_ldm_gear_init(): + * + * Initializes the rolling hash state such that it will honor the + * settings in params. */ +static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const* params) +{ + unsigned maxBitsInMask = MIN(params->minMatchLength, 64); + unsigned hashRateLog = params->hashRateLog; + + state->rolling = ~(U32)0; + + /* The choice of the splitting criterion is subject to two conditions: + * 1. it has to trigger on average every 2^(hashRateLog) bytes; + * 2. ideally, it has to depend on a window of minMatchLength bytes. + * + * In the gear hash algorithm, bit n depends on the last n bytes; + * so in order to obtain a good quality splitting criterion it is + * preferable to use bits with high weight. + * + * To match condition 1 we use a mask with hashRateLog bits set + * and, because of the previous remark, we make sure these bits + * have the highest possible weight while still respecting + * condition 2. + */ + if (hashRateLog > 0 && hashRateLog <= maxBitsInMask) { + state->stopMask = (((U64)1 << hashRateLog) - 1) << (maxBitsInMask - hashRateLog); + } else { + /* In this degenerate case we simply honor the hash rate. */ + state->stopMask = ((U64)1 << hashRateLog) - 1; + } +} + +/** ZSTD_ldm_gear_reset() + * Feeds [data, data + minMatchLength) into the hash without registering any + * splits. This effectively resets the hash state. This is used when skipping + * over data, either at the beginning of a block, or skipping sections. + */ +static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, + BYTE const* data, size_t minMatchLength) +{ + U64 hash = state->rolling; + size_t n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + } while (0) + while (n + 3 < minMatchLength) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < minMatchLength) { + GEAR_ITER_ONCE(); + } +#undef GEAR_ITER_ONCE +} + +/** ZSTD_ldm_gear_feed(): + * + * Registers in the splits array all the split points found in the first + * size bytes following the data pointer. This function terminates when + * either all the data has been processed or LDM_BATCH_SIZE splits are + * present in the splits array. + * + * Precondition: The splits array must not be full. + * Returns: The number of bytes processed. */ +static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, + BYTE const* data, size_t size, + size_t* splits, unsigned* numSplits) +{ + size_t n; + U64 hash, mask; + + hash = state->rolling; + mask = state->stopMask; + n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + if (UNLIKELY((hash & mask) == 0)) { \ + splits[*numSplits] = n; \ + *numSplits += 1; \ + if (*numSplits == LDM_BATCH_SIZE) \ + goto done; \ + } \ + } while (0) + + while (n + 3 < size) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < size) { + GEAR_ITER_ONCE(); + } + +#undef GEAR_ITER_ONCE + +done: + state->rolling = hash; + return n; +} + +void ZSTD_ldm_adjustParameters(ldmParams_t* params, + const ZSTD_compressionParameters* cParams) +{ + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); + if (params->hashRateLog == 0) { + if (params->hashLog > 0) { + /* if params->hashLog is set, derive hashRateLog from it */ + assert(params->hashLog <= ZSTD_HASHLOG_MAX); + if (params->windowLog > params->hashLog) { + params->hashRateLog = params->windowLog - params->hashLog; + } + } else { + assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); + /* mapping from [fast, rate7] to [btultra2, rate4] */ + params->hashRateLog = 7 - (cParams->strategy/3); + } + } + if (params->hashLog == 0) { + params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX); + } + if (params->minMatchLength == 0) { + params->minMatchLength = LDM_MIN_MATCH_LENGTH; + if (cParams->strategy >= ZSTD_btultra) + params->minMatchLength /= 2; + } + if (params->bucketSizeLog==0) { + assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); + params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX); + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); +} + +size_t ZSTD_ldm_getTableSize(ldmParams_t params) +{ + size_t const ldmHSize = ((size_t)1) << params.hashLog; + size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); + size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); + size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); + return params.enableLdm == ZSTD_ps_enable ? totalSize : 0; +} + +size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) +{ + return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0; +} + +/** ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. */ +static ldmEntry_t* ZSTD_ldm_getBucket( + const ldmState_t* ldmState, size_t hash, U32 const bucketSizeLog) +{ + return ldmState->hashTable + (hash << bucketSizeLog); +} + +/** ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ +static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, + U32 const bucketSizeLog) +{ + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + + *(ZSTD_ldm_getBucket(ldmState, hash, bucketSizeLog) + offset) = entry; + *pOffset = (BYTE)((offset + 1) & ((1u << bucketSizeLog) - 1)); + +} + +/** ZSTD_ldm_countBackwardsMatch() : + * Returns the number of bytes that match backwards before pIn and pMatch. + * + * We count only bytes where pMatch >= pBase and pIn >= pAnchor. */ +static size_t ZSTD_ldm_countBackwardsMatch( + const BYTE* pIn, const BYTE* pAnchor, + const BYTE* pMatch, const BYTE* pMatchBase) +{ + size_t matchLength = 0; + while (pIn > pAnchor && pMatch > pMatchBase && pIn[-1] == pMatch[-1]) { + pIn--; + pMatch--; + matchLength++; + } + return matchLength; +} + +/** ZSTD_ldm_countBackwardsMatch_2segments() : + * Returns the number of bytes that match backwards from pMatch, + * even with the backwards match spanning 2 different segments. + * + * On reaching `pMatchBase`, start counting from mEnd */ +static size_t ZSTD_ldm_countBackwardsMatch_2segments( + const BYTE* pIn, const BYTE* pAnchor, + const BYTE* pMatch, const BYTE* pMatchBase, + const BYTE* pExtDictStart, const BYTE* pExtDictEnd) +{ + size_t matchLength = ZSTD_ldm_countBackwardsMatch(pIn, pAnchor, pMatch, pMatchBase); + if (pMatch - matchLength != pMatchBase || pMatchBase == pExtDictStart) { + /* If backwards match is entirely in the extDict or prefix, immediately return */ + return matchLength; + } + DEBUGLOG(7, "ZSTD_ldm_countBackwardsMatch_2segments: found 2-parts backwards match (length in prefix==%zu)", matchLength); + matchLength += ZSTD_ldm_countBackwardsMatch(pIn - matchLength, pAnchor, pExtDictEnd, pExtDictStart); + DEBUGLOG(7, "final backwards match length = %zu", matchLength); + return matchLength; +} + +/** ZSTD_ldm_fillFastTables() : + * + * Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies. + * This is similar to ZSTD_loadDictionaryContent. + * + * The tables for the other strategies are filled within their + * block compressors. */ +static size_t ZSTD_ldm_fillFastTables(ZSTD_MatchState_t* ms, + void const* end) +{ + const BYTE* const iend = (const BYTE*)end; + + switch(ms->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + default: + assert(0); /* not possible : not a valid strategy id */ + } + + return 0; +} + +void ZSTD_ldm_fillHashTable( + ldmState_t* ldmState, const BYTE* ip, + const BYTE* iend, ldmParams_t const* params) +{ + U32 const minMatchLength = params->minMatchLength; + U32 const bucketSizeLog = params->bucketSizeLog; + U32 const hBits = params->hashLog - bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; + size_t* const splits = ldmState->splitIndices; + unsigned numSplits; + + DEBUGLOG(5, "ZSTD_ldm_fillHashTable"); + + ZSTD_ldm_gear_init(&hashState, params); + while (ip < iend) { + size_t hashed; + unsigned n; + + numSplits = 0; + hashed = ZSTD_ldm_gear_feed(&hashState, ip, (size_t)(iend - ip), splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { + BYTE const* const split = ip + splits[n] - minMatchLength; + U64 const xxhash = XXH64(split, minMatchLength, 0); + U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); + ldmEntry_t entry; + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); + ZSTD_ldm_insertEntry(ldmState, hash, entry, params->bucketSizeLog); + } + } + + ip += hashed; + } +} + + +/** ZSTD_ldm_limitTableUpdate() : + * + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). */ +static void ZSTD_ldm_limitTableUpdate(ZSTD_MatchState_t* ms, const BYTE* anchor) +{ + U32 const curr = (U32)(anchor - ms->window.base); + if (curr > ms->nextToUpdate + 1024) { + ms->nextToUpdate = + curr - MIN(512, curr - ms->nextToUpdate - 1024); + } +} + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, RawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + /* LDM parameters */ + int const extDict = ZSTD_window_hasExtDict(ldmState->window); + U32 const minMatchLength = params->minMatchLength; + U32 const entsPerBucket = 1U << params->bucketSizeLog; + U32 const hBits = params->hashLog - params->bucketSizeLog; + /* Prefix and extDict parameters */ + U32 const dictLimit = ldmState->window.dictLimit; + U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit; + BYTE const* const base = ldmState->window.base; + BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL; + BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL; + BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL; + BYTE const* const lowPrefixPtr = base + dictLimit; + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + BYTE const* const ilimit = iend - HASH_READ_SIZE; + /* Input positions */ + BYTE const* anchor = istart; + BYTE const* ip = istart; + /* Rolling hash state */ + ldmRollingHashState_t hashState; + /* Arrays for staged-processing */ + size_t* const splits = ldmState->splitIndices; + ldmMatchCandidate_t* const candidates = ldmState->matchCandidates; + unsigned numSplits; + + if (srcSize < minMatchLength) + return iend - anchor; + + /* Initialize the rolling hash state with the first minMatchLength bytes */ + ZSTD_ldm_gear_init(&hashState, params); + ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); + ip += minMatchLength; + + while (ip < ilimit) { + size_t hashed; + unsigned n; + + numSplits = 0; + hashed = ZSTD_ldm_gear_feed(&hashState, ip, ilimit - ip, + splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + BYTE const* const split = ip + splits[n] - minMatchLength; + U64 const xxhash = XXH64(split, minMatchLength, 0); + U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); + + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); + candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, params->bucketSizeLog); + PREFETCH_L1(candidates[n].bucket); + } + + for (n = 0; n < numSplits; n++) { + size_t forwardMatchLength = 0, backwardMatchLength = 0, + bestMatchLength = 0, mLength; + U32 offset; + BYTE const* const split = candidates[n].split; + U32 const checksum = candidates[n].checksum; + U32 const hash = candidates[n].hash; + ldmEntry_t* const bucket = candidates[n].bucket; + ldmEntry_t const* cur; + ldmEntry_t const* bestEntry = NULL; + ldmEntry_t newEntry; + + newEntry.offset = (U32)(split - base); + newEntry.checksum = checksum; + + /* If a split point would generate a sequence overlapping with + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + + for (cur = bucket; cur < bucket + entsPerBucket; cur++) { + size_t curForwardMatchLength, curBackwardMatchLength, + curTotalMatchLength; + if (cur->checksum != checksum || cur->offset <= lowestIndex) { + continue; + } + if (extDict) { + BYTE const* const curMatchBase = + cur->offset < dictLimit ? dictBase : base; + BYTE const* const pMatch = curMatchBase + cur->offset; + BYTE const* const matchEnd = + cur->offset < dictLimit ? dictEnd : iend; + BYTE const* const lowMatchPtr = + cur->offset < dictLimit ? dictStart : lowPrefixPtr; + curForwardMatchLength = + ZSTD_count_2segments(split, pMatch, iend, matchEnd, lowPrefixPtr); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch_2segments( + split, anchor, pMatch, lowMatchPtr, dictStart, dictEnd); + } else { /* !extDict */ + BYTE const* const pMatch = base + cur->offset; + curForwardMatchLength = ZSTD_count(split, pMatch, iend); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = + ZSTD_ldm_countBackwardsMatch(split, anchor, pMatch, lowPrefixPtr); + } + curTotalMatchLength = curForwardMatchLength + curBackwardMatchLength; + + if (curTotalMatchLength > bestMatchLength) { + bestMatchLength = curTotalMatchLength; + forwardMatchLength = curForwardMatchLength; + backwardMatchLength = curBackwardMatchLength; + bestEntry = cur; + } + } + + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + + /* Match found */ + offset = (U32)(split - base) - bestEntry->offset; + mLength = forwardMatchLength + backwardMatchLength; + { + rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; + + /* Out of sequence storage */ + if (rawSeqStore->size == rawSeqStore->capacity) + return ERROR(dstSize_tooSmall); + seq->litLength = (U32)(split - backwardMatchLength - anchor); + seq->matchLength = (U32)mLength; + seq->offset = offset; + rawSeqStore->size++; + } + + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + + anchor = split + forwardMatchLength; + + /* If we find a match that ends after the data that we've hashed + * then we have a repeating, overlapping, pattern. E.g. all zeros. + * If one repetition of the pattern matches our `stopMask` then all + * repetitions will. We don't need to insert them all into out table, + * only the first one. So skip over overlapping matches. + * This is a major speed boost (20x) for compressing a single byte + * repeated, when that byte ends up in the table. + */ + if (anchor > ip + hashed) { + ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength); + /* Continue the outer loop at anchor (ip + hashed == anchor). */ + ip = anchor - hashed; + break; + } + } + + ip += hashed; + } + + return iend - anchor; +} + +/*! ZSTD_ldm_reduceTable() : + * reduce table indexes by `reducerValue` */ +static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + U32 const reducerValue) +{ + U32 u; + for (u = 0; u < size; u++) { + if (table[u].offset < reducerValue) table[u].offset = 0; + else table[u].offset -= reducerValue; + } +} + +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldmState, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + U32 const maxDist = 1U << params->windowLog; + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + size_t const kMaxChunkSize = 1 << 20; + size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0); + size_t chunk; + size_t leftoverSize = 0; + + assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize); + /* Check that ZSTD_window_update() has been called for this chunk prior + * to passing it to this function. + */ + assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize); + /* The input could be very large (in zstdmt), so it must be broken up into + * chunks to enforce the maximum distance and handle overflow correction. + */ + assert(sequences->pos <= sequences->size); + assert(sequences->size <= sequences->capacity); + for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) { + BYTE const* const chunkStart = istart + chunk * kMaxChunkSize; + size_t const remaining = (size_t)(iend - chunkStart); + BYTE const *const chunkEnd = + (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize; + size_t const chunkSize = chunkEnd - chunkStart; + size_t newLeftoverSize; + size_t const prevSize = sequences->size; + + assert(chunkStart < iend); + /* 1. Perform overflow correction if necessary. */ + if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { + U32 const ldmHSize = 1U << params->hashLog; + U32 const correction = ZSTD_window_correctOverflow( + &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); + ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction); + /* invalidate dictionaries on overflow correction */ + ldmState->loadedDictEnd = 0; + } + /* 2. We enforce the maximum offset allowed. + * + * kMaxChunkSize should be small enough that we don't lose too much of + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the + * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may + * be split into two sequences. This condition holds when using + * ZSTD_window_enforceMaxDist(), but if we move to checking offsets + * against maxDist directly, we'll have to carefully handle that case. + */ + ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL); + /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ + newLeftoverSize = ZSTD_ldm_generateSequences_internal( + ldmState, sequences, params, chunkStart, chunkSize); + if (ZSTD_isError(newLeftoverSize)) + return newLeftoverSize; + /* 4. We add the leftover literals from previous iterations to the first + * newly generated sequence, or add the `newLeftoverSize` if none are + * generated. + */ + /* Prepend the leftover literals from the last call */ + if (prevSize < sequences->size) { + sequences->seq[prevSize].litLength += (U32)leftoverSize; + leftoverSize = newLeftoverSize; + } else { + assert(newLeftoverSize == chunkSize); + leftoverSize += chunkSize; + } + } + return 0; +} + +void +ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) +{ + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; + if (srcSize <= seq->litLength) { + /* Skip past srcSize literals */ + seq->litLength -= (U32)srcSize; + return; + } + srcSize -= seq->litLength; + seq->litLength = 0; + if (srcSize < seq->matchLength) { + /* Skip past the first srcSize of the match */ + seq->matchLength -= (U32)srcSize; + if (seq->matchLength < minMatch) { + /* The match is too short, omit it */ + if (rawSeqStore->pos + 1 < rawSeqStore->size) { + seq[1].litLength += seq[0].matchLength; + } + rawSeqStore->pos++; + } + return; + } + srcSize -= seq->matchLength; + seq->matchLength = 0; + rawSeqStore->pos++; + } +} + +/** + * If the sequence length is longer than remaining then the sequence is split + * between this block and the next. + * + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. + */ +static rawSeq maybeSplitSequence(RawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) +{ + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; + assert(sequence.offset > 0); + /* Likely: No partial sequence */ + if (remaining >= sequence.litLength + sequence.matchLength) { + rawSeqStore->pos++; + return sequence; + } + /* Cut the sequence short (offset == 0 ==> rest is literals). */ + if (remaining <= sequence.litLength) { + sequence.offset = 0; + } else if (remaining < sequence.litLength + sequence.matchLength) { + sequence.matchLength = remaining - sequence.litLength; + if (sequence.matchLength < minMatch) { + sequence.offset = 0; + } + } + /* Skip past `remaining` bytes for the future sequences. */ + ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch); + return sequence; +} + +void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; + if (currPos >= currSeq.litLength + currSeq.matchLength) { + currPos -= currSeq.litLength + currSeq.matchLength; + rawSeqStore->pos++; + } else { + rawSeqStore->posInSequence = currPos; + break; + } + } + if (currPos == 0 || rawSeqStore->pos == rawSeqStore->size) { + rawSeqStore->posInSequence = 0; + } +} + +size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; + ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + /* Input positions */ + BYTE const* ip = istart; + + DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize); + /* If using opt parser, use LDMs only as candidates rather than always accepting them */ + if (cParams->strategy >= ZSTD_btopt) { + size_t lastLLSize; + ms->ldmSeqStore = rawSeqStore; + lastLLSize = blockCompressor(ms, seqStore, rep, src, srcSize); + ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore, srcSize); + return lastLLSize; + } + + assert(rawSeqStore->pos <= rawSeqStore->size); + assert(rawSeqStore->size <= rawSeqStore->capacity); + /* Loop through each sequence and apply the block compressor to the literals */ + while (rawSeqStore->pos < rawSeqStore->size && ip < iend) { + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); + /* End signal */ + if (sequence.offset == 0) + break; + + assert(ip + sequence.litLength + sequence.matchLength <= iend); + + /* Fill tables for block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, ip); + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { + int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; + /* Update the repcodes */ + for (i = ZSTD_REP_NUM - 1; i > 0; i--) + rep[i] = rep[i-1]; + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, + OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } + } + /* Fill the tables for the block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, ip); + /* Compress the last literals */ + return blockCompressor(ms, seqStore, rep, ip, iend - ip); +} diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c new file mode 100644 index 0000000..3d7171b --- /dev/null +++ b/lib/compress/zstd_opt.c @@ -0,0 +1,1580 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "zstd_compress_internal.h" +#include "hist.h" +#include "zstd_opt.h" + +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + +#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ +#define ZSTD_MAX_PRICE (1<<30) + +#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + +/*-************************************* +* Price functions for optimal parser +***************************************/ + +#if 0 /* approximation at bit level (for tests) */ +# define BITCOST_ACCURACY 0 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) +#elif 0 /* fractional bit accuracy (for tests) */ +# define BITCOST_ACCURACY 8 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) +#else /* opt==approx, ultra==accurate */ +# define BITCOST_ACCURACY 8 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) +#endif + +/* ZSTD_bitWeight() : + * provide estimated "cost" of a stat in full bits only */ +MEM_STATIC U32 ZSTD_bitWeight(U32 stat) +{ + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); +} + +/* ZSTD_fracWeight() : + * provide fractional-bit "cost" of a stat, + * using linear interpolation approximation */ +MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) +{ + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; + /* Fweight was meant for "Fractional weight" + * but it's effectively a value between 1 and 2 + * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); + return weight; +} + +#if (DEBUGLEVEL>=2) +/* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +MEM_STATIC double ZSTD_fCost(int price) +{ + return (double)price / (BITCOST_MULTIPLIER*8); +} +#endif + +static int ZSTD_compressedLiterals(optState_t const* const optPtr) +{ + return optPtr->literalCompressionMode != ZSTD_ps_disable; +} + +static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) +{ + if (ZSTD_compressedLiterals(optPtr)) + optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel); + optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel); + optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel); + optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel); +} + + +static U32 sum_u32(const unsigned table[], size_t nbElts) +{ + size_t n; + U32 total = 0; + for (n=0; n0); + unsigned const newStat = base + (table[s] >> shift); + sum += newStat; + table[s] = newStat; + } + return sum; +} + +/* ZSTD_scaleStats() : + * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ +static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) +{ + U32 const prevsum = sum_u32(table, lastEltIndex+1); + U32 const factor = prevsum >> logTarget; + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; + return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); +} + +/* ZSTD_rescaleFreqs() : + * if first block (detected by optPtr->litLengthSum == 0) : init statistics + * take hints from dictionary if there is one + * and init from zero if there is none, + * using src for literals stats, and baseline stats for sequence symbols + * otherwise downscale existing stats, to be used as seed for next block. + */ +static void +ZSTD_rescaleFreqs(optState_t* const optPtr, + const BYTE* const src, size_t const srcSize, + int const optLevel) +{ + int const compressedLiterals = ZSTD_compressedLiterals(optPtr); + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + + if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ + + /* heuristic: use pre-defined stats for too small inputs */ + if (srcSize <= ZSTD_PREDEF_THRESHOLD) { + DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { + + /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { + /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; + for (lit=0; lit<=MaxLit; lit++) { + U32 const scaleLog = 11; /* scale to 2K */ + U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit); + assert(bitCost <= scaleLog); + optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litSum += optPtr->litFreq[lit]; + } } + + { unsigned ll; + FSE_CState_t llstate; + FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable); + optPtr->litLengthSum = 0; + for (ll=0; ll<=MaxLL; ll++) { + U32 const scaleLog = 10; /* scale to 1K */ + U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll); + assert(bitCost < scaleLog); + optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litLengthSum += optPtr->litLengthFreq[ll]; + } } + + { unsigned ml; + FSE_CState_t mlstate; + FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable); + optPtr->matchLengthSum = 0; + for (ml=0; ml<=MaxML; ml++) { + U32 const scaleLog = 10; + U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml); + assert(bitCost < scaleLog); + optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->matchLengthSum += optPtr->matchLengthFreq[ml]; + } } + + { unsigned of; + FSE_CState_t ofstate; + FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable); + optPtr->offCodeSum = 0; + for (of=0; of<=MaxOff; of++) { + U32 const scaleLog = 10; + U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of); + assert(bitCost < scaleLog); + optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + + } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { + /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ + optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { + 4, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs)); + optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1); + } + + { unsigned ml; + for (ml=0; ml<=MaxML; ml++) + optPtr->matchLengthFreq[ml] = 1; + } + optPtr->matchLengthSum = MaxML+1; + + { unsigned const baseOFCfreqs[MaxOff+1] = { + 6, 2, 1, 1, 2, 3, 4, 4, + 4, 3, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs)); + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + + } + + } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); + optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11); + optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11); + optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11); + } + + ZSTD_setBasePrices(optPtr, optLevel); +} + +/* ZSTD_rawLiteralsCost() : + * price of literals (only) in specified segment (which length can be 0). + * does not include price of literalLength symbol */ +static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) +{ + DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) + return (litLength << 3) * BITCOST_MULTIPLIER; /* Uncompressed - 8 bytes per literal. */ + + if (optPtr->priceType == zop_predef) + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ + { U32 price = optPtr->litSumBasePrice * litLength; + U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; + assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { + U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); + if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; + price -= litPrice; + } + return price; + } +} + +/* ZSTD_litLengthPrice() : + * cost of literalLength symbol */ +static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) +{ + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); + + /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX + * because it isn't representable in the zstd format. + * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. + * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); + + /* dynamic statistics */ + { U32 const llCode = ZSTD_LLcode(litLength); + return (LL_bits[llCode] * BITCOST_MULTIPLIER) + + optPtr->litLengthSumBasePrice + - WEIGHT(optPtr->litLengthFreq[llCode], optLevel); + } +} + +/* ZSTD_getMatchPrice() : + * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. + * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) +{ + U32 price; + U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + + if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ + return WEIGHT(mlBase, optLevel) + + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); + if ((optLevel<2) /*static*/ && offCode >= 20) + price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */ + + /* match Length */ + { U32 const mlCode = ZSTD_MLcode(mlBase); + price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel)); + } + + price += BITCOST_MULTIPLIER / 5; /* heuristic : make matches a bit more costly to favor less sequences -> faster decompression speed */ + + DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price); + return price; +} + +/* ZSTD_updateStats() : + * assumption : literals + litLength <= iend */ +static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, + U32 offBase, U32 matchLength) +{ + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { + U32 u; + for (u=0; u < litLength; u++) + optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD; + optPtr->litSum += litLength*ZSTD_LITFREQ_ADD; + } + + /* literal Length */ + { U32 const llCode = ZSTD_LLcode(litLength); + optPtr->litLengthFreq[llCode]++; + optPtr->litLengthSum++; + } + + /* offset code : follows storeSeq() numeric representation */ + { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; + } + + /* match Length */ + { U32 const mlBase = matchLength - MINMATCH; + U32 const mlCode = ZSTD_MLcode(mlBase); + optPtr->matchLengthFreq[mlCode]++; + optPtr->matchLengthSum++; + } +} + + +/* ZSTD_readMINMATCH() : + * function safe only for comparisons + * assumption : memPtr must be at least 4 bytes before end of buffer */ +MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) +{ + switch (length) + { + default : + case 4 : return MEM_read32(memPtr); + case 3 : if (MEM_isLittleEndian()) + return MEM_read32(memPtr)<<8; + else + return MEM_read32(memPtr)>>8; + } +} + + +/* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip) +{ + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; + const BYTE* const base = ms->window.base; + U32 idx = *nextToUpdate3; + U32 const target = (U32)(ip - base); + size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3); + assert(hashLog3 > 0); + + while(idx < target) { + hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx; + idx++; + } + + *nextToUpdate3 = target; + return hashTable3[hash3]; +} + + +/*-************************************* +* Binary Tree search +***************************************/ +/** ZSTD_insertBt1() : add one or multiple positions to tree. + * @param ip assumed <= iend-8 . + * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertBt1( + const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, + U32 const mls, const int extDict) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 matchIndex = hashTable[h]; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + const U32 curr = (U32)(ip-base); + const U32 btLow = btMask >= curr ? 0 : curr - btMask; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = smallerPtr + 1; + U32 dummy32; /* to be nullified at the end */ + /* windowLow is based on target because + * we only need positions that will be in the window at the end of the tree update. + */ + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog); + U32 matchEndIdx = curr+8+1; + size_t bestLength = 8; + U32 nbCompares = 1U << cParams->searchLog; +#ifdef ZSTD_C_PREDICT + U32 predictedSmall = *(bt + 2*((curr-1)&btMask) + 0); + U32 predictedLarge = *(bt + 2*((curr-1)&btMask) + 1); + predictedSmall += (predictedSmall>0); + predictedLarge += (predictedLarge>0); +#endif /* ZSTD_C_PREDICT */ + + DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); + + assert(curr <= target); + assert(ip <= iend-8); /* required for h calculation */ + hashTable[h] = curr; /* Update Hash Table */ + + assert(windowLow > 0); + for (; nbCompares && (matchIndex >= windowLow); --nbCompares) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < curr); + +#ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */ + const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */ + if (matchIndex == predictedSmall) { + /* no need to check length, result known */ + *smallerPtr = matchIndex; + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + predictedSmall = predictPtr[1] + (predictPtr[1]>0); + continue; + } + if (matchIndex == predictedLarge) { + *largerPtr = matchIndex; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + predictedLarge = predictPtr[0] + (predictPtr[0]>0); + continue; + } +#endif + + if (!extDict || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* might be wrong if actually extDict */ + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + bestLength = matchLength; + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + } + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + { U32 positions = 0; + if (bestLength > 384) positions = MIN(192, (U32)(bestLength - 384)); /* speed optimization */ + assert(matchEndIdx > curr + 8); + return MAX(positions, matchEndIdx - (curr + 8)); + } +} + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_updateTree_internal( + ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) +{ + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { + U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict); + assert(idx < (U32)(idx + forward)); + idx += forward; + } + assert((size_t)(ip - base) <= (size_t)(U32)(-1)); + assert((size_t)(iend - base) <= (size_t)(U32)(-1)); + ms->nextToUpdate = target; +} + +void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); +} + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 +ZSTD_insertBtAndGetAllMatches ( + ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ + ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip, const BYTE* const iLimit, + const ZSTD_dictMode_e dictMode, + const U32 rep[ZSTD_REP_NUM], + const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ + const U32 lengthToBeat, + const U32 mls /* template */) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + const BYTE* const base = ms->window.base; + U32 const curr = (U32)(ip-base); + U32 const hashLog = cParams->hashLog; + U32 const minMatch = (mls==3) ? 3 : 4; + U32* const hashTable = ms->hashTable; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask= (1U << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const dictBase = ms->window.dictBase; + U32 const dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32 const btLow = (btMask >= curr) ? 0 : curr - btMask; + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog); + U32 const matchLow = windowLow ? windowLow : 1; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = bt + 2*(curr&btMask) + 1; + U32 matchEndIdx = curr+8+1; /* farthest referenced position of any match => detects repetitive patterns */ + U32 dummy32; /* to be nullified at the end */ + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + + const ZSTD_MatchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL; + const BYTE* const dmsEnd = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL; + U32 const dmsHighLimit = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0; + U32 const dmsLowLimit = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0; + U32 const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0; + U32 const dmsHashLog = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog; + U32 const dmsBtLog = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog; + U32 const dmsBtMask = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0; + U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit; + + size_t bestLength = lengthToBeat-1; + DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", curr); + + /* check repCode */ + assert(ll0 <= 1); /* necessarily 1 or 0 */ + { U32 const lastR = ZSTD_REP_NUM + ll0; + U32 repCode; + for (repCode = ll0; repCode < lastR; repCode++) { + U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + U32 const repIndex = curr - repOffset; + U32 repLen = 0; + assert(curr >= dictLimit); + if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < curr-dictLimit) { /* equivalent to `curr > repIndex >= dictLimit` */ + /* We must validate the repcode offset because when we're using a dictionary the + * valid offset range shrinks when the dictionary goes out of bounds. + */ + if ((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) { + repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch; + } + } else { /* repIndex < dictLimit || repIndex >= curr */ + const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ? + dmsBase + repIndex - dmsIndexDelta : + dictBase + repIndex; + assert(curr >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */ + & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */ + & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } + /* save longer solution */ + if (repLen > bestLength) { + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; + matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) + | (ip+repLen == iLimit) ) { /* best possible */ + return mnum; + } } } } + + /* HC3 match finder */ + if ((mls == 3) /*static*/ && (bestLength < mls)) { + U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip); + if ((matchIndex3 >= matchLow) + & (curr - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) { + size_t mlen; + if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) { + const BYTE* const match = base + matchIndex3; + mlen = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex3; + mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart); + } + + /* save best solution */ + if (mlen >= mls /* == 3 > bestLength */) { + DEBUGLOG(8, "found small match with hlog3, of length %u", + (U32)mlen); + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ + matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | + (ip+mlen == iLimit) ) { /* best possible length */ + ms->nextToUpdate = curr+1; /* skip insertion */ + return 1; + } } } + /* no dictMatchState lookup: dicts don't have a populated HC3 table */ + } /* if (mls == 3) */ + + hashTable[h] = curr; /* Update Hash Table */ + + for (; nbCompares && (matchIndex >= matchLow); --nbCompares) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + const BYTE* match; + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(curr > matchIndex); + + if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */ + match = base + matchIndex; + if (matchIndex >= dictLimit) assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */ + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit); + } else { + match = dictBase + matchIndex; + assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */ + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* prepare for match[matchLength] read */ + } + + if (matchLength > bestLength) { + DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", + (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */ + break; /* drop, to preserve bt consistency (miss a little bit of compression) */ + } } + + if (match[matchLength] < ip[matchLength]) { + /* match smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous, closer to current */ + } else { + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ + if (dictMode == ZSTD_dictMatchState && nbCompares) { + size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls); + U32 dictMatchIndex = dms->hashTable[dmsH]; + const U32* const dmsBt = dms->chainTable; + commonLengthSmaller = commonLengthLarger = 0; + for (; nbCompares && (dictMatchIndex > dmsLowLimit); --nbCompares) { + const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dmsBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart); + if (dictMatchIndex+matchLength >= dmsHighLimit) + match = base + dictMatchIndex + dmsIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; + DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", + (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } } + + if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */ + if (match[matchLength] < ip[matchLength]) { + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } } } /* if (dictMode == ZSTD_dictMatchState) */ + + assert(matchEndIdx > curr+8); + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + return mnum; +} + +typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, + ZSTD_MatchState_t*, + U32*, + const BYTE*, + const BYTE*, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat); + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat, + const ZSTD_dictMode_e dictMode, + const U32 mls) +{ + assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls); + DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls); + if (ip < ms->window.base + ms->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode); + return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls); +} + +#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls + +#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ + ZSTD_MatchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ + const U32 rep[ZSTD_REP_NUM], \ + U32 const ll0, \ + U32 const lengthToBeat) \ + { \ + return ZSTD_btGetAllMatches_internal( \ + matches, ms, nextToUpdate3, ip, iHighLimit, \ + rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \ + } + +#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6) + +GEN_ZSTD_BT_GET_ALL_MATCHES(noDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(extDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + +#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \ + { \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \ + } + +static ZSTD_getAllMatchesFn +ZSTD_selectBtGetAllMatches(ZSTD_MatchState_t const* ms, ZSTD_dictMode_e const dictMode) +{ + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState) + }; + U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6); + assert((U32)dictMode < 3); + assert(mls - 3 < 4); + return getAllMatchesFns[(int)dictMode][mls - 3]; +} + +/************************* +* LDM helper functions * +*************************/ + +/* Struct containing info needed to make decision about ldm inclusion */ +typedef struct { + RawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ +} ZSTD_optLdm_t; + +/* ZSTD_optLdm_skipRawSeqStoreBytes(): + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. + */ +static void ZSTD_optLdm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) +{ + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; + if (currPos >= currSeq.litLength + currSeq.matchLength) { + currPos -= currSeq.litLength + currSeq.matchLength; + rawSeqStore->pos++; + } else { + rawSeqStore->posInSequence = currPos; + break; + } + } + if (currPos == 0 || rawSeqStore->pos == rawSeqStore->size) { + rawSeqStore->posInSequence = 0; + } +} + +/* ZSTD_opt_getNextMatchAndUpdateSeqStore(): + * Calculates the beginning and end of the next match in the current block. + * Updates 'pos' and 'posInSequence' of the ldmSeqStore. + */ +static void +ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock, + U32 blockBytesRemaining) +{ + rawSeq currSeq; + U32 currBlockEndPos; + U32 literalsBytesRemaining; + U32 matchBytesRemaining; + + /* Setting match end position to MAX to ensure we never use an LDM during this block */ + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + optLdm->startPosInBlock = UINT_MAX; + optLdm->endPosInBlock = UINT_MAX; + return; + } + /* Calculate appropriate bytes left in matchLength and litLength + * after adjusting based on ldmSeqStore->posInSequence */ + currSeq = optLdm->seqStore.seq[optLdm->seqStore.pos]; + assert(optLdm->seqStore.posInSequence <= currSeq.litLength + currSeq.matchLength); + currBlockEndPos = currPosInBlock + blockBytesRemaining; + literalsBytesRemaining = (optLdm->seqStore.posInSequence < currSeq.litLength) ? + currSeq.litLength - (U32)optLdm->seqStore.posInSequence : + 0; + matchBytesRemaining = (literalsBytesRemaining == 0) ? + currSeq.matchLength - ((U32)optLdm->seqStore.posInSequence - currSeq.litLength) : + currSeq.matchLength; + + /* If there are more literal bytes than bytes remaining in block, no ldm is possible */ + if (literalsBytesRemaining >= blockBytesRemaining) { + optLdm->startPosInBlock = UINT_MAX; + optLdm->endPosInBlock = UINT_MAX; + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, blockBytesRemaining); + return; + } + + /* Matches may be < minMatch by this process. In that case, we will reject them + when we are deciding whether or not to add the ldm */ + optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining; + optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining; + optLdm->offset = currSeq.offset; + + if (optLdm->endPosInBlock > currBlockEndPos) { + /* Match ends after the block ends, we can't use the whole match */ + optLdm->endPosInBlock = currBlockEndPos; + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, currBlockEndPos - currPosInBlock); + } else { + /* Consume nb of bytes equal to size of sequence left */ + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, literalsBytesRemaining + matchBytesRemaining); + } +} + +/* ZSTD_optLdm_maybeAddMatch(): + * Adds a match if it's long enough, + * based on it's 'matchStartPosInBlock' and 'matchEndPosInBlock', + * into 'matches'. Maintains the correct ordering of 'matches'. + */ +static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock, + U32 minMatch) +{ + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; + /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ + if (currPosInBlock < optLdm->startPosInBlock + || currPosInBlock >= optLdm->endPosInBlock + || candidateMatchLength < minMatch) { + return; + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { + U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); + DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", + candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; + matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } +} + +/* ZSTD_optLdm_processMatchCandidate(): + * Wrapper function to update ldm seq store and call ldm functions as necessary. + */ +static void +ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, + U32 currPosInBlock, U32 remainingBytes, + U32 minMatch) +{ + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + return; + } + + if (currPosInBlock >= optLdm->endPosInBlock) { + if (currPosInBlock > optLdm->endPosInBlock) { + /* The position at which ZSTD_optLdm_processMatchCandidate() is called is not necessarily + * at the end of a match from the ldm seq store, and will often be some bytes + * over beyond matchEndPosInBlock. As such, we need to correct for these "overshoots" + */ + U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock; + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot); + } + ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); + } + ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock, minMatch); +} + + +/*-******************************* +* Optimal parser +*********************************/ + +#if 0 /* debug */ + +static void +listStats(const U32* table, int lastEltID) +{ + int const nbElts = lastEltID + 1; + int enb; + for (enb=0; enb < nbElts; enb++) { + (void)table; + /* RAWLOG(2, "%3i:%3i, ", enb, table[enb]); */ + RAWLOG(2, "%4i,", table[enb]); + } + RAWLOG(2, " \n"); +} + +#endif + +#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) +#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) +#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t +ZSTD_compressBlock_opt_generic(ZSTD_MatchState_t* ms, + SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, + const ZSTD_dictMode_e dictMode) +{ + optState_t* const optStatePtr = &ms->opt; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + + ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode); + + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4; + U32 nextToUpdate3 = ms->nextToUpdate; + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; + ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + + ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); + + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); + + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u", + (U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate); + assert(optLevel <= 2); + ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel); + ip += (ip==prefixStart); + + /* Match Loop */ + while (ip < ilimit) { + U32 cur, last_pos = 0; + + /* find first match */ + { U32 const litlen = (U32)(ip - anchor); + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, + (U32)(ip-istart), (U32)(iend-ip), + minMatch); + if (!nbMatches) { + DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); + ip++; + continue; + } + + /* Match found: let's store this solution, and eventually find more candidates. + * During this forward pass, @opt is used to store stretches, + * defined as "a match followed by N literals". + * Note how this is different from a Sequence, which is "N literals followed by a match". + * Storing stretches allows us to store different match predecessors + * for each literal position part of a literals run. */ + + /* initialize opt[0] */ + opt[0].mlen = 0; /* there are only literals so far */ + opt[0].litlen = litlen; + /* No need to include the actual price of the literals before the first match + * because it is static for the duration of the forward pass, and is included + * in every subsequent price. But, we include the literal length because + * the cost variation of litlen depends on the value of litlen. + */ + opt[0].price = LL_PRICE(litlen); + ZSTD_STATIC_ASSERT(sizeof(opt[0].rep[0]) == sizeof(rep[0])); + ZSTD_memcpy(&opt[0].rep, rep, sizeof(opt[0].rep)); + + /* large match -> immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; + U32 const maxOffBase = matches[nbMatches-1].off; + DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", + nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { + lastStretch.litlen = 0; + lastStretch.mlen = maxML; + lastStretch.off = maxOffBase; + DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; + last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); + { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { + opt[pos].price = ZSTD_MAX_PRICE; + opt[pos].mlen = 0; + opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { + int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); + int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; + opt[pos].off = offBase; + opt[pos].litlen = 0; /* end of match */ + opt[pos].price = sequencePrice + LL_PRICE(0); + } + } + last_pos = pos-1; + opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; + assert(cur <= ZSTD_OPT_NUM); + DEBUGLOG(7, "cPos:%i==rPos:%u", (int)(inr-istart), cur); + + /* Fix current position with one literal if cheaper */ + { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price + + LIT_PRICE(ip+cur-1) + + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { + ZSTD_optimal_t const prevMatch = opt[cur]; + DEBUGLOG(7, "cPos:%i==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); + opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; + if ( (optLevel >= 1) /* additional check only for higher modes */ + && (prevMatch.litlen == 0) /* replace a match */ + && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ + && LIKELY(ip + cur < iend) + ) { + /* check next position, in case it would be cheaper */ + int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); + int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); + DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", + cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); + if ( (with1literal < withMoreLiterals) + && (with1literal < opt[cur+1].price) ) { + /* update offset history - before it disappears */ + U32 const prev = cur - prevMatch.mlen; + Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); + assert(cur >= prevMatch.mlen); + DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", + ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), + newReps.rep[0], newReps.rep[1], newReps.rep[2] ); + opt[cur+1] = prevMatch; /* mlen & offbase */ + ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(Repcodes_t)); + opt[cur+1].litlen = 1; + opt[cur+1].price = with1literal; + if (last_pos < cur+1) last_pos = cur+1; + } + } + } else { + DEBUGLOG(7, "cPos:%i==rPos:%u : literal would cost more (%.2f>%.2f)", + (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + + /* Offset history is not updated during match comparison. + * Do it here, now that the match is selected and confirmed. + */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(Repcodes_t)); + assert(cur >= opt[cur].mlen); + if (opt[cur].litlen == 0) { + /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; + Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(Repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ + if (inr > ilimit) continue; + + if (cur == last_pos) break; + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { + DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); + { U32 const ll0 = (opt[cur].litlen == 0); + int const previousPrice = opt[cur].price; + int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, + (U32)(inr-istart), (U32)(iend-inr), + minMatch); + + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } + + { U32 const longestML = matches[nbMatches-1].len; + DEBUGLOG(7, "cPos:%i==rPos:%u, found %u matches, of longest ML=%u", + (int)(inr-istart), cur, nbMatches, longestML); + + if ( (longestML > sufficient_len) + || (cur + longestML >= ZSTD_OPT_NUM) + || (ip + cur + longestML >= iend) ) { + lastStretch.mlen = longestML; + lastStretch.off = matches[nbMatches-1].off; + lastStretch.litlen = 0; + last_pos = cur + longestML; + goto _shortestPath; + } } + + /* set prices using matches found at position == cur */ + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offset = matches[matchNb].off; + U32 const lastML = matches[matchNb].len; + U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; + U32 mlen; + + DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; + int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + while (last_pos < pos) { + /* fill empty positions, for future comparisons */ + last_pos++; + opt[last_pos].price = ZSTD_MAX_PRICE; + opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ + } + opt[pos].mlen = mlen; + opt[pos].off = offset; + opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } + opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + + lastStretch = opt[last_pos]; + assert(cur >= lastStretch.mlen); + cur = last_pos - lastStretch.mlen; + +_shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); + assert(last_pos >= lastStretch.mlen); + assert(cur == last_pos - lastStretch.mlen); + + if (lastStretch.mlen==0) { + /* no solution : all matches have been converted into literals */ + assert(lastStretch.litlen == (ip - anchor) + last_pos); + ip += last_pos; + continue; + } + assert(lastStretch.off > 0); + + /* Update offset history */ + if (lastStretch.litlen == 0) { + /* finishing on a match : update offset history */ + Repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); + ZSTD_memcpy(rep, &reps, sizeof(Repcodes_t)); + } else { + ZSTD_memcpy(rep, lastStretch.rep, sizeof(Repcodes_t)); + assert(cur >= lastStretch.litlen); + cur -= lastStretch.litlen; + } + + /* Let's write the shortest path solution. + * It is stored in @opt in reverse order, + * starting from @storeEnd (==cur+2), + * effectively partially @opt overwriting. + * Content is changed too: + * - So far, @opt stored stretches, aka a match followed by literals + * - Now, it will store sequences, aka literals followed by a match + */ + { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; + U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; + assert(storeEnd < ZSTD_OPT_SIZE); + DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); + if (lastStretch.litlen > 0) { + /* last "sequence" is unfinished: just a bunch of literals */ + opt[storeEnd].litlen = lastStretch.litlen; + opt[storeEnd].mlen = 0; + storeStart = storeEnd-1; + opt[storeStart] = lastStretch; + } { + opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ + storeStart = storeEnd; + } + while (1) { + ZSTD_optimal_t nextStretch = opt[stretchPos]; + opt[storeStart].litlen = nextStretch.litlen; + DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", + opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); + if (nextStretch.mlen == 0) { + /* reaching beginning of segment */ + break; + } + storeStart--; + opt[storeStart] = nextStretch; /* note: litlen will be fixed */ + assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); + stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ + DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; + U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %i, llen=%u, mlen=%u", + (int)(anchor - istart), (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ + ip = anchor + llen; /* last "sequence" is a bunch of literals => don't progress anchor */ + continue; /* will finish */ + } + + assert(anchor + llen <= iend); + ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } + DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); + + /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} +#endif /* build exclusions */ + +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR +static size_t ZSTD_compressBlock_opt0( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +static size_t ZSTD_compressBlock_opt2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btopt( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); +} +#endif + + + + +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +/* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. + * this function cannot error out, its narrow contract must be respected. + */ +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_initStats_ultra(ZSTD_MatchState_t* ms, + SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); + + DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize); + assert(ms->opt.litLengthSum == 0); /* first block */ + assert(seqStore->sequences == seqStore->sequencesStart); /* no ldm */ + assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */ + assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */ + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + + /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; + ms->window.lowLimit = ms->window.dictLimit; + ms->nextToUpdate = ms->window.dictLimit; + +} + +size_t ZSTD_compressBlock_btultra( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btultra2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + + /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics + * in order to seed next round's statistics with it. + * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), + * the cost is 2x cpu time on first block. */ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ + && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ + && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); +} +#endif + +/* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c new file mode 100644 index 0000000..0f1fe6d --- /dev/null +++ b/lib/compress/zstdmt_compress.c @@ -0,0 +1,1923 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* ====== Compiler specifics ====== */ +#if defined(_MSC_VER) +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +#endif + + +/* ====== Dependencies ====== */ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset, INT_MAX, UINT_MAX */ +#include "../common/mem.h" /* MEM_STATIC */ +#include "../common/pool.h" /* threadpool */ +#include "../common/threading.h" /* mutex */ +#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ +#include "zstd_ldm.h" +#include "zstdmt_compress.h" + +/* Guards code to support resizing the SeqPool. + * We will want to resize the SeqPool to save memory in the future. + * Until then, comment the code out since it is unused. + */ +#define ZSTD_RESIZE_SEQPOOL 0 + +/* ====== Debug ====== */ +#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \ + && !defined(_MSC_VER) \ + && !defined(__MINGW32__) + +# include +# include +# include + +# define DEBUG_PRINTHEX(l,p,n) \ + do { \ + unsigned debug_u; \ + for (debug_u=0; debug_u<(n); debug_u++) \ + RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ + RAWLOG(l, " \n"); \ + } while (0) + +static unsigned long long GetCurrentClockTimeMicroseconds(void) +{ + static clock_t _ticksPerSecond = 0; + if (_ticksPerSecond <= 0) _ticksPerSecond = sysconf(_SC_CLK_TCK); + + { struct tms junk; clock_t newTicks = (clock_t) times(&junk); + return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond); +} } + +#define MUTEX_WAIT_TIME_DLEVEL 6 +#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) \ + do { \ + if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) { \ + unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \ + ZSTD_pthread_mutex_lock(mutex); \ + { unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \ + unsigned long long const elapsedTime = (afterTime-beforeTime); \ + if (elapsedTime > 1000) { \ + /* or whatever threshold you like; I'm using 1 millisecond here */ \ + DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, \ + "Thread took %llu microseconds to acquire mutex %s \n", \ + elapsedTime, #mutex); \ + } } \ + } else { \ + ZSTD_pthread_mutex_lock(mutex); \ + } \ + } while (0) + +#else + +# define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m) +# define DEBUG_PRINTHEX(l,p,n) do { } while (0) + +#endif + + +/* ===== Buffer Pool ===== */ +/* a single Buffer Pool can be invoked from multiple threads in parallel */ + +typedef struct buffer_s { + void* start; + size_t capacity; +} Buffer; + +static const Buffer g_nullBuffer = { NULL, 0 }; + +typedef struct ZSTDMT_bufferPool_s { + ZSTD_pthread_mutex_t poolMutex; + size_t bufferSize; + unsigned totalBuffers; + unsigned nbBuffers; + ZSTD_customMem cMem; + Buffer* buffers; +} ZSTDMT_bufferPool; + +static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool) +{ + DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool); + if (!bufPool) return; /* compatibility with free on NULL */ + if (bufPool->buffers) { + unsigned u; + for (u=0; utotalBuffers; u++) { + DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->buffers[u].start); + ZSTD_customFree(bufPool->buffers[u].start, bufPool->cMem); + } + ZSTD_customFree(bufPool->buffers, bufPool->cMem); + } + ZSTD_pthread_mutex_destroy(&bufPool->poolMutex); + ZSTD_customFree(bufPool, bufPool->cMem); +} + +static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem) +{ + ZSTDMT_bufferPool* const bufPool = + (ZSTDMT_bufferPool*)ZSTD_customCalloc(sizeof(ZSTDMT_bufferPool), cMem); + if (bufPool==NULL) return NULL; + if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) { + ZSTD_customFree(bufPool, cMem); + return NULL; + } + bufPool->buffers = (Buffer*)ZSTD_customCalloc(maxNbBuffers * sizeof(Buffer), cMem); + if (bufPool->buffers==NULL) { + ZSTDMT_freeBufferPool(bufPool); + return NULL; + } + bufPool->bufferSize = 64 KB; + bufPool->totalBuffers = maxNbBuffers; + bufPool->nbBuffers = 0; + bufPool->cMem = cMem; + return bufPool; +} + +/* only works at initialization, not during compression */ +static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool) +{ + size_t const poolSize = sizeof(*bufPool); + size_t const arraySize = bufPool->totalBuffers * sizeof(Buffer); + unsigned u; + size_t totalBufferSize = 0; + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + for (u=0; utotalBuffers; u++) + totalBufferSize += bufPool->buffers[u].capacity; + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + + return poolSize + arraySize + totalBufferSize; +} + +/* ZSTDMT_setBufferSize() : + * all future buffers provided by this buffer pool will have _at least_ this size + * note : it's better for all buffers to have same size, + * as they become freely interchangeable, reducing malloc/free usages and memory fragmentation */ +static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const bSize) +{ + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + DEBUGLOG(4, "ZSTDMT_setBufferSize: bSize = %u", (U32)bSize); + bufPool->bufferSize = bSize; + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); +} + + +static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, unsigned maxNbBuffers) +{ + if (srcBufPool==NULL) return NULL; + if (srcBufPool->totalBuffers >= maxNbBuffers) /* good enough */ + return srcBufPool; + /* need a larger buffer pool */ + { ZSTD_customMem const cMem = srcBufPool->cMem; + size_t const bSize = srcBufPool->bufferSize; /* forward parameters */ + ZSTDMT_bufferPool* newBufPool; + ZSTDMT_freeBufferPool(srcBufPool); + newBufPool = ZSTDMT_createBufferPool(maxNbBuffers, cMem); + if (newBufPool==NULL) return newBufPool; + ZSTDMT_setBufferSize(newBufPool, bSize); + return newBufPool; + } +} + +/** ZSTDMT_getBuffer() : + * assumption : bufPool must be valid + * @return : a buffer, with start pointer and size + * note: allocation may fail, in this case, start==NULL and size==0 */ +static Buffer ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) +{ + size_t const bSize = bufPool->bufferSize; + DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize); + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + if (bufPool->nbBuffers) { /* try to use an existing buffer */ + Buffer const buf = bufPool->buffers[--(bufPool->nbBuffers)]; + size_t const availBufferSize = buf.capacity; + bufPool->buffers[bufPool->nbBuffers] = g_nullBuffer; + if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) { + /* large enough, but not too much */ + DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u", + bufPool->nbBuffers, (U32)buf.capacity); + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + return buf; + } + /* size conditions not respected : scratch this buffer, create new one */ + DEBUGLOG(5, "ZSTDMT_getBuffer: existing buffer does not meet size conditions => freeing"); + ZSTD_customFree(buf.start, bufPool->cMem); + } + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + /* create new buffer */ + DEBUGLOG(5, "ZSTDMT_getBuffer: create a new buffer"); + { Buffer buffer; + void* const start = ZSTD_customMalloc(bSize, bufPool->cMem); + buffer.start = start; /* note : start can be NULL if malloc fails ! */ + buffer.capacity = (start==NULL) ? 0 : bSize; + if (start==NULL) { + DEBUGLOG(5, "ZSTDMT_getBuffer: buffer allocation failure !!"); + } else { + DEBUGLOG(5, "ZSTDMT_getBuffer: created buffer of size %u", (U32)bSize); + } + return buffer; + } +} + +#if ZSTD_RESIZE_SEQPOOL +/** ZSTDMT_resizeBuffer() : + * assumption : bufPool must be valid + * @return : a buffer that is at least the buffer pool buffer size. + * If a reallocation happens, the data in the input buffer is copied. + */ +static Buffer ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, Buffer buffer) +{ + size_t const bSize = bufPool->bufferSize; + if (buffer.capacity < bSize) { + void* const start = ZSTD_customMalloc(bSize, bufPool->cMem); + Buffer newBuffer; + newBuffer.start = start; + newBuffer.capacity = start == NULL ? 0 : bSize; + if (start != NULL) { + assert(newBuffer.capacity >= buffer.capacity); + ZSTD_memcpy(newBuffer.start, buffer.start, buffer.capacity); + DEBUGLOG(5, "ZSTDMT_resizeBuffer: created buffer of size %u", (U32)bSize); + return newBuffer; + } + DEBUGLOG(5, "ZSTDMT_resizeBuffer: buffer allocation failure !!"); + } + return buffer; +} +#endif + +/* store buffer for later re-use, up to pool capacity */ +static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, Buffer buf) +{ + DEBUGLOG(5, "ZSTDMT_releaseBuffer"); + if (buf.start == NULL) return; /* compatible with release on NULL */ + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + if (bufPool->nbBuffers < bufPool->totalBuffers) { + bufPool->buffers[bufPool->nbBuffers++] = buf; /* stored for later use */ + DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u", + (U32)buf.capacity, (U32)(bufPool->nbBuffers-1)); + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + return; + } + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + /* Reached bufferPool capacity (note: should not happen) */ + DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing "); + ZSTD_customFree(buf.start, bufPool->cMem); +} + +/* We need 2 output buffers per worker since each dstBuff must be flushed after it is released. + * The 3 additional buffers are as follows: + * 1 buffer for input loading + * 1 buffer for "next input" when submitting current one + * 1 buffer stuck in queue */ +#define BUF_POOL_MAX_NB_BUFFERS(nbWorkers) (2*(nbWorkers) + 3) + +/* After a worker releases its rawSeqStore, it is immediately ready for reuse. + * So we only need one seq buffer per worker. */ +#define SEQ_POOL_MAX_NB_BUFFERS(nbWorkers) (nbWorkers) + +/* ===== Seq Pool Wrapper ====== */ + +typedef ZSTDMT_bufferPool ZSTDMT_seqPool; + +static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool) +{ + return ZSTDMT_sizeof_bufferPool(seqPool); +} + +static RawSeqStore_t bufferToSeq(Buffer buffer) +{ + RawSeqStore_t seq = kNullRawSeqStore; + seq.seq = (rawSeq*)buffer.start; + seq.capacity = buffer.capacity / sizeof(rawSeq); + return seq; +} + +static Buffer seqToBuffer(RawSeqStore_t seq) +{ + Buffer buffer; + buffer.start = seq.seq; + buffer.capacity = seq.capacity * sizeof(rawSeq); + return buffer; +} + +static RawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool) +{ + if (seqPool->bufferSize == 0) { + return kNullRawSeqStore; + } + return bufferToSeq(ZSTDMT_getBuffer(seqPool)); +} + +#if ZSTD_RESIZE_SEQPOOL +static RawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, RawSeqStore_t seq) +{ + return bufferToSeq(ZSTDMT_resizeBuffer(seqPool, seqToBuffer(seq))); +} +#endif + +static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, RawSeqStore_t seq) +{ + ZSTDMT_releaseBuffer(seqPool, seqToBuffer(seq)); +} + +static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq) +{ + ZSTDMT_setBufferSize(seqPool, nbSeq * sizeof(rawSeq)); +} + +static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem) +{ + ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(SEQ_POOL_MAX_NB_BUFFERS(nbWorkers), cMem); + if (seqPool == NULL) return NULL; + ZSTDMT_setNbSeq(seqPool, 0); + return seqPool; +} + +static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool) +{ + ZSTDMT_freeBufferPool(seqPool); +} + +static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers) +{ + return ZSTDMT_expandBufferPool(pool, SEQ_POOL_MAX_NB_BUFFERS(nbWorkers)); +} + + +/* ===== CCtx Pool ===== */ +/* a single CCtx Pool can be invoked from multiple threads in parallel */ + +typedef struct { + ZSTD_pthread_mutex_t poolMutex; + int totalCCtx; + int availCCtx; + ZSTD_customMem cMem; + ZSTD_CCtx** cctxs; +} ZSTDMT_CCtxPool; + +/* note : all CCtx borrowed from the pool must be reverted back to the pool _before_ freeing the pool */ +static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool) +{ + if (!pool) return; + ZSTD_pthread_mutex_destroy(&pool->poolMutex); + if (pool->cctxs) { + int cid; + for (cid=0; cidtotalCCtx; cid++) + ZSTD_freeCCtx(pool->cctxs[cid]); /* free compatible with NULL */ + ZSTD_customFree(pool->cctxs, pool->cMem); + } + ZSTD_customFree(pool, pool->cMem); +} + +/* ZSTDMT_createCCtxPool() : + * implies nbWorkers >= 1 , checked by caller ZSTDMT_createCCtx() */ +static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers, + ZSTD_customMem cMem) +{ + ZSTDMT_CCtxPool* const cctxPool = + (ZSTDMT_CCtxPool*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtxPool), cMem); + assert(nbWorkers > 0); + if (!cctxPool) return NULL; + if (ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) { + ZSTD_customFree(cctxPool, cMem); + return NULL; + } + cctxPool->totalCCtx = nbWorkers; + cctxPool->cctxs = (ZSTD_CCtx**)ZSTD_customCalloc(nbWorkers * sizeof(ZSTD_CCtx*), cMem); + if (!cctxPool->cctxs) { + ZSTDMT_freeCCtxPool(cctxPool); + return NULL; + } + cctxPool->cMem = cMem; + cctxPool->cctxs[0] = ZSTD_createCCtx_advanced(cMem); + if (!cctxPool->cctxs[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; } + cctxPool->availCCtx = 1; /* at least one cctx for single-thread mode */ + DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers); + return cctxPool; +} + +static ZSTDMT_CCtxPool* ZSTDMT_expandCCtxPool(ZSTDMT_CCtxPool* srcPool, + int nbWorkers) +{ + if (srcPool==NULL) return NULL; + if (nbWorkers <= srcPool->totalCCtx) return srcPool; /* good enough */ + /* need a larger cctx pool */ + { ZSTD_customMem const cMem = srcPool->cMem; + ZSTDMT_freeCCtxPool(srcPool); + return ZSTDMT_createCCtxPool(nbWorkers, cMem); + } +} + +/* only works during initialization phase, not during compression */ +static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool) +{ + ZSTD_pthread_mutex_lock(&cctxPool->poolMutex); + { unsigned const nbWorkers = cctxPool->totalCCtx; + size_t const poolSize = sizeof(*cctxPool); + size_t const arraySize = cctxPool->totalCCtx * sizeof(ZSTD_CCtx*); + size_t totalCCtxSize = 0; + unsigned u; + for (u=0; ucctxs[u]); + } + ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); + assert(nbWorkers > 0); + return poolSize + arraySize + totalCCtxSize; + } +} + +static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool) +{ + DEBUGLOG(5, "ZSTDMT_getCCtx"); + ZSTD_pthread_mutex_lock(&cctxPool->poolMutex); + if (cctxPool->availCCtx) { + cctxPool->availCCtx--; + { ZSTD_CCtx* const cctx = cctxPool->cctxs[cctxPool->availCCtx]; + ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); + return cctx; + } } + ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); + DEBUGLOG(5, "create one more CCtx"); + return ZSTD_createCCtx_advanced(cctxPool->cMem); /* note : can be NULL, when creation fails ! */ +} + +static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return; /* compatibility with release on NULL */ + ZSTD_pthread_mutex_lock(&pool->poolMutex); + if (pool->availCCtx < pool->totalCCtx) + pool->cctxs[pool->availCCtx++] = cctx; + else { + /* pool overflow : should not happen, since totalCCtx==nbWorkers */ + DEBUGLOG(4, "CCtx pool overflow : free cctx"); + ZSTD_freeCCtx(cctx); + } + ZSTD_pthread_mutex_unlock(&pool->poolMutex); +} + +/* ==== Serial State ==== */ + +typedef struct { + void const* start; + size_t size; +} Range; + +typedef struct { + /* All variables in the struct are protected by mutex. */ + ZSTD_pthread_mutex_t mutex; + ZSTD_pthread_cond_t cond; + ZSTD_CCtx_params params; + ldmState_t ldmState; + XXH64_state_t xxhState; + unsigned nextJobID; + /* Protects ldmWindow. + * Must be acquired after the main mutex when acquiring both. + */ + ZSTD_pthread_mutex_t ldmWindowMutex; + ZSTD_pthread_cond_t ldmWindowCond; /* Signaled when ldmWindow is updated */ + ZSTD_window_t ldmWindow; /* A thread-safe copy of ldmState.window */ +} SerialState; + +static int +ZSTDMT_serialState_reset(SerialState* serialState, + ZSTDMT_seqPool* seqPool, + ZSTD_CCtx_params params, + size_t jobSize, + const void* dict, size_t const dictSize, + ZSTD_dictContentType_e dictContentType) +{ + /* Adjust parameters */ + if (params.ldmParams.enableLdm == ZSTD_ps_enable) { + DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10); + ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); + assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); + assert(params.ldmParams.hashRateLog < 32); + } else { + ZSTD_memset(¶ms.ldmParams, 0, sizeof(params.ldmParams)); + } + serialState->nextJobID = 0; + if (params.fParams.checksumFlag) + XXH64_reset(&serialState->xxhState, 0); + if (params.ldmParams.enableLdm == ZSTD_ps_enable) { + ZSTD_customMem cMem = params.customMem; + unsigned const hashLog = params.ldmParams.hashLog; + size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t); + unsigned const bucketLog = + params.ldmParams.hashLog - params.ldmParams.bucketSizeLog; + unsigned const prevBucketLog = + serialState->params.ldmParams.hashLog - + serialState->params.ldmParams.bucketSizeLog; + size_t const numBuckets = (size_t)1 << bucketLog; + /* Size the seq pool tables */ + ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize)); + /* Reset the window */ + ZSTD_window_init(&serialState->ldmState.window); + /* Resize tables and output space if necessary. */ + if (serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) { + ZSTD_customFree(serialState->ldmState.hashTable, cMem); + serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_customMalloc(hashSize, cMem); + } + if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) { + ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem); + serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(numBuckets, cMem); + } + if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets) + return 1; + /* Zero the tables */ + ZSTD_memset(serialState->ldmState.hashTable, 0, hashSize); + ZSTD_memset(serialState->ldmState.bucketOffsets, 0, numBuckets); + + /* Update window state and fill hash table with dict */ + serialState->ldmState.loadedDictEnd = 0; + if (dictSize > 0) { + if (dictContentType == ZSTD_dct_rawContent) { + BYTE const* const dictEnd = (const BYTE*)dict + dictSize; + ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0); + ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, ¶ms.ldmParams); + serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base); + } else { + /* don't even load anything */ + } + } + + /* Initialize serialState's copy of ldmWindow. */ + serialState->ldmWindow = serialState->ldmState.window; + } + + serialState->params = params; + serialState->params.jobSize = (U32)jobSize; + return 0; +} + +static int ZSTDMT_serialState_init(SerialState* serialState) +{ + int initError = 0; + ZSTD_memset(serialState, 0, sizeof(*serialState)); + initError |= ZSTD_pthread_mutex_init(&serialState->mutex, NULL); + initError |= ZSTD_pthread_cond_init(&serialState->cond, NULL); + initError |= ZSTD_pthread_mutex_init(&serialState->ldmWindowMutex, NULL); + initError |= ZSTD_pthread_cond_init(&serialState->ldmWindowCond, NULL); + return initError; +} + +static void ZSTDMT_serialState_free(SerialState* serialState) +{ + ZSTD_customMem cMem = serialState->params.customMem; + ZSTD_pthread_mutex_destroy(&serialState->mutex); + ZSTD_pthread_cond_destroy(&serialState->cond); + ZSTD_pthread_mutex_destroy(&serialState->ldmWindowMutex); + ZSTD_pthread_cond_destroy(&serialState->ldmWindowCond); + ZSTD_customFree(serialState->ldmState.hashTable, cMem); + ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem); +} + +static void +ZSTDMT_serialState_genSequences(SerialState* serialState, + RawSeqStore_t* seqStore, + Range src, unsigned jobID) +{ + /* Wait for our turn */ + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex); + while (serialState->nextJobID < jobID) { + DEBUGLOG(5, "wait for serialState->cond"); + ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex); + } + /* A future job may error and skip our job */ + if (serialState->nextJobID == jobID) { + /* It is now our turn, do any processing necessary */ + if (serialState->params.ldmParams.enableLdm == ZSTD_ps_enable) { + size_t error; + DEBUGLOG(6, "ZSTDMT_serialState_genSequences: LDM update"); + assert(seqStore->seq != NULL && seqStore->pos == 0 && + seqStore->size == 0 && seqStore->capacity > 0); + assert(src.size <= serialState->params.jobSize); + ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0); + error = ZSTD_ldm_generateSequences( + &serialState->ldmState, seqStore, + &serialState->params.ldmParams, src.start, src.size); + /* We provide a large enough buffer to never fail. */ + assert(!ZSTD_isError(error)); (void)error; + /* Update ldmWindow to match the ldmState.window and signal the main + * thread if it is waiting for a buffer. + */ + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex); + serialState->ldmWindow = serialState->ldmState.window; + ZSTD_pthread_cond_signal(&serialState->ldmWindowCond); + ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex); + } + if (serialState->params.fParams.checksumFlag && src.size > 0) + XXH64_update(&serialState->xxhState, src.start, src.size); + } + /* Now it is the next jobs turn */ + serialState->nextJobID++; + ZSTD_pthread_cond_broadcast(&serialState->cond); + ZSTD_pthread_mutex_unlock(&serialState->mutex); +} + +static void +ZSTDMT_serialState_applySequences(const SerialState* serialState, /* just for an assert() check */ + ZSTD_CCtx* jobCCtx, + const RawSeqStore_t* seqStore) +{ + if (seqStore->size > 0) { + DEBUGLOG(5, "ZSTDMT_serialState_applySequences: uploading %u external sequences", (unsigned)seqStore->size); + assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable); (void)serialState; + assert(jobCCtx); + ZSTD_referenceExternalSequences(jobCCtx, seqStore->seq, seqStore->size); + } +} + +static void ZSTDMT_serialState_ensureFinished(SerialState* serialState, + unsigned jobID, size_t cSize) +{ + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex); + if (serialState->nextJobID <= jobID) { + assert(ZSTD_isError(cSize)); (void)cSize; + DEBUGLOG(5, "Skipping past job %u because of error", jobID); + serialState->nextJobID = jobID + 1; + ZSTD_pthread_cond_broadcast(&serialState->cond); + + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex); + ZSTD_window_clear(&serialState->ldmWindow); + ZSTD_pthread_cond_signal(&serialState->ldmWindowCond); + ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex); + } + ZSTD_pthread_mutex_unlock(&serialState->mutex); + +} + + +/* ------------------------------------------ */ +/* ===== Worker thread ===== */ +/* ------------------------------------------ */ + +static const Range kNullRange = { NULL, 0 }; + +typedef struct { + size_t consumed; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */ + size_t cSize; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */ + ZSTD_pthread_mutex_t job_mutex; /* Thread-safe - used by mtctx and worker */ + ZSTD_pthread_cond_t job_cond; /* Thread-safe - used by mtctx and worker */ + ZSTDMT_CCtxPool* cctxPool; /* Thread-safe - used by mtctx and (all) workers */ + ZSTDMT_bufferPool* bufPool; /* Thread-safe - used by mtctx and (all) workers */ + ZSTDMT_seqPool* seqPool; /* Thread-safe - used by mtctx and (all) workers */ + SerialState* serial; /* Thread-safe - used by mtctx and (all) workers */ + Buffer dstBuff; /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */ + Range prefix; /* set by mtctx, then read by worker & mtctx => no barrier */ + Range src; /* set by mtctx, then read by worker & mtctx => no barrier */ + unsigned jobID; /* set by mtctx, then read by worker => no barrier */ + unsigned firstJob; /* set by mtctx, then read by worker => no barrier */ + unsigned lastJob; /* set by mtctx, then read by worker => no barrier */ + ZSTD_CCtx_params params; /* set by mtctx, then read by worker => no barrier */ + const ZSTD_CDict* cdict; /* set by mtctx, then read by worker => no barrier */ + unsigned long long fullFrameSize; /* set by mtctx, then read by worker => no barrier */ + size_t dstFlushed; /* used only by mtctx */ + unsigned frameChecksumNeeded; /* used only by mtctx */ +} ZSTDMT_jobDescription; + +#define JOB_ERROR(e) \ + do { \ + ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); \ + job->cSize = e; \ + ZSTD_pthread_mutex_unlock(&job->job_mutex); \ + goto _endJob; \ + } while (0) + +/* ZSTDMT_compressionJob() is a POOL_function type */ +static void ZSTDMT_compressionJob(void* jobDescription) +{ + ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription; + ZSTD_CCtx_params jobParams = job->params; /* do not modify job->params ! copy it, modify the copy */ + ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool); + RawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool); + Buffer dstBuff = job->dstBuff; + size_t lastCBlockSize = 0; + + DEBUGLOG(5, "ZSTDMT_compressionJob: job %u", job->jobID); + /* resources */ + if (cctx==NULL) JOB_ERROR(ERROR(memory_allocation)); + if (dstBuff.start == NULL) { /* streaming job : doesn't provide a dstBuffer */ + dstBuff = ZSTDMT_getBuffer(job->bufPool); + if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation)); + job->dstBuff = dstBuff; /* this value can be read in ZSTDMT_flush, when it copies the whole job */ + } + if (jobParams.ldmParams.enableLdm == ZSTD_ps_enable && rawSeqStore.seq == NULL) + JOB_ERROR(ERROR(memory_allocation)); + + /* Don't compute the checksum for chunks, since we compute it externally, + * but write it in the header. + */ + if (job->jobID != 0) jobParams.fParams.checksumFlag = 0; + /* Don't run LDM for the chunks, since we handle it externally */ + jobParams.ldmParams.enableLdm = ZSTD_ps_disable; + /* Correct nbWorkers to 0. */ + jobParams.nbWorkers = 0; + + + /* init */ + + /* Perform serial step as early as possible */ + ZSTDMT_serialState_genSequences(job->serial, &rawSeqStore, job->src, job->jobID); + + if (job->cdict) { + size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, &jobParams, job->fullFrameSize); + assert(job->firstJob); /* only allowed for first job */ + if (ZSTD_isError(initError)) JOB_ERROR(initError); + } else { + U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size; + { size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob); + if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError); + } + if (!job->firstJob) { + size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0); + if (ZSTD_isError(err)) JOB_ERROR(err); + } + DEBUGLOG(6, "ZSTDMT_compressionJob: job %u: loading prefix of size %zu", job->jobID, job->prefix.size); + { size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, + job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, + ZSTD_dtlm_fast, + NULL, /*cdict*/ + &jobParams, pledgedSrcSize); + if (ZSTD_isError(initError)) JOB_ERROR(initError); + } } + + /* External Sequences can only be applied after CCtx initialization */ + ZSTDMT_serialState_applySequences(job->serial, cctx, &rawSeqStore); + + if (!job->firstJob) { /* flush and overwrite frame header when it's not first job */ + size_t const hSize = ZSTD_compressContinue_public(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0); + if (ZSTD_isError(hSize)) JOB_ERROR(hSize); + DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize); + ZSTD_invalidateRepCodes(cctx); + } + + /* compress the entire job by smaller chunks, for better granularity */ + { size_t const chunkSize = 4*ZSTD_BLOCKSIZE_MAX; + int const nbChunks = (int)((job->src.size + (chunkSize-1)) / chunkSize); + const BYTE* ip = (const BYTE*) job->src.start; + BYTE* const ostart = (BYTE*)dstBuff.start; + BYTE* op = ostart; + BYTE* oend = op + dstBuff.capacity; + int chunkNb; + if (sizeof(size_t) > sizeof(int)) assert(job->src.size < ((size_t)INT_MAX) * chunkSize); /* check overflow */ + DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks); + assert(job->cSize == 0); + for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) { + size_t const cSize = ZSTD_compressContinue_public(cctx, op, oend-op, ip, chunkSize); + if (ZSTD_isError(cSize)) JOB_ERROR(cSize); + ip += chunkSize; + op += cSize; assert(op < oend); + /* stats */ + ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); + job->cSize += cSize; + job->consumed = chunkSize * chunkNb; + DEBUGLOG(5, "ZSTDMT_compressionJob: compress new block : cSize==%u bytes (total: %u)", + (U32)cSize, (U32)job->cSize); + ZSTD_pthread_cond_signal(&job->job_cond); /* warns some more data is ready to be flushed */ + ZSTD_pthread_mutex_unlock(&job->job_mutex); + } + /* last block */ + assert(chunkSize > 0); + assert((chunkSize & (chunkSize - 1)) == 0); /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */ + if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) { + size_t const lastBlockSize1 = job->src.size & (chunkSize-1); + size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1; + size_t const cSize = (job->lastJob) ? + ZSTD_compressEnd_public(cctx, op, oend-op, ip, lastBlockSize) : + ZSTD_compressContinue_public(cctx, op, oend-op, ip, lastBlockSize); + if (ZSTD_isError(cSize)) JOB_ERROR(cSize); + lastCBlockSize = cSize; + } } + if (!job->firstJob) { + /* Double check that we don't have an ext-dict, because then our + * repcode invalidation doesn't work. + */ + assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); + } + ZSTD_CCtx_trace(cctx, 0); + +_endJob: + ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize); + if (job->prefix.size > 0) + DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start); + DEBUGLOG(5, "Finished with source: %zx", (size_t)job->src.start); + /* release resources */ + ZSTDMT_releaseSeq(job->seqPool, rawSeqStore); + ZSTDMT_releaseCCtx(job->cctxPool, cctx); + /* report */ + ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); + if (ZSTD_isError(job->cSize)) assert(lastCBlockSize == 0); + job->cSize += lastCBlockSize; + job->consumed = job->src.size; /* when job->consumed == job->src.size , compression job is presumed completed */ + ZSTD_pthread_cond_signal(&job->job_cond); + ZSTD_pthread_mutex_unlock(&job->job_mutex); +} + + +/* ------------------------------------------ */ +/* ===== Multi-threaded compression ===== */ +/* ------------------------------------------ */ + +typedef struct { + Range prefix; /* read-only non-owned prefix buffer */ + Buffer buffer; + size_t filled; +} InBuff_t; + +typedef struct { + BYTE* buffer; /* The round input buffer. All jobs get references + * to pieces of the buffer. ZSTDMT_tryGetInputRange() + * handles handing out job input buffers, and makes + * sure it doesn't overlap with any pieces still in use. + */ + size_t capacity; /* The capacity of buffer. */ + size_t pos; /* The position of the current inBuff in the round + * buffer. Updated past the end if the inBuff once + * the inBuff is sent to the worker thread. + * pos <= capacity. + */ +} RoundBuff_t; + +static const RoundBuff_t kNullRoundBuff = {NULL, 0, 0}; + +#define RSYNC_LENGTH 32 +/* Don't create chunks smaller than the zstd block size. + * This stops us from regressing compression ratio too much, + * and ensures our output fits in ZSTD_compressBound(). + * + * If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then + * ZSTD_COMPRESSBOUND() will need to be updated. + */ +#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX +#define RSYNC_MIN_BLOCK_SIZE (1< one job is already prepared, but pool has shortage of workers. Don't create a new job. */ + InBuff_t inBuff; + RoundBuff_t roundBuff; + SerialState serial; + RSyncState_t rsync; + unsigned jobIDMask; + unsigned doneJobID; + unsigned nextJobID; + unsigned frameEnded; + unsigned allJobsCompleted; + unsigned long long frameContentSize; + unsigned long long consumed; + unsigned long long produced; + ZSTD_customMem cMem; + ZSTD_CDict* cdictLocal; + const ZSTD_CDict* cdict; + unsigned providedFactory: 1; +}; + +static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZSTD_customMem cMem) +{ + U32 jobNb; + if (jobTable == NULL) return; + for (jobNb=0; jobNb mtctx->jobIDMask+1) { /* need more job capacity */ + ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem); + mtctx->jobIDMask = 0; + mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, mtctx->cMem); + if (mtctx->jobs==NULL) return ERROR(memory_allocation); + assert((nbJobs != 0) && ((nbJobs & (nbJobs - 1)) == 0)); /* ensure nbJobs is a power of 2 */ + mtctx->jobIDMask = nbJobs - 1; + } + return 0; +} + + +/* ZSTDMT_CCtxParam_setNbWorkers(): + * Internal use only */ +static size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers) +{ + return ZSTD_CCtxParams_setParameter(params, ZSTD_c_nbWorkers, (int)nbWorkers); +} + +MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool) +{ + ZSTDMT_CCtx* mtctx; + U32 nbJobs = nbWorkers + 2; + int initError; + DEBUGLOG(3, "ZSTDMT_createCCtx_advanced (nbWorkers = %u)", nbWorkers); + + if (nbWorkers < 1) return NULL; + nbWorkers = MIN(nbWorkers , ZSTDMT_NBWORKERS_MAX); + if ((cMem.customAlloc!=NULL) ^ (cMem.customFree!=NULL)) + /* invalid custom allocator */ + return NULL; + + mtctx = (ZSTDMT_CCtx*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtx), cMem); + if (!mtctx) return NULL; + ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers); + mtctx->cMem = cMem; + mtctx->allJobsCompleted = 1; + if (pool != NULL) { + mtctx->factory = pool; + mtctx->providedFactory = 1; + } + else { + mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem); + mtctx->providedFactory = 0; + } + mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem); + assert(nbJobs > 0); assert((nbJobs & (nbJobs - 1)) == 0); /* ensure nbJobs is a power of 2 */ + mtctx->jobIDMask = nbJobs - 1; + mtctx->bufPool = ZSTDMT_createBufferPool(BUF_POOL_MAX_NB_BUFFERS(nbWorkers), cMem); + mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem); + mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem); + initError = ZSTDMT_serialState_init(&mtctx->serial); + mtctx->roundBuff = kNullRoundBuff; + if (!mtctx->factory | !mtctx->jobs | !mtctx->bufPool | !mtctx->cctxPool | !mtctx->seqPool | initError) { + ZSTDMT_freeCCtx(mtctx); + return NULL; + } + DEBUGLOG(3, "mt_cctx created, for %u threads", nbWorkers); + return mtctx; +} + +ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool) +{ +#ifdef ZSTD_MULTITHREAD + return ZSTDMT_createCCtx_advanced_internal(nbWorkers, cMem, pool); +#else + (void)nbWorkers; + (void)cMem; + (void)pool; + return NULL; +#endif +} + + +/* ZSTDMT_releaseAllJobResources() : + * note : ensure all workers are killed first ! */ +static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx) +{ + unsigned jobID; + DEBUGLOG(3, "ZSTDMT_releaseAllJobResources"); + for (jobID=0; jobID <= mtctx->jobIDMask; jobID++) { + /* Copy the mutex/cond out */ + ZSTD_pthread_mutex_t const mutex = mtctx->jobs[jobID].job_mutex; + ZSTD_pthread_cond_t const cond = mtctx->jobs[jobID].job_cond; + + DEBUGLOG(4, "job%02u: release dst address %08X", jobID, (U32)(size_t)mtctx->jobs[jobID].dstBuff.start); + ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff); + + /* Clear the job description, but keep the mutex/cond */ + ZSTD_memset(&mtctx->jobs[jobID], 0, sizeof(mtctx->jobs[jobID])); + mtctx->jobs[jobID].job_mutex = mutex; + mtctx->jobs[jobID].job_cond = cond; + } + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; + mtctx->allJobsCompleted = 1; +} + +static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx) +{ + DEBUGLOG(4, "ZSTDMT_waitForAllJobsCompleted"); + while (mtctx->doneJobID < mtctx->nextJobID) { + unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask; + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex); + while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) { + DEBUGLOG(4, "waiting for jobCompleted signal from job %u", mtctx->doneJobID); /* we want to block when waiting for data to flush */ + ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex); + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex); + mtctx->doneJobID++; + } +} + +size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx) +{ + if (mtctx==NULL) return 0; /* compatible with free on NULL */ + if (!mtctx->providedFactory) + POOL_free(mtctx->factory); /* stop and free worker threads */ + ZSTDMT_releaseAllJobResources(mtctx); /* release job resources into pools first */ + ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem); + ZSTDMT_freeBufferPool(mtctx->bufPool); + ZSTDMT_freeCCtxPool(mtctx->cctxPool); + ZSTDMT_freeSeqPool(mtctx->seqPool); + ZSTDMT_serialState_free(&mtctx->serial); + ZSTD_freeCDict(mtctx->cdictLocal); + if (mtctx->roundBuff.buffer) + ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem); + ZSTD_customFree(mtctx, mtctx->cMem); + return 0; +} + +size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx) +{ + if (mtctx == NULL) return 0; /* supports sizeof NULL */ + return sizeof(*mtctx) + + POOL_sizeof(mtctx->factory) + + ZSTDMT_sizeof_bufferPool(mtctx->bufPool) + + (mtctx->jobIDMask+1) * sizeof(ZSTDMT_jobDescription) + + ZSTDMT_sizeof_CCtxPool(mtctx->cctxPool) + + ZSTDMT_sizeof_seqPool(mtctx->seqPool) + + ZSTD_sizeof_CDict(mtctx->cdictLocal) + + mtctx->roundBuff.capacity; +} + + +/* ZSTDMT_resize() : + * @return : error code if fails, 0 on success */ +static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers) +{ + if (POOL_resize(mtctx->factory, nbWorkers)) return ERROR(memory_allocation); + FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbWorkers) , ""); + mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, BUF_POOL_MAX_NB_BUFFERS(nbWorkers)); + if (mtctx->bufPool == NULL) return ERROR(memory_allocation); + mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers); + if (mtctx->cctxPool == NULL) return ERROR(memory_allocation); + mtctx->seqPool = ZSTDMT_expandSeqPool(mtctx->seqPool, nbWorkers); + if (mtctx->seqPool == NULL) return ERROR(memory_allocation); + ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers); + return 0; +} + + +/*! ZSTDMT_updateCParams_whileCompressing() : + * Updates a selected set of compression parameters, remaining compatible with currently active frame. + * New parameters will be applied to next compression job. */ +void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams) +{ + U32 const saved_wlog = mtctx->params.cParams.windowLog; /* Do not modify windowLog while compressing */ + int const compressionLevel = cctxParams->compressionLevel; + DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)", + compressionLevel); + mtctx->params.compressionLevel = compressionLevel; + { ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + cParams.windowLog = saved_wlog; + mtctx->params.cParams = cParams; + } +} + +/* ZSTDMT_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads. + * Note : mutex will be acquired during statistics collection inside workers. */ +ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx) +{ + ZSTD_frameProgression fps; + DEBUGLOG(5, "ZSTDMT_getFrameProgression"); + fps.ingested = mtctx->consumed + mtctx->inBuff.filled; + fps.consumed = mtctx->consumed; + fps.produced = fps.flushed = mtctx->produced; + fps.currentJobID = mtctx->nextJobID; + fps.nbActiveWorkers = 0; + { unsigned jobNb; + unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1); + DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)", + mtctx->doneJobID, lastJobNb, mtctx->jobReady); + for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) { + unsigned const wJobID = jobNb & mtctx->jobIDMask; + ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID]; + ZSTD_pthread_mutex_lock(&jobPtr->job_mutex); + { size_t const cResult = jobPtr->cSize; + size_t const produced = ZSTD_isError(cResult) ? 0 : cResult; + size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed; + assert(flushed <= produced); + fps.ingested += jobPtr->src.size; + fps.consumed += jobPtr->consumed; + fps.produced += produced; + fps.flushed += flushed; + fps.nbActiveWorkers += (jobPtr->consumed < jobPtr->src.size); + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + } + } + return fps; +} + + +size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx) +{ + size_t toFlush; + unsigned const jobID = mtctx->doneJobID; + assert(jobID <= mtctx->nextJobID); + if (jobID == mtctx->nextJobID) return 0; /* no active job => nothing to flush */ + + /* look into oldest non-fully-flushed job */ + { unsigned const wJobID = jobID & mtctx->jobIDMask; + ZSTDMT_jobDescription* const jobPtr = &mtctx->jobs[wJobID]; + ZSTD_pthread_mutex_lock(&jobPtr->job_mutex); + { size_t const cResult = jobPtr->cSize; + size_t const produced = ZSTD_isError(cResult) ? 0 : cResult; + size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed; + assert(flushed <= produced); + assert(jobPtr->consumed <= jobPtr->src.size); + toFlush = produced - flushed; + /* if toFlush==0, nothing is available to flush. + * However, jobID is expected to still be active: + * if jobID was already completed and fully flushed, + * ZSTDMT_flushProduced() should have already moved onto next job. + * Therefore, some input has not yet been consumed. */ + if (toFlush==0) { + assert(jobPtr->consumed < jobPtr->src.size); + } + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + } + + return toFlush; +} + + +/* ------------------------------------------ */ +/* ===== Multi-threaded compression ===== */ +/* ------------------------------------------ */ + +static unsigned ZSTDMT_computeTargetJobLog(const ZSTD_CCtx_params* params) +{ + unsigned jobLog; + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* In Long Range Mode, the windowLog is typically oversized. + * In which case, it's preferable to determine the jobSize + * based on cycleLog instead. */ + jobLog = MAX(21, ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy) + 3); + } else { + jobLog = MAX(20, params->cParams.windowLog + 2); + } + return MIN(jobLog, (unsigned)ZSTDMT_JOBLOG_MAX); +} + +static int ZSTDMT_overlapLog_default(ZSTD_strategy strat) +{ + switch(strat) + { + case ZSTD_btultra2: + return 9; + case ZSTD_btultra: + case ZSTD_btopt: + return 8; + case ZSTD_btlazy2: + case ZSTD_lazy2: + return 7; + case ZSTD_lazy: + case ZSTD_greedy: + case ZSTD_dfast: + case ZSTD_fast: + default:; + } + return 6; +} + +static int ZSTDMT_overlapLog(int ovlog, ZSTD_strategy strat) +{ + assert(0 <= ovlog && ovlog <= 9); + if (ovlog == 0) return ZSTDMT_overlapLog_default(strat); + return ovlog; +} + +static size_t ZSTDMT_computeOverlapSize(const ZSTD_CCtx_params* params) +{ + int const overlapRLog = 9 - ZSTDMT_overlapLog(params->overlapLog, params->cParams.strategy); + int ovLog = (overlapRLog >= 8) ? 0 : (params->cParams.windowLog - overlapRLog); + assert(0 <= overlapRLog && overlapRLog <= 8); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* In Long Range Mode, the windowLog is typically oversized. + * In which case, it's preferable to determine the jobSize + * based on chainLog instead. + * Then, ovLog becomes a fraction of the jobSize, rather than windowSize */ + ovLog = MIN(params->cParams.windowLog, ZSTDMT_computeTargetJobLog(params) - 2) + - overlapRLog; + } + assert(0 <= ovLog && ovLog <= ZSTD_WINDOWLOG_MAX); + DEBUGLOG(4, "overlapLog : %i", params->overlapLog); + DEBUGLOG(4, "overlap size : %i", 1 << ovLog); + return (ovLog==0) ? 0 : (size_t)1 << ovLog; +} + +/* ====================================== */ +/* ======= Streaming API ======= */ +/* ====================================== */ + +size_t ZSTDMT_initCStream_internal( + ZSTDMT_CCtx* mtctx, + const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, ZSTD_CCtx_params params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u)", + (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx); + + /* params supposed partially fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + + /* init */ + if (params.nbWorkers != mtctx->params.nbWorkers) + FORWARD_IF_ERROR( ZSTDMT_resize(mtctx, (unsigned)params.nbWorkers) , ""); + + if (params.jobSize != 0 && params.jobSize < ZSTDMT_JOBSIZE_MIN) params.jobSize = ZSTDMT_JOBSIZE_MIN; + if (params.jobSize > (size_t)ZSTDMT_JOBSIZE_MAX) params.jobSize = (size_t)ZSTDMT_JOBSIZE_MAX; + + if (mtctx->allJobsCompleted == 0) { /* previous compression not correctly finished */ + ZSTDMT_waitForAllJobsCompleted(mtctx); + ZSTDMT_releaseAllJobResources(mtctx); + mtctx->allJobsCompleted = 1; + } + + mtctx->params = params; + mtctx->frameContentSize = pledgedSrcSize; + ZSTD_freeCDict(mtctx->cdictLocal); + if (dict) { + mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byCopy, dictContentType, /* note : a loadPrefix becomes an internal CDict */ + params.cParams, mtctx->cMem); + mtctx->cdict = mtctx->cdictLocal; + if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation); + } else { + mtctx->cdictLocal = NULL; + mtctx->cdict = cdict; + } + + mtctx->targetPrefixSize = ZSTDMT_computeOverlapSize(¶ms); + DEBUGLOG(4, "overlapLog=%i => %u KB", params.overlapLog, (U32)(mtctx->targetPrefixSize>>10)); + mtctx->targetSectionSize = params.jobSize; + if (mtctx->targetSectionSize == 0) { + mtctx->targetSectionSize = 1ULL << ZSTDMT_computeTargetJobLog(¶ms); + } + assert(mtctx->targetSectionSize <= (size_t)ZSTDMT_JOBSIZE_MAX); + + if (params.rsyncable) { + /* Aim for the targetsectionSize as the average job size. */ + U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10); + U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10); + /* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our + * expected job size is at least 4x larger. */ + assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2); + DEBUGLOG(4, "rsyncLog = %u", rsyncBits); + mtctx->rsync.hash = 0; + mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1; + mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH); + } + if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize; /* job size must be >= overlap size */ + DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), (U32)params.jobSize); + DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10)); + ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize)); + { + /* If ldm is enabled we need windowSize space. */ + size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ? (1U << mtctx->params.cParams.windowLog) : 0; + /* Two buffers of slack, plus extra space for the overlap + * This is the minimum slack that LDM works with. One extra because + * flush might waste up to targetSectionSize-1 bytes. Another extra + * for the overlap (if > 0), then one to fill which doesn't overlap + * with the LDM window. + */ + size_t const nbSlackBuffers = 2 + (mtctx->targetPrefixSize > 0); + size_t const slackSize = mtctx->targetSectionSize * nbSlackBuffers; + /* Compute the total size, and always have enough slack */ + size_t const nbWorkers = MAX(mtctx->params.nbWorkers, 1); + size_t const sectionsSize = mtctx->targetSectionSize * nbWorkers; + size_t const capacity = MAX(windowSize, sectionsSize) + slackSize; + if (mtctx->roundBuff.capacity < capacity) { + if (mtctx->roundBuff.buffer) + ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem); + mtctx->roundBuff.buffer = (BYTE*)ZSTD_customMalloc(capacity, mtctx->cMem); + if (mtctx->roundBuff.buffer == NULL) { + mtctx->roundBuff.capacity = 0; + return ERROR(memory_allocation); + } + mtctx->roundBuff.capacity = capacity; + } + } + DEBUGLOG(4, "roundBuff capacity : %u KB", (U32)(mtctx->roundBuff.capacity>>10)); + mtctx->roundBuff.pos = 0; + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; + mtctx->inBuff.prefix = kNullRange; + mtctx->doneJobID = 0; + mtctx->nextJobID = 0; + mtctx->frameEnded = 0; + mtctx->allJobsCompleted = 0; + mtctx->consumed = 0; + mtctx->produced = 0; + + /* update dictionary */ + ZSTD_freeCDict(mtctx->cdictLocal); + mtctx->cdictLocal = NULL; + mtctx->cdict = NULL; + if (dict) { + if (dictContentType == ZSTD_dct_rawContent) { + mtctx->inBuff.prefix.start = (const BYTE*)dict; + mtctx->inBuff.prefix.size = dictSize; + } else { + /* note : a loadPrefix becomes an internal CDict */ + mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byRef, dictContentType, + params.cParams, mtctx->cMem); + mtctx->cdict = mtctx->cdictLocal; + if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation); + } + } else { + mtctx->cdict = cdict; + } + + if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize, + dict, dictSize, dictContentType)) + return ERROR(memory_allocation); + + + return 0; +} + + +/* ZSTDMT_writeLastEmptyBlock() + * Write a single empty block with an end-of-frame to finish a frame. + * Job must be created from streaming variant. + * This function is always successful if expected conditions are fulfilled. + */ +static void ZSTDMT_writeLastEmptyBlock(ZSTDMT_jobDescription* job) +{ + assert(job->lastJob == 1); + assert(job->src.size == 0); /* last job is empty -> will be simplified into a last empty block */ + assert(job->firstJob == 0); /* cannot be first job, as it also needs to create frame header */ + assert(job->dstBuff.start == NULL); /* invoked from streaming variant only (otherwise, dstBuff might be user's output) */ + job->dstBuff = ZSTDMT_getBuffer(job->bufPool); + if (job->dstBuff.start == NULL) { + job->cSize = ERROR(memory_allocation); + return; + } + assert(job->dstBuff.capacity >= ZSTD_blockHeaderSize); /* no buffer should ever be that small */ + job->src = kNullRange; + job->cSize = ZSTD_writeLastEmptyBlock(job->dstBuff.start, job->dstBuff.capacity); + assert(!ZSTD_isError(job->cSize)); + assert(job->consumed == 0); +} + +static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZSTD_EndDirective endOp) +{ + unsigned const jobID = mtctx->nextJobID & mtctx->jobIDMask; + int const endFrame = (endOp == ZSTD_e_end); + + if (mtctx->nextJobID > mtctx->doneJobID + mtctx->jobIDMask) { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: will not create new job : table is full"); + assert((mtctx->nextJobID & mtctx->jobIDMask) == (mtctx->doneJobID & mtctx->jobIDMask)); + return 0; + } + + if (!mtctx->jobReady) { + BYTE const* src = (BYTE const*)mtctx->inBuff.buffer.start; + DEBUGLOG(5, "ZSTDMT_createCompressionJob: preparing job %u to compress %u bytes with %u preload ", + mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefix.size); + mtctx->jobs[jobID].src.start = src; + mtctx->jobs[jobID].src.size = srcSize; + assert(mtctx->inBuff.filled >= srcSize); + mtctx->jobs[jobID].prefix = mtctx->inBuff.prefix; + mtctx->jobs[jobID].consumed = 0; + mtctx->jobs[jobID].cSize = 0; + mtctx->jobs[jobID].params = mtctx->params; + mtctx->jobs[jobID].cdict = mtctx->nextJobID==0 ? mtctx->cdict : NULL; + mtctx->jobs[jobID].fullFrameSize = mtctx->frameContentSize; + mtctx->jobs[jobID].dstBuff = g_nullBuffer; + mtctx->jobs[jobID].cctxPool = mtctx->cctxPool; + mtctx->jobs[jobID].bufPool = mtctx->bufPool; + mtctx->jobs[jobID].seqPool = mtctx->seqPool; + mtctx->jobs[jobID].serial = &mtctx->serial; + mtctx->jobs[jobID].jobID = mtctx->nextJobID; + mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0); + mtctx->jobs[jobID].lastJob = endFrame; + mtctx->jobs[jobID].frameChecksumNeeded = mtctx->params.fParams.checksumFlag && endFrame && (mtctx->nextJobID>0); + mtctx->jobs[jobID].dstFlushed = 0; + + /* Update the round buffer pos and clear the input buffer to be reset */ + mtctx->roundBuff.pos += srcSize; + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; + /* Set the prefix for next job */ + if (!endFrame) { + size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize); + mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize; + mtctx->inBuff.prefix.size = newPrefixSize; + } else { /* endFrame==1 => no need for another input buffer */ + mtctx->inBuff.prefix = kNullRange; + mtctx->frameEnded = endFrame; + if (mtctx->nextJobID == 0) { + /* single job exception : checksum is already calculated directly within worker thread */ + mtctx->params.fParams.checksumFlag = 0; + } } + + if ( (srcSize == 0) + && (mtctx->nextJobID>0)/*single job must also write frame header*/ ) { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: creating a last empty block to end frame"); + assert(endOp == ZSTD_e_end); /* only possible case : need to end the frame with an empty last block */ + ZSTDMT_writeLastEmptyBlock(mtctx->jobs + jobID); + mtctx->nextJobID++; + return 0; + } + } + + DEBUGLOG(5, "ZSTDMT_createCompressionJob: posting job %u : %u bytes (end:%u, jobNb == %u (mod:%u))", + mtctx->nextJobID, + (U32)mtctx->jobs[jobID].src.size, + mtctx->jobs[jobID].lastJob, + mtctx->nextJobID, + jobID); + if (POOL_tryAdd(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[jobID])) { + mtctx->nextJobID++; + mtctx->jobReady = 0; + } else { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: no worker available for job %u", mtctx->nextJobID); + mtctx->jobReady = 1; + } + return 0; +} + + +/*! ZSTDMT_flushProduced() : + * flush whatever data has been produced but not yet flushed in current job. + * move to next job if current one is fully flushed. + * `output` : `pos` will be updated with amount of data flushed . + * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush . + * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */ +static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, unsigned blockToFlush, ZSTD_EndDirective end) +{ + unsigned const wJobID = mtctx->doneJobID & mtctx->jobIDMask; + DEBUGLOG(5, "ZSTDMT_flushProduced (blocking:%u , job %u <= %u)", + blockToFlush, mtctx->doneJobID, mtctx->nextJobID); + assert(output->size >= output->pos); + + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex); + if ( blockToFlush + && (mtctx->doneJobID < mtctx->nextJobID) ) { + assert(mtctx->jobs[wJobID].dstFlushed <= mtctx->jobs[wJobID].cSize); + while (mtctx->jobs[wJobID].dstFlushed == mtctx->jobs[wJobID].cSize) { /* nothing to flush */ + if (mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].src.size) { + DEBUGLOG(5, "job %u is completely consumed (%u == %u) => don't wait for cond, there will be none", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].src.size); + break; + } + DEBUGLOG(5, "waiting for something to flush from job %u (currently flushed: %u bytes)", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed); + ZSTD_pthread_cond_wait(&mtctx->jobs[wJobID].job_cond, &mtctx->jobs[wJobID].job_mutex); /* block when nothing to flush but some to come */ + } } + + /* try to flush something */ + { size_t cSize = mtctx->jobs[wJobID].cSize; /* shared */ + size_t const srcConsumed = mtctx->jobs[wJobID].consumed; /* shared */ + size_t const srcSize = mtctx->jobs[wJobID].src.size; /* read-only, could be done after mutex lock, but no-declaration-after-statement */ + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + if (ZSTD_isError(cSize)) { + DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s", + mtctx->doneJobID, ZSTD_getErrorName(cSize)); + ZSTDMT_waitForAllJobsCompleted(mtctx); + ZSTDMT_releaseAllJobResources(mtctx); + return cSize; + } + /* add frame checksum if necessary (can only happen once) */ + assert(srcConsumed <= srcSize); + if ( (srcConsumed == srcSize) /* job completed -> worker no longer active */ + && mtctx->jobs[wJobID].frameChecksumNeeded ) { + U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState); + DEBUGLOG(4, "ZSTDMT_flushProduced: writing checksum : %08X \n", checksum); + MEM_writeLE32((char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].cSize, checksum); + cSize += 4; + mtctx->jobs[wJobID].cSize += 4; /* can write this shared value, as worker is no longer active */ + mtctx->jobs[wJobID].frameChecksumNeeded = 0; + } + + if (cSize > 0) { /* compression is ongoing or completed */ + size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos); + DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)", + (U32)toFlush, mtctx->doneJobID, (U32)srcConsumed, (U32)srcSize, (U32)cSize); + assert(mtctx->doneJobID < mtctx->nextJobID); + assert(cSize >= mtctx->jobs[wJobID].dstFlushed); + assert(mtctx->jobs[wJobID].dstBuff.start != NULL); + if (toFlush > 0) { + ZSTD_memcpy((char*)output->dst + output->pos, + (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed, + toFlush); + } + output->pos += toFlush; + mtctx->jobs[wJobID].dstFlushed += toFlush; /* can write : this value is only used by mtctx */ + + if ( (srcConsumed == srcSize) /* job is completed */ + && (mtctx->jobs[wJobID].dstFlushed == cSize) ) { /* output buffer fully flushed => free this job position */ + DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed); + ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff); + DEBUGLOG(5, "dstBuffer released"); + mtctx->jobs[wJobID].dstBuff = g_nullBuffer; + mtctx->jobs[wJobID].cSize = 0; /* ensure this job slot is considered "not started" in future check */ + mtctx->consumed += srcSize; + mtctx->produced += cSize; + mtctx->doneJobID++; + } } + + /* return value : how many bytes left in buffer ; fake it to 1 when unknown but >0 */ + if (cSize > mtctx->jobs[wJobID].dstFlushed) return (cSize - mtctx->jobs[wJobID].dstFlushed); + if (srcSize > srcConsumed) return 1; /* current job not completely compressed */ + } + if (mtctx->doneJobID < mtctx->nextJobID) return 1; /* some more jobs ongoing */ + if (mtctx->jobReady) return 1; /* one job is ready to push, just not yet in the list */ + if (mtctx->inBuff.filled > 0) return 1; /* input is not empty, and still needs to be converted into a job */ + mtctx->allJobsCompleted = mtctx->frameEnded; /* all jobs are entirely flushed => if this one is last one, frame is completed */ + if (end == ZSTD_e_end) return !mtctx->frameEnded; /* for ZSTD_e_end, question becomes : is frame completed ? instead of : are internal buffers fully flushed ? */ + return 0; /* internal buffers fully flushed */ +} + +/** + * Returns the range of data used by the earliest job that is not yet complete. + * If the data of the first job is broken up into two segments, we cover both + * sections. + */ +static Range ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx) +{ + unsigned const firstJobID = mtctx->doneJobID; + unsigned const lastJobID = mtctx->nextJobID; + unsigned jobID; + + /* no need to check during first round */ + size_t roundBuffCapacity = mtctx->roundBuff.capacity; + size_t nbJobs1stRoundMin = roundBuffCapacity / mtctx->targetSectionSize; + if (lastJobID < nbJobs1stRoundMin) return kNullRange; + + for (jobID = firstJobID; jobID < lastJobID; ++jobID) { + unsigned const wJobID = jobID & mtctx->jobIDMask; + size_t consumed; + + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex); + consumed = mtctx->jobs[wJobID].consumed; + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + + if (consumed < mtctx->jobs[wJobID].src.size) { + Range range = mtctx->jobs[wJobID].prefix; + if (range.size == 0) { + /* Empty prefix */ + range = mtctx->jobs[wJobID].src; + } + /* Job source in multiple segments not supported yet */ + assert(range.start <= mtctx->jobs[wJobID].src.start); + return range; + } + } + return kNullRange; +} + +/** + * Returns non-zero iff buffer and range overlap. + */ +static int ZSTDMT_isOverlapped(Buffer buffer, Range range) +{ + BYTE const* const bufferStart = (BYTE const*)buffer.start; + BYTE const* const rangeStart = (BYTE const*)range.start; + + if (rangeStart == NULL || bufferStart == NULL) + return 0; + + { + BYTE const* const bufferEnd = bufferStart + buffer.capacity; + BYTE const* const rangeEnd = rangeStart + range.size; + + /* Empty ranges cannot overlap */ + if (bufferStart == bufferEnd || rangeStart == rangeEnd) + return 0; + + return bufferStart < rangeEnd && rangeStart < bufferEnd; + } +} + +static int ZSTDMT_doesOverlapWindow(Buffer buffer, ZSTD_window_t window) +{ + Range extDict; + Range prefix; + + DEBUGLOG(5, "ZSTDMT_doesOverlapWindow"); + extDict.start = window.dictBase + window.lowLimit; + extDict.size = window.dictLimit - window.lowLimit; + + prefix.start = window.base + window.dictLimit; + prefix.size = window.nextSrc - (window.base + window.dictLimit); + DEBUGLOG(5, "extDict [0x%zx, 0x%zx)", + (size_t)extDict.start, + (size_t)extDict.start + extDict.size); + DEBUGLOG(5, "prefix [0x%zx, 0x%zx)", + (size_t)prefix.start, + (size_t)prefix.start + prefix.size); + + return ZSTDMT_isOverlapped(buffer, extDict) + || ZSTDMT_isOverlapped(buffer, prefix); +} + +static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, Buffer buffer) +{ + if (mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable) { + ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex; + DEBUGLOG(5, "ZSTDMT_waitForLdmComplete"); + DEBUGLOG(5, "source [0x%zx, 0x%zx)", + (size_t)buffer.start, + (size_t)buffer.start + buffer.capacity); + ZSTD_PTHREAD_MUTEX_LOCK(mutex); + while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) { + DEBUGLOG(5, "Waiting for LDM to finish..."); + ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex); + } + DEBUGLOG(6, "Done waiting for LDM to finish"); + ZSTD_pthread_mutex_unlock(mutex); + } +} + +/** + * Attempts to set the inBuff to the next section to fill. + * If any part of the new section is still in use we give up. + * Returns non-zero if the buffer is filled. + */ +static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx) +{ + Range const inUse = ZSTDMT_getInputDataInUse(mtctx); + size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos; + size_t const spaceNeeded = mtctx->targetSectionSize; + Buffer buffer; + + DEBUGLOG(5, "ZSTDMT_tryGetInputRange"); + assert(mtctx->inBuff.buffer.start == NULL); + assert(mtctx->roundBuff.capacity >= spaceNeeded); + + if (spaceLeft < spaceNeeded) { + /* ZSTD_invalidateRepCodes() doesn't work for extDict variants. + * Simply copy the prefix to the beginning in that case. + */ + BYTE* const start = (BYTE*)mtctx->roundBuff.buffer; + size_t const prefixSize = mtctx->inBuff.prefix.size; + + buffer.start = start; + buffer.capacity = prefixSize; + if (ZSTDMT_isOverlapped(buffer, inUse)) { + DEBUGLOG(5, "Waiting for buffer..."); + return 0; + } + ZSTDMT_waitForLdmComplete(mtctx, buffer); + ZSTD_memmove(start, mtctx->inBuff.prefix.start, prefixSize); + mtctx->inBuff.prefix.start = start; + mtctx->roundBuff.pos = prefixSize; + } + buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos; + buffer.capacity = spaceNeeded; + + if (ZSTDMT_isOverlapped(buffer, inUse)) { + DEBUGLOG(5, "Waiting for buffer..."); + return 0; + } + assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix)); + + ZSTDMT_waitForLdmComplete(mtctx, buffer); + + DEBUGLOG(5, "Using prefix range [%zx, %zx)", + (size_t)mtctx->inBuff.prefix.start, + (size_t)mtctx->inBuff.prefix.start + mtctx->inBuff.prefix.size); + DEBUGLOG(5, "Using source range [%zx, %zx)", + (size_t)buffer.start, + (size_t)buffer.start + buffer.capacity); + + + mtctx->inBuff.buffer = buffer; + mtctx->inBuff.filled = 0; + assert(mtctx->roundBuff.pos + buffer.capacity <= mtctx->roundBuff.capacity); + return 1; +} + +typedef struct { + size_t toLoad; /* The number of bytes to load from the input. */ + int flush; /* Boolean declaring if we must flush because we found a synchronization point. */ +} SyncPoint; + +/** + * Searches through the input for a synchronization point. If one is found, we + * will instruct the caller to flush, and return the number of bytes to load. + * Otherwise, we will load as many bytes as possible and instruct the caller + * to continue as normal. + */ +static SyncPoint +findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input) +{ + BYTE const* const istart = (BYTE const*)input.src + input.pos; + U64 const primePower = mtctx->rsync.primePower; + U64 const hitMask = mtctx->rsync.hitMask; + + SyncPoint syncPoint; + U64 hash; + BYTE const* prev; + size_t pos; + + syncPoint.toLoad = MIN(input.size - input.pos, mtctx->targetSectionSize - mtctx->inBuff.filled); + syncPoint.flush = 0; + if (!mtctx->params.rsyncable) + /* Rsync is disabled. */ + return syncPoint; + if (mtctx->inBuff.filled + input.size - input.pos < RSYNC_MIN_BLOCK_SIZE) + /* We don't emit synchronization points if it would produce too small blocks. + * We don't have enough input to find a synchronization point, so don't look. + */ + return syncPoint; + if (mtctx->inBuff.filled + syncPoint.toLoad < RSYNC_LENGTH) + /* Not enough to compute the hash. + * We will miss any synchronization points in this RSYNC_LENGTH byte + * window. However, since it depends only in the internal buffers, if the + * state is already synchronized, we will remain synchronized. + * Additionally, the probability that we miss a synchronization point is + * low: RSYNC_LENGTH / targetSectionSize. + */ + return syncPoint; + /* Initialize the loop variables. */ + if (mtctx->inBuff.filled < RSYNC_MIN_BLOCK_SIZE) { + /* We don't need to scan the first RSYNC_MIN_BLOCK_SIZE positions + * because they can't possibly be a sync point. So we can start + * part way through the input buffer. + */ + pos = RSYNC_MIN_BLOCK_SIZE - mtctx->inBuff.filled; + if (pos >= RSYNC_LENGTH) { + prev = istart + pos - RSYNC_LENGTH; + hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH); + } else { + assert(mtctx->inBuff.filled >= RSYNC_LENGTH); + prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH; + hash = ZSTD_rollingHash_compute(prev + pos, (RSYNC_LENGTH - pos)); + hash = ZSTD_rollingHash_append(hash, istart, pos); + } + } else { + /* We have enough bytes buffered to initialize the hash, + * and have processed enough bytes to find a sync point. + * Start scanning at the beginning of the input. + */ + assert(mtctx->inBuff.filled >= RSYNC_MIN_BLOCK_SIZE); + assert(RSYNC_MIN_BLOCK_SIZE >= RSYNC_LENGTH); + pos = 0; + prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH; + hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH); + if ((hash & hitMask) == hitMask) { + /* We're already at a sync point so don't load any more until + * we're able to flush this sync point. + * This likely happened because the job table was full so we + * couldn't add our job. + */ + syncPoint.toLoad = 0; + syncPoint.flush = 1; + return syncPoint; + } + } + /* Starting with the hash of the previous RSYNC_LENGTH bytes, roll + * through the input. If we hit a synchronization point, then cut the + * job off, and tell the compressor to flush the job. Otherwise, load + * all the bytes and continue as normal. + * If we go too long without a synchronization point (targetSectionSize) + * then a block will be emitted anyways, but this is okay, since if we + * are already synchronized we will remain synchronized. + */ + assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); + for (; pos < syncPoint.toLoad; ++pos) { + BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH]; + /* This assert is very expensive, and Debian compiles with asserts enabled. + * So disable it for now. We can get similar coverage by checking it at the + * beginning & end of the loop. + * assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); + */ + hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower); + assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE); + if ((hash & hitMask) == hitMask) { + syncPoint.toLoad = pos + 1; + syncPoint.flush = 1; + ++pos; /* for assert */ + break; + } + } + assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); + return syncPoint; +} + +size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx) +{ + size_t hintInSize = mtctx->targetSectionSize - mtctx->inBuff.filled; + if (hintInSize==0) hintInSize = mtctx->targetSectionSize; + return hintInSize; +} + +/** ZSTDMT_compressStream_generic() : + * internal use only - exposed to be invoked from zstd_compress.c + * assumption : output and input are valid (pos <= size) + * @return : minimum amount of data remaining to flush, 0 if none */ +size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + unsigned forwardInputProgress = 0; + DEBUGLOG(5, "ZSTDMT_compressStream_generic (endOp=%u, srcSize=%u)", + (U32)endOp, (U32)(input->size - input->pos)); + assert(output->pos <= output->size); + assert(input->pos <= input->size); + + if ((mtctx->frameEnded) && (endOp==ZSTD_e_continue)) { + /* current frame being ended. Only flush/end are allowed */ + return ERROR(stage_wrong); + } + + /* fill input buffer */ + if ( (!mtctx->jobReady) + && (input->size > input->pos) ) { /* support NULL input */ + if (mtctx->inBuff.buffer.start == NULL) { + assert(mtctx->inBuff.filled == 0); /* Can't fill an empty buffer */ + if (!ZSTDMT_tryGetInputRange(mtctx)) { + /* It is only possible for this operation to fail if there are + * still compression jobs ongoing. + */ + DEBUGLOG(5, "ZSTDMT_tryGetInputRange failed"); + assert(mtctx->doneJobID != mtctx->nextJobID); + } else + DEBUGLOG(5, "ZSTDMT_tryGetInputRange completed successfully : mtctx->inBuff.buffer.start = %p", mtctx->inBuff.buffer.start); + } + if (mtctx->inBuff.buffer.start != NULL) { + SyncPoint const syncPoint = findSynchronizationPoint(mtctx, *input); + if (syncPoint.flush && endOp == ZSTD_e_continue) { + endOp = ZSTD_e_flush; + } + assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize); + DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u", + (U32)syncPoint.toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize); + ZSTD_memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, syncPoint.toLoad); + input->pos += syncPoint.toLoad; + mtctx->inBuff.filled += syncPoint.toLoad; + forwardInputProgress = syncPoint.toLoad>0; + } + } + if ((input->pos < input->size) && (endOp == ZSTD_e_end)) { + /* Can't end yet because the input is not fully consumed. + * We are in one of these cases: + * - mtctx->inBuff is NULL & empty: we couldn't get an input buffer so don't create a new job. + * - We filled the input buffer: flush this job but don't end the frame. + * - We hit a synchronization point: flush this job but don't end the frame. + */ + assert(mtctx->inBuff.filled == 0 || mtctx->inBuff.filled == mtctx->targetSectionSize || mtctx->params.rsyncable); + endOp = ZSTD_e_flush; + } + + if ( (mtctx->jobReady) + || (mtctx->inBuff.filled >= mtctx->targetSectionSize) /* filled enough : let's compress */ + || ((endOp != ZSTD_e_continue) && (mtctx->inBuff.filled > 0)) /* something to flush : let's go */ + || ((endOp == ZSTD_e_end) && (!mtctx->frameEnded)) ) { /* must finish the frame with a zero-size block */ + size_t const jobSize = mtctx->inBuff.filled; + assert(mtctx->inBuff.filled <= mtctx->targetSectionSize); + FORWARD_IF_ERROR( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) , ""); + } + + /* check for potential compressed data ready to be flushed */ + { size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */ + if (input->pos < input->size) return MAX(remainingToFlush, 1); /* input not consumed : do not end flush yet */ + DEBUGLOG(5, "end of ZSTDMT_compressStream_generic: remainingToFlush = %u", (U32)remainingToFlush); + return remainingToFlush; + } +} diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c new file mode 100644 index 0000000..c7e3426 --- /dev/null +++ b/lib/decompress/huf_decompress.c @@ -0,0 +1,1945 @@ +/* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* ************************************************************** +* Dependencies +****************************************************************/ +#include /* size_t */ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */ +#include "../common/compiler.h" +#include "../common/bitstream.h" /* BIT_* */ +#include "../common/fse.h" /* to compress headers */ +#include "../common/huf.h" +#include "../common/error_private.h" +#include "../common/zstd_internal.h" +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + +/* ************************************************************** +* Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11 + +/* ************************************************************** +* Macros +****************************************************************/ + +#ifdef HUF_DISABLE_FAST_DECODE +# define HUF_ENABLE_FAST_DECODE 0 +#else +# define HUF_ENABLE_FAST_DECODE 1 +#endif + +/* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. + */ +#if defined(HUF_FORCE_DECOMPRESS_X1) && \ + defined(HUF_FORCE_DECOMPRESS_X2) +#error "Cannot force the use of the X1 and X2 decoders at the same time!" +#endif + +/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is + * supported at runtime, so we can add the BMI2 target attribute. + * When it is disabled, we will still get BMI2 if it is enabled statically. + */ +#if DYNAMIC_BMI2 +# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +#else +# define HUF_FAST_BMI2_ATTRS +#endif + +#ifdef __cplusplus +# define HUF_EXTERN_C extern "C" +#else +# define HUF_EXTERN_C +#endif +#define HUF_ASM_DECL HUF_EXTERN_C + +#if DYNAMIC_BMI2 +# define HUF_NEED_BMI2_FUNCTION 1 +#else +# define HUF_NEED_BMI2_FUNCTION 0 +#endif + +/* ************************************************************** +* Error Management +****************************************************************/ +#define HUF_isError ERR_isError + + +/* ************************************************************** +* Byte alignment for workSpace management +****************************************************************/ +#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1) +#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) + + +/* ************************************************************** +* BMI2 Variant Wrappers +****************************************************************/ +typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, + const void *cSrc, + size_t cSrcSize, + const HUF_DTable *DTable); + +#if DYNAMIC_BMI2 + +#define HUF_DGEN(fn) \ + \ + static size_t fn##_default( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ + if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#else + +#define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ + (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#endif + + +/*-***************************/ +/* generic DTableDesc */ +/*-***************************/ +typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc; + +static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) +{ + DTableDesc dtd; + ZSTD_memcpy(&dtd, table, sizeof(dtd)); + return dtd; +} + +static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; + size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); + assert(sizeof(size_t) == 8); + return value << bitsConsumed; +} + + +/** + * The input/output arguments to the Huffman fast decoding loop: + * + * ip [in/out] - The input pointers, must be updated to reflect what is consumed. + * op [in/out] - The output pointers, must be updated to reflect what is written. + * bits [in/out] - The bitstream containers, must be updated to reflect the current state. + * dt [in] - The decoding table. + * ilowest [in] - The beginning of the valid range of the input. Decoders may read + * down to this pointer. It may be below iend[0]. + * oend [in] - The end of the output stream. op[3] must not cross oend. + * iend [in] - The end of each input stream. ip[i] may cross iend[i], + * as long as it is above ilowest, but that indicates corruption. + */ +typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; + BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +} HUF_DecompressFastArgs; + +typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + +/** + * Initializes args for the fast decoding loop. + * @returns 1 on success + * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + + const BYTE* const istart = (const BYTE*)src; + + BYTE* const oend = (BYTE*)ZSTD_maybeNullPtrAdd(dst, (ptrdiff_t)dstSize); + + /* The fast decoding loop assumes 64-bit little-endian. + * This condition is false on x32. + */ + if (!MEM_isLittleEndian() || MEM_32bits()) + return 0; + + /* Avoid nullptr addition */ + if (dstSize == 0) + return 0; + assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) + return ERROR(corruption_detected); + + /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. + * If table log is not correct at this point, fallback to the old decoder. + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) + return 0; + + /* Read the jump table. */ + { + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = srcSize - (length1 + length2 + length3 + 6); + args->iend[0] = istart + 6; /* jumpTable */ + args->iend[1] = args->iend[0] + length1; + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + + /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + */ + if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) + return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ + args->ip[0] = args->iend[1] - sizeof(U64); + args->ip[1] = args->iend[2] - sizeof(U64); + args->ip[2] = args->iend[3] - sizeof(U64); + args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); + + /* op[] contains the output pointers. */ + args->op[0] = (BYTE*)dst; + args->op[1] = args->op[0] + (dstSize+3)/4; + args->op[2] = args->op[1] + (dstSize+3)/4; + args->op[3] = args->op[2] + (dstSize+3)/4; + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) + return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. + * It is shifted left as it is read, and zeros are + * shifted in. After the lowest valid bit a 1 is + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ + args->bits[0] = HUF_initFastDStream(args->ip[0]); + args->bits[1] = HUF_initFastDStream(args->ip[1]); + args->bits[2] = HUF_initFastDStream(args->ip[2]); + args->bits[3] = HUF_initFastDStream(args->ip[3]); + + /* The decoders must be sure to never read beyond ilowest. + * This is lower than iend[0], but allowing decoders to read + * down to ilowest can allow an extra iteration or two in the + * fast loop. + */ + args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + + return 1; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) +{ + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) + return ERROR(corruption_detected); + /* Validate that we haven't read beyond iend[]. + * Note that ip[] may be < iend[] because the MSB is + * the next bit to read, and we may have consumed 100% + * of the stream, so down to iend[i] - 8 is valid. + */ + if (args->ip[stream] < args->iend[stream] - 8) + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ + assert(sizeof(size_t) == 8); + bit->bitContainer = MEM_readLEST(args->ip[stream]); + bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); + bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; +} + +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define HUF_4X_FOR_EACH_STREAM(X) \ + do { \ + X(0); \ + X(1); \ + X(2); \ + X(3); \ + } while (0) + +/* Calls X(N, var) for each stream 0, 1, 2, 3. */ +#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ + do { \ + X(0, (var)); \ + X(1, (var)); \ + X(2, (var)); \ + X(3, (var)); \ + } while (0) + + +#ifndef HUF_FORCE_DECOMPRESS_X2 + +/*-***************************/ +/* single-symbol decoding */ +/*-***************************/ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ + +/** + * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at + * a time. + */ +static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { + D4 = (U64)((symbol << 8) + nbBits); + } else { + D4 = (U64)(symbol + (nbBits << 8)); + } + assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; +} + +/** + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) +{ + if (tableLog > targetTableLog) + return tableLog; + if (tableLog < targetTableLog) { + U32 const scale = targetTableLog - tableLog; + U32 s; + /* Increase the weight for all non-zero probability symbols by scale. */ + for (s = 0; s < nbSymbols; ++s) { + huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); + } + /* Update rankVal to reflect the new weights. + * All weights except 0 get moved to weight + scale. + * Weights [1, scale] are empty. + */ + for (s = targetTableLog; s > scale; --s) { + rankVal[s] = rankVal[s - scale]; + } + for (s = scale; s > 0; --s) { + rankVal[s] = 0; + } + } + return targetTableLog; +} + +typedef struct { + U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; + U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; + U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; + BYTE symbols[HUF_SYMBOLVALUE_MAX + 1]; + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; +} HUF_ReadDTableX1_Workspace; + +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) +{ + U32 tableLog = 0; + U32 nbSymbols = 0; + size_t iSize; + void* const dtPtr = DTable + 1; + HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; + HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace; + + DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp)); + if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge); + + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + + iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + + /* Table header */ + { DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog + 1; + U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); + tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); + if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ + dtd.tableType = 0; + dtd.tableLog = (BYTE)tableLog; + ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); + } + + /* Compute symbols and rankStart given rankVal: + * + * rankVal already contains the number of values of each weight. + * + * symbols contains the symbols ordered by weight. First are the rankVal[0] + * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on. + * symbols[0] is filled (but unused) to avoid a branch. + * + * rankStart contains the offset where each rank belongs in the DTable. + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ + { int n; + U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { + U32 const curr = nextRankStart; + nextRankStart += wksp->rankVal[n]; + wksp->rankStart[n] = curr; + } + for (n=0; n < nLimit; n += unroll) { + int u; + for (u=0; u < unroll; ++u) { + size_t const w = wksp->huffWeight[n+u]; + wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u); + } + } + for (; n < (int)nbSymbols; ++n) { + size_t const w = wksp->huffWeight[n]; + wksp->symbols[wksp->rankStart[w]++] = (BYTE)n; + } + } + + /* fill DTable + * We fill all entries of each weight in order. + * That way length is a constant for each iteration of the outer loop. + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ + { U32 w; + int symbol = wksp->rankVal[0]; + int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; + int uStart = rankStart; + BYTE const nbBits = (BYTE)(tableLog + 1 - w); + int s; + int u; + switch (length) { + case 1: + for (s=0; ssymbols[symbol + s]; + D.nbBits = nbBits; + dt[uStart] = D; + uStart += 1; + } + break; + case 2: + for (s=0; ssymbols[symbol + s]; + D.nbBits = nbBits; + dt[uStart+0] = D; + dt[uStart+1] = D; + uStart += 2; + } + break; + case 4: + for (s=0; ssymbols[symbol + s], nbBits); + MEM_write64(dt + uStart, D4); + uStart += 4; + } + break; + case 8: + for (s=0; ssymbols[symbol + s], nbBits); + MEM_write64(dt + uStart, D4); + MEM_write64(dt + uStart + 4, D4); + uStart += 8; + } + break; + default: + for (s=0; ssymbols[symbol + s], nbBits); + for (u=0; u < length; u += 16) { + MEM_write64(dt + uStart + u + 0, D4); + MEM_write64(dt + uStart + u + 4, D4); + MEM_write64(dt + uStart + u + 8, D4); + MEM_write64(dt + uStart + u + 12, D4); + } + assert(u == length); + uStart += length; + } + break; + } + symbol += symbolCount; + rankStart += symbolCount * length; + } + } + return iSize; +} + +FORCE_INLINE_TEMPLATE BYTE +HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ + BYTE const c = dt[val].byte; + BIT_skipBits(Dstream, dt[val].nbBits); + return c; +} + +#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ + do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ + } while (0) + +#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits()) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ + } while (0) + +HINT_INLINE size_t +HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 4 symbols at a time */ + if ((pEnd - p) > 3) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + } + } else { + BIT_reloadDStream(bitDPtr); + } + + /* [0-3] symbols remaining */ + if (MEM_32bits()) + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd)) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + + /* no more data to retrieve from bitstream, no need to reload */ + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + + return (size_t)(pEnd-pStart); +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X1_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BYTE* op = (BYTE*)dst; + BYTE* const oend = (BYTE*)ZSTD_maybeNullPtrAdd(op, (ptrdiff_t)dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog); + + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + return dstSize; +} + +/* HUF_decompress4X1_usingDTable_internal_body(): + * Conditions : + * @dstSize >= 6 + */ +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - 3; + const void* const dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + const size_t segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + U32 endSignal = 1; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + } + } + + /* check corruption */ + /* note : should not be necessary : op# advance in lock step, and we control op4. + * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 supposed already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +#endif + +static HUF_FAST_BMI2_ATTRS +void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) +{ + U64 bits[4]; + BYTE const* ip[4]; + BYTE* op[4]; + U16 const* const dtable = (U16 const*)args->dt; + BYTE* const oend = args->oend; + BYTE const* const ilowest = args->ilowest; + + /* Copy the arguments to local variables */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); + ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + assert(MEM_isLittleEndian()); + assert(!MEM_32bits()); + + for (;;) { + BYTE* olimit; + int stream; + + /* Assert loop preconditions */ +#ifndef NDEBUG + for (stream = 0; stream < 4; ++stream) { + assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); + assert(ip[stream] >= ilowest); + } +#endif + /* Compute olimit */ + { + /* Each iteration produces 5 output symbols per stream */ + size_t const oiters = (size_t)(oend - op[3]) / 5; + /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes + * per stream. + */ + size_t const iiters = (size_t)(ip[0] - ilowest) / 7; + /* We can safely run iters iterations before running bounds checks */ + size_t const iters = MIN(oiters, iiters); + size_t const symbols = iters * 5; + + /* We can simply check that op[3] < olimit, instead of checking all + * of our bounds, since we can't hit the other bounds until we've run + * iters iterations, which only happens when op[3] == olimit. + */ + olimit = op[3] + symbols; + + /* Exit fast decoding loop once we reach the end. */ + if (op[3] == olimit) + break; + + /* Exit the decoding loop if any input pointer has crossed the + * previous one. This indicates corruption, and a precondition + * to our loop is that ip[i] >= ip[0]. + */ + for (stream = 1; stream < 4; ++stream) { + if (ip[stream] < ip[stream - 1]) + goto _out; + } + } + +#ifndef NDEBUG + for (stream = 1; stream < 4; ++stream) { + assert(ip[stream] >= ip[stream - 1]); + } +#endif + +#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ + do { \ + int const index = (int)(bits[(_stream)] >> 53); \ + int const entry = (int)dtable[index]; \ + bits[(_stream)] <<= (entry & 0x3F); \ + op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ + } while (0) + +#define HUF_4X1_RELOAD_STREAM(_stream) \ + do { \ + int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + int const nbBits = ctz & 7; \ + int const nbBytes = ctz >> 3; \ + op[(_stream)] += 5; \ + ip[(_stream)] -= nbBytes; \ + bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ + bits[(_stream)] <<= nbBits; \ + } while (0) + + /* Manually unroll the loop because compilers don't consistently + * unroll the inner loops, which destroys performance. + */ + do { + /* Decode 5 symbols in each of the 4 streams */ + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); + + /* Reload each of the 4 the bitstreams */ + HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); + } while (op[3] < olimit); + +#undef HUF_4X1_DECODE_SYMBOL +#undef HUF_4X1_RELOAD_STREAM + } + +_out: + + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); + ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + +/** + * @returns @p dstSize on success (>= 6) + * 0 if the fallback implementation should be used + * An error if an error occurred + */ +static HUF_FAST_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable, + HUF_DecompressFastLoopFn loopFn) +{ + void const* dt = DTable + 1; + BYTE const* const ilowest = (BYTE const*)cSrc; + BYTE* const oend = (BYTE*)ZSTD_maybeNullPtrAdd(dst, (ptrdiff_t)dstSize); + HUF_DecompressFastArgs args; + { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); + if (ret == 0) + return 0; + } + + assert(args.ip[0] >= args.ilowest); + loopFn(&args); + + /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. + */ + assert(args.ip[0] >= ilowest); + assert(args.ip[0] >= ilowest); + assert(args.ip[1] >= ilowest); + assert(args.ip[2] >= ilowest); + assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); + + assert(ilowest == args.ilowest); + assert(ilowest + 6 == args.iend[0]); + (void)ilowest; + + /* finish bit streams one by one. */ + { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + /* Decompress and validate that we've produced exactly the expected length. */ + args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) return ERROR(corruption_detected); + } + } + + /* decoded size */ + assert(dstSize != 0); + return dstSize; +} + +HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int flags) +{ + HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; + HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; + +#if DYNAMIC_BMI2 + if (flags & HUF_flags_bmi2) { + fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; + } +# endif + } else { + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; + } +#endif + + if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { + size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); + if (ret != 0) + return ret; + } + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); +} + +static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int flags) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); +} + +#endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +#ifndef HUF_FORCE_DECOMPRESS_X1 + +/* *************************/ +/* double-symbols decoding */ +/* *************************/ + +typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ +typedef struct { BYTE symbol; } sortedSymbol_t; +typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; +typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; + +/** + * Constructs a HUF_DEltX2 in a U32. + */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + U32 seq; + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); + if (MEM_isLittleEndian()) { + seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); + return seq + (nbBits << 16) + ((U32)level << 24); + } else { + seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); + return (seq << 16) + (nbBits << 8) + (U32)level; + } +} + +/** + * Constructs a HUF_DEltX2. + */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + HUF_DEltX2 DElt; + U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); + ZSTD_memcpy(&DElt, &val, sizeof(val)); + return DElt; +} + +/** + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) +{ + U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + return (U64)DElt + ((U64)DElt << 32); +} + +/** + * Fills the DTable rank with all the symbols from [begin, end) that are each + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. + * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level == 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. + */ +static void HUF_fillDTableX2ForWeight( + HUF_DEltX2* DTableRank, + sortedSymbol_t const* begin, sortedSymbol_t const* end, + U32 nbBits, U32 tableLog, + U16 baseSeq, int const level) +{ + U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); + const sortedSymbol_t* ptr; + assert(level >= 1 && level <= 2); + switch (length) { + case 1: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + *DTableRank++ = DElt; + } + break; + case 2: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + DTableRank[0] = DElt; + DTableRank[1] = DElt; + DTableRank += 2; + } + break; + case 4: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + DTableRank += 4; + } + break; + case 8: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + DTableRank += 8; + } + break; + default: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + HUF_DEltX2* const DTableRankEnd = DTableRank + length; + for (; DTableRank != DTableRankEnd; DTableRank += 8) { + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + } + } + break; + } +} + +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, + const U32* rankVal, const int minWeight, const int maxWeight1, + const sortedSymbol_t* sortedSymbols, U32 const* rankStart, + U32 nbBitsBaseline, U16 baseSeq) +{ + /* Fill skipped values (all positions up to rankVal[minWeight]). + * These are positions only get a single symbol because the combined weight + * is too large. + */ + if (minWeight>1) { + U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); + U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); + int const skipSize = rankVal[minWeight]; + assert(length > 1); + assert((U32)skipSize < length); + switch (length) { + case 2: + assert(skipSize == 1); + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); + break; + case 4: + assert(skipSize <= 4); + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); + break; + default: + { + int i; + for (i = 0; i < skipSize; i += 8) { + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); + } + } + } + } + + /* Fill each of the second level symbols by weight. */ + { + int w; + for (w = minWeight; w < maxWeight1; ++w) { + int const begin = rankStart[w]; + int const end = rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + U32 const totalBits = nbBits + consumedBits; + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedSymbols + begin, sortedSymbols + end, + totalBits, targetLog, + baseSeq, /* level */ 2); + } + } +} + +static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, + const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) +{ + U32* const rankVal = rankValOrigin[0]; + const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ + const U32 minBits = nbBitsBaseline - maxWeight; + int w; + int const wEnd = (int)maxWeight + 1; + + /* Fill DTable in order of weight. */ + for (w = 1; w < wEnd; ++w) { + int const begin = (int)rankStart[w]; + int const end = (int)rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + + if (targetLog-nbBits >= minBits) { + /* Enough room for a second symbol. */ + int start = rankVal[w]; + U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); + int minWeight = nbBits + scaleLog; + int s; + if (minWeight < 1) minWeight = 1; + /* Fill the DTable for every symbol of weight w. + * These symbols get at least 1 second symbol. + */ + for (s = begin; s != end; ++s) { + HUF_fillDTableX2Level2( + DTable + start, targetLog, nbBits, + rankValOrigin[nbBits], minWeight, wEnd, + sortedList, rankStart, + nbBitsBaseline, sortedList[s].symbol); + start += length; + } + } else { + /* Only a single symbol. */ + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedList + begin, sortedList + end, + nbBits, targetLog, + /* baseSeq */ 0, /* level */ 1); + } + } +} + +typedef struct { + rankValCol_t rankVal[HUF_TABLELOG_MAX]; + U32 rankStats[HUF_TABLELOG_MAX + 1]; + U32 rankStart0[HUF_TABLELOG_MAX + 3]; + sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; + BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; + U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +} HUF_ReadDTableX2_Workspace; + +size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, int flags) +{ + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 maxTableLog = dtd.maxTableLog; + size_t iSize; + void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ + HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; + U32 *rankStart; + + HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace; + + if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC); + + rankStart = wksp->rankStart0 + 1; + ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats)); + ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0)); + + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ + + iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ + if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG; + + /* find maxWeight */ + for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ + + /* Get start index of each weight */ + { U32 w, nextRankStart = 0; + for (w=1; wrankStats[w]; + rankStart[w] = curr; + } + rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ + rankStart[maxW+1] = nextRankStart; + } + + /* sort symbols by weight */ + { U32 s; + for (s=0; sweightList[s]; + U32 const r = rankStart[w]++; + wksp->sortedSymbol[r].symbol = (BYTE)s; + } + rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ + } + + /* Build rankVal */ + { U32* const rankVal0 = wksp->rankVal[0]; + { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */ + U32 nextRankVal = 0; + U32 w; + for (w=1; wrankStats[w] << (w+rescale); + rankVal0[w] = curr; + } } + { U32 const minBits = tableLog+1 - maxW; + U32 consumed; + for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { + U32* const rankValPtr = wksp->rankVal[consumed]; + U32 w; + for (w = 1; w < maxW+1; w++) { + rankValPtr[w] = rankVal0[w] >> consumed; + } } } } + + HUF_fillDTableX2(dt, maxTableLog, + wksp->sortedSymbol, + wksp->rankStart0, wksp->rankVal, maxW, + tableLog+1); + + dtd.tableLog = (BYTE)maxTableLog; + dtd.tableType = 1; + ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); + return iSize; +} + + +FORCE_INLINE_TEMPLATE U32 +HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + ZSTD_memcpy(op, &dt[val].sequence, 2); + BIT_skipBits(DStream, dt[val].nbBits); + return dt[val].length; +} + +FORCE_INLINE_TEMPLATE U32 +HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + ZSTD_memcpy(op, &dt[val].sequence, 1); + if (dt[val].length==1) { + BIT_skipBits(DStream, dt[val].nbBits); + } else { + if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { + BIT_skipBits(DStream, dt[val].nbBits); + if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) + /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ + DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); + } + } + return 1; +} + +#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ + do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ + } while (0) + +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ + } while (0) + +HINT_INLINE size_t +HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, + const HUF_DEltX2* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 8 symbols at a time */ + if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { + if (dtLog <= 11 && MEM_64bits()) { + /* up to 10 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } else { + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } + } else { + BIT_reloadDStream(bitDPtr); + } + + /* closer to end : up to 2 symbols at a time */ + if ((size_t)(pEnd - p) >= 2) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + } + + if (p < pEnd) + p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); + + return p-pStart; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BIT_DStream_t bitD; + + /* Init */ + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = (BYTE*)ZSTD_maybeNullPtrAdd(ostart, (ptrdiff_t)dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog); + } + + /* check */ + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; +} + +/* HUF_decompress4X2_usingDTable_internal_body(): + * Conditions: + * @dstSize >= 6 + */ +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - (sizeof(size_t)-1); + const void* const dtPtr = DTable+1; + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + size_t const segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + U32 endSignal = 1; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit); ) { +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; +#else + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = (U32)LIKELY((U32) + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); +#endif + } + } + + /* check corruption */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +static +size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +#endif + +static HUF_FAST_BMI2_ATTRS +void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) +{ + U64 bits[4]; + BYTE const* ip[4]; + BYTE* op[4]; + BYTE* oend[4]; + HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; + BYTE const* const ilowest = args->ilowest; + + /* Copy the arguments to local registers. */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); + ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + oend[0] = op[1]; + oend[1] = op[2]; + oend[2] = op[3]; + oend[3] = args->oend; + + assert(MEM_isLittleEndian()); + assert(!MEM_32bits()); + + for (;;) { + BYTE* olimit; + int stream; + + /* Assert loop preconditions */ +#ifndef NDEBUG + for (stream = 0; stream < 4; ++stream) { + assert(op[stream] <= oend[stream]); + assert(ip[stream] >= ilowest); + } +#endif + /* Compute olimit */ + { + /* Each loop does 5 table lookups for each of the 4 streams. + * Each table lookup consumes up to 11 bits of input, and produces + * up to 2 bytes of output. + */ + /* We can consume up to 7 bytes of input per iteration per stream. + * We also know that each input pointer is >= ip[0]. So we can run + * iters loops before running out of input. + */ + size_t iters = (size_t)(ip[0] - ilowest) / 7; + /* Each iteration can produce up to 10 bytes of output per stream. + * Each output stream my advance at different rates. So take the + * minimum number of safe iterations among all the output streams. + */ + for (stream = 0; stream < 4; ++stream) { + size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; + iters = MIN(iters, oiters); + } + + /* Each iteration produces at least 5 output symbols. So until + * op[3] crosses olimit, we know we haven't executed iters + * iterations yet. This saves us maintaining an iters counter, + * at the expense of computing the remaining # of iterations + * more frequently. + */ + olimit = op[3] + (iters * 5); + + /* Exit the fast decoding loop once we reach the end. */ + if (op[3] == olimit) + break; + + /* Exit the decoding loop if any input pointer has crossed the + * previous one. This indicates corruption, and a precondition + * to our loop is that ip[i] >= ip[0]. + */ + for (stream = 1; stream < 4; ++stream) { + if (ip[stream] < ip[stream - 1]) + goto _out; + } + } + +#ifndef NDEBUG + for (stream = 1; stream < 4; ++stream) { + assert(ip[stream] >= ip[stream - 1]); + } +#endif + +#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ + do { \ + if ((_decode3) || (_stream) != 3) { \ + int const index = (int)(bits[(_stream)] >> 53); \ + HUF_DEltX2 const entry = dtable[index]; \ + MEM_write16(op[(_stream)], entry.sequence); \ + bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ + op[(_stream)] += (entry.length); \ + } \ + } while (0) + +#define HUF_4X2_RELOAD_STREAM(_stream) \ + do { \ + HUF_4X2_DECODE_SYMBOL(3, 1); \ + { \ + int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + int const nbBits = ctz & 7; \ + int const nbBytes = ctz >> 3; \ + ip[(_stream)] -= nbBytes; \ + bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ + bits[(_stream)] <<= nbBits; \ + } \ + } while (0) + + /* Manually unroll the loop because compilers don't consistently + * unroll the inner loops, which destroys performance. + */ + do { + /* Decode 5 symbols from each of the first 3 streams. + * The final stream will be decoded during the reload phase + * to reduce register pressure. + */ + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + + /* Decode one symbol from the final stream */ + HUF_4X2_DECODE_SYMBOL(3, 1); + + /* Decode 4 symbols from the final stream & reload bitstreams. + * The final stream is reloaded last, meaning that all 5 symbols + * are decoded from the final stream before it is reloaded. + */ + HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); + } while (op[3] < olimit); + } + +#undef HUF_4X2_DECODE_SYMBOL +#undef HUF_4X2_RELOAD_STREAM + +_out: + + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); + ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + + +static HUF_FAST_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable, + HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; + const BYTE* const ilowest = (const BYTE*)cSrc; + BYTE* const oend = (BYTE*)ZSTD_maybeNullPtrAdd(dst, (ptrdiff_t)dstSize); + HUF_DecompressFastArgs args; + { + size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret == 0) + return 0; + } + + assert(args.ip[0] >= args.ilowest); + loopFn(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= ilowest); + assert(args.ip[1] >= ilowest); + assert(args.ip[2] >= ilowest); + assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); + + assert(ilowest == args.ilowest); + assert(ilowest + 6 == args.iend[0]); + (void)ilowest; + + /* finish bitStreams one by one */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) + return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int flags) +{ + HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; + HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; + +#if DYNAMIC_BMI2 + if (flags & HUF_flags_bmi2) { + fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; + } +# endif + } else { + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; + } +#endif + + if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { + size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); + if (ret != 0) + return ret; + } + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); +} + +HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int flags) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, + workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); +} + +static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int flags) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, + workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); +} + +#endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +/* ***********************************/ +/* Universal decompression selectors */ +/* ***********************************/ + + +#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) +typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = +{ + /* single, double, quad */ + {{0,0}, {1,1}}, /* Q==0 : impossible */ + {{0,0}, {1,1}}, /* Q==1 : impossible */ + {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ + {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ + {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ + {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ + {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ + {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ + {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ + {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ + {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ + {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ + {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ + {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ + {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */ + {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ +}; +#endif + +/** HUF_selectDecoder() : + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . + * Assumption : 0 < dstSize <= 128 KB */ +U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) +{ + assert(dstSize > 0); + assert(dstSize <= 128*1024); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dstSize; + (void)cSrcSize; + return 0; +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dstSize; + (void)cSrcSize; + return 1; +#else + /* decoder timing evaluation */ + { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ + U32 const D256 = (U32)(dstSize >> 8); + U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); + U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); + DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ + return DTime1 < DTime0; + } +#endif +} + +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int flags) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize, flags); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize, flags); +#else + return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize, flags); +#endif + } +} + + +size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); +#else + return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); +#endif +} + +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); +} +#endif + +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); +#else + return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); +#endif +} + +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize == 0) return ERROR(corruption_detected); + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); +#else + return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : + HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); +#endif + } +} diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c new file mode 100644 index 0000000..9eb9832 --- /dev/null +++ b/lib/decompress/zstd_decompress.c @@ -0,0 +1,2410 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* *************************************************************** +* Tuning parameters +*****************************************************************/ +/*! + * HEAPMODE : + * Select how default decompression function ZSTD_decompress() allocates its context, + * on stack (0), or into heap (1, default; requires malloc()). + * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected. + */ +#ifndef ZSTD_HEAPMODE +# define ZSTD_HEAPMODE 1 +#endif + +/*! +* LEGACY_SUPPORT : +* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+) +*/ +#ifndef ZSTD_LEGACY_SUPPORT +# define ZSTD_LEGACY_SUPPORT 0 +#endif + +/*! + * MAXWINDOWSIZE_DEFAULT : + * maximum window size accepted by DStream __by default__. + * Frames requiring more memory will be rejected. + * It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize(). + */ +#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT +# define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + 1) +#endif + +/*! + * NO_FORWARD_PROGRESS_MAX : + * maximum allowed nb of calls to ZSTD_decompressStream() + * without any forward progress + * (defined as: no byte read from input, and no byte flushed to output) + * before triggering an error. + */ +#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX +# define ZSTD_NO_FORWARD_PROGRESS_MAX 16 +#endif + + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ +#include "../common/error_private.h" +#include "../common/zstd_internal.h" /* blockProperties_t */ +#include "../common/mem.h" /* low level memory routines */ +#include "../common/bits.h" /* ZSTD_highbit32 */ +#define FSE_STATIC_LINKING_ONLY +#include "../common/fse.h" +#include "../common/huf.h" +#include "../common/xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */ +#include "zstd_decompress_internal.h" /* ZSTD_DCtx */ +#include "zstd_ddict.h" /* ZSTD_DDictDictContent */ +#include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) +# include "../legacy/zstd_legacy.h" +#endif + + + +/************************************* + * Multiple DDicts Hashset internals * + *************************************/ + +#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. + * Currently, that means a 0.75 load factor. + * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded + * the load factor of the ddict hash set. + */ + +#define DDICT_HASHSET_TABLE_BASE_SIZE 64 +#define DDICT_HASHSET_RESIZE_FACTOR 2 + +/* Hash function to determine starting position of dict insertion within the table + * Returns an index between [0, hashSet->ddictPtrTableSize] + */ +static size_t ZSTD_DDictHashSet_getIndex(const ZSTD_DDictHashSet* hashSet, U32 dictID) { + const U64 hash = XXH64(&dictID, sizeof(U32), 0); + /* DDict ptr table size is a multiple of 2, use size - 1 as mask to get index within [0, hashSet->ddictPtrTableSize) */ + return hash & (hashSet->ddictPtrTableSize - 1); +} + +/* Adds DDict to a hashset without resizing it. + * If inserting a DDict with a dictID that already exists in the set, replaces the one in the set. + * Returns 0 if successful, or a zstd error code if something went wrong. + */ +static size_t ZSTD_DDictHashSet_emplaceDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict) { + const U32 dictID = ZSTD_getDictID_fromDDict(ddict); + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + RETURN_ERROR_IF(hashSet->ddictPtrCount == hashSet->ddictPtrTableSize, GENERIC, "Hash set is full!"); + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + while (hashSet->ddictPtrTable[idx] != NULL) { + /* Replace existing ddict if inserting ddict with same dictID */ + if (ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]) == dictID) { + DEBUGLOG(4, "DictID already exists, replacing rather than adding"); + hashSet->ddictPtrTable[idx] = ddict; + return 0; + } + idx &= idxRangeMask; + idx++; + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + hashSet->ddictPtrTable[idx] = ddict; + hashSet->ddictPtrCount++; + return 0; +} + +/* Expands hash table by factor of DDICT_HASHSET_RESIZE_FACTOR and + * rehashes all values, allocates new table, frees old table. + * Returns 0 on success, otherwise a zstd error code. + */ +static size_t ZSTD_DDictHashSet_expand(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + size_t newTableSize = hashSet->ddictPtrTableSize * DDICT_HASHSET_RESIZE_FACTOR; + const ZSTD_DDict** newTable = (const ZSTD_DDict**)ZSTD_customCalloc(sizeof(ZSTD_DDict*) * newTableSize, customMem); + const ZSTD_DDict** oldTable = hashSet->ddictPtrTable; + size_t oldTableSize = hashSet->ddictPtrTableSize; + size_t i; + + DEBUGLOG(4, "Expanding DDict hash table! Old size: %zu new size: %zu", oldTableSize, newTableSize); + RETURN_ERROR_IF(!newTable, memory_allocation, "Expanded hashset allocation failed!"); + hashSet->ddictPtrTable = newTable; + hashSet->ddictPtrTableSize = newTableSize; + hashSet->ddictPtrCount = 0; + for (i = 0; i < oldTableSize; ++i) { + if (oldTable[i] != NULL) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, oldTable[i]), ""); + } + } + ZSTD_customFree((void*)oldTable, customMem); + DEBUGLOG(4, "Finished re-hash"); + return 0; +} + +/* Fetches a DDict with the given dictID + * Returns the ZSTD_DDict* with the requested dictID. If it doesn't exist, then returns NULL. + */ +static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, U32 dictID) { + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + for (;;) { + size_t currDictID = ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]); + if (currDictID == dictID || currDictID == 0) { + /* currDictID == 0 implies a NULL ddict entry */ + break; + } else { + idx &= idxRangeMask; /* Goes to start of table when we reach the end */ + idx++; + } + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + return hashSet->ddictPtrTable[idx]; +} + +/* Allocates space for and returns a ddict hash set + * The hash set's ZSTD_DDict* table has all values automatically set to NULL to begin with. + * Returns NULL if allocation failed. + */ +static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) { + ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem); + DEBUGLOG(4, "Allocating new hash set"); + if (!ret) + return NULL; + ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem); + if (!ret->ddictPtrTable) { + ZSTD_customFree(ret, customMem); + return NULL; + } + ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; + ret->ddictPtrCount = 0; + return ret; +} + +/* Frees the table of ZSTD_DDict* within a hashset, then frees the hashset itself. + * Note: The ZSTD_DDict* within the table are NOT freed. + */ +static void ZSTD_freeDDictHashSet(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + DEBUGLOG(4, "Freeing ddict hash set"); + if (hashSet && hashSet->ddictPtrTable) { + ZSTD_customFree((void*)hashSet->ddictPtrTable, customMem); + } + if (hashSet) { + ZSTD_customFree(hashSet, customMem); + } +} + +/* Public function: Adds a DDict into the ZSTD_DDictHashSet, possibly triggering a resize of the hash set. + * Returns 0 on success, or a ZSTD error. + */ +static size_t ZSTD_DDictHashSet_addDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict, ZSTD_customMem customMem) { + DEBUGLOG(4, "Adding dict ID: %u to hashset with - Count: %zu Tablesize: %zu", ZSTD_getDictID_fromDDict(ddict), hashSet->ddictPtrCount, hashSet->ddictPtrTableSize); + if (hashSet->ddictPtrCount * DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT / hashSet->ddictPtrTableSize * DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT != 0) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_expand(hashSet, customMem), ""); + } + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, ddict), ""); + return 0; +} + +/*-************************************************************* +* Context management +***************************************************************/ +size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support sizeof NULL */ + return sizeof(*dctx) + + ZSTD_sizeof_DDict(dctx->ddictLocal) + + dctx->inBuffSize + dctx->outBuffSize; +} + +size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); } + + +static size_t ZSTD_startingInputLength(ZSTD_format_e format) +{ + size_t const startingInputLength = ZSTD_FRAMEHEADERSIZE_PREFIX(format); + /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */ + assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) ); + return startingInputLength; +} + +static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) +{ + assert(dctx->streamStage == zdss_init); + dctx->format = ZSTD_f_zstd1; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; + dctx->disableHufAsm = 0; + dctx->maxBlockSizeParam = 0; +} + +static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +{ + dctx->staticSize = 0; + dctx->ddict = NULL; + dctx->ddictLocal = NULL; + dctx->dictEnd = NULL; + dctx->ddictIsCold = 0; + dctx->dictUses = ZSTD_dont_use; + dctx->inBuff = NULL; + dctx->inBuffSize = 0; + dctx->outBuffSize = 0; + dctx->streamStage = zdss_init; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + dctx->legacyContext = NULL; + dctx->previousLegacyVersion = 0; +#endif + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; + dctx->isFrameDecompression = 1; +#if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); +#endif + dctx->ddictSet = NULL; + ZSTD_DCtx_resetParameters(dctx); +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentEndForFuzzing = NULL; +#endif +} + +ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace; + + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL; /* minimum size */ + + ZSTD_initDCtx_internal(dctx); + dctx->staticSize = workspaceSize; + dctx->inBuff = (char*)(dctx+1); + return dctx; +} + +static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) { + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + + { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem); + if (!dctx) return NULL; + dctx->customMem = customMem; + ZSTD_initDCtx_internal(dctx); + return dctx; + } +} + +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_internal(customMem); +} + +ZSTD_DCtx* ZSTD_createDCtx(void) +{ + DEBUGLOG(3, "ZSTD_createDCtx"); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); +} + +static void ZSTD_clearDict(ZSTD_DCtx* dctx) +{ + ZSTD_freeDDict(dctx->ddictLocal); + dctx->ddictLocal = NULL; + dctx->ddict = NULL; + dctx->dictUses = ZSTD_dont_use; +} + +size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx"); + { ZSTD_customMem const cMem = dctx->customMem; + ZSTD_clearDict(dctx); + ZSTD_customFree(dctx->inBuff, cMem); + dctx->inBuff = NULL; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (dctx->legacyContext) + ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion); +#endif + if (dctx->ddictSet) { + ZSTD_freeDDictHashSet(dctx->ddictSet, cMem); + dctx->ddictSet = NULL; + } + ZSTD_customFree(dctx, cMem); + return 0; + } +} + +/* no longer useful */ +void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) +{ + size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx); + ZSTD_memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ +} + +/* Given a dctx with a digested frame params, re-selects the correct ZSTD_DDict based on + * the requested dict ID from the frame. If there exists a reference to the correct ZSTD_DDict, then + * accordingly sets the ddict to be used to decompress the frame. + * + * If no DDict is found, then no action is taken, and the ZSTD_DCtx::ddict remains as-is. + * + * ZSTD_d_refMultipleDDicts must be enabled for this function to be called. + */ +static void ZSTD_DCtx_selectFrameDDict(ZSTD_DCtx* dctx) { + assert(dctx->refMultipleDDicts && dctx->ddictSet); + DEBUGLOG(4, "Adjusting DDict based on requested dict ID from frame"); + if (dctx->ddict) { + const ZSTD_DDict* frameDDict = ZSTD_DDictHashSet_getDDict(dctx->ddictSet, dctx->fParams.dictID); + if (frameDDict) { + DEBUGLOG(4, "DDict found!"); + ZSTD_clearDict(dctx); + dctx->dictID = dctx->fParams.dictID; + dctx->ddict = frameDDict; + dctx->dictUses = ZSTD_use_indefinitely; + } + } +} + + +/*-************************************************************* + * Frame header decoding + ***************************************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +unsigned ZSTD_isFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if (magic == ZSTD_MAGICNUMBER) return 1; + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(buffer, size)) return 1; +#endif + return 0; +} + +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } + return 0; +} + +/** ZSTD_frameHeaderSize_internal() : + * srcSize must be large enough to reach header size fields. + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. + * @return : size of the Frame Header + * or an error code, which can be tested with ZSTD_isError() */ +static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format) +{ + size_t const minInputSize = ZSTD_startingInputLength(format); + RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, ""); + + { BYTE const fhd = ((const BYTE*)src)[minInputSize-1]; + U32 const dictID= fhd & 3; + U32 const singleSegment = (fhd >> 5) & 1; + U32 const fcsId = fhd >> 6; + return minInputSize + !singleSegment + + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + + (singleSegment && !fcsId); + } +} + +/** ZSTD_frameHeaderSize() : + * srcSize must be >= ZSTD_frameHeaderSize_prefix. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ +size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) +{ + return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1); +} + + +/** ZSTD_getFrameHeader_advanced() : + * decode Frame Header, or require larger `srcSize`. + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +** or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) +{ + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + + DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); + + if (srcSize > 0) { + /* note : technically could be considered an assert(), since it's an invalid entry */ + RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); + } + if (srcSize < minInputSize) { + if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { + /* when receiving less than @minInputSize bytes, + * control these bytes at least correspond to a supported magic number + * in order to error out early if they don't. + **/ + size_t const toCopy = MIN(4, srcSize); + unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); + assert(src != NULL); + ZSTD_memcpy(hbuf, src, toCopy); + if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { + /* not a zstd frame : let's check if it's a skippable frame */ + MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); + ZSTD_memcpy(hbuf, src, toCopy); + if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { + RETURN_ERROR(prefix_unknown, + "first bytes don't correspond to any supported magic number"); + } } } + return minInputSize; + } + + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + /* skippable frame */ + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); + zfhPtr->frameType = ZSTD_skippableFrame; + zfhPtr->dictID = MEM_readLE32(src) - ZSTD_MAGIC_SKIPPABLE_START; + zfhPtr->headerSize = ZSTD_SKIPPABLEHEADERSIZE; + zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + return 0; + } + RETURN_ERROR(prefix_unknown, ""); + } + + /* ensure there is enough `srcSize` to fully read/decode frame header */ + { size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format); + if (srcSize < fhsize) return fhsize; + zfhPtr->headerSize = (U32)fhsize; + } + + { BYTE const fhdByte = ip[minInputSize-1]; + size_t pos = minInputSize; + U32 const dictIDSizeCode = fhdByte&3; + U32 const checksumFlag = (fhdByte>>2)&1; + U32 const singleSegment = (fhdByte>>5)&1; + U32 const fcsID = fhdByte>>6; + U64 windowSize = 0; + U32 dictID = 0; + U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN; + RETURN_ERROR_IF((fhdByte & 0x08) != 0, frameParameter_unsupported, + "reserved bits, must be zero"); + + if (!singleSegment) { + BYTE const wlByte = ip[pos++]; + U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN; + RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, ""); + windowSize = (1ULL << windowLog); + windowSize += (windowSize >> 3) * (wlByte&7); + } + switch(dictIDSizeCode) + { + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; + case 0 : break; + case 1 : dictID = ip[pos]; pos++; break; + case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; + case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break; + } + switch(fcsID) + { + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; + case 0 : if (singleSegment) frameContentSize = ip[pos]; break; + case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; + case 2 : frameContentSize = MEM_readLE32(ip+pos); break; + case 3 : frameContentSize = MEM_readLE64(ip+pos); break; + } + if (singleSegment) windowSize = frameContentSize; + + zfhPtr->frameType = ZSTD_frame; + zfhPtr->frameContentSize = frameContentSize; + zfhPtr->windowSize = windowSize; + zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + zfhPtr->dictID = dictID; + zfhPtr->checksumFlag = checksumFlag; + } + return 0; +} + +/** ZSTD_getFrameHeader() : + * decode Frame Header, or require larger `srcSize`. + * note : this function does not consume input, it only reads it. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize) +{ + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); +} + +/** ZSTD_getFrameContentSize() : + * compatible with legacy mode + * @return : decompressed size of the single frame pointed to be `src` if known, otherwise + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) +{ +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize); + return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret; + } +#endif + { ZSTD_FrameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { + return 0; + } else { + return zfh.frameContentSize; + } } +} + +static size_t readSkippableFrameSize(void const* src, size_t srcSize) +{ + size_t const skippableHeaderSize = ZSTD_SKIPPABLEHEADERSIZE; + U32 sizeU32; + + RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); + { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } +} + +/*! ZSTD_readSkippableFrame() : + * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. + */ +size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, + unsigned* magicVariant, /* optional, can be NULL */ + const void* src, size_t srcSize) +{ + RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + + { U32 const magicNumber = MEM_readLE32(src); + size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); + size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst != NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); + if (magicVariant != NULL) + *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; + } +} + +/** ZSTD_findDecompressedSize() : + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames + * note: compatible with legacy mode + * @return : decompressed size of the frames contained */ +unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); + if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } + + { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); + if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + + if (totalDstSize + fcs < totalDstSize) + return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ + totalDstSize += fcs; + } + /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); + if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; + } + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + if (srcSize) return ZSTD_CONTENTSIZE_ERROR; + + return totalDstSize; +} + +/** ZSTD_getDecompressedSize() : + * compatible with legacy mode + * @return : decompressed size if known, 0 otherwise + note : 0 can mean any of the following : + - frame content is empty + - decompressed size field is not present in frame header + - frame header unknown / not supported + - frame header not complete (`srcSize` too small) */ +unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN); + return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret; +} + + +/** ZSTD_decodeFrameHeader() : + * `headerSize` must be the size provided by ZSTD_frameHeaderSize(). + * If multiple DDict references are enabled, also will choose the correct DDict to use. + * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ +static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) +{ + size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format); + if (ZSTD_isError(result)) return result; /* invalid header */ + RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small"); + + /* Reference DDict requested by frame if dctx references multiple ddicts */ + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts && dctx->ddictSet) { + ZSTD_DCtx_selectFrameDDict(dctx); + } + +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* Skip the dictID check in fuzzing mode, because it makes the search + * harder. + */ + RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID), + dictionary_wrong, ""); +#endif + dctx->validateChecksum = (dctx->fParams.checksumFlag && !dctx->forceIgnoreChecksum) ? 1 : 0; + if (dctx->validateChecksum) XXH64_reset(&dctx->xxhState, 0); + dctx->processedCSize += headerSize; + return 0; +} + +static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) +{ + ZSTD_frameSizeInfo frameSizeInfo; + frameSizeInfo.compressedSize = ret; + frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR; + return frameSizeInfo; +} + +static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) +{ + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize)) + return ZSTD_findFrameSizeInfoLegacy(src, srcSize); +#endif + + if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || + frameSizeInfo.compressedSize <= srcSize); + return frameSizeInfo; + } else { + const BYTE* ip = (const BYTE*)src; + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; + ZSTD_FrameHeader zfh; + + /* Extract Frame Header */ + { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + } + + ip += zfh.headerSize; + remainingSize -= zfh.headerSize; + + /* Iterate over each block */ + while (1) { + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) + return ZSTD_errorFrameSizeInfo(cBlockSize); + + if (ZSTD_blockHeaderSize + cBlockSize > remainingSize) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + + ip += ZSTD_blockHeaderSize + cBlockSize; + remainingSize -= ZSTD_blockHeaderSize + cBlockSize; + nbBlocks++; + + if (blockProperties.lastBlock) break; + } + + /* Final frame content checksum */ + if (zfh.checksumFlag) { + if (remainingSize < 4) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + ip += 4; + } + + frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? zfh.frameContentSize + : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } +} + +static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); + return frameSizeInfo.compressedSize; +} + +/** ZSTD_findFrameCompressedSize() : + * See docs in zstd.h + * Note: compatible with legacy mode */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) +{ + return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); +} + +/** ZSTD_decompressBound() : + * compatible with legacy mode + * `src` must point to the start of a ZSTD frame or a skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the maximum decompressed size of the compressed source + */ +unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) +{ + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) + return ZSTD_CONTENTSIZE_ERROR; + assert(srcSize >= compressedSize); + src = (const BYTE*)src + compressedSize; + srcSize -= compressedSize; + bound += decompressedBound; + } + return bound; +} + +size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) +{ + size_t margin = 0; + unsigned maxBlockSize = 0; + + /* Iterate over each frame */ + while (srcSize > 0) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + ZSTD_FrameHeader zfh; + + FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) + return ERROR(corruption_detected); + + if (zfh.frameType == ZSTD_frame) { + /* Add the frame header to our margin */ + margin += zfh.headerSize; + /* Add the checksum to our margin */ + margin += zfh.checksumFlag ? 4 : 0; + /* Add 3 bytes per block */ + margin += 3 * frameSizeInfo.nbBlocks; + + /* Compute the max block size */ + maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); + } else { + assert(zfh.frameType == ZSTD_skippableFrame); + /* Add the entire skippable frame size to our margin. */ + margin += compressedSize; + } + + assert(srcSize >= compressedSize); + src = (const BYTE*)src + compressedSize; + srcSize -= compressedSize; + } + + /* Add the max block size back to the margin. */ + margin += maxBlockSize; + + return margin; +} + +/*-************************************************************* + * Frame decoding + ***************************************************************/ + +/** ZSTD_insertBlock() : + * insert `src` block into `dctx` history. Useful to track uncompressed blocks. */ +size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize) +{ + DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize); + ZSTD_checkContinuity(dctx, blockStart, blockSize); + dctx->previousDstEnd = (const char*)blockStart + blockSize; + return blockSize; +} + + +static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_copyRawBlock"); + RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, ""); + if (dst == NULL) { + if (srcSize == 0) return 0; + RETURN_ERROR(dstBuffer_null, ""); + } + ZSTD_memmove(dst, src, srcSize); + return srcSize; +} + +static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, + BYTE b, + size_t regenSize) +{ + RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, ""); + if (dst == NULL) { + if (regenSize == 0) return 0; + RETURN_ERROR(dstBuffer_null, ""); + } + ZSTD_memset(dst, b, regenSize); + return regenSize; +} + +static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, int streaming) +{ +#if ZSTD_TRACE + if (dctx->traceCtx && ZSTD_trace_decompress_end != NULL) { + ZSTD_Trace trace; + ZSTD_memset(&trace, 0, sizeof(trace)); + trace.version = ZSTD_VERSION_NUMBER; + trace.streaming = streaming; + if (dctx->ddict) { + trace.dictionaryID = ZSTD_getDictID_fromDDict(dctx->ddict); + trace.dictionarySize = ZSTD_DDict_dictSize(dctx->ddict); + trace.dictionaryIsCold = dctx->ddictIsCold; + } + trace.uncompressedSize = (size_t)uncompressedSize; + trace.compressedSize = (size_t)compressedSize; + trace.dctx = dctx; + ZSTD_trace_decompress_end(dctx->traceCtx, &trace); + } +#else + (void)dctx; + (void)uncompressedSize; + (void)compressedSize; + (void)streaming; +#endif +} + + +/*! ZSTD_decompressFrame() : + * @dctx must be properly initialized + * will update *srcPtr and *srcSizePtr, + * to make *srcPtr progress by one frame. */ +static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void** srcPtr, size_t *srcSizePtr) +{ + const BYTE* const istart = (const BYTE*)(*srcPtr); + const BYTE* ip = istart; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart; + BYTE* op = ostart; + size_t remainingSrcSize = *srcSizePtr; + + DEBUGLOG(4, "ZSTD_decompressFrame (srcSize:%i)", (int)*srcSizePtr); + + /* check */ + RETURN_ERROR_IF( + remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format)+ZSTD_blockHeaderSize, + srcSize_wrong, ""); + + /* Frame Header */ + { size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal( + ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format); + if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize; + RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize+ZSTD_blockHeaderSize, + srcSize_wrong, ""); + FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) , ""); + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + + /* Shrink the blockSizeMax if enabled */ + if (dctx->maxBlockSizeParam != 0) + dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); + + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; + size_t decodedSize; + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) return cBlockSize; + + ip += ZSTD_blockHeaderSize; + remainingSrcSize -= ZSTD_blockHeaderSize; + RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); + + if (ip >= op && ip < oBlockEnd) { + /* We are decompressing in-place. Limit the output pointer so that we + * don't overwrite the block that we are currently reading. This will + * fail decompression if the input & output pointers aren't spaced + * far enough apart. + * + * This is important to set, even when the pointers are far enough + * apart, because ZSTD_decompressBlock_internal() can decide to store + * literals in the output buffer, after the block it is decompressing. + * Since we don't want anything to overwrite our input, we have to tell + * ZSTD_decompressBlock_internal to never write past ip. + * + * See ZSTD_allocateLiteralsBuffer() for reference. + */ + oBlockEnd = op + (ip - op); + } + + switch(blockProperties.blockType) + { + case bt_compressed: + assert(dctx->isFrameDecompression == 1); + decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ + decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); + break; + case bt_rle : + decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize); + break; + case bt_reserved : + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } + FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); + DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); + if (dctx->validateChecksum) { + XXH64_update(&dctx->xxhState, op, decodedSize); + } + if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; + } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; + if (blockProperties.lastBlock) break; + } + + if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) { + RETURN_ERROR_IF((U64)(op-ostart) != dctx->fParams.frameContentSize, + corruption_detected, ""); + } + if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ + RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong, ""); + if (!dctx->forceIgnoreChecksum) { + U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState); + U32 checkRead; + checkRead = MEM_readLE32(ip); + RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, ""); + } + ip += 4; + remainingSrcSize -= 4; + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ + DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %i, consuming %i bytes of input", (int)(op-ostart), (int)(ip - (const BYTE*)*srcPtr)); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); +} + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + const ZSTD_DDict* ddict) +{ + void* const dststart = dst; + int moreThan1Frame = 0; + + DEBUGLOG(5, "ZSTD_decompressMultiFrame"); + assert(dict==NULL || ddict==NULL); /* either dict or ddict set, not both */ + + if (ddict) { + dict = ZSTD_DDict_dictContent(ddict); + dictSize = ZSTD_DDict_dictSize(ddict); + } + + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (dctx->format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize)) { + size_t decodedSize; + size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize); + if (ZSTD_isError(frameSize)) return frameSize; + RETURN_ERROR_IF(dctx->staticSize, memory_allocation, + "legacy support is not compatible with static dctx"); + + decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize); + if (ZSTD_isError(decodedSize)) return decodedSize; + + { + unsigned long long const expectedSize = ZSTD_getFrameContentSize(src, srcSize); + RETURN_ERROR_IF(expectedSize == ZSTD_CONTENTSIZE_ERROR, corruption_detected, "Corrupted frame header!"); + if (expectedSize != ZSTD_CONTENTSIZE_UNKNOWN) { + RETURN_ERROR_IF(expectedSize != decodedSize, corruption_detected, + "Frame header size does not match decoded size!"); + } + } + + assert(decodedSize <= dstCapacity); + dst = (BYTE*)dst + decodedSize; + dstCapacity -= decodedSize; + + src = (const BYTE*)src + frameSize; + srcSize -= frameSize; + + continue; + } +#endif + + if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { + U32 const magicNumber = MEM_readLE32(src); + DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); + FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; /* check next frame */ + } } + + if (ddict) { + /* we were called from ZSTD_decompress_usingDDict */ + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), ""); + } else { + /* this will initialize correctly with no dict if dict == NULL, so + * use this in all cases but ddict */ + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), ""); + } + ZSTD_checkContinuity(dctx, dst, dstCapacity); + + { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, + &src, &srcSize); + RETURN_ERROR_IF( + (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown) + && (moreThan1Frame==1), + srcSize_wrong, + "At least one frame successfully completed, " + "but following bytes are garbage: " + "it's more likely to be a srcSize error, " + "specifying more input bytes than size of frame(s). " + "Note: one could be unlucky, it might be a corruption error instead, " + "happening right at the place where we expect zstd magic bytes. " + "But this is _much_ less likely than a srcSize field error."); + if (ZSTD_isError(res)) return res; + assert(res <= dstCapacity); + if (res != 0) + dst = (BYTE*)dst + res; + dstCapacity -= res; + } + moreThan1Frame = 1; + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed"); + + return (size_t)((BYTE*)dst - (BYTE*)dststart); +} + +size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize) +{ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL); +} + + +static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) +{ + switch (dctx->dictUses) { + default: + assert(0 /* Impossible */); + ZSTD_FALLTHROUGH; + case ZSTD_dont_use: + ZSTD_clearDict(dctx); + return NULL; + case ZSTD_use_indefinitely: + return dctx->ddict; + case ZSTD_use_once: + dctx->dictUses = ZSTD_dont_use; + return dctx->ddict; + } +} + +size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + return ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ZSTD_getDDict(dctx)); +} + + +size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ +#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) + size_t regenSize; + ZSTD_DCtx* const dctx = ZSTD_createDCtx_internal(ZSTD_defaultCMem); + RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!"); + regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize); + ZSTD_freeDCtx(dctx); + return regenSize; +#else /* stack mode */ + ZSTD_DCtx dctx; + ZSTD_initDCtx_internal(&dctx); + return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize); +#endif +} + + +/*-************************************** +* Advanced Streaming Decompression API +* Bufferless and synchronous +****************************************/ +size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + +/** + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we + * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce + * output, and avoid copying the input. + * + * @param inputSize - The total amount of input that the caller currently has. + */ +static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) { + if (!(dctx->stage == ZSTDds_decompressBlock || dctx->stage == ZSTDds_decompressLastBlock)) + return dctx->expected; + if (dctx->bType != bt_raw) + return dctx->expected; + return BOUNDED(1, inputSize, dctx->expected); +} + +ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { + switch(dctx->stage) + { + default: /* should not happen */ + assert(0); + ZSTD_FALLTHROUGH; + case ZSTDds_getFrameHeaderSize: + ZSTD_FALLTHROUGH; + case ZSTDds_decodeFrameHeader: + return ZSTDnit_frameHeader; + case ZSTDds_decodeBlockHeader: + return ZSTDnit_blockHeader; + case ZSTDds_decompressBlock: + return ZSTDnit_block; + case ZSTDds_decompressLastBlock: + return ZSTDnit_lastBlock; + case ZSTDds_checkChecksum: + return ZSTDnit_checksum; + case ZSTDds_decodeSkippableHeader: + ZSTD_FALLTHROUGH; + case ZSTDds_skipFrame: + return ZSTDnit_skippableFrame; + } +} + +static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; } + +/** ZSTD_decompressContinue() : + * srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress()) + * @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity) + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize); + /* Sanity check */ + RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed"); + ZSTD_checkContinuity(dctx, dst, dstCapacity); + + dctx->processedCSize += srcSize; + + switch (dctx->stage) + { + case ZSTDds_getFrameHeaderSize : + assert(src != NULL); + if (dctx->format == ZSTD_f_zstd1) { /* allows header */ + assert(srcSize >= ZSTD_FRAMEIDSIZE); /* to read skippable magic number */ + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + ZSTD_memcpy(dctx->headerBuffer, src, srcSize); + dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize; /* remaining to load to get full skippable frame header */ + dctx->stage = ZSTDds_decodeSkippableHeader; + return 0; + } } + dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format); + if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize; + ZSTD_memcpy(dctx->headerBuffer, src, srcSize); + dctx->expected = dctx->headerSize - srcSize; + dctx->stage = ZSTDds_decodeFrameHeader; + return 0; + + case ZSTDds_decodeFrameHeader: + assert(src != NULL); + ZSTD_memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize); + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), ""); + dctx->expected = ZSTD_blockHeaderSize; + dctx->stage = ZSTDds_decodeBlockHeader; + return 0; + + case ZSTDds_decodeBlockHeader: + { blockProperties_t bp; + size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp); + if (ZSTD_isError(cBlockSize)) return cBlockSize; + RETURN_ERROR_IF(cBlockSize > dctx->fParams.blockSizeMax, corruption_detected, "Block Size Exceeds Maximum"); + dctx->expected = cBlockSize; + dctx->bType = bp.blockType; + dctx->rleSize = bp.origSize; + if (cBlockSize) { + dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock; + return 0; + } + /* empty block */ + if (bp.lastBlock) { + if (dctx->fParams.checksumFlag) { + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* end of frame */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->expected = ZSTD_blockHeaderSize; /* jump to next header */ + dctx->stage = ZSTDds_decodeBlockHeader; + } + return 0; + } + + case ZSTDds_decompressLastBlock: + case ZSTDds_decompressBlock: + DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock"); + { size_t rSize; + switch(dctx->bType) + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); + assert(dctx->isFrameDecompression == 1); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : + assert(srcSize <= dctx->expected); + rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); + FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed"); + assert(rSize == srcSize); + dctx->expected -= rSize; + break; + case bt_rle : + rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_reserved : /* should never happen */ + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } + FORWARD_IF_ERROR(rSize, ""); + RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum"); + DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize); + dctx->decodedSize += rSize; + if (dctx->validateChecksum) XXH64_update(&dctx->xxhState, dst, rSize); + dctx->previousDstEnd = (char*)dst + rSize; + + /* Stay on the same stage until we are finished streaming the block. */ + if (dctx->expected > 0) { + return rSize; + } + + if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */ + DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize); + RETURN_ERROR_IF( + dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && dctx->decodedSize != dctx->fParams.frameContentSize, + corruption_detected, ""); + if (dctx->fParams.checksumFlag) { /* another round for frame checksum */ + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); + dctx->expected = 0; /* ends here */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->stage = ZSTDds_decodeBlockHeader; + dctx->expected = ZSTD_blockHeaderSize; + } + return rSize; + } + + case ZSTDds_checkChecksum: + assert(srcSize == 4); /* guaranteed by dctx->expected */ + { + if (dctx->validateChecksum) { + U32 const h32 = (U32)XXH64_digest(&dctx->xxhState); + U32 const check32 = MEM_readLE32(src); + DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32); + RETURN_ERROR_IF(check32 != h32, checksum_wrong, ""); + } + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + } + + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); + assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; + return 0; + + case ZSTDds_skipFrame: + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + + default: + assert(0); /* impossible */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } +} + + +static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + dctx->dictEnd = dctx->previousDstEnd; + dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dict; + dctx->previousDstEnd = (const char*)dict + dictSize; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif + return 0; +} + +/*! ZSTD_loadDEntropy() : + * dict : must point at beginning of a valid zstd dictionary. + * @return : size of entropy tables read */ +size_t +ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + const void* const dict, size_t const dictSize) +{ + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; + + RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small"); + assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */ + dictPtr += 8; /* skip header = magic + dictID */ + + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable)); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable)); + ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE); + { void* const workspace = &entropy->LLTable; /* use fse tables as temporary workspace; implies fse tables are grouped together */ + size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable); +#ifdef HUF_FORCE_DECOMPRESS_X1 + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, + workspace, workspaceSize, /* flags */ 0); +#else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), + workspace, workspaceSize, /* flags */ 0); +#endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; + } + + { short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff, offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->OFTable, + offcodeNCount, offcodeMaxValue, + OF_base, OF_bits, + offcodeLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */0); + dictPtr += offcodeHeaderSize; + } + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->MLTable, + matchlengthNCount, matchlengthMaxValue, + ML_base, ML_bits, + matchlengthLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */ 0); + dictPtr += matchlengthHeaderSize; + } + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->LLTable, + litlengthNCount, litlengthMaxValue, + LL_base, LL_bits, + litlengthLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */ 0); + dictPtr += litlengthHeaderSize; + } + + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); + { int i; + size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12)); + for (i=0; i<3; i++) { + U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4; + RETURN_ERROR_IF(rep==0 || rep > dictContentSize, + dictionary_corrupted, ""); + entropy->rep[i] = rep; + } } + + return (size_t)(dictPtr - (const BYTE*)dict); +} + +static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize); + { U32 const magic = MEM_readLE32(dict); + if (magic != ZSTD_MAGIC_DICTIONARY) { + return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */ + } } + dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); + + /* load entropy tables */ + { size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize); + RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, ""); + dict = (const char*)dict + eSize; + dictSize -= eSize; + } + dctx->litEntropy = dctx->fseEntropy = 1; + + /* reference dictionary content */ + return ZSTD_refDictContent(dctx, dict, dictSize); +} + +size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) +{ + assert(dctx != NULL); +#if ZSTD_TRACE + dctx->traceCtx = (ZSTD_trace_decompress_begin != NULL) ? ZSTD_trace_decompress_begin(dctx) : 0; +#endif + dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */ + dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->processedCSize = 0; + dctx->decodedSize = 0; + dctx->previousDstEnd = NULL; + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; + dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; + dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; + dctx->MLTptr = dctx->entropy.MLTable; + dctx->OFTptr = dctx->entropy.OFTable; + dctx->HUFptr = dctx->entropy.hufTable; + return 0; +} + +size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); + if (dict && dictSize) + RETURN_ERROR_IF( + ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)), + dictionary_corrupted, ""); + return 0; +} + + +/* ====== ZSTD_DDict ====== */ + +size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict"); + assert(dctx != NULL); + if (ddict) { + const char* const dictStart = (const char*)ZSTD_DDict_dictContent(ddict); + size_t const dictSize = ZSTD_DDict_dictSize(ddict); + const void* const dictEnd = dictStart + dictSize; + dctx->ddictIsCold = (dctx->dictEnd != dictEnd); + DEBUGLOG(4, "DDict is %s", + dctx->ddictIsCold ? "~cold~" : "hot!"); + } + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); + if (ddict) { /* NULL ddict is equivalent to no dictionary */ + ZSTD_copyDDictParameters(dctx, ddict); + } + return 0; +} + +/*! ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * if @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) +{ + if (dictSize < 8) return 0; + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0; + return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); +} + +/*! ZSTD_getDictID_fromFrame() : + * Provides the dictID required to decompress frame stored within `src`. + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. + * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. + * - This is not a Zstandard frame. + * When identifying the exact failure cause, it's possible to use + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ +unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) +{ + ZSTD_FrameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +} + + +/*! ZSTD_decompress_usingDDict() : +* Decompression using a pre-digested Dictionary +* Use dictionary without significant overhead. */ +size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_DDict* ddict) +{ + /* pass content and size in case legacy frames are encountered */ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, + NULL, 0, + ddict); +} + + +/*===================================== +* Streaming decompression +*====================================*/ + +ZSTD_DStream* ZSTD_createDStream(void) +{ + DEBUGLOG(3, "ZSTD_createDStream"); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); +} + +ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticDCtx(workspace, workspaceSize); +} + +ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_internal(customMem); +} + +size_t ZSTD_freeDStream(ZSTD_DStream* zds) +{ + return ZSTD_freeDCtx(zds); +} + + +/* *** Initialization *** */ + +size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; } +size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + if (dict && dictSize != 0) { + dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem); + RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!"); + dctx->ddict = dctx->ddictLocal; + dctx->dictUses = ZSTD_use_indefinitely; + } + return 0; +} + +size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); +} + +size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), ""); + dctx->dictUses = ZSTD_use_once; + return 0; +} + +size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + + +/* ZSTD_initDStream_usingDict() : + * return : expected size, aka ZSTD_startingInputLength(). + * this function cannot fail */ +size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize) +{ + DEBUGLOG(4, "ZSTD_initDStream_usingDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) , ""); + return ZSTD_startingInputLength(zds->format); +} + +/* note : this variant can't fail */ +size_t ZSTD_initDStream(ZSTD_DStream* zds) +{ + DEBUGLOG(4, "ZSTD_initDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); + FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); + return ZSTD_startingInputLength(zds->format); +} + +/* ZSTD_initDStream_usingDDict() : + * ddict will just be referenced, and must outlive decompression session + * this function cannot fail */ +size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) +{ + DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +} + +/* ZSTD_resetDStream() : + * return : expected size, aka ZSTD_startingInputLength(). + * this function cannot fail */ +size_t ZSTD_resetDStream(ZSTD_DStream* dctx) +{ + DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); +} + + +size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + if (ddict) { + dctx->ddict = ddict; + dctx->dictUses = ZSTD_use_indefinitely; + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts) { + if (dctx->ddictSet == NULL) { + dctx->ddictSet = ZSTD_createDDictHashSet(dctx->customMem); + if (!dctx->ddictSet) { + RETURN_ERROR(memory_allocation, "Failed to allocate memory for hash set!"); + } + } + assert(!dctx->staticSize); /* Impossible: ddictSet cannot have been allocated if static dctx */ + FORWARD_IF_ERROR(ZSTD_DDictHashSet_addDDict(dctx->ddictSet, ddict, dctx->customMem), ""); + } + } + return 0; +} + +/* ZSTD_DCtx_setMaxWindowSize() : + * note : no direct equivalence in ZSTD_DCtx_setParameter, + * since this version sets windowSize, and the other sets windowLog */ +size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize) +{ + ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax); + size_t const min = (size_t)1 << bounds.lowerBound; + size_t const max = (size_t)1 << bounds.upperBound; + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, ""); + RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, ""); + dctx->maxWindowSize = maxWindowSize; + return 0; +} + +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format) +{ + return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, (int)format); +} + +ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) +{ + ZSTD_bounds bounds = { 0, 0, 0 }; + switch(dParam) { + case ZSTD_d_windowLogMax: + bounds.lowerBound = ZSTD_WINDOWLOG_ABSOLUTEMIN; + bounds.upperBound = ZSTD_WINDOWLOG_MAX; + return bounds; + case ZSTD_d_format: + bounds.lowerBound = (int)ZSTD_f_zstd1; + bounds.upperBound = (int)ZSTD_f_zstd1_magicless; + ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless); + return bounds; + case ZSTD_d_stableOutBuffer: + bounds.lowerBound = (int)ZSTD_bm_buffered; + bounds.upperBound = (int)ZSTD_bm_stable; + return bounds; + case ZSTD_d_forceIgnoreChecksum: + bounds.lowerBound = (int)ZSTD_d_validateChecksum; + bounds.upperBound = (int)ZSTD_d_ignoreChecksum; + return bounds; + case ZSTD_d_refMultipleDDicts: + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; + case ZSTD_d_disableHuffmanAssembly: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + case ZSTD_d_maxBlockSize: + bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; + bounds.upperBound = ZSTD_BLOCKSIZE_MAX; + return bounds; + + default:; + } + bounds.error = ERROR(parameter_unsupported); + return bounds; +} + +/* ZSTD_dParam_withinBounds: + * @return 1 if value is within dParam bounds, + * 0 otherwise */ +static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value) +{ + ZSTD_bounds const bounds = ZSTD_dParam_getBounds(dParam); + if (ZSTD_isError(bounds.error)) return 0; + if (value < bounds.lowerBound) return 0; + if (value > bounds.upperBound) return 0; + return 1; +} + +#define CHECK_DBOUNDS(p,v) { \ + RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \ +} + +size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value) +{ + switch (param) { + case ZSTD_d_windowLogMax: + *value = (int)ZSTD_highbit32((U32)dctx->maxWindowSize); + return 0; + case ZSTD_d_format: + *value = (int)dctx->format; + return 0; + case ZSTD_d_stableOutBuffer: + *value = (int)dctx->outBufferMode; + return 0; + case ZSTD_d_forceIgnoreChecksum: + *value = (int)dctx->forceIgnoreChecksum; + return 0; + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; + case ZSTD_d_disableHuffmanAssembly: + *value = (int)dctx->disableHufAsm; + return 0; + case ZSTD_d_maxBlockSize: + *value = dctx->maxBlockSizeParam; + return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +} + +size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + switch(dParam) { + case ZSTD_d_windowLogMax: + if (value == 0) value = ZSTD_WINDOWLOG_LIMIT_DEFAULT; + CHECK_DBOUNDS(ZSTD_d_windowLogMax, value); + dctx->maxWindowSize = ((size_t)1) << value; + return 0; + case ZSTD_d_format: + CHECK_DBOUNDS(ZSTD_d_format, value); + dctx->format = (ZSTD_format_e)value; + return 0; + case ZSTD_d_stableOutBuffer: + CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value); + dctx->outBufferMode = (ZSTD_bufferMode_e)value; + return 0; + case ZSTD_d_forceIgnoreChecksum: + CHECK_DBOUNDS(ZSTD_d_forceIgnoreChecksum, value); + dctx->forceIgnoreChecksum = (ZSTD_forceIgnoreChecksum_e)value; + return 0; + case ZSTD_d_refMultipleDDicts: + CHECK_DBOUNDS(ZSTD_d_refMultipleDDicts, value); + if (dctx->staticSize != 0) { + RETURN_ERROR(parameter_unsupported, "Static dctx does not support multiple DDicts!"); + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; + case ZSTD_d_disableHuffmanAssembly: + CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); + dctx->disableHufAsm = value != 0; + return 0; + case ZSTD_d_maxBlockSize: + if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); + dctx->maxBlockSizeParam = value; + return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +} + +size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) +{ + if ( (reset == ZSTD_reset_session_only) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + ZSTD_DCtx_resetParameters(dctx); + } + return 0; +} + + +size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) +{ + return ZSTD_sizeof_DCtx(dctx); +} + +static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) +{ + size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); + /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block + * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing + * the block at the beginning of the output buffer, and maintain a full window. + * + * We need another blockSize worth of buffer so that we can store split + * literals at the end of the block without overwriting the extDict window. + */ + unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, + frameParameter_windowTooLarge, ""); + return minRBSize; +} + +size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) +{ + return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); +} + +size_t ZSTD_estimateDStreamSize(size_t windowSize) +{ + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + size_t const inBuffSize = blockSize; /* no block can be larger */ + size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN); + return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize; +} + +size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) +{ + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ + ZSTD_FrameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); + RETURN_ERROR_IF(zfh.windowSize > windowSizeMax, + frameParameter_windowTooLarge, ""); + return ZSTD_estimateDStreamSize((size_t)zfh.windowSize); +} + + +/* ***** Decompression ***** */ + +static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) +{ + return (zds->inBuffSize + zds->outBuffSize) >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR; +} + +static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) +{ + if (ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize)) + zds->oversizedDuration++; + else + zds->oversizedDuration = 0; +} + +static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds) +{ + return zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION; +} + +/* Checks that the output buffer hasn't changed if ZSTD_obm_stable is used. */ +static size_t ZSTD_checkOutBuffer(ZSTD_DStream const* zds, ZSTD_outBuffer const* output) +{ + ZSTD_outBuffer const expect = zds->expectedOutBuffer; + /* No requirement when ZSTD_obm_stable is not enabled. */ + if (zds->outBufferMode != ZSTD_bm_stable) + return 0; + /* Any buffer is allowed in zdss_init, this must be the same for every other call until + * the context is reset. + */ + if (zds->streamStage == zdss_init) + return 0; + /* The buffer must match our expectation exactly. */ + if (expect.dst == output->dst && expect.pos == output->pos && expect.size == output->size) + return 0; + RETURN_ERROR(dstBuffer_wrong, "ZSTD_d_stableOutBuffer enabled but output differs!"); +} + +/* Calls ZSTD_decompressContinue() with the right parameters for ZSTD_decompressStream() + * and updates the stage and the output buffer state. This call is extracted so it can be + * used both when reading directly from the ZSTD_inBuffer, and in buffered input mode. + * NOTE: You must break after calling this function since the streamStage is modified. + */ +static size_t ZSTD_decompressContinueStream( + ZSTD_DStream* zds, char** op, char* oend, + void const* src, size_t srcSize) { + int const isSkipFrame = ZSTD_isSkipFrame(zds); + if (zds->outBufferMode == ZSTD_bm_buffered) { + size_t const dstSize = isSkipFrame ? 0 : zds->outBuffSize - zds->outStart; + size_t const decodedSize = ZSTD_decompressContinue(zds, + zds->outBuff + zds->outStart, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + if (!decodedSize && !isSkipFrame) { + zds->streamStage = zdss_read; + } else { + zds->outEnd = zds->outStart + decodedSize; + zds->streamStage = zdss_flush; + } + } else { + /* Write directly into the output buffer */ + size_t const dstSize = isSkipFrame ? 0 : (size_t)(oend - *op); + size_t const decodedSize = ZSTD_decompressContinue(zds, *op, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + *op += decodedSize; + /* Flushing is not needed. */ + zds->streamStage = zdss_read; + assert(*op <= oend); + assert(zds->outBufferMode == ZSTD_bm_stable); + } + return 0; +} + +size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + const char* const src = (const char*)input->src; + const char* const istart = input->pos != 0 ? src + input->pos : src; + const char* const iend = input->size != 0 ? src + input->size : src; + const char* ip = istart; + char* const dst = (char*)output->dst; + char* const ostart = output->pos != 0 ? dst + output->pos : dst; + char* const oend = output->size != 0 ? dst + output->size : dst; + char* op = ostart; + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); + assert(zds != NULL); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, + "forbidden. in: pos: %u vs size: %u", + (U32)input->pos, (U32)input->size); + RETURN_ERROR_IF( + output->pos > output->size, + dstSize_tooSmall, + "forbidden. out: pos: %u vs size: %u", + (U32)output->pos, (U32)output->size); + DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos)); + FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), ""); + + while (someMoreWork) { + switch(zds->streamStage) + { + case zdss_init : + DEBUGLOG(5, "stage zdss_init => transparent reset "); + zds->streamStage = zdss_loadHeader; + zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + zds->legacyVersion = 0; +#endif + zds->hostageByte = 0; + zds->expectedOutBuffer = *output; + ZSTD_FALLTHROUGH; + + case zdss_loadHeader : + DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + if (zds->legacyVersion) { + RETURN_ERROR_IF(zds->staticSize, memory_allocation, + "legacy support is incompatible with static dctx"); + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; + return hint; + } } +#endif + { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } + if (ZSTD_isError(hSize)) { +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart); + if (legacyVersion) { + ZSTD_DDict const* const ddict = ZSTD_getDDict(zds); + const void* const dict = ddict ? ZSTD_DDict_dictContent(ddict) : NULL; + size_t const dictSize = ddict ? ZSTD_DDict_dictSize(ddict) : 0; + DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion); + RETURN_ERROR_IF(zds->staticSize, memory_allocation, + "legacy support is incompatible with static dctx"); + FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext, + zds->previousLegacyVersion, legacyVersion, + dict, dictSize), ""); + zds->legacyVersion = zds->previousLegacyVersion = legacyVersion; + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */ + return hint; + } } +#endif + return hSize; /* error */ + } + if (hSize != 0) { /* need more input */ + size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */ + size_t const remainingInput = (size_t)(iend-ip); + assert(iend >= ip); + if (toLoad > remainingInput) { /* not enough input to load full header */ + if (remainingInput > 0) { + ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput); + zds->lhSize += remainingInput; + } + input->pos = input->size; + /* check first few bytes */ + FORWARD_IF_ERROR( + ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), + "First few bytes detected incorrect" ); + /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); + ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad; + break; + } } + + /* check for single-pass mode opportunity */ + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { + size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); + assert(istart != NULL); + ip = istart + cSize; + op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } } + + /* Check output buffer is large enough for ZSTD_odm_stable. */ + if (zds->outBufferMode == ZSTD_bm_stable + && zds->fParams.frameType != ZSTD_skippableFrame + && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && (U64)(size_t)(oend-op) < zds->fParams.frameContentSize) { + RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small"); + } + + /* Consume header (see ZSTDds_decodeFrameHeader) */ + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + + if (zds->format == ZSTD_f_zstd1 + && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), ""); + zds->expected = ZSTD_blockHeaderSize; + zds->stage = ZSTDds_decodeBlockHeader; + } + + /* control buffer memory usage */ + DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)", + (U32)(zds->fParams.windowSize >>10), + (U32)(zds->maxWindowSize >> 10) ); + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); + if (zds->maxBlockSizeParam != 0) + zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered + ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); + + { int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize); + int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds); + + if (tooSmall || tooLarge) { + size_t const bufferSize = neededInBuffSize + neededOutBuffSize; + DEBUGLOG(4, "inBuff : from %u to %u", + (U32)zds->inBuffSize, (U32)neededInBuffSize); + DEBUGLOG(4, "outBuff : from %u to %u", + (U32)zds->outBuffSize, (U32)neededOutBuffSize); + if (zds->staticSize) { /* static DCtx */ + DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize); + assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */ + RETURN_ERROR_IF( + bufferSize > zds->staticSize - sizeof(ZSTD_DCtx), + memory_allocation, ""); + } else { + ZSTD_customFree(zds->inBuff, zds->customMem); + zds->inBuffSize = 0; + zds->outBuffSize = 0; + zds->inBuff = (char*)ZSTD_customMalloc(bufferSize, zds->customMem); + RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, ""); + } + zds->inBuffSize = neededInBuffSize; + zds->outBuff = zds->inBuff + zds->inBuffSize; + zds->outBuffSize = neededOutBuffSize; + } } } + zds->streamStage = zdss_read; + ZSTD_FALLTHROUGH; + + case zdss_read: + DEBUGLOG(5, "stage zdss_read"); + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)); + DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize); + if (neededInSize==0) { /* end of frame */ + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); + assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; + } } + if (ip==iend) { someMoreWork = 0; break; } /* no more input */ + zds->streamStage = zdss_load; + ZSTD_FALLTHROUGH; + + case zdss_load: + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); + size_t const toLoad = neededInSize - zds->inPos; + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ + assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { + RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos, + corruption_detected, + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } + if (loadedSize != 0) { + /* ip may be NULL */ + ip += loadedSize; + zds->inPos += loadedSize; + } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ + zds->inPos = 0; /* input is consumed */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), ""); + /* Function modifies the stage so we must break */ + break; + } + case zdss_flush: + { + size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); + + op = op ? op + flushedSize : op; + + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) + && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); + zds->outStart = zds->outEnd = 0; + } + break; + } } + /* cannot complete flush */ + someMoreWork = 0; + break; + + default: + assert(0); /* impossible */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ + input->pos = (size_t)(ip - (const char*)(input->src)); + output->pos = (size_t)(op - (char*)(output->dst)); + + /* Update the expected output buffer for ZSTD_obm_stable. */ + zds->expectedOutBuffer = *output; + + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { + RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); + RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { + zds->noForwardProgress = 0; + } + { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds); + if (!nextSrcSizeHint) { /* frame fully decoded */ + if (zds->outEnd == zds->outStart) { /* output fully flushed */ + if (zds->hostageByte) { + if (input->pos >= input->size) { + /* can't release hostage (not present) */ + zds->streamStage = zdss_read; + return 1; + } + input->pos++; /* release hostage */ + } /* zds->hostageByte */ + return 0; + } /* zds->outEnd == zds->outStart */ + if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */ + input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */ + zds->hostageByte=1; + } + return 1; + } /* nextSrcSizeHint==0 */ + nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block); /* preload header of next block */ + assert(zds->inPos <= nextSrcSizeHint); + nextSrcSizeHint -= zds->inPos; /* part already loaded*/ + return nextSrcSizeHint; + } +} + +size_t ZSTD_decompressStream_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) +{ + ZSTD_outBuffer output; + ZSTD_inBuffer input; + output.dst = dst; + output.size = dstCapacity; + output.pos = *dstPos; + input.src = src; + input.size = srcSize; + input.pos = *srcPos; + { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; + } +} diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c new file mode 100644 index 0000000..6174a25 --- /dev/null +++ b/lib/decompress/zstd_decompress_block.c @@ -0,0 +1,2217 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* zstd_decompress_block : + * this module takes care of decompressing _compressed_ block */ + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ +#include "../common/compiler.h" /* prefetch */ +#include "../common/mem.h" /* low level memory routines */ +#include +#define FSE_STATIC_LINKING_ONLY +#include "../common/fse.h" +#include "../common/huf.h" +#include "../common/zstd_internal.h" +#include "zstd_decompress_internal.h" /* ZSTD_DCtx */ +#include "zstd_decompress_block.h" +#include "../common/bits.h" /* ZSTD_highbit32 */ + +/*_******************************************************* +* Macros +**********************************************************/ + +/* These two optional macros force the use one way or another of the two + * ZSTD_decompressSequences implementations. You can't force in both directions + * at the same time. + */ +#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!" +#endif + + +/*_******************************************************* +* Memory operations +**********************************************************/ +static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + + +/*-************************************************************* + * Block decoding + ***************************************************************/ + +static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) +{ + size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; + assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); + return blockSizeMax; +} + +/*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr) +{ + RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, ""); + + { U32 const cBlockHeader = MEM_readLE24(src); + U32 const cSize = cBlockHeader >> 3; + bpPtr->lastBlock = cBlockHeader & 1; + bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); + bpPtr->origSize = cSize; /* only useful for RLE */ + if (bpPtr->blockType == bt_rle) return 1; + RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, ""); + return cSize; + } +} + +/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ +static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) +{ + size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + assert(litSize <= blockSizeMax); + assert(dctx->isFrameDecompression || streaming == not_streaming); + assert(expectedWriteSize <= blockSizeMax); + if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { + /* If we aren't streaming, we can just put the literals after the output + * of the current block. We don't need to worry about overwriting the + * extDict of our window, because it doesn't exist. + * So if we have space after the end of the block, just put it there. + */ + dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; + } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { + /* Literals fit entirely within the extra buffer, put them there to avoid + * having to split the literals. + */ + dctx->litBuffer = dctx->litExtraBuffer; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; + } else { + assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); + /* Literals must be split between the output block and the extra lit + * buffer. We fill the extra lit buffer with the tail of the literals, + * and put the rest of the literals at the end of the block, with + * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. + * This MUST not write more than our maxBlockSize beyond dst, because in + * streaming mode, that could overwrite part of our extDict window. + */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; + } else { + /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; + assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } +} + +/*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current + * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) +{ + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); + + { const BYTE* const istart = (const BYTE*) src; + SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3); + size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { + case set_repeat: + DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); + RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); + ZSTD_FALLTHROUGH; + + case set_compressed: + RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + int const flags = 0 + | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) + | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ + /* 2 - 2 - 10 - 10 */ + singleStream = !lhlCode; + lhSize = 3; + litSize = (lhc >> 4) & 0x3FF; + litCSize = (lhc >> 14) & 0x3FF; + break; + case 2: + /* 2 - 2 - 14 - 14 */ + lhSize = 4; + litSize = (lhc >> 4) & 0x3FFF; + litCSize = lhc >> 18; + break; + case 3: + /* 2 - 2 - 18 - 18 */ + lhSize = 5; + litSize = (lhc >> 4) & 0x3FFFF; + litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + if (!singleStream) + RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, + "Not enough literals (%zu) for the 4-streams mode (min %u)", + litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); + + /* prefetch huffman table if cold */ + if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { + PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable)); + } + + if (litEncType==set_repeat) { + if (singleStream) { + hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->HUFptr, flags); + } else { + assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); + hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->HUFptr, flags); + } + } else { + if (singleStream) { +#if defined(HUF_FORCE_DECOMPRESS_X2) + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), flags); +#else + hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), flags); +#endif + } else { + hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { + assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; + assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); + + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + dctx->litEntropy = 1; + if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; + return litCSize + lhSize; + } + + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ + RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); + } + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + return lhSize+litSize; + } + /* direct reference into compressed stream */ + dctx->litPtr = istart+lhSize; + dctx->litSize = litSize; + dctx->litBufferEnd = dctx->litPtr + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; + return lhSize+litSize; + } + + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); + } + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + return lhSize+1; + } + default: + RETURN_ERROR(corruption_detected, "impossible"); + } + } +} + +/* Hidden declaration for fullbench */ +size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, + void* dst, size_t dstCapacity); +size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, + void* dst, size_t dstCapacity) +{ + dctx->isFrameDecompression = 0; + return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); +} + +/* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions + * They were generated programmatically with following method : + * - start from default distributions, present in /lib/common/zstd_internal.h + * - generate tables normally, using ZSTD_buildFSETable() + * - printout the content of tables + * - prettify output, report below, test with fuzzer to ensure it's correct */ + +/* Default FSE distribution table for Literal Lengths */ +static const ZSTD_seqSymbol LL_defaultDTable[(1<tableLog = 0; + DTableH->fastMode = 0; + + cell->nbBits = 0; + cell->nextState = 0; + assert(nbAddBits < 255); + cell->nbAdditionalBits = nbAddBits; + cell->baseValue = baseValue; +} + + +/* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) + * cannot fail if input is valid => + * all inputs are presumed validated at this stage */ +FORCE_INLINE_TEMPLATE +void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U8* nbAdditionalBits, + unsigned tableLog, void* wksp, size_t wkspSize) +{ + ZSTD_seqSymbol* const tableDecode = dt+1; + U32 const maxSV1 = maxSymbolValue + 1; + U32 const tableSize = 1 << tableLog; + + U16* symbolNext = (U16*)wksp; + BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1); + U32 highThreshold = tableSize - 1; + + + /* Sanity Checks */ + assert(maxSymbolValue <= MaxSeq); + assert(tableLog <= MaxFSELog); + assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE); + (void)wkspSize; + /* Init, lay down lowprob symbols */ + { ZSTD_seqSymbol_header DTableH; + DTableH.tableLog = tableLog; + DTableH.fastMode = 1; + { S16 const largeLimit= (S16)(1 << (tableLog-1)); + U32 s; + for (s=0; s= largeLimit) DTableH.fastMode=0; + assert(normalizedCounter[s]>=0); + symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } + + /* Spread symbols */ + assert(tableSize <= 512); + /* Specialized symbol spreading for the case when there are + * no low probability (-1 count) symbols. When compressing + * small blocks we avoid low probability symbols to hit this + * case, since header decoding speed matters more. + */ + if (highThreshold == tableSize - 1) { + size_t const tableMask = tableSize-1; + size_t const step = FSE_TABLESTEP(tableSize); + /* First lay down the symbols in order. + * We use a uint64_t to lay down 8 bytes at a time. This reduces branch + * misses since small blocks generally have small table logs, so nearly + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ + { + U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s=0); + pos += (size_t)n; + } + } + /* Now we spread those positions across the table. + * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. + * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; + size_t s; + size_t const unroll = 2; + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ + for (s = 0; s < (size_t)tableSize; s += unroll) { + size_t u; + for (u = 0; u < unroll; ++u) { + size_t const uPosition = (position + (u * step)) & tableMask; + tableDecode[uPosition].baseValue = spread[s + u]; + } + position = (position + (unroll * step)) & tableMask; + } + assert(position == 0); + } + } else { + U32 const tableMask = tableSize-1; + U32 const step = FSE_TABLESTEP(tableSize); + U32 s, position = 0; + for (s=0; s highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } + + /* Build Decoding table */ + { + U32 u; + for (u=0; u max, corruption_detected, ""); + { U32 const symbol = *(const BYTE*)src; + U32 const baseline = baseValue[symbol]; + U8 const nbBits = nbAdditionalBits[symbol]; + ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); + } + *DTablePtr = DTableSpace; + return 1; + case set_basic : + *DTablePtr = defaultTable; + return 0; + case set_repeat: + RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, ""); + /* prefetch FSE table if used */ + if (ddictIsCold && (nbSeq > 24 /* heuristic */)) { + const void* const pStart = *DTablePtr; + size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog)); + PREFETCH_AREA(pStart, pSize); + } + return 0; + case set_compressed : + { unsigned tableLog; + S16 norm[MaxSeq+1]; + size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); + RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); + RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); + ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2); + *DTablePtr = DTableSpace; + return headerSize; + } + default : + assert(0); + RETURN_ERROR(GENERIC, "impossible"); + } +} + +size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* const iend = istart + srcSize; + const BYTE* ip = istart; + int nbSeq; + DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); + + /* check */ + RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, ""); + + /* SeqHead */ + nbSeq = *ip++; + if (nbSeq > 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); + nbSeq = MEM_readLE16(ip) + LONGNBSEQ; + ip+=2; + } else { + RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); + nbSeq = ((nbSeq-0x80)<<8) + *ip++; + } + } + *nbSeqPtr = nbSeq; + + if (nbSeq == 0) { + /* No sequence : section ends immediately */ + RETURN_ERROR_IF(ip != iend, corruption_detected, + "extraneous data present in the Sequences section"); + return (size_t)(ip - istart); + } + + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ + RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ + { SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6); + SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3); + SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ + assert(ip <= iend); + { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, + LLtype, MaxLL, LLFSELog, + ip, (size_t)(iend-ip), + LL_base, LL_bits, + LL_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + ZSTD_DCtx_get_bmi2(dctx)); + RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += llhSize; + } + + assert(ip <= iend); + { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, + OFtype, MaxOff, OffFSELog, + ip, (size_t)(iend-ip), + OF_base, OF_bits, + OF_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + ZSTD_DCtx_get_bmi2(dctx)); + RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += ofhSize; + } + + assert(ip <= iend); + { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, + MLtype, MaxML, MLFSELog, + ip, (size_t)(iend-ip), + ML_base, ML_bits, + ML_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + ZSTD_DCtx_get_bmi2(dctx)); + RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += mlhSize; + } + } + + return (size_t)(ip-istart); +} + + +typedef struct { + size_t litLength; + size_t matchLength; + size_t offset; +} seq_t; + +typedef struct { + size_t state; + const ZSTD_seqSymbol* table; +} ZSTD_fseState; + +typedef struct { + BIT_DStream_t DStream; + ZSTD_fseState stateLL; + ZSTD_fseState stateOffb; + ZSTD_fseState stateML; + size_t prevOffset[ZSTD_REP_NUM]; +} seqState_t; + +/*! ZSTD_overlapCopy8() : + * Copies 8 bytes from ip to op and updates op and ip where ip <= op. + * If the offset is < 8 then the offset is spread to at least 8 bytes. + * + * Precondition: *ip <= *op + * Postcondition: *op - *op >= 8 + */ +HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) +{ + assert(*ip <= *op); + if (offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ + static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ + int const sub2 = dec64table[offset]; + (*op)[0] = (*ip)[0]; + (*op)[1] = (*ip)[1]; + (*op)[2] = (*ip)[2]; + (*op)[3] = (*ip)[3]; + *ip += dec32table[offset]; + ZSTD_copy4(*op+4, *ip); + *ip -= sub2; + } else { + ZSTD_copy8(*op, *ip); + } + *ip += 8; + *op += 8; + assert(*op - *ip >= 8); +} + +/*! ZSTD_safecopy() : + * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer + * and write up to 16 bytes past oend_w (op >= oend_w is allowed). + * This function is only called in the uncommon case where the sequence is near the end of the block. It + * should be fast for a single long sequence, but can be slow for several short sequences. + * + * @param ovtype controls the overlap detection + * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. + * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. + * The src buffer must be before the dst buffer. + */ +static void +ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, size_t length, ZSTD_overlap_e ovtype) +{ + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + + assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) || + (ovtype == ZSTD_overlap_src_before_dst && diff >= 0)); + + if (length < 8) { + /* Handle short lengths. */ + while (op < oend) *op++ = *ip++; + return; + } + if (ovtype == ZSTD_overlap_src_before_dst) { + /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ + assert(length >= 8); + assert(diff > 0); + ZSTD_overlapCopy8(&op, &ip, (size_t)diff); + length -= 8; + assert(op - ip >= 8); + assert(op <= oend); + } + + if (oend <= oend_w) { + /* No risk of overwrite. */ + ZSTD_wildcopy(op, ip, length, ovtype); + return; + } + if (op <= oend_w) { + /* Wildcopy until we get close to the end. */ + assert(oend > oend_w); + ZSTD_wildcopy(op, ip, (size_t)(oend_w - op), ovtype); + ip += oend_w - op; + op += oend_w - op; + } + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + +/* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, size_t length) +{ + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + + if (length < 8 || diff > -8) { + /* Handle short lengths, close overlaps, and dst not before src. */ + while (op < oend) *op++ = *ip++; + return; + } + + if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { + ZSTD_wildcopy(op, ip, (size_t)(oend - WILDCOPY_OVERLENGTH - op), ZSTD_no_overlap); + ip += oend - WILDCOPY_OVERLENGTH - op; + op += oend - WILDCOPY_OVERLENGTH - op; + } + + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + +/* ZSTD_execSequenceEnd(): + * This version handles cases that are near the end of the output buffer. It requires + * more careful checks to make sure there is no overflow. By separating out these hard + * and unlikely cases, we can speed up the common cases. + * + * NOTE: This function needs to be fast for a single long sequence, but doesn't need + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ +FORCE_NOINLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart - match); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = (size_t)(dictEnd - match); + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +/* ZSTD_execSequenceEndSplitLitBuffer(): + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. + */ +FORCE_NOINLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); + ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart - match); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = (size_t)(dictEnd - match); + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +HINT_INLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + +#if defined(__aarch64__) + /* prefetch sequence starting from match that will be used for copy later */ + PREFETCH_L1(match); +#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. + */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = (size_t)(dictEnd - match); + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, sequence.matchLength - 8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + +HINT_INLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. + */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = (size_t)(dictEnd - match); + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, sequence.matchLength-8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + + +static void +ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) +{ + const void* ptr = dt; + const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", + (U32)DStatePtr->state, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +FORCE_INLINE_TEMPLATE void +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) +{ + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = nextState + lowBits; +} + +/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum + * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ + (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ + ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ + : 0) + +typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + +/** + * ZSTD_decodeSequence(): + * @p longOffsets : tells the decoder to reload more bit while decoding large offsets + * only used in 32-bit mode + * @return : Sequence (litL + matchL + offset) + */ +FORCE_INLINE_TEMPLATE seq_t +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) +{ + seq_t seq; + /* + * ZSTD_seqSymbol is a 64 bits wide structure. + * It can be loaded in one operation + * and its fields extracted by simply shifting or bit-extracting on aarch64. + * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh + * operations that cause performance drop. This can be avoided by using this + * ZSTD_memcpy hack. + */ +#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) + ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; + ZSTD_seqSymbol* const llDInfo = &llDInfoS; + ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; + ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; + ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); +#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; +#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; + BYTE const llBits = llDInfo->nbAdditionalBits; + BYTE const mlBits = mlDInfo->nbAdditionalBits; + BYTE const ofBits = ofDInfo->nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + U16 const llNext = llDInfo->nextState; + U16 const mlNext = mlDInfo->nextState; + U16 const ofNext = ofDInfo->nextState; + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; + + assert(llBits <= MaxLLBits); + assert(mlBits <= MaxMLBits); + assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only + * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; + if (ofBits > 1) { + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); + ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + /* Always read extra bits, this keeps the logic simple, + * avoids branches, and avoids accidentally reading 0 bits. + */ + U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } else { + U32 const ll0 = (llDInfo->baseValue == 0); + if (LIKELY((ofBits == 0))) { + offset = seqState->prevOffset[ll0]; + seqState->prevOffset[1] = seqState->prevOffset[!ll0]; + seqState->prevOffset[0] = offset; + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; + } + + if (mlBits > 0) + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + if (llBits > 0) + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + if (!isLastSeq) { + /* don't update FSE state for last Sequence */ + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ + BIT_reloadDStream(&seqState->DStream); + } + } + + return seq; +} + +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +#if DEBUGLEVEL >= 1 +static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) +{ + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ + if (dctx->dictContentEndForFuzzing == NULL) return 0; + /* Dictionary is our prefix. */ + if (prefixStart == dctx->dictContentBeginForFuzzing) return 1; + /* Dictionary is not our ext-dict. */ + if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0; + /* Dictionary is not within our window size. */ + if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0; + /* Dictionary is active. */ + return 1; +} +#endif + +static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) +{ +#if DEBUGLEVEL >= 1 + if (dctx->isFrameDecompression) { + size_t const windowSize = dctx->fParams.windowSize; + size_t const sequenceSize = seq.litLength + seq.matchLength; + BYTE const* const oLitEnd = op + seq.litLength; + DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + assert(op <= oend); + assert((size_t)(oend - op) >= sequenceSize); + assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); + if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { + size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); + /* Offset must be within the dictionary. */ + assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); + assert(seq.offset <= windowSize + dictSize); + } else { + /* Offset must be within our window. */ + assert(seq.offset <= windowSize); + } + } +#else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +#endif +} +#endif + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + + +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = (BYTE*)ZSTD_maybeNullPtrAdd(ostart, (ptrdiff_t)maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + + /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; + { U32 i; for (i=0; ientropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, seqStart, seqSize)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + + ZSTD_STATIC_ASSERT( + BIT_DStream_unfinished < BIT_DStream_completed && + BIT_DStream_endOfBuffer < BIT_DStream_completed && + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ + { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression + * speed swings based on the alignment of the decompression loop. This + * performance swing is caused by parts of the decompression loop falling + * out of the DSB. The entire decompression loop should fit in the DSB, + * when it can't we get much worse performance. You can measure if you've + * hit the good case or the bad case with this perf command for some + * compressed file test.zst: + * + * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ + * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst + * + * If you see most cycles served out of the MITE you've hit the bad case. + * If you see most cycles served out of the DSB you've hit the good case. + * If it is pretty even then you may be in an okay case. + * + * This issue has been reproduced on the following CPUs: + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 + * Use Instruments->Counters to get DSB/MITE cycles. + * I never got performance swings, but I was able to + * go from the good case of mostly DSB to half of the + * cycles served from MITE. + * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k + * + * I haven't been able to reproduce the instability or DSB misses on any + * of the following CPUS: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH + * - Skylake + * + * Alignment is done for each of the three major decompression loops: + * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer + * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer + * - ZSTD_decompressSequences_body + * Alignment choices are made to minimize large swings on bad cases and influence on performance + * from changes external to this code, rather than to overoptimize on the current commit. + * + * If you are seeing performance stability this script can help test. + * It tests on 4 commits in zstd where I saw performance change. + * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 + */ +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); +# if __GNUC__ >= 7 + /* good for gcc-7, gcc-9, and gcc-11 */ + __asm__("nop"); + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +# if __GNUC__ == 8 || __GNUC__ == 10 + /* good for gcc-8 and gcc-10 */ + __asm__("nop"); + __asm__(".p2align 3"); +# endif +# endif +#endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ + for ( ; nbSeq; nbSeq--) { + sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + if (litPtr + sequence.litLength > dctx->litBufferEnd) break; + { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + } } + DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = (size_t)(dctx->litBufferEnd - litPtr); + assert(dctx->litBufferEnd >= litPtr); + DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); + if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + } + nbSeq--; + } + } + + if (nbSeq > 0) { + /* there is remaining lit from extra buffer */ + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ != 7 + /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# elif __GNUC__ >= 11 + __asm__(".p2align 3"); +# else + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif +#endif + + for ( ; nbSeq ; nbSeq--) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); + DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); + RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + if (dctx->litBufferLocation == ZSTD_split) { + /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ + size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); + DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } + /* copy last literals from internal buffer */ + { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); + DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } } + + DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); + return (size_t)(op - ostart); +} + +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = (dctx->litBufferLocation == ZSTD_not_in_dst) ? + (BYTE*)ZSTD_maybeNullPtrAdd(ostart, (ptrdiff_t)maxDstSize) : + dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; + { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, seqStart, seqSize)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ >= 7 + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# else + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif +#endif + + for ( ; nbSeq ; nbSeq--) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + } + + /* check if reached exact end */ + assert(nbSeq == 0); + RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = (size_t)(litEnd - litPtr); + DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } } + + DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); + return (size_t)(op - ostart); +} + +static size_t +ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} + +static size_t +ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +FORCE_INLINE_TEMPLATE + +size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) +{ + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; + /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. + * No consequence though : memory address is only used for prefetching, not for dereferencing */ + const BYTE* const match = (const BYTE*)ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, (ptrdiff_t)prefetchPos), (ptrdiff_t)sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(ZSTD_wrappedPtrAdd(match, CACHELINE_SIZE)); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +} + +/* This decoding function employs prefetching + * to reduce latency impact of cache misses. + * It's generally employed when block contains a significant portion of long-distance matches + * or when coupled with a "cold" dictionary */ +FORCE_INLINE_TEMPLATE size_t +ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = (dctx->litBufferLocation == ZSTD_in_dst) ? + dctx->litBuffer : + (BYTE*)ZSTD_maybeNullPtrAdd(ostart, (ptrdiff_t)maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); + + /* Regen sequences */ + if (nbSeq) { +#define STORED_SEQS 8 +#define STORED_SEQS_MASK (STORED_SEQS-1) +#define ADVANCED_SEQS STORED_SEQS + seq_t sequences[STORED_SEQS]; + int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); + seqState_t seqState; + int seqNb; + size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ + + dctx->fseEntropy = 1; + { int i; for (i=0; ientropy.rep[i]; } + assert(dst != NULL); + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, seqStart, seqSize)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ + for (seqNb=0; seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = (size_t)(dctx->litBufferEnd - litPtr); + assert(dctx->litBufferEnd >= litPtr); + if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ + size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } + } + RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + + /* finish queue */ + seqNb -= seqAdvance; + for ( ; seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = (size_t)(dctx->litBufferEnd - litPtr); + assert(dctx->litBufferEnd >= litPtr); + if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } + } + else + { + size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } + } + + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); + assert(litBufferEnd >= litPtr); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + } + { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); + assert(litBufferEnd >= litPtr); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return (size_t)(op - ostart); +} + +static size_t +ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + + +#if DYNAMIC_BMI2 + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +static BMI2_TARGET_ATTRIBUTE size_t +ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +#endif /* DYNAMIC_BMI2 */ + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +static size_t +ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + DEBUGLOG(5, "ZSTD_decompressSequences"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } +#endif + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +static size_t +ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } +#endif + return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +/* ZSTD_decompressSequencesLong() : + * decompression function triggered when a minimum share of offsets is considered "long", + * aka out of cache. + * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance". + * This function will try to mitigate main memory latency through the use of prefetching */ +static size_t +ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } +#endif + return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + +/** + * @returns The total size of the history referenceable by zstd, including + * both the prefix and the extDict. At @p op any offset larger than this + * is invalid. + */ +static size_t ZSTD_totalHistorySize(void* curPtr, const void* virtualStart) +{ + return (size_t)((char*)curPtr - (const char*)virtualStart); +} + +typedef struct { + unsigned longOffsetShare; + unsigned maxNbAdditionalBits; +} ZSTD_OffsetInfo; + +/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) + * compared to maximum possible of (1< 22) info.longOffsetShare += 1; + } + + assert(tableLog <= OffFSELog); + info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + + return info; +} + +/** + * @returns The maximum offset we can decode in one read of our bitstream, without + * reloading more bits in the middle of the offset bits read. Any offsets larger + * than this must use the long offset decoder. + */ +static size_t ZSTD_maxShortOffset(void) +{ + if (MEM_64bits()) { + /* We can decode any offset without reloading bits. + * This might change if the max window size grows. + */ + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + return (size_t)-1; + } else { + /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. + * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. + * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. + */ + size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; + size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; + assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); + return maxOffset; + } +} + +size_t +ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, const streaming_operation streaming) +{ /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; + DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); + + /* Note : the wording of the specification + * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). + * This generally does not happen, as it makes little sense, + * since an uncompressed block would feature same size and have no decompression cost. + * Also, note that decoder from reference libzstd before < v1.5.4 + * would consider this edge case as an error. + * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) + * for broader compatibility with the deployed ecosystem of zstd decoders */ + RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; + } + + /* Build Decoding Tables */ + { + /* Compute the maximum block size, which must also work when !frame and fParams are unset. + * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. + */ + size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); + size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd(dst, (ptrdiff_t)blockSizeMax), (BYTE const*)dctx->virtualStart); + /* isLongOffset must be true if there are long offsets. + * Offsets are long if they are larger than ZSTD_maxShortOffset(). + * We don't expect that to be the case in 64-bit mode. + * + * We check here to see if our history is large enough to allow long offsets. + * If it isn't, then we can't possible have (valid) long offsets. If the offset + * is invalid, then it is okay to read it incorrectly. + * + * If isLongOffsets is true, then we will later check our decoding table to see + * if it is even possible to generate long offsets. + */ + ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. + */ +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; +#else + /* Set to 1 to avoid computing offset info if we don't need to. + * Otherwise this value is ignored. + */ + int usePrefetchDecoder = 1; +#endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); + if (ZSTD_isError(seqHSize)) return seqHSize; + ip += seqHSize; + srcSize -= seqHSize; + + RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, + "invalid dst"); + + /* If we could potentially have long offsets, or we might want to use the prefetch decoder, + * compute information about the share of long offsets, and the maximum nbAdditionalBits. + * NOTE: could probably use a larger nbSeq limit + */ + if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { + ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); + if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { + /* If isLongOffset, but the maximum number of additional bits that we see in our table is small + * enough, then we know it is impossible to have too long an offset in this block, so we can + * use the regular offset decoder. + */ + isLongOffset = ZSTD_lo_isRegularOffset; + } + if (!usePrefetchDecoder) { + U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ + usePrefetchDecoder = (info.longOffsetShare >= minShare); + } + } + + dctx->ddictIsCold = 0; + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + if (usePrefetchDecoder) { +#else + (void)usePrefetchDecoder; + { +#endif +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); +#endif + } + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) + return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); +#endif + } +} + + +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) +{ + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ + dctx->dictEnd = dctx->previousDstEnd; + dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dst; + dctx->previousDstEnd = dst; + } +} + + +size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + size_t dSize; + dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); + FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; +} + + +/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ +size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); +} diff --git a/lib/deprecated/zbuff_compress.c b/lib/deprecated/zbuff_compress.c new file mode 100644 index 0000000..1d86821 --- /dev/null +++ b/lib/deprecated/zbuff_compress.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + + +/* ************************************* +* Dependencies +***************************************/ +#define ZBUFF_STATIC_LINKING_ONLY +#include "zbuff.h" +#include "../common/error_private.h" + + +/*-*********************************************************** +* Streaming compression +* +* A ZBUFF_CCtx object is required to track streaming operation. +* Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources. +* Use ZBUFF_compressInit() to start a new compression operation. +* ZBUFF_CCtx objects can be reused multiple times. +* +* Use ZBUFF_compressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input. +* The content of dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change dst . +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency) +* or an error code, which can be tested using ZBUFF_isError(). +* +* ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer. +* Note that it will not output more than *dstCapacityPtr. +* Therefore, some content might still be left into its internal buffer if dst buffer is too small. +* @return : nb of bytes still present into internal buffer (0 if it's empty) +* or an error code, which can be tested using ZBUFF_isError(). +* +* ZBUFF_compressEnd() instructs to finish a frame. +* It will perform a flush and write frame epilogue. +* Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small. +* @return : nb of bytes still present into internal buffer (0 if it's empty) +* or an error code, which can be tested using ZBUFF_isError(). +* +* Hint : recommended buffer sizes (not compulsory) +* input : ZSTD_BLOCKSIZE_MAX (128 KB), internal unit size, it improves latency to use this value. +* output : ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + ZBUFF_endFrameSize : ensures it's always possible to write/flush/end a full block at best speed. +* ***********************************************************/ + +ZBUFF_CCtx* ZBUFF_createCCtx(void) +{ + return ZSTD_createCStream(); +} + +ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createCStream_advanced(customMem); +} + +size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc) +{ + return ZSTD_freeCStream(zbc); +} + + +/* ====== Initialization ====== */ + +size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* preserve "0 == unknown" behavior */ + FORWARD_IF_ERROR(ZSTD_CCtx_reset(zbc, ZSTD_reset_session_only), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setPledgedSrcSize(zbc, pledgedSrcSize), ""); + + FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_windowLog, params.cParams.windowLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_hashLog, params.cParams.hashLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_chainLog, params.cParams.chainLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_searchLog, params.cParams.searchLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_minMatch, params.cParams.minMatch), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_targetLength, params.cParams.targetLength), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_strategy, params.cParams.strategy), ""); + + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_contentSizeFlag, params.fParams.contentSizeFlag), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_checksumFlag, params.fParams.checksumFlag), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_dictIDFlag, params.fParams.noDictIDFlag), ""); + + FORWARD_IF_ERROR(ZSTD_CCtx_loadDictionary(zbc, dict, dictSize), ""); + return 0; +} + +size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel) +{ + FORWARD_IF_ERROR(ZSTD_CCtx_reset(zbc, ZSTD_reset_session_only), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_compressionLevel, compressionLevel), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_loadDictionary(zbc, dict, dictSize), ""); + return 0; +} + +size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel) +{ + return ZSTD_initCStream(zbc, compressionLevel); +} + +/* ====== Compression ====== */ + + +size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr) +{ + size_t result; + ZSTD_outBuffer outBuff; + ZSTD_inBuffer inBuff; + outBuff.dst = dst; + outBuff.pos = 0; + outBuff.size = *dstCapacityPtr; + inBuff.src = src; + inBuff.pos = 0; + inBuff.size = *srcSizePtr; + result = ZSTD_compressStream(zbc, &outBuff, &inBuff); + *dstCapacityPtr = outBuff.pos; + *srcSizePtr = inBuff.pos; + return result; +} + + + +/* ====== Finalize ====== */ + +size_t ZBUFF_compressFlush(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr) +{ + size_t result; + ZSTD_outBuffer outBuff; + outBuff.dst = dst; + outBuff.pos = 0; + outBuff.size = *dstCapacityPtr; + result = ZSTD_flushStream(zbc, &outBuff); + *dstCapacityPtr = outBuff.pos; + return result; +} + + +size_t ZBUFF_compressEnd(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr) +{ + size_t result; + ZSTD_outBuffer outBuff; + outBuff.dst = dst; + outBuff.pos = 0; + outBuff.size = *dstCapacityPtr; + result = ZSTD_endStream(zbc, &outBuff); + *dstCapacityPtr = outBuff.pos; + return result; +} + + + +/* ************************************* +* Tool functions +***************************************/ +size_t ZBUFF_recommendedCInSize(void) { return ZSTD_CStreamInSize(); } +size_t ZBUFF_recommendedCOutSize(void) { return ZSTD_CStreamOutSize(); } diff --git a/lib/dictBuilder/divsufsort.c b/lib/dictBuilder/divsufsort.c new file mode 100644 index 0000000..e5b117b --- /dev/null +++ b/lib/dictBuilder/divsufsort.c @@ -0,0 +1,1913 @@ +/* + * divsufsort.c for libdivsufsort-lite + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/*- Compiler specifics -*/ +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wshorten-64-to-32" +#endif + +#if defined(_MSC_VER) +# pragma warning(disable : 4244) +# pragma warning(disable : 4127) /* C4127 : Condition expression is constant */ +#endif + + +/*- Dependencies -*/ +#include +#include +#include + +#include "divsufsort.h" + +/*- Constants -*/ +#if defined(INLINE) +# undef INLINE +#endif +#if !defined(INLINE) +# define INLINE __inline +#endif +#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) +# undef ALPHABET_SIZE +#endif +#if !defined(ALPHABET_SIZE) +# define ALPHABET_SIZE (256) +#endif +#define BUCKET_A_SIZE (ALPHABET_SIZE) +#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) +#if defined(SS_INSERTIONSORT_THRESHOLD) +# if SS_INSERTIONSORT_THRESHOLD < 1 +# undef SS_INSERTIONSORT_THRESHOLD +# define SS_INSERTIONSORT_THRESHOLD (1) +# endif +#else +# define SS_INSERTIONSORT_THRESHOLD (8) +#endif +#if defined(SS_BLOCKSIZE) +# if SS_BLOCKSIZE < 0 +# undef SS_BLOCKSIZE +# define SS_BLOCKSIZE (0) +# elif 32768 <= SS_BLOCKSIZE +# undef SS_BLOCKSIZE +# define SS_BLOCKSIZE (32767) +# endif +#else +# define SS_BLOCKSIZE (1024) +#endif +/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */ +#if SS_BLOCKSIZE == 0 +# define SS_MISORT_STACKSIZE (96) +#elif SS_BLOCKSIZE <= 4096 +# define SS_MISORT_STACKSIZE (16) +#else +# define SS_MISORT_STACKSIZE (24) +#endif +#define SS_SMERGE_STACKSIZE (32) +#define TR_INSERTIONSORT_THRESHOLD (8) +#define TR_STACKSIZE (64) + + +/*- Macros -*/ +#ifndef SWAP +# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0) +#endif /* SWAP */ +#ifndef MIN +# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b)) +#endif /* MIN */ +#ifndef MAX +# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b)) +#endif /* MAX */ +#define STACK_PUSH(_a, _b, _c, _d)\ + do {\ + assert(ssize < STACK_SIZE);\ + stack[ssize].a = (_a), stack[ssize].b = (_b),\ + stack[ssize].c = (_c), stack[ssize++].d = (_d);\ + } while(0) +#define STACK_PUSH5(_a, _b, _c, _d, _e)\ + do {\ + assert(ssize < STACK_SIZE);\ + stack[ssize].a = (_a), stack[ssize].b = (_b),\ + stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\ + } while(0) +#define STACK_POP(_a, _b, _c, _d)\ + do {\ + assert(0 <= ssize);\ + if(ssize == 0) { return; }\ + (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ + (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ + } while(0) +#define STACK_POP5(_a, _b, _c, _d, _e)\ + do {\ + assert(0 <= ssize);\ + if(ssize == 0) { return; }\ + (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ + (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\ + } while(0) +#define BUCKET_A(_c0) bucket_A[(_c0)] +#if ALPHABET_SIZE == 256 +#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)]) +#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)]) +#else +#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)]) +#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)]) +#endif + + +/*- Private Functions -*/ + +static const int lg_table[256]= { + -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 +}; + +#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) + +static INLINE +int +ss_ilg(int n) { +#if SS_BLOCKSIZE == 0 + return (n & 0xffff0000) ? + ((n & 0xff000000) ? + 24 + lg_table[(n >> 24) & 0xff] : + 16 + lg_table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]); +#elif SS_BLOCKSIZE < 256 + return lg_table[n]; +#else + return (n & 0xff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]; +#endif +} + +#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ + +#if SS_BLOCKSIZE != 0 + +static const int sqq_table[256] = { + 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, + 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, + 90, 91, 93, 94, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, +110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, +128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, +143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, +156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, +169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, +181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, +192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, +202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, +212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221, +221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, +230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, +239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, +247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 +}; + +static INLINE +int +ss_isqrt(int x) { + int y, e; + + if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; } + e = ((unsigned)x & 0xffff0000) ? + (((unsigned)x & 0xff000000) ? + 24 + lg_table[(x >> 24) & 0xff] : + 16 + lg_table[(x >> 16) & 0xff]) : + ((x & 0x0000ff00) ? + 8 + lg_table[(x >> 8) & 0xff] : + 0 + lg_table[(x >> 0) & 0xff]); + + if(e >= 16) { + y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); + if(e >= 24) { y = (y + 1 + x / y) >> 1; } + y = (y + 1 + x / y) >> 1; + } else if(e >= 8) { + y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; + } else { + return sqq_table[x] >> 4; + } + + return (x < (y * y)) ? y - 1 : y; +} + +#endif /* SS_BLOCKSIZE != 0 */ + + +/*---------------------------------------------------------------------------*/ + +/* Compares two suffixes. */ +static INLINE +int +ss_compare(const unsigned char *T, + const int *p1, const int *p2, + int depth) { + const unsigned char *U1, *U2, *U1n, *U2n; + + for(U1 = T + depth + *p1, + U2 = T + depth + *p2, + U1n = T + *(p1 + 1) + 2, + U2n = T + *(p2 + 1) + 2; + (U1 < U1n) && (U2 < U2n) && (*U1 == *U2); + ++U1, ++U2) { + } + + return U1 < U1n ? + (U2 < U2n ? *U1 - *U2 : 1) : + (U2 < U2n ? -1 : 0); +} + + +/*---------------------------------------------------------------------------*/ + +#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) + +/* Insertionsort for small size groups */ +static +void +ss_insertionsort(const unsigned char *T, const int *PA, + int *first, int *last, int depth) { + int *i, *j; + int t; + int r; + + for(i = last - 2; first <= i; --i) { + for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) { + do { *(j - 1) = *j; } while((++j < last) && (*j < 0)); + if(last <= j) { break; } + } + if(r == 0) { *j = ~*j; } + *(j - 1) = t; + } +} + +#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */ + + +/*---------------------------------------------------------------------------*/ + +#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) + +static INLINE +void +ss_fixdown(const unsigned char *Td, const int *PA, + int *SA, int i, int size) { + int j, k; + int v; + int c, d, e; + + for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { + d = Td[PA[SA[k = j++]]]; + if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; } + if(d <= c) { break; } + } + SA[i] = v; +} + +/* Simple top-down heapsort. */ +static +void +ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) { + int i, m; + int t; + + m = size; + if((size % 2) == 0) { + m--; + if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); } + } + + for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); } + if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); } + for(i = m - 1; 0 < i; --i) { + t = SA[0], SA[0] = SA[i]; + ss_fixdown(Td, PA, SA, 0, i); + SA[i] = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Returns the median of three elements. */ +static INLINE +int * +ss_median3(const unsigned char *Td, const int *PA, + int *v1, int *v2, int *v3) { + int *t; + if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); } + if(Td[PA[*v2]] > Td[PA[*v3]]) { + if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; } + else { return v3; } + } + return v2; +} + +/* Returns the median of five elements. */ +static INLINE +int * +ss_median5(const unsigned char *Td, const int *PA, + int *v1, int *v2, int *v3, int *v4, int *v5) { + int *t; + if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); } + if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); } + if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); } + if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); } + if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); } + if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; } + return v3; +} + +/* Returns the pivot element. */ +static INLINE +int * +ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) { + int *middle; + int t; + + t = last - first; + middle = first + t / 2; + + if(t <= 512) { + if(t <= 32) { + return ss_median3(Td, PA, first, middle, last - 1); + } else { + t >>= 2; + return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1); + } + } + t >>= 3; + first = ss_median3(Td, PA, first, first + t, first + (t << 1)); + middle = ss_median3(Td, PA, middle - t, middle, middle + t); + last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1); + return ss_median3(Td, PA, first, middle, last); +} + + +/*---------------------------------------------------------------------------*/ + +/* Binary partition for substrings. */ +static INLINE +int * +ss_partition(const int *PA, + int *first, int *last, int depth) { + int *a, *b; + int t; + for(a = first - 1, b = last;;) { + for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; } + for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { } + if(b <= a) { break; } + t = ~*b; + *b = *a; + *a = t; + } + if(first < a) { *first = ~*first; } + return a; +} + +/* Multikey introsort for medium size groups. */ +static +void +ss_mintrosort(const unsigned char *T, const int *PA, + int *first, int *last, + int depth) { +#define STACK_SIZE SS_MISORT_STACKSIZE + struct { int *a, *b, c; int d; } stack[STACK_SIZE]; + const unsigned char *Td; + int *a, *b, *c, *d, *e, *f; + int s, t; + int ssize; + int limit; + int v, x = 0; + + for(ssize = 0, limit = ss_ilg(last - first);;) { + + if((last - first) <= SS_INSERTIONSORT_THRESHOLD) { +#if 1 < SS_INSERTIONSORT_THRESHOLD + if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); } +#endif + STACK_POP(first, last, depth, limit); + continue; + } + + Td = T + depth; + if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); } + if(limit < 0) { + for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) { + if((x = Td[PA[*a]]) != v) { + if(1 < (a - first)) { break; } + v = x; + first = a; + } + } + if(Td[PA[*first] - 1] < v) { + first = ss_partition(PA, first, a, depth); + } + if((a - first) <= (last - a)) { + if(1 < (a - first)) { + STACK_PUSH(a, last, depth, -1); + last = a, depth += 1, limit = ss_ilg(a - first); + } else { + first = a, limit = -1; + } + } else { + if(1 < (last - a)) { + STACK_PUSH(first, a, depth + 1, ss_ilg(a - first)); + first = a, limit = -1; + } else { + last = a, depth += 1, limit = ss_ilg(a - first); + } + } + continue; + } + + /* choose pivot */ + a = ss_pivot(Td, PA, first, last); + v = Td[PA[*a]]; + SWAP(*first, *a); + + /* partition */ + for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + + a = first + (b - a), c = last - (d - c); + b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth); + + if((a - first) <= (last - c)) { + if((last - c) <= (c - b)) { + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + STACK_PUSH(c, last, depth, limit); + last = a; + } else if((a - first) <= (c - b)) { + STACK_PUSH(c, last, depth, limit); + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + last = a; + } else { + STACK_PUSH(c, last, depth, limit); + STACK_PUSH(first, a, depth, limit); + first = b, last = c, depth += 1, limit = ss_ilg(c - b); + } + } else { + if((a - first) <= (c - b)) { + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + STACK_PUSH(first, a, depth, limit); + first = c; + } else if((last - c) <= (c - b)) { + STACK_PUSH(first, a, depth, limit); + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + first = c; + } else { + STACK_PUSH(first, a, depth, limit); + STACK_PUSH(c, last, depth, limit); + first = b, last = c, depth += 1, limit = ss_ilg(c - b); + } + } + } else { + limit += 1; + if(Td[PA[*first] - 1] < v) { + first = ss_partition(PA, first, last, depth); + limit = ss_ilg(last - first); + } + depth += 1; + } + } +#undef STACK_SIZE +} + +#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ + + +/*---------------------------------------------------------------------------*/ + +#if SS_BLOCKSIZE != 0 + +static INLINE +void +ss_blockswap(int *a, int *b, int n) { + int t; + for(; 0 < n; --n, ++a, ++b) { + t = *a, *a = *b, *b = t; + } +} + +static INLINE +void +ss_rotate(int *first, int *middle, int *last) { + int *a, *b, t; + int l, r; + l = middle - first, r = last - middle; + for(; (0 < l) && (0 < r);) { + if(l == r) { ss_blockswap(first, middle, l); break; } + if(l < r) { + a = last - 1, b = middle - 1; + t = *a; + do { + *a-- = *b, *b-- = *a; + if(b < first) { + *a = t; + last = a; + if((r -= l + 1) <= l) { break; } + a -= 1, b = middle - 1; + t = *a; + } + } while(1); + } else { + a = first, b = middle; + t = *a; + do { + *a++ = *b, *b++ = *a; + if(last <= b) { + *a = t; + first = a + 1; + if((l -= r + 1) <= r) { break; } + a += 1, b = middle; + t = *a; + } + } while(1); + } + } +} + + +/*---------------------------------------------------------------------------*/ + +static +void +ss_inplacemerge(const unsigned char *T, const int *PA, + int *first, int *middle, int *last, + int depth) { + const int *p; + int *a, *b; + int len, half; + int q, r; + int x; + + for(;;) { + if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); } + else { x = 0; p = PA + *(last - 1); } + for(a = first, len = middle - first, half = len >> 1, r = -1; + 0 < len; + len = half, half >>= 1) { + b = a + half; + q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth); + if(q < 0) { + a = b + 1; + half -= (len & 1) ^ 1; + } else { + r = q; + } + } + if(a < middle) { + if(r == 0) { *a = ~*a; } + ss_rotate(a, middle, last); + last -= middle - a; + middle = a; + if(first == middle) { break; } + } + --last; + if(x != 0) { while(*--last < 0) { } } + if(middle == last) { break; } + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Merge-forward with internal buffer. */ +static +void +ss_mergeforward(const unsigned char *T, const int *PA, + int *first, int *middle, int *last, + int *buf, int depth) { + int *a, *b, *c, *bufend; + int t; + int r; + + bufend = buf + (middle - first) - 1; + ss_blockswap(buf, first, middle - first); + + for(t = *(a = first), b = buf, c = middle;;) { + r = ss_compare(T, PA + *b, PA + *c, depth); + if(r < 0) { + do { + *a++ = *b; + if(bufend <= b) { *bufend = t; return; } + *b++ = *a; + } while(*b < 0); + } else if(r > 0) { + do { + *a++ = *c, *c++ = *a; + if(last <= c) { + while(b < bufend) { *a++ = *b, *b++ = *a; } + *a = *b, *b = t; + return; + } + } while(*c < 0); + } else { + *c = ~*c; + do { + *a++ = *b; + if(bufend <= b) { *bufend = t; return; } + *b++ = *a; + } while(*b < 0); + + do { + *a++ = *c, *c++ = *a; + if(last <= c) { + while(b < bufend) { *a++ = *b, *b++ = *a; } + *a = *b, *b = t; + return; + } + } while(*c < 0); + } + } +} + +/* Merge-backward with internal buffer. */ +static +void +ss_mergebackward(const unsigned char *T, const int *PA, + int *first, int *middle, int *last, + int *buf, int depth) { + const int *p1, *p2; + int *a, *b, *c, *bufend; + int t; + int r; + int x; + + bufend = buf + (last - middle) - 1; + ss_blockswap(buf, middle, last - middle); + + x = 0; + if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; } + else { p1 = PA + *bufend; } + if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; } + else { p2 = PA + *(middle - 1); } + for(t = *(a = last - 1), b = bufend, c = middle - 1;;) { + r = ss_compare(T, p1, p2, depth); + if(0 < r) { + if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } + *a-- = *b; + if(b <= buf) { *buf = t; break; } + *b-- = *a; + if(*b < 0) { p1 = PA + ~*b; x |= 1; } + else { p1 = PA + *b; } + } else if(r < 0) { + if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } + *a-- = *c, *c-- = *a; + if(c < first) { + while(buf < b) { *a-- = *b, *b-- = *a; } + *a = *b, *b = t; + break; + } + if(*c < 0) { p2 = PA + ~*c; x |= 2; } + else { p2 = PA + *c; } + } else { + if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } + *a-- = ~*b; + if(b <= buf) { *buf = t; break; } + *b-- = *a; + if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } + *a-- = *c, *c-- = *a; + if(c < first) { + while(buf < b) { *a-- = *b, *b-- = *a; } + *a = *b, *b = t; + break; + } + if(*b < 0) { p1 = PA + ~*b; x |= 1; } + else { p1 = PA + *b; } + if(*c < 0) { p2 = PA + ~*c; x |= 2; } + else { p2 = PA + *c; } + } + } +} + +/* D&C based merge. */ +static +void +ss_swapmerge(const unsigned char *T, const int *PA, + int *first, int *middle, int *last, + int *buf, int bufsize, int depth) { +#define STACK_SIZE SS_SMERGE_STACKSIZE +#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a))) +#define MERGE_CHECK(a, b, c)\ + do {\ + if(((c) & 1) ||\ + (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\ + *(a) = ~*(a);\ + }\ + if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\ + *(b) = ~*(b);\ + }\ + } while(0) + struct { int *a, *b, *c; int d; } stack[STACK_SIZE]; + int *l, *r, *lm, *rm; + int m, len, half; + int ssize; + int check, next; + + for(check = 0, ssize = 0;;) { + if((last - middle) <= bufsize) { + if((first < middle) && (middle < last)) { + ss_mergebackward(T, PA, first, middle, last, buf, depth); + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + continue; + } + + if((middle - first) <= bufsize) { + if(first < middle) { + ss_mergeforward(T, PA, first, middle, last, buf, depth); + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + continue; + } + + for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1; + 0 < len; + len = half, half >>= 1) { + if(ss_compare(T, PA + GETIDX(*(middle + m + half)), + PA + GETIDX(*(middle - m - half - 1)), depth) < 0) { + m += half + 1; + half -= (len & 1) ^ 1; + } + } + + if(0 < m) { + lm = middle - m, rm = middle + m; + ss_blockswap(lm, middle, m); + l = r = middle, next = 0; + if(rm < last) { + if(*rm < 0) { + *rm = ~*rm; + if(first < lm) { for(; *--l < 0;) { } next |= 4; } + next |= 1; + } else if(first < lm) { + for(; *r < 0; ++r) { } + next |= 2; + } + } + + if((l - first) <= (last - r)) { + STACK_PUSH(r, rm, last, (next & 3) | (check & 4)); + middle = lm, last = l, check = (check & 3) | (next & 4); + } else { + if((next & 2) && (r == middle)) { next ^= 6; } + STACK_PUSH(first, lm, l, (check & 3) | (next & 4)); + first = r, middle = rm, check = (next & 3) | (check & 4); + } + } else { + if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) { + *middle = ~*middle; + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + } + } +#undef STACK_SIZE +} + +#endif /* SS_BLOCKSIZE != 0 */ + + +/*---------------------------------------------------------------------------*/ + +/* Substring sort */ +static +void +sssort(const unsigned char *T, const int *PA, + int *first, int *last, + int *buf, int bufsize, + int depth, int n, int lastsuffix) { + int *a; +#if SS_BLOCKSIZE != 0 + int *b, *middle, *curbuf; + int j, k, curbufsize, limit; +#endif + int i; + + if(lastsuffix != 0) { ++first; } + +#if SS_BLOCKSIZE == 0 + ss_mintrosort(T, PA, first, last, depth); +#else + if((bufsize < SS_BLOCKSIZE) && + (bufsize < (last - first)) && + (bufsize < (limit = ss_isqrt(last - first)))) { + if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; } + buf = middle = last - limit, bufsize = limit; + } else { + middle = last, limit = 0; + } + for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) { +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth); +#endif + curbufsize = last - (a + SS_BLOCKSIZE); + curbuf = a + SS_BLOCKSIZE; + if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; } + for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) { + ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth); + } + } +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, a, middle, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, a, middle, depth); +#endif + for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { + if(i & 1) { + ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth); + a -= k; + } + } + if(limit != 0) { +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, middle, last, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, middle, last, depth); +#endif + ss_inplacemerge(T, PA, first, middle, last, depth); + } +#endif + + if(lastsuffix != 0) { + /* Insert last type B* suffix. */ + int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2; + for(a = first, i = *(first - 1); + (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth))); + ++a) { + *(a - 1) = *a; + } + *(a - 1) = i; + } +} + + +/*---------------------------------------------------------------------------*/ + +static INLINE +int +tr_ilg(int n) { + return ((unsigned)n & 0xffff0000) ? + (((unsigned)n & 0xff000000) ? + 24 + lg_table[(n >> 24) & 0xff] : + 16 + lg_table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]); +} + + +/*---------------------------------------------------------------------------*/ + +/* Simple insertionsort for small size groups. */ +static +void +tr_insertionsort(const int *ISAd, int *first, int *last) { + int *a, *b; + int t, r; + + for(a = first + 1; a < last; ++a) { + for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) { + do { *(b + 1) = *b; } while((first <= --b) && (*b < 0)); + if(b < first) { break; } + } + if(r == 0) { *b = ~*b; } + *(b + 1) = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +static INLINE +void +tr_fixdown(const int *ISAd, int *SA, int i, int size) { + int j, k; + int v; + int c, d, e; + + for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { + d = ISAd[SA[k = j++]]; + if(d < (e = ISAd[SA[j]])) { k = j; d = e; } + if(d <= c) { break; } + } + SA[i] = v; +} + +/* Simple top-down heapsort. */ +static +void +tr_heapsort(const int *ISAd, int *SA, int size) { + int i, m; + int t; + + m = size; + if((size % 2) == 0) { + m--; + if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); } + } + + for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); } + if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); } + for(i = m - 1; 0 < i; --i) { + t = SA[0], SA[0] = SA[i]; + tr_fixdown(ISAd, SA, 0, i); + SA[i] = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Returns the median of three elements. */ +static INLINE +int * +tr_median3(const int *ISAd, int *v1, int *v2, int *v3) { + int *t; + if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); } + if(ISAd[*v2] > ISAd[*v3]) { + if(ISAd[*v1] > ISAd[*v3]) { return v1; } + else { return v3; } + } + return v2; +} + +/* Returns the median of five elements. */ +static INLINE +int * +tr_median5(const int *ISAd, + int *v1, int *v2, int *v3, int *v4, int *v5) { + int *t; + if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); } + if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); } + if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); } + if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); } + if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); } + if(ISAd[*v3] > ISAd[*v4]) { return v4; } + return v3; +} + +/* Returns the pivot element. */ +static INLINE +int * +tr_pivot(const int *ISAd, int *first, int *last) { + int *middle; + int t; + + t = last - first; + middle = first + t / 2; + + if(t <= 512) { + if(t <= 32) { + return tr_median3(ISAd, first, middle, last - 1); + } else { + t >>= 2; + return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1); + } + } + t >>= 3; + first = tr_median3(ISAd, first, first + t, first + (t << 1)); + middle = tr_median3(ISAd, middle - t, middle, middle + t); + last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1); + return tr_median3(ISAd, first, middle, last); +} + + +/*---------------------------------------------------------------------------*/ + +typedef struct _trbudget_t trbudget_t; +struct _trbudget_t { + int chance; + int remain; + int incval; + int count; +}; + +static INLINE +void +trbudget_init(trbudget_t *budget, int chance, int incval) { + budget->chance = chance; + budget->remain = budget->incval = incval; +} + +static INLINE +int +trbudget_check(trbudget_t *budget, int size) { + if(size <= budget->remain) { budget->remain -= size; return 1; } + if(budget->chance == 0) { budget->count += size; return 0; } + budget->remain += budget->incval - size; + budget->chance -= 1; + return 1; +} + + +/*---------------------------------------------------------------------------*/ + +static INLINE +void +tr_partition(const int *ISAd, + int *first, int *middle, int *last, + int **pa, int **pb, int v) { + int *a, *b, *c, *d, *e, *f; + int t, s; + int x = 0; + + for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + first += (b - a), last -= (d - c); + } + *pa = first, *pb = last; +} + +static +void +tr_copy(int *ISA, const int *SA, + int *first, int *a, int *b, int *last, + int depth) { + /* sort suffixes of middle partition + by using sorted order of suffixes of left and right partition. */ + int *c, *d, *e; + int s, v; + + v = b - SA - 1; + for(c = first, d = a - 1; c <= d; ++c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *++d = s; + ISA[s] = d - SA; + } + } + for(c = last - 1, e = d + 1, d = b; e < d; --c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *--d = s; + ISA[s] = d - SA; + } + } +} + +static +void +tr_partialcopy(int *ISA, const int *SA, + int *first, int *a, int *b, int *last, + int depth) { + int *c, *d, *e; + int s, v; + int rank, lastrank, newrank = -1; + + v = b - SA - 1; + lastrank = -1; + for(c = first, d = a - 1; c <= d; ++c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *++d = s; + rank = ISA[s + depth]; + if(lastrank != rank) { lastrank = rank; newrank = d - SA; } + ISA[s] = newrank; + } + } + + lastrank = -1; + for(e = d; first <= e; --e) { + rank = ISA[*e]; + if(lastrank != rank) { lastrank = rank; newrank = e - SA; } + if(newrank != rank) { ISA[*e] = newrank; } + } + + lastrank = -1; + for(c = last - 1, e = d + 1, d = b; e < d; --c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *--d = s; + rank = ISA[s + depth]; + if(lastrank != rank) { lastrank = rank; newrank = d - SA; } + ISA[s] = newrank; + } + } +} + +static +void +tr_introsort(int *ISA, const int *ISAd, + int *SA, int *first, int *last, + trbudget_t *budget) { +#define STACK_SIZE TR_STACKSIZE + struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE]; + int *a, *b, *c; + int t; + int v, x = 0; + int incr = ISAd - ISA; + int limit, next; + int ssize, trlink = -1; + + for(ssize = 0, limit = tr_ilg(last - first);;) { + + if(limit < 0) { + if(limit == -1) { + /* tandem repeat partition */ + tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1); + + /* update ranks */ + if(a < last) { + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + } + if(b < last) { + for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } + } + + /* push */ + if(1 < (b - a)) { + STACK_PUSH5(NULL, a, b, 0, 0); + STACK_PUSH5(ISAd - incr, first, last, -2, trlink); + trlink = ssize - 2; + } + if((a - first) <= (last - b)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink); + last = a, limit = tr_ilg(a - first); + } else if(1 < (last - b)) { + first = b, limit = tr_ilg(last - b); + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } else { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink); + first = b, limit = tr_ilg(last - b); + } else if(1 < (a - first)) { + last = a, limit = tr_ilg(a - first); + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } else if(limit == -2) { + /* tandem repeat copy */ + a = stack[--ssize].b, b = stack[ssize].c; + if(stack[ssize].d == 0) { + tr_copy(ISA, SA, first, a, b, last, ISAd - ISA); + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA); + } + STACK_POP5(ISAd, first, last, limit, trlink); + } else { + /* sorted partition */ + if(0 <= *first) { + a = first; + do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); + first = a; + } + if(first < last) { + a = first; do { *a = ~*a; } while(*++a < 0); + next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1; + if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } } + + /* push */ + if(trbudget_check(budget, a - first)) { + if((a - first) <= (last - a)) { + STACK_PUSH5(ISAd, a, last, -3, trlink); + ISAd += incr, last = a, limit = next; + } else { + if(1 < (last - a)) { + STACK_PUSH5(ISAd + incr, first, a, next, trlink); + first = a, limit = -3; + } else { + ISAd += incr, last = a, limit = next; + } + } + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + if(1 < (last - a)) { + first = a, limit = -3; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + continue; + } + + if((last - first) <= TR_INSERTIONSORT_THRESHOLD) { + tr_insertionsort(ISAd, first, last); + limit = -3; + continue; + } + + if(limit-- == 0) { + tr_heapsort(ISAd, first, last - first); + for(a = last - 1; first < a; a = b) { + for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } + } + limit = -3; + continue; + } + + /* choose pivot */ + a = tr_pivot(ISAd, first, last); + SWAP(*first, *a); + v = ISAd[*first]; + + /* partition */ + tr_partition(ISAd, first, first + 1, last, &a, &b, v); + if((last - first) != (b - a)) { + next = (ISA[*a] != v) ? tr_ilg(b - a) : -1; + + /* update ranks */ + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } } + + /* push */ + if((1 < (b - a)) && (trbudget_check(budget, b - a))) { + if((a - first) <= (last - b)) { + if((last - b) <= (b - a)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + STACK_PUSH5(ISAd, b, last, limit, trlink); + last = a; + } else if(1 < (last - b)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + first = b; + } else { + ISAd += incr, first = a, last = b, limit = next; + } + } else if((a - first) <= (b - a)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, limit, trlink); + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + last = a; + } else { + STACK_PUSH5(ISAd, b, last, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + STACK_PUSH5(ISAd, b, last, limit, trlink); + STACK_PUSH5(ISAd, first, a, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + if((a - first) <= (b - a)) { + if(1 < (last - b)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + STACK_PUSH5(ISAd, first, a, limit, trlink); + first = b; + } else if(1 < (a - first)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + last = a; + } else { + ISAd += incr, first = a, last = b, limit = next; + } + } else if((last - b) <= (b - a)) { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, limit, trlink); + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + first = b; + } else { + STACK_PUSH5(ISAd, first, a, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + STACK_PUSH5(ISAd, first, a, limit, trlink); + STACK_PUSH5(ISAd, b, last, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } + } else { + if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; } + if((a - first) <= (last - b)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, limit, trlink); + last = a; + } else if(1 < (last - b)) { + first = b; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } else { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, limit, trlink); + first = b; + } else if(1 < (a - first)) { + last = a; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } + } else { + if(trbudget_check(budget, last - first)) { + limit = tr_ilg(last - first), ISAd += incr; + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } +#undef STACK_SIZE +} + + + +/*---------------------------------------------------------------------------*/ + +/* Tandem repeat sort */ +static +void +trsort(int *ISA, int *SA, int n, int depth) { + int *ISAd; + int *first, *last; + trbudget_t budget; + int t, skip, unsorted; + + trbudget_init(&budget, tr_ilg(n) * 2 / 3, n); +/* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */ + for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) { + first = SA; + skip = 0; + unsorted = 0; + do { + if((t = *first) < 0) { first -= t; skip += t; } + else { + if(skip != 0) { *(first + skip) = skip; skip = 0; } + last = SA + ISA[t] + 1; + if(1 < (last - first)) { + budget.count = 0; + tr_introsort(ISA, ISAd, SA, first, last, &budget); + if(budget.count != 0) { unsorted += budget.count; } + else { skip = first - last; } + } else if((last - first) == 1) { + skip = -1; + } + first = last; + } + } while(first < (SA + n)); + if(skip != 0) { *(first + skip) = skip; } + if(unsorted == 0) { break; } + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Sorts suffixes of type B*. */ +static +int +sort_typeBstar(const unsigned char *T, int *SA, + int *bucket_A, int *bucket_B, + int n, int openMP) { + int *PAb, *ISAb, *buf; +#ifdef LIBBSC_OPENMP + int *curbuf; + int l; +#endif + int i, j, k, t, m, bufsize; + int c0, c1; +#ifdef LIBBSC_OPENMP + int d0, d1; +#endif + (void)openMP; + + /* Initialize bucket arrays. */ + for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } + for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } + + /* Count the number of occurrences of the first one or two characters of each + type A, B and B* suffix. Moreover, store the beginning position of all + type B* suffixes into the array SA. */ + for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) { + /* type A suffix. */ + do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1)); + if(0 <= i) { + /* type B* suffix. */ + ++BUCKET_BSTAR(c0, c1); + SA[--m] = i; + /* type B suffix. */ + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { + ++BUCKET_B(c0, c1); + } + } + } + m = n - m; +/* +note: + A type B* suffix is lexicographically smaller than a type B suffix that + begins with the same first two characters. +*/ + + /* Calculate the index of start/end point of each bucket. */ + for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { + t = i + BUCKET_A(c0); + BUCKET_A(c0) = i + j; /* start point */ + i = t + BUCKET_B(c0, c0); + for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { + j += BUCKET_BSTAR(c0, c1); + BUCKET_BSTAR(c0, c1) = j; /* end point */ + i += BUCKET_B(c0, c1); + } + } + + if(0 < m) { + /* Sort the type B* suffixes by their first two characters. */ + PAb = SA + n - m; ISAb = SA + m; + for(i = m - 2; 0 <= i; --i) { + t = PAb[i], c0 = T[t], c1 = T[t + 1]; + SA[--BUCKET_BSTAR(c0, c1)] = i; + } + t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; + SA[--BUCKET_BSTAR(c0, c1)] = m - 1; + + /* Sort the type B* substrings using sssort. */ +#ifdef LIBBSC_OPENMP + if (openMP) + { + buf = SA + m; + c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m; +#pragma omp parallel default(shared) private(bufsize, curbuf, k, l, d0, d1) + { + bufsize = (n - (2 * m)) / omp_get_num_threads(); + curbuf = buf + omp_get_thread_num() * bufsize; + k = 0; + for(;;) { + #pragma omp critical(sssort_lock) + { + if(0 < (l = j)) { + d0 = c0, d1 = c1; + do { + k = BUCKET_BSTAR(d0, d1); + if(--d1 <= d0) { + d1 = ALPHABET_SIZE - 1; + if(--d0 < 0) { break; } + } + } while(((l - k) <= 1) && (0 < (l = k))); + c0 = d0, c1 = d1, j = k; + } + } + if(l == 0) { break; } + sssort(T, PAb, SA + k, SA + l, + curbuf, bufsize, 2, n, *(SA + k) == (m - 1)); + } + } + } + else + { + buf = SA + m, bufsize = n - (2 * m); + for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { + for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { + i = BUCKET_BSTAR(c0, c1); + if(1 < (j - i)) { + sssort(T, PAb, SA + i, SA + j, + buf, bufsize, 2, n, *(SA + i) == (m - 1)); + } + } + } + } +#else + buf = SA + m, bufsize = n - (2 * m); + for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { + for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { + i = BUCKET_BSTAR(c0, c1); + if(1 < (j - i)) { + sssort(T, PAb, SA + i, SA + j, + buf, bufsize, 2, n, *(SA + i) == (m - 1)); + } + } + } +#endif + + /* Compute ranks of type B* substrings. */ + for(i = m - 1; 0 <= i; --i) { + if(0 <= SA[i]) { + j = i; + do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); + SA[i + 1] = i - j; + if(i <= 0) { break; } + } + j = i; + do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); + ISAb[SA[i]] = j; + } + + /* Construct the inverse suffix array of type B* suffixes using trsort. */ + trsort(ISAb, SA, m, 1); + + /* Set the sorted order of type B* suffixes. */ + for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } + if(0 <= i) { + t = i; + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { } + SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; + } + } + + /* Calculate the index of start/end point of each bucket. */ + BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ + for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { + i = BUCKET_A(c0 + 1) - 1; + for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { + t = i - BUCKET_B(c0, c1); + BUCKET_B(c0, c1) = i; /* end point */ + + /* Move all type B* suffixes to the correct position. */ + for(i = t, j = BUCKET_BSTAR(c0, c1); + j <= k; + --i, --k) { SA[i] = SA[k]; } + } + BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ + BUCKET_B(c0, c0) = i; /* end point */ + } + } + + return m; +} + +/* Constructs the suffix array by using the sorted order of type B* suffixes. */ +static +void +construct_SA(const unsigned char *T, int *SA, + int *bucket_A, int *bucket_B, + int n, int m) { + int *i, *j, *k; + int s; + int c0, c1, c2; + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; + i <= j; + --j) { + if(0 < (s = *j)) { + assert(T[s] == c1); + assert(((s + 1) < n) && (T[s] <= T[s + 1])); + assert(T[s - 1] <= T[s]); + *j = ~s; + c0 = T[--s]; + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c0 != c2) { + if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } + k = SA + BUCKET_B(c2 = c0, c1); + } + assert(k < j); assert(k != NULL); + *k-- = s; + } else { + assert(((s == 0) && (T[s] == c1)) || (s < 0)); + *j = ~s; + } + } + } + } + + /* Construct the suffix array by using + the sorted order of type B suffixes. */ + k = SA + BUCKET_A(c2 = T[n - 1]); + *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); + /* Scan the suffix array from left to right. */ + for(i = SA, j = SA + n; i < j; ++i) { + if(0 < (s = *i)) { + assert(T[s - 1] >= T[s]); + c0 = T[--s]; + if((s == 0) || (T[s - 1] < c0)) { s = ~s; } + if(c0 != c2) { + BUCKET_A(c2) = k - SA; + k = SA + BUCKET_A(c2 = c0); + } + assert(i < k); + *k++ = s; + } else { + assert(s < 0); + *i = ~s; + } + } +} + +/* Constructs the burrows-wheeler transformed string directly + by using the sorted order of type B* suffixes. */ +static +int +construct_BWT(const unsigned char *T, int *SA, + int *bucket_A, int *bucket_B, + int n, int m) { + int *i, *j, *k, *orig; + int s; + int c0, c1, c2; + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; + i <= j; + --j) { + if(0 < (s = *j)) { + assert(T[s] == c1); + assert(((s + 1) < n) && (T[s] <= T[s + 1])); + assert(T[s - 1] <= T[s]); + c0 = T[--s]; + *j = ~((int)c0); + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c0 != c2) { + if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } + k = SA + BUCKET_B(c2 = c0, c1); + } + assert(k < j); assert(k != NULL); + *k-- = s; + } else if(s != 0) { + *j = ~s; +#ifndef NDEBUG + } else { + assert(T[s] == c1); +#endif + } + } + } + } + + /* Construct the BWTed string by using + the sorted order of type B suffixes. */ + k = SA + BUCKET_A(c2 = T[n - 1]); + *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1); + /* Scan the suffix array from left to right. */ + for(i = SA, j = SA + n, orig = SA; i < j; ++i) { + if(0 < (s = *i)) { + assert(T[s - 1] >= T[s]); + c0 = T[--s]; + *i = c0; + if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); } + if(c0 != c2) { + BUCKET_A(c2) = k - SA; + k = SA + BUCKET_A(c2 = c0); + } + assert(i < k); + *k++ = s; + } else if(s != 0) { + *i = ~s; + } else { + orig = i; + } + } + + return orig - SA; +} + +/* Constructs the burrows-wheeler transformed string directly + by using the sorted order of type B* suffixes. */ +static +int +construct_BWT_indexes(const unsigned char *T, int *SA, + int *bucket_A, int *bucket_B, + int n, int m, + unsigned char * num_indexes, int * indexes) { + int *i, *j, *k, *orig; + int s; + int c0, c1, c2; + + int mod = n / 8; + { + mod |= mod >> 1; mod |= mod >> 2; + mod |= mod >> 4; mod |= mod >> 8; + mod |= mod >> 16; mod >>= 1; + + *num_indexes = (unsigned char)((n - 1) / (mod + 1)); + } + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; + i <= j; + --j) { + if(0 < (s = *j)) { + assert(T[s] == c1); + assert(((s + 1) < n) && (T[s] <= T[s + 1])); + assert(T[s - 1] <= T[s]); + + if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = j - SA; + + c0 = T[--s]; + *j = ~((int)c0); + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c0 != c2) { + if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } + k = SA + BUCKET_B(c2 = c0, c1); + } + assert(k < j); assert(k != NULL); + *k-- = s; + } else if(s != 0) { + *j = ~s; +#ifndef NDEBUG + } else { + assert(T[s] == c1); +#endif + } + } + } + } + + /* Construct the BWTed string by using + the sorted order of type B suffixes. */ + k = SA + BUCKET_A(c2 = T[n - 1]); + if (T[n - 2] < c2) { + if (((n - 1) & mod) == 0) indexes[(n - 1) / (mod + 1) - 1] = k - SA; + *k++ = ~((int)T[n - 2]); + } + else { + *k++ = n - 1; + } + + /* Scan the suffix array from left to right. */ + for(i = SA, j = SA + n, orig = SA; i < j; ++i) { + if(0 < (s = *i)) { + assert(T[s - 1] >= T[s]); + + if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = i - SA; + + c0 = T[--s]; + *i = c0; + if(c0 != c2) { + BUCKET_A(c2) = k - SA; + k = SA + BUCKET_A(c2 = c0); + } + assert(i < k); + if((0 < s) && (T[s - 1] < c0)) { + if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = k - SA; + *k++ = ~((int)T[s - 1]); + } else + *k++ = s; + } else if(s != 0) { + *i = ~s; + } else { + orig = i; + } + } + + return orig - SA; +} + + +/*---------------------------------------------------------------------------*/ + +/*- Function -*/ + +int +divsufsort(const unsigned char *T, int *SA, int n, int openMP) { + int *bucket_A, *bucket_B; + int m; + int err = 0; + + /* Check arguments. */ + if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } + else if(n == 0) { return 0; } + else if(n == 1) { SA[0] = 0; return 0; } + else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } + + bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int)); + bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int)); + + /* Suffixsort. */ + if((bucket_A != NULL) && (bucket_B != NULL)) { + m = sort_typeBstar(T, SA, bucket_A, bucket_B, n, openMP); + construct_SA(T, SA, bucket_A, bucket_B, n, m); + } else { + err = -2; + } + + free(bucket_B); + free(bucket_A); + + return err; +} + +int +divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP) { + int *B; + int *bucket_A, *bucket_B; + int m, pidx, i; + + /* Check arguments. */ + if((T == NULL) || (U == NULL) || (n < 0)) { return -1; } + else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } + + if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); } + bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int)); + bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int)); + + /* Burrows-Wheeler Transform. */ + if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) { + m = sort_typeBstar(T, B, bucket_A, bucket_B, n, openMP); + + if (num_indexes == NULL || indexes == NULL) { + pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m); + } else { + pidx = construct_BWT_indexes(T, B, bucket_A, bucket_B, n, m, num_indexes, indexes); + } + + /* Copy to output string. */ + U[0] = T[n - 1]; + for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; } + for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; } + pidx += 1; + } else { + pidx = -2; + } + + free(bucket_B); + free(bucket_A); + if(A == NULL) { free(B); } + + return pidx; +} diff --git a/lib/dictBuilder/fastcover.c b/lib/dictBuilder/fastcover.c new file mode 100644 index 0000000..56a0813 --- /dev/null +++ b/lib/dictBuilder/fastcover.c @@ -0,0 +1,765 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/*-************************************* +* Dependencies +***************************************/ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ + +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif + +#include "../common/mem.h" /* read */ +#include "../common/pool.h" +#include "../common/threading.h" +#include "../common/zstd_internal.h" /* includes zstd.h */ +#include "../compress/zstd_compress_internal.h" /* ZSTD_hash*() */ +#include "../zdict.h" +#include "cover.h" + + +/*-************************************* +* Constants +***************************************/ +/** +* There are 32bit indexes used to ref samples, so limit samples size to 4GB +* on 64bit builds. +* For 32bit builds we choose 1 GB. +* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large +* contiguous buffer, so 1GB is already a high limit. +*/ +#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB)) +#define FASTCOVER_MAX_F 31 +#define FASTCOVER_MAX_ACCEL 10 +#define FASTCOVER_DEFAULT_SPLITPOINT 0.75 +#define DEFAULT_F 20 +#define DEFAULT_ACCEL 1 + + +/*-************************************* +* Console display +* +* Captures the `displayLevel` variable in the local scope. +***************************************/ +#undef DISPLAY +#define DISPLAY(...) \ + { \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + } +#undef DISPLAYLEVEL +#define DISPLAYLEVEL(l, ...) \ + if (displayLevel >= l) { \ + DISPLAY(__VA_ARGS__); \ + } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ + +#undef DISPLAYUPDATE +#define DISPLAYUPDATE(lastUpdateTime, l, ...) \ + if (displayLevel >= l) { \ + const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; \ + if ((clock() - lastUpdateTime > refreshRate) || (displayLevel >= 4)) { \ + lastUpdateTime = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + } + + +/*-************************************* +* Hash Functions +***************************************/ +/** + * Hash the d-byte value pointed to by p and mod 2^f into the frequency vector + */ +static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 f, unsigned d) { + if (d == 6) { + return ZSTD_hash6Ptr(p, f); + } + return ZSTD_hash8Ptr(p, f); +} + + +/*-************************************* +* Acceleration +***************************************/ +typedef struct { + unsigned finalize; /* Percentage of training samples used for ZDICT_finalizeDictionary */ + unsigned skip; /* Number of dmer skipped between each dmer counted in computeFrequency */ +} FASTCOVER_accel_t; + + +static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = { + { 100, 0 }, /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */ + { 100, 0 }, /* accel = 1 */ + { 50, 1 }, /* accel = 2 */ + { 34, 2 }, /* accel = 3 */ + { 25, 3 }, /* accel = 4 */ + { 20, 4 }, /* accel = 5 */ + { 17, 5 }, /* accel = 6 */ + { 14, 6 }, /* accel = 7 */ + { 13, 7 }, /* accel = 8 */ + { 11, 8 }, /* accel = 9 */ + { 10, 9 }, /* accel = 10 */ +}; + + +/*-************************************* +* Context +***************************************/ +typedef struct { + const BYTE *samples; + size_t *offsets; + const size_t *samplesSizes; + size_t nbSamples; + size_t nbTrainSamples; + size_t nbTestSamples; + size_t nbDmers; + U32 *freqs; + unsigned d; + unsigned f; + FASTCOVER_accel_t accelParams; + int displayLevel; +} FASTCOVER_ctx_t; + + +/*-************************************* +* Helper functions +***************************************/ +/** + * Selects the best segment in an epoch. + * Segments of are scored according to the function: + * + * Let F(d) be the frequency of all dmers with hash value d. + * Let S_i be hash value of the dmer at position i of segment S which has length k. + * + * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1}) + * + * Once the dmer with hash value d is in the dictionary we set F(d) = 0. + */ +static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, + U32 *freqs, U32 begin, U32 end, + ZDICT_cover_params_t parameters, + U16* segmentFreqs) { + /* Constants */ + const U32 k = parameters.k; + const U32 d = parameters.d; + const U32 f = ctx->f; + const U32 dmersInK = k - d + 1; + + /* Try each segment (activeSegment) and save the best (bestSegment) */ + COVER_segment_t bestSegment = {0, 0, 0}; + COVER_segment_t activeSegment; + + /* Reset the activeDmers in the segment */ + /* The activeSegment starts at the beginning of the epoch. */ + activeSegment.begin = begin; + activeSegment.end = begin; + activeSegment.score = 0; + + /* Slide the activeSegment through the whole epoch. + * Save the best segment in bestSegment. + */ + while (activeSegment.end < end) { + /* Get hash value of current dmer */ + const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d); + + /* Add frequency of this index to score if this is the first occurrence of index in active segment */ + if (segmentFreqs[idx] == 0) { + activeSegment.score += freqs[idx]; + } + /* Increment end of segment and segmentFreqs*/ + activeSegment.end += 1; + segmentFreqs[idx] += 1; + /* If the window is now too large, drop the first position */ + if (activeSegment.end - activeSegment.begin == dmersInK + 1) { + /* Get hash value of the dmer to be eliminated from active segment */ + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d); + segmentFreqs[delIndex] -= 1; + /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */ + if (segmentFreqs[delIndex] == 0) { + activeSegment.score -= freqs[delIndex]; + } + /* Increment start of segment */ + activeSegment.begin += 1; + } + + /* If this segment is the best so far save it */ + if (activeSegment.score > bestSegment.score) { + bestSegment = activeSegment; + } + } + + /* Zero out rest of segmentFreqs array */ + while (activeSegment.begin < end) { + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d); + segmentFreqs[delIndex] -= 1; + activeSegment.begin += 1; + } + + { + /* Zero the frequency of hash value of each dmer covered by the chosen segment. */ + U32 pos; + for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { + const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d); + freqs[i] = 0; + } + } + + return bestSegment; +} + + +static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters, + size_t maxDictSize, unsigned f, + unsigned accel) { + /* k, d, and f are required parameters */ + if (parameters.d == 0 || parameters.k == 0) { + return 0; + } + /* d has to be 6 or 8 */ + if (parameters.d != 6 && parameters.d != 8) { + return 0; + } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } + /* d <= k */ + if (parameters.d > parameters.k) { + return 0; + } + /* 0 < f <= FASTCOVER_MAX_F*/ + if (f > FASTCOVER_MAX_F || f == 0) { + return 0; + } + /* 0 < splitPoint <= 1 */ + if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) { + return 0; + } + /* 0 < accel <= 10 */ + if (accel > 10 || accel == 0) { + return 0; + } + return 1; +} + + +/** + * Clean up a context initialized with `FASTCOVER_ctx_init()`. + */ +static void +FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx) +{ + if (!ctx) return; + + free(ctx->freqs); + ctx->freqs = NULL; + + free(ctx->offsets); + ctx->offsets = NULL; +} + + +/** + * Calculate for frequency of hash value of each dmer in ctx->samples + */ +static void +FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx) +{ + const unsigned f = ctx->f; + const unsigned d = ctx->d; + const unsigned skip = ctx->accelParams.skip; + const unsigned readLength = MAX(d, 8); + size_t i; + assert(ctx->nbTrainSamples >= 5); + assert(ctx->nbTrainSamples <= ctx->nbSamples); + for (i = 0; i < ctx->nbTrainSamples; i++) { + size_t start = ctx->offsets[i]; /* start of current dmer */ + size_t const currSampleEnd = ctx->offsets[i+1]; + while (start + readLength <= currSampleEnd) { + const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d); + freqs[dmerIndex]++; + start = start + skip + 1; + } + } +} + + +/** + * Prepare a context for dictionary building. + * The context is only dependent on the parameter `d` and can be used multiple + * times. + * Returns 0 on success or error code on error. + * The context must be destroyed with `FASTCOVER_ctx_destroy()`. + */ +static size_t +FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx, + const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + unsigned d, double splitPoint, unsigned f, + FASTCOVER_accel_t accelParams, + int displayLevel) +{ + const BYTE* const samples = (const BYTE*)samplesBuffer; + const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); + /* Split samples into testing and training sets */ + const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples; + const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples; + const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize; + const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize; + ctx->displayLevel = displayLevel; + + /* Checks */ + if (totalSamplesSize < MAX(d, sizeof(U64)) || + totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) { + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20)); + return ERROR(srcSize_wrong); + } + + /* Check if there are at least 5 training samples */ + if (nbTrainSamples < 5) { + DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples); + return ERROR(srcSize_wrong); + } + + /* Check if there's testing sample */ + if (nbTestSamples < 1) { + DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples); + return ERROR(srcSize_wrong); + } + + /* Zero the context */ + memset(ctx, 0, sizeof(*ctx)); + DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, + (unsigned)trainingSamplesSize); + DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, + (unsigned)testSamplesSize); + + ctx->samples = samples; + ctx->samplesSizes = samplesSizes; + ctx->nbSamples = nbSamples; + ctx->nbTrainSamples = nbTrainSamples; + ctx->nbTestSamples = nbTestSamples; + ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1; + ctx->d = d; + ctx->f = f; + ctx->accelParams = accelParams; + + /* The offsets of each file */ + ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t)); + if (ctx->offsets == NULL) { + DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n"); + FASTCOVER_ctx_destroy(ctx); + return ERROR(memory_allocation); + } + + /* Fill offsets from the samplesSizes */ + { U32 i; + ctx->offsets[0] = 0; + assert(nbSamples >= 5); + for (i = 1; i <= nbSamples; ++i) { + ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; + } + } + + /* Initialize frequency array of size 2^f */ + ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32)); + if (ctx->freqs == NULL) { + DISPLAYLEVEL(1, "Failed to allocate frequency table \n"); + FASTCOVER_ctx_destroy(ctx); + return ERROR(memory_allocation); + } + + DISPLAYLEVEL(2, "Computing frequencies\n"); + FASTCOVER_computeFrequency(ctx->freqs, ctx); + + return 0; +} + + +/** + * Given the prepared context build the dictionary. + */ +static size_t +FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx, + U32* freqs, + void* dictBuffer, size_t dictBufferCapacity, + ZDICT_cover_params_t parameters, + U16* segmentFreqs) +{ + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + /* Divide the data into epochs. We will select one segment from each epoch. */ + const COVER_epoch_info_t epochs = COVER_computeEpochs( + (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1); + const size_t maxZeroScoreRun = 10; + const int displayLevel = ctx->displayLevel; + size_t zeroScoreRun = 0; + clock_t lastUpdateTime = 0; + size_t epoch; + DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", + (U32)epochs.num, (U32)epochs.size); + /* Loop through the epochs until there are no more segments or the dictionary + * is full. + */ + for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) { + const U32 epochBegin = (U32)(epoch * epochs.size); + const U32 epochEnd = epochBegin + epochs.size; + size_t segmentSize; + /* Select a segment */ + COVER_segment_t segment = FASTCOVER_selectSegment( + ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs); + + /* If the segment covers no dmers, then we are out of content. + * There may be new content in other epochs, for continue for some time. + */ + if (segment.score == 0) { + if (++zeroScoreRun >= maxZeroScoreRun) { + break; + } + continue; + } + zeroScoreRun = 0; + + /* Trim the segment if necessary and if it is too small then we are done */ + segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); + if (segmentSize < parameters.d) { + break; + } + + /* We fill the dictionary from the back to allow the best segments to be + * referenced with the smallest offsets. + */ + tail -= segmentSize; + memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + DISPLAYUPDATE( + lastUpdateTime, + 2, "\r%u%% ", + (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + return tail; +} + +/** + * Parameters for FASTCOVER_tryParameters(). + */ +typedef struct FASTCOVER_tryParameters_data_s { + const FASTCOVER_ctx_t* ctx; + COVER_best_t* best; + size_t dictBufferCapacity; + ZDICT_cover_params_t parameters; +} FASTCOVER_tryParameters_data_t; + + +/** + * Tries a set of parameters and updates the COVER_best_t with the results. + * This function is thread safe if zstd is compiled with multithreaded support. + * It takes its parameters as an *OWNING* opaque pointer to support threading. + */ +static void FASTCOVER_tryParameters(void* opaque) +{ + /* Save parameters as local variables */ + FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t*)opaque; + const FASTCOVER_ctx_t *const ctx = data->ctx; + const ZDICT_cover_params_t parameters = data->parameters; + size_t dictBufferCapacity = data->dictBufferCapacity; + size_t totalCompressedSize = ERROR(GENERIC); + /* Initialize array to keep track of frequency of dmer within activeSegment */ + U16* segmentFreqs = (U16*)calloc(((U64)1 << ctx->f), sizeof(U16)); + /* Allocate space for hash table, dict, and freqs */ + BYTE *const dict = (BYTE*)malloc(dictBufferCapacity); + COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); + U32* freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); + const int displayLevel = ctx->displayLevel; + if (!segmentFreqs || !dict || !freqs) { + DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); + goto _cleanup; + } + /* Copy the frequencies because we need to modify them */ + memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32)); + /* Build the dictionary */ + { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity, + parameters, segmentFreqs); + + const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100); + selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail, + ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, + totalCompressedSize); + + if (COVER_dictSelectionIsError(selection)) { + DISPLAYLEVEL(1, "Failed to select dictionary\n"); + goto _cleanup; + } + } +_cleanup: + free(dict); + COVER_best_finish(data->best, parameters, selection); + free(data); + free(segmentFreqs); + COVER_dictSelectionFree(selection); + free(freqs); +} + + +static void +FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams, + ZDICT_cover_params_t* coverParams) +{ + coverParams->k = fastCoverParams.k; + coverParams->d = fastCoverParams.d; + coverParams->steps = fastCoverParams.steps; + coverParams->nbThreads = fastCoverParams.nbThreads; + coverParams->splitPoint = fastCoverParams.splitPoint; + coverParams->zParams = fastCoverParams.zParams; + coverParams->shrinkDict = fastCoverParams.shrinkDict; +} + + +static void +FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams, + ZDICT_fastCover_params_t* fastCoverParams, + unsigned f, unsigned accel) +{ + fastCoverParams->k = coverParams.k; + fastCoverParams->d = coverParams.d; + fastCoverParams->steps = coverParams.steps; + fastCoverParams->nbThreads = coverParams.nbThreads; + fastCoverParams->splitPoint = coverParams.splitPoint; + fastCoverParams->f = f; + fastCoverParams->accel = accel; + fastCoverParams->zParams = coverParams.zParams; + fastCoverParams->shrinkDict = coverParams.shrinkDict; +} + + +ZDICTLIB_STATIC_API size_t +ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t parameters) +{ + BYTE* const dict = (BYTE*)dictBuffer; + FASTCOVER_ctx_t ctx; + ZDICT_cover_params_t coverParams; + FASTCOVER_accel_t accelParams; + const int displayLevel = (int)parameters.zParams.notificationLevel; + /* Assign splitPoint and f if not provided */ + parameters.splitPoint = 1.0; + parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f; + parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel; + /* Convert to cover parameter */ + memset(&coverParams, 0 , sizeof(coverParams)); + FASTCOVER_convertToCoverParams(parameters, &coverParams); + /* Checks */ + if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f, + parameters.accel)) { + DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n"); + return ERROR(parameter_outOfBound); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); + return ERROR(srcSize_wrong); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + /* Assign corresponding FASTCOVER_accel_t to accelParams*/ + accelParams = FASTCOVER_defaultAccelParameters[parameters.accel]; + /* Initialize context */ + { + size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, + coverParams.d, parameters.splitPoint, parameters.f, + accelParams, displayLevel); + if (ZSTD_isError(initVal)) { + DISPLAYLEVEL(1, "Failed to initialize context\n"); + return initVal; + } + } + COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel); + /* Build the dictionary */ + DISPLAYLEVEL(2, "Building dictionary\n"); + { + /* Initialize array to keep track of frequency of dmer within activeSegment */ + U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16)); + const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer, + dictBufferCapacity, coverParams, segmentFreqs); + const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100); + const size_t dictionarySize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams); + if (!ZSTD_isError(dictionarySize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (unsigned)dictionarySize); + } + FASTCOVER_ctx_destroy(&ctx); + free(segmentFreqs); + return dictionarySize; + } +} + + +ZDICTLIB_STATIC_API size_t +ZDICT_optimizeTrainFromBuffer_fastCover( + void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t* parameters) +{ + ZDICT_cover_params_t coverParams; + FASTCOVER_accel_t accelParams; + /* constants */ + const unsigned nbThreads = parameters->nbThreads; + const double splitPoint = + parameters->splitPoint <= 0.0 ? FASTCOVER_DEFAULT_SPLITPOINT : parameters->splitPoint; + const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; + const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; + const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; + const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k; + const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps; + const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); + const unsigned kIterations = + (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); + const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f; + const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel; + const unsigned shrinkDict = 0; + /* Local variables */ + const int displayLevel = (int)parameters->zParams.notificationLevel; + unsigned iteration = 1; + unsigned d; + unsigned k; + COVER_best_t best; + POOL_ctx *pool = NULL; + int warned = 0; + clock_t lastUpdateTime = 0; + /* Checks */ + if (splitPoint <= 0 || splitPoint > 1) { + DISPLAYLEVEL(1, "Incorrect splitPoint\n"); + return ERROR(parameter_outOfBound); + } + if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) { + DISPLAYLEVEL(1, "Incorrect accel\n"); + return ERROR(parameter_outOfBound); + } + if (kMinK < kMaxD || kMaxK < kMinK) { + DISPLAYLEVEL(1, "Incorrect k\n"); + return ERROR(parameter_outOfBound); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); + return ERROR(srcSize_wrong); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + if (nbThreads > 1) { + pool = POOL_create(nbThreads, 1); + if (!pool) { + return ERROR(memory_allocation); + } + } + /* Initialization */ + COVER_best_init(&best); + memset(&coverParams, 0 , sizeof(coverParams)); + FASTCOVER_convertToCoverParams(*parameters, &coverParams); + accelParams = FASTCOVER_defaultAccelParameters[accel]; + /* Loop through d first because each new value needs a new context */ + DISPLAYLEVEL(2, "Trying %u different sets of parameters\n", kIterations); + for (d = kMinD; d <= kMaxD; d += 2) { + /* Initialize the context for this value of d */ + FASTCOVER_ctx_t ctx; + DISPLAYLEVEL(3, "d=%u\n", d); + { + /* Turn down global display level to clean up display at level 2 and below */ + const int childDisplayLevel = displayLevel == 0 ? 0 : displayLevel - 1; + size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams, childDisplayLevel); + if (ZSTD_isError(initVal)) { + DISPLAYLEVEL(1, "Failed to initialize context\n"); + COVER_best_destroy(&best); + POOL_free(pool); + return initVal; + } + } + if (!warned) { + COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel); + warned = 1; + } + /* Loop through k reusing the same context */ + for (k = kMinK; k <= kMaxK; k += kStepSize) { + /* Prepare the arguments */ + FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc( + sizeof(FASTCOVER_tryParameters_data_t)); + DISPLAYLEVEL(3, "k=%u\n", k); + if (!data) { + DISPLAYLEVEL(1, "Failed to allocate parameters\n"); + COVER_best_destroy(&best); + FASTCOVER_ctx_destroy(&ctx); + POOL_free(pool); + return ERROR(memory_allocation); + } + data->ctx = &ctx; + data->best = &best; + data->dictBufferCapacity = dictBufferCapacity; + data->parameters = coverParams; + data->parameters.k = k; + data->parameters.d = d; + data->parameters.splitPoint = splitPoint; + data->parameters.steps = kSteps; + data->parameters.shrinkDict = shrinkDict; + data->parameters.zParams.notificationLevel = (unsigned)ctx.displayLevel; + /* Check the parameters */ + if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity, + data->ctx->f, accel)) { + DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n"); + free(data); + continue; + } + /* Call the function and pass ownership of data to it */ + COVER_best_start(&best); + if (pool) { + POOL_add(pool, &FASTCOVER_tryParameters, data); + } else { + FASTCOVER_tryParameters(data); + } + /* Print status */ + DISPLAYUPDATE(lastUpdateTime, + 2, "\r%u%% ", + (unsigned)((iteration * 100) / kIterations)); + ++iteration; + } + COVER_best_wait(&best); + FASTCOVER_ctx_destroy(&ctx); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + /* Fill the output buffer and parameters with output of the best parameters */ + { + const size_t dictSize = best.dictSize; + if (ZSTD_isError(best.compressedSize)) { + const size_t compressedSize = best.compressedSize; + COVER_best_destroy(&best); + POOL_free(pool); + return compressedSize; + } + FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel); + memcpy(dictBuffer, best.dict, dictSize); + COVER_best_destroy(&best); + POOL_free(pool); + return dictSize; + } + +} diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c new file mode 100644 index 0000000..befa616 --- /dev/null +++ b/lib/dictBuilder/zdict.c @@ -0,0 +1,1137 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/*-************************************** +* Tuning parameters +****************************************/ +#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */ +#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) +#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO) + + +/*-************************************** +* Compiler Options +****************************************/ +/* Unix Large Files support (>4GB) */ +#define _FILE_OFFSET_BITS 64 +#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ +# ifndef _LARGEFILE_SOURCE +# define _LARGEFILE_SOURCE +# endif +#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ +# ifndef _LARGEFILE64_SOURCE +# define _LARGEFILE64_SOURCE +# endif +#endif + + +/*-************************************* +* Dependencies +***************************************/ +#include /* malloc, free */ +#include /* memset */ +#include /* fprintf, fopen, ftello64 */ +#include /* clock */ + +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif + +#include "../common/mem.h" /* read */ +#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */ +#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */ +#include "../common/zstd_internal.h" /* includes zstd.h */ +#include "../common/xxhash.h" /* XXH64 */ +#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */ +#include "../zdict.h" +#include "divsufsort.h" +#include "../common/bits.h" /* ZSTD_NbCommonBytes */ + + +/*-************************************* +* Constants +***************************************/ +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#define DICTLISTSIZE_DEFAULT 10000 + +#define NOISELENGTH 32 + +static const U32 g_selectivity_default = 9; + + +/*-************************************* +* Console display +***************************************/ +#undef DISPLAY +#define DISPLAY(...) do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0) +#undef DISPLAYLEVEL +#define DISPLAYLEVEL(l, ...) do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0) /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ + +static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; } + +static void ZDICT_printHex(const void* ptr, size_t length) +{ + const BYTE* const b = (const BYTE*)ptr; + size_t u; + for (u=0; u126) c = '.'; /* non-printable char */ + DISPLAY("%c", c); + } +} + + +/*-******************************************************** +* Helper functions +**********************************************************/ +unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); } + +const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } + +unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) +{ + if (dictSize < 8) return 0; + if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0; + return MEM_readLE32((const char*)dictBuffer + 4); +} + +size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize) +{ + size_t headerSize; + if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted); + + { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t)); + U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE); + if (!bs || !wksp) { + headerSize = ERROR(memory_allocation); + } else { + ZSTD_reset_compressedBlockState(bs); + headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize); + } + + free(bs); + free(wksp); + } + + return headerSize; +} + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +/*! ZDICT_count() : + Count the nb of common bytes between 2 pointers. + Note : this function presumes end of buffer followed by noisy guard band. +*/ +static size_t ZDICT_count(const void* pIn, const void* pMatch) +{ + const char* const pStart = (const char*)pIn; + for (;;) { + size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); + if (!diff) { + pIn = (const char*)pIn+sizeof(size_t); + pMatch = (const char*)pMatch+sizeof(size_t); + continue; + } + pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff); + return (size_t)((const char*)pIn - pStart); + } +} + + +typedef struct { + U32 pos; + U32 length; + U32 savings; +} dictItem; + +static void ZDICT_initDictItem(dictItem* d) +{ + d->pos = 1; + d->length = 0; + d->savings = (U32)(-1); +} + + +#define LLIMIT 64 /* heuristic determined experimentally */ +#define MINMATCHLENGTH 7 /* heuristic determined experimentally */ +static dictItem ZDICT_analyzePos( + BYTE* doneMarks, + const unsigned* suffix, U32 start, + const void* buffer, U32 minRatio, U32 notificationLevel) +{ + U32 lengthList[LLIMIT] = {0}; + U32 cumulLength[LLIMIT] = {0}; + U32 savings[LLIMIT] = {0}; + const BYTE* b = (const BYTE*)buffer; + size_t maxLength = LLIMIT; + size_t pos = (size_t)suffix[start]; + U32 end = start; + dictItem solution; + + /* init */ + memset(&solution, 0, sizeof(solution)); + doneMarks[pos] = 1; + + /* trivial repetition cases */ + if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) + ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) + ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { + /* skip and mark segment */ + U16 const pattern16 = MEM_read16(b+pos+4); + U32 u, patternEnd = 6; + while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ; + if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++; + for (u=1; u= MINMATCHLENGTH); + } + + /* look backward */ + { size_t length; + do { + length = ZDICT_count(b + pos, b + *(suffix+start-1)); + if (length >=MINMATCHLENGTH) start--; + } while(length >= MINMATCHLENGTH); + } + + /* exit if not found a minimum nb of repetitions */ + if (end-start < minRatio) { + U32 idx; + for(idx=start; idx= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos); + DISPLAYLEVEL(4, "\n"); + + for (mml = MINMATCHLENGTH ; ; mml++) { + BYTE currentChar = 0; + U32 currentCount = 0; + U32 currentID = refinedStart; + U32 id; + U32 selectedCount = 0; + U32 selectedID = currentID; + for (id =refinedStart; id < refinedEnd; id++) { + if (b[suffix[id] + mml] != currentChar) { + if (currentCount > selectedCount) { + selectedCount = currentCount; + selectedID = currentID; + } + currentID = id; + currentChar = b[ suffix[id] + mml]; + currentCount = 0; + } + currentCount ++; + } + if (currentCount > selectedCount) { /* for last */ + selectedCount = currentCount; + selectedID = currentID; + } + + if (selectedCount < minRatio) + break; + refinedStart = selectedID; + refinedEnd = refinedStart + selectedCount; + } + + /* evaluate gain based on new dict */ + start = refinedStart; + pos = suffix[refinedStart]; + end = start; + memset(lengthList, 0, sizeof(lengthList)); + + /* look forward */ + { size_t length; + do { + end++; + length = ZDICT_count(b + pos, b + suffix[end]); + if (length >= LLIMIT) length = LLIMIT-1; + lengthList[length]++; + } while (length >=MINMATCHLENGTH); + } + + /* look backward */ + { size_t length = MINMATCHLENGTH; + while ((length >= MINMATCHLENGTH) & (start > 0)) { + length = ZDICT_count(b + pos, b + suffix[start - 1]); + if (length >= LLIMIT) length = LLIMIT - 1; + lengthList[length]++; + if (length >= MINMATCHLENGTH) start--; + } + } + + /* largest useful length */ + memset(cumulLength, 0, sizeof(cumulLength)); + cumulLength[maxLength-1] = lengthList[maxLength-1]; + for (i=(int)(maxLength-2); i>=0; i--) + cumulLength[i] = cumulLength[i+1] + lengthList[i]; + + { unsigned u; + for (u=LLIMIT-1; u>=MINMATCHLENGTH; u--) if (cumulLength[u]>=minRatio) break; + maxLength = u; + } + + /* reduce maxLength in case of final into repetitive data */ + { U32 l = (U32)maxLength; + BYTE const c = b[pos + maxLength-1]; + while (b[pos+l-2]==c) l--; + maxLength = l; + } + if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */ + + /* calculate savings */ + savings[5] = 0; + { unsigned u; + for (u=MINMATCHLENGTH; u<=maxLength; u++) + savings[u] = savings[u-1] + (lengthList[u] * (u-3)); + } + + DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n", + (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength); + + solution.pos = (U32)pos; + solution.length = (U32)maxLength; + solution.savings = savings[maxLength]; + + /* mark positions done */ + { U32 id; + for (id=start; id solution.length) length = solution.length; + } + pEnd = (U32)(testedPos + length); + for (p=testedPos; ppos; + const U32 eltEnd = elt.pos + elt.length; + const char* const buf = (const char*) buffer; + + /* tail overlap */ + U32 u; for (u=1; u elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */ + /* append */ + U32 const addedLength = table[u].pos - elt.pos; + table[u].length += addedLength; + table[u].pos = elt.pos; + table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ + table[u].savings += elt.length / 8; /* rough approx bonus */ + elt = table[u]; + /* sort : improve rank */ + while ((u>1) && (table[u-1].savings < elt.savings)) + table[u] = table[u-1], u--; + table[u] = elt; + return u; + } } + + /* front overlap */ + for (u=1; u= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */ + /* append */ + int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length); /* note: can be negative */ + table[u].savings += elt.length / 8; /* rough approx bonus */ + if (addedLength > 0) { /* otherwise, elt fully included into existing */ + table[u].length += (unsigned)addedLength; + table[u].savings += elt.savings * (unsigned)addedLength / elt.length; /* rough approx */ + } + /* sort : improve rank */ + elt = table[u]; + while ((u>1) && (table[u-1].savings < elt.savings)) + table[u] = table[u-1], u--; + table[u] = elt; + return u; + } + + if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) { + if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) { + size_t const addedLength = MAX( elt.length - table[u].length , 1 ); + table[u].pos = elt.pos; + table[u].savings += (U32)(elt.savings * addedLength / elt.length); + table[u].length = MIN(elt.length, table[u].length + 1); + return u; + } + } + } + + return 0; +} + + +static void ZDICT_removeDictItem(dictItem* table, U32 id) +{ + /* convention : table[0].pos stores nb of elts */ + U32 const max = table[0].pos; + U32 u; + if (!id) return; /* protection, should never happen */ + for (u=id; upos--; +} + + +static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer) +{ + /* merge if possible */ + U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer); + if (mergeId) { + U32 newMerge = 1; + while (newMerge) { + newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer); + if (newMerge) ZDICT_removeDictItem(table, mergeId); + mergeId = newMerge; + } + return; + } + + /* insert */ + { U32 current; + U32 nextElt = table->pos; + if (nextElt >= maxSize) nextElt = maxSize-1; + current = nextElt-1; + while (table[current].savings < elt.savings) { + table[current+1] = table[current]; + current--; + } + table[current+1] = elt; + table->pos = nextElt+1; + } +} + + +static U32 ZDICT_dictSize(const dictItem* dictList) +{ + U32 u, dictSize = 0; + for (u=1; u=l) { \ + if (ZDICT_clockSpan(displayClock) > refreshRate) { \ + displayClock = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + if (notificationLevel>=4) fflush(stderr); \ + } \ + } while (0) + + /* init */ + DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ + if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) { + result = ERROR(memory_allocation); + goto _cleanup; + } + if (minRatio < MINRATIO) minRatio = MINRATIO; + memset(doneMarks, 0, bufferSize+16); + + /* limit sample set size (divsufsort limitation)*/ + if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20)); + while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles]; + + /* sort */ + DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20)); + { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, (int*)suffix, (int)bufferSize, 0); + if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; } + } + suffix[bufferSize] = (unsigned)bufferSize; /* leads into noise */ + suffix0[0] = (unsigned)bufferSize; /* leads into noise */ + /* build reverse suffix sort */ + { size_t pos; + for (pos=0; pos < bufferSize; pos++) + reverseSuffix[suffix[pos]] = (U32)pos; + /* note filePos tracks borders between samples. + It's not used at this stage, but planned to become useful in a later update */ + filePos[0] = 0; + for (pos=1; pos> 21); + } +} + + +typedef struct +{ + ZSTD_CDict* dict; /* dictionary */ + ZSTD_CCtx* zc; /* working context */ + void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ +} EStats_ress_t; + +#define MAXREPOFFSET 1024 + +static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params, + unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets, + const void* src, size_t srcSize, + U32 notificationLevel) +{ + size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog); + size_t cSize; + + if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ + { size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict); + if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; } + + } + cSize = ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); + if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; } + + if (cSize) { /* if == 0; block is not compressible */ + const SeqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); + + /* literals stats */ + { const BYTE* bytePtr; + for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) + countLit[*bytePtr]++; + } + + /* seqStats */ + { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + ZSTD_seqToCodes(seqStorePtr); + + { const BYTE* codePtr = seqStorePtr->ofCode; + U32 u; + for (u=0; umlCode; + U32 u; + for (u=0; ullCode; + U32 u; + for (u=0; u= 2) { /* rep offsets */ + const SeqDef* const seq = seqStorePtr->sequencesStart; + U32 offset1 = seq[0].offBase - ZSTD_REP_NUM; + U32 offset2 = seq[1].offBase - ZSTD_REP_NUM; + if (offset1 >= MAXREPOFFSET) offset1 = 0; + if (offset2 >= MAXREPOFFSET) offset2 = 0; + repOffsets[offset1] += 3; + repOffsets[offset2] += 1; + } } } +} + +static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles) +{ + size_t total=0; + unsigned u; + for (u=0; u0; u--) { + offsetCount_t tmp; + if (table[u-1].count >= table[u].count) break; + tmp = table[u-1]; + table[u-1] = table[u]; + table[u] = tmp; + } +} + +/* ZDICT_flatLit() : + * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals. + * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode. + */ +static void ZDICT_flatLit(unsigned* countLit) +{ + int u; + for (u=1; u<256; u++) countLit[u] = 2; + countLit[0] = 4; + countLit[253] = 1; + countLit[254] = 1; +} + +#define OFFCODE_MAX 30 /* only applicable to first block */ +static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, + int compressionLevel, + const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles, + const void* dictBuffer, size_t dictBufferSize, + unsigned notificationLevel) +{ + unsigned countLit[256]; + HUF_CREATE_STATIC_CTABLE(hufTable, 255); + unsigned offcodeCount[OFFCODE_MAX+1]; + short offcodeNCount[OFFCODE_MAX+1]; + U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB)); + unsigned matchLengthCount[MaxML+1]; + short matchLengthNCount[MaxML+1]; + unsigned litLengthCount[MaxLL+1]; + short litLengthNCount[MaxLL+1]; + U32 repOffset[MAXREPOFFSET]; + offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; + EStats_ress_t esr = { NULL, NULL, NULL }; + ZSTD_parameters params; + U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; + size_t pos = 0, errorCode; + size_t eSize = 0; + size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); + size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); + BYTE* dstPtr = (BYTE*)dstBuffer; + U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32]; + + /* init */ + DEBUGLOG(4, "ZDICT_analyzeEntropy"); + if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ + for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ + for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; + for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1; + for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1; + memset(repOffset, 0, sizeof(repOffset)); + repOffset[1] = repOffset[4] = repOffset[8] = 1; + memset(bestRepOffset, 0, sizeof(bestRepOffset)); + if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT; + params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); + + esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem); + esr.zc = ZSTD_createCCtx(); + esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); + if (!esr.dict || !esr.zc || !esr.workPlace) { + eSize = ERROR(memory_allocation); + DISPLAYLEVEL(1, "Not enough memory \n"); + goto _cleanup; + } + + /* collect stats on all samples */ + for (u=0; u= 4) { + /* writeStats */ + DISPLAYLEVEL(4, "Offset Code Frequencies : \n"); + for (u=0; u<=offcodeMax; u++) { + DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]); + } } + + /* analyze, build stats, starting with literals */ + { size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp)); + if (HUF_isError(maxNbBits)) { + eSize = maxNbBits; + DISPLAYLEVEL(1, " HUF_buildCTable error \n"); + goto _cleanup; + } + if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */ + DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n"); + ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */ + maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp)); + assert(maxNbBits==9); + } + huffLog = (U32)maxNbBits; + } + + /* looking for most common first offsets */ + { U32 offset; + for (offset=1; offset dictBufferCapacity) { + dictContentSize = dictBufferCapacity - hSize; + } + + /* Pad the dictionary content with zeros if it is too small */ + if (dictContentSize < minContentSize) { + RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall, + "dictBufferCapacity too small to fit max repcode"); + paddingSize = minContentSize - dictContentSize; + } else { + paddingSize = 0; + } + + { + size_t const dictSize = hSize + paddingSize + dictContentSize; + + /* The dictionary consists of the header, optional padding, and the content. + * The padding comes before the content because the "best" position in the + * dictionary is the last byte. + */ + BYTE* const outDictHeader = (BYTE*)dictBuffer; + BYTE* const outDictPadding = outDictHeader + hSize; + BYTE* const outDictContent = outDictPadding + paddingSize; + + assert(dictSize <= dictBufferCapacity); + assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize); + + /* First copy the customDictContent into its final location. + * `customDictContent` and `dictBuffer` may overlap, so we must + * do this before any other writes into the output buffer. + * Then copy the header & padding into the output buffer. + */ + memmove(outDictContent, customDictContent, dictContentSize); + memcpy(outDictHeader, header, hSize); + memset(outDictPadding, 0, paddingSize); + + return dictSize; + } +} + + +static size_t ZDICT_addEntropyTablesFromBuffer_advanced( + void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_params_t params) +{ + int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel; + U32 const notificationLevel = params.notificationLevel; + size_t hSize = 8; + + /* calculate entropy tables */ + DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ + DISPLAYLEVEL(2, "statistics ... \n"); + { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, + compressionLevel, + samplesBuffer, samplesSizes, nbSamples, + (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, + notificationLevel); + if (ZDICT_isError(eSize)) return eSize; + hSize += eSize; + } + + /* add dictionary header (after entropy tables) */ + MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY); + { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); + U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; + U32 const dictID = params.dictID ? params.dictID : compliantID; + MEM_writeLE32((char*)dictBuffer+4, dictID); + } + + if (hSize + dictContentSize < dictBufferCapacity) + memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); + return MIN(dictBufferCapacity, hSize+dictContentSize); +} + +/*! ZDICT_trainFromBuffer_unsafe_legacy() : +* Warning : `samplesBuffer` must be followed by noisy guard band !!! +* @return : size of dictionary, or an error code which can be tested with ZDICT_isError() +*/ +static size_t ZDICT_trainFromBuffer_unsafe_legacy( + void* dictBuffer, size_t maxDictSize, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_legacy_params_t params) +{ + U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16)); + dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); + unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel; + unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity; + size_t const targetDictSize = maxDictSize; + size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); + size_t dictSize = 0; + U32 const notificationLevel = params.zParams.notificationLevel; + + /* checks */ + if (!dictList) return ERROR(memory_allocation); + if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */ + if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */ + + /* init */ + ZDICT_initDictItem(dictList); + + /* build dictionary */ + ZDICT_trainBuffer_legacy(dictList, dictListSize, + samplesBuffer, samplesBuffSize, + samplesSizes, nbSamples, + minRep, notificationLevel); + + /* display best matches */ + if (params.zParams.notificationLevel>= 3) { + unsigned const nb = MIN(25, dictList[0].pos); + unsigned const dictContentSize = ZDICT_dictSize(dictList); + unsigned u; + DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize); + DISPLAYLEVEL(3, "list %u best segments \n", nb-1); + for (u=1; u samplesBuffSize) || ((pos + length) > samplesBuffSize)) { + free(dictList); + return ERROR(GENERIC); /* should never happen */ + } + DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", + u, length, pos, (unsigned)dictList[u].savings); + ZDICT_printHex((const char*)samplesBuffer+pos, printedLength); + DISPLAYLEVEL(3, "| \n"); + } } + + + /* create dictionary */ + { unsigned dictContentSize = ZDICT_dictSize(dictList); + if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */ + if (dictContentSize < targetDictSize/4) { + DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize); + if (samplesBuffSize < 10 * targetDictSize) + DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20)); + if (minRep > MINRATIO) { + DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); + DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); + } + } + + if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) { + unsigned proposedSelectivity = selectivity-1; + while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; } + DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize); + DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity); + DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n"); + } + + /* limit dictionary size */ + { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */ + U32 currentSize = 0; + U32 n; for (n=1; n targetDictSize) { currentSize -= dictList[n].length; break; } + } + dictList->pos = n; + dictContentSize = currentSize; + } + + /* build dict content */ + { U32 u; + BYTE* ptr = (BYTE*)dictBuffer + maxDictSize; + for (u=1; upos; u++) { + U32 l = dictList[u].length; + ptr -= l; + if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */ + memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); + } } + + dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, + samplesBuffer, samplesSizes, nbSamples, + params.zParams); + } + + /* clean up */ + free(dictList); + return dictSize; +} + + +/* ZDICT_trainFromBuffer_legacy() : + * issue : samplesBuffer need to be followed by a noisy guard band. + * work around : duplicate the buffer, and add the noise */ +size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_legacy_params_t params) +{ + size_t result; + void* newBuff; + size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); + if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */ + + newBuff = malloc(sBuffSize + NOISELENGTH); + if (!newBuff) return ERROR(memory_allocation); + + memcpy(newBuff, samplesBuffer, sBuffSize); + ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ + + result = + ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff, + samplesSizes, nbSamples, params); + free(newBuff); + return result; +} + + +size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) +{ + ZDICT_fastCover_params_t params; + DEBUGLOG(3, "ZDICT_trainFromBuffer"); + memset(¶ms, 0, sizeof(params)); + params.d = 8; + params.steps = 4; + /* Use default level since no compression level information is available */ + params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; +#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1) + params.zParams.notificationLevel = DEBUGLEVEL; +#endif + return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity, + samplesBuffer, samplesSizes, nbSamples, + ¶ms); +} + +size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) +{ + ZDICT_params_t params; + memset(¶ms, 0, sizeof(params)); + return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, + samplesBuffer, samplesSizes, nbSamples, + params); +} diff --git a/lib/legacy/zstd_v02.c b/lib/legacy/zstd_v02.c new file mode 100644 index 0000000..d1e0038 --- /dev/null +++ b/lib/legacy/zstd_v02.c @@ -0,0 +1,3465 @@ +/* + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +#include /* size_t, ptrdiff_t */ +#include "zstd_v02.h" +#include "../common/compiler.h" +#include "../common/error_private.h" + + +/****************************************** +* Compiler-specific +******************************************/ +#if defined(_MSC_VER) /* Visual Studio */ +# include /* _byteswap_ulong */ +# include /* _byteswap_* */ +#endif + + +/* ****************************************************************** + mem.h + low-level memory access routines + Copyright (C) 2013-2015, Yann Collet. + + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ +#ifndef MEM_H_MODULE +#define MEM_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + +/****************************************** +* Includes +******************************************/ +#include /* size_t, ptrdiff_t */ +#include /* memcpy */ + + +/**************************************************************** +* Basic Types +*****************************************************************/ +#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef int16_t S16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef int64_t S64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef signed short S16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef signed long long S64; +#endif + + +/**************************************************************** +* Memory I/O +*****************************************************************/ + +MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; } +MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; } + +MEM_STATIC unsigned MEM_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + +MEM_STATIC U16 MEM_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U32 MEM_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U64 MEM_read64(const void* memPtr) +{ + U64 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC U16 MEM_readLE16(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read16(memPtr); + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U16)(p[0] + (p[1]<<8)); + } +} + +MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val) +{ + if (MEM_isLittleEndian()) + { + MEM_write16(memPtr, val); + } + else + { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)val; + p[1] = (BYTE)(val>>8); + } +} + +MEM_STATIC U32 MEM_readLE24(const void* memPtr) +{ + return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16); +} + +MEM_STATIC U32 MEM_readLE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read32(memPtr); + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24)); + } +} + + +MEM_STATIC U64 MEM_readLE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read64(memPtr); + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24) + + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56)); + } +} + + +MEM_STATIC size_t MEM_readLEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readLE32(memPtr); + else + return (size_t)MEM_readLE64(memPtr); +} + +#if defined (__cplusplus) +} +#endif + +#endif /* MEM_H_MODULE */ + + +/* ****************************************************************** + bitstream + Part of NewGen Entropy library + header file (to include) + Copyright (C) 2013-2015, Yann Collet. + + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ +#ifndef BITSTREAM_H_MODULE +#define BITSTREAM_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* +* This API consists of small unitary functions, which highly benefit from being inlined. +* Since link-time-optimization is not available for all compilers, +* these functions are defined into a .h to be included. +*/ + + +/********************************************** +* bitStream decompression API (read backward) +**********************************************/ +typedef struct +{ + size_t bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; +} BIT_DStream_t; + +typedef enum { BIT_DStream_unfinished = 0, + BIT_DStream_endOfBuffer = 1, + BIT_DStream_completed = 2, + BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ + /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ + +MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); +MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); +MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + +/****************************************** +* unsafe API +******************************************/ +MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); +/* faster, but works only if nbBits >= 1 */ + + + +/**************************************************************** +* Helper functions +****************************************************************/ +MEM_STATIC unsigned BIT_highbit32 (U32 val) +{ +# if defined(_MSC_VER) /* Visual */ + unsigned long r; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ + return __builtin_clz (val) ^ 31; +# else /* Software version */ + static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; + U32 v = val; + unsigned r; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; + return r; +# endif +} + + + +/********************************************************** +* bitStream decoding +**********************************************************/ + +/*!BIT_initDStream +* Initialize a BIT_DStream_t. +* @bitD : a pointer to an already allocated BIT_DStream_t structure +* @srcBuffer must point at the beginning of a bitStream +* @srcSize must be the exact size of the bitStream +* @result : size of stream (== srcSize) or an errorCode if a problem is detected +*/ +MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize) +{ + if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); } + + if (srcSize >= sizeof(size_t)) /* normal case */ + { + U32 contain32; + bitD->start = (const char*)srcBuffer; + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(size_t); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + contain32 = ((const BYTE*)srcBuffer)[srcSize-1]; + if (contain32 == 0) return ERROR(GENERIC); /* endMark not present */ + bitD->bitsConsumed = 8 - BIT_highbit32(contain32); + } + else + { + U32 contain32; + bitD->start = (const char*)srcBuffer; + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { + case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16); + /* fallthrough */ + case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24); + /* fallthrough */ + case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32); + /* fallthrough */ + case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24; + /* fallthrough */ + case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16; + /* fallthrough */ + case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) << 8; + /* fallthrough */ + default:; + } + contain32 = ((const BYTE*)srcBuffer)[srcSize-1]; + if (contain32 == 0) return ERROR(GENERIC); /* endMark not present */ + bitD->bitsConsumed = 8 - BIT_highbit32(contain32); + bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8; + } + + return srcSize; +} + +MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits) +{ + const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1; + return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask); +} + +/*! BIT_lookBitsFast : +* unsafe version; only works if nbBits >= 1 */ +MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits) +{ + const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1; + return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask); +} + +MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) +{ + bitD->bitsConsumed += nbBits; +} + +MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits) +{ + size_t value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*!BIT_readBitsFast : +* unsafe version; only works if nbBits >= 1 */ +MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits) +{ + size_t value = BIT_lookBitsFast(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; +} + +MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) +{ + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* should never happen */ + return BIT_DStream_overflow; + + if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) + { + bitD->ptr -= bitD->bitsConsumed >> 3; + bitD->bitsConsumed &= 7; + bitD->bitContainer = MEM_readLEST(bitD->ptr); + return BIT_DStream_unfinished; + } + if (bitD->ptr == bitD->start) + { + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } + { + U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) + { + nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */ + result = BIT_DStream_endOfBuffer; + } + bitD->ptr -= nbBytes; + bitD->bitsConsumed -= nbBytes*8; + bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD) */ + return result; + } +} + +/*! BIT_endOfDStream +* @return Tells if DStream has reached its exact end +*/ +MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) +{ + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); +} + +#if defined (__cplusplus) +} +#endif + +#endif /* BITSTREAM_H_MODULE */ +/* ****************************************************************** + Error codes and messages + Copyright (C) 2013-2015, Yann Collet + + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ +#ifndef ERROR_H_MODULE +#define ERROR_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + + +/****************************************** +* Compiler-specific +******************************************/ +#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define ERR_STATIC static inline +#elif defined(_MSC_VER) +# define ERR_STATIC static __inline +#elif defined(__GNUC__) +# define ERR_STATIC static __attribute__((unused)) +#else +# define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif + + +/****************************************** +* Error Management +******************************************/ +#define PREFIX(name) ZSTD_error_##name + +#define ERROR(name) (size_t)-PREFIX(name) + +#define ERROR_LIST(ITEM) \ + ITEM(PREFIX(No_Error)) ITEM(PREFIX(GENERIC)) \ + ITEM(PREFIX(dstSize_tooSmall)) ITEM(PREFIX(srcSize_wrong)) \ + ITEM(PREFIX(prefix_unknown)) ITEM(PREFIX(corruption_detected)) \ + ITEM(PREFIX(tableLog_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooSmall)) \ + ITEM(PREFIX(maxCode)) + +#define ERROR_GENERATE_ENUM(ENUM) ENUM, +typedef enum { ERROR_LIST(ERROR_GENERATE_ENUM) } ERR_codes; /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */ + +#define ERROR_CONVERTTOSTRING(STRING) #STRING, +#define ERROR_GENERATE_STRING(EXPR) ERROR_CONVERTTOSTRING(EXPR) +static const char* ERR_strings[] = { ERROR_LIST(ERROR_GENERATE_STRING) }; + +ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + +ERR_STATIC const char* ERR_getErrorName(size_t code) +{ + static const char* codeError = "Unspecified error code"; + if (ERR_isError(code)) return ERR_strings[-(int)(code)]; + return codeError; +} + + +#if defined (__cplusplus) +} +#endif + +#endif /* ERROR_H_MODULE */ +/* +Constructor and Destructor of type FSE_CTable + Note that its size depends on 'tableLog' and 'maxSymbolValue' */ +typedef unsigned FSE_CTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ + + +/* ****************************************************************** + FSE : Finite State Entropy coder + header file for static linking (only) + Copyright (C) 2013-2015, Yann Collet + + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ +#if defined (__cplusplus) +extern "C" { +#endif + + +/****************************************** +* Static allocation +******************************************/ +/* FSE buffer bounds */ +#define FSE_NCOUNTBOUND 512 +#define FSE_BLOCKBOUND(size) (size + (size>>7)) +#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */ +#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2)) +#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1<= 1 (otherwise, result will be corrupted) */ + + +/****************************************** +* Implementation of inline functions +******************************************/ + +/* decompression */ + +typedef struct { + U16 tableLog; + U16 fastMode; +} FSE_DTableHeader; /* sizeof U32 */ + +typedef struct +{ + unsigned short newState; + unsigned char symbol; + unsigned char nbBits; +} FSE_decode_t; /* size == U32 */ + +MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt) +{ + FSE_DTableHeader DTableH; + memcpy(&DTableH, dt, sizeof(DTableH)); + DStatePtr->state = BIT_readBits(bitD, DTableH.tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + const U32 nbBits = DInfo.nbBits; + BYTE symbol = DInfo.symbol; + size_t lowBits = BIT_readBits(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + const U32 nbBits = DInfo.nbBits; + BYTE symbol = DInfo.symbol; + size_t lowBits = BIT_readBitsFast(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) +{ + return DStatePtr->state == 0; +} + + +#if defined (__cplusplus) +} +#endif +/* ****************************************************************** + Huff0 : Huffman coder, part of New Generation Entropy library + header file for static linking (only) + Copyright (C) 2013-2015, Yann Collet + + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/****************************************** +* Static allocation macros +******************************************/ +/* Huff0 buffer bounds */ +#define HUF_CTABLEBOUND 129 +#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8) /* only true if incompressible pre-filtered with fast heuristic */ +#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* static allocation of Huff0's DTable */ +#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1< /* size_t */ + + +/* ************************************* +* Version +***************************************/ +#define ZSTD_VERSION_MAJOR 0 /* for breaking interface changes */ +#define ZSTD_VERSION_MINOR 2 /* for new (non-breaking) interface capabilities */ +#define ZSTD_VERSION_RELEASE 2 /* for tweaks, bug-fixes, or development */ +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; /* incomplete type */ + +#if defined (__cplusplus) +} +#endif +/* + zstd - standard compression library + Header File for static linking only + Copyright (C) 2014-2015, Yann Collet. + + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - zstd source repository : https://github.com/Cyan4973/zstd + - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +/* The objects defined into this file should be considered experimental. + * They are not labelled stable, as their prototype may change in the future. + * You can use them for tests, provide feedback, or if you can endure risk of future changes. + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Streaming functions +***************************************/ + +typedef struct ZSTDv02_Dctx_s ZSTD_DCtx; + +/* + Use above functions alternatively. + ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. + Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTD_magicNumber 0xFD2FB522 /* v0.2 (current)*/ + + +#if defined (__cplusplus) +} +#endif +/* ****************************************************************** + FSE : Finite State Entropy coder + Copyright (C) 2013-2015, Yann Collet. + + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ + +#ifndef FSE_COMMONDEFS_ONLY + +/**************************************************************** +* Tuning parameters +****************************************************************/ +/* MEMORY_USAGE : +* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +* Increasing memory usage improves compression ratio +* Reduced memory usage can improve speed, due to cache effect +* Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ +#define FSE_MAX_MEMORY_USAGE 14 +#define FSE_DEFAULT_MEMORY_USAGE 13 + +/* FSE_MAX_SYMBOL_VALUE : +* Maximum symbol value authorized. +* Required for proper stack allocation */ +#define FSE_MAX_SYMBOL_VALUE 255 + + +/**************************************************************** +* template functions type & suffix +****************************************************************/ +#define FSE_FUNCTION_TYPE BYTE +#define FSE_FUNCTION_EXTENSION + + +/**************************************************************** +* Byte symbol type +****************************************************************/ +#endif /* !FSE_COMMONDEFS_ONLY */ + + +/**************************************************************** +* Compiler specifics +****************************************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# include /* For Visual 2005 */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */ +#else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + +/**************************************************************** +* Includes +****************************************************************/ +#include /* malloc, free, qsort */ +#include /* memcpy, memset */ +#include /* printf (debug) */ + +/**************************************************************** +* Constants +*****************************************************************/ +#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE-2) +#define FSE_MAX_TABLESIZE (1U< FSE_TABLELOG_ABSOLUTE_MAX +#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported" +#endif + + +/**************************************************************** +* Error Management +****************************************************************/ +#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ + + +/**************************************************************** +* Complex types +****************************************************************/ +typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; + + +/**************************************************************** +* Templates +****************************************************************/ +/* + designed to be included + for type-specific functions (template emulation in C) + Objective is to write these functions only once, for improved maintenance +*/ + +/* safety checks */ +#ifndef FSE_FUNCTION_EXTENSION +# error "FSE_FUNCTION_EXTENSION must be defined" +#endif +#ifndef FSE_FUNCTION_TYPE +# error "FSE_FUNCTION_TYPE must be defined" +#endif + +/* Function names */ +#define FSE_CAT(X,Y) X##Y +#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) +#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + + +/* Function templates */ + +#define FSE_DECODE_TYPE FSE_decode_t + +static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; } + +static size_t FSE_buildDTable +(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +{ + void* ptr = dt+1; + FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)ptr; + FSE_DTableHeader DTableH; + const U32 tableSize = 1 << tableLog; + const U32 tableMask = tableSize-1; + const U32 step = FSE_tableStep(tableSize); + U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1]; + U32 position = 0; + U32 highThreshold = tableSize-1; + const S16 largeLimit= (S16)(1 << (tableLog-1)); + U32 noLarge = 1; + U32 s; + + /* Sanity Checks */ + if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge); + if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); + + /* Init, lay down lowprob symbols */ + DTableH.tableLog = (U16)tableLog; + for (s=0; s<=maxSymbolValue; s++) + { + if (normalizedCounter[s]==-1) + { + tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s; + symbolNext[s] = 1; + } + else + { + if (normalizedCounter[s] >= largeLimit) noLarge=0; + symbolNext[s] = normalizedCounter[s]; + } + } + + /* Spread symbols */ + for (s=0; s<=maxSymbolValue; s++) + { + int i; + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ + } + } + + if (position!=0) return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + + /* Build Decoding table */ + { + U32 i; + for (i=0; i FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge); + bitStream >>= 4; + bitCount = 4; + *tableLogPtr = nbBits; + remaining = (1<1) && (charnum<=*maxSVPtr)) + { + if (previous0) + { + unsigned n0 = charnum; + while ((bitStream & 0xFFFF) == 0xFFFF) + { + n0+=24; + if (ip < iend-5) + { + ip+=2; + bitStream = MEM_readLE32(ip) >> bitCount; + } + else + { + bitStream >>= 16; + bitCount+=16; + } + } + while ((bitStream & 3) == 3) + { + n0+=3; + bitStream>>=2; + bitCount+=2; + } + n0 += bitStream & 3; + bitCount += 2; + if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall); + while (charnum < n0) normalizedCounter[charnum++] = 0; + if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) + { + ip += bitCount>>3; + bitCount &= 7; + bitStream = MEM_readLE32(ip) >> bitCount; + } + else + bitStream >>= 2; + } + { + const short max = (short)((2*threshold-1)-remaining); + short count; + + if ((bitStream & (threshold-1)) < (U32)max) + { + count = (short)(bitStream & (threshold-1)); + bitCount += nbBits-1; + } + else + { + count = (short)(bitStream & (2*threshold-1)); + if (count >= threshold) count -= max; + bitCount += nbBits; + } + + count--; /* extra accuracy */ + remaining -= FSE_abs(count); + normalizedCounter[charnum++] = count; + previous0 = !count; + while (remaining < threshold) + { + nbBits--; + threshold >>= 1; + } + + { + if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) + { + ip += bitCount>>3; + bitCount &= 7; + } + else + { + bitCount -= (int)(8 * (iend - 4 - ip)); + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> (bitCount & 31); + } + } + } + if (remaining != 1) return ERROR(GENERIC); + *maxSVPtr = charnum-1; + + ip += (bitCount+7)>>3; + if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong); + return ip-istart; +} + + +/********************************************************* +* Decompression (Byte symbols) +*********************************************************/ +static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue) +{ + void* ptr = dt; + FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; + FSE_decode_t* const cell = (FSE_decode_t*)(ptr) + 1; /* because dt is unsigned */ + + DTableH->tableLog = 0; + DTableH->fastMode = 0; + + cell->newState = 0; + cell->symbol = symbolValue; + cell->nbBits = 0; + + return 0; +} + + +static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +{ + void* ptr = dt; + FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; + FSE_decode_t* const dinfo = (FSE_decode_t*)(ptr) + 1; /* because dt is unsigned */ + const unsigned tableSize = 1 << nbBits; + const unsigned tableMask = tableSize - 1; + const unsigned maxSymbolValue = tableMask; + unsigned s; + + /* Sanity checks */ + if (nbBits < 1) return ERROR(GENERIC); /* min size */ + + /* Build Decoding Table */ + DTableH->tableLog = (U16)nbBits; + DTableH->fastMode = 1; + for (s=0; s<=maxSymbolValue; s++) + { + dinfo[s].newState = 0; + dinfo[s].symbol = (BYTE)s; + dinfo[s].nbBits = (BYTE)nbBits; + } + + return 0; +} + +FORCE_INLINE size_t FSE_decompress_usingDTable_generic( + void* dst, size_t maxDstSize, + const void* cSrc, size_t cSrcSize, + const FSE_DTable* dt, const unsigned fast) +{ + BYTE* const ostart = (BYTE*) dst; + BYTE* op = ostart; + BYTE* const omax = op + maxDstSize; + BYTE* const olimit = omax-3; + + BIT_DStream_t bitD; + FSE_DState_t state1; + FSE_DState_t state2; + size_t errorCode; + + /* Init */ + errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize); /* replaced last arg by maxCompressed Size */ + if (FSE_isError(errorCode)) return errorCode; + + FSE_initDState(&state1, &bitD, dt); + FSE_initDState(&state2, &bitD, dt); + +#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD) + + /* 4 symbols per loop */ + for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op sizeof(bitD.bitContainer)*8) /* This test must be static */ + BIT_reloadDStream(&bitD); + + op[1] = FSE_GETSYMBOL(&state2); + + if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ + { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } } + + op[2] = FSE_GETSYMBOL(&state1); + + if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ + BIT_reloadDStream(&bitD); + + op[3] = FSE_GETSYMBOL(&state2); + } + + /* tail */ + /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */ + while (1) + { + if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) ) + break; + + *op++ = FSE_GETSYMBOL(&state1); + + if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) ) + break; + + *op++ = FSE_GETSYMBOL(&state2); + } + + /* end ? */ + if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2)) + return op-ostart; + + if (op==omax) return ERROR(dstSize_tooSmall); /* dst buffer is full, but cSrc unfinished */ + + return ERROR(corruption_detected); +} + + +static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize, + const void* cSrc, size_t cSrcSize, + const FSE_DTable* dt) +{ + FSE_DTableHeader DTableH; + memcpy(&DTableH, dt, sizeof(DTableH)); + + /* select fast mode (static) */ + if (DTableH.fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); + return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +} + + +static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize) +{ + const BYTE* const istart = (const BYTE*)cSrc; + const BYTE* ip = istart; + short counting[FSE_MAX_SYMBOL_VALUE+1]; + DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */ + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + size_t errorCode; + + if (cSrcSize<2) return ERROR(srcSize_wrong); /* too small input size */ + + /* normal FSE decoding mode */ + errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize); + if (FSE_isError(errorCode)) return errorCode; + if (errorCode >= cSrcSize) return ERROR(srcSize_wrong); /* too small input size */ + ip += errorCode; + cSrcSize -= errorCode; + + errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog); + if (FSE_isError(errorCode)) return errorCode; + + /* always return, even if it is an error code */ + return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt); +} + + + +#endif /* FSE_COMMONDEFS_ONLY */ +/* ****************************************************************** + Huff0 : Huffman coder, part of New Generation Entropy library + Copyright (C) 2013-2015, Yann Collet. + + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - FSE+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy + - Public forum : https://groups.google.com/forum/#!forum/lz4c +****************************************************************** */ + +/**************************************************************** +* Compiler specifics +****************************************************************/ +#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +/* inline is defined */ +#elif defined(_MSC_VER) +# define inline __inline +#else +# define inline /* disable inline */ +#endif + + +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + + +/**************************************************************** +* Includes +****************************************************************/ +#include /* malloc, free, qsort */ +#include /* memcpy, memset */ +#include /* printf (debug) */ + +/**************************************************************** +* Error Management +****************************************************************/ +#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ + + +/****************************************** +* Helper functions +******************************************/ +static unsigned HUF_isError(size_t code) { return ERR_isError(code); } + +#define HUF_ABSOLUTEMAX_TABLELOG 16 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ +#define HUF_MAX_TABLELOG 12 /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ +#define HUF_DEFAULT_TABLELOG HUF_MAX_TABLELOG /* tableLog by default, when not specified */ +#define HUF_MAX_SYMBOL_VALUE 255 +#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG) +# error "HUF_MAX_TABLELOG is too large !" +#endif + + + +/********************************************************* +* Huff0 : Huffman block decompression +*********************************************************/ +typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2; /* single-symbol decoding */ + +typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */ + +typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; + +/*! HUF_readStats + Read compact Huffman tree, saved by HUF_writeCTable + @huffWeight : destination buffer + @return : size read from `src` +*/ +static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize) +{ + U32 weightTotal; + U32 tableLog; + const BYTE* ip = (const BYTE*) src; + size_t iSize; + size_t oSize; + U32 n; + + if (!srcSize) return ERROR(srcSize_wrong); + iSize = ip[0]; + //memset(huffWeight, 0, hwSize); /* is not necessary, even though some analyzer complain ... */ + + if (iSize >= 128) /* special header */ + { + if (iSize >= (242)) /* RLE */ + { + static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 }; + oSize = l[iSize-242]; + memset(huffWeight, 1, hwSize); + iSize = 0; + } + else /* Incompressible */ + { + oSize = iSize - 127; + iSize = ((oSize+1)/2); + if (iSize+1 > srcSize) return ERROR(srcSize_wrong); + if (oSize >= hwSize) return ERROR(corruption_detected); + ip += 1; + for (n=0; n> 4; + huffWeight[n+1] = ip[n/2] & 15; + } + } + } + else /* header compressed with FSE (normal case) */ + { + if (iSize+1 > srcSize) return ERROR(srcSize_wrong); + oSize = FSE_decompress(huffWeight, hwSize-1, ip+1, iSize); /* max (hwSize-1) values decoded, as last one is implied */ + if (FSE_isError(oSize)) return oSize; + } + + /* collect weight stats */ + memset(rankStats, 0, (HUF_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32)); + weightTotal = 0; + for (n=0; n= HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected); + rankStats[huffWeight[n]]++; + weightTotal += (1 << huffWeight[n]) >> 1; + } + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ + tableLog = BIT_highbit32(weightTotal) + 1; + if (tableLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected); + { + U32 total = 1 << tableLog; + U32 rest = total - weightTotal; + U32 verif = 1 << BIT_highbit32(rest); + U32 lastWeight = BIT_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; + } + + /* check tree construction validity */ + if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */ + + /* results */ + *nbSymbolsPtr = (U32)(oSize+1); + *tableLogPtr = tableLog; + return iSize+1; +} + + +/**************************/ +/* single-symbol decoding */ +/**************************/ + +static size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize) +{ + BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1]; + U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1]; /* large enough for values from 0 to 16 */ + U32 tableLog = 0; + const BYTE* ip = (const BYTE*) src; + size_t iSize = ip[0]; + U32 nbSymbols = 0; + U32 n; + U32 nextRankStart; + void* ptr = DTable+1; + HUF_DEltX2* const dt = (HUF_DEltX2*)ptr; + + HUF_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U16)); /* if compilation fails here, assertion is false */ + //memset(huffWeight, 0, sizeof(huffWeight)); /* is not necessary, even though some analyzer complain ... */ + + iSize = HUF_readStats(huffWeight, HUF_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); + if (HUF_isError(iSize)) return iSize; + + /* check result */ + if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge); /* DTable is too small */ + DTable[0] = (U16)tableLog; /* maybe should separate sizeof DTable, as allocated, from used size of DTable, in case of DTable re-use */ + + /* Prepare ranks */ + nextRankStart = 0; + for (n=1; n<=tableLog; n++) + { + U32 current = nextRankStart; + nextRankStart += (rankVal[n] << (n-1)); + rankVal[n] = current; + } + + /* fill DTable */ + for (n=0; n> 1; + U32 i; + HUF_DEltX2 D; + D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w); + for (i = rankVal[w]; i < rankVal[w] + length; i++) + dt[i] = D; + rankVal[w] += length; + } + + return iSize; +} + +static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog) +{ + const size_t val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ + const BYTE c = dt[val].byte; + BIT_skipBits(Dstream, dt[val].nbBits); + return c; +} + +#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ + *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \ + HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) + +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) + +static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 4 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4)) + { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + + /* closer to the end */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + /* no more data to retrieve from bitstream, hence no need to reload */ + while (p < pEnd) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + return pEnd-pStart; +} + + +static size_t HUF_decompress4X2_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const U16* DTable) +{ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { + const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + + const void* ptr = DTable; + const HUF_DEltX2* const dt = ((const HUF_DEltX2*)ptr) +1; + const U32 dtLog = DTable[0]; + size_t errorCode; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + const size_t length1 = MEM_readLE16(istart); + const size_t length2 = MEM_readLE16(istart+2); + const size_t length3 = MEM_readLE16(istart+4); + size_t length4; + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + const size_t segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + U32 endSignal; + + length4 = cSrcSize - (length1 + length2 + length3 + 6); + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + errorCode = BIT_initDStream(&bitD1, istart1, length1); + if (HUF_isError(errorCode)) return errorCode; + errorCode = BIT_initDStream(&bitD2, istart2, length2); + if (HUF_isError(errorCode)) return errorCode; + errorCode = BIT_initDStream(&bitD3, istart3, length3); + if (HUF_isError(errorCode)) return errorCode; + errorCode = BIT_initDStream(&bitD4, istart4, length4); + if (HUF_isError(errorCode)) return errorCode; + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) + { + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + } + + /* check corruption */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 supposed already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); + + /* check */ + endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endSignal) return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; + } +} + + +static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_MAX_TABLELOG); + const BYTE* ip = (const BYTE*) cSrc; + size_t errorCode; + + errorCode = HUF_readDTableX2 (DTable, cSrc, cSrcSize); + if (HUF_isError(errorCode)) return errorCode; + if (errorCode >= cSrcSize) return ERROR(srcSize_wrong); + ip += errorCode; + cSrcSize -= errorCode; + + return HUF_decompress4X2_usingDTable (dst, dstSize, ip, cSrcSize, DTable); +} + + +/***************************/ +/* double-symbols decoding */ +/***************************/ + +static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed, + const U32* rankValOrigin, const int minWeight, + const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, + U32 nbBitsBaseline, U16 baseSeq) +{ + HUF_DEltX4 DElt; + U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1]; + U32 s; + + /* get pre-calculated rankVal */ + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill skipped values */ + if (minWeight>1) + { + U32 i, skipSize = rankVal[minWeight]; + MEM_writeLE16(&(DElt.sequence), baseSeq); + DElt.nbBits = (BYTE)(consumed); + DElt.length = 1; + for (i = 0; i < skipSize; i++) + DTable[i] = DElt; + } + + /* fill DTable */ + for (s=0; s= 1 */ + + rankVal[weight] += length; + } +} + +typedef U32 rankVal_t[HUF_ABSOLUTEMAX_TABLELOG][HUF_ABSOLUTEMAX_TABLELOG + 1]; + +static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, const U32 sortedListSize, + const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) +{ + U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1]; + const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ + const U32 minBits = nbBitsBaseline - maxWeight; + U32 s; + + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill DTable */ + for (s=0; s= minBits) /* enough room for a second symbol */ + { + U32 sortedRank; + int minWeight = nbBits + scaleLog; + if (minWeight < 1) minWeight = 1; + sortedRank = rankStart[minWeight]; + HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits, + rankValOrigin[nbBits], minWeight, + sortedList+sortedRank, sortedListSize-sortedRank, + nbBitsBaseline, symbol); + } + else + { + U32 i; + const U32 end = start + length; + HUF_DEltX4 DElt; + + MEM_writeLE16(&(DElt.sequence), symbol); + DElt.nbBits = (BYTE)(nbBits); + DElt.length = 1; + for (i = start; i < end; i++) + DTable[i] = DElt; + } + rankVal[weight] += length; + } +} + +static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize) +{ + BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1]; + sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1]; + U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 }; + U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 }; + U32* const rankStart = rankStart0+1; + rankVal_t rankVal; + U32 tableLog, maxW, sizeOfSort, nbSymbols; + const U32 memLog = DTable[0]; + const BYTE* ip = (const BYTE*) src; + size_t iSize = ip[0]; + void* ptr = DTable; + HUF_DEltX4* const dt = ((HUF_DEltX4*)ptr) + 1; + + HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(U32)); /* if compilation fails here, assertion is false */ + if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge); + //memset(weightList, 0, sizeof(weightList)); /* is not necessary, even though some analyzer complain ... */ + + iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); + if (HUF_isError(iSize)) return iSize; + + /* check result */ + if (tableLog > memLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + + /* find maxWeight */ + for (maxW = tableLog; rankStats[maxW]==0; maxW--) + {if (!maxW) return ERROR(GENERIC); } /* necessarily finds a solution before maxW==0 */ + + /* Get start index of each weight */ + { + U32 w, nextRankStart = 0; + for (w=1; w<=maxW; w++) + { + U32 current = nextRankStart; + nextRankStart += rankStats[w]; + rankStart[w] = current; + } + rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ + sizeOfSort = nextRankStart; + } + + /* sort symbols by weight */ + { + U32 s; + for (s=0; s> consumed; + } + } + } + + HUF_fillDTableX4(dt, memLog, + sortedSymbol, sizeOfSort, + rankStart0, rankVal, maxW, + tableLog+1); + + return iSize; +} + + +static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) +{ + const size_t val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 2); + BIT_skipBits(DStream, dt[val].nbBits); + return dt[val].length; +} + +static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) +{ + const size_t val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 1); + if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); + else + { + if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) + { + BIT_skipBits(DStream, dt[val].nbBits); + if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) + DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ + } + } + return 1; +} + + +#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd-7)) + { + HUF_DECODE_SYMBOLX4_2(p, bitDPtr); + HUF_DECODE_SYMBOLX4_1(p, bitDPtr); + HUF_DECODE_SYMBOLX4_2(p, bitDPtr); + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); + } + + /* closer to the end */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-2)) + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); + + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + + if (p < pEnd) + p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog); + + return p-pStart; +} + + + +static size_t HUF_decompress4X4_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const U32* DTable) +{ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { + const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + + const void* ptr = DTable; + const HUF_DEltX4* const dt = ((const HUF_DEltX4*)ptr) +1; + const U32 dtLog = DTable[0]; + size_t errorCode; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + const size_t length1 = MEM_readLE16(istart); + const size_t length2 = MEM_readLE16(istart+2); + const size_t length3 = MEM_readLE16(istart+4); + size_t length4; + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + const size_t segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + U32 endSignal; + + length4 = cSrcSize - (length1 + length2 + length3 + 6); + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + errorCode = BIT_initDStream(&bitD1, istart1, length1); + if (HUF_isError(errorCode)) return errorCode; + errorCode = BIT_initDStream(&bitD2, istart2, length2); + if (HUF_isError(errorCode)) return errorCode; + errorCode = BIT_initDStream(&bitD3, istart3, length3); + if (HUF_isError(errorCode)) return errorCode; + errorCode = BIT_initDStream(&bitD4, istart4, length4); + if (HUF_isError(errorCode)) return errorCode; + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) + { + HUF_DECODE_SYMBOLX4_2(op1, &bitD1); + HUF_DECODE_SYMBOLX4_2(op2, &bitD2); + HUF_DECODE_SYMBOLX4_2(op3, &bitD3); + HUF_DECODE_SYMBOLX4_2(op4, &bitD4); + HUF_DECODE_SYMBOLX4_1(op1, &bitD1); + HUF_DECODE_SYMBOLX4_1(op2, &bitD2); + HUF_DECODE_SYMBOLX4_1(op3, &bitD3); + HUF_DECODE_SYMBOLX4_1(op4, &bitD4); + HUF_DECODE_SYMBOLX4_2(op1, &bitD1); + HUF_DECODE_SYMBOLX4_2(op2, &bitD2); + HUF_DECODE_SYMBOLX4_2(op3, &bitD3); + HUF_DECODE_SYMBOLX4_2(op4, &bitD4); + HUF_DECODE_SYMBOLX4_0(op1, &bitD1); + HUF_DECODE_SYMBOLX4_0(op2, &bitD2); + HUF_DECODE_SYMBOLX4_0(op3, &bitD3); + HUF_DECODE_SYMBOLX4_0(op4, &bitD4); + + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + } + + /* check corruption */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 supposed already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog); + + /* check */ + endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endSignal) return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; + } +} + + +static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_MAX_TABLELOG); + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX4 (DTable, cSrc, cSrcSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress4X4_usingDTable (dst, dstSize, ip, cSrcSize, DTable); +} + + +/**********************************/ +/* quad-symbol decoding */ +/**********************************/ +typedef struct { BYTE nbBits; BYTE nbBytes; } HUF_DDescX6; +typedef union { BYTE byte[4]; U32 sequence; } HUF_DSeqX6; + +/* recursive, up to level 3; may benefit from