diff --git a/lib/dictBuilder/fastcover.c b/lib/dictBuilder/fastcover.c
index 56a08138520..09522f6e948 100644
--- a/lib/dictBuilder/fastcover.c
+++ b/lib/dictBuilder/fastcover.c
@@ -155,6 +155,8 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     const U32 d = parameters.d;
     const U32 f = ctx->f;
     const U32 dmersInK = k - d + 1;
+    /* Last position where a full d-byte dmer can be read without OOB. See #4499. */
+    const U32 hashEnd = (end >= d) ? end - d + 1 : begin;
 
     /* Try each segment (activeSegment) and save the best (bestSegment) */
     COVER_segment_t bestSegment = {0, 0, 0};
@@ -168,8 +170,12 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
 
     /* Slide the activeSegment through the whole epoch.
      * Save the best segment in bestSegment.
+     *
+     * We need at least d bytes starting from activeSegment.end to compute
+     * a dmer hash, so stop d-1 bytes before the epoch end to avoid an
+     * out-of-bounds read. See #4499.
      */
-    while (activeSegment.end < end) {
+    while (activeSegment.end < hashEnd) {
       /* Get hash value of current dmer */
       const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
 
@@ -200,7 +206,7 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     }
 
     /* Zero out rest of segmentFreqs array */
-    while (activeSegment.begin < end) {
+    while (activeSegment.begin < hashEnd) {
       const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
       segmentFreqs[delIndex] -= 1;
       activeSegment.begin += 1;
@@ -209,7 +215,8 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     {
       /* Zero the frequency of hash value of each dmer covered by the chosen segment. */
       U32 pos;
-      for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const U32 bestEnd = (bestSegment.end <= hashEnd) ? bestSegment.end : hashEnd;
+      for (pos = bestSegment.begin; pos != bestEnd; ++pos) {
         const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
         freqs[i] = 0;
       }
@@ -415,7 +422,12 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
      */
     for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
       const U32 epochBegin = (U32)(epoch * epochs.size);
-      const U32 epochEnd = epochBegin + epochs.size;
+      /* Clamp epochEnd to the sample buffer size to avoid passing an
+       * out-of-bounds end position to FASTCOVER_selectSegment. See #4499. */
+      const U32 epochEnd = MIN(epochBegin + epochs.size,
+                               (U32)ctx->nbTrainSamples > 0
+                                   ? (U32)ctx->offsets[ctx->nbTrainSamples]
+                                   : epochBegin);
       size_t segmentSize;
       /* Select a segment */
       COVER_segment_t segment = FASTCOVER_selectSegment(
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index 472f8c8b308..ee16fed9ff3 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -3781,6 +3781,29 @@ static int basicUnitTests(U32 const seed, double compressibility)
     if (dictID==0) goto _output_error;
     DISPLAYLEVEL(3, "OK : %u \n", (unsigned)dictID);
 
+    DISPLAYLEVEL(3, "test%3i : FASTCOVER with many small samples (issue #4499) : ", testNb++);
+    /* FASTCOVER_hashPtrToIndex reads d bytes from the sample buffer.
+     * When all samples are shorter than d, the sliding window must not
+     * read past the end of the buffer. Triggered a stack-buffer-overflow
+     * with ASAN before the fix. */
+    {   const char smallData[8] = {'A','B','C','D','E','F','G','H'};
+        const size_t smallSizes[8] = {1,1,1,1,1,1,1,1};
+        ZDICT_fastCover_params_t fcParams;
+        char *smallDict = (char *)malloc(1024);
+        size_t smallDictSize;
+        if (!smallDict) goto _output_error;
+        memset(&fcParams, 0, sizeof(fcParams));
+        /* d=8 forces hash8Ptr path; samples are only 1 byte each */
+        fcParams.d = 8;
+        fcParams.k = 16;
+        smallDictSize = ZDICT_trainFromBuffer_fastCover(
+            smallDict, 1024, smallData, smallSizes, 8, fcParams);
+        /* May return an error (too-small corpus) but must NOT crash */
+        (void)smallDictSize;
+        free(smallDict);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
     ZSTD_freeCCtx(cctx);
     free(dictBuffer);
     free(samplesSizes);