diff --git a/doc/lzma-file-format.txt b/doc/lzma-file-format.txt index 015b0fae..4865defd 100644 --- a/doc/lzma-file-format.txt +++ b/doc/lzma-file-format.txt @@ -40,7 +40,11 @@ The .lzma File Format 0.2. Changes - Last modified: 2011-04-12 11:55+0300 + Last modified: 2022-07-13 21:00+0300 + + Compared to the previous version (2011-04-12 11:55+0300) + the section 1.1.3 was modified to allow End of Payload Marker + with a known Uncompressed Size. 1. File Format @@ -129,7 +133,10 @@ The .lzma File Format Uncompressed Size is stored as unsigned 64-bit little endian integer. A special value of 0xFFFF_FFFF_FFFF_FFFF indicates that Uncompressed Size is unknown. End of Payload Marker (*) - is used if and only if Uncompressed Size is unknown. + is used if Uncompressed Size is unknown. End of Payload Marker + is allowed but rarely used if Uncompressed Size is known. + XZ Utils 5.2.5 and older don't support .lzma files that have + End of Payload Marker together with a known Uncompressed Size. XZ Utils rejects files whose Uncompressed Size field specifies a known size that is 256 GiB or more. This is to reject false diff --git a/src/liblzma/common/alone_decoder.c b/src/liblzma/common/alone_decoder.c index 239b230e..a3ea20a2 100644 --- a/src/liblzma/common/alone_decoder.c +++ b/src/liblzma/common/alone_decoder.c @@ -146,7 +146,7 @@ alone_decode(void *coder_ptr, const lzma_allocator *allocator, // Use a hack to set the uncompressed size. lzma_lz_decoder_uncompressed(coder->next.coder, - coder->uncompressed_size); + coder->uncompressed_size, true); coder->sequence = SEQ_CODE; break; diff --git a/src/liblzma/lz/lz_decoder.c b/src/liblzma/lz/lz_decoder.c index 09b57438..ab6af0dd 100644 --- a/src/liblzma/lz/lz_decoder.c +++ b/src/liblzma/lz/lz_decoder.c @@ -304,8 +304,14 @@ lzma_lz_decoder_memusage(size_t dictionary_size) extern void -lzma_lz_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size) +lzma_lz_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size, + bool allow_eopm) { lzma_coder *coder = coder_ptr; - coder->lz.set_uncompressed(coder->lz.coder, uncompressed_size); + + if (uncompressed_size == LZMA_VLI_UNKNOWN) + allow_eopm = true; + + coder->lz.set_uncompressed(coder->lz.coder, uncompressed_size, + allow_eopm); } diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h index 754ccf37..e6d7ab2a 100644 --- a/src/liblzma/lz/lz_decoder.h +++ b/src/liblzma/lz/lz_decoder.h @@ -62,8 +62,10 @@ typedef struct { void (*reset)(void *coder, const void *options); - /// Set the uncompressed size - void (*set_uncompressed)(void *coder, lzma_vli uncompressed_size); + /// Set the uncompressed size. If uncompressed_size == LZMA_VLI_UNKNOWN + /// then allow_eopm will always be true. + void (*set_uncompressed)(void *coder, lzma_vli uncompressed_size, + bool allow_eopm); /// Free allocated resources void (*end)(void *coder, const lzma_allocator *allocator); @@ -91,7 +93,7 @@ extern lzma_ret lzma_lz_decoder_init(lzma_next_coder *next, extern uint64_t lzma_lz_decoder_memusage(size_t dictionary_size); extern void lzma_lz_decoder_uncompressed( - void *coder, lzma_vli uncompressed_size); + void *coder, lzma_vli uncompressed_size, bool allow_eopm); ////////////////////// diff --git a/src/liblzma/lzma/lzma2_decoder.c b/src/liblzma/lzma/lzma2_decoder.c index cf1b5110..105a28dc 100644 --- a/src/liblzma/lzma/lzma2_decoder.c +++ b/src/liblzma/lzma/lzma2_decoder.c @@ -139,7 +139,7 @@ lzma2_decode(void *coder_ptr, lzma_dict *restrict dict, coder->uncompressed_size += in[(*in_pos)++] + 1U; coder->sequence = SEQ_COMPRESSED_0; coder->lzma.set_uncompressed(coder->lzma.coder, - coder->uncompressed_size); + coder->uncompressed_size, false); break; case SEQ_COMPRESSED_0: diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c index e605a0a9..140a44f1 100644 --- a/src/liblzma/lzma/lzma_decoder.c +++ b/src/liblzma/lzma/lzma_decoder.c @@ -238,6 +238,11 @@ typedef struct { /// payload marker is expected. lzma_vli uncompressed_size; + /// True if end of payload marker (EOPM) is allowed even when + /// uncompressed_size is known; false if EOPM must not be present. + /// This is ignored if uncompressed_size == LZMA_VLI_UNKNOWN. + bool allow_eopm; + //////////////////////////////// // State of incomplete symbol // //////////////////////////////// @@ -343,12 +348,19 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, lzma_ret ret = LZMA_OK; - // If uncompressed size is known, there must be no end of payload - // marker. - const bool no_eopm = coder->uncompressed_size - != LZMA_VLI_UNKNOWN; - if (no_eopm && coder->uncompressed_size < dict.limit - dict.pos) + // EOPM is always required (not just allowed) when + // the uncompressed size isn't known. + bool eopm_allowed = coder->uncompressed_size == LZMA_VLI_UNKNOWN; + + // If uncompressed size is known and there is enough output space + // to decode all the data, limit the available buffer space so that + // the main loop won't try to decode past the end of the stream. + bool might_finish_without_eopm = false; + if (coder->uncompressed_size != LZMA_VLI_UNKNOWN + && coder->uncompressed_size <= dict.limit - dict.pos) { dict.limit = dict.pos + (size_t)(coder->uncompressed_size); + might_finish_without_eopm = true; + } // The main decoder loop. The "switch" is used to restart the decoder at // correct location. Once restarted, the "switch" is no longer used. @@ -361,8 +373,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, case SEQ_NORMALIZE: case SEQ_IS_MATCH: - if (unlikely(no_eopm && dict.pos == dict.limit)) - break; + if (unlikely(might_finish_without_eopm + && dict.pos == dict.limit)) { + // In rare cases there is a useless byte that needs + // to be read anyway. + rc_normalize(SEQ_NORMALIZE); + + // If the range decoder state is such that we can + // be at the end of the LZMA stream, then the + // decoding is finished. + if (rc_is_finished(rc)) { + ret = LZMA_STREAM_END; + goto out; + } + + // If the caller hasn't allowed EOPM to be present + // together with known uncompressed size, then the + // LZMA stream is corrupt. + if (!coder->allow_eopm) { + ret = LZMA_DATA_ERROR; + goto out; + } + + // Otherwise continue decoding with the expectation + // that the next LZMA symbol is EOPM. + eopm_allowed = true; + } rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) { rc_update_0(coder->is_match[state][pos_state]); @@ -658,11 +694,18 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, if (rep0 == UINT32_MAX) { // End of payload marker was - // found. It must not be - // present if uncompressed - // size is known. - if (coder->uncompressed_size - != LZMA_VLI_UNKNOWN) { + // found. It may only be + // present if + // - uncompressed size is + // unknown or + // - after known uncompressed + // size amount of bytes has + // been decompressed and + // caller has indicated + // that EOPM might be used + // (it's not allowed in + // LZMA2). + if (!eopm_allowed) { ret = LZMA_DATA_ERROR; goto out; } @@ -671,7 +714,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, // LZMA1 stream with // end-of-payload marker. rc_normalize(SEQ_EOPM); - ret = LZMA_STREAM_END; + ret = rc_is_finished(rc) + ? LZMA_STREAM_END + : LZMA_DATA_ERROR; goto out; } } @@ -793,9 +838,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr, } } - rc_normalize(SEQ_NORMALIZE); - coder->sequence = SEQ_IS_MATCH; - out: // Save state @@ -822,24 +864,21 @@ out: if (coder->uncompressed_size != LZMA_VLI_UNKNOWN) { coder->uncompressed_size -= dict.pos - dict_start; - // Since there cannot be end of payload marker if the - // uncompressed size was known, we check here if we - // finished decoding. + // If we have gotten all the output but the decoder wants + // to write more output, the file is corrupt. There are + // three SEQ values where output is produced. if (coder->uncompressed_size == 0 && ret == LZMA_OK - && coder->sequence != SEQ_NORMALIZE) - ret = coder->sequence == SEQ_IS_MATCH - ? LZMA_STREAM_END : LZMA_DATA_ERROR; + && (coder->sequence == SEQ_LITERAL_WRITE + || coder->sequence == SEQ_SHORTREP + || coder->sequence == SEQ_COPY)) + ret = LZMA_DATA_ERROR; } - // We can do an additional check in the range decoder to catch some - // corrupted files. if (ret == LZMA_STREAM_END) { - if (!rc_is_finished(coder->rc)) - ret = LZMA_DATA_ERROR; - // Reset the range decoder so that it is ready to reinitialize // for a new LZMA2 chunk. rc_reset(coder->rc); + coder->sequence = SEQ_IS_MATCH; } return ret; @@ -848,10 +887,12 @@ out: static void -lzma_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size) +lzma_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size, + bool allow_eopm) { lzma_lzma1_decoder *coder = coder_ptr; coder->uncompressed_size = uncompressed_size; + coder->allow_eopm = allow_eopm; } @@ -977,7 +1018,7 @@ lzma_decoder_init(lzma_lz_decoder *lz, const lzma_allocator *allocator, lz, allocator, options, lz_options)); lzma_decoder_reset(lz->coder, options); - lzma_decoder_uncompressed(lz->coder, LZMA_VLI_UNKNOWN); + lzma_decoder_uncompressed(lz->coder, LZMA_VLI_UNKNOWN, true); return LZMA_OK; }