mirror of
https://git.tukaani.org/xz.git
synced 2024-04-04 12:36:23 +02:00
1148 lines
42 KiB
Text
1148 lines
42 KiB
Text
|
|
The .lzma File Format
|
|
---------------------
|
|
|
|
0. Preface
|
|
0.1. Copyright Notices
|
|
0.2. Changes
|
|
1. Conventions
|
|
1.1. Byte and Its Representation
|
|
1.2. Multibyte Integers
|
|
2. Overall Structure of .lzma File
|
|
2.1. Stream
|
|
2.1.1. Stream Header
|
|
2.1.1.1. Header Magic Bytes
|
|
2.1.1.2. Stream Flags
|
|
2.1.1.3. CRC32
|
|
2.1.2. Stream Footer
|
|
2.1.2.1. CRC32
|
|
2.1.2.2. Backward Size
|
|
2.1.2.3. Stream Flags
|
|
2.1.2.4. Footer Magic Bytes
|
|
2.2. Stream Padding
|
|
3. Block
|
|
3.1. Block Header
|
|
3.1.1. Block Header Size
|
|
3.1.2. Block Flags
|
|
3.1.3. Compressed Size
|
|
3.1.4. Uncompressed Size
|
|
3.1.5. List of Filter Flags
|
|
3.1.6. Header Padding
|
|
3.1.7. CRC32
|
|
3.2. Compressed Data
|
|
3.3. Check
|
|
4. Index
|
|
4.1. Index Indicator
|
|
4.2. Number of Records
|
|
4.3. List of Records
|
|
4.3.1. Total Size
|
|
4.3.2. Uncompressed Size
|
|
4.4. Index Padding
|
|
4.5. CRC32
|
|
5. Filter Chains
|
|
5.1. Alignment
|
|
5.2. Security
|
|
5.3. Filters
|
|
5.3.1. LZMA
|
|
5.3.2. LZMA2
|
|
5.3.3. Branch/Call/Jump Filters for Executables
|
|
5.3.4. Delta
|
|
5.3.4.1. Format of the Encoded Output
|
|
5.4. Custom Filter IDs
|
|
5.4.1. Reserved Custom Filter ID Ranges
|
|
6. Cyclic Redundancy Checks
|
|
7. References
|
|
|
|
|
|
0. Preface
|
|
|
|
This document describes the .lzma file format (filename suffix
|
|
`.lzma', MIME type `application/x-lzma'). It is intended that
|
|
this format replace the format used by the LZMA_Alone tool
|
|
included in LZMA SDK up to and including version 4.57.
|
|
|
|
IMPORTANT: The version described in this document is a
|
|
draft, NOT a final, official version. Changes
|
|
are possible.
|
|
|
|
|
|
0.1. Copyright Notices
|
|
|
|
Copyright (C) 2006-2008 Lasse Collin <lasse.collin@tukaani.org>
|
|
Copyright (C) 2006 Ville Koskinen <w-ber@iki.fi>
|
|
|
|
Copying and distribution of this file, with or without
|
|
modification, are permitted in any medium without royalty
|
|
provided the copyright notice and this notice are preserved.
|
|
Modified versions must be marked as such.
|
|
|
|
All source code examples given in this document are put into
|
|
the public domain by the authors of this document.
|
|
|
|
Special thanks for helping with this document goes to
|
|
Igor Pavlov. Thanks for helping with this document goes to
|
|
Mark Adler, H. Peter Anvin, and Mikko Pouru.
|
|
|
|
|
|
0.2. Changes
|
|
|
|
Last modified: 2008-09-03 14:10+0300
|
|
|
|
(A changelog will be kept once the first official version
|
|
is made.)
|
|
|
|
|
|
1. Conventions
|
|
|
|
The keywords `must', `must not', `required', `should',
|
|
`should not', `recommended', `may', and `optional' in this
|
|
document are to be interpreted as described in [RFC-2119].
|
|
These words are not capitalized in this document.
|
|
|
|
Indicating a warning means displaying a message, returning
|
|
appropriate exit status, or something else to let the user
|
|
know that something worth warning occurred. The operation
|
|
should still finish if a warning is indicated.
|
|
|
|
Indicating an error means displaying a message, returning
|
|
appropriate exit status, or something else to let the user
|
|
know that something prevented successfully finishing the
|
|
operation. The operation must be aborted once an error has
|
|
been indicated.
|
|
|
|
|
|
1.1. Byte and Its Representation
|
|
|
|
In this document, byte is always 8 bits.
|
|
|
|
A `nul byte' has all bits unset. That is, the value of a nul
|
|
byte is 0x00.
|
|
|
|
To represent byte blocks, this document uses notation that
|
|
is similar to the notation used in [RFC-1952]:
|
|
|
|
+-------+
|
|
| Foo | One byte.
|
|
+-------+
|
|
|
|
+---+---+
|
|
| Foo | Two bytes; that is, some of the vertical bars
|
|
+---+---+ can be missing.
|
|
|
|
+=======+
|
|
| Foo | Zero or more bytes.
|
|
+=======+
|
|
|
|
In this document, a boxed byte or a byte sequence declared
|
|
using this notation is called `a field'. The example field
|
|
above would be called `the Foo field' or plain `Foo'.
|
|
|
|
|
|
1.2. Multibyte Integers
|
|
|
|
Multibyte integers of static length, such as CRC values,
|
|
are stored in little endian byte order (least significant
|
|
byte first).
|
|
|
|
When smaller values are more likely than bigger values (for
|
|
example file sizes), multibyte integers are encoded in a
|
|
variable-length representation:
|
|
- Numbers in the range [0, 127] are copied as is, and take
|
|
one byte of space.
|
|
- Bigger numbers will occupy two or more bytes. All but the
|
|
last byte of the multibyte representation have the highest
|
|
(eighth) bit set.
|
|
|
|
For now, the value of the variable-length integers is limited
|
|
to 63 bits, which limits the encoded size of the integer to
|
|
nine bytes. These limits may be increased in future if needed.
|
|
|
|
The following C code illustrates encoding and decoding of
|
|
variable-length integers. The functions return the number of
|
|
bytes occupied by the integer (1-9), or zero on error.
|
|
|
|
#include <sys/types.h>
|
|
#include <inttypes.h>
|
|
|
|
size_t
|
|
encode(uint8_t buf[static 9], uint64_t num)
|
|
{
|
|
if (num >= UINT64_MAX / 2)
|
|
return 0;
|
|
|
|
size_t i = 0;
|
|
|
|
while (num >= 0x80) {
|
|
buf[i++] = (uint8_t)(num) | 0x80;
|
|
num >>= 7;
|
|
}
|
|
|
|
buf[i++] = (uint8_t)(num);
|
|
|
|
return i;
|
|
}
|
|
|
|
size_t
|
|
decode(const uint8_t buf[], size_t size_max, uint64_t *num)
|
|
{
|
|
if (size_max == 0)
|
|
return 0;
|
|
|
|
if (size_max > 9)
|
|
size_max = 9;
|
|
|
|
*num = buf[0] & 0x7F;
|
|
size_t i = 0;
|
|
|
|
while (buf[i++] & 0x80) {
|
|
if (i > size_max || buf[i] == 0x00)
|
|
return 0;
|
|
|
|
*num |= (uint64_t)(buf[i] & 0x7F) << (i * 7);
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
2. Overall Structure of .lzma File
|
|
|
|
+========+================+========+================+
|
|
| Stream | Stream Padding | Stream | Stream Padding | ...
|
|
+========+================+========+================+
|
|
|
|
A file contains usually only one Stream. However, it is
|
|
possible to concatenate multiple Streams together with no
|
|
additional processing. It is up to the implementation to
|
|
decide if the decoder will continue decoding from the next
|
|
Stream once the end of the first Stream has been reached.
|
|
|
|
|
|
2.1. Stream
|
|
|
|
+-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
|
|
| Stream Header | Block | Block | ... | Block |
|
|
+-+-+-+-+-+-+-+-+-+-+-+-+=======+=======+ +=======+
|
|
|
|
+=======+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
---> | Index | Stream Footer |
|
|
+=======+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
All the above fields have a size that is a multiple of four. If
|
|
Stream is used as an internal part of another file format, it
|
|
is recommended to make the Stream start at an offset that is
|
|
a multiple of four bytes.
|
|
|
|
Stream Header, Index, and Stream Footer are always present in
|
|
a Stream. The maximum size of the Index field is 16 GiB (2^34).
|
|
|
|
There are zero or more Blocks. The maximum number of Blocks is
|
|
limited only by the maximum size of the Index field.
|
|
|
|
Total size of a Stream must be less than 8 EiB (2^63 bytes).
|
|
The same limit applies to the total amount of uncompressed
|
|
data stored in a Stream.
|
|
|
|
If an implementation supports handling .lzma files with
|
|
multiple concatenated Streams, it may apply the above limits
|
|
to the file as a whole instead of limiting per Stream basis.
|
|
|
|
|
|
2.1.1. Stream Header
|
|
|
|
+---+---+---+---+---+---+-------+------+--+--+--+--+
|
|
| Header Magic Bytes | Stream Flags | CRC32 |
|
|
+---+---+---+---+---+---+-------+------+--+--+--+--+
|
|
|
|
|
|
2.1.1.1. Header Magic Bytes
|
|
|
|
The first six (6) bytes of the Stream are so called Header
|
|
Magic Bytes. They can be used to identify the file type.
|
|
|
|
Using a C array and ASCII:
|
|
const uint8_t HEADER_MAGIC[6]
|
|
= { 0xFF, 'L', 'Z', 'M', 'A', 0x00 };
|
|
|
|
In plain hexadecimal:
|
|
FF 4C 5A 4D 41 00
|
|
|
|
Notes:
|
|
- The first byte (0xFF) was chosen so that the files cannot
|
|
be erroneously detected as being in LZMA_Alone format, in
|
|
which the first byte is in the range [0x00, 0xE0].
|
|
- The sixth byte (0x00) was chosen to prevent applications
|
|
from misdetecting the file as a text file.
|
|
|
|
If the Header Magic Bytes don't match, the decoder must
|
|
indicate an error.
|
|
|
|
|
|
2.1.1.2. Stream Flags
|
|
|
|
The first byte of Stream Flags is always a nul byte. In future
|
|
this byte may be used to indicate new Stream version or other
|
|
Stream properties.
|
|
|
|
The second byte of Stream Flags is a bit field:
|
|
|
|
Bit(s) Mask Description
|
|
0-3 0x0F Type of Check (see Section 3.3):
|
|
ID Size Check name
|
|
0x00 0 bytes None
|
|
0x01 4 bytes CRC32
|
|
0x02 4 bytes (Reserved)
|
|
0x03 4 bytes (Reserved)
|
|
0x04 8 bytes CRC64
|
|
0x05 8 bytes (Reserved)
|
|
0x06 8 bytes (Reserved)
|
|
0x07 16 bytes (Reserved)
|
|
0x08 16 bytes (Reserved)
|
|
0x09 16 bytes (Reserved)
|
|
0x0A 32 bytes SHA-256
|
|
0x0B 32 bytes (Reserved)
|
|
0x0C 32 bytes (Reserved)
|
|
0x0D 64 bytes (Reserved)
|
|
0x0E 64 bytes (Reserved)
|
|
0x0F 64 bytes (Reserved)
|
|
4-7 0xF0 Reserved for future use; must be zero for now.
|
|
|
|
Implementations must support at least the Check IDs 0x00 (None)
|
|
and 0x01 (CRC32). Supporting other Check IDs is optional. If
|
|
an unsupported Check is used, the decoder should indicate a
|
|
warning or error.
|
|
|
|
If any reserved bit is set, the decoder must indicate an error.
|
|
It is possible that there is a new field present which the
|
|
decoder is not aware of, and can thus parse the Stream Header
|
|
incorrectly.
|
|
|
|
|
|
2.1.1.3. CRC32
|
|
|
|
The CRC32 is calculated from the Stream Flags field. It is
|
|
stored as an unsigned 32-bit little endian integer. If the
|
|
calculated value does not match the stored one, the decoder
|
|
must indicate an error.
|
|
|
|
The idea is that Stream Flags would always be two bytes, even
|
|
if new features are needed. This way old decoders will be able
|
|
to verify the CRC32 calculated from Stream Flags, and thus
|
|
distinguish between corrupt files (CRC32 doesn't match) and
|
|
files that the decoder doesn't support (CRC32 matches but
|
|
Stream Flags has reserved bits set).
|
|
|
|
|
|
2.1.2. Stream Footer
|
|
|
|
+-+-+-+-+---+---+---+---+-------+------+----------+---------+
|
|
| CRC32 | Backward Size | Stream Flags | Footer Magic Bytes |
|
|
+-+-+-+-+---+---+---+---+-------+------+----------+---------+
|
|
|
|
|
|
2.1.2.1. CRC32
|
|
|
|
The CRC32 is calculated from the Backward Size and Stream Flags
|
|
fields. It is stored as an unsigned 32-bit little endian
|
|
integer. If the calculated value does not match the stored one,
|
|
the decoder must indicate an error.
|
|
|
|
The reason to have the CRC32 field before the Backward Size and
|
|
Stream Flags fields is to keep the four-byte fields aligned to
|
|
a multiple of four bytes.
|
|
|
|
|
|
2.1.2.2. Backward Size
|
|
|
|
Backward Size is stored as a 32-bit little endian integer,
|
|
which indicates the size of the Index field as multiple of
|
|
four bytes, minimum value being four bytes:
|
|
|
|
real_backward_size = (stored_backward_size + 1) * 4;
|
|
|
|
Using a fixed-size integer to store this value makes it
|
|
slightly simpler to parse the Stream Footer when the
|
|
application needs to parse the Stream backwards.
|
|
|
|
|
|
2.1.2.3. Stream Flags
|
|
|
|
This is a copy of the Stream Flags field from the Stream
|
|
Header. The information stored to Stream Flags is needed
|
|
when parsing the Stream backwards. The decoder must compare
|
|
the Stream Flags fields in both Stream Header and Stream
|
|
Footer, and indicate an error if they are not identical.
|
|
|
|
|
|
2.1.2.4. Footer Magic Bytes
|
|
|
|
As the last step of the decoding process, the decoder must
|
|
verify the existence of Footer Magic Bytes. If they don't
|
|
match, an error must be indicated.
|
|
|
|
Using a C array and ASCII:
|
|
const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' };
|
|
|
|
In hexadecimal:
|
|
59 5A
|
|
|
|
The primary reason to have Footer Magic Bytes is to make
|
|
it easier to detect incomplete files quickly, without
|
|
uncompressing. If the file does not end with Footer Magic Bytes
|
|
(excluding Stream Padding described in Section 2.2), it cannot
|
|
be undamaged, unless someone has intentionally appended garbage
|
|
after the end of the Stream.
|
|
|
|
|
|
2.2. Stream Padding
|
|
|
|
Only the decoders that support decoding of concatenated Streams
|
|
must support Stream Padding.
|
|
|
|
Stream Padding must contain only nul bytes. Any non-nul byte
|
|
should be considered as the beginning of a new Stream. To
|
|
preserve the four-byte alignment of consecutive Streams, the
|
|
size of Stream Padding must be a multiple of four bytes. Empty
|
|
Stream Padding is allowed.
|
|
|
|
Note that non-empty Stream Padding is allowed at the end of the
|
|
file; there doesn't need to be a new Stream after non-empty
|
|
Stream Padding. This can be convenient in certain situations
|
|
[GNU-tar].
|
|
|
|
The possibility of Padding should be taken into account when
|
|
designing an application that parses the Stream backwards.
|
|
|
|
|
|
3. Block
|
|
|
|
+==============+=================+=======+
|
|
| Block Header | Compressed Data | Check |
|
|
+==============+=================+=======+
|
|
|
|
|
|
3.1. Block Header
|
|
|
|
+-------------------+-------------+=================+
|
|
| Block Header Size | Block Flags | Compressed Size |
|
|
+-------------------+-------------+=================+
|
|
|
|
+===================+======================+
|
|
---> | Uncompressed Size | List of Filter Flags |
|
|
+===================+======================+
|
|
|
|
+================+--+--+--+--+
|
|
---> | Header Padding | CRC32 |
|
|
+================+--+--+--+--+
|
|
|
|
|
|
3.1.1. Block Header Size
|
|
|
|
This field overlaps with the Index Indicator field (see
|
|
Section 4.1).
|
|
|
|
This field contains the size of the Block Header field,
|
|
including the Block Header Size field itself. Valid values are
|
|
in the range [0x01, 0xFF], which indicate the size of the Block
|
|
Header as multiples of four bytes, minimum size being eight
|
|
bytes:
|
|
|
|
real_header_size = (encoded_header_size + 1) * 4;
|
|
|
|
If bigger Block Header is needed in future, a new field can be
|
|
added between the current Block Header and Compressed Data
|
|
fields. The presence of this new field would be indicated in
|
|
the Block Header.
|
|
|
|
|
|
3.1.2. Block Flags
|
|
|
|
The first byte of the Block Flags field is a bit field:
|
|
|
|
Bit(s) Mask Description
|
|
0-1 0x03 Number of filters (1-4)
|
|
2-5 0x3C Reserved for future use; must be zero for now.
|
|
6 0x40 The Compressed Size field is present.
|
|
7 0x80 The Uncompressed Size field is present.
|
|
|
|
If any reserved bit is set, the decoder must indicate an error.
|
|
It is possible that there is a new field present which the
|
|
decoder is not aware of, and can thus parse the Block Header
|
|
incorrectly.
|
|
|
|
|
|
3.1.3. Compressed Size
|
|
|
|
This field is present only if the appropriate bit is set in
|
|
the Block Flags field (see Section 3.1.2).
|
|
|
|
This field contains the size of the Compressed Data field as
|
|
multiple of four bytes, minimum value being four bytes:
|
|
|
|
real_compressed_size = (stored_compressed_size + 1) * 4;
|
|
|
|
The size is stored using the encoding described in Section 1.2.
|
|
If the Compressed Size does not match the real size of the
|
|
Compressed Data field, the decoder must indicate an error.
|
|
|
|
|
|
3.1.4. Uncompressed Size
|
|
|
|
This field is present only if the appropriate bit is set in
|
|
the Block Flags field (see Section 3.1.2).
|
|
|
|
The Uncompressed Size field contains the size of the Block
|
|
after uncompressing. Uncompressed Size is stored using the
|
|
encoding described in Section 1.2. If the Uncompressed Size
|
|
does not match the real uncompressed size, the decoder must
|
|
indicate an error.
|
|
|
|
Storing the Compressed Size and Uncompressed Size fields serves
|
|
several purposes:
|
|
- The decoder knows how much memory it needs to allocate
|
|
for a temporary buffer in multithreaded mode.
|
|
- Simple error detection: wrong size indicates a broken file.
|
|
- Seeking forwards to a specific location in streamed mode.
|
|
|
|
It should be noted that the only reliable way to determine
|
|
the real uncompressed size is to uncompress the Block,
|
|
because the Block Header and Index fields may contain
|
|
(intentionally or unintentionally) invalid information.
|
|
|
|
|
|
3.1.5. List of Filter Flags
|
|
|
|
+================+================+ +================+
|
|
| Filter 0 Flags | Filter 1 Flags | ... | Filter n Flags |
|
|
+================+================+ +================+
|
|
|
|
The number of Filter Flags fields is stored in the Block Flags
|
|
field (see Section 3.1.2).
|
|
|
|
The format of each Filter Flags field is as follows:
|
|
|
|
+===========+====================+===================+
|
|
| Filter ID | Size of Properties | Filter Properties |
|
|
+===========+====================+===================+
|
|
|
|
Both Filter ID and Size of Properties are stored using the
|
|
encoding described in Section 1.2. Size of Properties indicates
|
|
the size of the Filter Properties field as bytes. The list of
|
|
officially defined Filter IDs and the formats of their Filter
|
|
Properties are described in Section 5.3.
|
|
|
|
Filter IDs greater than or equal to 0x4000_0000_0000_0000
|
|
(2^62) are reserved for implementation-specific internal use.
|
|
These Filter IDs must never be used in List of Filter Flags.
|
|
|
|
|
|
3.1.6. Header Padding
|
|
|
|
This field contains as many nul byte as it is needed to make
|
|
the Block Header have the size specified in Block Header Size.
|
|
If any of the bytes are not nul bytes, the decoder must
|
|
indicate an error. It is possible that there is a new field
|
|
present which the decoder is not aware of, and can thus parse
|
|
the Block Header incorrectly.
|
|
|
|
|
|
3.1.7. CRC32
|
|
|
|
The CRC32 is calculated over everything in the Block Header
|
|
field except the CRC32 field itself. It is stored as an
|
|
unsigned 32-bit little endian integer. If the calculated
|
|
value does not match the stored one, the decoder must indicate
|
|
an error.
|
|
|
|
By verifying the CRC32 of the Block Header before parsing the
|
|
actual contents allows the decoder to distinguish between
|
|
corrupt and unsupported files.
|
|
|
|
|
|
3.2. Compressed Data
|
|
|
|
The format of Compressed Data depends on Block Flags and List
|
|
of Filter Flags. Excluding the descriptions of the simplest
|
|
filters in Section 5.3, the format of the filter-specific
|
|
encoded data is out of scope of this document.
|
|
|
|
If the natural size of Compressed Data is not a multiple of
|
|
four bytes, it must be padded with 1-3 nul bytes to make it
|
|
a multiple of four bytes.
|
|
|
|
|
|
3.3. Check
|
|
|
|
The type and size of the Check field depends on which bits
|
|
are set in the Stream Flags field (see Section 2.1.1.2).
|
|
|
|
The Check, when used, is calculated from the original
|
|
uncompressed data. If the calculated Check does not match the
|
|
stored one, the decoder must indicate an error. If the selected
|
|
type of Check is not supported by the decoder, it must indicate
|
|
a warning or error.
|
|
|
|
|
|
4. Index
|
|
|
|
+-----------------+=========================+
|
|
| Index Indicator | Number of Index Records |
|
|
+-----------------+=========================+
|
|
|
|
+=================+=========+-+-+-+-+
|
|
---> | List of Records | Padding | CRC32 |
|
|
+=================+=========+-+-+-+-+
|
|
|
|
Index serves several purporses. Using it, one can
|
|
- verify that all Blocks in a Stream have been processed;
|
|
- find out the uncompressed size of a Stream; and
|
|
- quickly access the beginning of any Block (random access).
|
|
|
|
|
|
4.1. Index Indicator
|
|
|
|
This field overlaps with the Block Header Size field (see
|
|
Section 3.1.1). The value of Index Indicator is always 0x00.
|
|
|
|
|
|
4.2. Number of Records
|
|
|
|
This field indicates how many Records there are in the List
|
|
of Records field, and thus how many Blocks there are in the
|
|
Stream. The value is stored using the encoding described in
|
|
Section 1.2. If the decoder has decoded all the Blocks of the
|
|
Stream, and then notices that the Number of Records doesn't
|
|
match the real number of Blocks, the decoder must indicate an
|
|
error.
|
|
|
|
|
|
4.3. List of Records
|
|
|
|
List of Records consists of as many Records as indicated by the
|
|
Number of Records field:
|
|
|
|
+========+========+
|
|
| Record | Record | ...
|
|
+========+========+
|
|
|
|
Each Record contains two fields:
|
|
|
|
+============+===================+
|
|
| Total Size | Uncompressed Size |
|
|
+============+===================+
|
|
|
|
If the decoder has decoded all the Blocks of the Stream, it
|
|
must verify that the contents of the Records match the real
|
|
Total Size and Uncompressed Size of the respective Blocks.
|
|
|
|
Implementation hint: It is possible to verify the Index with
|
|
constant memory usage by calculating for example SHA256 of both
|
|
the real size values and the List of Records, then comparing
|
|
the check values. Implementing this using non-cryptographic
|
|
check like CRC32 should be avoided unless small code size is
|
|
important.
|
|
|
|
If the decoder supports random-access reading, it must verify
|
|
that Total Size and Uncompressed Size of every completely
|
|
decoded Block match the sizes stored in the Index. If only
|
|
partial Block is decoded, the decoder must verify that the
|
|
processed sizes don't exceed the sizes stored in the Index.
|
|
|
|
|
|
4.3.1. Total Size
|
|
|
|
This field indicates the encoded size of the respective Block
|
|
as multiples of four bytes, minimum value being four bytes:
|
|
|
|
real_total_size = (stored_total_size + 1) * 4;
|
|
|
|
The value is stored using the encoding described in Section
|
|
1.2.
|
|
|
|
|
|
4.3.2. Uncompressed Size
|
|
|
|
This field indicates the Uncompressed Size of the respective
|
|
Block as bytes. The value is stored using the encoding
|
|
described in Section 1.2.
|
|
|
|
|
|
4.4. Index Padding
|
|
|
|
This field must contain 0-3 nul bytes to pad the Index to
|
|
a multiple of four bytes.
|
|
|
|
|
|
4.5. CRC32
|
|
|
|
The CRC32 is calculated over everything in the Index field
|
|
except the CRC32 field itself. The CRC32 is stored as an
|
|
unsigned 32-bit little endian integer. If the calculated
|
|
value does not match the stored one, the decoder must indicate
|
|
an error.
|
|
|
|
|
|
5. Filter Chains
|
|
|
|
The Block Flags field defines how many filters are used. When
|
|
more than one filter is used, the filters are chained; that is,
|
|
the output of one filter is the input of another filter. The
|
|
following figure illustrates the direction of data flow.
|
|
|
|
v Uncompressed Data ^
|
|
| Filter 0 |
|
|
Encoder | Filter 1 | Decoder
|
|
| Filter n |
|
|
v Compressed Data ^
|
|
|
|
|
|
5.1. Alignment
|
|
|
|
Alignment of uncompressed input data is usually the job of
|
|
the application producing the data. For example, to get the
|
|
best results, an archiver tool should make sure that all
|
|
PowerPC executable files in the archive stream start at
|
|
offsets that are multiples of four bytes.
|
|
|
|
Some filters, for example LZMA, can be configured to take
|
|
advantage of specified alignment of input data. Note that
|
|
taking advantage of aligned input can be benefical also when
|
|
a filter is not the first filter in the chain. For example,
|
|
if you compress PowerPC executables, you may want to use the
|
|
PowerPC filter and chain that with the LZMA filter. Because not
|
|
only the input but also the output alignment of the PowerPC
|
|
filter is four bytes, it is now benefical to set LZMA settings
|
|
so that the LZMA encoder can take advantage of its
|
|
four-byte-aligned input data.
|
|
|
|
The output of the last filter in the chain is stored to the
|
|
Compressed Data field, which is is guaranteed to be aligned
|
|
to a multiple of four bytes relative to the beginning of the
|
|
Stream. This can increase
|
|
- speed, if the filtered data is handled multiple bytes at
|
|
a time by the filter-specific encoder and decoder,
|
|
because accessing aligned data in computer memory is
|
|
usually faster; and
|
|
- compression ratio, if the output data is later compressed
|
|
with an external compression tool.
|
|
|
|
|
|
5.2. Security
|
|
|
|
If filters would be allowed to be chained freely, it would be
|
|
possible to create malicious files, that would be very slow to
|
|
decode. Such files could be used to create denial of service
|
|
attacks.
|
|
|
|
Slow files could occur when multiple filters are chained:
|
|
|
|
v Compressed input data
|
|
| Filter 1 decoder (last filter)
|
|
| Filter 0 decoder (non-last filter)
|
|
v Uncompressed output data
|
|
|
|
The decoder of the last filter in the chain produces a lot of
|
|
output from little input. Another filter in the chain takes the
|
|
output of the last filter, and produces very little output
|
|
while consuming a lot of input. As a result, a lot of data is
|
|
moved inside the filter chain, but the filter chain as a whole
|
|
gets very little work done.
|
|
|
|
To prevent this kind of slow files, there are restrictions on
|
|
how the filters can be chained. These restrictions must be
|
|
taken into account when designing new filters.
|
|
|
|
The maximum number of filters in the chain has been limited to
|
|
four, thus there can be at maximum of three non-last filters.
|
|
Of these three non-last filters, only two are allowed to change
|
|
the size of the data.
|
|
|
|
The non-last filters, that change the size of the data, must
|
|
have a limit how much the decoder can compress the data: the
|
|
decoder should produce at least n bytes of output when the
|
|
filter is given 2n bytes of input. This limit is not
|
|
absolute, but significant deviations must be avoided.
|
|
|
|
The above limitations guarantee that if the last filter in the
|
|
chain produces 4n bytes of output, the chain as a whole will
|
|
produce at least n bytes of output.
|
|
|
|
|
|
5.3. Filters
|
|
|
|
5.3.1. LZMA
|
|
|
|
LZMA (Lempel-Ziv-Markov chain-Algorithm) is a general-purporse
|
|
compression algorithm with high compression ratio and fast
|
|
decompression. LZMA is based on LZ77 and range coding
|
|
algorithms.
|
|
|
|
Filter ID: 0x40
|
|
Size of Filter Properties: 5 bytes
|
|
Changes size of data: Yes
|
|
Allow as a non-last filter: No
|
|
Allow as the last filter: Yes
|
|
|
|
Preferred alignment:
|
|
Input data: Adjustable to 1/2/4/8/16 byte(s)
|
|
Output data: 1 byte
|
|
|
|
At the time of writing, there is no other documentation about
|
|
how LZMA works than the source code in LZMA SDK. Once such
|
|
documentation gets written, it will probably be published as
|
|
a separate document, because including the documentation here
|
|
would lengthen this document considerably.
|
|
|
|
The format of the Filter Properties field is as follows:
|
|
|
|
+-----------------+----+----+----+----+
|
|
| LZMA Properties | Dictionary Size |
|
|
+-----------------+----+----+----+----+
|
|
|
|
The LZMA Properties field contains three properties. An
|
|
abbreviation is given in parentheses, followed by the value
|
|
range of the property. The field consists of
|
|
|
|
1) the number of literal context bits (lc, [0, 4]);
|
|
2) the number of literal position bits (lp, [0, 4]); and
|
|
3) the number of position bits (pb, [0, 4]).
|
|
|
|
In addition to above ranges, the sum of lc and lp must not
|
|
exceed four. Note that this limit didn't exist in the old
|
|
LZMA_Alone format, which allowed lc to be in the range [0, 8].
|
|
|
|
The properties are encoded using the following formula:
|
|
|
|
LZMA Properties = (pb * 5 + lp) * 9 + lc
|
|
|
|
The following C code illustrates a straightforward way to
|
|
decode the properties:
|
|
|
|
uint8_t lc, lp, pb;
|
|
uint8_t prop = get_lzma_properties();
|
|
if (prop > (4 * 5 + 4) * 9 + 8)
|
|
return LZMA_PROPERTIES_ERROR;
|
|
|
|
pb = prop / (9 * 5);
|
|
prop -= pb * 9 * 5;
|
|
lp = prop / 9;
|
|
lc = prop - lp * 9;
|
|
|
|
if (lc + lp > 4)
|
|
return LZMA_PROPERTIES_ERROR;
|
|
|
|
Dictionary Size is encoded as unsigned 32-bit little endian
|
|
integer.
|
|
|
|
|
|
5.3.2. LZMA2
|
|
|
|
LZMA2 is an extensions on top of the original LZMA. LZMA2 uses
|
|
LZMA internally, but adds support for flushing the encoder,
|
|
uncompressed chunks, eases stateful decoder implementations,
|
|
and improves support for multithreading. For most uses, it is
|
|
recommended to use LZMA2 instead of LZMA.
|
|
|
|
Filter ID: 0x21
|
|
Size of Filter Properties: 1 byte
|
|
Changes size of data: Yes
|
|
Allow as a non-last filter: No
|
|
Allow as the last filter: Yes
|
|
|
|
Preferred alignment:
|
|
Input data: Adjustable to 1/2/4/8/16 byte(s)
|
|
Output data: 1 byte
|
|
|
|
The format of the one-byte Filter Properties field is as
|
|
follows:
|
|
|
|
Bits Mask Description
|
|
0-5 0x3F Dictionary Size
|
|
6-7 0xC0 Reserved for future use; must be zero for now.
|
|
|
|
Dictionary Size is encoded with one-bit mantissa and five-bit
|
|
exponent. The smallest dictionary size is 4 KiB and the biggest
|
|
is 4 GiB.
|
|
|
|
Raw value Mantissa Exponent Dictionary size
|
|
0 2 11 4 KiB
|
|
1 3 11 6 KiB
|
|
2 2 12 8 KiB
|
|
3 3 12 12 KiB
|
|
4 2 13 16 KiB
|
|
5 3 13 24 KiB
|
|
6 2 14 32 KiB
|
|
... ... ... ...
|
|
35 3 27 768 MiB
|
|
36 2 28 1024 MiB
|
|
37 3 29 1536 MiB
|
|
38 2 30 2048 MiB
|
|
39 3 30 3072 MiB
|
|
40 2 31 4096 MiB - 1 B
|
|
|
|
Instead of having a table in the decoder, the dictionary size
|
|
can be decoded using the following C code:
|
|
|
|
const uint8_t bits = get_dictionary_flags() & 0x3F;
|
|
if (bits > 40)
|
|
return DICTIONARY_TOO_BIG; // Bigger than 4 GiB
|
|
|
|
uint32_t dictionary_size;
|
|
if (bits == 40) {
|
|
dictionary_size = UINT32_MAX;
|
|
} else {
|
|
dictionary_size = 2 | (bits & 1);
|
|
dictionary_size <<= bits / 2 + 11;
|
|
}
|
|
|
|
|
|
5.3.3. Branch/Call/Jump Filters for Executables
|
|
|
|
These filters convert relative branch, call, and jump
|
|
instructions to their absolute counterparts in executable
|
|
files. This conversion increases redundancy and thus
|
|
compression ratio.
|
|
|
|
Size of Filter Properties: 0 or 4 bytes
|
|
Changes size of data: No
|
|
Allow as a non-last filter: Yes
|
|
Allow as the last filter: No
|
|
|
|
Detecting when all of the data has been decoded:
|
|
Uncompressed size: Yes
|
|
End of Payload Marker: No
|
|
End of Input: Yes
|
|
|
|
Below is the list of filters in this category. The alignment
|
|
is the same for both input and output data.
|
|
|
|
Filter ID Alignment Description
|
|
0x04 1 byte x86 filter (BCJ)
|
|
0x05 4 bytes PowerPC (big endian) filter
|
|
0x06 16 bytes IA64 filter
|
|
0x07 4 bytes ARM (little endian) filter
|
|
0x08 2 bytes ARM Thumb (little endian) filter
|
|
0x09 4 bytes SPARC filter
|
|
|
|
If the size of Filter Properties is four bytes, the Filter
|
|
Properties field contains the start offset used for address
|
|
conversions. It is stored as an unsigned 32-bit little endian
|
|
integer. If the size of Filter Properties is zero, the start
|
|
offset is zero.
|
|
|
|
Setting the start offset may be useful if an executable has
|
|
multiple sections, and there are many cross-section calls.
|
|
Taking advantage of this feature usually requires usage of
|
|
the Subblock filter.
|
|
|
|
|
|
5.3.4. Delta
|
|
|
|
The Delta filter may increase compression ratio when the value
|
|
of the next byte correlates with the value of an earlier byte
|
|
at specified distance.
|
|
|
|
Filter ID: 0x03
|
|
Size of Filter Properties: 1 byte
|
|
Changes size of data: No
|
|
Allow as a non-last filter: Yes
|
|
Allow as the last filter: No
|
|
|
|
Preferred alignment:
|
|
Input data: 1 byte
|
|
Output data: Same as the original input data
|
|
|
|
The Properties byte indicates the delta distance, which can be
|
|
1-256 bytes backwards from the current byte: 0x00 indicates
|
|
distance of 1 byte and 0xFF distance of 256 bytes.
|
|
|
|
|
|
5.3.4.1. Format of the Encoded Output
|
|
|
|
The code below illustrates both encoding and decoding with
|
|
the Delta filter.
|
|
|
|
// Distance is in the range [1, 256].
|
|
const unsigned int distance = get_properties_byte() + 1;
|
|
uint8_t pos = 0;
|
|
uint8_t delta[256];
|
|
|
|
memset(delta, 0, sizeof(delta));
|
|
|
|
while (1) {
|
|
const int byte = read_byte();
|
|
if (byte == EOF)
|
|
break;
|
|
|
|
uint8_t tmp = delta[(uint8_t)(distance + pos)];
|
|
if (is_encoder) {
|
|
tmp = (uint8_t)(byte) - tmp;
|
|
delta[pos] = (uint8_t)(byte);
|
|
} else {
|
|
tmp = (uint8_t)(byte) + tmp;
|
|
delta[pos] = tmp;
|
|
}
|
|
|
|
write_byte(tmp);
|
|
--pos;
|
|
}
|
|
|
|
|
|
5.4. Custom Filter IDs
|
|
|
|
If a developer wants to use custom Filter IDs, he has two
|
|
choices. The first choice is to contact Lasse Collin and ask
|
|
him to allocate a range of IDs for the developer.
|
|
|
|
The second choice is to generate a 40-bit random integer,
|
|
which the developer can use as his personal Developer ID.
|
|
To minimalize the risk of collisions, Developer ID has to be
|
|
a randomly generated integer, not manually selected "hex word".
|
|
The following command, which works on many free operating
|
|
systems, can be used to generate Developer ID:
|
|
|
|
dd if=/dev/urandom bs=5 count=1 | hexdump
|
|
|
|
The developer can then use his Developer ID to create unique
|
|
(well, hopefully unique) Filter IDs.
|
|
|
|
Bits Mask Description
|
|
0-15 0x0000_0000_0000_FFFF Filter ID
|
|
16-55 0x00FF_FFFF_FFFF_0000 Developer ID
|
|
56-62 0x3F00_0000_0000_0000 Static prefix: 0x3F
|
|
|
|
The resulting 63-bit integer will use 9 bytes of space when
|
|
stored using the encoding described in Section 1.2. To get
|
|
a shorter ID, see the beginning of this Section how to
|
|
request a custom ID range.
|
|
|
|
|
|
5.4.1. Reserved Custom Filter ID Ranges
|
|
|
|
Range Description
|
|
0x0002_0000 - 0x0007_FFFF Reserved to ease .7z compatibility
|
|
0x0200_0000 - 0x07FF_FFFF Reserved to ease .7z compatibility
|
|
|
|
|
|
6. Cyclic Redundancy Checks
|
|
|
|
There are several incompatible variations to calculate CRC32
|
|
and CRC64. For simplicity and clarity, complete examples are
|
|
provided to calculate the checks as they are used in this file
|
|
format. Implementations may use different code as long as it
|
|
gives identical results.
|
|
|
|
The program below reads data from standard input, calculates
|
|
the CRC32 and CRC64 values, and prints the calculated values
|
|
as big endian hexadecimal strings to standard output.
|
|
|
|
#include <sys/types.h>
|
|
#include <inttypes.h>
|
|
#include <stdio.h>
|
|
|
|
uint32_t crc32_table[256];
|
|
uint64_t crc64_table[256];
|
|
|
|
void
|
|
init(void)
|
|
{
|
|
static const uint32_t poly32 = UINT32_C(0xEDB88320);
|
|
static const uint64_t poly64
|
|
= UINT64_C(0xC96C5795D7870F42);
|
|
|
|
for (size_t i = 0; i < 256; ++i) {
|
|
uint32_t crc32 = i;
|
|
uint64_t crc64 = i;
|
|
|
|
for (size_t j = 0; j < 8; ++j) {
|
|
if (crc32 & 1)
|
|
crc32 = (crc32 >> 1) ^ poly32;
|
|
else
|
|
crc32 >>= 1;
|
|
|
|
if (crc64 & 1)
|
|
crc64 = (crc64 >> 1) ^ poly64;
|
|
else
|
|
crc64 >>= 1;
|
|
}
|
|
|
|
crc32_table[i] = crc32;
|
|
crc64_table[i] = crc64;
|
|
}
|
|
}
|
|
|
|
uint32_t
|
|
crc32(const uint8_t *buf, size_t size, uint32_t crc)
|
|
{
|
|
crc = ~crc;
|
|
for (size_t i = 0; i < size; ++i)
|
|
crc = crc32_table[buf[i] ^ (crc & 0xFF)]
|
|
^ (crc >> 8);
|
|
return ~crc;
|
|
}
|
|
|
|
uint64_t
|
|
crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
|
{
|
|
crc = ~crc;
|
|
for (size_t i = 0; i < size; ++i)
|
|
crc = crc64_table[buf[i] ^ (crc & 0xFF)]
|
|
^ (crc >> 8);
|
|
return ~crc;
|
|
}
|
|
|
|
int
|
|
main()
|
|
{
|
|
init();
|
|
|
|
uint32_t value32 = 0;
|
|
uint64_t value64 = 0;
|
|
uint64_t total_size = 0;
|
|
uint8_t buf[8192];
|
|
|
|
while (1) {
|
|
const size_t buf_size = fread(buf, 1, 8192, stdin);
|
|
if (buf_size == 0)
|
|
break;
|
|
|
|
total_size += buf_size;
|
|
value32 = crc32(buf, buf_size, value32);
|
|
value64 = crc64(buf, buf_size, value64);
|
|
}
|
|
|
|
printf("Bytes: %" PRIu64 "\n", total_size);
|
|
printf("CRC-32: 0x%08" PRIX32 "\n", value32);
|
|
printf("CRC-64: 0x%016" PRIX64 "\n", value64);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
7. References
|
|
|
|
LZMA SDK - The original LZMA implementation
|
|
http://7-zip.org/sdk.html
|
|
|
|
LZMA Utils - LZMA adapted to POSIX-like systems
|
|
http://tukaani.org/lzma/
|
|
|
|
[RFC-1952]
|
|
GZIP file format specification version 4.3
|
|
http://www.ietf.org/rfc/rfc1952.txt
|
|
- Notation of byte boxes in section `2.1. Overall conventions'
|
|
|
|
[RFC-2119]
|
|
Key words for use in RFCs to Indicate Requirement Levels
|
|
http://www.ietf.org/rfc/rfc2119.txt
|
|
|
|
[GNU-tar]
|
|
GNU tar 1.16.1 manual
|
|
http://www.gnu.org/software/tar/manual/html_node/Blocking-Factor.html
|
|
- Node 9.4.2 `Blocking Factor', paragraph that begins
|
|
`gzip will complain about trailing garbage'
|
|
- Note that this URL points to the latest version of the
|
|
manual, and may some day not contain the note which is in
|
|
1.16.1. For the exact version of the manual, download GNU
|
|
tar 1.16.1: ftp://ftp.gnu.org/pub/gnu/tar/tar-1.16.1.tar.gz
|
|
|