From 3176f992c55b8d788c4633809aaf9447376a5a12 Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Sat, 8 Oct 2022 21:28:15 +0300
Subject: [PATCH] xz: Add .lz (lzip) decompression support.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If configured with --disable-lzip-decoder then --long-help will
still list `lzip' in --format but I left it like that since
due to translations it would be messy to have two help strings.
Features are disabled only in special situations so wrong help
in such a situation shouldn't matter much.

Thanks to Michał Górny for the original patch.
---
 src/xz/args.c    |  9 +++++++
 src/xz/coder.c   | 68 +++++++++++++++++++++++++++++++++++++++++++++---
 src/xz/coder.h   |  3 +++
 src/xz/message.c |  2 +-
 src/xz/suffix.c  | 26 +++++++++++++++---
 src/xz/xz.1      | 46 ++++++++++++++++++++++++++++----
 6 files changed, 141 insertions(+), 13 deletions(-)

diff --git a/src/xz/args.c b/src/xz/args.c
index 941214b5..2af39098 100644
--- a/src/xz/args.c
+++ b/src/xz/args.c
@@ -412,6 +412,9 @@ parse_real(args_info *args, int argc, char **argv)
 				{ "xz",     FORMAT_XZ },
 				{ "lzma",   FORMAT_LZMA },
 				{ "alone",  FORMAT_LZMA },
+#ifdef HAVE_LZIP_DECODER
+				{ "lzip",   FORMAT_LZIP },
+#endif
 				// { "gzip",   FORMAT_GZIP },
 				// { "gz",     FORMAT_GZIP },
 				{ "raw",    FORMAT_RAW },
@@ -668,6 +671,12 @@ args_parse(args_info *args, int argc, char **argv)
 				"at build time"));
 #endif
 
+#ifdef HAVE_LZIP_DECODER
+	if (opt_mode == MODE_COMPRESS && opt_format == FORMAT_LZIP)
+		message_fatal(_("Compression of lzip files (.lz) "
+				"is not supported"));
+#endif
+
 	// Never remove the source file when the destination is not on disk.
 	// In test mode the data is written nowhere, but setting opt_stdout
 	// will make the rest of the code behave well.
diff --git a/src/xz/coder.c b/src/xz/coder.c
index 5bca958f..05f22888 100644
--- a/src/xz/coder.c
+++ b/src/xz/coder.c
@@ -51,6 +51,11 @@ static lzma_check check;
 /// This becomes false if the --check=CHECK option is used.
 static bool check_default = true;
 
+/// Indicates if unconsumed input is allowed to remain after
+/// decoding has successfully finished. This is set for each file
+/// in coder_init().
+static bool allow_trailing_input;
+
 #ifdef MYTHREAD_ENABLED
 static lzma_mt mt_options = {
 	.flags = 0,
@@ -136,6 +141,11 @@ memlimit_too_small(uint64_t memory_usage)
 extern void
 coder_set_compression_settings(void)
 {
+#ifdef HAVE_LZIP_DECODER
+	// .lz compression isn't supported.
+	assert(opt_format != FORMAT_LZIP);
+#endif
+
 	// The default check type is CRC64, but fallback to CRC32
 	// if CRC64 isn't supported by the copy of liblzma we are
 	// using. CRC32 is always supported.
@@ -470,6 +480,18 @@ is_format_lzma(void)
 
 	return true;
 }
+
+
+#ifdef HAVE_LZIP_DECODER
+/// Return true if the data in in_buf seems to be in the .lz format.
+static bool
+is_format_lzip(void)
+{
+	static const uint8_t magic[4] = { 0x4C, 0x5A, 0x49, 0x50 };
+	return strm.avail_in >= sizeof(magic)
+			&& memcmp(in_buf.u8, magic, sizeof(magic)) == 0;
+}
+#endif
 #endif
 
 
@@ -483,6 +505,12 @@ coder_init(file_pair *pair)
 {
 	lzma_ret ret = LZMA_PROG_ERROR;
 
+	// In most cases if there is input left when coding finishes,
+	// something has gone wrong. Exceptions are --single-stream
+	// and decoding .lz files which can contain trailing non-.lz data.
+	// These will be handled later in this function.
+	allow_trailing_input = false;
+
 	if (opt_mode == MODE_COMPRESS) {
 #ifdef HAVE_ENCODERS
 		switch (opt_format) {
@@ -506,6 +534,14 @@ coder_init(file_pair *pair)
 			ret = lzma_alone_encoder(&strm, filters[0].options);
 			break;
 
+#	ifdef HAVE_LZIP_DECODER
+		case FORMAT_LZIP:
+			// args.c should disallow this.
+			assert(0);
+			ret = LZMA_PROG_ERROR;
+			break;
+#	endif
+
 		case FORMAT_RAW:
 			ret = lzma_raw_encoder(&strm, filters);
 			break;
@@ -522,7 +558,9 @@ coder_init(file_pair *pair)
 		else
 			flags |= LZMA_TELL_UNSUPPORTED_CHECK;
 
-		if (!opt_single_stream)
+		if (opt_single_stream)
+			allow_trailing_input = true;
+		else
 			flags |= LZMA_CONCATENATED;
 
 		// We abuse FORMAT_AUTO to indicate unknown file format,
@@ -531,8 +569,14 @@ coder_init(file_pair *pair)
 
 		switch (opt_format) {
 		case FORMAT_AUTO:
+			// .lz is checked before .lzma since .lzma detection
+			// is more complicated (no magic bytes).
 			if (is_format_xz())
 				init_format = FORMAT_XZ;
+#	ifdef HAVE_LZIP_DECODER
+			else if (is_format_lzip())
+				init_format = FORMAT_LZIP;
+#	endif
 			else if (is_format_lzma())
 				init_format = FORMAT_LZMA;
 			break;
@@ -547,6 +591,13 @@ coder_init(file_pair *pair)
 				init_format = FORMAT_LZMA;
 			break;
 
+#	ifdef HAVE_LZIP_DECODER
+		case FORMAT_LZIP:
+			if (is_format_lzip())
+				init_format = FORMAT_LZIP;
+			break;
+#	endif
+
 		case FORMAT_RAW:
 			init_format = FORMAT_RAW;
 			break;
@@ -604,6 +655,15 @@ coder_init(file_pair *pair)
 						MODE_DECOMPRESS));
 			break;
 
+#	ifdef HAVE_LZIP_DECODER
+		case FORMAT_LZIP:
+			allow_trailing_input = true;
+			ret = lzma_lzip_decoder(&strm,
+					hardware_memlimit_get(
+						MODE_DECOMPRESS), flags);
+			break;
+#	endif
+
 		case FORMAT_RAW:
 			// Memory usage has already been checked in
 			// coder_set_compression_settings().
@@ -864,7 +924,7 @@ coder_normal(file_pair *pair)
 			}
 
 			if (ret == LZMA_STREAM_END) {
-				if (opt_single_stream) {
+				if (allow_trailing_input) {
 					io_fix_src_pos(pair, strm.avail_in);
 					success = true;
 					break;
@@ -872,7 +932,9 @@ coder_normal(file_pair *pair)
 
 				// Check that there is no trailing garbage.
 				// This is needed for LZMA_Alone and raw
-				// streams.
+				// streams. This is *not* done with .lz files
+				// as that format specifically requires
+				// allowing trailing garbage.
 				if (strm.avail_in == 0 && !pair->src_eof) {
 					// Try reading one more byte.
 					// Hopefully we don't get any more
diff --git a/src/xz/coder.h b/src/xz/coder.h
index 583da8f6..2930df9a 100644
--- a/src/xz/coder.h
+++ b/src/xz/coder.h
@@ -23,6 +23,9 @@ enum format_type {
 	FORMAT_AUTO,
 	FORMAT_XZ,
 	FORMAT_LZMA,
+#ifdef HAVE_LZIP_DECODER
+	FORMAT_LZIP,
+#endif
 	// HEADER_GZIP,
 	FORMAT_RAW,
 };
diff --git a/src/xz/message.c b/src/xz/message.c
index 651a890f..831b4f9d 100644
--- a/src/xz/message.c
+++ b/src/xz/message.c
@@ -1150,7 +1150,7 @@ message_help(bool long_help)
 		puts(_("\n Basic file format and compression options:\n"));
 		puts(_(
 "  -F, --format=FMT    file format to encode or decode; possible values are\n"
-"                      `auto' (default), `xz', `lzma', and `raw'\n"
+"                      `auto' (default), `xz', `lzma', `lzip', and `raw'\n"
 "  -C, --check=CHECK   integrity check type: `none' (use with caution),\n"
 "                      `crc32', `crc64' (default), or `sha256'"));
 		puts(_(
diff --git a/src/xz/suffix.c b/src/xz/suffix.c
index 9d4fcd13..55e4ee2f 100644
--- a/src/xz/suffix.c
+++ b/src/xz/suffix.c
@@ -119,7 +119,10 @@ uncompressed_name(const char *src_name, const size_t src_len)
 #ifdef __DJGPP__
 		{ ".lzm",   "" },
 #endif
-		{ ".tlz",   ".tar" },
+		{ ".tlz",   ".tar" }, // Both .tar.lzma and .tar.lz
+#ifdef HAVE_LZIP_DECODER
+		{ ".lz",    "" },
+#endif
 		// { ".gz",    "" },
 		// { ".tgz",   ".tar" },
 	};
@@ -208,6 +211,15 @@ compressed_name(const char *src_name, size_t src_len)
 #endif
 			".tlz",
 			NULL
+#ifdef HAVE_LZIP_DECODER
+		// This is needed to keep the table indexing in sync with
+		// enum format_type from coder.h.
+		}, {
+/*
+			".lz",
+*/
+			NULL
+#endif
 /*
 		}, {
 			".gz",
@@ -221,8 +233,11 @@ compressed_name(const char *src_name, size_t src_len)
 		}
 	};
 
-	// args.c ensures this.
+	// args.c ensures these.
 	assert(opt_format != FORMAT_AUTO);
+#ifdef HAVE_LZIP_DECODER
+	assert(opt_format != FORMAT_LZIP);
+#endif
 
 	const size_t format = opt_format - 1;
 	const char *const *suffixes = all_suffixes[format];
@@ -299,8 +314,11 @@ compressed_name(const char *src_name, size_t src_len)
 			// xz foo.tar          -> foo.txz
 			// xz -F lzma foo.tar  -> foo.tlz
 			static const char *const tar_suffixes[] = {
-				".txz",
-				".tlz",
+				".txz", // .tar.xz
+				".tlz", // .tar.lzma
+/*
+				".tlz", // .tar.lz
+*/
 				// ".tgz",
 			};
 			suffix = tar_suffixes[format];
diff --git a/src/xz/xz.1 b/src/xz/xz.1
index e11f4ac2..5e11a332 100644
--- a/src/xz/xz.1
+++ b/src/xz/xz.1
@@ -5,7 +5,7 @@
 .\" This file has been put into the public domain.
 .\" You can do whatever you want with this file.
 .\"
-.TH XZ 1 "2022-11-07" "Tukaani" "XZ Utils"
+.TH XZ 1 "2022-11-09" "Tukaani" "XZ Utils"
 .
 .SH NAME
 xz, unxz, xzcat, lzma, unlzma, lzcat \- Compress or decompress .xz and .lzma files
@@ -62,6 +62,11 @@ format, but the legacy
 format used by LZMA Utils and
 raw compressed streams with no container format headers
 are also supported.
+In addition, decompression of the
+.B .lz
+format used by
+.B lzip
+is supported.
 .PP
 .B xz
 compresses or decompresses each
@@ -102,9 +107,10 @@ or
 is appended to the source filename to get the target filename.
 .IP \(bu 3
 When decompressing, the
-.B .xz
+.BR .xz ,
+.BR .lzma ,
 or
-.B .lzma
+.B .lz
 suffix is removed from the filename to get the target filename.
 .B xz
 also recognizes the suffixes
@@ -158,8 +164,9 @@ doesn't have a suffix of any of the supported file formats
 .RB ( .xz ,
 .BR .txz ,
 .BR .lzma ,
+.BR .tlz ,
 or
-.BR .tlz ).
+.BR .lz ).
 .PP
 After successfully compressing or decompressing the
 .IR file ,
@@ -507,8 +514,9 @@ in addition to files with the
 .BR .xz ,
 .BR .txz ,
 .BR .lzma ,
+.BR .tlz ,
 or
-.B .tlz
+.B .lz
 suffix.
 If the source file has the suffix
 .IR .suf ,
@@ -575,6 +583,34 @@ The alternative name
 .B alone
 is provided for backwards compatibility with LZMA Utils.
 .TP
+.B lzip
+Accept only
+.B .lz
+files when decompressing.
+Compression is not supported.
+.IP ""
+The
+.B .lz
+format version 0 and the unextended version 1 are supported.
+Version 0 files were produced by
+.B lzip
+1.3 and older.
+Such files aren't common but may be found from file archives
+as a few source packages were released in this format.
+People might have old personal files in this format too.
+Decompression support for the format version 0 was removed in
+.B lzip
+1.18.
+.IP ""
+.B lzip
+1.4 and later create files in the format version 1.
+The sync flush marker extension to the format version 1 was added in
+.B lzip
+1.6.
+This extension is rarely used and isn't supported by
+.B xz
+(diagnosed as corrupt input).
+.TP
 .B raw
 Compress or uncompress a raw stream (no headers).
 This is meant for advanced users only.