diff --git a/src/liblzma/lz/lz_decoder.c b/src/liblzma/lz/lz_decoder.c
index 73bf20d9..92913f22 100644
--- a/src/liblzma/lz/lz_decoder.c
+++ b/src/liblzma/lz/lz_decoder.c
@@ -53,9 +53,10 @@ typedef struct {
 static void
 lz_decoder_reset(lzma_coder *coder)
 {
-	coder->dict.pos = 0;
+	coder->dict.pos = 2 * LZ_DICT_REPEAT_MAX;
 	coder->dict.full = 0;
-	coder->dict.buf[coder->dict.size - 1] = '\0';
+	coder->dict.buf[2 * LZ_DICT_REPEAT_MAX - 1] = '\0';
+	coder->dict.has_wrapped = false;
 	coder->dict.need_reset = false;
 	return;
 }
@@ -69,8 +70,15 @@ decode_buffer(lzma_coder *coder,
 {
 	while (true) {
 		// Wrap the dictionary if needed.
-		if (coder->dict.pos == coder->dict.size)
-			coder->dict.pos = 0;
+		if (coder->dict.pos == coder->dict.size) {
+			// See the comment of #define LZ_DICT_REPEAT_MAX.
+			coder->dict.pos = LZ_DICT_REPEAT_MAX;
+			coder->dict.has_wrapped = true;
+			memcpy(coder->dict.buf, coder->dict.buf
+						+ coder->dict.size
+						- LZ_DICT_REPEAT_MAX,
+					LZ_DICT_REPEAT_MAX);
+		}
 
 		// Store the current dictionary position. It is needed to know
 		// where to start copying to the out[] buffer.
@@ -252,21 +260,31 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 	// dictionary to the output buffer, since applications are
 	// recommended to give aligned buffers to liblzma.
 	//
+	// Reserve room for the 2 * LZ_DICT_REPEAT_MAX extra bytes that are
+	// added to the dictionary size below when computing alloc_size.
+	//
 	// Avoid integer overflow.
-	if (lz_options.dict_size > SIZE_MAX - 15)
+	if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
 		return LZMA_MEM_ERROR;
 
 	lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
 
+	// Reserve extra space as explained in the comment
+	// of #define LZ_DICT_REPEAT_MAX.
+	const size_t alloc_size
+			= lz_options.dict_size + 2 * LZ_DICT_REPEAT_MAX;
+
 	// Allocate and initialize the dictionary.
-	if (coder->dict.size != lz_options.dict_size) {
+	if (coder->dict.size != alloc_size) {
 		lzma_free(coder->dict.buf, allocator);
-		coder->dict.buf
-				= lzma_alloc(lz_options.dict_size, allocator);
+		coder->dict.buf = lzma_alloc(alloc_size, allocator);
 		if (coder->dict.buf == NULL)
 			return LZMA_MEM_ERROR;
 
-		coder->dict.size = lz_options.dict_size;
+		// NOTE: Yes, alloc_size, not lz_options.dict_size. The way
+		// coder->dict.full is updated will take care that we will
+		// still reject distances larger than lz_options.dict_size.
+		coder->dict.size = alloc_size;
 	}
 
 	lz_decoder_reset(next->coder);
@@ -279,9 +297,12 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 		const size_t copy_size = my_min(lz_options.preset_dict_size,
 				lz_options.dict_size);
 		const size_t offset = lz_options.preset_dict_size - copy_size;
-		memcpy(coder->dict.buf, lz_options.preset_dict + offset,
+		memcpy(coder->dict.buf + coder->dict.pos,
+				lz_options.preset_dict + offset,
 				copy_size);
-		coder->dict.pos = copy_size;
+
+		// dict.pos isn't zero after lz_decoder_reset().
+		coder->dict.pos += copy_size;
 		coder->dict.full = copy_size;
 	}
 
diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h
index 3b41649c..da273480 100644
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@@ -16,10 +16,28 @@
 #include "common.h"
 
 
+/// Maximum length of a match rounded up to a multiple of 16, which is
+/// a good size for aligned memcpy(). The allocated dictionary buffer will
+/// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
+///
+/// (1) Every time the decoder reaches the end of the dictionary buffer,
+///     the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
+///     This way dict_repeat() will only need to copy from one place,
+///     never from both the end and beginning of the buffer.
+///
+/// (2) The other LZ_DICT_REPEAT_MAX bytes are kept as a buffer between
+///     the oldest byte still in the dictionary and the current write
+///     position. This way dict_repeat(dict, dict->size - 1, &len)
+///     won't need memmove() as the copying cannot overlap.
+///
+/// Note that memcpy() still cannot be used if distance < len.
+///
+/// LZMA's longest match length is 273 so pick a multiple of 16 above that.
+#define LZ_DICT_REPEAT_MAX 288
+
+
 typedef struct {
-	/// Pointer to the dictionary buffer. It can be an allocated buffer
-	/// internal to liblzma, or it can a be a buffer given by the
-	/// application when in single-call mode (not implemented yet).
+	/// Pointer to the dictionary buffer.
 	uint8_t *buf;
 
 	/// Write position in dictionary. The next byte will be written to
@@ -34,9 +52,16 @@ typedef struct {
 	/// Write limit
 	size_t limit;
 
-	/// Size of the dictionary
+	/// Allocated size of buf. This is 2 * LZ_DICT_REPEAT_MAX bytes
+	/// larger than the actual dictionary size. The actual dictionary
+	/// size is enforced through "full", which can be at most
+	/// "size - 2 * LZ_DICT_REPEAT_MAX".
 	size_t size;
 
+	/// True once the dictionary has become full and the writing position
+	/// has been wrapped in decode_buffer() in lz_decoder.c.
+	bool has_wrapped;
+
 	/// True when dictionary should be reset before decoding more data.
 	bool need_reset;
 
@@ -102,7 +127,16 @@ static inline uint8_t
 dict_get(const lzma_dict *const dict, const uint32_t distance)
 {
 	return dict->buf[dict->pos - distance - 1
-			+ (distance < dict->pos ? 0 : dict->size)];
+			+ (distance < dict->pos
+				? 0 : dict->size - LZ_DICT_REPEAT_MAX)];
+}
+
+
+/// Optimized version of dict_get(dict, 0)
+static inline uint8_t
+dict_get0(const lzma_dict *const dict)
+{
+	return dict->buf[dict->pos - 1];
 }
 
 
@@ -131,50 +165,27 @@ dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
 	uint32_t left = my_min(dict_avail, *len);
 	*len -= left;
 
+	size_t back = dict->pos - distance - 1;
+	if (distance >= dict->pos)
+		back += dict->size - LZ_DICT_REPEAT_MAX;
+
 	// Repeat a block of data from the history. Because memcpy() is faster
 	// than copying byte by byte in a loop, the copying process gets split
-	// into three cases.
+	// into two cases.
 	if (distance < left) {
 		// Source and target areas overlap, thus we can't use
 		// memcpy() nor even memmove() safely.
 		do {
-			dict->buf[dict->pos] = dict_get(dict, distance);
-			++dict->pos;
+			dict->buf[dict->pos++] = dict->buf[back++];
 		} while (--left > 0);
-
-	} else if (distance < dict->pos) {
-		// The easiest and fastest case
-		memcpy(dict->buf + dict->pos,
-				dict->buf + dict->pos - distance - 1,
-				left);
-		dict->pos += left;
-
 	} else {
-		// The bigger the dictionary, the more rare this
-		// case occurs. We need to "wrap" the dict, thus
-		// we might need two memcpy() to copy all the data.
-		assert(dict->full == dict->size);
-		const uint32_t copy_pos
-				= dict->pos - distance - 1 + dict->size;
-		uint32_t copy_size = dict->size - copy_pos;
-
-		if (copy_size < left) {
-			memmove(dict->buf + dict->pos, dict->buf + copy_pos,
-					copy_size);
-			dict->pos += copy_size;
-			copy_size = left - copy_size;
-			memcpy(dict->buf + dict->pos, dict->buf, copy_size);
-			dict->pos += copy_size;
-		} else {
-			memmove(dict->buf + dict->pos, dict->buf + copy_pos,
-					left);
-			dict->pos += left;
-		}
+		memcpy(dict->buf + dict->pos, dict->buf + back, left);
+		dict->pos += left;
 	}
 
 	// Update how full the dictionary is.
-	if (dict->full < dict->pos)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
 
 	return unlikely(*len != 0);
 }
@@ -185,8 +196,8 @@ dict_put(lzma_dict *dict, uint8_t byte)
 {
 	dict->buf[dict->pos++] = byte;
 
-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
 }
 
 
@@ -198,11 +209,7 @@ dict_put_safe(lzma_dict *dict, uint8_t byte)
 	if (dict->pos == dict->limit)
 		return true;
 
-	dict->buf[dict->pos++] = byte;
-
-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
-
+	dict_put(dict, byte);
 	return false;
 }
 
@@ -226,8 +233,8 @@ dict_write(lzma_dict *restrict dict, const uint8_t *restrict in,
 	*left -= lzma_bufcpy(in, in_pos, in_size,
 			dict->buf, &dict->pos, dict->limit);
 
-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
 
 	return;
 }
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index c5049a48..81149006 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -360,7 +360,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			// lc params.
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
-					dict.pos, dict_get(&dict, 0));
+					dict.pos, dict_get0(&dict));
 
 			if (is_literal_state(state)) {
 				update_literal_normal(state);
@@ -685,7 +685,7 @@ slow:
 
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
-					dict.pos, dict_get(&dict, 0));
+					dict.pos, dict_get0(&dict));
 			symbol = 1;
 
 			if (is_literal_state(state)) {