using Ryujinx.Common;
using Ryujinx.Graphics.Device;
using Ryujinx.Graphics.Gpu.Engine.Threed;
using Ryujinx.Graphics.Gpu.Memory;
using Ryujinx.Graphics.Texture;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
namespace Ryujinx.Graphics.Gpu.Engine.Dma
{
///
/// Represents a DMA copy engine class.
///
class DmaClass : IDeviceState
{
private readonly GpuContext _context;
private readonly GpuChannel _channel;
private readonly ThreedClass _3dEngine;
private readonly DeviceState _state;
///
/// Copy flags passed on DMA launch.
///
[Flags]
private enum CopyFlags
{
SrcLinear = 1 << 7,
DstLinear = 1 << 8,
MultiLineEnable = 1 << 9,
RemapEnable = 1 << 10
}
///
/// Creates a new instance of the DMA copy engine class.
///
/// GPU context
/// GPU channel
/// 3D engine
public DmaClass(GpuContext context, GpuChannel channel, ThreedClass threedEngine)
{
_context = context;
_channel = channel;
_3dEngine = threedEngine;
_state = new DeviceState(new Dictionary
{
{ nameof(DmaClassState.LaunchDma), new RwCallback(LaunchDma, null) }
});
}
///
/// Reads data from the class registers.
///
/// Register byte offset
/// Data at the specified offset
public int Read(int offset) => _state.Read(offset);
///
/// Writes data to the class registers.
///
/// Register byte offset
/// Data to be written
public void Write(int offset, int data) => _state.Write(offset, data);
///
/// Determine if a buffer-to-texture region covers the entirety of a texture.
///
/// Texture to compare
/// True if the texture is linear, false if block linear
/// Texture bytes per pixel
/// Texture stride
/// Number of pixels to be copied
/// Number of lines to be copied
///
private static bool IsTextureCopyComplete(DmaTexture tex, bool linear, int bpp, int stride, int xCount, int yCount)
{
if (linear)
{
// If the stride is negative, the texture has to be flipped, so
// the fast copy is not trivial, use the slow path.
if (stride <= 0)
{
return false;
}
int alignWidth = Constants.StrideAlignment / bpp;
return stride / bpp == BitUtils.AlignUp(xCount, alignWidth);
}
else
{
int alignWidth = Constants.GobAlignment / bpp;
return tex.RegionX == 0 &&
tex.RegionY == 0 &&
tex.Width == BitUtils.AlignUp(xCount, alignWidth) &&
tex.Height == yCount;
}
}
///
/// Releases a semaphore for a given LaunchDma method call.
///
/// The LaunchDma call argument
private void ReleaseSemaphore(int argument)
{
LaunchDmaSemaphoreType type = (LaunchDmaSemaphoreType)((argument >> 3) & 0x3);
if (type != LaunchDmaSemaphoreType.None)
{
ulong address = ((ulong)_state.State.SetSemaphoreA << 32) | _state.State.SetSemaphoreB;
if (type == LaunchDmaSemaphoreType.ReleaseOneWordSemaphore)
{
_channel.MemoryManager.Write(address, _state.State.SetSemaphorePayload);
}
else /* if (type == LaunchDmaSemaphoreType.ReleaseFourWordSemaphore) */
{
_channel.MemoryManager.Write(address + 8, _context.GetTimestamp());
_channel.MemoryManager.Write(address, (ulong)_state.State.SetSemaphorePayload);
}
}
}
///
/// Performs a buffer to buffer, or buffer to texture copy.
///
/// The LaunchDma call argument
private void DmaCopy(int argument)
{
var memoryManager = _channel.MemoryManager;
CopyFlags copyFlags = (CopyFlags)argument;
bool srcLinear = copyFlags.HasFlag(CopyFlags.SrcLinear);
bool dstLinear = copyFlags.HasFlag(CopyFlags.DstLinear);
bool copy2D = copyFlags.HasFlag(CopyFlags.MultiLineEnable);
bool remap = copyFlags.HasFlag(CopyFlags.RemapEnable);
uint size = _state.State.LineLengthIn;
if (size == 0)
{
return;
}
ulong srcGpuVa = ((ulong)_state.State.OffsetInUpperUpper << 32) | _state.State.OffsetInLower;
ulong dstGpuVa = ((ulong)_state.State.OffsetOutUpperUpper << 32) | _state.State.OffsetOutLower;
int xCount = (int)_state.State.LineLengthIn;
int yCount = (int)_state.State.LineCount;
_3dEngine.CreatePendingSyncs();
_3dEngine.FlushUboDirty();
if (copy2D)
{
// Buffer to texture copy.
int componentSize = (int)_state.State.SetRemapComponentsComponentSize + 1;
int srcBpp = remap ? ((int)_state.State.SetRemapComponentsNumSrcComponents + 1) * componentSize : 1;
int dstBpp = remap ? ((int)_state.State.SetRemapComponentsNumDstComponents + 1) * componentSize : 1;
var dst = Unsafe.As(ref _state.State.SetDstBlockSize);
var src = Unsafe.As(ref _state.State.SetSrcBlockSize);
int srcRegionX = 0, srcRegionY = 0, dstRegionX = 0, dstRegionY = 0;
if (!srcLinear)
{
srcRegionX = src.RegionX;
srcRegionY = src.RegionY;
}
if (!dstLinear)
{
dstRegionX = dst.RegionX;
dstRegionY = dst.RegionY;
}
int srcStride = (int)_state.State.PitchIn;
int dstStride = (int)_state.State.PitchOut;
var srcCalculator = new OffsetCalculator(
src.Width,
src.Height,
srcStride,
srcLinear,
src.MemoryLayout.UnpackGobBlocksInY(),
src.MemoryLayout.UnpackGobBlocksInZ(),
srcBpp);
var dstCalculator = new OffsetCalculator(
dst.Width,
dst.Height,
dstStride,
dstLinear,
dst.MemoryLayout.UnpackGobBlocksInY(),
dst.MemoryLayout.UnpackGobBlocksInZ(),
dstBpp);
(int srcBaseOffset, int srcSize) = srcCalculator.GetRectangleRange(srcRegionX, srcRegionY, xCount, yCount);
(int dstBaseOffset, int dstSize) = dstCalculator.GetRectangleRange(dstRegionX, dstRegionY, xCount, yCount);
if (srcLinear && srcStride < 0)
{
srcBaseOffset += srcStride * (yCount - 1);
}
if (dstLinear && dstStride < 0)
{
dstBaseOffset += dstStride * (yCount - 1);
}
ReadOnlySpan srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true);
bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount);
bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount);
if (completeSource && completeDest)
{
var target = memoryManager.Physical.TextureCache.FindTexture(
memoryManager,
dstGpuVa,
dstBpp,
dstStride,
dst.Height,
xCount,
yCount,
dstLinear,
dst.MemoryLayout.UnpackGobBlocksInY(),
dst.MemoryLayout.UnpackGobBlocksInZ());
if (target != null)
{
byte[] data;
if (srcLinear)
{
data = LayoutConverter.ConvertLinearStridedToLinear(
target.Info.Width,
target.Info.Height,
1,
1,
xCount * srcBpp,
srcStride,
target.Info.FormatInfo.BytesPerPixel,
srcSpan);
}
else
{
data = LayoutConverter.ConvertBlockLinearToLinear(
src.Width,
src.Height,
src.Depth,
1,
1,
1,
1,
1,
srcBpp,
src.MemoryLayout.UnpackGobBlocksInY(),
src.MemoryLayout.UnpackGobBlocksInZ(),
1,
new SizeInfo((int)target.Size),
srcSpan);
}
target.SynchronizeMemory();
target.SetData(data);
target.SignalModified();
return;
}
else if (srcCalculator.LayoutMatches(dstCalculator))
{
// No layout conversion has to be performed, just copy the data entirely.
memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan);
return;
}
}
unsafe bool Convert(Span dstSpan, ReadOnlySpan srcSpan) where T : unmanaged
{
if (srcLinear && dstLinear && srcBpp == dstBpp)
{
// Optimized path for purely linear copies - we don't need to calculate every single byte offset,
// and we can make use of Span.CopyTo which is very very fast (even compared to pointers)
for (int y = 0; y < yCount; y++)
{
srcCalculator.SetY(srcRegionY + y);
dstCalculator.SetY(dstRegionY + y);
int srcOffset = srcCalculator.GetOffset(srcRegionX);
int dstOffset = dstCalculator.GetOffset(dstRegionX);
srcSpan.Slice(srcOffset - srcBaseOffset, xCount * srcBpp)
.CopyTo(dstSpan.Slice(dstOffset - dstBaseOffset, xCount * dstBpp));
}
}
else
{
fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
{
byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
byte* srcBase = srcPtr - srcBaseOffset;
for (int y = 0; y < yCount; y++)
{
srcCalculator.SetY(srcRegionY + y);
dstCalculator.SetY(dstRegionY + y);
for (int x = 0; x < xCount; x++)
{
int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
*(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
}
}
}
}
return true;
}
// OPT: This allocates a (potentially) huge temporary array and then copies an existing
// region of memory into it, data that might get overwritten entirely anyways. Ideally this should
// all be rewritten to use pooled arrays, but that gets complicated with packed data and strides
Span dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
bool _ = srcBpp switch
{
1 => Convert(dstSpan, srcSpan),
2 => Convert(dstSpan, srcSpan),
4 => Convert(dstSpan, srcSpan),
8 => Convert(dstSpan, srcSpan),
12 => Convert(dstSpan, srcSpan),
16 => Convert>(dstSpan, srcSpan),
_ => throw new NotSupportedException($"Unable to copy ${srcBpp} bpp pixel format.")
};
memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan);
}
else
{
if (remap &&
_state.State.SetRemapComponentsDstX == SetRemapComponentsDst.ConstA &&
_state.State.SetRemapComponentsDstY == SetRemapComponentsDst.ConstA &&
_state.State.SetRemapComponentsDstZ == SetRemapComponentsDst.ConstA &&
_state.State.SetRemapComponentsDstW == SetRemapComponentsDst.ConstA &&
_state.State.SetRemapComponentsNumSrcComponents == SetRemapComponentsNumComponents.One &&
_state.State.SetRemapComponentsNumDstComponents == SetRemapComponentsNumComponents.One &&
_state.State.SetRemapComponentsComponentSize == SetRemapComponentsComponentSize.Four)
{
// Fast path for clears when remap is enabled.
memoryManager.Physical.BufferCache.ClearBuffer(memoryManager, dstGpuVa, size * 4, _state.State.SetRemapConstA);
}
else
{
// TODO: Implement remap functionality.
// Buffer to buffer copy.
bool srcIsPitchKind = memoryManager.GetKind(srcGpuVa).IsPitch();
bool dstIsPitchKind = memoryManager.GetKind(dstGpuVa).IsPitch();
if (!srcIsPitchKind && dstIsPitchKind)
{
CopyGobBlockLinearToLinear(memoryManager, srcGpuVa, dstGpuVa, size);
}
else if (srcIsPitchKind && !dstIsPitchKind)
{
CopyGobLinearToBlockLinear(memoryManager, srcGpuVa, dstGpuVa, size);
}
else
{
memoryManager.Physical.BufferCache.CopyBuffer(memoryManager, srcGpuVa, dstGpuVa, size);
}
}
}
}
///
/// Copies block linear data with block linear GOBs to a block linear destination with linear GOBs.
///
/// GPU memory manager
/// Source GPU virtual address
/// Destination GPU virtual address
/// Size in bytes of the copy
private static void CopyGobBlockLinearToLinear(MemoryManager memoryManager, ulong srcGpuVa, ulong dstGpuVa, ulong size)
{
if (((srcGpuVa | dstGpuVa | size) & 0xf) == 0)
{
for (ulong offset = 0; offset < size; offset += 16)
{
Vector128 data = memoryManager.Read>(ConvertGobLinearToBlockLinearAddress(srcGpuVa + offset), true);
memoryManager.Write(dstGpuVa + offset, data);
}
}
else
{
for (ulong offset = 0; offset < size; offset++)
{
byte data = memoryManager.Read(ConvertGobLinearToBlockLinearAddress(srcGpuVa + offset), true);
memoryManager.Write(dstGpuVa + offset, data);
}
}
}
///
/// Copies block linear data with linear GOBs to a block linear destination with block linear GOBs.
///
/// GPU memory manager
/// Source GPU virtual address
/// Destination GPU virtual address
/// Size in bytes of the copy
private static void CopyGobLinearToBlockLinear(MemoryManager memoryManager, ulong srcGpuVa, ulong dstGpuVa, ulong size)
{
if (((srcGpuVa | dstGpuVa | size) & 0xf) == 0)
{
for (ulong offset = 0; offset < size; offset += 16)
{
Vector128 data = memoryManager.Read>(srcGpuVa + offset, true);
memoryManager.Write(ConvertGobLinearToBlockLinearAddress(dstGpuVa + offset), data);
}
}
else
{
for (ulong offset = 0; offset < size; offset++)
{
byte data = memoryManager.Read(srcGpuVa + offset, true);
memoryManager.Write(ConvertGobLinearToBlockLinearAddress(dstGpuVa + offset), data);
}
}
}
///
/// Calculates the GOB block linear address from a linear address.
///
/// Linear address
/// Block linear address
private static ulong ConvertGobLinearToBlockLinearAddress(ulong address)
{
// y2 y1 y0 x5 x4 x3 x2 x1 x0 -> x5 y2 y1 x4 y0 x3 x2 x1 x0
return (address & ~0x1f0UL) |
((address & 0x40) >> 2) |
((address & 0x10) << 1) |
((address & 0x180) >> 1) |
((address & 0x20) << 3);
}
///
/// Performs a buffer to buffer, or buffer to texture copy, then optionally releases a semaphore.
///
/// Method call argument
private void LaunchDma(int argument)
{
DmaCopy(argument);
ReleaseSemaphore(argument);
}
}
}