Ryujinx/ARMeilleure/Common/BitMap.cs
FICTURE7 69093cf2d6
Optimize LSRA (#2563)
* Optimize `TryAllocateRegWithoutSpill` a bit

* Add a fast path for when all registers are live.
* Do not query `GetOverlapPosition` if the register is already in use
  (i.e: free position is 0).

* Do not allocate child split list if not parent

* Turn `LiveRange` into a reference struct

`LiveRange` is now a reference wrapping struct like `Operand` and
`Operation`.

It has also been changed into a singly linked-list. In micro-benchmarks
traversing the linked-list was faster than binary search on `List<T>`.
Even for quite large input sizes (e.g: 1,000,000), surprisingly.

Could be because the code gen for traversing the linked-list is much
much cleaner and there is no virtual dispatch happening when checking if
intervals overlap.

* Turn `LiveInterval` into an iterator

The LSRA allocates in forward order and never inspects previous
`LiveInterval` once they are expired. Something similar can be done for
the `LiveRange`s within the `LiveInterval`s themselves.

The `LiveInterval` is turned into an iterator which expires `LiveRange`
within it. The iterator is moved forward along with interval walking
code, i.e: AllocateInterval(context, interval, cIndex).

* Remove `LinearScanAllocator.Sources`

Local methods are less likely to cause allocations than lambdas.

* Optimize `GetOverlapPosition(interval)` a bit

Time complexity should be in O(n+m) instead of O(nm) now.

* Optimize `NumberLocals` a bit

Use the same idea as in `HybridAllocator` to store the visited state
in the MSB of the Operand's value instead of using a `HashSet<T>`.

* Optimize `InsertSplitCopies` a bit

Avoid allocating a redundant `CopyResolver`.

* Optimize `InsertSplitCopiesAtEdges` a bit

Avoid redundant allocations of `CopyResolver`.

* Use stack allocation for `freePositions`

Avoid redundant computations.

* Add `UseList`

Replace `SortedIntegerList` with an even more specialized data
structure. It allocates memory on the arena allocators and does not
require copying use positions when splitting it.

* Turn `LiveInterval` into a reference struct

`LiveInterval` is now a reference wrapping struct like `Operand` and
`Operation`.

The rationale behind turning this in a reference wrapping struct is
because a `LiveInterval` is associated with each local variable, and
these intervals may themselves be split further. I've seen translations
having up to 8000 local variables.

To make the `LiveInterval` unmanaged, a new data structure called
`LiveIntervalList` was added to store child splits. This differs from
`SortedList<,>` because it can contain intervals with the same start
position.

Really wished we got some more of C++ template in C#. :^(

* Optimize `GetChildSplit` a bit

No need to inspect the remaining ranges if we've reached a range which
starts after position, since the split list is ordered.

* Optimize `CopyResolver` a bit

Lazily allocate the fill, spill and parallel copy structures since most
of the time only one of them is needed.

* Optimize `BitMap.Enumerator` a bit

Marking `MoveNext` as `AggressiveInlining` allows RyuJIT to promote the
`Enumerator` struct into registers completely, reducing load/store code
a lot since it does not have to store the struct on the stack for ABI
purposes.

* Use stack allocation for `use/blockedPositions`

* Optimize `AllocateWithSpill` a bit

* Address feedback

* Make `LiveInterval.AddRange(,)` more conservative

Produces no diff against master, but just for good measure.
2021-10-08 18:15:44 -03:00

222 lines
5.2 KiB
C#

using System;
using System.Collections;
using System.Collections.Generic;
using System.Numerics;
using System.Runtime.CompilerServices;
namespace ARMeilleure.Common
{
    /// <summary>
    /// Growable set of bits stored as an array of 64-bit words obtained from an <see cref="Allocator"/>.
    /// Enumerating the map yields the indices of the bits that are set, in ascending order.
    /// </summary>
    unsafe class BitMap : IEnumerable<int>, IDisposable
    {
        // Number of bits held by one word of the backing storage.
        private const int IntSize = 64;

        // Mask extracting the position of a bit inside its word.
        private const int IntMask = IntSize - 1;

        // Number of 64-bit words currently addressable through _masks.
        private int _count;

        // Backing storage; null until the first growth and after Dispose().
        private long* _masks;

        private readonly Allocator _allocator;

        /// <summary>
        /// Creates an empty map; storage is requested from <paramref name="allocator"/> on first growth.
        /// </summary>
        /// <param name="allocator">Allocator used to obtain and release the backing words</param>
        public BitMap(Allocator allocator)
        {
            _allocator = allocator;
        }

        /// <summary>
        /// Creates a map with room for at least <paramref name="capacity"/> bits, all initially unset.
        /// </summary>
        /// <param name="allocator">Allocator used to obtain and release the backing words</param>
        /// <param name="capacity">Number of bits to reserve up front; values below 1 reserve nothing</param>
        public BitMap(Allocator allocator, int capacity) : this(allocator)
        {
            EnsureCapacity(capacity);
        }

        /// <summary>
        /// Sets a single bit, growing the map if needed.
        /// </summary>
        /// <param name="bit">Zero-based index of the bit to set</param>
        /// <returns>True if the bit was previously unset, false if it was already set</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="bit"/> is negative</exception>
        public bool Set(int bit)
        {
            if (bit < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(bit), bit, "Bit index must be non-negative.");
            }

            int wordIndex = bit / IntSize;

            // Growing by word index rather than by "bit + 1" cannot overflow for large indices.
            EnsureWordCount(wordIndex + 1);

            long wordMask = 1L << (bit & IntMask);

            if ((_masks[wordIndex] & wordMask) != 0)
            {
                return false;
            }

            _masks[wordIndex] |= wordMask;

            return true;
        }

        /// <summary>
        /// Clears a single bit. Bits outside the stored range are already unset, so nothing is
        /// allocated or written for them.
        /// </summary>
        /// <param name="bit">Zero-based index of the bit to clear</param>
        public void Clear(int bit)
        {
            if (bit < 0)
            {
                return;
            }

            int wordIndex = bit / IntSize;

            if (wordIndex >= _count)
            {
                return;
            }

            _masks[wordIndex] &= ~(1L << (bit & IntMask));
        }

        /// <summary>
        /// Checks whether a single bit is set. This is a pure query: the map is never grown.
        /// </summary>
        /// <param name="bit">Zero-based index of the bit to test</param>
        /// <returns>True if the bit is set; false if it is unset or outside the stored range</returns>
        public bool IsSet(int bit)
        {
            if (bit < 0)
            {
                return false;
            }

            int wordIndex = bit / IntSize;

            if (wordIndex >= _count)
            {
                return false;
            }

            return (_masks[wordIndex] & (1L << (bit & IntMask))) != 0;
        }

        /// <summary>
        /// Finds the lowest bit index that is not set.
        /// </summary>
        /// <returns>
        /// Index of the first unset bit, or the total number of stored bits when every stored bit is set
        /// </returns>
        public int FindFirstUnset()
        {
            for (int index = 0; index < _count; index++)
            {
                long mask = _masks[index];

                // A word equal to -1 has all 64 bits set, so it holds no free bit.
                if (mask != -1L)
                {
                    return BitOperations.TrailingZeroCount(~mask) + index * IntSize;
                }
            }

            return _count * IntSize;
        }

        /// <summary>
        /// Sets every bit that is set in <paramref name="map"/> (in-place union).
        /// </summary>
        /// <param name="map">Map whose set bits are added to this one</param>
        /// <returns>True if at least one bit of this map changed</returns>
        public bool Set(BitMap map)
        {
            int otherCount = map._count;

            EnsureWordCount(otherCount);

            long* otherMasks = map._masks;

            bool modified = false;

            // Only the words that exist in the other map are visited: this map may be the larger
            // one, and indexing the other map's storage past its own word count is out of bounds.
            for (int index = 0; index < otherCount; index++)
            {
                long oldValue = _masks[index];
                long newValue = oldValue | otherMasks[index];

                if (oldValue != newValue)
                {
                    _masks[index] = newValue;

                    modified = true;
                }
            }

            return modified;
        }

        /// <summary>
        /// Clears every bit that is set in <paramref name="map"/> (in-place difference).
        /// </summary>
        /// <param name="map">Map whose set bits are removed from this one</param>
        /// <returns>True if at least one bit of this map changed</returns>
        public bool Clear(BitMap map)
        {
            // Words present in only one of the two maps cannot change, so neither growing this
            // map nor reading past the end of the other one is needed.
            int sharedCount = Math.Min(_count, map._count);

            long* otherMasks = map._masks;

            bool modified = false;

            for (int index = 0; index < sharedCount; index++)
            {
                long oldValue = _masks[index];
                long newValue = oldValue & ~otherMasks[index];

                if (oldValue != newValue)
                {
                    _masks[index] = newValue;

                    modified = true;
                }
            }

            return modified;
        }

        /// <summary>
        /// Makes sure at least <paramref name="size"/> bits can be addressed.
        /// </summary>
        /// <param name="size">Required number of bits; values below 1 are a no-op</param>
        private void EnsureCapacity(int size)
        {
            // Round up to whole words in 64-bit arithmetic so that sizes close to int.MaxValue do not wrap.
            EnsureWordCount((int)(((long)size + IntMask) / IntSize));
        }

        /// <summary>
        /// Grows the backing storage to <paramref name="count"/> words when it is currently smaller.
        /// Existing words are preserved and the newly added words are zeroed.
        /// </summary>
        /// <param name="count">Required number of 64-bit words</param>
        private void EnsureWordCount(int count)
        {
            if (count <= _count)
            {
                return;
            }

            long* oldMasks = _masks;

            var oldSpan = new Span<long>(oldMasks, _count);

            _masks = _allocator.Allocate<long>((uint)count);
            _count = count;

            var newSpan = new Span<long>(_masks, count);

            oldSpan.CopyTo(newSpan);
            newSpan.Slice(oldSpan.Length).Clear();

            // Same null check as Dispose(): there is nothing to release before the first allocation.
            if (oldMasks != null)
            {
                _allocator.Free(oldMasks);
            }
        }

        /// <summary>
        /// Returns the backing storage to the allocator and leaves the map empty.
        /// </summary>
        public void Dispose()
        {
            if (_masks != null)
            {
                _allocator.Free(_masks);

                _masks = null;

                // Keep the word count in sync with the released storage so that later calls
                // reallocate (or see an empty map) instead of dereferencing a null pointer.
                _count = 0;
            }
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        IEnumerator<int> IEnumerable<int>.GetEnumerator()
        {
            return GetEnumerator();
        }

        /// <summary>
        /// Gets a struct enumerator over the indices of the set bits.
        /// </summary>
        /// <returns>Enumerator positioned before the first set bit</returns>
        public Enumerator GetEnumerator()
        {
            return new Enumerator(this);
        }

        /// <summary>
        /// Enumerates the indices of the set bits of a <see cref="BitMap"/> in ascending order.
        /// </summary>
        public struct Enumerator : IEnumerator<int>
        {
            // Index of the word being scanned; -1 before the first MoveNext().
            private long _index;

            // Bits of the current word that have not been yielded yet.
            private long _mask;

            // Position of the current bit inside the current word.
            private int _bit;

            private readonly BitMap _map;

            /// <summary>
            /// Index of the bit found by the last successful <see cref="MoveNext"/>.
            /// </summary>
            public int Current => (int)_index * IntSize + _bit;

            object IEnumerator.Current => Current;

            /// <summary>
            /// Creates an enumerator positioned before the first set bit of <paramref name="map"/>.
            /// </summary>
            /// <param name="map">Map to enumerate</param>
            public Enumerator(BitMap map)
            {
                _index = -1;
                _mask = 0;
                _bit = 0;
                _map = map;
            }

            /// <summary>
            /// Advances to the next set bit.
            /// </summary>
            /// <returns>True if a set bit was found, false once all words have been scanned</returns>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            public bool MoveNext()
            {
                // Drop the bit yielded by the previous call so it is not reported again.
                if (_mask != 0)
                {
                    _mask &= ~(1L << _bit);
                }

                // Manually hoist these loads, because RyuJIT does not.
                long count = (uint)_map._count;
                long* masks = _map._masks;

                // Skip forward to the next word that still has pending bits.
                while (_mask == 0)
                {
                    if (++_index >= count)
                    {
                        return false;
                    }

                    _mask = masks[_index];
                }

                _bit = BitOperations.TrailingZeroCount(_mask);

                return true;
            }

            /// <summary>
            /// Moves the enumerator back to its initial position, before the first set bit.
            /// </summary>
            public void Reset()
            {
                _index = -1;
                _mask = 0;
                _bit = 0;
            }

            /// <summary>
            /// Does nothing; the enumerator owns no resources.
            /// </summary>
            public void Dispose() { }
        }
    }
}