using Ryujinx.Common.Memory;
using Ryujinx.Graphics.Nvdec.Vp9.Common;
using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
using Ryujinx.Graphics.Nvdec.Vp9.Types;
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;

namespace Ryujinx.Graphics.Nvdec.Vp9
{
    internal static class LoopFilter
    {
        public const int MaxLoopFilter = 63;
        public const int MaxRefLfDeltas = 4;
        public const int MaxModeLfDeltas = 2;

        // Per-row progress tracker. Each row publishes the last column it has
        // finished (SyncWrite) and a caller working on row r can block until
        // row r - 1 is far enough ahead (SyncRead). One monitor object is kept
        // per row. Both operations are no-ops until Initialize has been called.
        private struct LfSync
        {
            private int[] _curSbCol;
            private object[] _syncObjects;
            private int _syncRange;

            // Maps the frame width to the column granularity used for signalling.
            private static int GetSyncRange(int width)
            {
                // nsync numbers are picked by testing. For example, for 4k
                // video, using 4 gives best performance.
                return width switch
                {
                    < 640 => 1,
                    <= 1280 => 2,
                    <= 4096 => 4,
                    _ => 8,
                };
            }

            // (Re)allocates the per-row state when the row count changed, picks
            // the sync range for the given width and marks every row as "no
            // column finished yet" (-1).
            public void Initialize(int width, int sbRows)
            {
                bool needsAllocation = _curSbCol is null || _curSbCol.Length != sbRows;

                if (needsAllocation)
                {
                    _curSbCol = new int[sbRows];
                    _syncObjects = new object[sbRows];

                    for (int row = 0; row < _syncObjects.Length; row++)
                    {
                        _syncObjects[row] = new object();
                    }
                }

                _syncRange = GetSyncRange(width);
                Array.Fill(_curSbCol, -1);
            }

            // Blocks until row r - 1 has published a column that is at least
            // one sync range ahead of column c. Nothing is checked for the
            // first row, or for columns that are not a multiple of the range.
            public readonly void SyncRead(int r, int c)
            {
                if (_curSbCol is null)
                {
                    return;
                }

                int range = _syncRange;

                if (r == 0 || (c & (range - 1)) != 0)
                {
                    return;
                }

                int previousRow = r - 1;
                object gate = _syncObjects[previousRow];

                lock (gate)
                {
                    while (_curSbCol[previousRow] - range < c)
                    {
                        Monitor.Wait(gate);
                    }
                }
            }

            // Publishes progress for row r and pulses the row's monitor.
            // Only signal when there are enough filtered SB for next row to run:
            // intermediate columns are published on multiples of the sync range,
            // and the last column publishes a value past the end of the row.
            public readonly void SyncWrite(int r, int c, int sbCols)
            {
                if (_curSbCol is null)
                {
                    return;
                }

                int range = _syncRange;
                bool isLastColumn = c >= sbCols - 1;

                if (!isLastColumn && c % range != 0)
                {
                    return;
                }

                int published = isLastColumn ? sbCols + range : c;
                object gate = _syncObjects[r];

                lock (gate)
                {
                    _curSbCol[r] = published;
                    Monitor.Pulse(gate);
                }
            }
        }

        // 64 bit masks for left transform size. Each 1 represents a position where
        // we should apply a loop filter across the left border of an 8x8 block
        // boundary.
        //
        // In the case of TxSize.Tx16x16 (low order byte first) we end up with
        // a mask that looks like this
        //
        //    10101010
        //    10101010
        //    10101010
        //    10101010
        //    10101010
        //    10101010
        //    10101010
        //    10101010
        //
        // A loopfilter should be applied to every other 8x8 horizontally.
        private static readonly ulong[] _left64X64TxformMask =
        [
            0xffffffffffffffffUL, // TxSize.Tx4x4
            0xffffffffffffffffUL, // TxSize.Tx8x8
            0x5555555555555555UL, // TxSize.Tx16x16
            0x1111111111111111UL // TxSize.Tx32x32
        ];

        // 64 bit masks for above transform size. Each 1 represents a position where
        // we should apply a loop filter across the top border of an 8x8 block
        // boundary.
        //
        // In the case of TxSize.Tx32x32 (low order byte first) we end up with
        // a mask that looks like this
        //
        //    11111111
        //    00000000
        //    00000000
        //    00000000
        //    11111111
        //    00000000
        //    00000000
        //    00000000
        //
        // A loopfilter should be applied to every fourth row vertically.
        private static readonly ulong[] _above64X64TxformMask =
        [
            0xffffffffffffffffUL, // TxSize.Tx4x4
            0xffffffffffffffffUL, // TxSize.Tx8x8
            0x00ff00ff00ff00ffUL, // TxSize.Tx16x16
            0x000000ff000000ffUL // TxSize.Tx32x32
        ];

        // 64 bit masks for prediction sizes (left). Each 1 represents a position
        // on the left border of an 8x8 block. These are aligned to the right most
        // appropriate bit, and then shifted into place.
        //
        // In the case of BLOCK_16x32 (low order byte first) we end up with
        // a mask that looks like this:
        //
        //    10000000
        //    10000000
        //    10000000
        //    10000000
        //    00000000
        //    00000000
        //    00000000
        //    00000000
        private static readonly ulong[] _leftPredictionMask =
        [
            0x0000000000000001UL, // BLOCK_4x4
            0x0000000000000001UL, // BLOCK_4x8
            0x0000000000000001UL, // BLOCK_8x4
            0x0000000000000001UL, // BLOCK_8x8
            0x0000000000000101UL, // BLOCK_8x16
            0x0000000000000001UL, // BLOCK_16x8
            0x0000000000000101UL, // BLOCK_16x16
            0x0000000001010101UL, // BLOCK_16x32
            0x0000000000000101UL, // BLOCK_32x16
            0x0000000001010101UL, // BLOCK_32x32
            0x0101010101010101UL, // BLOCK_32x64
            0x0000000001010101UL, // BLOCK_64x32
            0x0101010101010101UL // BLOCK_64x64
        ];

        // 64 bit mask to shift and set for each prediction size (above edge).
        private static readonly ulong[] _abovePredictionMask =
        [
            0x0000000000000001UL, // BLOCK_4x4
            0x0000000000000001UL, // BLOCK_4x8
            0x0000000000000001UL, // BLOCK_8x4
            0x0000000000000001UL, // BLOCK_8x8
            0x0000000000000001UL, // BLOCK_8x16
            0x0000000000000003UL, // BLOCK_16x8
            0x0000000000000003UL, // BLOCK_16x16
            0x0000000000000003UL, // BLOCK_16x32
            0x000000000000000fUL, // BLOCK_32x16
            0x000000000000000fUL, // BLOCK_32x32
            0x000000000000000fUL, // BLOCK_32x64
            0x00000000000000ffUL, // BLOCK_64x32
            0x00000000000000ffUL // BLOCK_64x64
        ];

        // 64 bit mask to shift and set for each prediction size. A bit is set for
        // each 8x8 block that would be in the left most block of the given block
        // size in the 64x64 block.
private static readonly ulong[] _sizeMask =
[
    0x0000000000000001UL, // BLOCK_4x4
    0x0000000000000001UL, // BLOCK_4x8
    0x0000000000000001UL, // BLOCK_8x4
    0x0000000000000001UL, // BLOCK_8x8
    0x0000000000000101UL, // BLOCK_8x16
    0x0000000000000003UL, // BLOCK_16x8
    0x0000000000000303UL, // BLOCK_16x16
    0x0000000003030303UL, // BLOCK_16x32
    0x0000000000000f0fUL, // BLOCK_32x16
    0x000000000f0f0f0fUL, // BLOCK_32x32
    0x0f0f0f0f0f0f0f0fUL, // BLOCK_32x64
    0x00000000ffffffffUL, // BLOCK_64x32
    0xffffffffffffffffUL // BLOCK_64x64
];

// These are used for masking the left and above borders.
// LeftBorder has bit 0 and bit 4 of every byte set; AboveBorder has the
// whole of bytes 0 and 4 set.
private const ulong LeftBorder = 0x1111111111111111UL;
private const ulong AboveBorder = 0x000000ff000000ffUL;

// 16 bit masks for uv transform sizes (left edges), indexed by (int)TxSize.
private static readonly ushort[] _left64X64TxformMaskUv =
[
    0xffff, // TxSize.Tx4x4
    0xffff, // TxSize.Tx8x8
    0x5555, // TxSize.Tx16x16
    0x1111 // TxSize.Tx32x32
];

// 16 bit masks for uv transform sizes (above edges), indexed by (int)TxSize.
private static readonly ushort[] _above64X64TxformMaskUv =
[
    0xffff, // TxSize.Tx4x4
    0xffff, // TxSize.Tx8x8
    0x0f0f, // TxSize.Tx16x16
    0x000f // TxSize.Tx32x32
];

// 16 bit left mask to shift and set for each uv prediction size.
private static readonly ushort[] _leftPredictionMaskUv =
[
    0x0001, // BLOCK_4x4
    0x0001, // BLOCK_4x8
    0x0001, // BLOCK_8x4
    0x0001, // BLOCK_8x8
    0x0001, // BLOCK_8x16
    0x0001, // BLOCK_16x8
    0x0001, // BLOCK_16x16
    0x0011, // BLOCK_16x32
    0x0001, // BLOCK_32x16
    0x0011, // BLOCK_32x32
    0x1111, // BLOCK_32x64
    0x0011, // BLOCK_64x32
    0x1111 // BLOCK_64x64
];

// 16 bit above mask to shift and set for each uv prediction size.
private static readonly ushort[] _abovePredictionMaskUv =
[
    0x0001, // BLOCK_4x4
    0x0001, // BLOCK_4x8
    0x0001, // BLOCK_8x4
    0x0001, // BLOCK_8x8
    0x0001, // BLOCK_8x16
    0x0001, // BLOCK_16x8
    0x0001, // BLOCK_16x16
    0x0001, // BLOCK_16x32
    0x0003, // BLOCK_32x16
    0x0003, // BLOCK_32x32
    0x0003, // BLOCK_32x64
    0x000f, // BLOCK_64x32
    0x000f // BLOCK_64x64
];

// 16 bit mask to shift and set for each uv prediction size
// (the element type is ushort; this is the uv counterpart of _sizeMask).
private static readonly ushort[] _sizeMaskUv =
[
    0x0001, // BLOCK_4x4
    0x0001, // BLOCK_4x8
    0x0001, // BLOCK_8x4
    0x0001, // BLOCK_8x8
    0x0001, // BLOCK_8x16
    0x0001, // BLOCK_16x8
    0x0001, // BLOCK_16x16
    0x0011, // BLOCK_16x32
    0x0003, // BLOCK_32x16
    0x0033, // BLOCK_32x32
    0x3333, // BLOCK_32x64
    0x00ff, // BLOCK_64x32
    0xffff // BLOCK_64x64
];

// uv counterparts of LeftBorder / AboveBorder.
private const ushort LeftBorderUv = 0x1111;
private const ushort AboveBorderUv = 0x000f;

// Maps (int)mi.Mode to the last index used when reading LoopFilterInfoN.Lvl
// (see GetFilterLevel): 0 for the ten intra modes, and 1, 1, 0, 1 for the
// four inter modes.
private static readonly int[] _modeLfLut =
[
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
    1, 1, 0, 1 // INTER_MODES (ZEROMV == 0)
];

// Returns the filter level stored for the block's segment id, its first
// reference frame and the mode class looked up in _modeLfLut.
private static byte GetFilterLevel(ref LoopFilterInfoN lfiN, ref ModeInfo mi)
{
    return lfiN.Lvl[mi.SegmentId][mi.RefFrame[0]][_modeLfLut[(int)mi.Mode]];
}

// Returns the tail of lf.Lfm starting at the entry for the given position:
// index = (miCol >> 3) + (miRow >> 3) * lf.LfmStride.
// NOTE(review): the Span return type looks like it lost its generic argument
// in this copy of the file — confirm against the upstream source.
private static Span GetLfm(ref Types.LoopFilter lf, int miRow, int miCol)
{
    return lf.Lfm.AsSpan()[((miCol >> 3) + ((miRow >> 3) * lf.LfmStride))..];
}

// 8x8 blocks in a superblock. A "1" represents the first block in a 16x16
// or greater area. Indexed as [rowInSb][colInSb].
private static readonly byte[][] _firstBlockIn16X16 =
[
    [1, 0, 1, 0, 1, 0, 1, 0],
    [0, 0, 0, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 1, 0, 1, 0],
    [0, 0, 0, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 1, 0, 1, 0],
    [0, 0, 0, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 1, 0, 1, 0],
    [0, 0, 0, 0, 0, 0, 0, 0]
];

// This function sets up the bit masks for a block represented
// by miRow, miCol in a 64x64 region.
// Accumulates into the LoopFilterMask entry selected by GetLfm(cm.Lf, miRow, miCol):
//  - the block's filter level, written into LflY as bh rows of bw entries
//    (row stride 8), starting at the block's position inside the superblock;
//  - prediction-edge and transform-edge bits for luma and, for positions
//    flagged in _firstBlockIn16X16, for chroma;
//  - internal 4x4 edge bits when the transform size is Tx4X4.
// Nothing is written when the filter level is 0.
// NOTE(review): bw / bh presumably count 8x8 units — confirm against callers.
public static void BuildMask(ref Vp9Common cm, ref ModeInfo mi, int miRow, int miCol, int bw, int bh)
{
    BlockSize blockSize = mi.SbType;
    TxSize txSizeY = mi.TxSize;
    ref LoopFilterInfoN lfiN = ref cm.LfInfo;
    int filterLevel = GetFilterLevel(ref lfiN, ref mi);
    TxSize txSizeUv = Luts.UvTxsizeLookup[(int)blockSize][(int)txSizeY][1][1];
    ref LoopFilterMask lfm = ref GetLfm(ref cm.Lf, miRow, miCol)[0];
    ref ulong leftY = ref lfm.LeftY[(int)txSizeY];
    ref ulong aboveY = ref lfm.AboveY[(int)txSizeY];
    ref ulong int4X4Y = ref lfm.Int4X4Y;
    ref ushort leftUv = ref lfm.LeftUv[(int)txSizeUv];
    ref ushort aboveUv = ref lfm.AboveUv[(int)txSizeUv];
    ref ushort int4X4Uv = ref lfm.Int4X4Uv;

    // Position of the block inside its 8x8-unit superblock, and the matching
    // bit offsets into the 64 bit luma mask (8 bits per row) and the 16 bit
    // chroma mask (4 bits per row, half resolution).
    int rowInSb = miRow & 7;
    int colInSb = miCol & 7;
    int shiftY = colInSb + (rowInSb << 3);
    int shiftUv = (colInSb >> 1) + ((rowInSb >> 1) << 2);
    int buildUv = _firstBlockIn16X16[rowInSb][colInSb];

    if (filterLevel == 0)
    {
        return;
    }

    // Record the filter level for every 8x8 position covered by the block.
    int index = shiftY;
    Span lflYSpan = lfm.LflY.AsSpan();
    for (int i = 0; i < bh; i++)
    {
        MemoryMarshal.CreateSpan(ref lflYSpan[index], 64 - index)[..bw].Fill((byte)filterLevel);
        index += 8;
    }

    // These set 1 in the current block size for the block size edges.
    // For instance if the block size is 32x16, we'll set:
    //    above =   1111
    //              0000
    //    and
    //    left  =   1000
    //          =   1000
    // NOTE : In this example the low bit is left most ( 1000 ) is stored as
    //        1, not 8...
    //
    // U and V set things on a 16 bit scale.
    //
    aboveY |= _abovePredictionMask[(int)blockSize] << shiftY;
    leftY |= _leftPredictionMask[(int)blockSize] << shiftY;

    if (buildUv != 0)
    {
        aboveUv |= (ushort)(_abovePredictionMaskUv[(int)blockSize] << shiftUv);
        leftUv |= (ushort)(_leftPredictionMaskUv[(int)blockSize] << shiftUv);
    }

    // If the block has no coefficients and is not intra we skip applying
    // the loop filter on block edges.
    if (mi.Skip != 0 && mi.IsInterBlock())
    {
        return;
    }

    // Add a mask for the transform size. The transform size mask is set to
    // be correct for a 64x64 prediction block size. Mask to match the size of
    // the block we are working on and then shift it into place.
    aboveY |= (_sizeMask[(int)blockSize] & _above64X64TxformMask[(int)txSizeY]) << shiftY;
    leftY |= (_sizeMask[(int)blockSize] & _left64X64TxformMask[(int)txSizeY]) << shiftY;

    if (buildUv != 0)
    {
        aboveUv |= (ushort)((_sizeMaskUv[(int)blockSize] & _above64X64TxformMaskUv[(int)txSizeUv]) << shiftUv);
        leftUv |= (ushort)((_sizeMaskUv[(int)blockSize] & _left64X64TxformMaskUv[(int)txSizeUv]) << shiftUv);
    }

    // Try to determine what to do with the internal 4x4 block boundaries. These
    // differ from the 4x4 boundaries on the outside edge of an 8x8 in that the
    // internal ones can be skipped and don't depend on the prediction block size.
    if (txSizeY == TxSize.Tx4X4)
    {
        int4X4Y |= _sizeMask[(int)blockSize] << shiftY;
    }

    if (buildUv != 0 && txSizeUv == TxSize.Tx4X4)
    {
        int4X4Uv |= (ushort)((_sizeMaskUv[(int)blockSize] & 0xffff) << shiftUv);
    }
}

// Post-processes an accumulated mask in place:
//  1. folds the Tx32X32 bits into the Tx16X16 entries;
//  2. moves Tx4X4 bits that sit on the border constants to the Tx8X8 entries;
//  3. clears bits that fall outside cm.MiRows / cm.MiCols, demoting the
//     16x16 chroma bits on the last row/column to 8x8;
//  4. clears the left-most column when miCol == 0.
// The trailing asserts check that no position ends up in two masks at once.
private static void AdjustMask(ref Vp9Common cm, int miRow, int miCol, ref LoopFilterMask lfm)
{
    // These locals hide the class-level constants of the same names and carry
    // the same values.
    const ulong LeftBorder = 0x1111111111111111UL;
    const ulong AboveBorder = 0x000000ff000000ffUL;
    const ushort LeftBorderUv = 0x1111;
    const ushort AboveBorderUv = 0x000f;

    Span leftYSpan = lfm.LeftY.AsSpan();
    Span aboveYSpan = lfm.AboveY.AsSpan();
    Span leftUvSpan = lfm.LeftUv.AsSpan();
    Span aboveUvSpan = lfm.AboveUv.AsSpan();

    // The largest loopfilter we have is 16x16 so we use the 16x16 mask
    // for 32x32 transforms also.
    leftYSpan[(int)TxSize.Tx16X16] |= leftYSpan[(int)TxSize.Tx32X32];
    aboveYSpan[(int)TxSize.Tx16X16] |= aboveYSpan[(int)TxSize.Tx32X32];
    leftUvSpan[(int)TxSize.Tx16X16] |= leftUvSpan[(int)TxSize.Tx32X32];
    aboveUvSpan[(int)TxSize.Tx16X16] |= aboveUvSpan[(int)TxSize.Tx32X32];

    // We do at least 8 tap filter on every 32x32 even if the transform size
    // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
    // remove it from the 4x4.
    leftYSpan[(int)TxSize.Tx8X8] |= leftYSpan[(int)TxSize.Tx4X4] & LeftBorder;
    leftYSpan[(int)TxSize.Tx4X4] &= ~LeftBorder;
    aboveYSpan[(int)TxSize.Tx8X8] |= aboveYSpan[(int)TxSize.Tx4X4] & AboveBorder;
    aboveYSpan[(int)TxSize.Tx4X4] &= ~AboveBorder;
    leftUvSpan[(int)TxSize.Tx8X8] |= (ushort)(leftUvSpan[(int)TxSize.Tx4X4] & LeftBorderUv);
    leftUvSpan[(int)TxSize.Tx4X4] &= unchecked((ushort)~LeftBorderUv);
    aboveUvSpan[(int)TxSize.Tx8X8] |= (ushort)(aboveUvSpan[(int)TxSize.Tx4X4] & AboveBorderUv);
    aboveUvSpan[(int)TxSize.Tx4X4] &= unchecked((ushort)~AboveBorderUv);

    // We do some special edge handling.
    if (miRow + Constants.MiBlockSize > cm.MiRows)
    {
        int rows = cm.MiRows - miRow;

        // Each pixel inside the border gets a 1,
        ulong maskY = (1UL << (rows << 3)) - 1;
        ushort maskUv = (ushort)((1 << (((rows + 1) >> 1) << 2)) - 1);

        // Remove values completely outside our border.
        // The loop stops before Tx32X32 because those bits were merged above.
        for (int i = 0; i < (int)TxSize.Tx32X32; i++)
        {
            leftYSpan[i] &= maskY;
            aboveYSpan[i] &= maskY;
            leftUvSpan[i] &= maskUv;
            aboveUvSpan[i] &= maskUv;
        }

        lfm.Int4X4Y &= maskY;
        lfm.Int4X4Uv &= maskUv;

        // We don't apply a wide loop filter on the last uv block row. If set
        // apply the shorter one instead.
        if (rows == 1)
        {
            aboveUvSpan[(int)TxSize.Tx8X8] |= aboveUvSpan[(int)TxSize.Tx16X16];
            aboveUvSpan[(int)TxSize.Tx16X16] = 0;
        }

        if (rows == 5)
        {
            aboveUvSpan[(int)TxSize.Tx8X8] |= (ushort)(aboveUvSpan[(int)TxSize.Tx16X16] & 0xff00);
            aboveUvSpan[(int)TxSize.Tx16X16] &= (ushort)~(aboveUvSpan[(int)TxSize.Tx16X16] & 0xff00);
        }
    }

    if (miCol + Constants.MiBlockSize > cm.MiCols)
    {
        int columns = cm.MiCols - miCol;

        // Each pixel inside the border gets a 1, the multiply copies the border
        // to where we need it.
        ulong maskY = ((1UL << columns) - 1) * 0x0101010101010101UL;
        ushort maskUv = (ushort)(((1 << ((columns + 1) >> 1)) - 1) * 0x1111);

        // Internal edges are not applied on the last column of the image so
        // we mask 1 more for the internal edges
        ushort maskUvInt = (ushort)(((1 << (columns >> 1)) - 1) * 0x1111);

        // Remove the bits outside the image edge.
        for (int i = 0; i < (int)TxSize.Tx32X32; i++)
        {
            leftYSpan[i] &= maskY;
            aboveYSpan[i] &= maskY;
            leftUvSpan[i] &= maskUv;
            aboveUvSpan[i] &= maskUv;
        }

        lfm.Int4X4Y &= maskY;
        lfm.Int4X4Uv &= maskUvInt;

        // We don't apply a wide loop filter on the last uv column. If set
        // apply the shorter one instead.
        if (columns == 1)
        {
            leftUvSpan[(int)TxSize.Tx8X8] |= leftUvSpan[(int)TxSize.Tx16X16];
            leftUvSpan[(int)TxSize.Tx16X16] = 0;
        }

        if (columns == 5)
        {
            leftUvSpan[(int)TxSize.Tx8X8] |= (ushort)(leftUvSpan[(int)TxSize.Tx16X16] & 0xcccc);
            leftUvSpan[(int)TxSize.Tx16X16] &= (ushort)~(leftUvSpan[(int)TxSize.Tx16X16] & 0xcccc);
        }
    }

    // We don't apply a loop filter on the first column in the image, mask that
    // out.
    if (miCol == 0)
    {
        for (int i = 0; i < (int)TxSize.Tx32X32; i++)
        {
            leftYSpan[i] &= 0xfefefefefefefefeUL;
            leftUvSpan[i] &= 0xeeee;
        }
    }

    // Assert if we try to apply 2 different loop filters at the same position.
    Debug.Assert((leftYSpan[(int)TxSize.Tx16X16] & leftYSpan[(int)TxSize.Tx8X8]) == 0);
    Debug.Assert((leftYSpan[(int)TxSize.Tx16X16] & leftYSpan[(int)TxSize.Tx4X4]) == 0);
    Debug.Assert((leftYSpan[(int)TxSize.Tx8X8] & leftYSpan[(int)TxSize.Tx4X4]) == 0);
    Debug.Assert((lfm.Int4X4Y & leftYSpan[(int)TxSize.Tx16X16]) == 0);
    Debug.Assert((leftUvSpan[(int)TxSize.Tx16X16] & leftUvSpan[(int)TxSize.Tx8X8]) == 0);
    Debug.Assert((leftUvSpan[(int)TxSize.Tx16X16] & leftUvSpan[(int)TxSize.Tx4X4]) == 0);
    Debug.Assert((leftUvSpan[(int)TxSize.Tx8X8] & leftUvSpan[(int)TxSize.Tx4X4]) == 0);
    Debug.Assert((lfm.Int4X4Uv & leftUvSpan[(int)TxSize.Tx16X16]) == 0);
    Debug.Assert((aboveYSpan[(int)TxSize.Tx16X16] & aboveYSpan[(int)TxSize.Tx8X8]) == 0);
    Debug.Assert((aboveYSpan[(int)TxSize.Tx16X16] & aboveYSpan[(int)TxSize.Tx4X4]) == 0);
    Debug.Assert((aboveYSpan[(int)TxSize.Tx8X8] & aboveYSpan[(int)TxSize.Tx4X4]) == 0);
    Debug.Assert((lfm.Int4X4Y & aboveYSpan[(int)TxSize.Tx16X16]) == 0);
    Debug.Assert((aboveUvSpan[(int)TxSize.Tx16X16] & aboveUvSpan[(int)TxSize.Tx8X8]) == 0);
    Debug.Assert((aboveUvSpan[(int)TxSize.Tx16X16] & aboveUvSpan[(int)TxSize.Tx4X4]) == 0);
    Debug.Assert((aboveUvSpan[(int)TxSize.Tx8X8] & aboveUvSpan[(int)TxSize.Tx4X4]) == 0);
    Debug.Assert((lfm.Int4X4Uv & aboveUvSpan[(int)TxSize.Tx16X16]) == 0);
}

// When cm.Lf.FilterLevel is non-zero, overwrites the start of cm.Lf.Lfm with
// default-constructed LoopFilterMask values. The element count is
// ((MiRows + MiBlockSize - 1) >> 3) * LfmStride.
public static unsafe void ResetLfm(ref Vp9Common cm)
{
    if (cm.Lf.FilterLevel != 0)
    {
        MemoryUtil.Fill(cm.Lf.Lfm.ToPointer(), new LoopFilterMask(),
            ((cm.MiRows + (Constants.MiBlockSize - 1)) >> 3) * cm.Lf.LfmStride);
    }
}

// Fills the Lim and Mblim arrays of lfi.Lfthr for every level in
// 0..MaxLoopFilter from the given sharpness level.
private static void UpdateSharpness(ref LoopFilterInfoN lfi, int sharpnessLvl)
{
    int lvl;

    Span lFThrSpan = lfi.Lfthr.AsSpan();

    // For each possible value for the loop filter fill out limits
    for (lvl = 0; lvl <= MaxLoopFilter; lvl++)
    {
        // Set loop filter parameters that control sharpness.
        // The level is shifted right by 0, 1 or 2 (sharpness 0, 1..4, above 4).
        int blockInsideLimit = lvl >> ((sharpnessLvl > 0 ? 1 : 0) + (sharpnessLvl > 4 ? 1 : 0));

        if (sharpnessLvl > 0)
        {
            if (blockInsideLimit > 9 - sharpnessLvl)
            {
                blockInsideLimit = 9 - sharpnessLvl;
            }
        }

        // The inside limit is never allowed below 1.
        if (blockInsideLimit < 1)
        {
            blockInsideLimit = 1;
        }

        lFThrSpan[lvl].Lim.AsSpan().Fill((byte)blockInsideLimit);
        lFThrSpan[lvl].Mblim.AsSpan().Fill((byte)((2 * (lvl + 2)) + blockInsideLimit));
    }
}

// Computes the per-segment / per-reference / per-mode filter levels in
// cm.LfInfo.Lvl for a frame whose base level is defaultFiltLvl, refreshing
// the sharpness-dependent limits first when the sharpness level changed.
public static void LoopFilterFrameInit(ref Vp9Common cm, int defaultFiltLvl)
{
    int segId;

    // scale is the multiplier for the loop filter deltas:
    // the multiplier is 1 for when the filter level is between 0 and 31;
    // 2 when the filter level is between 32 and 63
    int scale = 1 << (defaultFiltLvl >> 5);
    ref LoopFilterInfoN lfi = ref cm.LfInfo;
    ref Types.LoopFilter lf = ref cm.Lf;
    ref Segmentation seg = ref cm.Seg;

    // Update limits if sharpness has changed
    if (lf.LastSharpnessLevel != lf.SharpnessLevel)
    {
        UpdateSharpness(ref lfi, lf.SharpnessLevel);
        lf.LastSharpnessLevel = lf.SharpnessLevel;
    }

    // NOTE(review): the generic type arguments on the Span declarations and on
    // the MemoryMarshal.Cast call below look truncated in this copy of the
    // file — confirm against the upstream source.
    Span>> lvlSpan = lfi.Lvl.AsSpan();
    Span refDeltasSpan = lf.RefDeltas.AsSpan();
    Span modeDeltasSpan = lf.ModeDeltas.AsSpan();

    sbyte intraFrameRefDelta = refDeltasSpan[Constants.IntraFrame];

    for (segId = 0; segId < Constants.MaxSegments; segId++)
    {
        int lvlSeg = defaultFiltLvl;

        // A segment with the AltLf feature either replaces the level (absolute
        // mode) or offsets it; the result is clamped to 0..MaxLoopFilter.
        if (seg.IsSegFeatureActive(segId, SegLvlFeatures.AltLf) != 0)
        {
            int data = seg.GetSegData(segId, SegLvlFeatures.AltLf);
            lvlSeg = Math.Clamp(seg.AbsDelta == Constants.SegmentAbsData ? data : defaultFiltLvl + data, 0,
                MaxLoopFilter);
        }

        if (!lf.ModeRefDeltaEnabled)
        {
            // We could get rid of this if we assume that deltas are set to
            // zero when not in use; encoder always uses deltas
            MemoryMarshal.Cast, byte>(lvlSpan[segId].AsSpan()).Fill((byte)lvlSeg);
        }
        else
        {
            int refr, mode;

            // The intra frame entry only gets the reference delta.
            int intraLvl = lvlSeg + (intraFrameRefDelta * scale);
            lvlSpan[segId][Constants.IntraFrame][0] = (byte)Math.Clamp(intraLvl, 0, MaxLoopFilter);

            Span> lvlSpan2 = lvlSpan[segId].AsSpan();

            // Inter references get both the reference and the mode delta.
            for (refr = Constants.LastFrame; refr < Constants.MaxRefFrames; ++refr)
            {
                Span lvlSpan3 = lvlSpan2[refr].AsSpan();

                for (mode = 0; mode < MaxModeLfDeltas; ++mode)
                {
                    int interLvl = lvlSeg + (refDeltasSpan[refr] * scale) + (modeDeltasSpan[mode] * scale);
                    lvlSpan3[mode] = (byte)Math.Clamp(interLvl, 0, MaxLoopFilter);
                }
            }
        }
    }
}

// Applies vertical-edge filters to two rows of 8x8 blocks at once. Bit i of
// each mask selects column i of the first row and bit (i + lflForward) the
// same column of the second row, which starts 8 * pitch further into s.
// lflForward is 4 when subsamplingFactor is non-zero and 8 otherwise; the
// combined mask is cut to 8 or 16 bits accordingly. When both rows need the
// same filter a "Dual" routine is called, otherwise the single-row routine
// runs on whichever row has its bit set.
private static void FilterSelectivelyVertRow2(
    int subsamplingFactor,
    ArrayPtr s,
    int pitch,
    uint mask16X16,
    uint mask8X8,
    uint mask4X4,
    uint mask4X4Int,
    ReadOnlySpan lfthr,
    ReadOnlySpan lfl)
{
    uint dualMaskCutoff = subsamplingFactor != 0 ? 0xffu : 0xffffu;
    int lflForward = subsamplingFactor != 0 ? 4 : 8;
    uint dualOne = 1u | (1u << lflForward);
    Span> ss = stackalloc ArrayPtr[2];
    Span lfis = stackalloc LoopFilterThresh[2];
    ss[0] = s;

    for (uint mask = (mask16X16 | mask8X8 | mask4X4 | mask4X4Int) & dualMaskCutoff;
         mask != 0;
         mask = (mask & ~dualOne) >> 1)
    {
        if ((mask & dualOne) != 0)
        {
            // Thresholds and sample pointers for the first and second row.
            lfis[0] = lfthr[lfl[0]];
            lfis[1] = lfthr[lfl[lflForward]];
            ss[1] = ss[0].Slice(8 * pitch);

            if ((mask16X16 & dualOne) != 0)
            {
                if ((mask16X16 & dualOne) == dualOne)
                {
                    // The 16-wide dual routine takes only the first row's thresholds.
                    LoopFilterAuto.LpfVertical16Dual(ss[0], pitch, lfis[0].Mblim.AsSpan(), lfis[0].Lim.AsSpan(),
                        lfis[0].HevThr.AsSpan());
                }
                else
                {
                    ref LoopFilterThresh lfi = ref lfis[(mask16X16 & 1) == 0 ? 1 : 0];
                    LoopFilterAuto.LpfVertical16(ss[(mask16X16 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim.AsSpan(),
                        lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
                }
            }

            if ((mask8X8 & dualOne) != 0)
            {
                if ((mask8X8 & dualOne) == dualOne)
                {
                    LoopFilterAuto.LpfVertical8Dual(
                        ss[0],
                        pitch,
                        lfis[0].Mblim.AsSpan(),
                        lfis[0].Lim.AsSpan(),
                        lfis[0].HevThr.AsSpan(),
                        lfis[1].Mblim.AsSpan(),
                        lfis[1].Lim.AsSpan(),
                        lfis[1].HevThr.AsSpan());
                }
                else
                {
                    ref LoopFilterThresh lfi = ref lfis[(mask8X8 & 1) == 0 ? 1 : 0];
                    LoopFilterAuto.LpfVertical8(
                        ss[(mask8X8 & 1) == 0 ? 1 : 0],
                        pitch,
                        lfi.Mblim.AsSpan(),
                        lfi.Lim.AsSpan(),
                        lfi.HevThr.AsSpan());
                }
            }

            if ((mask4X4 & dualOne) != 0)
            {
                if ((mask4X4 & dualOne) == dualOne)
                {
                    LoopFilterAuto.LpfVertical4Dual(
                        ss[0],
                        pitch,
                        lfis[0].Mblim.AsSpan(),
                        lfis[0].Lim.AsSpan(),
                        lfis[0].HevThr.AsSpan(),
                        lfis[1].Mblim.AsSpan(),
                        lfis[1].Lim.AsSpan(),
                        lfis[1].HevThr.AsSpan());
                }
                else
                {
                    ref LoopFilterThresh lfi = ref lfis[(mask4X4 & 1) == 0 ? 1 : 0];
                    LoopFilterAuto.LpfVertical4(ss[(mask4X4 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim.AsSpan(),
                        lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
                }
            }

            // Internal 4x4 edge: 4 samples to the right of the block edge.
            if ((mask4X4Int & dualOne) != 0)
            {
                if ((mask4X4Int & dualOne) == dualOne)
                {
                    LoopFilterAuto.LpfVertical4Dual(
                        ss[0].Slice(4),
                        pitch,
                        lfis[0].Mblim.AsSpan(),
                        lfis[0].Lim.AsSpan(),
                        lfis[0].HevThr.AsSpan(),
                        lfis[1].Mblim.AsSpan(),
                        lfis[1].Lim.AsSpan(),
                        lfis[1].HevThr.AsSpan());
                }
                else
                {
                    ref LoopFilterThresh lfi = ref lfis[(mask4X4Int & 1) == 0 ? 1 : 0];
                    LoopFilterAuto.LpfVertical4(ss[(mask4X4Int & 1) == 0 ? 1 : 0].Slice(4), pitch,
                        lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
                }
            }
        }

        // Advance to the next 8-sample column.
        ss[0] = ss[0].Slice(8);
        lfl = lfl[1..];
        mask16X16 >>= 1;
        mask8X8 >>= 1;
        mask4X4 >>= 1;
        mask4X4Int >>= 1;
    }
}

// Same traversal as FilterSelectivelyVertRow2, but calling the
// LoopFilterScalar.HighBd* routines, which take the first element of each
// threshold array plus the bd argument.
private static void HighbdFilterSelectivelyVertRow2(
    int subsamplingFactor,
    ArrayPtr s,
    int pitch,
    uint mask16X16,
    uint mask8X8,
    uint mask4X4,
    uint mask4X4Int,
    ReadOnlySpan lfthr,
    ReadOnlySpan lfl,
    int bd)
{
    uint dualMaskCutoff = subsamplingFactor != 0 ? 0xffu : 0xffffu;
    int lflForward = subsamplingFactor != 0 ? 4 : 8;
    uint dualOne = 1u | (1u << lflForward);
    Span> ss = stackalloc ArrayPtr[2];
    Span lfis = stackalloc LoopFilterThresh[2];
    ss[0] = s;

    for (uint mask = (mask16X16 | mask8X8 | mask4X4 | mask4X4Int) & dualMaskCutoff;
         mask != 0;
         mask = (mask & ~dualOne) >> 1)
    {
        if ((mask & dualOne) != 0)
        {
            lfis[0] = lfthr[lfl[0]];
            lfis[1] = lfthr[lfl[lflForward]];
            ss[1] = ss[0].Slice(8 * pitch);

            // Threshold views for the first (0) and second (1) row.
            Span mblim0Span = lfis[0].Mblim.AsSpan();
            Span lim0Span = lfis[0].Lim.AsSpan();
            Span hevThr0Span = lfis[0].HevThr.AsSpan();
            Span mblim1Span = lfis[1].Mblim.AsSpan();
            Span lim1Span = lfis[1].Lim.AsSpan();
            Span hevThr1Span = lfis[1].HevThr.AsSpan();

            if ((mask16X16 & dualOne) != 0)
            {
                if ((mask16X16 & dualOne) == dualOne)
                {
                    LoopFilterScalar.HighBdLpfVertical16Dual(ss[0], pitch, mblim0Span[0], lim0Span[0],
                        hevThr0Span[0], bd);
                }
                else
                {
                    if ((mask16X16 & 1) == 0)
                    {
                        LoopFilterScalar.HighBdLpfVertical16(ss[1], pitch, mblim1Span[0], lim1Span[0],
                            hevThr1Span[0], bd);
                    }
                    else
                    {
                        LoopFilterScalar.HighBdLpfVertical16(ss[0], pitch, mblim0Span[0], lim0Span[0],
                            hevThr0Span[0], bd);
                    }
                }
            }

            if ((mask8X8 & dualOne) != 0)
            {
                if ((mask8X8 & dualOne) == dualOne)
                {
                    LoopFilterScalar.HighBdLpfVertical8Dual(
                        ss[0],
                        pitch,
                        mblim0Span[0],
                        lim0Span[0],
                        hevThr0Span[0],
                        mblim1Span[0],
                        lim1Span[0],
                        hevThr1Span[0],
                        bd);
                }
                else
                {
                    if ((mask8X8 & 1) == 0)
                    {
                        LoopFilterScalar.HighBdLpfVertical8(ss[1], pitch, mblim1Span[0], lim1Span[0],
                            hevThr1Span[0], bd);
                    }
                    else
                    {
                        LoopFilterScalar.HighBdLpfVertical8(ss[0], pitch, mblim0Span[0], lim0Span[0],
                            hevThr0Span[0], bd);
                    }
                }
            }

            if ((mask4X4 & dualOne) != 0)
            {
                if ((mask4X4 & dualOne) == dualOne)
                {
                    LoopFilterScalar.HighBdLpfVertical4Dual(
                        ss[0],
                        pitch,
                        mblim0Span[0],
                        lim0Span[0],
                        hevThr0Span[0],
                        mblim1Span[0],
                        lim1Span[0],
                        hevThr1Span[0],
                        bd);
                }
                else
                {
                    if ((mask4X4 & 1) == 0)
                    {
                        LoopFilterScalar.HighBdLpfVertical4(ss[1], pitch, mblim1Span[0], lim1Span[0],
                            hevThr1Span[0], bd);
                    }
                    else
                    {
                        LoopFilterScalar.HighBdLpfVertical4(ss[0], pitch, mblim0Span[0], lim0Span[0],
                            hevThr0Span[0], bd);
                    }
                }
            }

            // Internal 4x4 edge: 4 samples to the right of the block edge.
            if ((mask4X4Int & dualOne) != 0)
            {
                if ((mask4X4Int & dualOne) == dualOne)
                {
                    LoopFilterScalar.HighBdLpfVertical4Dual(
                        ss[0].Slice(4),
                        pitch,
                        mblim0Span[0],
                        lim0Span[0],
                        hevThr0Span[0],
                        mblim1Span[0],
                        lim1Span[0],
                        hevThr1Span[0],
                        bd);
                }
                else
                {
                    if ((mask4X4Int & 1) == 0)
                    {
                        LoopFilterScalar.HighBdLpfVertical4(ss[1].Slice(4), pitch, mblim1Span[0], lim1Span[0],
                            hevThr1Span[0], bd);
                    }
                    else
                    {
                        LoopFilterScalar.HighBdLpfVertical4(ss[0].Slice(4), pitch, mblim0Span[0], lim0Span[0],
                            hevThr0Span[0], bd);
                    }
                }
            }
        }

        ss[0] = ss[0].Slice(8);
        lfl = lfl[1..];
        mask16X16 >>= 1;
        mask8X8 >>= 1;
        mask4X4 >>= 1;
        mask4X4Int >>= 1;
    }
}

// Applies horizontal-edge filters along one row of 8x8 blocks. Bit i of each
// mask selects column i. The widest filter whose bit is set wins
// (16x16, then 8x8, then 4x4); the internal 4x4 edge, 4 rows down, is
// filtered alongside the 8x8 and 4x4 cases, or on its own when it is the
// only bit set. When two neighbouring columns need the same filter a "Dual"
// routine handles both and the loop advances by two columns.
private static void FilterSelectivelyHoriz(
    ArrayPtr s,
    int pitch,
    uint mask16X16,
    uint mask8X8,
    uint mask4X4,
    uint mask4X4Int,
    ReadOnlySpan lfthr,
    ReadOnlySpan lfl)
{
    int count;

    for (uint mask = mask16X16 | mask8X8 | mask4X4 | mask4X4Int; mask != 0; mask >>= count)
    {
        // Number of columns consumed by this iteration (1, or 2 after a dual call).
        count = 1;
        if ((mask & 1) != 0)
        {
            LoopFilterThresh lfi = lfthr[lfl[0]];

            if ((mask16X16 & 1) != 0)
            {
                if ((mask16X16 & 3) == 3)
                {
                    LoopFilterAuto.LpfHorizontal16Dual(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
                        lfi.HevThr.AsSpan());
                    count = 2;
                }
                else
                {
                    LoopFilterAuto.LpfHorizontal16(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
                        lfi.HevThr.AsSpan());
                }
            }
            else if ((mask8X8 & 1) != 0)
            {
                if ((mask8X8 & 3) == 3)
                {
                    // Next block's thresholds.
                    LoopFilterThresh lfin = lfthr[lfl[1]];

                    LoopFilterAuto.LpfHorizontal8Dual(
                        s,
                        pitch,
                        lfi.Mblim.AsSpan(),
                        lfi.Lim.AsSpan(),
                        lfi.HevThr.AsSpan(),
                        lfin.Mblim.AsSpan(),
                        lfin.Lim.AsSpan(),
                        lfin.HevThr.AsSpan());

                    if ((mask4X4Int & 3) == 3)
                    {
                        LoopFilterAuto.LpfHorizontal4Dual(
                            s.Slice(4 * pitch),
                            pitch,
                            lfi.Mblim.AsSpan(),
                            lfi.Lim.AsSpan(),
                            lfi.HevThr.AsSpan(),
                            lfin.Mblim.AsSpan(),
                            lfin.Lim.AsSpan(),
                            lfin.HevThr.AsSpan());
                    }
                    else if ((mask4X4Int & 1) != 0)
                    {
                        LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(),
                            lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
                    }
                    else if ((mask4X4Int & 2) != 0)
                    {
                        // Only the second column has an internal edge.
                        LoopFilterAuto.LpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim.AsSpan(),
                            lfin.Lim.AsSpan(), lfin.HevThr.AsSpan());
                    }

                    count = 2;
                }
                else
                {
                    LoopFilterAuto.LpfHorizontal8(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
                        lfi.HevThr.AsSpan());

                    if ((mask4X4Int & 1) != 0)
                    {
                        LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(),
                            lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
                    }
                }
            }
            else if ((mask4X4 & 1) != 0)
            {
                if ((mask4X4 & 3) == 3)
                {
                    // Next block's thresholds.
                    LoopFilterThresh lfin = lfthr[lfl[1]];

                    LoopFilterAuto.LpfHorizontal4Dual(
                        s,
                        pitch,
                        lfi.Mblim.AsSpan(),
                        lfi.Lim.AsSpan(),
                        lfi.HevThr.AsSpan(),
                        lfin.Mblim.AsSpan(),
                        lfin.Lim.AsSpan(),
                        lfin.HevThr.AsSpan());

                    if ((mask4X4Int & 3) == 3)
                    {
                        LoopFilterAuto.LpfHorizontal4Dual(
                            s.Slice(4 * pitch),
                            pitch,
                            lfi.Mblim.AsSpan(),
                            lfi.Lim.AsSpan(),
                            lfi.HevThr.AsSpan(),
                            lfin.Mblim.AsSpan(),
                            lfin.Lim.AsSpan(),
                            lfin.HevThr.AsSpan());
                    }
                    else if ((mask4X4Int & 1) != 0)
                    {
                        LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(),
                            lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
                    }
                    else if ((mask4X4Int & 2) != 0)
                    {
                        // Only the second column has an internal edge.
                        LoopFilterAuto.LpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim.AsSpan(),
                            lfin.Lim.AsSpan(), lfin.HevThr.AsSpan());
                    }

                    count = 2;
                }
                else
                {
                    LoopFilterAuto.LpfHorizontal4(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
                        lfi.HevThr.AsSpan());

                    if ((mask4X4Int & 1) != 0)
                    {
                        LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(),
                            lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
                    }
                }
            }
            else
            {
                // Only the internal 4x4 bit is set for this column.
                LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
                    lfi.HevThr.AsSpan());
            }
        }

        s = s.Slice(8 * count);
        lfl = lfl[count..];
        mask16X16 >>= count;
        mask8X8 >>= count;
        mask4X4 >>= count;
        mask4X4Int >>= count;
    }
}

// Same traversal as FilterSelectivelyHoriz, but calling the
// LoopFilterScalar.HighBd* routines, which take the first element of each
// threshold array plus the bd argument.
private static void HighbdFilterSelectivelyHoriz(
    ArrayPtr s,
    int pitch,
    uint mask16X16,
    uint mask8X8,
    uint mask4X4,
    uint mask4X4Int,
    ReadOnlySpan lfthr,
    ReadOnlySpan lfl,
    int bd)
{
    int count;

    for (uint mask = mask16X16 | mask8X8 | mask4X4 | mask4X4Int; mask != 0; mask >>= count)
    {
        // Number of columns consumed by this iteration (1, or 2 after a dual call).
        count = 1;
        if ((mask & 1) != 0)
        {
            LoopFilterThresh lfi = lfthr[lfl[0]];

            Span mblimSpan = lfi.Mblim.AsSpan();
            Span limSpan = lfi.Lim.AsSpan();
            Span hevThrSpan = lfi.HevThr.AsSpan();

            if ((mask16X16 & 1) != 0)
            {
                if ((mask16X16 & 3) == 3)
                {
                    LoopFilterScalar.HighBdLpfHorizontal16Dual(s, pitch, mblimSpan[0], limSpan[0], hevThrSpan[0],
                        bd);
                    count = 2;
                }
                else
                {
                    LoopFilterScalar.HighBdLpfHorizontal16(s, pitch, mblimSpan[0], limSpan[0], hevThrSpan[0], bd);
                }
            }
            else if ((mask8X8 & 1) != 0)
            {
                if ((mask8X8 & 3) == 3)
                {
                    // Next block's thresholds.
                    LoopFilterThresh lfin = lfthr[lfl[1]];

                    Span nMblimSpan = lfin.Mblim.AsSpan();
                    Span nLimSpan = lfin.Lim.AsSpan();
                    Span nHevThrSpan = lfin.HevThr.AsSpan();

                    LoopFilterScalar.HighBdLpfHorizontal8Dual(
                        s,
                        pitch,
                        mblimSpan[0],
                        limSpan[0],
                        hevThrSpan[0],
                        nMblimSpan[0],
                        nLimSpan[0],
                        nHevThrSpan[0],
                        bd);

                    if ((mask4X4Int & 3) == 3)
                    {
                        LoopFilterScalar.HighBdLpfHorizontal4Dual(
                            s.Slice(4 * pitch),
                            pitch,
                            mblimSpan[0],
                            limSpan[0],
                            hevThrSpan[0],
                            nMblimSpan[0],
                            nLimSpan[0],
                            nHevThrSpan[0],
                            bd);
                    }
                    else if ((mask4X4Int & 1) != 0)
                    {
                        LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, mblimSpan[0], limSpan[0],
                            hevThrSpan[0], bd);
                    }
                    else if ((mask4X4Int & 2) != 0)
                    {
                        // Only the second column has an internal edge.
                        LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, nMblimSpan[0],
                            nLimSpan[0], nHevThrSpan[0], bd);
                    }

                    count = 2;
                }
                else
                {
                    LoopFilterScalar.HighBdLpfHorizontal8(s, pitch, mblimSpan[0], limSpan[0], hevThrSpan[0], bd);

                    if ((mask4X4Int & 1) != 0)
                    {
                        LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, mblimSpan[0], limSpan[0],
                            hevThrSpan[0], bd);
                    }
                }
            }
            else if ((mask4X4 & 1) != 0)
            {
                if ((mask4X4 & 3) == 3)
                {
                    // Next block's thresholds.
                    LoopFilterThresh lfin = lfthr[lfl[1]];

                    Span nMblimSpan = lfin.Mblim.AsSpan();
                    Span nLimSpan = lfin.Lim.AsSpan();
                    Span nHevThrSpan = lfin.HevThr.AsSpan();

                    LoopFilterScalar.HighBdLpfHorizontal4Dual(
                        s,
                        pitch,
                        mblimSpan[0],
                        limSpan[0],
                        hevThrSpan[0],
                        nMblimSpan[0],
                        nLimSpan[0],
                        nHevThrSpan[0],
                        bd);

                    if ((mask4X4Int & 3) == 3)
                    {
                        LoopFilterScalar.HighBdLpfHorizontal4Dual(
                            s.Slice(4 * pitch),
                            pitch,
                            mblimSpan[0],
                            limSpan[0],
                            hevThrSpan[0],
                            nMblimSpan[0],
                            nLimSpan[0],
                            nHevThrSpan[0],
                            bd);
                    }
                    else if ((mask4X4Int & 1) != 0)
                    {
                        LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, mblimSpan[0], limSpan[0],
                            hevThrSpan[0], bd);
                    }
                    else if ((mask4X4Int & 2) != 0)
                    {
                        // Only the second column has an internal edge.
                        LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, nMblimSpan[0],
                            nLimSpan[0], nHevThrSpan[0], bd);
                    }

                    count = 2;
                }
                else
                {
                    LoopFilterScalar.HighBdLpfHorizontal4(s, pitch, mblimSpan[0], limSpan[0], hevThrSpan[0], bd);

                    if ((mask4X4Int & 1) != 0)
                    {
                        LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, mblimSpan[0], limSpan[0],
                            hevThrSpan[0], bd);
                    }
                }
            }
            else
            {
                // Only the internal 4x4 bit is set for this column.
                LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, mblimSpan[0], limSpan[0],
                    hevThrSpan[0], bd);
            }
        }

        s = s.Slice(8 * count);
        lfl = lfl[count..];
        mask16X16 >>= count;
        mask8X8 >>= count;
        mask4X4 >>= count;
        mask4X4Int >>= count;
    }
}

// Applies vertical-edge filters along one row of 8x8 blocks, one column per
// iteration. The widest filter whose bit is set is applied at the block edge
// (16x16, then 8x8, then 4x4); the internal 4x4 edge, 4 samples to the right,
// is filtered independently of the others.
private static void FilterSelectivelyVert(
    ArrayPtr s,
    int pitch,
    uint mask16X16,
    uint mask8X8,
    uint mask4X4,
    uint mask4X4Int,
    ReadOnlySpan lfthr,
    ReadOnlySpan lfl)
{
    for (uint mask = mask16X16 | mask8X8 | mask4X4 | mask4X4Int; mask != 0; mask >>= 1)
    {
        LoopFilterThresh lfi = lfthr[lfl[0]];

        if ((mask & 1) != 0)
        {
            if ((mask16X16 & 1) != 0)
            {
                LoopFilterAuto.LpfVertical16(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
            }
            else if ((mask8X8 & 1) != 0)
            {
                LoopFilterAuto.LpfVertical8(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
            }
            else if ((mask4X4 & 1) != 0)
            {
                LoopFilterAuto.LpfVertical4(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
            }
        }

        if ((mask4X4Int & 1) != 0)
        {
            LoopFilterAuto.LpfVertical4(s.Slice(4), pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
                lfi.HevThr.AsSpan());
        }

        s = s.Slice(8);
        lfl = lfl[1..];
        mask16X16 >>= 1;
        mask8X8 >>= 1;
        mask4X4 >>= 1;
        mask4X4Int >>= 1;
    }
}

// Same traversal as FilterSelectivelyVert, but calling the
// LoopFilterScalar.HighBd* routines, which take the first element of each
// threshold array plus the bd argument.
private static void HighbdFilterSelectivelyVert(
    ArrayPtr s,
    int pitch,
    uint mask16X16,
    uint mask8X8,
    uint mask4X4,
    uint mask4X4Int,
    ReadOnlySpan lfthr,
    ReadOnlySpan lfl,
    int bd)
{
    for (uint mask = mask16X16 | mask8X8 | mask4X4 | mask4X4Int; mask != 0; mask >>= 1)
    {
        LoopFilterThresh lfi = lfthr[lfl[0]];

        Span mblimSpan = lfi.Mblim.AsSpan();
        Span limSpan = lfi.Lim.AsSpan();
        Span hevThrSpan = lfi.HevThr.AsSpan();

        if ((mask & 1) != 0)
        {
            if ((mask16X16 & 1) != 0)
            {
                LoopFilterScalar.HighBdLpfVertical16(s, pitch, mblimSpan[0], limSpan[0], hevThrSpan[0], bd);
            }
            else if ((mask8X8 & 1) != 0)
            {
                LoopFilterScalar.HighBdLpfVertical8(s, pitch, mblimSpan[0], limSpan[0], hevThrSpan[0], bd);
            }
            else if ((mask4X4 & 1) != 0)
            {
                LoopFilterScalar.HighBdLpfVertical4(s, pitch, mblimSpan[0], limSpan[0], hevThrSpan[0], bd);
            }
        }

        if ((mask4X4Int & 1) != 0)
        {
            LoopFilterScalar.HighBdLpfVertical4(s.Slice(4), pitch, mblimSpan[0], limSpan[0], hevThrSpan[0], bd);
        }

        s = s.Slice(8);
        lfl = lfl[1..];
        mask16X16 >>= 1;
        mask8X8 >>= 1;
        mask4X4 >>= 1;
        mask4X4Int >>= 1;
    }
}

// Block dimension lookups with 13 entries each.
// NOTE(review): presumably indexed by (int)BlockSize in BLOCK_4x4..BLOCK_64x64
// order, giving widths/heights in 4x4 and 8x8 units — confirm against BlockSize.
private static readonly byte[] _num4X4BlocksWideLookup = [1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16];
private static readonly byte[] _num4X4BlocksHighLookup = [1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16];
private static readonly byte[] _num8X8BlocksWideLookup = [1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8];
private static readonly byte[] _num8X8BlocksHighLookup = [1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8];

private static void FilterBlockPlaneNon420(
    ref Vp9Common cm,
    ref MacroBlockDPlane plane,
    ArrayPtr> mi8X8,
    int miRow,
    int miCol)
{
    int ssX = plane.SubsamplingX;
    int ssY = plane.SubsamplingY;
    int rowStep = 1 << ssY;
    int colStep = 1 << ssX;
    int rowStepStride = cm.MiStride * rowStep;
    ref Buf2D
dst = ref plane.Dst; ArrayPtr dst0 = dst.Buf; Span mask16X16 = stackalloc int[Constants.MiBlockSize]; Span mask8X8 = stackalloc int[Constants.MiBlockSize]; Span mask4X4 = stackalloc int[Constants.MiBlockSize]; Span mask4X4Int = stackalloc int[Constants.MiBlockSize]; Span lfl = stackalloc byte[Constants.MiBlockSize * Constants.MiBlockSize]; for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += rowStep) { uint mask16X16C = 0; uint mask8X8C = 0; uint mask4X4C = 0; uint borderMask; // Determine the vertical edges that need filtering for (int c = 0; c < Constants.MiBlockSize && miCol + c < cm.MiCols; c += colStep) { ref ModeInfo mi = ref mi8X8[c].Value; BlockSize sbType = mi.SbType; bool skipThis = mi.Skip != 0 && mi.IsInterBlock(); // left edge of current unit is block/partition edge -> no skip bool blockEdgeLeft = _num4X4BlocksWideLookup[(int)sbType] <= 1 || (c & (_num8X8BlocksWideLookup[(int)sbType] - 1)) == 0; bool skipThisC = skipThis && !blockEdgeLeft; // top edge of current unit is block/partition edge -> no skip bool blockEdgeAbove = _num4X4BlocksHighLookup[(int)sbType] <= 1 || (r & (_num8X8BlocksHighLookup[(int)sbType] - 1)) == 0; bool skipThisR = skipThis && !blockEdgeAbove; TxSize txSize = mi.GetUvTxSize(ref plane); bool skipBorder4X4C = ssX != 0 && miCol + c == cm.MiCols - 1; bool skipBorder4X4R = ssY != 0 && miRow + r == cm.MiRows - 1; // Filter level can vary per MI if ((lfl[(r << 3) + (c >> ssX)] = GetFilterLevel(ref cm.LfInfo, ref mi)) == 0) { continue; } // Build masks based on the transform size of each block if (txSize == TxSize.Tx32X32) { if (!skipThisC && ((c >> ssX) & 3) == 0) { if (!skipBorder4X4C) { mask16X16C |= 1u << (c >> ssX); } else { mask8X8C |= 1u << (c >> ssX); } } if (!skipThisR && ((r >> ssY) & 3) == 0) { if (!skipBorder4X4R) { mask16X16[r] |= 1 << (c >> ssX); } else { mask8X8[r] |= 1 << (c >> ssX); } } } else if (txSize == TxSize.Tx16X16) { if (!skipThisC && ((c >> ssX) & 1) == 0) { if (!skipBorder4X4C) { mask16X16C 
|= 1u << (c >> ssX); } else { mask8X8C |= 1u << (c >> ssX); } } if (!skipThisR && ((r >> ssY) & 1) == 0) { if (!skipBorder4X4R) { mask16X16[r] |= 1 << (c >> ssX); } else { mask8X8[r] |= 1 << (c >> ssX); } } } else { // force 8x8 filtering on 32x32 boundaries if (!skipThisC) { if (txSize == TxSize.Tx8X8 || ((c >> ssX) & 3) == 0) { mask8X8C |= 1u << (c >> ssX); } else { mask4X4C |= 1u << (c >> ssX); } } if (!skipThisR) { if (txSize == TxSize.Tx8X8 || ((r >> ssY) & 3) == 0) { mask8X8[r] |= 1 << (c >> ssX); } else { mask4X4[r] |= 1 << (c >> ssX); } } if (!skipThis && txSize < TxSize.Tx8X8 && !skipBorder4X4C) { mask4X4Int[r] |= 1 << (c >> ssX); } } } // Disable filtering on the leftmost column borderMask = ~(miCol == 0 ? 1u : 0u); if (cm.UseHighBitDepth) { HighbdFilterSelectivelyVert( ConvertToUshortPtr(dst.Buf), dst.Stride, mask16X16C & borderMask, mask8X8C & borderMask, mask4X4C & borderMask, (uint)mask4X4Int[r], cm.LfInfo.Lfthr.AsSpan(), lfl[(r << 3)..], (int)cm.BitDepth); } else { FilterSelectivelyVert( dst.Buf, dst.Stride, mask16X16C & borderMask, mask8X8C & borderMask, mask4X4C & borderMask, (uint)mask4X4Int[r], cm.LfInfo.Lfthr.AsSpan(), lfl[(r << 3)..]); } dst.Buf = dst.Buf.Slice(8 * dst.Stride); mi8X8 = mi8X8.Slice(rowStepStride); } // Now do horizontal pass dst.Buf = dst0; for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += rowStep) { bool skipBorder4X4R = ssY != 0 && miRow + r == cm.MiRows - 1; uint mask4X4IntR = skipBorder4X4R ? 
0u : (uint)mask4X4Int[r]; uint mask16X16R; uint mask8X8R; uint mask4X4R; if (miRow + r == 0) { mask16X16R = 0; mask8X8R = 0; mask4X4R = 0; } else { mask16X16R = (uint)mask16X16[r]; mask8X8R = (uint)mask8X8[r]; mask4X4R = (uint)mask4X4[r]; } if (cm.UseHighBitDepth) { HighbdFilterSelectivelyHoriz( ConvertToUshortPtr(dst.Buf), dst.Stride, mask16X16R, mask8X8R, mask4X4R, mask4X4IntR, cm.LfInfo.Lfthr.AsSpan(), lfl[(r << 3)..], (int)cm.BitDepth); } else { FilterSelectivelyHoriz( dst.Buf, dst.Stride, mask16X16R, mask8X8R, mask4X4R, mask4X4IntR, cm.LfInfo.Lfthr.AsSpan(), lfl[(r << 3)..]); } dst.Buf = dst.Buf.Slice(8 * dst.Stride); } } private static void FilterBlockPlaneSs00(ref Vp9Common cm, ref MacroBlockDPlane plane, int miRow, ref LoopFilterMask lfm) { ref Buf2D dst = ref plane.Dst; ArrayPtr dst0 = dst.Buf; Span leftYSpan = lfm.LeftY.AsSpan(); ulong mask16X16 = leftYSpan[(int)TxSize.Tx16X16]; ulong mask8X8 = leftYSpan[(int)TxSize.Tx8X8]; ulong mask4X4 = leftYSpan[(int)TxSize.Tx4X4]; ulong mask4X4Int = lfm.Int4X4Y; Debug.Assert(plane.SubsamplingX == 0 && plane.SubsamplingY == 0); // Vertical pass: do 2 rows at one time for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += 2) { if (cm.UseHighBitDepth) { // Disable filtering on the leftmost column. HighbdFilterSelectivelyVertRow2( plane.SubsamplingX, ConvertToUshortPtr(dst.Buf), dst.Stride, (uint)mask16X16, (uint)mask8X8, (uint)mask4X4, (uint)mask4X4Int, cm.LfInfo.Lfthr.AsSpan(), lfm.LflY.AsSpan()[(r << 3)..], (int)cm.BitDepth); } else { // Disable filtering on the leftmost column. 
FilterSelectivelyVertRow2( plane.SubsamplingX, dst.Buf, dst.Stride, (uint)mask16X16, (uint)mask8X8, (uint)mask4X4, (uint)mask4X4Int, cm.LfInfo.Lfthr.AsSpan(), lfm.LflY.AsSpan()[(r << 3)..]); } dst.Buf = dst.Buf.Slice(16 * dst.Stride); mask16X16 >>= 16; mask8X8 >>= 16; mask4X4 >>= 16; mask4X4Int >>= 16; } // Horizontal pass dst.Buf = dst0; Span aboveYSpan = lfm.AboveY.AsSpan(); mask16X16 = aboveYSpan[(int)TxSize.Tx16X16]; mask8X8 = aboveYSpan[(int)TxSize.Tx8X8]; mask4X4 = aboveYSpan[(int)TxSize.Tx4X4]; mask4X4Int = lfm.Int4X4Y; for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r++) { uint mask16X16R; uint mask8X8R; uint mask4X4R; if (miRow + r == 0) { mask16X16R = 0; mask8X8R = 0; mask4X4R = 0; } else { mask16X16R = (uint)mask16X16 & 0xff; mask8X8R = (uint)mask8X8 & 0xff; mask4X4R = (uint)mask4X4 & 0xff; } if (cm.UseHighBitDepth) { HighbdFilterSelectivelyHoriz( ConvertToUshortPtr(dst.Buf), dst.Stride, mask16X16R, mask8X8R, mask4X4R, (uint)mask4X4Int & 0xff, cm.LfInfo.Lfthr.AsSpan(), lfm.LflY.AsSpan()[(r << 3)..], (int)cm.BitDepth); } else { FilterSelectivelyHoriz( dst.Buf, dst.Stride, mask16X16R, mask8X8R, mask4X4R, (uint)mask4X4Int & 0xff, cm.LfInfo.Lfthr.AsSpan(), lfm.LflY.AsSpan()[(r << 3)..]); } dst.Buf = dst.Buf.Slice(8 * dst.Stride); mask16X16 >>= 8; mask8X8 >>= 8; mask4X4 >>= 8; mask4X4Int >>= 8; } } private static void FilterBlockPlaneSs11(ref Vp9Common cm, ref MacroBlockDPlane plane, int miRow, ref LoopFilterMask lfm) { Buf2D dst = plane.Dst; ArrayPtr dst0 = dst.Buf; Span lflUv = stackalloc byte[16]; Span lflY = lfm.LflY.AsSpan(); Span leftUvSpan = lfm.LeftUv.AsSpan(); ushort mask16X16 = leftUvSpan[(int)TxSize.Tx16X16]; ushort mask8X8 = leftUvSpan[(int)TxSize.Tx8X8]; ushort mask4X4 = leftUvSpan[(int)TxSize.Tx4X4]; ushort mask4X4Int = lfm.Int4X4Uv; Debug.Assert(plane.SubsamplingX == 1 && plane.SubsamplingY == 1); // Vertical pass: do 2 rows at one time for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += 4) { for (int c 
= 0; c < Constants.MiBlockSize >> 1; c++) { lflUv[(r << 1) + c] = lflY[(r << 3) + (c << 1)]; lflUv[((r + 2) << 1) + c] = lflY[((r + 2) << 3) + (c << 1)]; } if (cm.UseHighBitDepth) { // Disable filtering on the leftmost column. HighbdFilterSelectivelyVertRow2( plane.SubsamplingX, ConvertToUshortPtr(dst.Buf), dst.Stride, mask16X16, mask8X8, mask4X4, mask4X4Int, cm.LfInfo.Lfthr.AsSpan(), lflUv[(r << 1)..], (int)cm.BitDepth); } else { // Disable filtering on the leftmost column. FilterSelectivelyVertRow2( plane.SubsamplingX, dst.Buf, dst.Stride, mask16X16, mask8X8, mask4X4, mask4X4Int, cm.LfInfo.Lfthr.AsSpan(), lflUv[(r << 1)..]); } dst.Buf = dst.Buf.Slice(16 * dst.Stride); mask16X16 >>= 8; mask8X8 >>= 8; mask4X4 >>= 8; mask4X4Int >>= 8; } // Horizontal pass dst.Buf = dst0; Span aboveUvSpan = lfm.AboveUv.AsSpan(); mask16X16 = aboveUvSpan[(int)TxSize.Tx16X16]; mask8X8 = aboveUvSpan[(int)TxSize.Tx8X8]; mask4X4 = aboveUvSpan[(int)TxSize.Tx4X4]; mask4X4Int = lfm.Int4X4Uv; for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += 2) { bool skipBorder4X4R = miRow + r == cm.MiRows - 1; uint mask4X4IntR = skipBorder4X4R ? 
0u : (uint)mask4X4Int & 0xf; uint mask16X16R; uint mask8X8R; uint mask4X4R; if (miRow + r == 0) { mask16X16R = 0; mask8X8R = 0; mask4X4R = 0; } else { mask16X16R = (uint)mask16X16 & 0xf; mask8X8R = (uint)mask8X8 & 0xf; mask4X4R = (uint)mask4X4 & 0xf; } if (cm.UseHighBitDepth) { HighbdFilterSelectivelyHoriz( ConvertToUshortPtr(dst.Buf), dst.Stride, mask16X16R, mask8X8R, mask4X4R, mask4X4IntR, cm.LfInfo.Lfthr.AsSpan(), lflUv[(r << 1)..], (int)cm.BitDepth); } else { FilterSelectivelyHoriz( dst.Buf, dst.Stride, mask16X16R, mask8X8R, mask4X4R, mask4X4IntR, cm.LfInfo.Lfthr.AsSpan(), lflUv[(r << 1)..]); } dst.Buf = dst.Buf.Slice(8 * dst.Stride); mask16X16 >>= 4; mask8X8 >>= 4; mask4X4 >>= 4; mask4X4Int >>= 4; } } private enum LfPath { LfPathSlow, LfPath420, LfPath444 } private static void LoopFilterRows( ref Surface frameBuffer, ref Vp9Common cm, Array3 planes, int start, int stop, int step, bool yOnly, LfSync lfSync) { int numPlanes = yOnly ? 1 : Constants.MaxMbPlane; int sbCols = TileInfo.MiColsAlignedToSb(cm.MiCols) >> Constants.MiBlockSizeLog2; LfPath path; int miRow, miCol; Span planesSpan = planes.AsSpan(); if (yOnly) { path = LfPath.LfPath444; } else if (planesSpan[1].SubsamplingY == 1 && planesSpan[1].SubsamplingX == 1) { path = LfPath.LfPath420; } else if (planesSpan[1].SubsamplingY == 0 && planesSpan[1].SubsamplingX == 0) { path = LfPath.LfPath444; } else { path = LfPath.LfPathSlow; } for (miRow = start; miRow < stop; miRow += step) { ArrayPtr> mi = cm.MiGridVisible.Slice(miRow * cm.MiStride); Span lfm = GetLfm(ref cm.Lf, miRow, 0); for (miCol = 0; miCol < cm.MiCols; miCol += Constants.MiBlockSize, lfm = lfm[1..]) { int r = miRow >> Constants.MiBlockSizeLog2; int c = miCol >> Constants.MiBlockSizeLog2; int plane; lfSync.SyncRead(r, c); ReconInter.SetupDstPlanes(planesSpan, ref frameBuffer, miRow, miCol); AdjustMask(ref cm, miRow, miCol, ref lfm[0]); FilterBlockPlaneSs00(ref cm, ref planesSpan[0], miRow, ref lfm[0]); for (plane = 1; plane < numPlanes; ++plane) { 
switch (path) { case LfPath.LfPath420: FilterBlockPlaneSs11(ref cm, ref planesSpan[plane], miRow, ref lfm[0]); break; case LfPath.LfPath444: FilterBlockPlaneSs00(ref cm, ref planesSpan[plane], miRow, ref lfm[0]); break; case LfPath.LfPathSlow: FilterBlockPlaneNon420(ref cm, ref planesSpan[plane], mi.Slice(miCol), miRow, miCol); break; } } lfSync.SyncWrite(r, c, sbCols); } } } public static void LoopFilterFrame( ref Surface frame, ref Vp9Common cm, ref MacroBlockD xd, int frameFilterLevel, bool yOnly, bool partialFrame) { if (frameFilterLevel == 0) { return; } int startMiRow = 0; int miRowsToFilter = cm.MiRows; if (partialFrame && cm.MiRows > 8) { startMiRow = cm.MiRows >> 1; startMiRow &= ~7; miRowsToFilter = Math.Max(cm.MiRows / 8, 8); } int endMiRow = startMiRow + miRowsToFilter; LoopFilterRows(ref frame, ref cm, xd.Plane, startMiRow, endMiRow, Constants.MiBlockSize, yOnly, default); } private static void LoopFilterRowsMt( ref Surface frameBuffer, ref Vp9Common cm, Array3 planes, int start, int stop, bool yOnly, int threadCount) { int sbRows = TileInfo.MiColsAlignedToSb(cm.MiRows) >> Constants.MiBlockSizeLog2; int numTileCols = 1 << cm.Log2TileCols; int numWorkers = Math.Min(threadCount, Math.Min(numTileCols, sbRows)); LfSync lfSync = new(); lfSync.Initialize(cm.Width, sbRows); Ptr frameBufferPtr = new(ref frameBuffer); Ptr cmPtr = new(ref cm); Parallel.For(0, numWorkers, n => { LoopFilterRows( ref frameBufferPtr.Value, ref cmPtr.Value, planes, start + (n * Constants.MiBlockSize), stop, numWorkers * Constants.MiBlockSize, yOnly, lfSync); }); } public static void LoopFilterFrameMt( ref Surface frame, ref Vp9Common cm, ref MacroBlockD xd, int frameFilterLevel, bool yOnly, bool partialFrame, int threadCount) { if (frameFilterLevel == 0) { return; } int startMiRow = 0; int miRowsToFilter = cm.MiRows; if (partialFrame && cm.MiRows > 8) { startMiRow = cm.MiRows >> 1; startMiRow &= ~7; miRowsToFilter = Math.Max(cm.MiRows / 8, 8); } int endMiRow = startMiRow + 
miRowsToFilter; LoopFilterFrameInit(ref cm, frameFilterLevel); LoopFilterRowsMt(ref frame, ref cm, xd.Plane, startMiRow, endMiRow, yOnly, threadCount); } private static unsafe ArrayPtr ConvertToUshortPtr(ArrayPtr s) { return new ArrayPtr((ushort*)s.ToPointer(), s.Length / 2); } } }