exjam/gpu7_tiler.h

## gpu7_tiler.h
#pragma once
#include <cstdint>
#include <cstring>
#include <type_traits>

namespace gpu7::tiler
{

/**
 * Tiling layout of a surface, stored in TiledSurface::tileMode.
 *
 * Kept as an unscoped enum with a uint32_t underlying type because the raw
 * uint32_t field is compared against these enumerators directly.
 * The names follow the ADDR_TM_* modes mentioned in untile(); the numeric
 * values presumably mirror the hardware encoding - confirm before changing.
 */
enum TileMode : uint32_t
{
   // Linear layouts: untile() reports success without moving any data.
   LinearGeneral  = 0x00,
   LinearAligned  = 0x01,

   // Micro tiled ("1D") layouts: handled by detail::untileMicroTiledSurface.
   Tiled1DThin1   = 0x02,
   Tiled1DThick   = 0x03,   // treated as 4 slices thick by the untiler

   // Macro tiled layouts: untile() currently returns false for all of them.
   Tiled2DThin1   = 0x04,
   Tiled2DThin2   = 0x05,
   Tiled2DThin4   = 0x06,
   Tiled2DThick   = 0x07,
   Tiled2BThin1   = 0x08,
   Tiled2BThin2   = 0x09,
   Tiled2BThin4   = 0x0A,
   Tiled2BThick   = 0x0B,
   Tiled3DThin1   = 0x0C,
   Tiled3DThick   = 0x0D,
   Tiled3BThin1   = 0x0E,
   Tiled3BThick   = 0x0F,

   // Linear as well: handled together with the other linear modes.
   LinearSpecial  = 0x10,
};

/**
 * Description of a surface to be untiled.
 *
 * Every member has a default so that a default-constructed surface is in a
 * well-defined (empty) state instead of holding indeterminate values. The
 * struct is still an aggregate, so positional brace initialisation in
 * declaration order keeps working.
 */
struct TiledSurface
{
   void *image = nullptr;       // tiled source bytes read by the untiler
   uint32_t bpp = 0;            // bits per element; the untiler works with bpp / 8 bytes
   uint32_t tileMode = 0;       // a TileMode value (0 == LinearGeneral)
   uint32_t swizzle = 0;        // not read by the untiling code in this file
   uint32_t pitch = 0;          // row length in elements; also the destination row length
   uint32_t height = 0;         // number of rows in elements
   uint32_t depth = 0;          // not read by the untiling code in this file
   uint32_t numSamples = 0;     // not read by the untiling code in this file
   bool isDepth = false;        // selects the depth element order instead of the per-bpp one
   uint32_t bankSwizzle = 0;    // not read by the untiling code in this file
   uint32_t pipeSwizzle = 0;    // not read by the untiling code in this file
};

namespace detail
{

// Every micro tile covers an 8x8 block of elements.
inline constexpr int MicroTileWidth = 8;
inline constexpr int MicroTileHeight = 8;

/**
 * Byte offset of element `idx` (0..63) inside an 8-element-wide micro tile
 * whose rows begin `strideBytes` apart: row = idx / 8, column = idx % 8.
 */
inline std::size_t
microTileElemOffset(int idx, unsigned elemBytes, unsigned strideBytes)
{
   return static_cast<std::size_t>(idx / MicroTileWidth) * strideBytes
        + static_cast<std::size_t>(idx % MicroTileWidth) * elemBytes;
}

/*
 * Contract shared by every micro tiler below:
 *
 *   src  - first byte of one tiled micro tile. Tiled element k is read from
 *          row k / 8, column k % 8, with rows srcStrideBytes apart.
 *   dst  - top-left element of the same tile inside the linear image, with
 *          rows dstStrideBytes apart.
 *
 * Each table lists, for every linear position (the label is the linear index
 * of the first element of that row), the tiled element that is copied there.
 *
 * `apply` is static: the tilers carry no state and are invoked as
 * MicroTiler::apply(...).
 */

struct MicroTiler8
{
   /*
      8 bits per element:
       0:   0,  1,  2,  3,  4,  5,  6,  7,
       8:  16, 17, 18, 19, 20, 21, 22, 23,
      16:   8,  9, 10, 11, 12, 13, 14, 15,
      24:  24, 25, 26, 27, 28, 29, 30, 31,

      32:  32, 33, 34, 35, 36, 37, 38, 39,
      40:  48, 49, 50, 51, 52, 53, 54, 55,
      48:  40, 41, 42, 43, 44, 45, 46, 47,
      56:  56, 57, 58, 59, 60, 61, 62, 63,
   */

   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
   {
      constexpr std::size_t rowBytes = MicroTileWidth * sizeof(uint8_t);

      // Within each group of four rows the two middle rows trade places.
      static constexpr int srcRowInGroup[4] = { 0, 2, 1, 3 };

      for (int y = 0; y < MicroTileHeight; ++y) {
         const int srcY = (y & ~3) + srcRowInGroup[y & 3];
         std::memcpy(dst + static_cast<std::size_t>(y) * dstStrideBytes,
                     src + static_cast<std::size_t>(srcY) * srcStrideBytes,
                     rowBytes);
      }
   }
};

struct MicroTiler16
{
   /*
      16 bits per element (rows are already in linear order):
       0:   0,  1,  2,  3,  4,  5,  6,  7,
       8:   8,  9, 10, 11, 12, 13, 14, 15,
      16:  16, 17, 18, 19, 20, 21, 22, 23,
      24:  24, 25, 26, 27, 28, 29, 30, 31,
      32:  32, 33, 34, 35, 36, 37, 38, 39,
      40:  40, 41, 42, 43, 44, 45, 46, 47,
      48:  48, 49, 50, 51, 52, 53, 54, 55,
      56:  56, 57, 58, 59, 60, 61, 62, 63,
   */

   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
   {
      constexpr std::size_t rowBytes = MicroTileWidth * sizeof(uint16_t);

      for (int y = 0; y < MicroTileHeight; ++y) {
         std::memcpy(dst, src, rowBytes);
         src += srcStrideBytes;
         dst += dstStrideBytes;
      }
   }
};

struct MicroTiler32
{
   /*
      32 bits per element:
       0:   0,  1,  2,  3,    8,  9, 10, 11,
       8:   4,  5,  6,  7,   12, 13, 14, 15,

      16:  16, 17, 18, 19,   24, 25, 26, 27,
      24:  20, 21, 22, 23,   28, 29, 30, 31,

      32:  32, 33, 34, 35,   40, 41, 42, 43,
      40:  36, 37, 38, 39,   44, 45, 46, 47,

      48:  48, 49, 50, 51,   56, 57, 58, 59,
      56:  52, 53, 54, 55,   60, 61, 62, 63,
   */

   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
   {
      constexpr unsigned elemBytes = sizeof(uint32_t);
      constexpr std::size_t groupBytes = 4 * elemBytes;   // runs of four elements

      auto srcElem = [&](int idx) { return src + microTileElemOffset(idx, elemBytes, srcStrideBytes); };
      auto dstElem = [&](int idx) { return dst + microTileElemOffset(idx, elemBytes, dstStrideBytes); };

      // Each pair of rows holds four runs; the two middle runs are swapped.
      for (int y = 0; y < MicroTileHeight; y += 2) {
         const int base = y * MicroTileWidth;
         std::memcpy(dstElem(base + 0), srcElem(base + 0), groupBytes);
         std::memcpy(dstElem(base + 4), srcElem(base + 8), groupBytes);
         std::memcpy(dstElem(base + 8), srcElem(base + 4), groupBytes);
         std::memcpy(dstElem(base + 12), srcElem(base + 12), groupBytes);
      }
   }
};

struct MicroTiler64
{
   /*
      64 bits per element:
       0:   0,  1,    4,  5,    8,  9,   12, 13,
       8:   2,  3,    6,  7,   10, 11,   14, 15,

      16:  16, 17,   20, 21,   24, 25,   28, 29,
      24:  18, 19,   22, 23,   26, 27,   30, 31,

      32:  32, 33,   36, 37,   40, 41,   44, 45,
      40:  34, 35,   38, 39,   42, 43,   46, 47,

      48:  48, 49,   52, 53,   56, 57,   60, 61,
      56:  50, 51,   54, 55,   58, 59,   62, 63,
   */

   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
   {
      constexpr unsigned elemBytes = sizeof(uint64_t);
      constexpr std::size_t groupBytes = 2 * elemBytes;   // runs of two elements

      auto srcElem = [&](int idx) { return src + microTileElemOffset(idx, elemBytes, srcStrideBytes); };
      auto dstElem = [&](int idx) { return dst + microTileElemOffset(idx, elemBytes, dstStrideBytes); };

      for (int y = 0; y < MicroTileHeight; y += 2) {
         const int base = y * MicroTileWidth;

         // Column pair g of the upper/lower row comes from tiled runs 2g and 2g + 1.
         for (int g = 0; g < MicroTileWidth / 2; ++g) {
            std::memcpy(dstElem(base + 2 * g), srcElem(base + 4 * g), groupBytes);
            std::memcpy(dstElem(base + 2 * g + MicroTileWidth), srcElem(base + 4 * g + 2), groupBytes);
         }
      }
   }
};

struct MicroTiler128
{
   /*
      128 bits per element:
         0:   0,  2,    4,  6,    8, 10,   12, 14,
         8:   1,  3,    5,  7,    9, 11,   13, 15,

        16:  16, 18,   20, 22,   24, 26,   28, 30,
        24:  17, 19,   21, 23,   25, 27,   29, 31,

        32:  32, 34,   36, 38,   40, 42,   44, 46,
        40:  33, 35,   37, 39,   41, 43,   45, 47,

        48:  48, 50,   52, 54,   56, 58,   60, 62,
        56:  49, 51,   53, 55,   57, 59,   61, 63,
   */

   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
   {
      constexpr unsigned elemBytes = 16;   // 128 bits

      auto srcElem = [&](int idx) { return src + microTileElemOffset(idx, elemBytes, srcStrideBytes); };
      auto dstElem = [&](int idx) { return dst + microTileElemOffset(idx, elemBytes, dstStrideBytes); };

      for (int y = 0; y < MicroTileHeight; y += 2) {
         const int base = y * MicroTileWidth;

         // Tiled elements alternate between the upper and the lower row.
         for (int x = 0; x < MicroTileWidth; ++x) {
            std::memcpy(dstElem(base + x), srcElem(base + 2 * x), elemBytes);
            std::memcpy(dstElem(base + x + MicroTileWidth), srcElem(base + 2 * x + 1), elemBytes);
         }
      }
   }
};

struct MicroTilerDepth
{
   /*
      depth elements:
          0:   0,  1,  4,  5,   16, 17, 20, 21,
          8:   2,  3,  6,  7,   18, 19, 22, 23,
         16:   8,  9, 12, 13,   24, 25, 28, 29,
         24:  10, 11, 14, 15,   26, 27, 30, 31,

         32:  32, 33, 36, 37,   48, 49, 52, 53,
         40:  34, 35, 38, 39,   50, 51, 54, 55,
         48:  40, 41, 44, 45,   56, 57, 60, 61,
         56:  42, 43, 46, 47,   58, 59, 62, 63,
   */

   /**
    * @param bpp  Size of one element in BYTES (despite the name): it is used
    *             directly as a byte multiplier for element offsets.
    */
   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes, unsigned bpp)
   {
      const std::size_t groupBytes = 2 * static_cast<std::size_t>(bpp);   // runs of two elements

      // Offsets of the eight two-element runs inside one 4x4 quadrant.
      static constexpr int dstRun[8] = { 0, 2, 8, 10, 16, 18, 24, 26 };
      static constexpr int srcRun[8] = { 0, 4, 2, 6, 8, 12, 10, 14 };

      auto srcElem = [&](int idx) { return src + microTileElemOffset(idx, bpp, srcStrideBytes); };
      auto dstElem = [&](int idx) { return dst + microTileElemOffset(idx, bpp, dstStrideBytes); };

      for (int y = 0; y < MicroTileHeight; y += 4) {
         for (int x = 0; x < MicroTileWidth; x += 4) {
            // Each quadrant occupies 16 consecutive tiled elements, ordered
            // top-left, top-right, bottom-left, bottom-right.
            const int srcBase = y * MicroTileWidth + x * 4;
            const int dstBase = y * MicroTileWidth + x;

            for (int run = 0; run < 8; ++run) {
               std::memcpy(dstElem(dstBase + dstRun[run]), srcElem(srcBase + srcRun[run]), groupBytes);
            }
         }
      }
   }
};

/**
 * Untiles every micro tile of one slice of `tiled` into the linear image `dst`.
 *
 * Micro tiles are consumed from tiled.image in row-major tile order, starting
 * at byte `sliceOffset` and advancing `microTileBytes` per tile. Each tile is
 * written to its 8x8 element block in `dst`, whose rows are tiled.pitch
 * elements long.
 *
 * tiled.pitch and tiled.height are divided by the micro tile size with
 * integer division, so a partial tile at the right/bottom edge is skipped.
 *
 * @tparam MicroTiler     One of the MicroTiler* structs; selects the element order.
 * @param tiled           Source surface description (image, bpp, pitch, height are read).
 * @param sliceOffset     Byte offset of the slice inside tiled.image.
 * @param microTileBytes  Byte distance between consecutive micro tiles in the source.
 * @param dst             Destination buffer for the linear image; must hold at
 *                        least pitch * height * (bpp / 8) bytes.
 */
template<typename MicroTiler>
void applyMicroTiler(TiledSurface &tiled, unsigned sliceOffset, unsigned microTileBytes, void *dst)
{
   const uint32_t bytesPerPixel = tiled.bpp / 8;
   const uint32_t microTilesPerRow = tiled.pitch / MicroTileWidth;
   const uint32_t microTilesNumRows = tiled.height / MicroTileHeight;
   const unsigned srcStrideBytes = MicroTileWidth * bytesPerPixel;
   const unsigned dstStrideBytes = tiled.pitch * bytesPerPixel;

   auto *srcBytes = static_cast<uint8_t *>(tiled.image);
   auto *dstBytes = static_cast<uint8_t *>(dst);
   std::size_t srcOffset = sliceOffset;

   for (uint32_t tileY = 0; tileY < microTilesNumRows; ++tileY) {
      for (uint32_t tileX = 0; tileX < microTilesPerRow; ++tileX) {
         const std::size_t pixelX = static_cast<std::size_t>(tileX) * MicroTileWidth;
         const std::size_t pixelY = static_cast<std::size_t>(tileY) * MicroTileHeight;
         const std::size_t dstOffset = (pixelX + pixelY * tiled.pitch) * bytesPerPixel;

         uint8_t *tileSrc = srcBytes + srcOffset;
         uint8_t *tileDst = dstBytes + dstOffset;

         // The tiler is invoked through a temporary so the call is valid
         // whether `apply` is declared static or not.
         if constexpr (std::is_same_v<MicroTiler, MicroTilerDepth>) {
            // The depth tiler is size-agnostic and needs the element size in bytes.
            MicroTiler{}.apply(tileSrc, srcStrideBytes, tileDst, dstStrideBytes, bytesPerPixel);
         } else {
            MicroTiler{}.apply(tileSrc, srcStrideBytes, tileDst, dstStrideBytes);
         }

         srcOffset += microTileBytes;
      }
   }
}

/**
 * Untiles one slice of a micro tiled (Tiled1DThin1 / Tiled1DThick) surface.
 *
 * @param tiled  Source surface; image, bpp, tileMode, pitch, height and isDepth are read.
 * @param dst    Destination for the linear slice (pitch * height * bpp / 8 bytes).
 * @param slice  Slice index; only slice / thickness is used to locate the data.
 * @return true when the slice was untiled; false for a null buffer, a
 *         negative slice, a bpp below 8 or an unsupported colour bpp.
 *
 * NOTE(review): for Tiled1DThick the tiles are stepped with a thickness of 4,
 * but the per-bpp 2D element orders are reused and no offset for
 * slice % thickness is applied - confirm this matches the hardware layout.
 */
inline bool untileMicroTiledSurface(TiledSurface &tiled, void *dst, int slice)
{
   const uint32_t bytesPerPixel = tiled.bpp / 8;

   if (!tiled.image || !dst || slice < 0 || bytesPerPixel == 0) {
      return false;
   }

   const int microTileThickness = (tiled.tileMode == TileMode::Tiled1DThick) ? 4 : 1;
   const unsigned microTileBytes = MicroTileWidth * MicroTileHeight * microTileThickness * bytesPerPixel;

   // Calculate slice offset
   const int microTileIndexZ = slice / microTileThickness;
   const unsigned sliceBytes = tiled.pitch * tiled.height * microTileThickness * bytesPerPixel;
   const unsigned sliceOffset = microTileIndexZ * sliceBytes;

   // Depth surfaces share one element order regardless of bpp.
   if (tiled.isDepth) {
      applyMicroTiler<MicroTilerDepth>(tiled, sliceOffset, microTileBytes, dst);
      return true;
   }

   switch (tiled.bpp) {
   case 8:
      applyMicroTiler<MicroTiler8>(tiled, sliceOffset, microTileBytes, dst);
      return true;
   case 16:
      applyMicroTiler<MicroTiler16>(tiled, sliceOffset, microTileBytes, dst);
      return true;
   case 32:
      applyMicroTiler<MicroTiler32>(tiled, sliceOffset, microTileBytes, dst);
      return true;
   case 64:
      applyMicroTiler<MicroTiler64>(tiled, sliceOffset, microTileBytes, dst);
      return true;
   case 128:
      applyMicroTiler<MicroTiler128>(tiled, sliceOffset, microTileBytes, dst);
      return true;
   default:
      // No element order is defined for this bpp.
      return false;
   }
}

} // namespace detail

/**
 * Converts `tiled` into a linear image.
 *
 * - Linear modes: nothing is copied; the function reports success and leaves
 *   `dst` untouched, since tiled.image is already linear.
 * - Micro tiled modes (Tiled1DThin1, Tiled1DThick): slice 0 is untiled into `dst`.
 * - Macro tiled modes and unknown values: not implemented, returns false.
 *
 * Declared inline because it is defined in a header.
 *
 * @param tiled  Surface to convert.
 * @param dst    Destination buffer for the linear image.
 * @return true on success, false when the tile mode (or its bpp) is unsupported.
 */
inline bool untile(TiledSurface &tiled, void *dst)
{
   switch (static_cast<TileMode>(tiled.tileMode)) {
   // Linear
   case TileMode::LinearGeneral:
   case TileMode::LinearAligned:
   case TileMode::LinearSpecial:
      // Already "untiled"
      return true;

   // MicroTiled
   case TileMode::Tiled1DThin1:
   case TileMode::Tiled1DThick:
      return detail::untileMicroTiledSurface(tiled, dst, 0);

   // MacroTiled
   case TileMode::Tiled2DThin1:
   case TileMode::Tiled2DThin2:
   case TileMode::Tiled2DThin4:
   case TileMode::Tiled2DThick:
   case TileMode::Tiled2BThin1:
   case TileMode::Tiled2BThin2:
   case TileMode::Tiled2BThin4:
   case TileMode::Tiled2BThick:
   case TileMode::Tiled3DThin1:
   case TileMode::Tiled3DThick:
   case TileMode::Tiled3BThin1:
   case TileMode::Tiled3BThick:
      // TODO: macro tiled surfaces are not supported yet. The addressing
      // depends on x, y, slice, sample, bpp, pitch, height, numSamples,
      // tileMode, isDepth, pipeSwizzle and bankSwizzle (compare addrlib's
      // ComputeSurfaceAddrFromCoordMacroTiled).
      return false;

   default:
      return false;
   }
}

} // namespace gpu7::tiler
	#pragma once
	#include <cstdint>
	#include <cstring>

	namespace gpu7::tiler
	{

	/**
	 * Tiling layout of a surface, stored in TiledSurface::tileMode.
	 *
	 * Kept as an unscoped enum with a uint32_t underlying type because the raw
	 * uint32_t field is compared against these enumerators directly.
	 * The names follow the ADDR_TM_* modes mentioned in untile(); the numeric
	 * values presumably mirror the hardware encoding - confirm before changing.
	 */
	enum TileMode : uint32_t
	{
	   // Linear layouts: untile() reports success without moving any data.
	   LinearGeneral  = 0x00,
	   LinearAligned  = 0x01,

	   // Micro tiled ("1D") layouts: handled by detail::untileMicroTiledSurface.
	   Tiled1DThin1   = 0x02,
	   Tiled1DThick   = 0x03,   // treated as 4 slices thick by the untiler

	   // Macro tiled layouts: untile() currently returns false for all of them.
	   Tiled2DThin1   = 0x04,
	   Tiled2DThin2   = 0x05,
	   Tiled2DThin4   = 0x06,
	   Tiled2DThick   = 0x07,
	   Tiled2BThin1   = 0x08,
	   Tiled2BThin2   = 0x09,
	   Tiled2BThin4   = 0x0A,
	   Tiled2BThick   = 0x0B,
	   Tiled3DThin1   = 0x0C,
	   Tiled3DThick   = 0x0D,
	   Tiled3BThin1   = 0x0E,
	   Tiled3BThick   = 0x0F,

	   // Linear as well: handled together with the other linear modes.
	   LinearSpecial  = 0x10,
	};

	/**
	 * Description of a surface to be untiled.
	 *
	 * Every member has a default so that a default-constructed surface is in a
	 * well-defined (empty) state instead of holding indeterminate values. The
	 * struct is still an aggregate, so positional brace initialisation in
	 * declaration order keeps working. `image` is a pointer to the tiled bytes.
	 */
	struct TiledSurface
	{
	   void *image = nullptr;       // tiled source bytes read by the untiler
	   uint32_t bpp = 0;            // bits per element; the untiler works with bpp / 8 bytes
	   uint32_t tileMode = 0;       // a TileMode value (0 == LinearGeneral)
	   uint32_t swizzle = 0;        // not read by the untiling code in this file
	   uint32_t pitch = 0;          // row length in elements; also the destination row length
	   uint32_t height = 0;         // number of rows in elements
	   uint32_t depth = 0;          // not read by the untiling code in this file
	   uint32_t numSamples = 0;     // not read by the untiling code in this file
	   bool isDepth = false;        // selects the depth element order instead of the per-bpp one
	   uint32_t bankSwizzle = 0;    // not read by the untiling code in this file
	   uint32_t pipeSwizzle = 0;    // not read by the untiling code in this file
	};

	namespace detail
	{

	// Every micro tile covers an 8x8 block of elements.
	inline constexpr int MicroTileWidth = 8;
	inline constexpr int MicroTileHeight = 8;

	/**
	 * Byte offset of element `idx` (0..63) inside an 8-element-wide micro tile
	 * whose rows begin `strideBytes` apart: row = idx / 8, column = idx % 8.
	 */
	inline std::size_t
	microTileElemOffset(int idx, unsigned elemBytes, unsigned strideBytes)
	{
	   return static_cast<std::size_t>(idx / MicroTileWidth) * strideBytes
	        + static_cast<std::size_t>(idx % MicroTileWidth) * elemBytes;
	}

	/*
	 * Contract shared by every micro tiler below:
	 *
	 *   src  - first byte of one tiled micro tile. Tiled element k is read from
	 *          row k / 8, column k % 8, with rows srcStrideBytes apart.
	 *   dst  - top-left element of the same tile inside the linear image, with
	 *          rows dstStrideBytes apart.
	 *
	 * Both are byte pointers (they are offset and handed to memcpy).
	 *
	 * Each table lists, for every linear position (the label is the linear index
	 * of the first element of that row), the tiled element that is copied there.
	 *
	 * `apply` is static: the tilers carry no state and are invoked as
	 * MicroTiler::apply(...).
	 */

	struct MicroTiler8
	{
	   /*
	      8 bits per element:
	       0:   0,  1,  2,  3,  4,  5,  6,  7,
	       8:  16, 17, 18, 19, 20, 21, 22, 23,
	      16:   8,  9, 10, 11, 12, 13, 14, 15,
	      24:  24, 25, 26, 27, 28, 29, 30, 31,

	      32:  32, 33, 34, 35, 36, 37, 38, 39,
	      40:  48, 49, 50, 51, 52, 53, 54, 55,
	      48:  40, 41, 42, 43, 44, 45, 46, 47,
	      56:  56, 57, 58, 59, 60, 61, 62, 63,
	   */

	   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
	   {
	      constexpr std::size_t rowBytes = MicroTileWidth * sizeof(uint8_t);

	      // Within each group of four rows the two middle rows trade places.
	      static constexpr int srcRowInGroup[4] = { 0, 2, 1, 3 };

	      for (int y = 0; y < MicroTileHeight; ++y) {
	         const int srcY = (y & ~3) + srcRowInGroup[y & 3];
	         std::memcpy(dst + static_cast<std::size_t>(y) * dstStrideBytes,
	                     src + static_cast<std::size_t>(srcY) * srcStrideBytes,
	                     rowBytes);
	      }
	   }
	};

	struct MicroTiler16
	{
	   /*
	      16 bits per element (rows are already in linear order):
	       0:   0,  1,  2,  3,  4,  5,  6,  7,
	       8:   8,  9, 10, 11, 12, 13, 14, 15,
	      16:  16, 17, 18, 19, 20, 21, 22, 23,
	      24:  24, 25, 26, 27, 28, 29, 30, 31,
	      32:  32, 33, 34, 35, 36, 37, 38, 39,
	      40:  40, 41, 42, 43, 44, 45, 46, 47,
	      48:  48, 49, 50, 51, 52, 53, 54, 55,
	      56:  56, 57, 58, 59, 60, 61, 62, 63,
	   */

	   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
	   {
	      constexpr std::size_t rowBytes = MicroTileWidth * sizeof(uint16_t);

	      for (int y = 0; y < MicroTileHeight; ++y) {
	         std::memcpy(dst, src, rowBytes);
	         src += srcStrideBytes;
	         dst += dstStrideBytes;
	      }
	   }
	};

	struct MicroTiler32
	{
	   /*
	      32 bits per element:
	       0:   0,  1,  2,  3,    8,  9, 10, 11,
	       8:   4,  5,  6,  7,   12, 13, 14, 15,

	      16:  16, 17, 18, 19,   24, 25, 26, 27,
	      24:  20, 21, 22, 23,   28, 29, 30, 31,

	      32:  32, 33, 34, 35,   40, 41, 42, 43,
	      40:  36, 37, 38, 39,   44, 45, 46, 47,

	      48:  48, 49, 50, 51,   56, 57, 58, 59,
	      56:  52, 53, 54, 55,   60, 61, 62, 63,
	   */

	   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
	   {
	      constexpr unsigned elemBytes = sizeof(uint32_t);
	      constexpr std::size_t groupBytes = 4 * elemBytes;   // runs of four elements

	      auto srcElem = [&](int idx) { return src + microTileElemOffset(idx, elemBytes, srcStrideBytes); };
	      auto dstElem = [&](int idx) { return dst + microTileElemOffset(idx, elemBytes, dstStrideBytes); };

	      // Each pair of rows holds four runs; the two middle runs are swapped.
	      for (int y = 0; y < MicroTileHeight; y += 2) {
	         const int base = y * MicroTileWidth;
	         std::memcpy(dstElem(base + 0), srcElem(base + 0), groupBytes);
	         std::memcpy(dstElem(base + 4), srcElem(base + 8), groupBytes);
	         std::memcpy(dstElem(base + 8), srcElem(base + 4), groupBytes);
	         std::memcpy(dstElem(base + 12), srcElem(base + 12), groupBytes);
	      }
	   }
	};

	struct MicroTiler64
	{
	   /*
	      64 bits per element:
	       0:   0,  1,    4,  5,    8,  9,   12, 13,
	       8:   2,  3,    6,  7,   10, 11,   14, 15,

	      16:  16, 17,   20, 21,   24, 25,   28, 29,
	      24:  18, 19,   22, 23,   26, 27,   30, 31,

	      32:  32, 33,   36, 37,   40, 41,   44, 45,
	      40:  34, 35,   38, 39,   42, 43,   46, 47,

	      48:  48, 49,   52, 53,   56, 57,   60, 61,
	      56:  50, 51,   54, 55,   58, 59,   62, 63,
	   */

	   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
	   {
	      constexpr unsigned elemBytes = sizeof(uint64_t);
	      constexpr std::size_t groupBytes = 2 * elemBytes;   // runs of two elements

	      auto srcElem = [&](int idx) { return src + microTileElemOffset(idx, elemBytes, srcStrideBytes); };
	      auto dstElem = [&](int idx) { return dst + microTileElemOffset(idx, elemBytes, dstStrideBytes); };

	      for (int y = 0; y < MicroTileHeight; y += 2) {
	         const int base = y * MicroTileWidth;

	         // Column pair g of the upper/lower row comes from tiled runs 2g and 2g + 1.
	         for (int g = 0; g < MicroTileWidth / 2; ++g) {
	            std::memcpy(dstElem(base + 2 * g), srcElem(base + 4 * g), groupBytes);
	            std::memcpy(dstElem(base + 2 * g + MicroTileWidth), srcElem(base + 4 * g + 2), groupBytes);
	         }
	      }
	   }
	};

	struct MicroTiler128
	{
	   /*
	      128 bits per element:
	         0:   0,  2,    4,  6,    8, 10,   12, 14,
	         8:   1,  3,    5,  7,    9, 11,   13, 15,

	        16:  16, 18,   20, 22,   24, 26,   28, 30,
	        24:  17, 19,   21, 23,   25, 27,   29, 31,

	        32:  32, 34,   36, 38,   40, 42,   44, 46,
	        40:  33, 35,   37, 39,   41, 43,   45, 47,

	        48:  48, 50,   52, 54,   56, 58,   60, 62,
	        56:  49, 51,   53, 55,   57, 59,   61, 63,
	   */

	   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes)
	   {
	      constexpr unsigned elemBytes = 16;   // 128 bits

	      auto srcElem = [&](int idx) { return src + microTileElemOffset(idx, elemBytes, srcStrideBytes); };
	      auto dstElem = [&](int idx) { return dst + microTileElemOffset(idx, elemBytes, dstStrideBytes); };

	      for (int y = 0; y < MicroTileHeight; y += 2) {
	         const int base = y * MicroTileWidth;

	         // Tiled elements alternate between the upper and the lower row.
	         for (int x = 0; x < MicroTileWidth; ++x) {
	            std::memcpy(dstElem(base + x), srcElem(base + 2 * x), elemBytes);
	            std::memcpy(dstElem(base + x + MicroTileWidth), srcElem(base + 2 * x + 1), elemBytes);
	         }
	      }
	   }
	};

	struct MicroTilerDepth
	{
	   /*
	      depth elements:
	          0:   0,  1,  4,  5,   16, 17, 20, 21,
	          8:   2,  3,  6,  7,   18, 19, 22, 23,
	         16:   8,  9, 12, 13,   24, 25, 28, 29,
	         24:  10, 11, 14, 15,   26, 27, 30, 31,

	         32:  32, 33, 36, 37,   48, 49, 52, 53,
	         40:  34, 35, 38, 39,   50, 51, 54, 55,
	         48:  40, 41, 44, 45,   56, 57, 60, 61,
	         56:  42, 43, 46, 47,   58, 59, 62, 63,
	   */

	   /**
	    * @param bpp  Size of one element in BYTES (despite the name): it is used
	    *             directly as a byte multiplier for element offsets.
	    */
	   static void apply(const uint8_t *src, unsigned srcStrideBytes, uint8_t *dst, unsigned dstStrideBytes, unsigned bpp)
	   {
	      const std::size_t groupBytes = 2 * static_cast<std::size_t>(bpp);   // runs of two elements

	      // Offsets of the eight two-element runs inside one 4x4 quadrant.
	      static constexpr int dstRun[8] = { 0, 2, 8, 10, 16, 18, 24, 26 };
	      static constexpr int srcRun[8] = { 0, 4, 2, 6, 8, 12, 10, 14 };

	      auto srcElem = [&](int idx) { return src + microTileElemOffset(idx, bpp, srcStrideBytes); };
	      auto dstElem = [&](int idx) { return dst + microTileElemOffset(idx, bpp, dstStrideBytes); };

	      for (int y = 0; y < MicroTileHeight; y += 4) {
	         for (int x = 0; x < MicroTileWidth; x += 4) {
	            // Each quadrant occupies 16 consecutive tiled elements, ordered
	            // top-left, top-right, bottom-left, bottom-right.
	            const int srcBase = y * MicroTileWidth + x * 4;
	            const int dstBase = y * MicroTileWidth + x;

	            for (int run = 0; run < 8; ++run) {
	               std::memcpy(dstElem(dstBase + dstRun[run]), srcElem(srcBase + srcRun[run]), groupBytes);
	            }
	         }
	      }
	   }
	};

	/**
	 * Untiles every micro tile of one slice of `tiled` into the linear image `dst`.
	 *
	 * Micro tiles are consumed from tiled.image in row-major tile order, starting
	 * at byte `sliceOffset` and advancing `microTileBytes` per tile. Each tile is
	 * written to its 8x8 element block in `dst`, whose rows are tiled.pitch
	 * elements long.
	 *
	 * tiled.pitch and tiled.height are divided by the micro tile size with
	 * integer division, so a partial tile at the right/bottom edge is skipped.
	 *
	 * @tparam MicroTiler     One of the MicroTiler* structs; selects the element order.
	 * @param tiled           Source surface description (image, bpp, pitch, height are read).
	 * @param sliceOffset     Byte offset of the slice inside tiled.image.
	 * @param microTileBytes  Byte distance between consecutive micro tiles in the source.
	 * @param dst             Destination buffer for the linear image; must hold at
	 *                        least pitch * height * (bpp / 8) bytes.
	 */
	template<typename MicroTiler>
	void applyMicroTiler(TiledSurface &tiled, unsigned sliceOffset, unsigned microTileBytes, void *dst)
	{
	   const uint32_t bytesPerPixel = tiled.bpp / 8;
	   const uint32_t microTilesPerRow = tiled.pitch / MicroTileWidth;
	   const uint32_t microTilesNumRows = tiled.height / MicroTileHeight;
	   const unsigned srcStrideBytes = MicroTileWidth * bytesPerPixel;
	   const unsigned dstStrideBytes = tiled.pitch * bytesPerPixel;

	   auto *srcBytes = static_cast<uint8_t *>(tiled.image);
	   auto *dstBytes = static_cast<uint8_t *>(dst);
	   std::size_t srcOffset = sliceOffset;

	   for (uint32_t tileY = 0; tileY < microTilesNumRows; ++tileY) {
	      for (uint32_t tileX = 0; tileX < microTilesPerRow; ++tileX) {
	         const std::size_t pixelX = static_cast<std::size_t>(tileX) * MicroTileWidth;
	         const std::size_t pixelY = static_cast<std::size_t>(tileY) * MicroTileHeight;
	         const std::size_t dstOffset = (pixelX + pixelY * tiled.pitch) * bytesPerPixel;

	         uint8_t *tileSrc = srcBytes + srcOffset;
	         uint8_t *tileDst = dstBytes + dstOffset;

	         // The tiler is invoked through a temporary so the call is valid
	         // whether `apply` is declared static or not.
	         if constexpr (std::is_same_v<MicroTiler, MicroTilerDepth>) {
	            // The depth tiler is size-agnostic and needs the element size in bytes.
	            MicroTiler{}.apply(tileSrc, srcStrideBytes, tileDst, dstStrideBytes, bytesPerPixel);
	         } else {
	            MicroTiler{}.apply(tileSrc, srcStrideBytes, tileDst, dstStrideBytes);
	         }

	         srcOffset += microTileBytes;
	      }
	   }
	}

	/**
	 * Untiles one slice of a micro tiled (Tiled1DThin1 / Tiled1DThick) surface.
	 *
	 * @param tiled  Source surface; image, bpp, tileMode, pitch, height and isDepth are read.
	 * @param dst    Destination for the linear slice (pitch * height * bpp / 8 bytes).
	 * @param slice  Slice index; only slice / thickness is used to locate the data.
	 * @return true when the slice was untiled; false for a null buffer, a
	 *         negative slice, a bpp below 8 or an unsupported colour bpp.
	 *
	 * NOTE(review): for Tiled1DThick the tiles are stepped with a thickness of 4,
	 * but the per-bpp 2D element orders are reused and no offset for
	 * slice % thickness is applied - confirm this matches the hardware layout.
	 */
	inline bool untileMicroTiledSurface(TiledSurface &tiled, void *dst, int slice)
	{
	   const uint32_t bytesPerPixel = tiled.bpp / 8;

	   if (!tiled.image || !dst || slice < 0 || bytesPerPixel == 0) {
	      return false;
	   }

	   const int microTileThickness = (tiled.tileMode == TileMode::Tiled1DThick) ? 4 : 1;
	   const unsigned microTileBytes = MicroTileWidth * MicroTileHeight * microTileThickness * bytesPerPixel;

	   // Calculate slice offset
	   const int microTileIndexZ = slice / microTileThickness;
	   const unsigned sliceBytes = tiled.pitch * tiled.height * microTileThickness * bytesPerPixel;
	   const unsigned sliceOffset = microTileIndexZ * sliceBytes;

	   // Depth surfaces share one element order regardless of bpp.
	   if (tiled.isDepth) {
	      applyMicroTiler<MicroTilerDepth>(tiled, sliceOffset, microTileBytes, dst);
	      return true;
	   }

	   switch (tiled.bpp) {
	   case 8:
	      applyMicroTiler<MicroTiler8>(tiled, sliceOffset, microTileBytes, dst);
	      return true;
	   case 16:
	      applyMicroTiler<MicroTiler16>(tiled, sliceOffset, microTileBytes, dst);
	      return true;
	   case 32:
	      applyMicroTiler<MicroTiler32>(tiled, sliceOffset, microTileBytes, dst);
	      return true;
	   case 64:
	      applyMicroTiler<MicroTiler64>(tiled, sliceOffset, microTileBytes, dst);
	      return true;
	   case 128:
	      applyMicroTiler<MicroTiler128>(tiled, sliceOffset, microTileBytes, dst);
	      return true;
	   default:
	      // No element order is defined for this bpp.
	      return false;
	   }
	}

	} // namespace detail

	/**
	 * Converts `tiled` into a linear image.
	 *
	 * - Linear modes: nothing is copied; the function reports success and leaves
	 *   `dst` untouched, since tiled.image is already linear.
	 * - Micro tiled modes (Tiled1DThin1, Tiled1DThick): slice 0 is untiled into `dst`.
	 * - Macro tiled modes and unknown values: not implemented, returns false.
	 *
	 * Declared inline because it is defined in a header.
	 *
	 * @param tiled  Surface to convert.
	 * @param dst    Destination buffer for the linear image.
	 * @return true on success, false when the tile mode (or its bpp) is unsupported.
	 */
	inline bool untile(TiledSurface &tiled, void *dst)
	{
	   switch (static_cast<TileMode>(tiled.tileMode)) {
	   // Linear
	   case TileMode::LinearGeneral:
	   case TileMode::LinearAligned:
	   case TileMode::LinearSpecial:
	      // Already "untiled"
	      return true;

	   // MicroTiled
	   case TileMode::Tiled1DThin1:
	   case TileMode::Tiled1DThick:
	      return detail::untileMicroTiledSurface(tiled, dst, 0);

	   // MacroTiled
	   case TileMode::Tiled2DThin1:
	   case TileMode::Tiled2DThin2:
	   case TileMode::Tiled2DThin4:
	   case TileMode::Tiled2DThick:
	   case TileMode::Tiled2BThin1:
	   case TileMode::Tiled2BThin2:
	   case TileMode::Tiled2BThin4:
	   case TileMode::Tiled2BThick:
	   case TileMode::Tiled3DThin1:
	   case TileMode::Tiled3DThick:
	   case TileMode::Tiled3BThin1:
	   case TileMode::Tiled3BThick:
	      // TODO: macro tiled surfaces are not supported yet. The addressing
	      // depends on x, y, slice, sample, bpp, pitch, height, numSamples,
	      // tileMode, isDepth, pipeSwizzle and bankSwizzle (compare addrlib's
	      // ComputeSurfaceAddrFromCoordMacroTiled).
	      return false;

	   default:
	      return false;
	   }
	}

	} // namespace gpu7::tiler