swr: [rasterizer core] NUMA optimizations...
- Affinitize hot-tile memory to specific NUMA nodes. - Only do BE work for macrotiles assoicated with the numa node
This commit is contained in:
parent
090be2e434
commit
93c1a2dedf
5 changed files with 106 additions and 66 deletions
|
|
@ -184,7 +184,7 @@ void QueueWork(SWR_CONTEXT *pContext)
|
|||
static TileSet lockedTiles;
|
||||
uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
|
||||
WorkOnFifoFE(pContext, 0, curDraw[0], 0);
|
||||
WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
|
||||
WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
|||
|
|
@ -349,7 +349,9 @@ void WorkOnFifoBE(
|
|||
SWR_CONTEXT *pContext,
|
||||
uint32_t workerId,
|
||||
uint64_t &curDrawBE,
|
||||
TileSet& lockedTiles)
|
||||
TileSet& lockedTiles,
|
||||
uint32_t numaNode,
|
||||
uint32_t numaMask)
|
||||
{
|
||||
// Find the first incomplete draw that has pending work. If no such draw is found then
|
||||
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
|
||||
|
|
@ -390,68 +392,78 @@ void WorkOnFifoBE(
|
|||
|
||||
for (uint32_t tileID : macroTiles)
|
||||
{
|
||||
// Only work on tiles for for this numa node
|
||||
uint32_t x, y;
|
||||
pDC->pTileMgr->getTileIndices(tileID, x, y);
|
||||
if (((x ^ y) & numaMask) != numaNode)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
|
||||
|
||||
// can only work on this draw if it's not in use by other threads
|
||||
if (lockedTiles.find(tileID) == lockedTiles.end())
|
||||
if (!tile.getNumQueued())
|
||||
{
|
||||
if (tile.getNumQueued())
|
||||
continue;
|
||||
}
|
||||
|
||||
// can only work on this draw if it's not in use by other threads
|
||||
if (lockedTiles.find(tileID) != lockedTiles.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (tile.tryLock())
|
||||
{
|
||||
BE_WORK *pWork;
|
||||
|
||||
RDTSC_START(WorkerFoundWork);
|
||||
|
||||
uint32_t numWorkItems = tile.getNumQueued();
|
||||
SWR_ASSERT(numWorkItems);
|
||||
|
||||
pWork = tile.peek();
|
||||
SWR_ASSERT(pWork);
|
||||
if (pWork->type == DRAW)
|
||||
{
|
||||
if (tile.tryLock())
|
||||
{
|
||||
BE_WORK *pWork;
|
||||
|
||||
RDTSC_START(WorkerFoundWork);
|
||||
|
||||
uint32_t numWorkItems = tile.getNumQueued();
|
||||
|
||||
if (numWorkItems != 0)
|
||||
{
|
||||
pWork = tile.peek();
|
||||
SWR_ASSERT(pWork);
|
||||
if (pWork->type == DRAW)
|
||||
{
|
||||
pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
|
||||
}
|
||||
}
|
||||
|
||||
while ((pWork = tile.peek()) != nullptr)
|
||||
{
|
||||
pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
|
||||
tile.dequeue();
|
||||
}
|
||||
RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
|
||||
|
||||
_ReadWriteBarrier();
|
||||
|
||||
pDC->pTileMgr->markTileComplete(tileID);
|
||||
|
||||
// Optimization: If the draw is complete and we're the last one to have worked on it then
|
||||
// we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
|
||||
if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
|
||||
{
|
||||
// We can increment the current BE and safely move to next draw since we know this draw is complete.
|
||||
curDrawBE++;
|
||||
CompleteDrawContext(pContext, pDC);
|
||||
|
||||
lastRetiredDraw++;
|
||||
|
||||
lockedTiles.clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
|
||||
lockedTiles.insert(tileID);
|
||||
}
|
||||
pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
|
||||
}
|
||||
|
||||
while ((pWork = tile.peek()) != nullptr)
|
||||
{
|
||||
pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
|
||||
tile.dequeue();
|
||||
}
|
||||
RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
|
||||
|
||||
_ReadWriteBarrier();
|
||||
|
||||
pDC->pTileMgr->markTileComplete(tileID);
|
||||
|
||||
// Optimization: If the draw is complete and we're the last one to have worked on it then
|
||||
// we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
|
||||
if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
|
||||
{
|
||||
// We can increment the current BE and safely move to next draw since we know this draw is complete.
|
||||
curDrawBE++;
|
||||
CompleteDrawContext(pContext, pDC);
|
||||
|
||||
lastRetiredDraw++;
|
||||
|
||||
lockedTiles.clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
|
||||
lockedTiles.insert(tileID);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode)
|
||||
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
|
||||
{
|
||||
// Try to grab the next DC from the ring
|
||||
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
|
||||
|
|
@ -547,7 +559,8 @@ DWORD workerThreadMain(LPVOID pData)
|
|||
|
||||
RDTSC_INIT(threadId);
|
||||
|
||||
int numaNode = (int)pThreadData->numaId;
|
||||
uint32_t numaNode = pThreadData->numaId;
|
||||
uint32_t numaMask = pContext->threadPool.numaMask;
|
||||
|
||||
// flush denormals to 0
|
||||
_mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
|
||||
|
|
@ -619,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData)
|
|||
}
|
||||
|
||||
RDTSC_START(WorkerWorkOnFifoBE);
|
||||
WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
|
||||
WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
|
||||
RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
|
||||
|
||||
WorkOnCompute(pContext, workerId, curDrawBE);
|
||||
|
|
@ -740,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
|||
|
||||
pPool->inThreadShutdown = false;
|
||||
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
|
||||
pPool->numaMask = 0;
|
||||
|
||||
if (KNOB_MAX_WORKER_THREADS)
|
||||
{
|
||||
|
|
@ -760,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
|||
}
|
||||
else
|
||||
{
|
||||
pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
|
||||
|
||||
uint32_t workerId = 0;
|
||||
for (uint32_t n = 0; n < numNodes; ++n)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ struct THREAD_POOL
|
|||
{
|
||||
THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
|
||||
uint32_t numThreads;
|
||||
uint32_t numaMask;
|
||||
volatile bool inThreadShutdown;
|
||||
THREAD_DATA *pThreadData;
|
||||
};
|
||||
|
|
@ -61,7 +62,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
|
|||
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
|
||||
|
||||
// Expose FE and BE worker functions to the API thread if single threaded
|
||||
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
|
||||
void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
|
||||
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
|
||||
void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
|
||||
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
|
||||
int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
|
||||
|
|
@ -119,7 +119,8 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
|
|||
if (create)
|
||||
{
|
||||
uint32_t size = numSamples * mHotTileSize[attachment];
|
||||
hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
|
||||
uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
|
||||
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
|
||||
hotTile.state = HOTTILE_INVALID;
|
||||
hotTile.numSamples = numSamples;
|
||||
hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
|
||||
|
|
@ -139,10 +140,11 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
|
|||
SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
|
||||
(hotTile.state == HOTTILE_RESOLVED) ||
|
||||
(hotTile.state == HOTTILE_CLEAR));
|
||||
_aligned_free(hotTile.pBuffer);
|
||||
FreeHotTileMem(hotTile.pBuffer);
|
||||
|
||||
uint32_t size = numSamples * mHotTileSize[attachment];
|
||||
hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
|
||||
uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
|
||||
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
|
||||
hotTile.state = HOTTILE_INVALID;
|
||||
hotTile.numSamples = numSamples;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -291,11 +291,7 @@ public:
|
|||
{
|
||||
for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
|
||||
{
|
||||
if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
|
||||
{
|
||||
_aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
|
||||
mHotTiles[x][y].Attachment[a].pBuffer = NULL;
|
||||
}
|
||||
FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -315,5 +311,30 @@ public:
|
|||
private:
|
||||
HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
|
||||
uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
|
||||
|
||||
void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
|
||||
{
|
||||
void* p = nullptr;
|
||||
#if defined(_WIN32)
|
||||
HANDLE hProcess = GetCurrentProcess();
|
||||
p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
|
||||
#else
|
||||
p = _aligned_malloc(size, align);
|
||||
#endif
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
void FreeHotTileMem(void* pBuffer)
|
||||
{
|
||||
if (pBuffer)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
VirtualFree(pBuffer, 0, MEM_RELEASE);
|
||||
#else
|
||||
_aligned_free(pBuffer);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue