Various optimizations

This commit is contained in:
Wojtek Figat
2025-08-06 18:48:18 +02:00
parent a5838f739d
commit cf9c203855
12 changed files with 22 additions and 16 deletions

BIN
Content/Shaders/BitonicSort.flax (Stored with Git LFS)

Binary file not shown.

View File

@@ -632,6 +632,7 @@ Variant::Variant(ScriptingObject* v)
AsObject = v;
if (v)
{
// TODO: optimize VariantType to support a statically linked typename of ScriptingType (via a 1-bit flag within the Types enum; game builds only, as the editor might hot-reload types)
Type.SetTypeName(v->GetType().Fullname);
v->Deleted.Bind<Variant, &Variant::OnObjectDeleted>(this);
}
@@ -643,6 +644,7 @@ Variant::Variant(Asset* v)
AsAsset = v;
if (v)
{
// TODO: optimize VariantType to support a statically linked typename of ScriptingType (via a 1-bit flag within the Types enum; game builds only, as the editor might hot-reload types)
Type.SetTypeName(v->GetType().Fullname);
v->AddReference();
v->OnUnloaded.Bind<Variant, &Variant::OnAssetUnloaded>(this);

View File

@@ -334,11 +334,12 @@ public:
/// Creates argument buffer description.
/// </summary>
/// <param name="size">The size (in bytes).</param>
/// <param name="additionalFlags">The additional buffer flags combined with <see cref="GPUBufferFlags::Argument" /> (for example, pass <see cref="GPUBufferFlags::UnorderedAccess" /> to use the buffer as UAV).</param>
/// <param name="usage">The usage.</param>
/// <returns>The buffer description.</returns>
static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default)
static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default, GPUBufferFlags additionalFlags = GPUBufferFlags::None)
{
return Buffer(size, GPUBufferFlags::Argument, PixelFormat::Unknown, nullptr, 0, usage);
return Buffer(size, GPUBufferFlags::Argument | additionalFlags, PixelFormat::R32_UInt, nullptr, sizeof(uint32), usage);
}
/// <summary>

View File

@@ -20,6 +20,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
GPUResource* resource = view ? view->GetParent() : nullptr;
const Char* resourceType = TEXT("resource");
const Char* flagType = TEXT("flags");
StringView resourceName;
if (resource)
{
switch (resource->GetResourceType())
@@ -36,6 +37,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
flagType = TEXT("GPUBufferFlags");
break;
}
resourceName = resource->GetName();
}
const Char* usage = TEXT("-");
switch (bindPoint)
@@ -53,7 +55,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
usage = TEXT("render target");
break;
}
LOG(Error, "Incorrect {} bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, slot, usage, flagType);
LOG(Error, "Incorrect {} '{}' bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, resourceName, slot, usage, flagType);
}
#endif

View File

@@ -359,7 +359,8 @@ bool GPUDeviceDX12::Init()
// Debug Layer
#if GPU_ENABLE_DIAGNOSTICS
ComPtr<ID3D12InfoQueue> infoQueue;
VALIDATE_DIRECTX_CALL(_device->QueryInterface(IID_PPV_ARGS(&infoQueue)));
HRESULT result = _device->QueryInterface(IID_PPV_ARGS(&infoQueue));
LOG_DIRECTX_RESULT(result);
if (infoQueue)
{
D3D12_INFO_QUEUE_FILTER filter;

View File

@@ -1358,7 +1358,7 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32
// Use direct update for small buffers
const uint32 alignedSize = Math::AlignUp<uint32>(size, 4);
if (size <= 16 * 1024 && alignedSize <= buffer->GetSize())
if (size <= 4 * 1024 && alignedSize <= buffer->GetSize())
{
//AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
//FlushBarriers();

View File

@@ -12,6 +12,7 @@
#include "Engine/Graphics/GPUContext.h"
#include "Engine/Graphics/Shaders/GPUShader.h"
#include "Engine/Graphics/Shaders/GPUConstantBuffer.h"
#include "Engine/Profiler/Profiler.h"
GPU_CB_STRUCT(GPUParticlesData {
Matrix ViewProjectionMatrix;
@@ -131,6 +132,8 @@ void GPUParticles::CopyParticlesCount(GPUContext* context, ParticleEmitter* emit
void GPUParticles::Execute(GPUContext* context, ParticleEmitter* emitter, ParticleEffect* effect, int32 emitterIndex, ParticleEmitterInstance& data)
{
PROFILE_CPU_ASSET(emitter);
PROFILE_GPU("GPUParticles");
ASSERT(emitter->Graph.Version == data.Version);
ASSERT(emitter->Graph.Version == data.Buffer->Version);
uint32 counterDefaultValue = 0;

View File

@@ -770,7 +770,6 @@ void DrawEmitterGPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa
context->BindCB(0, GPUParticlesSortingCB);
context->BindSR(0, buffer->GPU.Buffer->View());
context->BindUA(0, buffer->GPU.SortingKeysBuffer->View());
// TODO: optimize it by using DispatchIndirect with shared invoke args generated after particles update
const int32 threadGroupSize = 1024;
context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);

View File

@@ -264,6 +264,7 @@ void RenderList::AddDelayedDraw(DelayedDraw&& func)
void RenderList::DrainDelayedDraws(RenderContext& renderContext)
{
PROFILE_GPU_CPU_NAMED("DelayedDraws");
for (DelayedDraw& e : _delayedDraws)
e(renderContext);
_delayedDraws.SetCapacity(0);

View File

@@ -6,8 +6,6 @@
#include "Engine/Graphics/GPUContext.h"
#include "Engine/Graphics/GPULimits.h"
#define INDIRECT_ARGS_STRIDE 12
// The sorting keys buffer item structure template. Matches the shader type.
struct Item
{
@@ -39,7 +37,7 @@ bool BitonicSort::Init()
// Create indirect dispatch arguments buffer
_dispatchArgsBuffer = GPUDevice::Instance->CreateBuffer(TEXT("BitonicSortDispatchArgs"));
if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * INDIRECT_ARGS_STRIDE, GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess)))
if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * sizeof(GPUDispatchIndirectArgs), GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess)))
return true;
// Load asset
@@ -122,7 +120,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
// We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096
// For really large values of k, these indirect dispatches will be skipped over with thread counts of 0
uint32 indirectArgsOffset = INDIRECT_ARGS_STRIDE;
uint32 indirectArgsOffset = sizeof(GPUDispatchIndirectArgs);
for (uint32 k = 4096; k <= alignedMaxNumElements; k *= 2)
{
for (uint32 j = k / 2; j >= 2048; j /= 2)
@@ -133,11 +131,11 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
context->BindCB(0, _cb);
context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
indirectArgsOffset += INDIRECT_ARGS_STRIDE;
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
}
context->DispatchIndirect(_innerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
indirectArgsOffset += INDIRECT_ARGS_STRIDE;
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
}
context->ResetUA();

View File

@@ -68,6 +68,7 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex)
uint offset = 12 * prevDispatches;
// Generate outer sort dispatch arguments
UNROLL
for (uint j = k / 2; j > 1024; j /= 2)
{
// All of the groups of size 2j that are full

View File

@@ -51,8 +51,6 @@ void CS_Sort(uint3 dispatchThreadId : SV_DispatchThreadID)
if (index >= particlesCount)
return;
// TODO: maybe process more than 1 particle at once and pre-sort them?
#if SORT_MODE == 0
// Sort particles by depth to the view's near plane