Various optimizations
This commit is contained in:
BIN
Content/Shaders/BitonicSort.flax
(Stored with Git LFS)
BIN
Content/Shaders/BitonicSort.flax
(Stored with Git LFS)
Binary file not shown.
@@ -632,6 +632,7 @@ Variant::Variant(ScriptingObject* v)
|
||||
AsObject = v;
|
||||
if (v)
|
||||
{
|
||||
// TODO: optimize VariantType to support a statically linked typename of ScriptingType (via a 1-bit flag within the Types enum; only in game, as the editor might hot-reload types)
|
||||
Type.SetTypeName(v->GetType().Fullname);
|
||||
v->Deleted.Bind<Variant, &Variant::OnObjectDeleted>(this);
|
||||
}
|
||||
@@ -643,6 +644,7 @@ Variant::Variant(Asset* v)
|
||||
AsAsset = v;
|
||||
if (v)
|
||||
{
|
||||
// TODO: optimize VariantType to support a statically linked typename of ScriptingType (via a 1-bit flag within the Types enum; only in game, as the editor might hot-reload types)
|
||||
Type.SetTypeName(v->GetType().Fullname);
|
||||
v->AddReference();
|
||||
v->OnUnloaded.Bind<Variant, &Variant::OnAssetUnloaded>(this);
|
||||
|
||||
@@ -334,11 +334,12 @@ public:
|
||||
/// Creates argument buffer description.
|
||||
/// </summary>
|
||||
/// <param name="size">The size (in bytes).</param>
|
||||
/// <param name="usage">The usage.</param>
|
||||
/// <param name="additionalFlags">The additional bindings (for example, to use as UAV, pass <see cref="GPUBufferFlags::UnorderedAccess" />).</param>
|
||||
/// <returns>The buffer description.</returns>
|
||||
static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default)
|
||||
static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default, GPUBufferFlags additionalFlags = GPUBufferFlags::None)
|
||||
{
|
||||
return Buffer(size, GPUBufferFlags::Argument, PixelFormat::Unknown, nullptr, 0, usage);
|
||||
return Buffer(size, GPUBufferFlags::Argument | additionalFlags, PixelFormat::R32_UInt, nullptr, sizeof(uint32), usage);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -20,6 +20,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
|
||||
GPUResource* resource = view ? view->GetParent() : nullptr;
|
||||
const Char* resourceType = TEXT("resource");
|
||||
const Char* flagType = TEXT("flags");
|
||||
StringView resourceName;
|
||||
if (resource)
|
||||
{
|
||||
switch (resource->GetResourceType())
|
||||
@@ -36,6 +37,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
|
||||
flagType = TEXT("GPUBufferFlags");
|
||||
break;
|
||||
}
|
||||
resourceName = resource->GetName();
|
||||
}
|
||||
const Char* usage = TEXT("-");
|
||||
switch (bindPoint)
|
||||
@@ -53,7 +55,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
|
||||
usage = TEXT("render target");
|
||||
break;
|
||||
}
|
||||
LOG(Error, "Incorrect {} bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, slot, usage, flagType);
|
||||
LOG(Error, "Incorrect {} '{}' bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, resourceName, slot, usage, flagType);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -359,7 +359,8 @@ bool GPUDeviceDX12::Init()
|
||||
// Debug Layer
|
||||
#if GPU_ENABLE_DIAGNOSTICS
|
||||
ComPtr<ID3D12InfoQueue> infoQueue;
|
||||
VALIDATE_DIRECTX_CALL(_device->QueryInterface(IID_PPV_ARGS(&infoQueue)));
|
||||
HRESULT result = _device->QueryInterface(IID_PPV_ARGS(&infoQueue));
|
||||
LOG_DIRECTX_RESULT(result);
|
||||
if (infoQueue)
|
||||
{
|
||||
D3D12_INFO_QUEUE_FILTER filter;
|
||||
|
||||
@@ -1358,7 +1358,7 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32
|
||||
|
||||
// Use direct update for small buffers
|
||||
const uint32 alignedSize = Math::AlignUp<uint32>(size, 4);
|
||||
if (size <= 16 * 1024 && alignedSize <= buffer->GetSize())
|
||||
if (size <= 4 * 1024 && alignedSize <= buffer->GetSize())
|
||||
{
|
||||
//AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
|
||||
//FlushBarriers();
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "Engine/Graphics/GPUContext.h"
|
||||
#include "Engine/Graphics/Shaders/GPUShader.h"
|
||||
#include "Engine/Graphics/Shaders/GPUConstantBuffer.h"
|
||||
#include "Engine/Profiler/Profiler.h"
|
||||
|
||||
GPU_CB_STRUCT(GPUParticlesData {
|
||||
Matrix ViewProjectionMatrix;
|
||||
@@ -131,6 +132,8 @@ void GPUParticles::CopyParticlesCount(GPUContext* context, ParticleEmitter* emit
|
||||
|
||||
void GPUParticles::Execute(GPUContext* context, ParticleEmitter* emitter, ParticleEffect* effect, int32 emitterIndex, ParticleEmitterInstance& data)
|
||||
{
|
||||
PROFILE_CPU_ASSET(emitter);
|
||||
PROFILE_GPU("GPUParticles");
|
||||
ASSERT(emitter->Graph.Version == data.Version);
|
||||
ASSERT(emitter->Graph.Version == data.Buffer->Version);
|
||||
uint32 counterDefaultValue = 0;
|
||||
|
||||
@@ -770,7 +770,6 @@ void DrawEmitterGPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa
|
||||
context->BindCB(0, GPUParticlesSortingCB);
|
||||
context->BindSR(0, buffer->GPU.Buffer->View());
|
||||
context->BindUA(0, buffer->GPU.SortingKeysBuffer->View());
|
||||
// TODO: optimize it by using DispatchIndirect with shared invoke args generated after particles update
|
||||
const int32 threadGroupSize = 1024;
|
||||
context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
|
||||
|
||||
|
||||
@@ -264,6 +264,7 @@ void RenderList::AddDelayedDraw(DelayedDraw&& func)
|
||||
|
||||
void RenderList::DrainDelayedDraws(RenderContext& renderContext)
|
||||
{
|
||||
PROFILE_GPU_CPU_NAMED("DelayedDraws");
|
||||
for (DelayedDraw& e : _delayedDraws)
|
||||
e(renderContext);
|
||||
_delayedDraws.SetCapacity(0);
|
||||
|
||||
@@ -6,8 +6,6 @@
|
||||
#include "Engine/Graphics/GPUContext.h"
|
||||
#include "Engine/Graphics/GPULimits.h"
|
||||
|
||||
#define INDIRECT_ARGS_STRIDE 12
|
||||
|
||||
// The sorting keys buffer item structure template. Matches the shader type.
|
||||
struct Item
|
||||
{
|
||||
@@ -39,7 +37,7 @@ bool BitonicSort::Init()
|
||||
|
||||
// Create indirect dispatch arguments buffer
|
||||
_dispatchArgsBuffer = GPUDevice::Instance->CreateBuffer(TEXT("BitonicSortDispatchArgs"));
|
||||
if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * INDIRECT_ARGS_STRIDE, GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess)))
|
||||
if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * sizeof(GPUDispatchIndirectArgs), GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess)))
|
||||
return true;
|
||||
|
||||
// Load asset
|
||||
@@ -122,7 +120,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
||||
|
||||
// We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096
|
||||
// For really large values of k, these indirect dispatches will be skipped over with thread counts of 0
|
||||
uint32 indirectArgsOffset = INDIRECT_ARGS_STRIDE;
|
||||
uint32 indirectArgsOffset = sizeof(GPUDispatchIndirectArgs);
|
||||
for (uint32 k = 4096; k <= alignedMaxNumElements; k *= 2)
|
||||
{
|
||||
for (uint32 j = k / 2; j >= 2048; j /= 2)
|
||||
@@ -133,11 +131,11 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
||||
context->BindCB(0, _cb);
|
||||
|
||||
context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
|
||||
indirectArgsOffset += INDIRECT_ARGS_STRIDE;
|
||||
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
|
||||
}
|
||||
|
||||
context->DispatchIndirect(_innerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
|
||||
indirectArgsOffset += INDIRECT_ARGS_STRIDE;
|
||||
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
|
||||
}
|
||||
|
||||
context->ResetUA();
|
||||
|
||||
@@ -68,6 +68,7 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex)
|
||||
uint offset = 12 * prevDispatches;
|
||||
|
||||
// Generate outer sort dispatch arguments
|
||||
UNROLL
|
||||
for (uint j = k / 2; j > 1024; j /= 2)
|
||||
{
|
||||
// All of the groups of size 2j that are full
|
||||
|
||||
@@ -51,8 +51,6 @@ void CS_Sort(uint3 dispatchThreadId : SV_DispatchThreadID)
|
||||
if (index >= particlesCount)
|
||||
return;
|
||||
|
||||
// TODO: maybe process more than 1 particle at once and pre-sort them?
|
||||
|
||||
#if SORT_MODE == 0
|
||||
|
||||
// Sort particles by depth to the view's near plane
|
||||
|
||||
Reference in New Issue
Block a user