Various optimizations
This commit is contained in:
BIN
Content/Shaders/BitonicSort.flax
(Stored with Git LFS)
BIN
Content/Shaders/BitonicSort.flax
(Stored with Git LFS)
Binary file not shown.
@@ -632,6 +632,7 @@ Variant::Variant(ScriptingObject* v)
|
|||||||
AsObject = v;
|
AsObject = v;
|
||||||
if (v)
|
if (v)
|
||||||
{
|
{
|
||||||
|
// TODO: optimize VariantType to support statically linked typename of ScriptingType (via 1 bit flag within Types enum, only in game as editor might hot-reload types)
|
||||||
Type.SetTypeName(v->GetType().Fullname);
|
Type.SetTypeName(v->GetType().Fullname);
|
||||||
v->Deleted.Bind<Variant, &Variant::OnObjectDeleted>(this);
|
v->Deleted.Bind<Variant, &Variant::OnObjectDeleted>(this);
|
||||||
}
|
}
|
||||||
@@ -643,6 +644,7 @@ Variant::Variant(Asset* v)
|
|||||||
AsAsset = v;
|
AsAsset = v;
|
||||||
if (v)
|
if (v)
|
||||||
{
|
{
|
||||||
|
// TODO: optimize VariantType to support statically linked typename of ScriptingType (via 1 bit flag within Types enum, only in game as editor might hot-reload types)
|
||||||
Type.SetTypeName(v->GetType().Fullname);
|
Type.SetTypeName(v->GetType().Fullname);
|
||||||
v->AddReference();
|
v->AddReference();
|
||||||
v->OnUnloaded.Bind<Variant, &Variant::OnAssetUnloaded>(this);
|
v->OnUnloaded.Bind<Variant, &Variant::OnAssetUnloaded>(this);
|
||||||
|
|||||||
@@ -334,11 +334,12 @@ public:
|
|||||||
/// Creates argument buffer description.
|
/// Creates argument buffer description.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="size">The size (in bytes).</param>
|
/// <param name="size">The size (in bytes).</param>
|
||||||
|
/// <param name="additionalFlags">The additional bindings (for example, to use as UAV, pass <see cref="GPUBufferFlags::UnorderedAccess" />).</param>
|
||||||
/// <param name="usage">The usage.</param>
|
/// <param name="usage">The usage.</param>
|
||||||
/// <returns>The buffer description.</returns>
|
/// <returns>The buffer description.</returns>
|
||||||
static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default)
|
static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default, GPUBufferFlags additionalFlags = GPUBufferFlags::None)
|
||||||
{
|
{
|
||||||
return Buffer(size, GPUBufferFlags::Argument, PixelFormat::Unknown, nullptr, 0, usage);
|
return Buffer(size, GPUBufferFlags::Argument | additionalFlags, PixelFormat::R32_UInt, nullptr, sizeof(uint32), usage);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
|
|||||||
GPUResource* resource = view ? view->GetParent() : nullptr;
|
GPUResource* resource = view ? view->GetParent() : nullptr;
|
||||||
const Char* resourceType = TEXT("resource");
|
const Char* resourceType = TEXT("resource");
|
||||||
const Char* flagType = TEXT("flags");
|
const Char* flagType = TEXT("flags");
|
||||||
|
StringView resourceName;
|
||||||
if (resource)
|
if (resource)
|
||||||
{
|
{
|
||||||
switch (resource->GetResourceType())
|
switch (resource->GetResourceType())
|
||||||
@@ -36,6 +37,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
|
|||||||
flagType = TEXT("GPUBufferFlags");
|
flagType = TEXT("GPUBufferFlags");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
resourceName = resource->GetName();
|
||||||
}
|
}
|
||||||
const Char* usage = TEXT("-");
|
const Char* usage = TEXT("-");
|
||||||
switch (bindPoint)
|
switch (bindPoint)
|
||||||
@@ -53,7 +55,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
|
|||||||
usage = TEXT("render target");
|
usage = TEXT("render target");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
LOG(Error, "Incorrect {} bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, slot, usage, flagType);
|
LOG(Error, "Incorrect {} '{}' bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, resourceName, slot, usage, flagType);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -359,7 +359,8 @@ bool GPUDeviceDX12::Init()
|
|||||||
// Debug Layer
|
// Debug Layer
|
||||||
#if GPU_ENABLE_DIAGNOSTICS
|
#if GPU_ENABLE_DIAGNOSTICS
|
||||||
ComPtr<ID3D12InfoQueue> infoQueue;
|
ComPtr<ID3D12InfoQueue> infoQueue;
|
||||||
VALIDATE_DIRECTX_CALL(_device->QueryInterface(IID_PPV_ARGS(&infoQueue)));
|
HRESULT result = _device->QueryInterface(IID_PPV_ARGS(&infoQueue));
|
||||||
|
LOG_DIRECTX_RESULT(result);
|
||||||
if (infoQueue)
|
if (infoQueue)
|
||||||
{
|
{
|
||||||
D3D12_INFO_QUEUE_FILTER filter;
|
D3D12_INFO_QUEUE_FILTER filter;
|
||||||
|
|||||||
@@ -1358,7 +1358,7 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32
|
|||||||
|
|
||||||
// Use direct update for small buffers
|
// Use direct update for small buffers
|
||||||
const uint32 alignedSize = Math::AlignUp<uint32>(size, 4);
|
const uint32 alignedSize = Math::AlignUp<uint32>(size, 4);
|
||||||
if (size <= 16 * 1024 && alignedSize <= buffer->GetSize())
|
if (size <= 4 * 1024 && alignedSize <= buffer->GetSize())
|
||||||
{
|
{
|
||||||
//AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
|
//AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
|
||||||
//FlushBarriers();
|
//FlushBarriers();
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
#include "Engine/Graphics/GPUContext.h"
|
#include "Engine/Graphics/GPUContext.h"
|
||||||
#include "Engine/Graphics/Shaders/GPUShader.h"
|
#include "Engine/Graphics/Shaders/GPUShader.h"
|
||||||
#include "Engine/Graphics/Shaders/GPUConstantBuffer.h"
|
#include "Engine/Graphics/Shaders/GPUConstantBuffer.h"
|
||||||
|
#include "Engine/Profiler/Profiler.h"
|
||||||
|
|
||||||
GPU_CB_STRUCT(GPUParticlesData {
|
GPU_CB_STRUCT(GPUParticlesData {
|
||||||
Matrix ViewProjectionMatrix;
|
Matrix ViewProjectionMatrix;
|
||||||
@@ -131,6 +132,8 @@ void GPUParticles::CopyParticlesCount(GPUContext* context, ParticleEmitter* emit
|
|||||||
|
|
||||||
void GPUParticles::Execute(GPUContext* context, ParticleEmitter* emitter, ParticleEffect* effect, int32 emitterIndex, ParticleEmitterInstance& data)
|
void GPUParticles::Execute(GPUContext* context, ParticleEmitter* emitter, ParticleEffect* effect, int32 emitterIndex, ParticleEmitterInstance& data)
|
||||||
{
|
{
|
||||||
|
PROFILE_CPU_ASSET(emitter);
|
||||||
|
PROFILE_GPU("GPUParticles");
|
||||||
ASSERT(emitter->Graph.Version == data.Version);
|
ASSERT(emitter->Graph.Version == data.Version);
|
||||||
ASSERT(emitter->Graph.Version == data.Buffer->Version);
|
ASSERT(emitter->Graph.Version == data.Buffer->Version);
|
||||||
uint32 counterDefaultValue = 0;
|
uint32 counterDefaultValue = 0;
|
||||||
|
|||||||
@@ -770,7 +770,6 @@ void DrawEmitterGPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa
|
|||||||
context->BindCB(0, GPUParticlesSortingCB);
|
context->BindCB(0, GPUParticlesSortingCB);
|
||||||
context->BindSR(0, buffer->GPU.Buffer->View());
|
context->BindSR(0, buffer->GPU.Buffer->View());
|
||||||
context->BindUA(0, buffer->GPU.SortingKeysBuffer->View());
|
context->BindUA(0, buffer->GPU.SortingKeysBuffer->View());
|
||||||
// TODO: optimize it by using DispatchIndirect with shared invoke args generated after particles update
|
|
||||||
const int32 threadGroupSize = 1024;
|
const int32 threadGroupSize = 1024;
|
||||||
context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
|
context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
|
||||||
|
|
||||||
|
|||||||
@@ -264,6 +264,7 @@ void RenderList::AddDelayedDraw(DelayedDraw&& func)
|
|||||||
|
|
||||||
void RenderList::DrainDelayedDraws(RenderContext& renderContext)
|
void RenderList::DrainDelayedDraws(RenderContext& renderContext)
|
||||||
{
|
{
|
||||||
|
PROFILE_GPU_CPU_NAMED("DelayedDraws");
|
||||||
for (DelayedDraw& e : _delayedDraws)
|
for (DelayedDraw& e : _delayedDraws)
|
||||||
e(renderContext);
|
e(renderContext);
|
||||||
_delayedDraws.SetCapacity(0);
|
_delayedDraws.SetCapacity(0);
|
||||||
|
|||||||
@@ -6,8 +6,6 @@
|
|||||||
#include "Engine/Graphics/GPUContext.h"
|
#include "Engine/Graphics/GPUContext.h"
|
||||||
#include "Engine/Graphics/GPULimits.h"
|
#include "Engine/Graphics/GPULimits.h"
|
||||||
|
|
||||||
#define INDIRECT_ARGS_STRIDE 12
|
|
||||||
|
|
||||||
// The sorting keys buffer item structure template. Matches the shader type.
|
// The sorting keys buffer item structure template. Matches the shader type.
|
||||||
struct Item
|
struct Item
|
||||||
{
|
{
|
||||||
@@ -39,7 +37,7 @@ bool BitonicSort::Init()
|
|||||||
|
|
||||||
// Create indirect dispatch arguments buffer
|
// Create indirect dispatch arguments buffer
|
||||||
_dispatchArgsBuffer = GPUDevice::Instance->CreateBuffer(TEXT("BitonicSortDispatchArgs"));
|
_dispatchArgsBuffer = GPUDevice::Instance->CreateBuffer(TEXT("BitonicSortDispatchArgs"));
|
||||||
if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * INDIRECT_ARGS_STRIDE, GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess)))
|
if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * sizeof(GPUDispatchIndirectArgs), GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess)))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
// Load asset
|
// Load asset
|
||||||
@@ -122,7 +120,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
|||||||
|
|
||||||
// We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096
|
// We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096
|
||||||
// For really large values of k, these indirect dispatches will be skipped over with thread counts of 0
|
// For really large values of k, these indirect dispatches will be skipped over with thread counts of 0
|
||||||
uint32 indirectArgsOffset = INDIRECT_ARGS_STRIDE;
|
uint32 indirectArgsOffset = sizeof(GPUDispatchIndirectArgs);
|
||||||
for (uint32 k = 4096; k <= alignedMaxNumElements; k *= 2)
|
for (uint32 k = 4096; k <= alignedMaxNumElements; k *= 2)
|
||||||
{
|
{
|
||||||
for (uint32 j = k / 2; j >= 2048; j /= 2)
|
for (uint32 j = k / 2; j >= 2048; j /= 2)
|
||||||
@@ -133,11 +131,11 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
|||||||
context->BindCB(0, _cb);
|
context->BindCB(0, _cb);
|
||||||
|
|
||||||
context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
|
context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
|
||||||
indirectArgsOffset += INDIRECT_ARGS_STRIDE;
|
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
|
||||||
}
|
}
|
||||||
|
|
||||||
context->DispatchIndirect(_innerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
|
context->DispatchIndirect(_innerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
|
||||||
indirectArgsOffset += INDIRECT_ARGS_STRIDE;
|
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
|
||||||
}
|
}
|
||||||
|
|
||||||
context->ResetUA();
|
context->ResetUA();
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex)
|
|||||||
uint offset = 12 * prevDispatches;
|
uint offset = 12 * prevDispatches;
|
||||||
|
|
||||||
// Generate outer sort dispatch arguments
|
// Generate outer sort dispatch arguments
|
||||||
|
UNROLL
|
||||||
for (uint j = k / 2; j > 1024; j /= 2)
|
for (uint j = k / 2; j > 1024; j /= 2)
|
||||||
{
|
{
|
||||||
// All of the groups of size 2j that are full
|
// All of the groups of size 2j that are full
|
||||||
|
|||||||
@@ -51,8 +51,6 @@ void CS_Sort(uint3 dispatchThreadId : SV_DispatchThreadID)
|
|||||||
if (index >= particlesCount)
|
if (index >= particlesCount)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// TODO: maybe process more than 1 particle at once and pre-sort them?
|
|
||||||
|
|
||||||
#if SORT_MODE == 0
|
#if SORT_MODE == 0
|
||||||
|
|
||||||
// Sort particles by depth to the view's near plane
|
// Sort particles by depth to the view's near plane
|
||||||
|
|||||||
Reference in New Issue
Block a user