diff --git a/Content/Editor/DebugMaterials/DDGIDebugProbes.flax b/Content/Editor/DebugMaterials/DDGIDebugProbes.flax index 4289244c8..39677f815 100644 --- a/Content/Editor/DebugMaterials/DDGIDebugProbes.flax +++ b/Content/Editor/DebugMaterials/DDGIDebugProbes.flax @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:740621fb235edae990ffa259a833b12001eb5027bc6036af0aa34ebca4bcec64 +oid sha256:d317dc7b2fc2700b28e4a2581c567b888ea1ebb62c5da84f826d9b29c510ff17 size 40805 diff --git a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp index 9b648bd51..2a50fbfd8 100644 --- a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp +++ b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp @@ -95,6 +95,7 @@ void GPUContextDX11::FrameBegin() GPUContext::FrameBegin(); // Setup + _flushOnDispatch = false; _omDirtyFlag = false; _uaDirtyFlag = false; _cbDirtyFlag = false; @@ -497,50 +498,19 @@ void GPUContextDX11::UpdateCB(GPUConstantBuffer* cb, const void* data) void GPUContextDX11::Dispatch(GPUShaderProgramCS* shader, uint32 threadGroupCountX, uint32 threadGroupCountY, uint32 threadGroupCountZ) { - CurrentCS = (GPUShaderProgramCSDX11*)shader; - - // Flush - flushCBs(); - flushSRVs(); - flushUAVs(); - flushOM(); - - // Dispatch - auto compute = (ID3D11ComputeShader*)shader->GetBufferHandle(); - if (_currentCompute != compute) - { - _currentCompute = compute; - _context->CSSetShader(compute, nullptr, 0); - } + onDispatch(shader); _context->Dispatch(threadGroupCountX, threadGroupCountY, threadGroupCountZ); RENDER_STAT_DISPATCH_CALL(); - CurrentCS = nullptr; } void GPUContextDX11::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* bufferForArgs, uint32 offsetForArgs) { ASSERT(bufferForArgs && EnumHasAnyFlags(bufferForArgs->GetFlags(), GPUBufferFlags::Argument)); - CurrentCS = (GPUShaderProgramCSDX11*)shader; - auto bufferForArgsDX11 = (GPUBufferDX11*)bufferForArgs; - - // Flush - flushCBs(); - flushSRVs(); - flushUAVs(); - flushOM(); - - // Dispatch - auto compute = (ID3D11ComputeShader*)shader->GetBufferHandle(); - if (_currentCompute != compute) - { - _currentCompute = compute; - _context->CSSetShader(compute, nullptr, 0); - } + onDispatch(shader); _context->DispatchIndirect(bufferForArgsDX11->GetBuffer(), offsetForArgs); RENDER_STAT_DISPATCH_CALL(); - CurrentCS = nullptr; } @@ -921,6 +891,7 @@ void GPUContextDX11::OverlapUA(bool end) NvAPI_D3D11_EndUAVOverlap(_context); else NvAPI_D3D11_BeginUAVOverlap(_context); + _flushOnDispatch |= end; return; } #endif @@ -931,6 +902,7 @@ void GPUContextDX11::OverlapUA(bool end) agsDriverExtensionsDX11_EndUAVOverlap(AgsContext, _context); else agsDriverExtensionsDX11_BeginUAVOverlap(AgsContext, _context); + _flushOnDispatch |= end; return; } #endif @@ -1046,6 +1018,7 @@ void GPUContextDX11::flushIA() void GPUContextDX11::onDrawCall() { + _flushOnDispatch = false; flushCBs(); flushSRVs(); flushUAVs(); @@ -1053,4 +1026,27 @@ void GPUContextDX11::onDrawCall() flushOM(); } +void GPUContextDX11::onDispatch(GPUShaderProgramCS* shader) +{ + CurrentCS = (GPUShaderProgramCSDX11*)shader; + + flushCBs(); + flushSRVs(); + flushUAVs(); + flushOM(); + + if (_flushOnDispatch) + { + _flushOnDispatch = false; + _context->Flush(); + } + + auto compute = (ID3D11ComputeShader*)shader->GetBufferHandle(); + if (_currentCompute != compute) + { + _currentCompute = compute; + _context->CSSetShader(compute, nullptr, 0); + } +} + #endif diff --git a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.h b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.h index ccdac0d70..7dc693019 100644 --- a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.h +++ b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.h @@ -30,6 +30,7 @@ private: byte _tracyZone[TracyD3D11ZoneSize]; #endif int32 _maxUASlots; + bool _flushOnDispatch; // Output Merger bool _omDirtyFlag; @@ -111,6 +112,7 @@ private: void flushOM(); void flushIA(); void onDrawCall(); + void onDispatch(GPUShaderProgramCS* shader); public: diff --git a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUBufferDX12.cpp b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUBufferDX12.cpp index 673146148..bb1eaacdb 100644 --- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUBufferDX12.cpp +++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUBufferDX12.cpp @@ -38,7 +38,7 @@ void* GPUBufferDX12::Map(GPUResourceMapMode mode) { D3D12_RANGE readRange; D3D12_RANGE* readRangePtr; - switch (mode) + switch (mode & GPUResourceMapMode::ReadWrite) { case GPUResourceMapMode::Read: readRangePtr = nullptr; diff --git a/Source/Engine/Platform/Win32/Win32ConditionVariable.h b/Source/Engine/Platform/Win32/Win32ConditionVariable.h index fa9c04ce4..5b8c23c05 100644 --- a/Source/Engine/Platform/Win32/Win32ConditionVariable.h +++ b/Source/Engine/Platform/Win32/Win32ConditionVariable.h @@ -26,7 +26,7 @@ public: /// /// Initializes a new instance of the class. /// - Win32ConditionVariable() + __forceinline Win32ConditionVariable() { Windows::InitializeConditionVariable(&_cond); } @@ -44,7 +44,7 @@ public: /// Blocks the current thread execution until the condition variable is woken up. /// /// The critical section locked by the current thread. - void Wait(const Win32CriticalSection& lock) + __forceinline void Wait(const Win32CriticalSection& lock) { Windows::SleepConditionVariableCS(&_cond, &lock._criticalSection, 0xFFFFFFFF); } @@ -55,7 +55,7 @@ public: /// The critical section locked by the current thread. /// The time-out interval, in milliseconds. If the time-out interval elapses, the function re-acquires the critical section and returns zero. If timeout is zero, the function tests the states of the specified objects and returns immediately. If timeout is INFINITE, the function's time-out interval never elapses. /// If the function succeeds, the return value is true, otherwise, if the function fails or the time-out interval elapses, the return value is false. - bool Wait(const Win32CriticalSection& lock, const int32 timeout) + __forceinline bool Wait(const Win32CriticalSection& lock, const int32 timeout) { return !!Windows::SleepConditionVariableCS(&_cond, &lock._criticalSection, timeout); } @@ -63,7 +63,7 @@ public: /// /// Notifies one waiting thread. /// - void NotifyOne() + __forceinline void NotifyOne() { Windows::WakeConditionVariable(&_cond); } @@ -71,7 +71,7 @@ public: /// /// Notifies all waiting threads. /// - void NotifyAll() + __forceinline void NotifyAll() { Windows::WakeAllConditionVariable(&_cond); } diff --git a/Source/Engine/Platform/Win32/Win32CriticalSection.h b/Source/Engine/Platform/Win32/Win32CriticalSection.h index e6375c3fc..748840530 100644 --- a/Source/Engine/Platform/Win32/Win32CriticalSection.h +++ b/Source/Engine/Platform/Win32/Win32CriticalSection.h @@ -26,7 +26,7 @@ public: /// /// Initializes a new instance of the class. /// - Win32CriticalSection() + __forceinline Win32CriticalSection() { Windows::InitializeCriticalSectionEx(&_criticalSection, 4000, 0x01000000); } @@ -34,7 +34,7 @@ public: /// /// Finalizes an instance of the class. /// - ~Win32CriticalSection() + __forceinline ~Win32CriticalSection() { Windows::DeleteCriticalSection(&_criticalSection); } @@ -43,7 +43,7 @@ public: /// /// Locks the critical section. /// - void Lock() const + __forceinline void Lock() const { Windows::EnterCriticalSection(&_criticalSection); } @@ -52,7 +52,7 @@ public: /// Attempts to enter a critical section without blocking. If the call is successful, the calling thread takes ownership of the critical section. /// /// True if calling thread took ownership of the critical section. - bool TryLock() const + __forceinline bool TryLock() const { return Windows::TryEnterCriticalSection(&_criticalSection) != 0; } @@ -60,7 +60,7 @@ public: /// /// Releases the lock on the critical section. /// - void Unlock() const + __forceinline void Unlock() const { Windows::LeaveCriticalSection(&_criticalSection); } diff --git a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp index 1240f148c..e4ed7d74f 100644 --- a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp +++ b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp @@ -924,12 +924,6 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex } } -#if PLATFORM_WINDOWS - // Hack to fix D3D11 bug that doesn't insert UAV barrier after overlap region ends (between two GPUComputePass) - if (context->GetDevice()->GetRendererType() == RendererType::DirectX11) - context->Dispatch(_csRasterizeModel0, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); -#endif - // Rasterize non-empty chunks (additive layers so need combine with existing chunk data) for (uint32 layer = 0; layer <= maxLayer; layer++) {