Skip to content

Commit 1535561

Browse files
committed
added persistent workgroup toggle
1 parent f823771 commit 1535561

File tree

3 files changed

+123
-8
lines changed

3 files changed

+123
-8
lines changed

31_HLSLPathTracer/app_resources/glsl/common.glsl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ vec2 getTexCoords() {
3535
#include <nbl/builtin/glsl/limits/numeric.glsl>
3636
#include <nbl/builtin/glsl/math/constants.glsl>
3737
#include <nbl/builtin/glsl/utils/common.glsl>
38+
#ifdef PERSISTENT_WORKGROUPS
39+
#include <nbl/builtin/glsl/utils/morton.glsl>
40+
#endif
3841

3942
#include <nbl/builtin/glsl/sampling/box_muller_transform.glsl>
4043

@@ -688,19 +691,37 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
688691
void main()
689692
{
690693
const ivec2 imageExtents = imageSize(outImage);
694+
695+
// Persistent-workgroup path: instead of handling a single pixel, each invocation
// iterates over "virtual" thread indices. A workgroup starts at
// gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_ and advances by the total number of
// launched invocations (gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) per iteration;
// every index is turned into pixel coordinates with a 2D Morton decode.
// The loop body opened here is closed by the PERSISTENT_WORKGROUPS block at the end of main().
// Without PERSISTENT_WORKGROUPS the coordinates come from getCoordinates() as before.
//
// NOTE(review): the loop bound is the literal 1920*1080 and does not depend on imageExtents.
// Presumably 1280*720 is too small because consecutive Morton indices fill a
// power-of-two-aligned region rather than a width x height rectangle, so the bound has to
// exceed the largest Morton code of any in-image pixel -- TODO confirm against
// nbl_glsl_morton_decode2d32b and derive the bound from imageExtents instead of a literal.
#ifdef PERSISTENT_WORKGROUPS
696+
uint virtualThreadIndex;
697+
for (uint virtualThreadBase = gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase < 1920*1080; virtualThreadBase += gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) // NOTE(review): hard-coded bound; 1280*720 was observed not to cover the draw surface
698+
{
699+
virtualThreadIndex = virtualThreadBase + gl_LocalInvocationIndex.x;
700+
const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex));
701+
#else
691702
const ivec2 coords = getCoordinates();
703+
#endif
704+
692705
vec2 texCoord = vec2(coords) / vec2(imageExtents);
693706
texCoord.y = 1.0 - texCoord.y;
694707

695708
if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
709+
#ifdef PERSISTENT_WORKGROUPS
710+
continue;
711+
#else
696712
return;
713+
#endif
697714
}
698715

699716
if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
700717
{
701718
vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
702719
imageStore(outImage, coords, pixelCol);
720+
#ifdef PERSISTENT_WORKGROUPS
721+
continue;
722+
#else
703723
return;
724+
#endif
704725
}
705726

706727
nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
@@ -791,6 +812,10 @@ void main()
791812

792813
vec4 pixelCol = vec4(color, 1.0);
793814
imageStore(outImage, coords, pixelCol);
815+
816+
#ifdef PERSISTENT_WORKGROUPS
817+
}
818+
#endif
794819
}
795820
/** TODO: Improving Rendering
796821

31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
33
#include "nbl/builtin/hlsl/random/pcg.hlsl"
44
#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
5+
#ifdef PERSISTENT_WORKGROUPS
6+
#include "nbl/builtin/hlsl/math/morton.hlsl"
7+
#endif
58

69
#include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
710
#include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
@@ -156,19 +159,36 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
156159
{
157160
uint32_t width, height;
158161
outImage.GetDimensions(width, height);
162+
// Persistent-workgroup path: each invocation iterates over "virtual" thread indices.
// A workgroup starts at gl_WorkGroupID().x * WorkgroupSize and advances by
// gl_NumWorkGroups().x * WorkgroupSize per iteration; every index is converted to pixel
// coordinates with math::Morton<uint32_t>::decode2d.
// The loop body opened here is closed by the PERSISTENT_WORKGROUPS block at the end of main().
// Without PERSISTENT_WORKGROUPS the coordinates come from getCoordinates() as before.
//
// NOTE(review): the loop bound is the literal 1920*1080 and ignores the width/height queried
// from outImage above. Presumably 1280*720 is too small because consecutive Morton indices
// fill a power-of-two-aligned region rather than a width x height rectangle -- TODO confirm
// against math::Morton and derive the bound from the image dimensions instead of a literal.
#ifdef PERSISTENT_WORKGROUPS
163+
uint32_t virtualThreadIndex;
164+
[loop]
165+
for (uint32_t virtualThreadBase = glsl::gl_WorkGroupID().x * WorkgroupSize; virtualThreadBase < 1920*1080; virtualThreadBase += glsl::gl_NumWorkGroups().x * WorkgroupSize) // NOTE(review): hard-coded bound; 1280*720 was observed not to cover the draw surface
166+
{
167+
virtualThreadIndex = virtualThreadBase + glsl::gl_LocalInvocationIndex().x;
168+
const int32_t2 coords = (int32_t2)math::Morton<uint32_t>::decode2d(virtualThreadIndex);
169+
#else
159170
const int32_t2 coords = getCoordinates();
171+
#endif
160172
float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
161173
texCoord.y = 1.0 - texCoord.y;
162174

163175
// Skip this pixel unless 0 <= coords < (width, height) on both axes.
// This mirrors the GLSL path's
//   all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords))
// The previous condition closed the parenthesis after the first all(), used a strict
// `0 < coords` and compared `extent < coords`, so it did not reject out-of-image pixels.
// In the persistent-workgroup loop an out-of-range pixel only skips the current iteration;
// otherwise the invocation exits.
if (false == (all((int32_t2)0 <= coords) && all(coords < int32_t2(width, height)))) {
176+
#ifdef PERSISTENT_WORKGROUPS
177+
continue;
178+
#else
164179
return;
180+
#endif
165181
}
166182

167183
if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
168184
{
169185
float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
170186
outImage[coords] = pixelCol;
187+
#ifdef PERSISTENT_WORKGROUPS
188+
continue;
189+
#else
171190
return;
191+
#endif
172192
}
173193

174194
int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
@@ -200,4 +220,8 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
200220
float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
201221
float32_t4 pixCol = float32_t4(color, 1.0);
202222
outImage[coords] = pixCol;
223+
224+
#ifdef PERSISTENT_WORKGROUPS
225+
}
226+
#endif
203227
}

31_HLSLPathTracer/main.cpp

Lines changed: 74 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
323323
m_presentDescriptorSet = presentDSPool->createDescriptorSet(gpuPresentDescriptorSetLayout);
324324

325325
// Create Shaders
326-
auto loadAndCompileGLSLShader = [&](const std::string& pathToShader) -> smart_refctd_ptr<IGPUShader>
326+
auto loadAndCompileGLSLShader = [&](const std::string& pathToShader, bool persistentWorkGroups = false) -> smart_refctd_ptr<IGPUShader>
327327
{
328328
IAssetLoader::SAssetLoadParams lp = {};
329329
lp.workingDirectory = localInputCWD;
@@ -339,6 +339,27 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
339339
// The down-cast should not fail!
340340
assert(source);
341341

342+
auto compiler = make_smart_refctd_ptr<asset::CGLSLCompiler>(smart_refctd_ptr(m_system));
343+
CGLSLCompiler::SOptions options = {};
344+
options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; // should be compute
345+
options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
346+
options.spirvOptimizer = nullptr;
347+
#ifndef _NBL_DEBUG
348+
ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
349+
auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
350+
options.spirvOptimizer = opt.get();
351+
#endif
352+
options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
353+
options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
354+
options.preprocessorOptions.logger = m_logger.get();
355+
options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
356+
357+
const IShaderCompiler::SMacroDefinition persistentDefine = { "PERSISTENT_WORKGROUPS", "1" };
358+
if (persistentWorkGroups)
359+
options.preprocessorOptions.extraDefines = { &persistentDefine, &persistentDefine + 1 };
360+
361+
source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
362+
342363
// this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple
343364
auto shader = m_device->createShader(source.get());
344365
if (!shader)
@@ -350,7 +371,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
350371
return shader;
351372
};
352373

353-
auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro) -> smart_refctd_ptr<IGPUShader>
374+
auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "", bool persistentWorkGroups = false) -> smart_refctd_ptr<IGPUShader>
354375
{
355376
IAssetLoader::SAssetLoadParams lp = {};
356377
lp.workingDirectory = localInputCWD;
@@ -368,7 +389,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
368389

369390
auto compiler = make_smart_refctd_ptr<asset::CHLSLCompiler>(smart_refctd_ptr(m_system));
370391
CHLSLCompiler::SOptions options = {};
371-
options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; // should be compute
392+
options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
372393
options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
373394
options.spirvOptimizer = nullptr;
374395
#ifndef _NBL_DEBUG
@@ -381,8 +402,11 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
381402
options.preprocessorOptions.logger = m_logger.get();
382403
options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder();
383404

384-
const IShaderCompiler::SMacroDefinition variantDefine = { defineMacro, "" };
385-
options.preprocessorOptions.extraDefines = { &variantDefine, &variantDefine + 1 };
405+
const IShaderCompiler::SMacroDefinition defines[2] = { {defineMacro, ""}, { "PERSISTENT_WORKGROUPS", "1" } };
406+
if (!defineMacro.empty() && persistentWorkGroups)
407+
options.preprocessorOptions.extraDefines = { defines, defines + 2 };
408+
else if (!defineMacro.empty() && !persistentWorkGroups)
409+
options.preprocessorOptions.extraDefines = { defines, defines + 1 };
386410

387411
source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
388412

@@ -441,6 +465,34 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
441465
if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPipelines.data() + index))
442466
return logFail("Failed to create HLSL compute pipeline!\n");
443467
}
468+
469+
// persistent wg pipelines
470+
// Builds the persistent-workgroup variant of the GLSL path tracer pipeline for this `index`:
// the shader is compiled with the persistentWorkGroups flag set to true and the resulting
// pipeline is stored in m_PTGLSLPersistentWGPipelines[index].
// NOTE(review): ptShader is passed to the creation params without a null check -- confirm
// whether loadAndCompileGLSLShader can return an empty pointer here.
{
471+
auto ptShader = loadAndCompileGLSLShader(PTGLSLShaderPaths[index], true);
472+
473+
IGPUComputePipeline::SCreationParams params = {};
474+
params.layout = ptPipelineLayout.get();
475+
params.shader.shader = ptShader.get();
476+
params.shader.entryPoint = "main";
477+
params.shader.entries = nullptr;
478+
params.shader.requireFullSubgroups = true;
479+
// NOTE(review): magic value 5 -- presumably log2 of the subgroup size (32 invocations); TODO confirm against SUBGROUP_SIZE
params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
480+
if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTGLSLPersistentWGPipelines.data() + index))
481+
return logFail("Failed to create GLSL PersistentWG compute pipeline!\n");
482+
}
483+
// Builds the persistent-workgroup variant of the HLSL path tracer pipeline for this `index`:
// the shader is compiled with the PTHLSLShaderVariants[index] macro plus the
// persistentWorkGroups flag, and the resulting pipeline is stored in
// m_PTHLSLPersistentWGPipelines[index].
// NOTE(review): ptShader is passed to the creation params without a null check -- confirm
// whether loadAndCompileHLSLShader can return an empty pointer here.
{
484+
auto ptShader = loadAndCompileHLSLShader(PTHLSLShaderPath, PTHLSLShaderVariants[index], true);
485+
486+
IGPUComputePipeline::SCreationParams params = {};
487+
params.layout = ptPipelineLayout.get();
488+
params.shader.shader = ptShader.get();
489+
params.shader.entryPoint = "main";
490+
params.shader.entries = nullptr;
491+
params.shader.requireFullSubgroups = true;
492+
// NOTE(review): magic value 5 -- presumably log2 of the subgroup size (32 invocations); TODO confirm against SUBGROUP_SIZE
params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(5);
493+
if (!m_device->createComputePipelines(nullptr, { &params, 1 }, m_PTHLSLPersistentWGPipelines.data() + index))
494+
return logFail("Failed to create HLSL PersistentWG compute pipeline!\n");
495+
}
444496
}
445497
}
446498

@@ -452,7 +504,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
452504
return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!");
453505

454506
// Load Fragment Shader
455-
auto fragmentShader = loadAndCompileGLSLShader(PresentShaderPath);
507+
auto fragmentShader = loadAndCompileHLSLShader(PresentShaderPath);
456508
if (!fragmentShader)
457509
return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!");
458510

@@ -944,6 +996,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
944996
ImGui::Combo("Render Mode", &renderMode, shaderTypes, E_RENDER_MODE::ERM_COUNT);
945997
ImGui::SliderInt("SPP", &spp, 1, MaxBufferSamples);
946998
ImGui::SliderInt("Depth", &depth, 1, MaxBufferDimensions / 3);
999+
ImGui::Checkbox("Persistent WorkGroups", &usePersistentWorkGroups);
9471000

9481001
ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y);
9491002

@@ -1069,12 +1122,22 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
10691122

10701123
// cube envmap handle
10711124
{
1072-
auto pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipeline].get() : m_PTGLSLPipelines[PTPipeline].get();
1125+
// Pick the compute pipeline for this frame: the UI checkbox (usePersistentWorkGroups) selects
// between the persistent-workgroup and the regular pipeline arrays, and renderMode selects
// HLSL vs GLSL within the chosen array. Both branches assign `pipeline` before it is used.
IGPUComputePipeline* pipeline;
1126+
if (usePersistentWorkGroups)
1127+
pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPersistentWGPipelines[PTPipeline].get() : m_PTGLSLPersistentWGPipelines[PTPipeline].get();
1128+
else
1129+
pipeline = renderMode == E_RENDER_MODE::ERM_HLSL ? m_PTHLSLPipelines[PTPipeline].get() : m_PTGLSLPipelines[PTPipeline].get();
10731130
cmdbuf->bindComputePipeline(pipeline);
10741131
cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
10751132
cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
10761133
cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);
1077-
cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);
1134+
// Persistent mode asks the physical-device limits for a dispatch size, passing the pixel count
// and DefaultWorkGroupSize; the regular mode dispatches ceil(pixelCount / DefaultWorkGroupSize)
// workgroups, i.e. one invocation per pixel.
// NOTE(review): the persistent shaders bound their loop by the literal 1920*1080 rather than
// by the element count passed here -- keep the two in sync if WindowDimensions changes.
if (usePersistentWorkGroups)
1135+
{
1136+
uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize);
1137+
cmdbuf->dispatch(dispatchSize, 1u, 1u);
1138+
}
1139+
else
1140+
cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);
10781141
}
10791142

10801143
// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)
@@ -1306,6 +1369,8 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
13061369
smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
13071370
std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTGLSLPipelines;
13081371
std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTHLSLPipelines;
1372+
std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTGLSLPersistentWGPipelines;
1373+
std::array<smart_refctd_ptr<IGPUComputePipeline>, E_LIGHT_GEOMETRY::ELG_COUNT> m_PTHLSLPersistentWGPipelines;
13091374
smart_refctd_ptr<IGPUGraphicsPipeline> m_presentPipeline;
13101375
uint64_t m_realFrameIx = 0;
13111376
std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs;
@@ -1357,6 +1422,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
13571422
int renderMode = E_RENDER_MODE::ERM_HLSL;
13581423
int spp = 32;
13591424
int depth = 3;
1425+
bool usePersistentWorkGroups = false;
13601426

13611427
bool m_firstFrame = true;
13621428
IGPUCommandBuffer::SClearColorValue clearColor = { .float32 = {0.f,0.f,0.f,1.f} };

0 commit comments

Comments
 (0)