Revert virtual thread index dispatch; fix HLSL conductor colors

keptsecret · keptsecret · commit 5f93cec878ea · 2025-03-24T15:32:09.000+07:00
diff --git a/31_HLSLPathTracer/app_resources/glsl/common.glsl b/31_HLSLPathTracer/app_resources/glsl/common.glsl
@@ -35,7 +35,6 @@ vec2 getTexCoords() {
 #include <nbl/builtin/glsl/limits/numeric.glsl>
 #include <nbl/builtin/glsl/math/constants.glsl>
 #include <nbl/builtin/glsl/utils/common.glsl>
-#include <nbl/builtin/glsl/utils/morton.glsl>
 
 #include <nbl/builtin/glsl/sampling/box_muller_transform.glsl>
 
@@ -689,115 +688,109 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
 void main()
 {
     const ivec2 imageExtents = imageSize(outImage);
+    const ivec2 coords = getCoordinates();
+    vec2 texCoord = vec2(coords) / vec2(imageExtents);
+    texCoord.y = 1.0 - texCoord.y;
 
-    uint virtualThreadIndex;
-    for (uint virtualThreadBase = gl_WorkGroupID.x * _NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase < 1920*1080; virtualThreadBase += gl_NumWorkGroups.x * _NBL_GLSL_WORKGROUP_SIZE_) // not sure why 1280*720 doesn't cover entire window
+    if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
+        return;
+    }
+
+    if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
     {
-        virtualThreadIndex = virtualThreadBase + gl_LocalInvocationIndex.x;
-        const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex));    // getCoordinates();
-        vec2 texCoord = vec2(coords) / vec2(imageExtents);
-        texCoord.y = 1.0 - texCoord.y;
+        vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
+        imageStore(outImage, coords, pixelCol);
+        return;
+    }
 
-        if (false == (all(lessThanEqual(ivec2(0),coords)) && all(greaterThan(imageExtents,coords)))) {
-            continue;
-        }
+    nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
+    const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
 
-        if (((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0)
-        {
-            vec4 pixelCol = vec4(1.0,0.0,0.0,1.0);
-            imageStore(outImage, coords, pixelCol);
-            continue;
-        }
 
-        nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;
-        const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
+    const mat4 invMVP = PTPushConstant.invMVP;
 
+    vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
+    vec3 camPos;
+    {
+        vec4 tmp = invMVP*NDC;
+        camPos = tmp.xyz/tmp.w;
+        NDC.z = 1.0;
+    }
 
-        const mat4 invMVP = PTPushConstant.invMVP;
+    vec3 color = vec3(0.0);
+    float meanLumaSquared = 0.0;
+    // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC
+    for (int i=0; i<PTPushConstant.sampleCount; i++)
+    {
+        nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
 
-        vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
-        vec3 camPos;
+        Ray_t ray;
+        // raygen
         {
-            vec4 tmp = invMVP*NDC;
-            camPos = tmp.xyz/tmp.w;
-            NDC.z = 1.0;
-        }
+            ray._immutable.origin = camPos;
+
+            vec4 tmp = NDC;
+            // apply stochastic reconstruction filter
+            const float gaussianFilterCutoff = 2.5;
+            const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
+            vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
+            remappedRand.x *= 1.0-truncation;
+            remappedRand.x += truncation;
+            tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
+            // for depth of field we could do another stochastic point-pick
+            tmp = invMVP*tmp;
+            ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
 
-        vec3 color = vec3(0.0);
-        float meanLumaSquared = 0.0;
-        // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC
-        for (int i=0; i<PTPushConstant.sampleCount; i++)
-        {
-            nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;
+            #if POLYGON_METHOD==2
+                ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
+                ray._immutable.wasBSDFAtOrigin = false;
+            #endif
 
-            Ray_t ray;
-            // raygen
-            {
-                ray._immutable.origin = camPos;
-
-                vec4 tmp = NDC;
-                // apply stochastic reconstruction filter
-                const float gaussianFilterCutoff = 2.5;
-                const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);
-                vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
-                remappedRand.x *= 1.0-truncation;
-                remappedRand.x += truncation;
-                tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
-                // for depth of field we could do another stochastic point-pick
-                tmp = invMVP*tmp;
-                ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);
-
-                #if POLYGON_METHOD==2
-                    ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
-                    ray._immutable.wasBSDFAtOrigin = false;
-                #endif
-
-                ray._payload.accumulation = vec3(0.0);
-                ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
-                ray._payload.throughput = vec3(1.0);
-                #ifdef KILL_DIFFUSE_SPECULAR_PATHS
-                ray._payload.hasDiffuse = false;
-                #endif
-            }
+            ray._payload.accumulation = vec3(0.0);
+            ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
+            ray._payload.throughput = vec3(1.0);
+            #ifdef KILL_DIFFUSE_SPECULAR_PATHS
+            ray._payload.hasDiffuse = false;
+            #endif
+        }
 
-            // bounces
+        // bounces
+        {
+            bool hit = true; bool rayAlive = true;
+            for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
             {
-                bool hit = true; bool rayAlive = true;
-                for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
-                {
-                    ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
-                    ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
-                    hit = ray._mutable.objectID!=-1;
-                    if (hit)
-                        rayAlive = closestHitProgram(d, i, ray, scramble_state);
-                }
-                // was last trace a miss?
-                if (!hit)
-                    missProgram(ray._immutable,ray._payload);
+                ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
+                ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
+                hit = ray._mutable.objectID!=-1;
+                if (hit)
+                    rayAlive = closestHitProgram(d, i, ray, scramble_state);
             }
+            // was last trace a miss?
+            if (!hit)
+                missProgram(ray._immutable,ray._payload);
+        }
 
-            vec3 accumulation = ray._payload.accumulation;
-
-            float rcpSampleSize = 1.0/float(i+1);
-            color += (accumulation-color)*rcpSampleSize;
+        vec3 accumulation = ray._payload.accumulation;
 
-            #ifdef VISUALIZE_HIGH_VARIANCE
-                float luma = getLuma(accumulation);
-                meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
-            #endif
-        }
+        float rcpSampleSize = 1.0/float(i+1);
+        color += (accumulation-color)*rcpSampleSize;
 
         #ifdef VISUALIZE_HIGH_VARIANCE
-            float variance = getLuma(color);
-            variance *= variance;
-            variance = meanLumaSquared-variance;
-            if (variance>5.0)
-                color = vec3(1.0,0.0,0.0);
+            float luma = getLuma(accumulation);
+            meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
         #endif
-
-        vec4 pixelCol = vec4(color, 1.0);
-        imageStore(outImage, coords, pixelCol);
     }
+
+    #ifdef VISUALIZE_HIGH_VARIANCE
+        float variance = getLuma(color);
+        variance *= variance;
+        variance = meanLumaSquared-variance;
+        if (variance>5.0)
+            color = vec3(1.0,0.0,0.0);
+    #endif
+
+    vec4 pixelCol = vec4(color, 1.0);
+    imageStore(outImage, coords, pixelCol);
 }
 /** TODO: Improving Rendering
 
diff --git a/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl b/31_HLSLPathTracer/app_resources/hlsl/render.comp.hlsl
@@ -2,7 +2,6 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/random/pcg.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
-#include "nbl/builtin/hlsl/math/morton.hlsl"
 
 #include "nbl/builtin/hlsl/bxdf/reflection.hlsl"
 #include "nbl/builtin/hlsl/bxdf/transmission.hlsl"
@@ -140,9 +139,9 @@ static const bxdfnode_type bxdfs[BXDF_COUNT] = {
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.8,0.8)),
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.8,0.4,0.4)),
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIFFUSE, false, float2(0,0), spectral_t(0.4,0.8,0.4)),
-    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.98,0.77)),
-    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
-    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1,1,1), spectral_t(0.98,0.77,0.98)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1.02,1.02,1.3), spectral_t(1.0,1.0,2.0)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0,0), spectral_t(1.02,1.3,1.02), spectral_t(1.0,2.0,1.0)),
+    bxdfnode_type::create(ext::MaterialSystem::MaterialType::CONDUCTOR, false, float2(0.15,0.15), spectral_t(1.02,1.3,1.02), spectral_t(1.0,2.0,1.0)),
     bxdfnode_type::create(ext::MaterialSystem::MaterialType::DIELECTRIC, false, float2(0.0625,0.0625), spectral_t(1,1,1), spectral_t(0.71,0.69,0.67))
 };
 
@@ -157,55 +156,48 @@ void main(uint32_t3 threadID : SV_DispatchThreadID)
 {
     uint32_t width, height;
     outImage.GetDimensions(width, height);
+    const int32_t2 coords = getCoordinates();
+    float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
+    texCoord.y = 1.0 - texCoord.y;
 
-    uint32_t virtualThreadIndex;
-    [loop]
-    for (uint32_t virtualThreadBase = glsl::gl_WorkGroupID().x * WorkgroupSize; virtualThreadBase < 1920*1080; virtualThreadBase += glsl::gl_NumWorkGroups().x * WorkgroupSize) // not sure why 1280*720 doesn't cover entire window
+    if (!(all((int32_t2)0 <= coords) && all(coords < int32_t2(width, height)))) { // skip invocations outside the image; same test as the GLSL path
+        return;
+    }
+
+    if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
+    {
+        float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
+        outImage[coords] = pixelCol;
+        return;
+    }
+
+    int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
+
+    // set up path tracer
+    ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
+    ptCreateParams.rngState = scramblebuf[coords].rg;
+
+    uint2 scrambleDim;
+    scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
+    ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
+
+    float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
     {
-        virtualThreadIndex = virtualThreadBase + glsl::gl_LocalInvocationIndex().x;
-        const int32_t2 coords = (int32_t2)math::Morton<uint32_t>::decode2d(virtualThreadIndex);   // getCoordinates();
-        float32_t2 texCoord = float32_t2(coords) / float32_t2(width, height);
-        texCoord.y = 1.0 - texCoord.y;
-
-        if (false == (hlsl::all((int32_t2)0 < coords)) && hlsl::all(int32_t2(width, height) < coords)) {
-            continue;
-        }
-
-        if (((pc.depth - 1) >> MAX_DEPTH_LOG2) > 0 || ((pc.sampleCount - 1) >> MAX_SAMPLES_LOG2) > 0)
-        {
-            float32_t4 pixelCol = float32_t4(1.0,0.0,0.0,1.0);
-            outImage[coords] = pixelCol;
-            continue;
-        }
-
-        int flatIdx = glsl::gl_GlobalInvocationID().y * glsl::gl_NumWorkGroups().x * WorkgroupSize + glsl::gl_GlobalInvocationID().x;
-
-        // set up path tracer
-        ext::PathTracer::PathTracerCreationParams<create_params_t, float> ptCreateParams;
-        ptCreateParams.rngState = scramblebuf[coords].rg;
-
-        uint2 scrambleDim;
-        scramblebuf.GetDimensions(scrambleDim.x, scrambleDim.y);
-        ptCreateParams.pixOffsetParam = (float2)1.0 / float2(scrambleDim);
-
-        float4 NDC = float4(texCoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
-        {
-            float4 tmp = hlsl::mul(pc.invMVP, NDC);
-            ptCreateParams.camPos = tmp.xyz / tmp.w;
-            NDC.z = 1.0;
-        }
-
-        ptCreateParams.NDC = NDC;
-        ptCreateParams.invMVP = pc.invMVP;
-
-        ptCreateParams.diffuseParams = bxdfs[0].params;
-        ptCreateParams.conductorParams = bxdfs[3].params;
-        ptCreateParams.dielectricParams = bxdfs[6].params;
-
-        pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams);
-
-        float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
-        float32_t4 pixCol = float32_t4(color, 1.0);
-        outImage[coords] = pixCol;
+        float4 tmp = mul(pc.invMVP, NDC);
+        ptCreateParams.camPos = tmp.xyz / tmp.w;
+        NDC.z = 1.0;
     }
+
+    ptCreateParams.NDC = NDC;
+    ptCreateParams.invMVP = pc.invMVP;
+
+    ptCreateParams.diffuseParams = bxdfs[0].params;
+    ptCreateParams.conductorParams = bxdfs[3].params;
+    ptCreateParams.dielectricParams = bxdfs[6].params;
+
+    pathtracer_type pathtracer = pathtracer_type::create(ptCreateParams);
+
+    float32_t3 color = pathtracer.getMeasure(pc.sampleCount, pc.depth, scene);
+    float32_t4 pixCol = float32_t4(color, 1.0);
+    outImage[coords] = pixCol;
 }
diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp
@@ -1068,8 +1068,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());
 					cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());
 					cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);
-					uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize);
-					cmdbuf->dispatch(dispatchSize, 1u, 1u);
+					cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);
 				}
 
 				// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)

Original file line number	Diff line number	Diff line change
`@@ -1068,8 +1068,7 @@ class HLSLComputePathtracer final : public examples::SimpleWindowedApplication,`
`1068`	`1068`	`cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &m_descriptorSet0.get());`
`1069`	`1069`	`cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 2u, 1u, &m_descriptorSet2.get());`
`1070`	`1070`	`cmdbuf->pushConstants(pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PTPushConstant), &pc);`
`1071`		`- uint32_t dispatchSize = m_physicalDevice->getLimits().computeOptimalPersistentWorkgroupDispatchSize(WindowDimensions.x * WindowDimensions.y, DefaultWorkGroupSize);`
`1072`		`- cmdbuf->dispatch(dispatchSize, 1u, 1u);`
	`1071`	`+ cmdbuf->dispatch(1 + (WindowDimensions.x * WindowDimensions.y - 1) / DefaultWorkGroupSize, 1u, 1u);`
`1073`	`1072`	`}`
`1074`	`1073`
`1075`	`1074`	`// TRANSITION m_outImgView to READ (because of descriptorSets0 -> ComputeShader Writes into the image)`