99// debug
1010// #define NEE_ONLY
1111
12- layout (set = 2 , binding = 0 ) uniform sampler2D envMap;
12+ layout (set = 2 , binding = 0 ) uniform sampler2D envMap;
1313layout (set = 2 , binding = 1 ) uniform usamplerBuffer sampleSequence;
1414layout (set = 2 , binding = 2 ) uniform usampler2D scramblebuf;
1515
@@ -35,6 +35,7 @@ vec2 getTexCoords() {
3535#include < nbl/ builtin/ glsl/ limits/ numeric.glsl>
3636#include < nbl/ builtin/ glsl/ math/ constants.glsl>
3737#include < nbl/ builtin/ glsl/ utils/ common.glsl>
38+ #include < nbl/ builtin/ glsl/ utils/ morton.glsl>
3839
3940#include < nbl/ builtin/ glsl/ sampling/ box_muller_transform.glsl>
4041
@@ -51,7 +52,7 @@ struct Sphere
5152 vec3 position;
5253 float radius2;
5354 uint bsdfLightIDs;
54- };
55+ };
5556
5657Sphere Sphere_Sphere(in vec3 position, in float radius, in uint bsdfID, in uint lightID)
5758{
@@ -188,7 +189,7 @@ void Rectangle_getNormalBasis(in Rectangle rect, out mat3 basis, out vec2 extent
188189 basis[0 ] = rect.edge0/ extents[0 ];
189190 basis[1 ] = rect.edge1/ extents[1 ];
190191 basis[2 ] = normalize (cross (basis[0 ],basis[1 ]));
191- }
192+ }
192193
193194// return intersection distance if found, nbl_glsl_FLT_NAN otherwise
194195float Rectangle_intersect(in Rectangle rect, in vec3 origin, in vec3 direction)
@@ -222,7 +223,7 @@ vec3 Rectangle_getNormalTimesArea(in Rectangle rect)
222223#define OP_BITS_OFFSET 0
223224#define OP_BITS_SIZE 2
// Opaque BSDF description stored as two uvec4 words (8 x 32 bits in total).
// NOTE(review): the fields are presumably bit-packed (see the OP_BITS_* defines above) and read
// back through accessors such as BSDFNode_getRoughness / BSDFNode_getEta - confirm the layout there.
224225struct BSDFNode
225- {
226+ {
226227 uvec4 data[2 ];
227228};
228229
@@ -386,13 +387,13 @@ vec2 SampleSphericalMap(vec3 v)
386387{
387388 vec2 uv = vec2 (atan (v.z, v.x), asin (v.y));
388389 uv *= nbl_glsl_RECIPROCAL_PI* 0.5 ;
389- uv += 0.5 ;
390+ uv += 0.5 ;
390391 return uv;
391392}
392393
393394void missProgram(in ImmutableRay_t _immutable, inout Payload_t _payload)
394395{
395- vec3 finalContribution = _payload.throughput;
396+ vec3 finalContribution = _payload.throughput;
396397 // #define USE_ENVMAP
397398#ifdef USE_ENVMAP
398399 vec2 uv = SampleSphericalMap(_immutable.direction);
@@ -415,7 +416,7 @@ nbl_glsl_LightSample nbl_glsl_bsdf_cos_generate(in nbl_glsl_AnisotropicViewSurfa
415416{
416417 const float a = BSDFNode_getRoughness(bsdf);
417418 const mat2x3 ior = BSDFNode_getEta(bsdf);
418-
419+
419420 // fresnel stuff for dielectrics
420421 float orientedEta, rcpOrientedEta;
421422 const bool viewerInsideMedium = nbl_glsl_getOrientedEtas(orientedEta,rcpOrientedEta,interaction.isotropic.NdotV,monochromeEta);
@@ -519,7 +520,7 @@ int traceRay(inout float intersectionT, in vec3 origin, in vec3 direction)
519520
520521 intersectionT = closerIntersection ? t : intersectionT;
521522 objectID = closerIntersection ? i: objectID;
522-
523+
523524 // allowing early out results in a performance regression, WTF!?
524525 // if (anyHit && closerIntersection)
525526 // break;
@@ -543,7 +544,7 @@ nbl_glsl_LightSample nbl_glsl_light_generate_and_remainder_and_pdf(out vec3 rema
543544{
544545 // normally we'd pick from set of lights, using `xi.z`
545546 const Light light = lights[0 ];
546-
547+
547548 vec3 L = nbl_glsl_light_generate_and_pdf(pdf,newRayMaxT,origin,interaction,isBSDF,xi,Light_getObjectID(light));
548549
549550 newRayMaxT *= getEndTolerance(depth);
@@ -663,15 +664,15 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
663664 //
664665 bsdfSampleL = bsdf_sample.L;
665666 }
666-
667+
667668 // additional threshold
668669 const float lumaThroughputThreshold = lumaContributionThreshold;
669670 if (bsdfPdf> bsdfPdfThreshold && getLuma(throughput)> lumaThroughputThreshold)
670671 {
671672 ray._payload.throughput = throughput;
672673 ray._payload.otherTechniqueHeuristic = neeProbability/ bsdfPdf; // numerically stable, don't touch
673674 ray._payload.otherTechniqueHeuristic *= ray._payload.otherTechniqueHeuristic;
674-
675+
675676 // trace new ray
676677 ray._immutable.origin = intersection+ bsdfSampleL* (1.0 /* kSceneSize*/ )* getStartTolerance(depth);
677678 ray._immutable.direction = bsdfSampleL;
@@ -688,109 +689,115 @@ bool closestHitProgram(in uint depth, in uint _sample, inout Ray_t ray, inout nb
/**
 * Compute entry point: path traces `outImage`, one pixel per "virtual thread".
 *
 * Every invocation starts at virtual thread index
 *   gl_WorkGroupID.x*_NBL_GLSL_WORKGROUP_SIZE_ + gl_LocalInvocationIndex
 * and advances by gl_NumWorkGroups.x*_NBL_GLSL_WORKGROUP_SIZE_ until the whole index range has
 * been visited, so the image is covered regardless of how many workgroups were dispatched.
 * A virtual thread index is mapped to pixel coordinates with nbl_glsl_morton_decode2d32b.
 *
 * For each pixel inside the image:
 *  - if PTPushConstant.depth or PTPushConstant.sampleCount exceed the MAX_DEPTH_LOG2 /
 *    MAX_SAMPLES_LOG2 limits, opaque red is stored and the pixel is skipped;
 *  - otherwise PTPushConstant.sampleCount camera paths are traced and their running mean is
 *    stored with alpha 1.0 (replaced by red under VISUALIZE_HIGH_VARIANCE when the luma variance
 *    estimate exceeds 5.0).
 */
void main()
{
    const ivec2 imageExtents = imageSize(outImage);
    // An empty image has nothing to shade, and would underflow the power-of-two rounding below.
    if (any(lessThanEqual(imageExtents,ivec2(0))))
        return;

    // The number of virtual threads must NOT be the pixel count: the Z-order index of the last
    // pixel of a non-square or non-power-of-two image is generally larger than width*height, so
    // such a bound leaves part of the image unvisited. Indices [0,side*side) decode to exactly the
    // side x side square when `side` is a power of two (assuming nbl_glsl_morton_decode2d32b is a
    // plain bit de-interleave - TODO confirm), hence we round the larger extent up to a power of two.
    // Limit: extents above 32768 would overflow the 32 bit product below.
    const uint largestExtent = uint(max(imageExtents.x,imageExtents.y));
    const uint mortonSide = 1u<<(findMSB(largestExtent-1u)+1); // findMSB(0u) is -1, giving side 1 for a 1x1 image
    const uint virtualThreadCount = mortonSide*mortonSide;
    const uint virtualThreadStride = gl_NumWorkGroups.x*_NBL_GLSL_WORKGROUP_SIZE_;

    // Everything below is identical for all pixels, so it is evaluated once per invocation.
    const bool invalidPushConstants = ((PTPushConstant.depth-1)>>MAX_DEPTH_LOG2)>0 || ((PTPushConstant.sampleCount-1)>>MAX_SAMPLES_LOG2)>0;
    const mat4 invMVP = PTPushConstant.invMVP;
    const vec2 pixOffsetParam = vec2(1.0)/vec2(textureSize(scramblebuf,0));
    // parameters of the stochastic reconstruction filter
    const float gaussianFilterCutoff = 2.5;
    const float truncation = exp(-0.5*gaussianFilterCutoff*gaussianFilterCutoff);

    for (uint virtualThreadBase=gl_WorkGroupID.x*_NBL_GLSL_WORKGROUP_SIZE_; virtualThreadBase<virtualThreadCount; virtualThreadBase+=virtualThreadStride)
    {
        const uint virtualThreadIndex = virtualThreadBase+gl_LocalInvocationIndex;
        const ivec2 coords = ivec2(nbl_glsl_morton_decode2d32b(virtualThreadIndex));

        // the decoded square is usually larger than the image, skip the padding
        if (any(lessThan(coords,ivec2(0))) || any(greaterThanEqual(coords,imageExtents)))
            continue;

        // flag unsupported depth / sample counts with a red pixel instead of tracing
        if (invalidPushConstants)
        {
            imageStore(outImage,coords,vec4(1.0,0.0,0.0,1.0));
            continue;
        }

        vec2 texCoord = vec2(coords)/vec2(imageExtents);
        texCoord.y = 1.0-texCoord.y;

        const nbl_glsl_xoroshiro64star_state_t scramble_start_state = texelFetch(scramblebuf,coords,0).rg;

        // unproject the pixel at NDC depth 0 to get the ray origin, then move NDC to depth 1 for the directions
        vec4 NDC = vec4(texCoord*vec2(2.0,-2.0)+vec2(-1.0,1.0),0.0,1.0);
        vec3 camPos;
        {
            const vec4 tmp = invMVP*NDC;
            camPos = tmp.xyz/tmp.w;
            NDC.z = 1.0;
        }

        vec3 color = vec3(0.0);
        float meanLumaSquared = 0.0;
        // TODO: if we collapse the nested for loop, then all GPUs will get `PTPushConstant.depth` factor speedup, not just NV with separate PC
        for (int i=0; i<PTPushConstant.sampleCount; i++)
        {
            // every sample restarts from the per-pixel scramble state
            nbl_glsl_xoroshiro64star_state_t scramble_state = scramble_start_state;

            Ray_t ray;
            // raygen
            {
                ray._immutable.origin = camPos;

                vec4 tmp = NDC;
                // apply stochastic reconstruction filter
                vec2 remappedRand = rand3d(0u,i,scramble_state)[0].xy;
                remappedRand.x *= 1.0-truncation;
                remappedRand.x += truncation;
                tmp.xy += pixOffsetParam*nbl_glsl_BoxMullerTransform(remappedRand,1.5);
                // for depth of field we could do another stochastic point-pick
                tmp = invMVP*tmp;
                ray._immutable.direction = normalize(tmp.xyz/tmp.w-camPos);

                #if POLYGON_METHOD==2
                    ray._immutable.normalAtOrigin = vec3(0.0,0.0,0.0);
                    ray._immutable.wasBSDFAtOrigin = false;
                #endif

                ray._payload.accumulation = vec3(0.0);
                ray._payload.otherTechniqueHeuristic = 0.0; // needed for direct eye-light paths
                ray._payload.throughput = vec3(1.0);
                #ifdef KILL_DIFFUSE_SPECULAR_PATHS
                ray._payload.hasDiffuse = false;
                #endif
            }

            // bounces
            {
                bool hit = true;
                bool rayAlive = true;
                for (int d=1; d<=PTPushConstant.depth && hit && rayAlive; d+=2)
                {
                    ray._mutable.intersectionT = nbl_glsl_FLT_MAX;
                    ray._mutable.objectID = traceRay(ray._mutable.intersectionT,ray._immutable.origin,ray._immutable.direction);
                    hit = ray._mutable.objectID!=-1;
                    if (hit)
                        rayAlive = closestHitProgram(d,i,ray,scramble_state);
                }
                // was last trace a miss?
                if (!hit)
                    missProgram(ray._immutable,ray._payload);
            }

            const vec3 accumulation = ray._payload.accumulation;

            // incremental mean: after sample i, `color` is the average of samples 0..i
            const float rcpSampleSize = 1.0/float(i+1);
            color += (accumulation-color)*rcpSampleSize;

            #ifdef VISUALIZE_HIGH_VARIANCE
                const float luma = getLuma(accumulation);
                meanLumaSquared += (luma*luma-meanLumaSquared)*rcpSampleSize;
            #endif
        }

        #ifdef VISUALIZE_HIGH_VARIANCE
            // variance estimate = E[luma^2] - luma(mean color)^2
            float variance = getLuma(color);
            variance *= variance;
            variance = meanLumaSquared-variance;
            if (variance>5.0)
                color = vec3(1.0,0.0,0.0);
        #endif

        imageStore(outImage,coords,vec4(color,1.0));
    }
}
795802/* * TODO: Improving Rendering
796803
0 commit comments