@@ -43,25 +43,26 @@ class CDirQuantCacheBase
 
 				Vector8u3() : x(0u),y(0u),z(0u) {}
 				Vector8u3(const Vector8u3&) = default;
-				explicit Vector8u3(const core::vectorSIMDu32& val)
+				explicit Vector8u3(const hlsl::uint32_t4& val)
 				{
 					operator=(val);
 				}
 
 				Vector8u3& operator=(const Vector8u3&) = default;
-				Vector8u3& operator=(const core::vectorSIMDu32& val)
+				Vector8u3& operator=(const hlsl::uint32_t4& val)
 				{
 					x = val.x;
 					y = val.y;
 					z = val.z;
 					return *this;
 				}
 
-				inline core::vectorSIMDu32 getValue() const
+				hlsl::uint32_t4 getValue() const
 				{
-					return core::vectorSIMDu32(x,y,z);
+					return { x, y, z, 0 };
 				}
 
+
 			private:
 				uint8_t x;
 				uint8_t y;
@@ -74,13 +75,13 @@ class CDirQuantCacheBase
 
 				Vector8u4() : x(0u),y(0u),z(0u),w(0u) {}
 				Vector8u4(const Vector8u4&) = default;
-				explicit Vector8u4(const core::vectorSIMDu32& val)
+				explicit Vector8u4(const hlsl::uint32_t4& val)
 				{
 					operator=(val);
 				}
 
 				Vector8u4& operator=(const Vector8u4&) = default;
-				Vector8u4& operator=(const core::vectorSIMDu32& val)
+				Vector8u4& operator=(const hlsl::uint32_t4& val)
 				{
 					x = val.x;
 					y = val.y;
@@ -89,9 +90,9 @@ class CDirQuantCacheBase
 					return *this;
 				}
 
-				inline core::vectorSIMDu32 getValue() const
+				hlsl::uint32_t4 getValue() const
 				{
-					return core::vectorSIMDu32(x,y,z,w);
+					return { x, y, z, w };
 				}
 
 			private:
@@ -108,16 +109,16 @@ class CDirQuantCacheBase
 
 				Vector1010102() : storage(0u) {}
 				Vector1010102(const Vector1010102&) = default;
-				explicit Vector1010102(const core::vectorSIMDu32& val)
+				explicit Vector1010102(const hlsl::uint32_t4& val)
 				{
 					operator=(val);
 				}
 
 				Vector1010102& operator=(const Vector1010102&) = default;
-				Vector1010102& operator=(const core::vectorSIMDu32& val)
+				Vector1010102& operator=(const hlsl::uint32_t4& val)
 				{
-					constexpr auto storageBits = quantizationBits+1u;
-					storage = val.x|(val.y<<storageBits)|(val.z<<(storageBits*2u));
+					constexpr auto storageBits = quantizationBits + 1u;
+					storage = val.x | (val.y << storageBits) | (val.z << (storageBits * 2u));
 					return *this;
 				}
 
@@ -130,13 +131,13 @@ class CDirQuantCacheBase
 					return storage==other.storage;
 				}
 
-				inline core::vectorSIMDu32 getValue() const
+				hlsl::uint32_t4 getValue() const
 				{
-					constexpr auto storageBits = quantizationBits+1u;
-					const core::vectorSIMDu32 mask((0x1u<<storageBits)-1u);
-					return core::vectorSIMDu32(storage,storage>>storageBits,storage>>(storageBits*2u))&mask;
+					constexpr auto storageBits = quantizationBits + 1u;
+					const auto mask = (0x1u << storageBits) - 1u;
+					return { storage & mask, (storage >> storageBits) & mask, (storage >> (storageBits * 2)) & mask, 0 };
 				}
-
+
 			private:
 				uint32_t storage;
 			};
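
For reference, a standalone sketch (not part of the patch) of the round trip Vector1010102 implements, assuming quantizationBits = 9 so each component gets 10 bits of storage; the values are illustrative only:

#include <cassert>
#include <cstdint>

int main()
{
	constexpr uint32_t quantizationBits = 9u;
	constexpr uint32_t storageBits = quantizationBits + 1u;   // 10 bits per component
	constexpr uint32_t mask = (0x1u << storageBits) - 1u;     // 0x3FF

	const uint32_t x = 511u, y = 12u, z = 1023u;              // each value must fit in 10 bits

	// pack, mirroring operator=(const hlsl::uint32_t4&)
	const uint32_t storage = x | (y << storageBits) | (z << (storageBits * 2u));

	// unpack, mirroring getValue()
	assert((storage & mask) == x);
	assert(((storage >> storageBits) & mask) == y);
	assert(((storage >> (storageBits * 2u)) & mask) == z);
}
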
@@ -149,25 +150,25 @@ class CDirQuantCacheBase
 
 				Vector16u3() : x(0u),y(0u),z(0u) {}
 				Vector16u3(const Vector16u3&) = default;
-				explicit Vector16u3(const core::vectorSIMDu32& val)
+				explicit Vector16u3(const hlsl::uint32_t4& val)
 				{
 					operator=(val);
 				}
 
 				Vector16u3& operator=(const Vector16u3&) = default;
-				Vector16u3& operator=(const core::vectorSIMDu32& val)
+				Vector16u3& operator=(const hlsl::uint32_t4& val)
 				{
 					x = val.x;
 					y = val.y;
 					z = val.z;
 					return *this;
 				}
 
-				inline core::vectorSIMDu32 getValue() const
+				hlsl::uint32_t4 getValue() const
 				{
-					return core::vectorSIMDu32(x,y,z);
+					return { x, y, z, 0 };
 				}
-
+
 			private:
 				uint16_t x;
 				uint16_t y;
@@ -180,13 +181,13 @@ class CDirQuantCacheBase
 
 				Vector16u4() : x(0u),y(0u),z(0u),w(0u) {}
 				Vector16u4(const Vector16u4&) = default;
-				explicit Vector16u4(const core::vectorSIMDu32& val)
+				explicit Vector16u4(const hlsl::uint32_t4& val)
 				{
 					operator=(val);
 				}
 
 				Vector16u4& operator=(const Vector16u4&) = default;
-				Vector16u4& operator=(const core::vectorSIMDu32& val)
+				Vector16u4& operator=(const hlsl::uint32_t4& val)
 				{
 					x = val.x;
 					y = val.y;
@@ -195,11 +196,11 @@ class CDirQuantCacheBase
 					return *this;
 				}
 
-				inline core::vectorSIMDu32 getValue() const
+				hlsl::uint32_t4 getValue() const
 				{
-					return core::vectorSIMDu32(x,y,z,w);
+					return { x, y, z, w };
 				}
 
 			private:
 				uint16_t x;
 				uint16_t y;
@@ -377,11 +378,30 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
 		std::tuple<cache_type_t<Formats>...> cache;
 
 		template<uint32_t dimensions, E_FORMAT CacheFormat>
-		value_type_t<CacheFormat> quantize(const core::vectorSIMDf& value)
+		value_type_t<CacheFormat> quantize(const hlsl::vector<hlsl::float32_t, dimensions>& value)
 		{
-			const auto negativeMask = value < core::vectorSIMDf(0.0f);
+			using float32_tN = hlsl::vector<hlsl::float32_t, dimensions>;
+
+			auto to_vec_t4 = []<typename T>(hlsl::vector<T, dimensions> src, T padValue) -> hlsl::vector<T, 4>
+			{
+				if constexpr (dimensions == 1)
+				{
+					return {src.x, padValue, padValue, padValue};
+				} else if constexpr (dimensions == 2)
+				{
+					return {src.x, src.y, padValue, padValue};
+				} else if constexpr (dimensions == 3)
+				{
+					return {src.x, src.y, src.z, padValue};
+				} else if constexpr (dimensions == 4)
+				{
+					return {src.x, src.y, src.z, src.w};
+				}
+			};
+
+			const auto negativeMask = to_vec_t4(lessThan(value, float32_tN(0.0f)), false);
 
-			const core::vectorSIMDf absValue = abs(value);
+			const float32_tN absValue = abs(value);
 			const auto key = Key(absValue);
 
 			constexpr auto quantizationBits = quantization_bits_v<CacheFormat>;
@@ -393,32 +413,50 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
 					quantized = found->second;
 				else
 				{
-					const core::vectorSIMDf fit = findBestFit<dimensions,quantizationBits>(absValue);
+					const auto fit = findBestFit<dimensions,quantizationBits>(absValue);
+
+					const auto abs_fit = to_vec_t4(abs(fit), 0.f);
+					quantized = hlsl::uint32_t4(abs_fit.x, abs_fit.y, abs_fit.z, abs_fit.w);
 
-					quantized = core::vectorSIMDu32(core::abs(fit));
 					insertIntoCache<CacheFormat>(key,quantized);
 				}
 			}
 
-			const core::vectorSIMDu32 xorflag((0x1u<<(quantizationBits+1u))-1u);
-			auto restoredAsVec = quantized.getValue()^core::mix(core::vectorSIMDu32(0u),xorflag,negativeMask);
-			restoredAsVec += core::mix(core::vectorSIMDu32(0u),core::vectorSIMDu32(1u),negativeMask);
-			return value_type_t<CacheFormat>(restoredAsVec&xorflag);
+			auto select = [](hlsl::uint32_t4 val1, hlsl::uint32_t4 val2, hlsl::bool4 mask)
+			{
+				hlsl::uint32_t4 retval;
+				retval.x = mask.x ? val2.x : val1.x;
+				retval.y = mask.y ? val2.y : val1.y;
+				retval.z = mask.z ? val2.z : val1.z;
+				retval.w = mask.w ? val2.w : val1.w;
+				return retval;
+			};
+
+			// mask with all the storage bits set
+			const hlsl::uint32_t4 xorflag((0x1u << (quantizationBits + 1u)) - 1u);
+
+			// XORing a positive component with 0 leaves it unchanged;
+			// a negative component is XORed with all ones and then incremented below, i.e. two's complement negation within the storage bits
+			auto restoredAsVec = quantized.getValue() ^ select(hlsl::uint32_t4(0u), hlsl::uint32_t4(xorflag), negativeMask);
+			restoredAsVec += hlsl::uint32_t4(negativeMask);
+
+			return value_type_t<CacheFormat>(restoredAsVec);
 		}
 
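For reference, a standalone sketch (not part of the patch) of the sign restoration quantize() performs per component: a positive quantized magnitude passes through untouched, while a negative one is XORed with an all-ones mask and incremented, which is two's complement negation confined to the storage bits. The values are illustrative only:

#include <cassert>
#include <cstdint>

int main()
{
	constexpr uint32_t quantizationBits = 9u;
	constexpr uint32_t xorflag = (0x1u << (quantizationBits + 1u)) - 1u; // 10 bits of ones

	const uint32_t magnitude = 37u; // quantized absolute value of one component
	const bool negative = true;     // the corresponding lane of negativeMask

	uint32_t restored = magnitude ^ (negative ? xorflag : 0u);
	restored += negative ? 1u : 0u;

	// undoing the negation (flip within the storage bits, add one) recovers the magnitude
	assert((((restored & xorflag) ^ xorflag) + 1u) == magnitude);
}
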
 		template<uint32_t dimensions, uint32_t quantizationBits>
-		static inline core::vectorSIMDf findBestFit(const core::vectorSIMDf& value)
+		static inline hlsl::vector<hlsl::float32_t, dimensions> findBestFit(const hlsl::vector<hlsl::float32_t, dimensions>& value)
 		{
+			using float32_tN = hlsl::vector<hlsl::float32_t, dimensions>;
 			static_assert(dimensions>1u,"No point");
 			static_assert(dimensions<=4u,"High Dimensions are Hard!");
-			// precise normalize
-			const auto vectorForDots = value.preciseDivision(length(value));
+
+			const auto vectorForDots = hlsl::normalize(value);
 
 			//
-			core::vectorSIMDf fittingVector;
-			core::vectorSIMDf floorOffset;
+			float32_tN fittingVector;
+			float32_tN floorOffset = {};
 			constexpr uint32_t cornerCount = (0x1u<<(dimensions-1u))-1u;
-			core::vectorSIMDf corners[cornerCount] = {};
+			float32_tN corners[cornerCount] = {};
 			{
 				uint32_t maxDirCompIndex = 0u;
 				for (auto i=1u; i<dimensions; i++)
@@ -430,9 +468,9 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
 				if (maxDirectionComp < std::sqrtf(0.9998f / float(dimensions)))
 				{
 					_NBL_DEBUG_BREAK_IF(true);
-					return core::vectorSIMDf(0.f);
+					return float32_tN(0.f);
 				}
-				fittingVector = value.preciseDivision(core::vectorSIMDf(maxDirectionComp));
+				fittingVector = value / maxDirectionComp;
 				floorOffset[maxDirCompIndex] = 0.499f;
 				const uint32_t localCorner[7][3] = {
 					{1,0,0},
@@ -452,12 +490,12 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
 				}
 			}
 
-			core::vectorSIMDf bestFit;
+			float32_tN bestFit;
 			float closestTo1 = -1.f;
-			auto evaluateFit = [&](const core::vectorSIMDf& newFit) -> void
+			auto evaluateFit = [&](const float32_tN& newFit) -> void
 			{
-				auto newFitLen = core::length(newFit);
-				const float dp = core::dot<core::vectorSIMDf>(newFit,vectorForDots).preciseDivision(newFitLen)[0];
+				auto newFitLen = length(newFit);
+				const float dp = hlsl::dot(newFit,vectorForDots) / (newFitLen);
 				if (dp > closestTo1)
 				{
 					closestTo1 = dp;
@@ -466,18 +504,18 @@ class CDirQuantCacheBase : public virtual core::IReferenceCounted, public impl::
 			};
 
 			constexpr uint32_t cubeHalfSize = (0x1u << quantizationBits) - 1u;
-			const core::vectorSIMDf cubeHalfSizeND = core::vectorSIMDf(cubeHalfSize);
+			const float32_tN cubeHalfSizeND = hlsl::promote<float32_tN>(cubeHalfSize);
 			for (uint32_t n=cubeHalfSize; n>0u; n--)
 			{
 				// we'd use float addition to increment the loop in the interest of speed,
 				// but adding a small number to a large one loses precision, so multiplication is preferable
-				core::vectorSIMDf bottomFit = core::floor(fittingVector*float(n)+floorOffset);
-				if ((bottomFit<=cubeHalfSizeND).all())
+				const auto bottomFit = glm::floor(fittingVector * float(n) + floorOffset);
+				if (hlsl::all(glm::lessThanEqual(bottomFit, cubeHalfSizeND)))
 					evaluateFit(bottomFit);
-				for (auto i=0u; i<cornerCount; i++)
+				for (auto i = 0u; i < cornerCount; i++)
 				{
 					auto bottomFitTmp = bottomFit+corners[i];
-					if ((bottomFitTmp<=cubeHalfSizeND).all())
+					if (hlsl::all(glm::lessThanEqual(bottomFitTmp, cubeHalfSizeND)))
 						evaluateFit(bottomFitTmp);
 				}
 			}
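
For reference, a standalone, simplified sketch (not part of the patch) of the search findBestFit() performs: scale the absolute direction so its largest component is 1, then for every candidate scale n snap the scaled direction onto the integer lattice and keep the candidate whose direction is closest to the input (largest normalized dot product). The real code offsets the floor and also tests neighbouring lattice corners; this sketch just rounds every component, so it is illustrative rather than equivalent, and bestLatticeFit is a hypothetical helper name:

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>

std::array<uint32_t, 3> bestLatticeFit(const std::array<float, 3>& absDir, uint32_t quantizationBits)
{
	const uint32_t cubeHalfSize = (0x1u << quantizationBits) - 1u;
	const float maxComp = std::max({absDir[0], absDir[1], absDir[2]});
	const float inLen = std::sqrt(absDir[0]*absDir[0] + absDir[1]*absDir[1] + absDir[2]*absDir[2]);

	std::array<uint32_t, 3> bestFit = {0u, 0u, 0u};
	float closestTo1 = -1.f;
	for (uint32_t n = cubeHalfSize; n > 0u; n--)
	{
		// snap the scaled direction onto the integer lattice
		std::array<float, 3> fit;
		bool inside = true;
		for (uint32_t i = 0u; i < 3u; i++)
		{
			fit[i] = std::floor(absDir[i] / maxComp * float(n) + 0.5f);
			inside = inside && fit[i] <= float(cubeHalfSize);
		}
		if (!inside)
			continue;
		const float fitLen = std::sqrt(fit[0]*fit[0] + fit[1]*fit[1] + fit[2]*fit[2]);
		if (fitLen == 0.f)
			continue;
		// cosine of the angle between the candidate and the input direction
		const float dp = (fit[0]*absDir[0] + fit[1]*absDir[1] + fit[2]*absDir[2]) / (fitLen * inLen);
		if (dp > closestTo1)
		{
			closestTo1 = dp;
			bestFit = {uint32_t(fit[0]), uint32_t(fit[1]), uint32_t(fit[2])};
		}
	}
	return bestFit;
}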