@@ -19,18 +19,20 @@ struct TransferLoop
1919 const uint64_t srcIndexSize = uint64_t (1 ) << SrcIndexSizeLog2;
2020 const uint64_t dstIndexSize = uint64_t (1 ) << DstIndexSizeLog2;
2121
22- const uint64_t srcOffset = invocationIndex * srcIndexSize * transferRequest.propertySize;
23- const uint64_t dstOffset = invocationIndex * dstIndexSize * transferRequest.propertySize;
22+ // Fill: Always use offset 0 on src
23+ const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize;
24+ const uint64_t dstOffset = invocationIndex * transferRequest.propertySize;
2425
25- const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr;
26- const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr;
27-
28- const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad<uint32_t>(srcIndexAddress);
29- const uint64_t dstAddressBufferOffset = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad<uint32_t>(dstIndexAddress);
26+ // IOTA: use the index directly as the fetch offset
27+ // Non-IOTA: read the address buffer ("index buffer") to select the fetch offset
28+ const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcOffset : vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof (uint32_t));
29+ const uint64_t dstAddressBufferOffset = DstIndexIota ? dstOffset : vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof (uint32_t));
3030
3131 const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
3232 const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;
3333
34+ //vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2, srcAddressMapped,8);
35+ //vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2 + sizeof(uint64_t), dstAddressMapped,8);
3436 if (SrcIndexSizeLog2 == 0 ) {} // we can't write individual bytes
3537 else if (SrcIndexSizeLog2 == 1 ) vk::RawBufferStore<uint16_t>(dstAddressMapped, vk::RawBufferLoad<uint16_t>(srcAddressMapped));
3638 else if (SrcIndexSizeLog2 == 2 ) vk::RawBufferStore<uint32_t>(dstAddressMapped, vk::RawBufferLoad<uint32_t>(srcAddressMapped));
@@ -111,36 +113,49 @@ void main(uint32_t3 dispatchId)
111113
112114 // Loading transfer request from the pointer (can't use struct
113115 // with BDA on HLSL SPIRV)
116+ uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof (TransferRequest) * propertyId;
114117 TransferRequest transferRequest;
115- transferRequest.srcAddr = vk::RawBufferLoad<uint >(globals.transferCommandsAddress) | vk::RawBufferLoad< uint >(globals.transferCommandsAddress + sizeof ( uint )) << 32 ;
116- transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t));
117- transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 2 );
118- transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 3 );
118+ transferRequest.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr, 8 ) ;
119+ transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof (uint64_t), 8 );
120+ transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof (uint64_t) * 2 , 8 );
121+ transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof (uint64_t) * 3 , 8 );
119122 // Remaining elements are part of the same bitfield
120123 // TODO: Do this only using raw buffer load?
121- uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 4 );
124+ uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof (uint64_t) * 4 , 8 );
122125 transferRequest.elementCount32 = uint32_t (bitfieldType);
123- transferRequest.elementCountExtra = uint32_t (bitfieldType);
124- transferRequest.propertySize = uint32_t (bitfieldType >> 3 );
125- transferRequest.fill = uint32_t (bitfieldType >> (3 + 24 ));
126- transferRequest.srcIndexSizeLog2 = uint32_t (bitfieldType >> (3 + 24 + 1 ));
127- transferRequest.dstIndexSizeLog2 = uint32_t (bitfieldType >> (3 + 24 + 1 + 2 ));
126+ transferRequest.elementCountExtra = uint32_t (bitfieldType >> 32 );
127+ transferRequest.propertySize = uint32_t (bitfieldType >> ( 32 + 3 ) );
128+ transferRequest.fill = uint32_t (bitfieldType >> (32 + 3 + 24 ));
129+ transferRequest.srcIndexSizeLog2 = uint32_t (bitfieldType >> (32 + 3 + 24 + 1 ));
130+ transferRequest.dstIndexSizeLog2 = uint32_t (bitfieldType >> (32 + 3 + 24 + 1 + 2 ));
128131
129132 const uint dispatchSize = nbl::hlsl::device_capabilities_traits<device_capabilities>::maxOptimallyResidentWorkgroupInvocations;
130133 const bool fill = transferRequest.fill == 1 ;
131134
132- vk::RawBufferStore<uint64_t>(globals.transferCommandsAddress + 40 * 3 , transferRequest.srcAddr);
133- vk::RawBufferStore<uint64_t>(globals.transferCommandsAddress + 40 * 4 , transferRequest.dstAddr);
134- vk::RawBufferStore<uint >(globals.transferCommandsAddress + 40 * 5 , vk::RawBufferLoad<uint >(transferRequest.srcAddr + sizeof (uint16_t) * 3 ));
135- //if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
136- //else { TransferLoopPermutationFill<false> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
135+ //uint64_t debugWriteAddr = transferRequest.dstAddr + sizeof(uint64_t) * 9 * propertyId;
136+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 0, transferRequest.srcAddr,8);
137+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 1, transferRequest.dstAddr,8);
138+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 2, transferRequest.srcIndexAddr,8);
139+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 3, transferRequest.dstIndexAddr,8);
140+ //uint64_t elementCount = uint64_t(transferRequest.elementCount32)
141+ // | uint64_t(transferRequest.elementCountExtra) << 32;
142+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 4, elementCount,8);
143+ //vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 5, transferRequest.propertySize,4);
144+ //vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 6, transferRequest.fill,4);
145+ //vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 7, transferRequest.srcIndexSizeLog2,4);
146+ //vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 8, transferRequest.dstIndexSizeLog2,4);
147+ //vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + sizeof(uint64_t) * invocationIndex, invocationIndex,8);
148+
149+ if (fill) { TransferLoopPermutationFill<true > loop; loop.copyLoop (invocationIndex, propertyId, transferRequest, dispatchSize); }
150+ else { TransferLoopPermutationFill<false > loop; loop.copyLoop (invocationIndex, propertyId, transferRequest, dispatchSize); }
137151}
138152
139153}
140154}
141155}
142156
143- [numthreads (1 ,1 ,1 )]
157+ // TODO: instead of hard-coding the workgroup size, use some sort of replace function to substitute the optimal size?
158+ [numthreads (512 ,1 ,1 )]
144159void main (uint32_t3 dispatchId : SV_DispatchThreadID )
145160{
146161 nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId);
0 commit comments