Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions 13_BitonicSort/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Build script for this example: pulls in the shared "common" CMake module, creates the
# executable target, and (optionally) embeds everything under app_resources/ as builtin resources.

# Include the "common" module; RES receives the include result and is falsy when it was not found.
include(common RESULT_VARIABLE RES)
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Project helper defined outside this file (presumably by the included common module - confirm).
# The first four arguments are passed empty; the last one forwards the PCH target variable.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Only when resource embedding is enabled: build a companion target holding the resource data.
if(NBL_EMBED_BUILTIN_RESOURCES)
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

# Absolute paths: where resources are searched for, and where generated sources/headers go.
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

# Recursively collect every file under app_resources/ (paths relative to that directory)
# and register each one in the RESOURCES_TO_EMBED list.
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

# Create the builtin-resource target from the collected list; "nbl::this_example::builtin" is the
# namespace string handed to the helper (presumably the C++ namespace of the generated code - confirm).
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

# Attach the embedded-resource target to the example executable.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
112 changes: 112 additions & 0 deletions 13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"

// Parameters of one sort invocation; an instance of this struct is declared as the shader's
// push-constant block. The four addresses are 64-bit values that the shader turns into
// uint32_t pointers for reading the inputs and writing the outputs.
struct BitonicPushData
{
// Address the keys are read from.
uint64_t inputKeyAddress;
// Address the values paired with the keys are read from.
uint64_t inputValueAddress;
// Address the sorted keys are written to.
uint64_t outputKeyAddress;
// Address the values, reordered together with their keys, are written to.
uint64_t outputValueAddress;
// Number of key/value pairs to process.
uint32_t dataElementCount;
};

using namespace nbl::hlsl;

// Sort parameters for this dispatch, declared as the Vulkan push-constant block.
[[vk::push_constant]] BitonicPushData pushData;

// Pointer and accessor aliases used to read/write uint32_t elements through the 64-bit
// addresses carried in pushData.
using DataPtr = bda::__ptr<uint32_t>;
using DataAccessor = BdaAccessor<uint32_t>;

// Workgroup-shared working copies of the keys and values being sorted.
// ElementCount is not defined in this file - presumably injected when the shader is compiled; confirm.
groupshared uint32_t sharedKeys[ElementCount];
groupshared uint32_t sharedValues[ElementCount];

// Bitonic sort of key/value pairs performed in groupshared memory: the keys and values are
// loaded from the input addresses in pushData, run through the compare-exchange network, and
// stored to the output addresses.
// NOTE(review): every workgroup walks the whole [0, dataSize) range using only its local thread
// id, so this presumably expects a single-workgroup dispatch - confirm against the host code.
// NOTE(review): the number of stages comes from Log2ElementCount rather than from dataSize, and
// pairs whose partner index is >= dataSize are skipped; presumably dataElementCount is expected
// to equal ElementCount - confirm.
[numthreads(WorkgroupSize, 1, 1)]
[shader("compute")]
void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID)
{
    const uint32_t threadId = localId.x;

    // sharedKeys/sharedValues only hold ElementCount entries, so never index past that even if
    // the host passes a larger element count in the push constants.
    const uint32_t requestedCount = pushData.dataElementCount;
    const uint32_t sharedCapacity = uint32_t(ElementCount);
    const uint32_t dataSize = requestedCount < sharedCapacity ? requestedCount : sharedCapacity;

    DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress));
    DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress));

    // Each thread loads elements threadId, threadId + WorkgroupSize, ... into shared memory.
    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
    {
        inputKeys.get(i, sharedKeys[i]);
        inputValues.get(i, sharedValues[i]);
    }

    // Synchronize all threads after loading
    GroupMemoryBarrierWithGroupSync();

    // Loop-invariant: the lane count does not change during the dispatch.
    const uint32_t waveSize = WaveGetLaneCount();

    for (uint32_t stage = 0; stage < Log2ElementCount; stage++)
    {
        // Length of the runs that this stage leaves sorted; the direction alternates per run.
        const uint32_t sequenceSize = 1 << (stage + 1);

        for (uint32_t pass = 0; pass <= stage; pass++)
        {
            const uint32_t compareDistance = 1 << (stage - pass);

            for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
            {
                const uint32_t partnerId = i ^ compareDistance;

                // No partner inside the valid range: nothing to compare against.
                if (partnerId >= dataSize)
                    continue;

                const uint32_t myWaveId = i / waveSize;
                const uint32_t partnerWaveId = partnerId / waveSize;
                const bool sameWave = (myWaveId == partnerWaveId);

                const uint32_t myKey = sharedKeys[i];
                const uint32_t myValue = sharedValues[i];

                uint32_t partnerKey, partnerValue;
                [branch]
                if (sameWave && compareDistance < waveSize)
                {
                    // WAVE INTRINSIC: fetch the partner's element from its lane's registers.
                    // NOTE(review): assumes element i is handled by lane (i % waveSize) - TODO confirm.
                    const uint32_t partnerLane = partnerId % waveSize;
                    partnerKey = WaveReadLaneAt(myKey, partnerLane);
                    partnerValue = WaveReadLaneAt(myValue, partnerLane);
                }
                else
                {
                    // SHARED MEM: partner is fetched from the shared arrays.
                    partnerKey = sharedKeys[partnerId];
                    partnerValue = sharedValues[partnerId];
                }

                const uint32_t sequenceIndex = i / sequenceSize;
                const bool sequenceAscending = (sequenceIndex % 2) == 0;
                const bool ascending = true;
                const bool finalDirection = sequenceAscending == ascending;

                const bool swap = (myKey > partnerKey) == finalDirection;

                // WORKGROUP COORDINATION: Only lower-indexed element writes both
                if (i < partnerId && swap)
                {
                    sharedKeys[i] = partnerKey;
                    sharedKeys[partnerId] = myKey;
                    sharedValues[i] = partnerValue;
                    sharedValues[partnerId] = myValue;
                }
            }

            // All exchanges of this pass must be visible before the next pass reads them.
            GroupMemoryBarrierWithGroupSync();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If compareDistance < waveSize, these barriers serve no purpose, you are overbarriering. In fact writing to shared memory at the end of every such iteration is also pointless.

The proper way to avoid this overbarriering is to branch behaviour based on whether compareDistance < waveSize or not. All steps with compareDistance < waveSize can be done in one go. Threads shuffle their elements around using subgroup intrinsics (shuffleXor, namely), once per every compareDistance value less than the starting one, and then write back to shared memory only once. This is what we do with the FFT, although I don't expect you to infer that from the code since it can be a bit obscure. @ me on discord if you want to figure out the way we handle this with the FFT, I can explain better there since I need to draw diagrams and write a bunch more

        }
    }


    DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress));
    DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress));

    // Write the contents of the shared arrays out, using the same thread-to-element striding as the load.
    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
    {
        outputKeys.set(i, sharedKeys[i]);
        outputValues.set(i, sharedValues[i]);
    }
}
17 changes: 17 additions & 0 deletions 13_BitonicSort/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _BITONIC_SORT_COMMON_INCLUDED_
#define _BITONIC_SORT_COMMON_INCLUDED_

// Parameters of one bitonic sort invocation: four 64-bit addresses (input/output keys and
// values) plus the number of elements.
// NOTE(review): bitonic_sort_shader.comp.hlsl declares an identical struct of the same name;
// presumably the shader is meant to include this header instead of redeclaring it - confirm.
struct BitonicPushData
{

// Address of the input keys.
uint64_t inputKeyAddress;
// Address of the input values paired with the keys.
uint64_t inputValueAddress;
// Address the output keys are written to.
uint64_t outputKeyAddress;
// Address the output values are written to.
uint64_t outputValueAddress;
// Number of key/value pairs.
uint32_t dataElementCount;
};

#endif
28 changes: 28 additions & 0 deletions 13_BitonicSort/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
Loading