Merge pull request #7 from mikepapadim/optimization/flash_attention

mikepapadim · web-flow · commit bb1676b94b60 · 2025-05-22T15:29:49.000+03:00
Replace parallel attention with flash parallel attention
diff --git a/src/main/java/com/example/tornadovm/TornadoVMLayerPlanner.java b/src/main/java/com/example/tornadovm/TornadoVMLayerPlanner.java
@@ -113,10 +113,10 @@ public Tuple2<List<ImmutableTaskGraph>, GridScheduler> setupTornadoForwardPlanLa
                         config.headSize)
                 .task("copyToCaches", TransformerComputeKernelsLayered::copyToCache,
                         state.wrapKeyCache, state.wrapK,  state.wrapValueCache, state.wrapV, state.positionHolder, config.kvDim, layerIndex, config.contextLength)
-                .task("parallel-attention", TransformerComputeKernelsLayered::processHeadsParallel,
+                .task("parallel-attention", TransformerComputeKernelsLayered::processHeadsFlashAttention, context,
                         state.wrapQ, state.wrapKeyCache, state.wrapValueCache, state.wrapXb,
-                        config.numberOfHeads, config.headSize, config.kvDim, config.kvMul, config.vocabularySize,
-                        state.positionHolder, state.wrapAtt, layerIndex, config.contextLength)
+                        config.numberOfHeads, config.headSize, config.kvDim, config.kvMul,
+                        state.positionHolder, layerIndex, config.contextLength)
                 .task("matmul1", TransformerComputeKernelsLayered::matrixVectorGenericWithResidual, context,
                         state.wrapXb,  state.wrapX, weights.woLayered[layerIndex], config.dim, config.dim,  LOCAL_WORK_GROUP_SIZE_ALLOC)
                 .task("reductionsOneBlockFFN", TransformerComputeKernelsLayered::reductionOneBlockWithLayer, context, state.tempFFN,
@@ -310,7 +310,8 @@ private GridScheduler setupGridSchedulersLayered() {
         // OpenCL equivalent: clEnqueueNDRangeKernel(globalWorkSize=[config.numberOfHeads,1,1], localWorkSize=[4,1,1])
         // CUDA equivalent: kernel<<<dim3((config.numberOfHeads+3)/4,1,1), dim3(4,1,1)>>>
         WorkerGrid parallelAttentionWorker = new WorkerGrid1D(config.numberOfHeads);
-        parallelAttentionWorker.setGlobalWork(config.numberOfHeads, 1, 1);
+        // Global work size = numberOfHeads * local work-group size (currently 4), i.e. one work-group of 4 threads per head
+        parallelAttentionWorker.setGlobalWork(config.numberOfHeads * 4, 1, 1);
         parallelAttentionWorker.setLocalWork(4, 1, 1); // Set local work size to 4 (for parallel attention)
 
         // Copy to caches worker configuration
diff --git a/src/main/java/com/example/tornadovm/TransformerComputeKernelsLayered.java b/src/main/java/com/example/tornadovm/TransformerComputeKernelsLayered.java
@@ -194,7 +194,7 @@ public static void ropeRotation(KernelContext context, IntArray positionHolder,
      * @param contextLength Maximum context length
      */
     public static void processHeadsParallel(FloatArray q, FloatArray key_cache, FloatArray value_cache, FloatArray xb, int nHeads, int headSize, int kvDim, int kvMul, int seqLen,
-            IntArray positionHolder, FloatArray wrapAtt, int layer, int contextLength) {
+                                            IntArray positionHolder, FloatArray wrapAtt, int layer, int contextLength) {
 
         int pos = positionHolder.get(0);
         int loff = layer * contextLength * kvDim;
@@ -228,7 +228,7 @@ public static void processHeadsParallel(FloatArray q, FloatArray key_cache, Floa
      * @param wrapAtt Attention weights buffer
      */
     private static void processHeadTornado(FloatArray allQ, FloatArray key_cache, FloatArray value_cache, FloatArray allXb, int h, int headSize, int kvDim, int kvMul, long loff, int pos,
-            FloatArray wrapAtt) {
+                                           FloatArray wrapAtt) {
 
         // Base index for this head's attention weights
         int headOffset = h * (pos + 1);
@@ -285,6 +285,140 @@ private static void processHeadTornado(FloatArray allQ, FloatArray key_cache, Fl
         }
     }
 
+    public static void processHeadsFlashAttention(
+            KernelContext context,
+            FloatArray q,
+            FloatArray key_cache,
+            FloatArray value_cache,
+            FloatArray xb,
+            int nHeads,
+            int headSize,
+            int kvDim,
+            int kvMul,
+            IntArray positionHolder,
+            int layer,
+            int contextLength) {
+
+        // Thread and workgroup information
+        int tid = context.localIdx;
+        int gid = context.globalIdx; // gid is not actively used in the core logic here
+        int h = context.groupIdx;  // Each workgroup processes one head
+        int localSize = context.localGroupSizeX;
+
+        // Guard against surplus workgroups: each workgroup handles exactly one head, so only nHeads workgroups do work.
+        // h is the same for every thread in a workgroup, so the whole group exits together before any barrier.
+        if (h >= nHeads) return;
+
+        int pos = positionHolder.get(0);
+        int loff = layer * contextLength * kvDim;
+        int kvHeadIdx = h / kvMul;
+        int BLOCK_SIZE_C = 4;
+
+        // Allocate shared memory for tiled computation
+        float[] q_shared = context.allocateFloatLocalArray(headSize);
+        float[] k_tile = context.allocateFloatLocalArray(BLOCK_SIZE_C * headSize);
+        float[] v_tile = context.allocateFloatLocalArray(BLOCK_SIZE_C * headSize);
+        float[] s_tile = context.allocateFloatLocalArray(BLOCK_SIZE_C);
+        float[] shared_tile_max_holder = context.allocateFloatLocalArray(1); // Single-element buffer used to broadcast the tile max
+
+        // Thread-local accumulators for online softmax
+        float maxScore = Float.NEGATIVE_INFINITY;
+        float sumExp = 0.0f;
+
+        // Thread-local output accumulation
+        float[] output = new float[headSize];
+        for (int i = 0; i < headSize; i++) {
+            output[i] = 0.0f;
+        }
+
+        // Load query vector into shared memory
+        for (int i = tid; i < headSize; i += localSize) {
+            q_shared[i] = q.get(h * headSize + i);
+        }
+
+        context.localBarrier();
+
+        // Process sequence in tiles
+        for (int tileC = 0; tileC <= pos; tileC += BLOCK_SIZE_C) {
+            int tileEnd = Math.min(tileC + BLOCK_SIZE_C - 1, pos);
+
+            // Load key and value vectors for this tile
+            // Each thread copies the full K and V head vectors for the sequence positions it owns (tileC + tid, stepping by localSize)
+            for (int tIdxInSeq = tileC + tid; tIdxInSeq <= tileEnd; tIdxInSeq += localSize) {
+                int k_v_idx_in_tile = tIdxInSeq - tileC; // 0, 1, 2, or 3 for this tile
+                int tileMemOffset = k_v_idx_in_tile * headSize;
+                for (int d = 0; d < headSize; d++) {
+                    int kvCacheAbsolutePos = tIdxInSeq;
+                    int kvOffset = loff + kvCacheAbsolutePos * kvDim + kvHeadIdx * headSize + d;
+                    k_tile[tileMemOffset + d] = key_cache.get(kvOffset);
+                    v_tile[tileMemOffset + d] = value_cache.get(kvOffset);
+                }
+            }
+
+            context.localBarrier();
+
+            // Compute attention scores for this tile
+            // Each thread computes the score for each position it owns (a single score when localSize >= BLOCK_SIZE_C)
+            for (int tIdxInSeq = tileC + tid; tIdxInSeq <= tileEnd; tIdxInSeq += localSize) {
+                int score_idx_in_tile = tIdxInSeq - tileC; // 0, 1, 2, or 3 for this tile
+
+                float score = 0.0f;
+                for (int d = 0; d < headSize; d++) {
+                    score += q_shared[d] * k_tile[score_idx_in_tile * headSize + d];
+                }
+                score /= TornadoMath.sqrt(headSize);
+                s_tile[score_idx_in_tile] = score;
+            }
+
+            context.localBarrier();
+
+            // Find max score in this tile (all threads compute it redundantly over the small s_tile)
+            float tileLocalMax = Float.NEGATIVE_INFINITY;
+            for (int i = 0; i <= tileEnd - tileC; i++) { // Iterate over valid scores in s_tile
+                if (s_tile[i] > tileLocalMax) {
+                    tileLocalMax = s_tile[i];
+                }
+            }
+
+            // Broadcast max to all threads via shared memory
+            if (tid == 0) {
+                shared_tile_max_holder[0] = tileLocalMax; // Written to a dedicated holder rather than into s_tile
+            }
+            context.localBarrier();
+            float currentTileMax = shared_tile_max_holder[0]; // Every thread reads the broadcast value
+
+            // Determine if we need to rescale previous results
+            float newMax = Math.max(maxScore, currentTileMax);
+            if (newMax != maxScore && maxScore != Float.NEGATIVE_INFINITY) {
+                float scale = TornadoMath.exp(maxScore - newMax);
+                sumExp *= scale;
+                for (int d = 0; d < headSize; d++) {
+                    output[d] *= scale;
+                }
+            }
+            maxScore = newMax;
+
+            // Process each key-value pair using original scores from s_tile
+            // All threads iterate over all scores in the current tile
+            for (int t_idx_in_s_tile = 0; t_idx_in_s_tile <= tileEnd - tileC; t_idx_in_s_tile++) {
+                // s_tile still holds the raw scaled score; subtracting the running max keeps exp() numerically stable
+                float expScore = TornadoMath.exp(s_tile[t_idx_in_s_tile] - maxScore);
+                sumExp += expScore;
+
+                for (int d = 0; d < headSize; d++) {
+                    output[d] += expScore * v_tile[t_idx_in_s_tile * headSize + d];
+                }
+            }
+            context.localBarrier(); // Ensure all threads finish with s_tile, k_tile, v_tile before next tile load
+        }
+
+        // Normalize and write final results
+        float normFactor = (sumExp > 0.0f) ? (1.0f / sumExp) : 0.0f; // Avoid division by zero, return 0 if sumExp is 0
+        for (int d = tid; d < headSize; d += localSize) {
+            xb.set(h * headSize + d, output[d] * normFactor);
+        }
+    }
+
     /**
      * Performs optimized matrix-vector multiplication where each work group
      * processes one row of the matrix.