intel
diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
Lines changed: 7 additions & 0 deletions
diff --git a/.github/workflows/llvm-build.yml b/.github/workflows/llvm-build.yml
Lines changed: 3 additions & 0 deletions
diff --git a/.github/workflows/triton-benchmarks-bmg.yml b/.github/workflows/triton-benchmarks-bmg.yml
Lines changed: 16 additions & 0 deletions
diff --git a/.github/workflows/triton-benchmarks-pvc.yml b/.github/workflows/triton-benchmarks-pvc.yml
Lines changed: 16 additions & 0 deletions
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
Lines changed: 10 additions & 6 deletions
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
Lines changed: 3 additions & 0 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 1 deletion
diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
Lines changed: 4 additions & 0 deletions
diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
Lines changed: 1 addition & 0 deletions
@@ -92,7 +92,7 @@ jobs:
         run: |
           pip install pytest pytest-xdist defusedxml
           cd scripts
-          pytest -n 4 test_*.py
+          pytest -v -n 4 test_*.py
 
       - name: Save pip cache
         if: ${{ steps.pip-cache.outputs.status == 'miss' }}
 
@@ -1,6 +1,13 @@
 name: Integration Tests
 on:
   workflow_dispatch:
+  pull_request:
+    branches-ignore: ['llvm-**']
+  merge_group:
+    branches: [main, 'dev-**']
+    types: [checks_requested]
+  push:
+    branches: [main]
 concurrency:
   group: ${{ github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
@@ -6,6 +6,9 @@ on:
       - llvm-head
     paths:
       - cmake/llvm-hash.txt
+  pull_request:
+    paths:
+      - .github/workflows/llvm-build.yml
   workflow_dispatch:
 
 env:
 
@@ -0,0 +1,16 @@
+name: Triton benchmarks, BMG
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - .github/workflows/triton-benchmarks*.yml
+      - benchmarks/**
+
+jobs:
+  benchmarks:
+    uses: ./.github/workflows/triton-benchmarks.yml
+    with:
+      runner_label: b580
+      skip_benchmarks: "['flash_attention_bwd_benchmark.py','flex_attention_benchmark_custom_masks.py']"
@@ -0,0 +1,16 @@
+name: Triton benchmarks, PVC
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - .github/workflows/triton-benchmarks*.yml
+      - benchmarks/**
+
+jobs:
+  benchmarks:
+    uses: ./.github/workflows/triton-benchmarks.yml
+    with:
+      runner_label: max1550
+      skip_benchmarks: "[]"
@@ -44,12 +44,16 @@ on:
         type: boolean
         default: false
 
-  pull_request:
-    branches:
-      - main
-    paths:
-      - .github/workflows/triton-benchmarks.yml
-      - benchmarks/**
+  # This workflow is also called from workflows triton-benchmarks-*.yml.
+  workflow_call:
+    inputs:
+      runner_label:
+        description: Runner label
+        type: string
+      skip_benchmarks:
+        description: JSON list of benchmarks to skip
+        type: string
+        default: "[]"
 
 # Cancels in-progress PR runs when the PR is updated. Manual runs are never cancelled.
 concurrency:
 
@@ -1,6 +1,9 @@
 name: Wheels
 on:
   workflow_dispatch:
+  pull_request:
+    paths:
+      - .github/workflows/wheels.yml
   schedule:
     - cron: "0 8 * * *"
 
 
@@ -8,7 +8,7 @@ llvm-project-*/
 
 # Triton Python module builds
 dist/
-triton*.egg-info/
+python/triton*.egg-info/
 *.whl
 python/triton_kernels/triton*.egg-info/
 
 
@@ -271,6 +271,10 @@ LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
 LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
                                            int numWarps);
 
+std::optional<LinearLayout>
+getTmemLoadStoreLayout16x256(int M, int N, RankedTensorType oldType,
+                             int numWarps);
+
 // Return a layout valid for TMemLoad op for a tmem layout of block MxN that
 // distribute the data long M for the warp groups. This doesn't affect the TMem
 // layout it just returns a distributed layout compatible for tmem_load.
 
@@ -44,6 +44,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "STORE_TMEM_TO_GLOBAL_BYPASS_SMEM",
     "ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
     "TRITON_F32_DEFAULT",
+    "TRITON_PREFER_TMEM_16x256_LAYOUT",
     "TRITON_INTEL_ADVANCED_PATH",
     "TRITON_INTEL_AGGRESSIVE_DPAS_REUSE",
     "TRITON_INTEL_DO_NOT_SINK_INSTR_ACROSS_RGN",