intel
diff --git a/‎.github/workflows/integration-tests-amd.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/integration-tests-amd.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/llvm-build.yml‎
Lines changed: 4 additions & 10 deletions b/‎.github/workflows/llvm-build.yml‎
Lines changed: 4 additions & 10 deletions
diff --git a/‎.github/workflows/llvm-build/almalinux.Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/llvm-build/almalinux.Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/llvm-build/centos.Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/llvm-build/centos.Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 0 additions & 4 deletions b/‎CMakeLists.txt‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎cmake/llvm-hash.txt‎
Lines changed: 1 addition & 1 deletion b/‎cmake/llvm-hash.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 0 additions & 26 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 0 additions & 26 deletions
diff --git a/‎include/triton/Dialect/Gluon/Transforms/Passes.td‎
Lines changed: 10 additions & 0 deletions b/‎include/triton/Dialect/Gluon/Transforms/Passes.td‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h‎
Lines changed: 5 additions & 0 deletions b/‎include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 1 addition & 2 deletions b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 1 addition & 2 deletions
@@ -13,6 +13,7 @@ jobs:
   integration-tests-amd:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 45
+    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' }}
     strategy:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}
 
@@ -106,7 +106,7 @@ jobs:
         -DLLVM_BUILD_UTILS=ON
         -DLLVM_BUILD_TOOLS=ON
         -DLLVM_ENABLE_ASSERTIONS=ON
-        -DMLIR_ENABLE_BINDINGS_PYTHON=ON
+        -DMLIR_ENABLE_BINDINGS_PYTHON=OFF
         -DLLVM_ENABLE_PROJECTS="mlir;lld"
         -DLLVM_INSTALL_UTILS=ON
         -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
@@ -130,7 +130,7 @@ jobs:
         -DLLVM_BUILD_UTILS=ON
         -DLLVM_BUILD_TOOLS=ON
         -DLLVM_ENABLE_ASSERTIONS=ON
-        -DMLIR_ENABLE_BINDINGS_PYTHON=ON
+        -DMLIR_ENABLE_BINDINGS_PYTHON=OFF
         -DLLVM_ENABLE_PROJECTS="mlir;llvm;lld"
         -DLLVM_ENABLE_DIA_SDK=OFF
         -DLLVM_INSTALL_UTILS=ON
@@ -179,7 +179,7 @@ jobs:
         -DCLANG_TABLEGEN=$HOST_TOOLS/clang-tblgen \
         -DLLVM_ENABLE_ASSERTIONS=ON \
         -DCMAKE_LINKER=$LINKER \
-        -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+        -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
         -DLLVM_ENABLE_ZSTD=OFF \
         -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF \
         -DLLVM_INSTALL_UTILS=ON \
@@ -202,12 +202,6 @@ jobs:
         -DLLVM_ENABLE_TERMINFO=OFF \
         llvm-project/llvm
         ninja -C llvm-project/build install
-        CURR_PWD="$(pwd)"
-        cd "${{ env.llvm_install_dir }}/python_packages/mlir_core/mlir/_mlir_libs/"
-        for file in *x86_64*; do
-          mv "$file" "${file/x86_64/aarch64}"
-        done
-        cd $CURR_PWD
         tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
 
     - name: Configure, Build, and Install LLVM (macOS arm64)
@@ -225,7 +219,7 @@ jobs:
         -DLLVM_BUILD_UTILS=ON
         -DLLVM_BUILD_TOOLS=ON
         -DLLVM_ENABLE_ASSERTIONS=ON
-        -DMLIR_ENABLE_BINDINGS_PYTHON=ON
+        -DMLIR_ENABLE_BINDINGS_PYTHON=OFF
         -DLLVM_ENABLE_PROJECTS="mlir;lld"
         -DLLVM_ENABLE_ZSTD=OFF
         -DLLVM_INSTALL_UTILS=ON
 
@@ -29,7 +29,7 @@ RUN cmake -GNinja -Bbuild \
   -DLLVM_BUILD_UTILS=ON \
   -DLLVM_BUILD_TOOLS=ON \
   -DLLVM_ENABLE_ASSERTIONS=ON \
-  -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+  -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
   -DLLVM_ENABLE_PROJECTS="mlir;lld" \
   -DLLVM_ENABLE_TERMINFO=OFF \
   -DLLVM_INSTALL_UTILS=ON \
 
@@ -46,7 +46,7 @@ RUN cmake -GNinja -Bbuild \
   -DLLVM_BUILD_UTILS=ON \
   -DLLVM_BUILD_TOOLS=ON \
   -DLLVM_ENABLE_ASSERTIONS=ON \
-  -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+  -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
   -DLLVM_ENABLE_PROJECTS="mlir;lld" \
   -DLLVM_ENABLE_TERMINFO=OFF \
   -DLLVM_INSTALL_UTILS=ON \
 
@@ -89,10 +89,6 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-if(NOT WIN32)
-  find_library(TERMINFO_LIBRARY tinfo)
-endif()
-
 if(TRITON_BUILD_UT)
   # This is an aggregate target for all unit tests.
   add_custom_target(TritonUnitTests)
 
@@ -1 +1 @@
-064f02dac0c81c19350a74415b3245f42fed09dc
+f6ded0be897e2878612dd903f7e8bb85448269e5
@@ -528,32 +528,6 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
                   triton::gpu::PaddedSharedEncodingAttr layout,
                   unsigned bitwidth, Value smemOffset, bool offsetInBytes);
 
-// Emits IR to load data from shared memory into registers, or to store data
-// from registers into shared memory.
-//
-// You supply perVectorCallback, which is called once per group of register
-// elements to transfer.  You can use this callback to emit IR to load or store
-// data from or to shared memory.
-//
-// elemLlvmTy should be dstTy's element type converted to an LLVM-dialect type.
-//
-// If maxVecElems is provided, we won't vectorize more than this many elements.
-//
-// Returns true on success.
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
-    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
-    const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    Value laneId, Value warpId,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
 // Close cousin of lowerLdStMatrix in MemoryOpToLLVM.cpp
 // We might want to merge them at some point, but having to support
 // ldmatrix.trans makes the code in lowerLdStMatrix a bit specific
 
@@ -35,4 +35,14 @@ def GluonInline: Pass<"gluon-inline"> {
   let dependentDialects = [];
 }
 
+def GluonSimplifyControlFlow: Pass<"gluon-slimplify-control-flow"> {
+  let summary = "simplifications for control flow ops";
+
+  let description = [{
+    The `gluon-slimplify-control-flow` pass applies a reduced set of
+    simplification and canonicalization patterns to the module.
+  }];
+  let dependentDialects = [];
+}
+
 #endif
@@ -135,6 +135,11 @@ LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<unsigned> tilesPerWarp,
                                          ArrayRef<unsigned> warpsPerCTA);
 
+LinearLayout chooseScaledWmmaScaleLayout(
+    MLIRContext *ctx, int dotOperandIdx,
+    const std::vector<std::vector<int32_t>> &dotOperandWarpBasis,
+    ArrayRef<int64_t> dotOperandShape);
+
 LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                           ArrayRef<int64_t> dotOperandShape,
                                           ArrayRef<unsigned> tilesPerWarp,
 
@@ -1307,8 +1307,7 @@ Row |
   let hasCustomAssemblyFormat = 1;
 
   let extraClassDeclaration = extraDistributedDeclaration # [{
-    SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape,
-                                          Type elemType, int opIdx) const;
+    SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape, int kDim, int opIdx) const;
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
 
     static SmallVector<unsigned, 3> getDefaultInstrShape() {
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-064f02dac0c81c19350a74415b3245f42fed09dc`
	`1`	`+f6ded0be897e2878612dd903f7e8bb85448269e5`