Skip to content

Commit d9b9168

Browse files
WebGPU JSEP: Make shader code not depend on input broadcasting patterns (microsoft#22536)
This PR makes MatMul shaders not depend on the inputs' broadcasting pattern, but only on the input ranks and their shapes provided in uniforms. This change fixes the issue that currently the shader code differs for different broadcasting patterns yet has an identical cache key, which results in wrong cache hits.
1 parent 4d614e1 commit d9b9168

File tree

6 files changed

+311
-235
lines changed

6 files changed

+311
-235
lines changed

js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts

Lines changed: 22 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ import { ShapeUtil } from '../../../util';
2525
import { ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform } from '../../types';
2626
import {
2727
createTensorShapeVariables,
28-
getBroadcastDims,
2928
IndicesHelper,
3029
inputVariable,
3130
internalVariable,
@@ -40,6 +39,7 @@ import {
4039
getActivationSnippet,
4140
InternalActivationAttributes,
4241
} from '../fuse-utils';
42+
import { convertOutputBatchIndicesToInputBatchIndices } from '../matmul-shaders';
4343

4444
import { typeSnippet } from './activation_util';
4545

@@ -373,42 +373,11 @@ const matMulReadWriteFnSource = (
373373
hasBias: boolean,
374374
applyActivation: string,
375375
variables: IndicesHelper[],
376-
batchShapes: Array<readonly number[]>,
377376
isChannelsLast = false,
378377
): string => {
379-
const [batchAShape, batchBShape, batchShape] = batchShapes;
380378
const [batchVariable, aVariable, bVariable, outputVariable] = variables;
381-
const broadCastADims = getBroadcastDims(batchAShape, batchShape);
382-
const broadCastBDims = getBroadcastDims(batchBShape, batchShape);
383379
const dataType = tensorTypeToWsglStorageType(variables[0].type.tensor);
384-
const getAIndices = () => {
385-
const aRank = aVariable.rank;
386-
const batchRank = batchVariable.rank;
387-
let resStr = `var aIndices: ${aVariable.type.indices};`;
388-
for (let i = aRank - 2 - 1, j = batchRank - 1; i >= 0; i--, j--) {
389-
resStr += `\naIndices[${i}] = ${batchRank > 1 ? `batchIndices[${j}]` : 'batchIndices'};`;
390-
}
391-
broadCastADims.forEach((i) => {
392-
resStr += `\naIndices[${i}] = 0;`;
393-
});
394-
resStr += `\naIndices[${aRank - 2}] = u32(row);
395-
aIndices[${aRank - 1}] = u32(colIn);`;
396-
return resStr;
397-
};
398-
const getBIndices = () => {
399-
const bRank = bVariable.rank;
400-
const batchRank = batchVariable.rank;
401-
let resStr = `var bIndices: ${bVariable.type.indices};`;
402-
for (let i = bRank - 2 - 1, j = batchRank - 1; i >= 0; i--, j--) {
403-
resStr += `\nbIndices[${i}] = ${batchRank > 1 ? `batchIndices[${j}]` : 'batchIndices'};`;
404-
}
405-
broadCastBDims.forEach((i) => {
406-
resStr += `\nbIndices[${i}] = 0;`;
407-
});
408-
resStr += `\nbIndices[${bRank - 2}] = u32(row);
409-
bIndices[${bRank - 1}] = u32(colIn);`;
410-
return resStr;
411-
};
380+
412381
const source = `
413382
fn mm_readA(batch: i32, row: i32, colIn: i32, batchIndices: ${batchVariable.type.indices}) -> ${typeSnippet(
414383
component,
@@ -418,7 +387,16 @@ const matMulReadWriteFnSource = (
418387
let col = colIn * ${component};
419388
if(row < uniforms.dim_a_outer && col < uniforms.dim_inner)
420389
{
421-
${getAIndices()}
390+
var aIndices: ${aVariable.type.indices};
391+
${convertOutputBatchIndicesToInputBatchIndices(
392+
'aIndices',
393+
aVariable,
394+
aVariable.rank - 2,
395+
batchVariable.rank,
396+
'batchIndices',
397+
)}
398+
${aVariable.indicesSet('aIndices', aVariable.rank - 2, 'u32(row)')}
399+
${aVariable.indicesSet('aIndices', aVariable.rank - 1, 'u32(colIn)')}
422400
value = ${aVariable.getByIndices('aIndices')};
423401
}
424402
return value;
@@ -432,7 +410,16 @@ const matMulReadWriteFnSource = (
432410
let col = colIn * ${component};
433411
if(row < uniforms.dim_inner && col < uniforms.dim_b_outer)
434412
{
435-
${getBIndices()}
413+
var bIndices: ${bVariable.type.indices};
414+
${convertOutputBatchIndicesToInputBatchIndices(
415+
'bIndices',
416+
bVariable,
417+
bVariable.rank - 2,
418+
batchVariable.rank,
419+
'batchIndices',
420+
)}
421+
${bVariable.indicesSet('bIndices', bVariable.rank - 2, 'u32(row)')}
422+
${bVariable.indicesSet('bIndices', bVariable.rank - 1, 'u32(colIn)')}
436423
value = ${bVariable.getByIndices('bIndices')};
437424
}
438425
return value;
@@ -532,7 +519,6 @@ export const createMatmulProgramInfo = (
532519
hasBias,
533520
applyActivation,
534521
[batchDims, A, B, output],
535-
[outerDimsA, outerDimsB, outerDims],
536522
isChannelsLast,
537523
);
538524
return `

js/web/lib/wasm/jsep/webgpu/ops/common.ts

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -996,27 +996,3 @@ class ShaderHelperImpl implements ShaderHelper {
996996

997997
export const createShaderHelper = (dispatchGroup: [number, number, number], limits: GPUSupportedLimits) =>
998998
new ShaderHelperImpl(dispatchGroup, limits);
999-
1000-
/**
1001-
* This function comes from https://github.com/tensorflow/tfjs/blob/master/tfjs-core/src/ops/broadcast_util.ts#L18-L40
1002-
* Returns the dimensions in the input shape that are broadcasted to
1003-
* produce the provided output shape.
1004-
*
1005-
* The returned dimensions are 0-indexed and sorted. An example:
1006-
* inShape = [4, 1, 3]
1007-
* outShape = [5, 4, 3, 3]
1008-
* result = [1]. Dimension 1 (2nd dimension of input) gets broadcasted 1 => 3.
1009-
*/
1010-
export const getBroadcastDims = (inShape: readonly number[], outShape: readonly number[]): number[] => {
1011-
const inRank = inShape.length;
1012-
const dims: number[] = [];
1013-
for (let i = 0; i < inRank; i++) {
1014-
const dim = inRank - 1 - i;
1015-
const a = inShape[dim] || 1;
1016-
const b = outShape[outShape.length - 1 - i] || 1;
1017-
if (b > 1 && a === 1) {
1018-
dims.unshift(dim);
1019-
}
1020-
}
1021-
return dims;
1022-
};

js/web/lib/wasm/jsep/webgpu/ops/conv.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import { computeConv3DInfo, createConv3DNaiveProgramInfo } from './3rd-party/con
1111
import { createMatmulProgramInfo } from './3rd-party/matmul_packed_webgpu';
1212
import { createGroupedConvProgramInfo, createGroupedConvVectorizeProgramInfo } from './conv-grouped';
1313
import { InternalActivationAttributes, parseInternalActivationAttributes } from './fuse-utils';
14-
import { createNaiveMatmulProgramInfo } from './matmul';
14+
import { createNaiveMatmulProgramInfo } from './matmul-shaders';
1515
import { createTransposeProgramInfo } from './transpose';
1616

1717
export const calculateOutputShape = (
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
import { DataType } from '../../../wasm-common';
5+
import { TensorView } from '../../tensor-view';
6+
import { ShapeUtil } from '../../util';
7+
import { ProgramInfo, ProgramUniform } from '../types';
8+
9+
import {
10+
createTensorShapeVariables,
11+
getElementAt,
12+
getMaxComponents,
13+
IndicesHelper,
14+
inputVariable,
15+
internalVariable,
16+
outputVariable,
17+
ShaderHelper,
18+
tensorTypeToWsglStorageType,
19+
UniformsArrayType,
20+
} from './common';
21+
import {
22+
appendActivationUniforms,
23+
appendActivationUniformsData,
24+
getActivationSnippet,
25+
InternalActivationAttributes,
26+
} from './fuse-utils';
27+
28+
// Helper that converts output batch indices to input batch indices using only the rank and
29+
// the shape information in uniform
30+
export const convertOutputBatchIndicesToInputBatchIndices = (
31+
targetIndicesName: string,
32+
inputVariable: IndicesHelper,
33+
inputBatchRank: number,
34+
outputBatchRank: number,
35+
batchIndicesName: string,
36+
) => {
37+
// Assumes outputBatchRank >= inputBatchRank; the first outputBatchRank - inputBatchRank
38+
// dimensions of the output batch indices should be ignored.
39+
const extendingInputRank = outputBatchRank - inputBatchRank;
40+
return `
41+
${Array.from({ length: inputBatchRank })
42+
.map(
43+
(_, i) => `
44+
if (${getElementAt(inputVariable.shape, i, inputVariable.rank)} != 1) {
45+
${inputVariable.indicesSet(targetIndicesName, i, getElementAt(batchIndicesName, i + extendingInputRank, outputBatchRank))}
46+
} else {
47+
${inputVariable.indicesSet(targetIndicesName, i, 0)}
48+
}`,
49+
)
50+
.join('')}
51+
`;
52+
};
53+
54+
export const createNaiveMatmulProgramInfo = (
55+
inputs: readonly TensorView[],
56+
activationAttributes: InternalActivationAttributes,
57+
outputShape: readonly number[],
58+
reshapedOutputShape?: readonly number[],
59+
isChannelsLast = false /* only used for conv2dByMatMul*/,
60+
squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
61+
): ProgramInfo => {
62+
const aShape = inputs[0].dims;
63+
const bShape = inputs[1].dims;
64+
65+
const M = aShape[aShape.length - 2];
66+
const N = bShape[bShape.length - 1];
67+
const K = aShape[aShape.length - 1];
68+
const components = getMaxComponents(N);
69+
const aComponents = getMaxComponents(K);
70+
const outputNumber = getMaxComponents(M);
71+
const outputSize = ShapeUtil.size(outputShape) / components / outputNumber;
72+
const hasBias = inputs.length > 2;
73+
const outerDims = reshapedOutputShape ? reshapedOutputShape.slice(0, -2) : outputShape.slice(0, -2);
74+
const batchSize = ShapeUtil.size(outerDims);
75+
const outputShapeInShader = [batchSize, M, N];
76+
77+
const programUniforms: ProgramUniform[] = [
78+
{ type: DataType.uint32, data: outputSize },
79+
{ type: DataType.uint32, data: M },
80+
{ type: DataType.uint32, data: N },
81+
{ type: DataType.uint32, data: K },
82+
];
83+
appendActivationUniformsData(activationAttributes, programUniforms);
84+
programUniforms.push(...createTensorShapeVariables(outerDims, aShape, bShape));
85+
if (hasBias) {
86+
programUniforms.push(...createTensorShapeVariables(inputs[2].dims));
87+
}
88+
programUniforms.push(...createTensorShapeVariables(outputShapeInShader));
89+
90+
const getShaderSource = (shaderHelper: ShaderHelper) => {
91+
const batchDims = internalVariable('batch_dims', inputs[0].dataType, outerDims.length);
92+
const a = inputVariable('a', inputs[0].dataType, aShape.length, aComponents);
93+
const b = inputVariable('b', inputs[1].dataType, bShape.length, components);
94+
const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components);
95+
const baseType = tensorTypeToWsglStorageType(output.type.tensor);
96+
const applyActivation = getActivationSnippet(activationAttributes, output.type.value, baseType);
97+
const inputVariables = [a, b];
98+
let processBias = '';
99+
if (hasBias) {
100+
const biasComponents = isChannelsLast ? components : 1;
101+
inputVariables.push(inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, biasComponents));
102+
processBias = `${
103+
isChannelsLast ? `value += bias[col / ${biasComponents}];` : `value += ${output.type.value}(bias[row + i]);`
104+
}`;
105+
}
106+
107+
const uniforms: UniformsArrayType = [
108+
{ name: 'output_size', type: 'u32' },
109+
{ name: 'M', type: 'u32' },
110+
{ name: 'N', type: 'u32' },
111+
{ name: 'K', type: 'u32' },
112+
];
113+
appendActivationUniforms(activationAttributes, uniforms);
114+
115+
const calcResult = (): string => {
116+
let calcStr = `var a_data: ${a.type.value};`;
117+
for (let i = 0; i < aComponents; i++) {
118+
calcStr += `
119+
let b_data${i} = b[(b_offset + (k + ${i}) * uniforms.N + col) / ${components}];`;
120+
}
121+
for (let i = 0; i < outputNumber; i++) {
122+
calcStr += `a_data = a[(a_offset + (row + ${i}) * uniforms.K + k) / ${aComponents}];`;
123+
124+
for (let j = 0; j < aComponents; j++) {
125+
calcStr += `
126+
values[${i}] = fma(${b.type.value}(a_data${aComponents === 1 ? '' : `[${j}]`}), b_data${j}, values[${i}]);\n`;
127+
}
128+
}
129+
return calcStr;
130+
};
131+
132+
return `
133+
${shaderHelper
134+
.registerUniforms(uniforms)
135+
.registerInternalVariables(batchDims)
136+
.declareVariables(...inputVariables, output)}
137+
${shaderHelper.mainStart()}
138+
${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
139+
let col = (global_idx % (uniforms.N / ${components})) * ${components};
140+
var index1 = global_idx / (uniforms.N / ${components});
141+
let stride1 = uniforms.M / ${outputNumber};
142+
let row = (index1 % stride1) * ${outputNumber};
143+
let batch = index1 / stride1;
144+
145+
${outputShape.length === 2 ? '' : `let batch_indices = ${batchDims.offsetToIndices('batch')};`}
146+
147+
var a_indices: ${a.type.indices};
148+
${convertOutputBatchIndicesToInputBatchIndices('a_indices', a, a.rank - 2, batchDims.rank, 'batch_indices')}
149+
${a.indicesSet('a_indices', a.rank - 2, 0)}
150+
${a.indicesSet('a_indices', a.rank - 1, 0)}
151+
let a_offset = ${a.indicesToOffset('a_indices')};
152+
153+
var b_indices: ${b.type.indices};
154+
${convertOutputBatchIndicesToInputBatchIndices('b_indices', b, b.rank - 2, batchDims.rank, 'batch_indices')}
155+
${b.indicesSet('b_indices', b.rank - 2, 0)}
156+
${b.indicesSet('b_indices', b.rank - 1, 0)}
157+
let b_offset = ${b.indicesToOffset('b_indices')};
158+
var values: array<${output.type.value}, ${outputNumber}>;
159+
for (var k: u32 = 0u; k < uniforms.K; k = k + ${aComponents}) {
160+
${calcResult()}
161+
}
162+
for (var i = 0u; i < ${outputNumber}u; i++) {
163+
var value = values[i];
164+
${processBias}
165+
${applyActivation}
166+
let cur_indices = ${output.type.indices}(batch, row + i, col);
167+
let offset = ${output.indicesToOffset('cur_indices')};
168+
${output.setByOffset(`offset / ${components}`, 'value')};
169+
}
170+
}
171+
`;
172+
};
173+
return {
174+
name: 'MatMulNaive',
175+
shaderCache: {
176+
hint: `${activationAttributes.activation};${components};${aComponents};${outputNumber};${isChannelsLast}`,
177+
inputDependencies: hasBias ? ['rank', 'rank', 'rank'] : ['rank', 'rank'],
178+
},
179+
getRunData: () => ({
180+
outputs: [
181+
{
182+
dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape,
183+
dataType: inputs[0].dataType,
184+
},
185+
],
186+
dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
187+
programUniforms,
188+
}),
189+
getShaderSource,
190+
};
191+
};

0 commit comments

Comments
 (0)