diff --git a/common/sampling.cpp b/common/sampling.cpp
index c69d525b5b358..372aab43a4508 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -325,9 +325,13 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
 }
 
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
-    // TODO: measure grammar performance
-
     if (gsmpl) {
+        // Print grammar sampler performance if available
+        if (gsmpl->grmr != nullptr) {
+            llama_perf_sampler_print(gsmpl->grmr);
+        }
+
+        // Print main sampling chain performance
         llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp
index 47c61b88164b8..e0c53519293f7 100644
--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@@ -2423,25 +2423,27 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
     // pointer to work space, used convert A from float to quantized type
     void * wdata = params->wdata;
 
-    //TODO: performance improvement: merge quant A
-    if (params->ith == 0) {
-        GGML_DISPATCH_QTYPES(TYPE, [&] {
-            const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
-            const size_t desired_wsize = M * row_size_A;
-            if (params->wsize < desired_wsize) {
-                GGML_ABORT("insufficient work space size");
-            }
+    // Parallelize quantization of matrix A to improve cache locality and utilize multiple threads
+    GGML_DISPATCH_QTYPES(TYPE, [&] {
+        const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
+        const size_t desired_wsize = M * row_size_A;
+        if (params->wsize < desired_wsize) {
+            GGML_ABORT("insufficient work space size");
+        }
 
-            // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
-            // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
-            GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
+        // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
+        // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
+        GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
 
-            const float * A_data = static_cast<const float *>(src1->data);
-            for (int m = 0; m < M; ++m) {
-                from_float(A_data + m * K, (char *)wdata + m * row_size_A, K);
-            }
-        });
-    }
+        const float * A_data = static_cast<const float *>(src1->data);
+        const int nth = params->nth;
+        const int ith = params->ith;
+
+        // Parallelize quantization across threads
+        for (int m = ith; m < M; m += nth) {
+            from_float(A_data + m * K, (char *)wdata + m * row_size_A, K);
+        }
+    });
 
     ggml_barrier(params->threadpool);
 
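The mmq.cpp change above replaces the single-threaded quantization of matrix A with a stride-nth partition of its rows across threads. The sketch below illustrates that partitioning pattern in isolation and assumes nothing about the ggml threadpool API: quantize_row stands in for from_float, and joining the threads stands in for ggml_barrier.

#include <cstdio>
#include <thread>
#include <vector>

// placeholder for from_float(): processes one row of K floats
static void quantize_row(const float * src, float * dst, int K) {
    for (int k = 0; k < K; ++k) {
        dst[k] = src[k];
    }
}

int main() {
    const int M = 8, K = 4, nth = 3;
    std::vector<float> A(M * K, 1.0f), W(M * K, 0.0f);

    std::vector<std::thread> workers;
    for (int ith = 0; ith < nth; ++ith) {
        workers.emplace_back([&, ith] {
            // each thread handles rows ith, ith + nth, ith + 2*nth, ...
            for (int m = ith; m < M; m += nth) {
                quantize_row(A.data() + m * K, W.data() + m * K, K);
            }
        });
    }
    for (auto & t : workers) {
        t.join(); // stands in for ggml_barrier(params->threadpool)
    }
    printf("W[0] = %f\n", W[0]);
    return 0;
}

Every thread touches a disjoint set of rows, so no synchronization is needed until the barrier.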
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 8235f69594391..ced737f95cf18 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -3534,14 +3534,47 @@ static void ggml_compute_forward_rms_norm_f32(
 
     GGML_ASSERT(eps >= 0.0f);
 
-    // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                 const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 
+                // SIMD-optimized sum of squares
                 ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                int64_t i00 = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+                for (; i00 + 15 < ne00; i00 += 16) {
+                    __m512 vx = _mm512_loadu_ps(x + i00);
+                    sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vx, vx));
+                }
+#elif defined(__AVX2__) && defined(__FMA__)
+                for (; i00 + 7 < ne00; i00 += 8) {
+                    __m256 vx = _mm256_loadu_ps(x + i00);
+                    __m256 vsq = _mm256_mul_ps(vx, vx);
+                    __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(vsq, 1),
+                                             _mm256_castps256_ps128(vsq));
+                    val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+                    val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+                    sum += (ggml_float)_mm_cvtss_f32(val2);
+                }
+#elif defined(__SSE2__)
+                for (; i00 + 3 < ne00; i00 += 4) {
+                    __m128 vx = _mm_loadu_ps(x + i00);
+                    __m128 vsq = _mm_mul_ps(vx, vx);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+                    vsq = _mm_add_ps(vsq, _mm_movehl_ps(vsq, vsq));
+                    vsq = _mm_add_ss(vsq, _mm_movehdup_ps(vsq));
+#else
+                    __m128 tmp = _mm_shuffle_ps(vsq, vsq, _MM_SHUFFLE(2, 3, 0, 1));
+                    vsq = _mm_add_ps(vsq, tmp);
+                    tmp = _mm_movehl_ps(tmp, vsq);
+                    vsq = _mm_add_ss(vsq, tmp);
+#endif
+                    sum += (ggml_float)_mm_cvtss_f32(vsq);
+                }
+#endif
+                // Scalar fallback for remaining elements
+                for (; i00 < ne00; i00++) {
                     sum += (ggml_float)(x[i00] * x[i00]);
                 }
 
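For reference, the loop vectorized above feeds the RMS normalization y_i = x_i / sqrt(mean(x^2) + eps): the SIMD code only accelerates the sum-of-squares reduction, and the scaling that follows is unchanged. A scalar sketch of the whole operation, with illustrative names rather than the ggml implementation:

#include <cmath>
#include <cstdio>

static void rms_norm_ref(const float * x, float * y, int n, float eps) {
    double sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (double) x[i] * x[i];     // the part the SIMD loop accelerates
    }
    const float scale = 1.0f / sqrtf((float)(sum / n) + eps);
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] * scale;             // apply the shared scale
    }
}

int main() {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}, y[4];
    rms_norm_ref(x, y, 4, 1e-6f);
    printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}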
@@ -3603,7 +3636,6 @@ static void ggml_compute_forward_rms_norm_back_f32(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
@@ -3615,10 +3647,61 @@ static void ggml_compute_forward_rms_norm_back_f32(
                 const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
                 const float * x  = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
 
+                // SIMD-optimized sum of squares and dot product
                 ggml_float sum_xx  = 0.0;
                 ggml_float sum_xdz = 0.0;
-
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                int64_t i00 = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+                for (; i00 + 15 < ne00; i00 += 16) {
+                    __m512 vx = _mm512_loadu_ps(x + i00);
+                    __m512 vdz = _mm512_loadu_ps(dz + i00);
+                    sum_xx  += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vx, vx));
+                    sum_xdz += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vx, vdz));
+                }
+#elif defined(__AVX2__) && defined(__FMA__)
+                for (; i00 + 7 < ne00; i00 += 8) {
+                    __m256 vx = _mm256_loadu_ps(x + i00);
+                    __m256 vdz = _mm256_loadu_ps(dz + i00);
+                    __m256 vsq = _mm256_mul_ps(vx, vx);
+                    __m256 vdot = _mm256_mul_ps(vx, vdz);
+                    __m128 val2_sq = _mm_add_ps(_mm256_extractf128_ps(vsq, 1),
+                                                _mm256_castps256_ps128(vsq));
+                    __m128 val2_dot = _mm_add_ps(_mm256_extractf128_ps(vdot, 1),
+                                                 _mm256_castps256_ps128(vdot));
+                    val2_sq  = _mm_add_ps(val2_sq, _mm_movehl_ps(val2_sq, val2_sq));
+                    val2_dot = _mm_add_ps(val2_dot, _mm_movehl_ps(val2_dot, val2_dot));
+                    val2_sq  = _mm_add_ss(val2_sq, _mm_movehdup_ps(val2_sq));
+                    val2_dot = _mm_add_ss(val2_dot, _mm_movehdup_ps(val2_dot));
+                    sum_xx  += (ggml_float)_mm_cvtss_f32(val2_sq);
+                    sum_xdz += (ggml_float)_mm_cvtss_f32(val2_dot);
+                }
+#elif defined(__SSE2__)
+                for (; i00 + 3 < ne00; i00 += 4) {
+                    __m128 vx = _mm_loadu_ps(x + i00);
+                    __m128 vdz = _mm_loadu_ps(dz + i00);
+                    __m128 vsq = _mm_mul_ps(vx, vx);
+                    __m128 vdot = _mm_mul_ps(vx, vdz);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+                    vsq  = _mm_add_ps(vsq, _mm_movehl_ps(vsq, vsq));
+                    vdot = _mm_add_ps(vdot, _mm_movehl_ps(vdot, vdot));
+                    vsq  = _mm_add_ss(vsq, _mm_movehdup_ps(vsq));
+                    vdot = _mm_add_ss(vdot, _mm_movehdup_ps(vdot));
+#else
+                    __m128 tmp = _mm_shuffle_ps(vsq, vsq, _MM_SHUFFLE(2, 3, 0, 1));
+                    vsq = _mm_add_ps(vsq, tmp);
+                    tmp = _mm_movehl_ps(tmp, vsq);
+                    vsq = _mm_add_ss(vsq, tmp);
+                    tmp = _mm_shuffle_ps(vdot, vdot, _MM_SHUFFLE(2, 3, 0, 1));
+                    vdot = _mm_add_ps(vdot, tmp);
+                    tmp = _mm_movehl_ps(tmp, vdot);
+                    vdot = _mm_add_ss(vdot, tmp);
+#endif
+                    sum_xx  += (ggml_float)_mm_cvtss_f32(vsq);
+                    sum_xdz += (ggml_float)_mm_cvtss_f32(vdot);
+                }
+#endif
+                // Scalar fallback for remaining elements
+                for (; i00 < ne00; i00++) {
                     sum_xx  += (ggml_float)(x[i00] * x[i00]);
                     sum_xdz += (ggml_float)(x[i00] * dz[i00]);
                 }
@@ -3775,8 +3858,6 @@ static void ggml_compute_forward_group_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    // TODO: optimize
-
     float eps;
     memcpy(&eps, dst->op_params + 1, sizeof(float));
 
@@ -3797,8 +3878,39 @@ static void ggml_compute_forward_group_norm_f32(
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
 
+                // SIMD-optimized sum
                 ggml_float sumr = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                int64_t i00 = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+                for (; i00 + 15 < ne00; i00 += 16) {
+                    sumr += (ggml_float)_mm512_reduce_add_ps(_mm512_loadu_ps(x + i00));
+                }
+#elif defined(__AVX2__) && defined(__FMA__)
+                for (; i00 + 7 < ne00; i00 += 8) {
+                    __m256 vx = _mm256_loadu_ps(x + i00);
+                    __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(vx, 1),
+                                             _mm256_castps256_ps128(vx));
+                    val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+                    val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+                    sumr += (ggml_float)_mm_cvtss_f32(val2);
+                }
+#elif defined(__SSE2__)
+                for (; i00 + 3 < ne00; i00 += 4) {
+                    __m128 vx = _mm_loadu_ps(x + i00);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+                    vx = _mm_add_ps(vx, _mm_movehl_ps(vx, vx));
+                    vx = _mm_add_ss(vx, _mm_movehdup_ps(vx));
+#else
+                    __m128 tmp = _mm_shuffle_ps(vx, vx, _MM_SHUFFLE(2, 3, 0, 1));
+                    vx = _mm_add_ps(vx, tmp);
+                    tmp = _mm_movehl_ps(tmp, vx);
+                    vx = _mm_add_ss(vx, tmp);
+#endif
+                    sumr += (ggml_float)_mm_cvtss_f32(vx);
+                }
+#endif
+                // Scalar fallback
+                for (; i00 < ne00; i00++) {
                     sumr += (ggml_float)x[i00];
                 }
                 sum += sumr;
@@ -3813,8 +3925,51 @@ static void ggml_compute_forward_group_norm_f32(
                 float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
 
+                // SIMD-optimized sum of squares after subtracting mean
                 ggml_float sumr = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                int64_t i00 = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+                const __m512 vmean512 = _mm512_set1_ps(mean);
+                for (; i00 + 15 < ne00; i00 += 16) {
+                    __m512 vx = _mm512_loadu_ps(x + i00);
+                    __m512 vdiff = _mm512_sub_ps(vx, vmean512);
+                    _mm512_storeu_ps(y + i00, vdiff);
+                    sumr += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vdiff, vdiff));
+                }
+#elif defined(__AVX2__) && defined(__FMA__)
+                const __m256 vmean256 = _mm256_set1_ps(mean);
+                for (; i00 + 7 < ne00; i00 += 8) {
+                    __m256 vx = _mm256_loadu_ps(x + i00);
+                    __m256 vdiff = _mm256_sub_ps(vx, vmean256);
+                    _mm256_storeu_ps(y + i00, vdiff);
+                    __m256 vsq = _mm256_mul_ps(vdiff, vdiff);
+                    __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(vsq, 1),
+                                             _mm256_castps256_ps128(vsq));
+                    val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+                    val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+                    sumr += (ggml_float)_mm_cvtss_f32(val2);
+                }
+#elif defined(__SSE2__)
+                const __m128 vmean128 = _mm_set1_ps(mean);
+                for (; i00 + 3 < ne00; i00 += 4) {
+                    __m128 vx = _mm_loadu_ps(x + i00);
+                    __m128 vdiff = _mm_sub_ps(vx, vmean128);
+                    _mm_storeu_ps(y + i00, vdiff);
+                    __m128 vsq = _mm_mul_ps(vdiff, vdiff);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+                    vsq = _mm_add_ps(vsq, _mm_movehl_ps(vsq, vsq));
+                    vsq = _mm_add_ss(vsq, _mm_movehdup_ps(vsq));
+#else
+                    __m128 tmp = _mm_shuffle_ps(vsq, vsq, _MM_SHUFFLE(2, 3, 0, 1));
+                    vsq = _mm_add_ps(vsq, tmp);
+                    tmp = _mm_movehl_ps(tmp, vsq);
+                    vsq = _mm_add_ss(vsq, tmp);
+#endif
+                    sumr += (ggml_float)_mm_cvtss_f32(vsq);
+                }
+#endif
+                // Scalar fallback
+                for (; i00 < ne00; i00++) {
                     float v = x[i00] - mean;
                     y[i00] = v;
                     sumr += (ggml_float)(v * v);
                 }
@@ -3875,14 +4030,47 @@ static void ggml_compute_forward_l2_norm_f32(
 
     GGML_ASSERT(eps >= 0.0f);
 
-    // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                 const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
 
+                // SIMD-optimized sum of squares
                 ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
+                int64_t i00 = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+                for (; i00 + 15 < ne00; i00 += 16) {
+                    __m512 vx = _mm512_loadu_ps(x + i00);
+                    sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vx, vx));
+                }
+#elif defined(__AVX2__) && defined(__FMA__)
+                for (; i00 + 7 < ne00; i00 += 8) {
+                    __m256 vx = _mm256_loadu_ps(x + i00);
+                    __m256 vsq = _mm256_mul_ps(vx, vx);
+                    __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(vsq, 1),
+                                             _mm256_castps256_ps128(vsq));
+                    val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+                    val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+                    sum += (ggml_float)_mm_cvtss_f32(val2);
+                }
+#elif defined(__SSE2__)
+                for (; i00 + 3 < ne00; i00 += 4) {
+                    __m128 vx = _mm_loadu_ps(x + i00);
+                    __m128 vsq = _mm_mul_ps(vx, vx);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+                    vsq = _mm_add_ps(vsq, _mm_movehl_ps(vsq, vsq));
+                    vsq = _mm_add_ss(vsq, _mm_movehdup_ps(vsq));
+#else
+                    __m128 tmp = _mm_shuffle_ps(vsq, vsq, _MM_SHUFFLE(2, 3, 0, 1));
+                    vsq = _mm_add_ps(vsq, tmp);
+                    tmp = _mm_movehl_ps(tmp, vsq);
+                    vsq = _mm_add_ss(vsq, tmp);
+#endif
+                    sum += (ggml_float)_mm_cvtss_f32(vsq);
+                }
+#endif
+                // Scalar fallback for remaining elements
+                for (; i00 < ne00; i00++) {
                     sum += (ggml_float)(x[i00] * x[i00]);
                 }
 
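The SSE/AVX tails above all end with the same horizontal-add sequence. If that repetition ever becomes a maintenance burden, it could be factored into small helpers along these lines; this is a sketch to be compiled with AVX2 enabled (e.g. -mavx2), not an existing ggml utility:

#include <immintrin.h>
#include <cstdio>

// horizontal sum of a __m128: [a b c d] -> a+b+c+d
static inline float hsum_ps_128(__m128 v) {
    v = _mm_add_ps(v, _mm_movehl_ps(v, v));   // [a+c, b+d, ., .]
    v = _mm_add_ss(v, _mm_movehdup_ps(v));    // [a+b+c+d, ., ., .]
    return _mm_cvtss_f32(v);
}

// horizontal sum of a __m256: fold the upper 128-bit lane onto the lower one first
static inline float hsum_ps_256(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);
    __m128 hi = _mm256_extractf128_ps(v, 1);
    return hsum_ps_128(_mm_add_ps(lo, hi));
}

int main() {
    float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    printf("%f\n", hsum_ps_256(_mm256_loadu_ps(a))); // prints 36.000000
    return 0;
}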
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 8421c84ce0942..d4d99499392d1 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -55,7 +55,6 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
 
     block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
 
-    // scalar
     const int blck_size_interleave = 4;
     float srcv[4][QK8_0];
     float id[4];
@@ -63,9 +62,44 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
     for (int i = 0; i < nb; i++) {
         for (int row_iter = 0; row_iter < 4; row_iter++) {
             float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+            const float * src_row = x + row_iter * k + i * QK8_0;
+
+            // SIMD-optimized absolute max computation
+            int j = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+            __m512 vamax = _mm512_setzero_ps();
+            for (; j + 15 < QK8_0; j += 16) {
+                __m512 vx = _mm512_loadu_ps(src_row + j);
+                vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(_mm512_set1_ps(-0.0f), vx));
+            }
+            amax = _mm512_reduce_max_ps(vamax);
+#elif defined(__AVX2__) && defined(__FMA__)
+            __m256 vamax = _mm256_setzero_ps();
+            for (; j + 7 < QK8_0; j += 8) {
+                __m256 vx = _mm256_loadu_ps(src_row + j);
+                vamax = _mm256_max_ps(vamax, _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx));
+            }
+            __m128 vamax128 = _mm_max_ps(_mm256_extractf128_ps(vamax, 1), _mm256_castps256_ps128(vamax));
+            vamax128 = _mm_max_ps(vamax128, _mm_movehl_ps(vamax128, vamax128));
+            vamax128 = _mm_max_ss(vamax128, _mm_movehdup_ps(vamax128));
+            amax = _mm_cvtss_f32(vamax128);
+#elif defined(__SSE2__)
+            __m128 vamax = _mm_setzero_ps();
+            for (; j + 3 < QK8_0; j += 4) {
+                __m128 vx = _mm_loadu_ps(src_row + j);
+                vamax = _mm_max_ps(vamax, _mm_andnot_ps(_mm_set1_ps(-0.0f), vx));
+            }
+            vamax = _mm_max_ps(vamax, _mm_movehl_ps(vamax, vamax));
+            vamax = _mm_max_ss(vamax, _mm_movehdup_ps(vamax));
+            amax = _mm_cvtss_f32(vamax);
+#endif
+            // Copy all elements to srcv and handle remaining elements
+            for (int j2 = 0; j2 < j; j2++) {
+                srcv[row_iter][j2] = src_row[j2];
+            }
+            // Scalar fallback for remaining elements
+            for (; j < QK8_0; j++) {
+                srcv[row_iter][j] = src_row[j];
                 amax = MAX(amax, fabsf(srcv[row_iter][j]));
             }
 
@@ -93,7 +127,6 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
 
     block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
 
-    // scalar
     const int blck_size_interleave = 8;
     float srcv[4][QK8_0];
     float id[4];
@@ -101,9 +134,44 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
     for (int i = 0; i < nb; i++) {
         for (int row_iter = 0; row_iter < 4; row_iter++) {
             float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+            const float * src_row = x + row_iter * k + i * QK8_0;
+
+            // SIMD-optimized absolute max computation
+            int j = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+            __m512 vamax = _mm512_setzero_ps();
+            for (; j + 15 < QK8_0; j += 16) {
+                __m512 vx = _mm512_loadu_ps(src_row + j);
+                vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(_mm512_set1_ps(-0.0f), vx));
+            }
+            amax = _mm512_reduce_max_ps(vamax);
+#elif defined(__AVX2__) && defined(__FMA__)
+            __m256 vamax = _mm256_setzero_ps();
+            for (; j + 7 < QK8_0; j += 8) {
+                __m256 vx = _mm256_loadu_ps(src_row + j);
+                vamax = _mm256_max_ps(vamax, _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx));
+            }
+            __m128 vamax128 = _mm_max_ps(_mm256_extractf128_ps(vamax, 1), _mm256_castps256_ps128(vamax));
+            vamax128 = _mm_max_ps(vamax128, _mm_movehl_ps(vamax128, vamax128));
+            vamax128 = _mm_max_ss(vamax128, _mm_movehdup_ps(vamax128));
+            amax = _mm_cvtss_f32(vamax128);
+#elif defined(__SSE2__)
+            __m128 vamax = _mm_setzero_ps();
+            for (; j + 3 < QK8_0; j += 4) {
+                __m128 vx = _mm_loadu_ps(src_row + j);
+                vamax = _mm_max_ps(vamax, _mm_andnot_ps(_mm_set1_ps(-0.0f), vx));
+            }
+            vamax = _mm_max_ps(vamax, _mm_movehl_ps(vamax, vamax));
+            vamax = _mm_max_ss(vamax, _mm_movehdup_ps(vamax));
+            amax = _mm_cvtss_f32(vamax);
+#endif
+            // Copy all elements to srcv and handle remaining elements
+            for (int j2 = 0; j2 < j; j2++) {
+                srcv[row_iter][j2] = src_row[j2];
+            }
+            // Scalar fallback for remaining elements
+            for (; j < QK8_0; j++) {
+                srcv[row_iter][j] = src_row[j];
                 amax = MAX(amax, fabsf(srcv[row_iter][j]));
             }
 
@@ -131,7 +199,6 @@ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GG
 
     block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
 
-    // scalar
     const int blck_size_interleave = 8;
     float srcv[4][QK_K];
     float iscale[4];
@@ -140,12 +207,50 @@ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GG
         for (int row_iter = 0; row_iter < 4; row_iter++) {
             float amax = 0.0f; // absolute max
             float max = 0;
-
-            for (int j = 0; j < QK_K; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
-                // Update the maximum value of the corresponding super block
-                if(amax < fabsf(srcv[row_iter][j])) {
-                    amax = fabsf(srcv[row_iter][j]);
+            const float * src_row = x + row_iter * k + i * QK_K;
+
+            // SIMD-optimized absolute max computation
+            int j = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+            __m512 vamax = _mm512_setzero_ps();
+            for (; j + 15 < QK_K; j += 16) {
+                __m512 vx = _mm512_loadu_ps(src_row + j);
+                vamax = _mm512_max_ps(vamax, _mm512_andnot_ps(_mm512_set1_ps(-0.0f), vx));
+            }
+            amax = _mm512_reduce_max_ps(vamax);
+#elif defined(__AVX2__) && defined(__FMA__)
+            __m256 vamax = _mm256_setzero_ps();
+            for (; j + 7 < QK_K; j += 8) {
+                __m256 vx = _mm256_loadu_ps(src_row + j);
+                vamax = _mm256_max_ps(vamax, _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx));
+            }
+            __m128 vamax128 = _mm_max_ps(_mm256_extractf128_ps(vamax, 1), _mm256_castps256_ps128(vamax));
+            vamax128 = _mm_max_ps(vamax128, _mm_movehl_ps(vamax128, vamax128));
+            vamax128 = _mm_max_ss(vamax128, _mm_movehdup_ps(vamax128));
+            amax = _mm_cvtss_f32(vamax128);
+#elif defined(__SSE2__)
+            __m128 vamax = _mm_setzero_ps();
+            for (; j + 3 < QK_K; j += 4) {
+                __m128 vx = _mm_loadu_ps(src_row + j);
+                vamax = _mm_max_ps(vamax, _mm_andnot_ps(_mm_set1_ps(-0.0f), vx));
+            }
+            vamax = _mm_max_ps(vamax, _mm_movehl_ps(vamax, vamax));
+            vamax = _mm_max_ss(vamax, _mm_movehdup_ps(vamax));
+            amax = _mm_cvtss_f32(vamax);
+#endif
+            // Copy all elements to srcv and find max for SIMD-processed elements
+            for (int j2 = 0; j2 < j; j2++) {
+                srcv[row_iter][j2] = src_row[j2];
+                if (fabsf(src_row[j2]) == amax && max == 0) {
+                    max = src_row[j2];
+                }
+            }
+            // Scalar fallback for remaining elements and find max
+            for (; j < QK_K; j++) {
+                srcv[row_iter][j] = src_row[j];
+                float abs_val = fabsf(srcv[row_iter][j]);
+                if (amax < abs_val) {
+                    amax = abs_val;
                     max = srcv[row_iter][j];
                 }
             }
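Each quantizer above starts by computing the absolute maximum of a row, clearing the sign bit with andnot(-0.0f, x) and max-reducing the lanes. A self-contained illustration of that reduction, assuming AVX2 is available (the function name is hypothetical, not part of ggml):

#include <immintrin.h>
#include <cmath>
#include <cstdio>

static float absmax_avx2(const float * x, int n) {
    int i = 0;
    __m256 vmax = _mm256_setzero_ps();
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);                  // only the sign bit set
    for (; i + 7 < n; i += 8) {
        __m256 vx = _mm256_loadu_ps(x + i);
        vmax = _mm256_max_ps(vmax, _mm256_andnot_ps(sign_mask, vx)); // lane-wise |x|
    }
    __m128 m = _mm_max_ps(_mm256_extractf128_ps(vmax, 1), _mm256_castps256_ps128(vmax));
    m = _mm_max_ps(m, _mm_movehl_ps(m, m));
    m = _mm_max_ss(m, _mm_movehdup_ps(m));
    float amax = _mm_cvtss_f32(m);
    for (; i < n; ++i) {
        amax = fmaxf(amax, fabsf(x[i]));                             // scalar tail
    }
    return amax;
}

int main() {
    float x[10] = {0.5f, -3.0f, 1.0f, 2.5f, -0.1f, 0.0f, 1.5f, -2.0f, 4.0f, -0.5f};
    printf("%f\n", absmax_avx2(x, 10)); // prints 4.000000
    return 0;
}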
diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
index 43dc7537c3307..7d4d19a8aed4e 100644
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -407,8 +407,6 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
 ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) {
     int i = 0;
     ggml_float sum = 0;
-// TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
-// ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
 #if defined(__AVX512F__) && defined(__AVX512DQ__)
     for (; i + 15 < n; i += 16) {
         __m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
@@ -416,6 +414,32 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
         _mm512_storeu_ps(y + i, val);
         sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
     }
+    // Process remaining elements with AVX2 (8 elements at a time)
+    #if defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                   _mm256_set1_ps(mean));
+        _mm256_storeu_ps(y + i, val);
+        val = _mm256_mul_ps(val,val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+    #endif
+    // Process remaining elements with SSE (4 elements at a time)
+    #if defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
+                                _mm_set1_ps(mean));
+        _mm_storeu_ps(y + i, val);
+        val = _mm_mul_ps(val, val);
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+    #endif
 #elif defined(__AVX2__) && defined(__FMA__)
     for (; i + 7 < n; i += 8) {
         __m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
@@ -428,6 +452,18 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
         val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
         sum += (ggml_float)_mm_cvtss_f32(val2);
     }
+    // Process remaining elements with SSE (4 elements at a time)
+    #if defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
+                                _mm_set1_ps(mean));
+        _mm_storeu_ps(y + i, val);
+        val = _mm_mul_ps(val, val);
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+    #endif
 #elif defined(__SSE2__)
     for (; i + 3 < n; i += 4) {
         __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
@@ -461,6 +497,7 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
         sum += (ggml_float)vec_hsum_f32x4(val);
     }
 #endif
+    // Process remaining elements with scalar code
     for (; i < n; ++i) {
         float val = x[i] - mean;
         y[i] = val;
@@ -553,3 +590,189 @@ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, fl
     }
     return sum = (ggml_float)logf(sum);
 }
+
+void ggml_vec_hardswish_f32(const int n, float * y, const float * x) {
+    // hardswish(x) = x * min(1, max(0, (x + 3) / 6))
+    const float three = 3.0f;
+    const float six = 6.0f;
+    const float one = 1.0f;
+    const float zero = 0.0f;
+
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    const __m512 v_three = _mm512_set1_ps(three);
+    const __m512 v_six = _mm512_set1_ps(six);
+    const __m512 v_one = _mm512_set1_ps(one);
+    const __m512 v_zero = _mm512_set1_ps(zero);
+
+    for (; i + 15 < n; i += 16) {
+        __m512 vx = _mm512_loadu_ps(x + i);
+        __m512 vx_plus_3 = _mm512_add_ps(vx, v_three);
+        __m512 v_div_6 = _mm512_div_ps(vx_plus_3, v_six);
+        __m512 v_clamped = _mm512_max_ps(v_zero, _mm512_min_ps(v_one, v_div_6));
+        __m512 result = _mm512_mul_ps(vx, v_clamped);
+        _mm512_storeu_ps(y + i, result);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    const __m256 v_three = _mm256_set1_ps(three);
+    const __m256 v_six = _mm256_set1_ps(six);
+    const __m256 v_one = _mm256_set1_ps(one);
+    const __m256 v_zero = _mm256_set1_ps(zero);
+
+    for (; i + 7 < n; i += 8) {
+        __m256 vx = _mm256_loadu_ps(x + i);
+        __m256 vx_plus_3 = _mm256_add_ps(vx, v_three);
+        __m256 v_div_6 = _mm256_div_ps(vx_plus_3, v_six);
+        __m256 v_clamped = _mm256_max_ps(v_zero, _mm256_min_ps(v_one, v_div_6));
+        __m256 result = _mm256_mul_ps(vx, v_clamped);
+        _mm256_storeu_ps(y + i, result);
+    }
+#elif defined(__SSE2__)
+    const __m128 v_three = _mm_set1_ps(three);
+    const __m128 v_six = _mm_set1_ps(six);
+    const __m128 v_one = _mm_set1_ps(one);
+    const __m128 v_zero = _mm_set1_ps(zero);
+
+    for (; i + 3 < n; i += 4) {
+        __m128 vx = _mm_loadu_ps(x + i);
+        __m128 vx_plus_3 = _mm_add_ps(vx, v_three);
+        __m128 v_div_6 = _mm_div_ps(vx_plus_3, v_six);
+        __m128 v_clamped = _mm_max_ps(v_zero, _mm_min_ps(v_one, v_div_6));
+        __m128 result = _mm_mul_ps(vx, v_clamped);
+        _mm_storeu_ps(y + i, result);
+    }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    const svfloat32_t v_three = svdup_n_f32(three);
+    const svfloat32_t v_six = svdup_n_f32(six);
+    const svfloat32_t v_one = svdup_n_f32(one);
+    const svfloat32_t v_zero = svdup_n_f32(zero);
+
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svfloat32_t vx = svld1_f32(pg, x + i);
+        svfloat32_t vx_plus_3 = svadd_f32_x(pg, vx, v_three);
+        svfloat32_t v_div_6 = svdiv_f32_x(pg, vx_plus_3, v_six);
+        svfloat32_t v_clamped = svmax_f32_x(pg, v_zero, svmin_f32_x(pg, v_one, v_div_6));
+        svfloat32_t result = svmul_f32_x(pg, vx, v_clamped);
+        svst1_f32(pg, y + i, result);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    const float32x4_t v_three = vdupq_n_f32(three);
+    const float32x4_t v_six = vdupq_n_f32(six);
+    const float32x4_t v_one = vdupq_n_f32(one);
+    const float32x4_t v_zero = vdupq_n_f32(zero);
+
+    for (; i + 3 < n; i += 4) {
+        float32x4_t vx = vld1q_f32(x + i);
+        float32x4_t vx_plus_3 = vaddq_f32(vx, v_three);
+        float32x4_t v_div_6 = vdivq_f32(vx_plus_3, v_six);
+        float32x4_t v_clamped = vmaxq_f32(v_zero, vminq_f32(v_one, v_div_6));
+        float32x4_t result = vmulq_f32(vx, v_clamped);
+        vst1q_f32(y + i, result);
+    }
+#elif defined(__riscv_v_intrinsic)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat32m2_t vx_plus_3 = __riscv_vfadd_vf_f32m2(vx, three, vl);
+        vfloat32m2_t v_div_6 = __riscv_vfdiv_vf_f32m2(vx_plus_3, six, vl);
+        vfloat32m2_t v_clamped = __riscv_vfmax_vf_f32m2(__riscv_vfmin_vf_f32m2(v_div_6, one, vl), zero, vl);
+        vfloat32m2_t result = __riscv_vfmul_vv_f32m2(vx, v_clamped, vl);
+        __riscv_vse32_v_f32m2(&y[i], result, vl);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = x[i] * fminf(one, fmaxf(zero, (x[i] + three) / six));
+    }
+}
+
+void ggml_vec_hardsigmoid_f32(const int n, float * y, const float * x) {
+    // hardsigmoid(x) = min(1, max(0, (x + 3) / 6))
+    const float three = 3.0f;
+    const float six = 6.0f;
+    const float one = 1.0f;
+    const float zero = 0.0f;
+
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    const __m512 v_three = _mm512_set1_ps(three);
+    const __m512 v_six = _mm512_set1_ps(six);
+    const __m512 v_one = _mm512_set1_ps(one);
+    const __m512 v_zero = _mm512_set1_ps(zero);
+
+    for (; i + 15 < n; i += 16) {
+        __m512 vx = _mm512_loadu_ps(x + i);
+        __m512 vx_plus_3 = _mm512_add_ps(vx, v_three);
+        __m512 v_div_6 = _mm512_div_ps(vx_plus_3, v_six);
+        __m512 result = _mm512_max_ps(v_zero, _mm512_min_ps(v_one, v_div_6));
+        _mm512_storeu_ps(y + i, result);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    const __m256 v_three = _mm256_set1_ps(three);
+    const __m256 v_six = _mm256_set1_ps(six);
+    const __m256 v_one = _mm256_set1_ps(one);
+    const __m256 v_zero = _mm256_set1_ps(zero);
+
+    for (; i + 7 < n; i += 8) {
+        __m256 vx = _mm256_loadu_ps(x + i);
+        __m256 vx_plus_3 = _mm256_add_ps(vx, v_three);
+        __m256 v_div_6 = _mm256_div_ps(vx_plus_3, v_six);
+        __m256 result = _mm256_max_ps(v_zero, _mm256_min_ps(v_one, v_div_6));
+        _mm256_storeu_ps(y + i, result);
+    }
+#elif defined(__SSE2__)
+    const __m128 v_three = _mm_set1_ps(three);
+    const __m128 v_six = _mm_set1_ps(six);
+    const __m128 v_one = _mm_set1_ps(one);
+    const __m128 v_zero = _mm_set1_ps(zero);
+
+    for (; i + 3 < n; i += 4) {
+        __m128 vx = _mm_loadu_ps(x + i);
+        __m128 vx_plus_3 = _mm_add_ps(vx, v_three);
+        __m128 v_div_6 = _mm_div_ps(vx_plus_3, v_six);
+        __m128 result = _mm_max_ps(v_zero, _mm_min_ps(v_one, v_div_6));
+        _mm_storeu_ps(y + i, result);
+    }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    const svfloat32_t v_three = svdup_n_f32(three);
+    const svfloat32_t v_six = svdup_n_f32(six);
+    const svfloat32_t v_one = svdup_n_f32(one);
+    const svfloat32_t v_zero = svdup_n_f32(zero);
+
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svfloat32_t vx = svld1_f32(pg, x + i);
+        svfloat32_t vx_plus_3 = svadd_f32_x(pg, vx, v_three);
+        svfloat32_t v_div_6 = svdiv_f32_x(pg, vx_plus_3, v_six);
+        svfloat32_t result = svmax_f32_x(pg, v_zero, svmin_f32_x(pg, v_one, v_div_6));
+        svst1_f32(pg, y + i, result);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    const float32x4_t v_three = vdupq_n_f32(three);
+    const float32x4_t v_six = vdupq_n_f32(six);
+    const float32x4_t v_one = vdupq_n_f32(one);
+    const float32x4_t v_zero = vdupq_n_f32(zero);
+
+    for (; i + 3 < n; i += 4) {
+        float32x4_t vx = vld1q_f32(x + i);
+        float32x4_t vx_plus_3 = vaddq_f32(vx, v_three);
+        float32x4_t v_div_6 = vdivq_f32(vx_plus_3, v_six);
+        float32x4_t result = vmaxq_f32(v_zero, vminq_f32(v_one, v_div_6));
+        vst1q_f32(y + i, result);
+    }
+#elif defined(__riscv_v_intrinsic)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat32m2_t vx_plus_3 = __riscv_vfadd_vf_f32m2(vx, three, vl);
+        vfloat32m2_t v_div_6 = __riscv_vfdiv_vf_f32m2(vx_plus_3, six, vl);
+        vfloat32m2_t result = __riscv_vfmax_vf_f32m2(__riscv_vfmin_vf_f32m2(v_div_6, one, vl), zero, vl);
+        __riscv_vse32_v_f32m2(&y[i], result, vl);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = fminf(one, fmaxf(zero, (x[i] + three) / six));
+    }
+}
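The vectorized activations added above implement hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) and hardswish(x) = x * hardsigmoid(x). A scalar reference like the following is convenient for validating the SIMD paths against; it is a sketch, not part of the ggml API:

#include <algorithm>
#include <cstdio>

static float hardsigmoid_ref(float x) {
    return std::min(1.0f, std::max(0.0f, (x + 3.0f) / 6.0f));
}

static float hardswish_ref(float x) {
    return x * hardsigmoid_ref(x);
}

int main() {
    for (float x : {-4.0f, -1.0f, 0.0f, 1.0f, 4.0f}) {
        printf("x = %5.1f  hardsigmoid = %f  hardswish = %f\n",
               x, hardsigmoid_ref(x), hardswish_ref(x));
    }
    return 0;
}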
diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index 65c7dfb6b9a49..a5bb74d364931 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -44,6 +44,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
 void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 
 void ggml_vec_silu_f32(const int n, float * y, const float * x);
+void ggml_vec_hardswish_f32(const int n, float * y, const float * x);
+void ggml_vec_hardsigmoid_f32(const int n, float * y, const float * x);
 ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
 ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
@@ -844,17 +846,40 @@ inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggm
         y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
     }
 }
-// TODO: optimize performance
-inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+// ggml_vec_hardswish_f32 is implemented as a non-inline function in vec.cpp
 inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int i = 0; i < n; ++i) {
+    float tmp_f32[256];
+    float tmp_f32_out[256];
+    int i = 0;
+    for (; i + 256 <= n; i += 256) {
+        for (int j = 0; j < 256; ++j) {
+            tmp_f32[j] = GGML_CPU_FP16_TO_FP32(x[i + j]);
+        }
+        ggml_vec_hardswish_f32(256, tmp_f32_out, tmp_f32);
+        for (int j = 0; j < 256; ++j) {
+            y[i + j] = GGML_CPU_FP32_TO_FP16(tmp_f32_out[j]);
+        }
+    }
+    for (; i < n; ++i) {
         float v = GGML_CPU_FP16_TO_FP32(x[i]);
         y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
     }
 }
-inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+// ggml_vec_hardsigmoid_f32 is implemented as a non-inline function in vec.cpp
 inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    for (int i = 0; i < n; ++i) {
+    float tmp_f32[256];
+    float tmp_f32_out[256];
+    int i = 0;
+    for (; i + 256 <= n; i += 256) {
+        for (int j = 0; j < 256; ++j) {
+            tmp_f32[j] = GGML_CPU_FP16_TO_FP32(x[i + j]);
+        }
+        ggml_vec_hardsigmoid_f32(256, tmp_f32_out, tmp_f32);
+        for (int j = 0; j < 256; ++j) {
+            y[i + j] = GGML_CPU_FP32_TO_FP16(tmp_f32_out[j]);
+        }
+    }
+    for (; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
     }
 }
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index e26385a1feaf1..f5022a2b2c77e 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1249,56 +1249,61 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     //   xxxxx-----
     //   xxxxx-----
     // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
-    // TODO: optimize this section
+    // Optimized: reordered conditions for better branch prediction and cached frequently accessed values
     for (uint32_t h = 0; h < 1; ++h) {
         for (uint32_t s = 0; s < n_stream; ++s) {
             for (uint32_t ii = 0; ii < n_tps; ++ii) {
                 const uint32_t i = s*n_tps + ii;
 
                 const llama_seq_id seq_id = ubatch->seq_id[i][0];
-
                 const auto & cells = v_cells[seq_to_stream[seq_id]];
-
                 const llama_pos p1 = ubatch->pos[i];
 
-                // for M-RoPE
+                // for M-RoPE - cache these values outside the inner loop
                 const bool is_2d = ubatch->is_pos_2d();
                 const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
                 const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens]   : 0;
 
                 const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
+                float * dst_row = data + idst;
 
+                // Optimize inner loop: reorder conditions to fail fast and reduce redundant checks
                 for (uint32_t j = 0; j < n_kv; ++j) {
+                    // Fast path: check empty first (most common early exit)
                    if (cells.is_empty(j)) {
                         continue;
                     }
 
-                    // mask the token if not the same sequence
+                    // Check sequence match early (before position checks)
                     if (!cells.seq_has(j, seq_id)) {
                         continue;
                     }
 
                     const llama_pos p0 = cells.pos_get(j);
 
-                    // mask future tokens
-                    if (causal_attn && p0 > p1) {
-                        continue;
-                    }
-
-                    // M-RoPE causal mask
-                    if (causal_attn && is_2d && p0 == p1) {
-                        const auto & p0_ext = cells.ext_get(j);
-                        if (p0_ext.is_2d_gt(p1_x, p1_y)) {
-                            continue;
-                        }
-                    }
+                    // Causal attention check (common case)
+                    if (causal_attn) {
+                        // Fast path: future tokens
+                        if (p0 > p1) {
+                            continue;
+                        }
+
+                        // M-RoPE causal mask (less common, check after position comparison)
+                        if (is_2d && p0 == p1) {
+                            const auto & p0_ext = cells.ext_get(j);
+                            if (p0_ext.is_2d_gt(p1_x, p1_y)) {
+                                continue;
+                            }
+                        }
+                    }
 
-                    // apply SWA if any
+                    // Apply SWA if any (less common, check last)
                     if (is_masked_swa(p0, p1)) {
                         continue;
                     }
 
-                    data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                    // All checks passed - set the mask value
+                    dst_row[j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
                 }
             }
         }
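The reordered mask loop above applies the cheapest rejections first: empty cell, wrong sequence, then the causal and SWA checks. A condensed sketch of that per-cell decision, using simplified stand-in types rather than the actual llama.cpp cell structures:

#include <cstdint>
#include <cstdio>

struct cell_t {
    bool    empty;
    int     seq_id;
    int32_t pos;
};

// returns true if the KV cell must be masked out for a query token at position p1
static bool is_masked(const cell_t & c, int seq_id, int32_t p1, bool causal_attn) {
    if (c.empty)                   return true;  // fast path: nothing stored in this cell
    if (c.seq_id != seq_id)        return true;  // belongs to another sequence
    if (causal_attn && c.pos > p1) return true;  // future token under causal attention
    return false;                                // visible: mask value 0, or -|p0 - p1| with ALiBi
}

int main() {
    cell_t c = {false, 0, 5};
    printf("%d %d\n", is_masked(c, 0, 7, true), is_masked(c, 0, 3, true)); // prints 0 1
    return 0;
}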