7 changes: 5 additions & 2 deletions common/sampling.cpp
@@ -325,9 +325,12 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
}

void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance

if (gsmpl) {
// Print grammar sampler performance if available
if (gsmpl->grmr != nullptr) {
llama_perf_sampler_print(gsmpl->grmr);
Contributor Author: Passed my local tests. I'll look into it.

}
// Print main sampling chain performance
llama_perf_sampler_print(gsmpl->chain);
}
if (ctx) {
36 changes: 19 additions & 17 deletions ggml/src/ggml-cpu/amx/mmq.cpp
@@ -2423,25 +2423,27 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
// pointer to work space, used convert A from float to quantized type
void * wdata = params->wdata;

//TODO: performance improvement: merge quant A
if (params->ith == 0) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
const size_t desired_wsize = M * row_size_A;
if (params->wsize < desired_wsize) {
GGML_ABORT("insufficient work space size");
}
// Parallelize quantization of matrix A to improve cache locality and utilize multiple threads
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
const size_t desired_wsize = M * row_size_A;
if (params->wsize < desired_wsize) {
GGML_ABORT("insufficient work space size");
}

// Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
// Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);

const float * A_data = static_cast<const float *>(src1->data);
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
}
});
}
const float * A_data = static_cast<const float *>(src1->data);
const int nth = params->nth;
const int ith = params->ith;

// Parallelize quantization across threads
for (int m = ith; m < M; m += nth) {
from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
}
});

ggml_barrier(params->threadpool);
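
The new code above replaces the single-thread (ith == 0) quantization of matrix A with a loop that strides rows by thread index. A minimal standalone sketch of that partitioning pattern follows; quantize_row, quantize_rows_strided, and the std::thread pool are illustrative stand-ins for from_float<vec_dot_type>, the ggml threadpool, and ggml_barrier, not the ggml API itself.

#include <cstddef>
#include <vector>
#include <thread>

// Illustrative stand-in for from_float<vec_dot_type>(); here it just copies the row.
static void quantize_row(const float * src, float * dst, int n_cols) {
    for (int k = 0; k < n_cols; ++k) {
        dst[k] = src[k];
    }
}

// Each thread handles rows ith, ith + nth, ith + 2*nth, ..., mirroring
// `for (int m = ith; m < M; m += nth)` in the hunk above.
static void quantize_rows_strided(const float * src, float * dst,
                                  int n_rows, int n_cols, int ith, int nth) {
    for (int m = ith; m < n_rows; m += nth) {
        quantize_row(src + (size_t) m * n_cols, dst + (size_t) m * n_cols, n_cols);
    }
}

int main() {
    const int M = 7, K = 4, nth = 3;
    std::vector<float> A((size_t) M * K, 1.0f);
    std::vector<float> W((size_t) M * K, 0.0f);

    std::vector<std::thread> pool;
    for (int ith = 0; ith < nth; ++ith) {
        pool.emplace_back(quantize_rows_strided, A.data(), W.data(), M, K, ith, nth);
    }
    for (auto & t : pool) {
        t.join(); // join plays the role of ggml_barrier(): nothing reads W before all writers finish
    }
    return 0;
}

The stride-by-nth split gives each thread roughly M/nth rows with no extra synchronization until the barrier.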

210 changes: 199 additions & 11 deletions ggml/src/ggml-cpu/ops.cpp
@@ -3534,14 +3534,47 @@ static void ggml_compute_forward_rms_norm_f32(

GGML_ASSERT(eps >= 0.0f);

// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);

// SIMD-optimized sum of squares
ggml_float sum = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
int64_t i00 = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
for (; i00 + 15 < ne00; i00 += 16) {
__m512 vx = _mm512_loadu_ps(x + i00);
sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vx, vx));
}
#elif defined(__AVX2__) && defined(__FMA__)
for (; i00 + 7 < ne00; i00 += 8) {
__m256 vx = _mm256_loadu_ps(x + i00);
__m256 vsq = _mm256_mul_ps(vx, vx);
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(vsq, 1),
_mm256_castps256_ps128(vsq));
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
sum += (ggml_float)_mm_cvtss_f32(val2);
}
#elif defined(__SSE2__)
for (; i00 + 3 < ne00; i00 += 4) {
__m128 vx = _mm_loadu_ps(x + i00);
__m128 vsq = _mm_mul_ps(vx, vx);
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
vsq = _mm_add_ps(vsq, _mm_movehl_ps(vsq, vsq));
vsq = _mm_add_ss(vsq, _mm_movehdup_ps(vsq));
#else
__m128 tmp = _mm_shuffle_ps(vsq, vsq, _MM_SHUFFLE(2, 3, 0, 1));
vsq = _mm_add_ps(vsq, tmp);
tmp = _mm_movehl_ps(tmp, vsq);
vsq = _mm_add_ss(vsq, tmp);
#endif
sum += (ggml_float)_mm_cvtss_f32(vsq);
}
#endif
// Scalar fallback for remaining elements
for (; i00 < ne00; i00++) {
@Djip007 (Contributor), Nov 10, 2025:
It is best to reduce only at the end.
But a modern compiler can do the same job with a little help: try this on https://godbolt.org/, for example with
GCC / "-march=znver4 -O3 -fopenmp"

#define VECT_SIZE 16

float rmse(float* v, int N) {
    float res_v[VECT_SIZE] = {0};
    int i = 0;
    for (; i<N/VECT_SIZE; ++i) {
#       pragma omp simd
        for (int k=0; k<VECT_SIZE; ++k) {
            res_v[k] += v[i*VECT_SIZE+k]*v[i*VECT_SIZE+k];
        }
    }
    // reduction;
    float res = 0;
    for (int k=0; k<VECT_SIZE; ++k) {
        res += res_v[k];
    }
    i *= VECT_SIZE;
    for (; i<N; ++i) {
        res += v[i]*v[i];
    }

    return res;
}

For me, in most cases intrinsics are only needed when you can't express the same thing in C, like:

__m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b)

or even simpler:

#define VECT_SIZE 16

float rmse(float* v, int N) {
    float res = 0;
#   pragma omp simd simdlen(VECT_SIZE) reduction(+:res)
    for (int i = 0; i<N; ++i) {
        res += v[i]*v[i];
    }
    return res;
}

sum += (ggml_float)(x[i00] * x[i00]);
}
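
For comparison with the intrinsics above, here is a minimal sketch of the reviewer's #pragma omp simd reduction suggestion applied to the same sum-of-squares loop. It assumes ggml_float is double (as in current ggml) and a build with -fopenmp or -fopenmp-simd.

#include <cstddef>

// Sum of squares over n floats; the compiler is asked to vectorize the
// reduction instead of using hand-written intrinsics.
// double stands in for ggml_float (assumed to be double here).
static double sum_of_squares(const float * x, size_t n) {
    double sum = 0.0;
#   pragma omp simd reduction(+:sum)
    for (size_t i = 0; i < n; ++i) {
        sum += (double) x[i] * (double) x[i];
    }
    return sum;
}

Note that both this and the intrinsics path change the accumulation order relative to the scalar loop, so results can differ in the last bits.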

@@ -3603,7 +3636,6 @@ static void ggml_compute_forward_rms_norm_back_f32(
float eps;
memcpy(&eps, dst->op_params, sizeof(float));

// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
Expand All @@ -3615,10 +3647,61 @@ static void ggml_compute_forward_rms_norm_back_f32(
const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
const float * x = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);

// SIMD-optimized sum of squares and dot product
ggml_float sum_xx = 0.0;
ggml_float sum_xdz = 0.0;

for (int64_t i00 = 0; i00 < ne00; i00++) {
int64_t i00 = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
for (; i00 + 15 < ne00; i00 += 16) {
__m512 vx = _mm512_loadu_ps(x + i00);
__m512 vdz = _mm512_loadu_ps(dz + i00);
sum_xx += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vx, vx));
sum_xdz += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vx, vdz));
}
#elif defined(__AVX2__) && defined(__FMA__)
for (; i00 + 7 < ne00; i00 += 8) {
__m256 vx = _mm256_loadu_ps(x + i00);
__m256 vdz = _mm256_loadu_ps(dz + i00);
__m256 vsq = _mm256_mul_ps(vx, vx);
__m256 vdot = _mm256_mul_ps(vx, vdz);
__m128 val2_sq = _mm_add_ps(_mm256_extractf128_ps(vsq, 1),
_mm256_castps256_ps128(vsq));
__m128 val2_dot = _mm_add_ps(_mm256_extractf128_ps(vdot, 1),
_mm256_castps256_ps128(vdot));
val2_sq = _mm_add_ps(val2_sq, _mm_movehl_ps(val2_sq, val2_sq));
val2_dot = _mm_add_ps(val2_dot, _mm_movehl_ps(val2_dot, val2_dot));
val2_sq = _mm_add_ss(val2_sq, _mm_movehdup_ps(val2_sq));
val2_dot = _mm_add_ss(val2_dot, _mm_movehdup_ps(val2_dot));
sum_xx += (ggml_float)_mm_cvtss_f32(val2_sq);
sum_xdz += (ggml_float)_mm_cvtss_f32(val2_dot);
}
#elif defined(__SSE2__)
for (; i00 + 3 < ne00; i00 += 4) {
__m128 vx = _mm_loadu_ps(x + i00);
__m128 vdz = _mm_loadu_ps(dz + i00);
__m128 vsq = _mm_mul_ps(vx, vx);
__m128 vdot = _mm_mul_ps(vx, vdz);
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
vsq = _mm_add_ps(vsq, _mm_movehl_ps(vsq, vsq));
vdot = _mm_add_ps(vdot, _mm_movehl_ps(vdot, vdot));
vsq = _mm_add_ss(vsq, _mm_movehdup_ps(vsq));
vdot = _mm_add_ss(vdot, _mm_movehdup_ps(vdot));
#else
__m128 tmp = _mm_shuffle_ps(vsq, vsq, _MM_SHUFFLE(2, 3, 0, 1));
vsq = _mm_add_ps(vsq, tmp);
tmp = _mm_movehl_ps(tmp, vsq);
vsq = _mm_add_ss(vsq, tmp);
tmp = _mm_shuffle_ps(vdot, vdot, _MM_SHUFFLE(2, 3, 0, 1));
vdot = _mm_add_ps(vdot, tmp);
tmp = _mm_movehl_ps(tmp, vdot);
vdot = _mm_add_ss(vdot, tmp);
#endif
sum_xx += (ggml_float)_mm_cvtss_f32(vsq);
sum_xdz += (ggml_float)_mm_cvtss_f32(vdot);
}
#endif
// Scalar fallback for remaining elements
for (; i00 < ne00; i00++) {
sum_xx += (ggml_float)(x[i00] * x[i00]);
sum_xdz += (ggml_float)(x[i00] * dz[i00]);
}
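
The AVX2 branches in these hunks repeat the same horizontal-add sequence (extract the upper 128-bit lane, add, then fold with movehl/movehdup). A small self-contained sketch of that pattern follows; hsum256 is an illustrative name, not an existing ggml helper, and it requires AVX2 at compile time.

#include <cstdio>
#include <immintrin.h>

// Horizontal sum of the 8 floats in a __m256, using the same shuffle
// sequence as the AVX2 branches above (illustrative helper, not part of ggml).
static inline float hsum256(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);
    __m128 hi = _mm256_extractf128_ps(v, 1);
    __m128 s  = _mm_add_ps(lo, hi);             // 8 -> 4 partial sums
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));     // 4 -> 2
    s = _mm_add_ss(s, _mm_movehdup_ps(s));      // 2 -> 1 (lane 0)
    return _mm_cvtss_f32(s);
}

int main() {
    const float x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    const __m256 v = _mm256_loadu_ps(x);
    std::printf("%f\n", hsum256(v));            // prints 36.000000
    return 0;
}
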
@@ -3775,8 +3858,6 @@ static void ggml_compute_forward_group_norm_f32(

GGML_TENSOR_UNARY_OP_LOCALS

// TODO: optimize

float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));

Expand All @@ -3797,8 +3878,39 @@ static void ggml_compute_forward_group_norm_f32(
for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);

// SIMD-optimized sum
ggml_float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
int64_t i00 = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
for (; i00 + 15 < ne00; i00 += 16) {
sumr += (ggml_float)_mm512_reduce_add_ps(_mm512_loadu_ps(x + i00));
}
#elif defined(__AVX2__) && defined(__FMA__)
for (; i00 + 7 < ne00; i00 += 8) {
__m256 vx = _mm256_loadu_ps(x + i00);
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(vx, 1),
_mm256_castps256_ps128(vx));
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
sumr += (ggml_float)_mm_cvtss_f32(val2);
}
#elif defined(__SSE2__)
for (; i00 + 3 < ne00; i00 += 4) {
__m128 vx = _mm_loadu_ps(x + i00);
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
vx = _mm_add_ps(vx, _mm_movehl_ps(vx, vx));
vx = _mm_add_ss(vx, _mm_movehdup_ps(vx));
#else
__m128 tmp = _mm_shuffle_ps(vx, vx, _MM_SHUFFLE(2, 3, 0, 1));
vx = _mm_add_ps(vx, tmp);
tmp = _mm_movehl_ps(tmp, vx);
vx = _mm_add_ss(vx, tmp);
#endif
sumr += (ggml_float)_mm_cvtss_f32(vx);
}
#endif
// Scalar fallback
for (; i00 < ne00; i00++) {
sumr += (ggml_float)x[i00];
}
sum += sumr;
@@ -3813,8 +3925,51 @@ static void ggml_compute_forward_group_norm_f32(

float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);

// SIMD-optimized sum of squares after subtracting mean
ggml_float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
int64_t i00 = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
const __m512 vmean512 = _mm512_set1_ps(mean);
for (; i00 + 15 < ne00; i00 += 16) {
__m512 vx = _mm512_loadu_ps(x + i00);
__m512 vdiff = _mm512_sub_ps(vx, vmean512);
_mm512_storeu_ps(y + i00, vdiff);
sumr += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vdiff, vdiff));
}
#elif defined(__AVX2__) && defined(__FMA__)
const __m256 vmean256 = _mm256_set1_ps(mean);
for (; i00 + 7 < ne00; i00 += 8) {
__m256 vx = _mm256_loadu_ps(x + i00);
__m256 vdiff = _mm256_sub_ps(vx, vmean256);
_mm256_storeu_ps(y + i00, vdiff);
__m256 vsq = _mm256_mul_ps(vdiff, vdiff);
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(vsq, 1),
_mm256_castps256_ps128(vsq));
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
sumr += (ggml_float)_mm_cvtss_f32(val2);
}
#elif defined(__SSE2__)
const __m128 vmean128 = _mm_set1_ps(mean);
for (; i00 + 3 < ne00; i00 += 4) {
__m128 vx = _mm_loadu_ps(x + i00);
__m128 vdiff = _mm_sub_ps(vx, vmean128);
_mm_storeu_ps(y + i00, vdiff);
__m128 vsq = _mm_mul_ps(vdiff, vdiff);
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
vsq = _mm_add_ps(vsq, _mm_movehl_ps(vsq, vsq));
vsq = _mm_add_ss(vsq, _mm_movehdup_ps(vsq));
#else
__m128 tmp = _mm_shuffle_ps(vsq, vsq, _MM_SHUFFLE(2, 3, 0, 1));
vsq = _mm_add_ps(vsq, tmp);
tmp = _mm_movehl_ps(tmp, vsq);
vsq = _mm_add_ss(vsq, tmp);
#endif
sumr += (ggml_float)_mm_cvtss_f32(vsq);
}
#endif
// Scalar fallback
for (; i00 < ne00; i00++) {
float v = x[i00] - mean;
y[i00] = v;
sumr += (ggml_float)(v * v);
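
The group-norm hunk above keeps the same overall scheme as before this change: a first pass accumulates the mean over a channel group, a second pass writes y = x - mean while accumulating the variance, and the result is then scaled by 1/sqrt(variance + eps). A scalar sketch of that scheme on a flat array follows; it is illustrative only, while the real code iterates over the group/channel layout and reads eps from op_params.

#include <cmath>
#include <cstddef>

// Sketch of the two accumulation passes plus an in-place scale, mirroring the
// structure of ggml_compute_forward_group_norm_f32 on a single contiguous group.
static void normalize_group(const float * x, float * y, size_t n, float eps) {
    double sum = 0.0;
    for (size_t i = 0; i < n; ++i) {
        sum += x[i];                            // pass 1: mean
    }
    const float mean = (float) (sum / (double) n);

    double sum2 = 0.0;
    for (size_t i = 0; i < n; ++i) {
        const float v = x[i] - mean;
        y[i] = v;                               // pass 2: centre and accumulate variance
        sum2 += (double) v * (double) v;
    }
    const float variance = (float) (sum2 / (double) n);

    const float scale = 1.0f / sqrtf(variance + eps);
    for (size_t i = 0; i < n; ++i) {
        y[i] *= scale;                          // pass 3: scale in place
    }
}
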
@@ -3875,14 +4030,47 @@ static void ggml_compute_forward_l2_norm_f32(

GGML_ASSERT(eps >= 0.0f);

// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);

// SIMD-optimized sum of squares
ggml_float sum = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
int64_t i00 = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
for (; i00 + 15 < ne00; i00 += 16) {
__m512 vx = _mm512_loadu_ps(x + i00);
sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(vx, vx));
}
#elif defined(__AVX2__) && defined(__FMA__)
for (; i00 + 7 < ne00; i00 += 8) {
__m256 vx = _mm256_loadu_ps(x + i00);
__m256 vsq = _mm256_mul_ps(vx, vx);
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(vsq, 1),
_mm256_castps256_ps128(vsq));
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
sum += (ggml_float)_mm_cvtss_f32(val2);
}
#elif defined(__SSE2__)
for (; i00 + 3 < ne00; i00 += 4) {
__m128 vx = _mm_loadu_ps(x + i00);
__m128 vsq = _mm_mul_ps(vx, vx);
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
vsq = _mm_add_ps(vsq, _mm_movehl_ps(vsq, vsq));
vsq = _mm_add_ss(vsq, _mm_movehdup_ps(vsq));
#else
__m128 tmp = _mm_shuffle_ps(vsq, vsq, _MM_SHUFFLE(2, 3, 0, 1));
vsq = _mm_add_ps(vsq, tmp);
tmp = _mm_movehl_ps(tmp, vsq);
vsq = _mm_add_ss(vsq, tmp);
#endif
sum += (ggml_float)_mm_cvtss_f32(vsq);
}
#endif
// Scalar fallback for remaining elements
for (; i00 < ne00; i00++) {
sum += (ggml_float)(x[i00] * x[i00]);
}
