@@ -3456,11 +3456,12 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
34563456 const uint16_t * qh = x [i ].qh ;
34573457
34583458 for (int ib = 0 ; ib < QK_K /32 ; ++ ib ) {
3459- const float dl = d * (2 * (qh [ib ] >> 12 ) + 1 );
3459+ const float dl = d * (2 * ((qh [ib ] >> 12 ) & 7 ) + 1 );
3460+ const float delta = qh [ib ] & 0x8000 ? - IQ1S_DELTA : IQ1S_DELTA ;
34603461 for (int l = 0 ; l < 4 ; ++ l ) {
34613462 const int8_t * grid = (const int8_t * )(iq1s_grid + (qs [l ] | (((qh [ib ] >> 3 * l ) & 7 ) << 8 )));
34623463 for (int j = 0 ; j < 8 ; ++ j ) {
3463- y [j ] = dl * grid [j ];
3464+ y [j ] = dl * ( grid [j ] + delta ) ;
34643465 }
34653466 y += 8 ;
34663467 }
@@ -9582,7 +9583,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
95829583 const uint8_t * qs = x [i ].qs ;
95839584 const uint16_t * qh = x [i ].qh ;
95849585
9585- int sumi1 = 0 , sumi2 = 0 ;
9586+ int sumi1 = 0 , sumi2 = 0 , sumi3 = 0 ;
95869587
95879588 for (int ib = 0 ; ib < QK_K /32 ; ib += 2 ) {
95889589
@@ -9601,26 +9602,32 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
96019602 const int32x4_t p1 = ggml_vdotq_s32 (ggml_vdotq_s32 (vdupq_n_s32 (0 ), q1b .val [0 ], q8b .val [0 ]), q1b .val [1 ], q8b .val [1 ]);
96029603 const int32x4_t p2 = ggml_vdotq_s32 (ggml_vdotq_s32 (vdupq_n_s32 (0 ), q1b .val [2 ], q8b .val [2 ]), q1b .val [3 ], q8b .val [3 ]);
96039604
9604- sumi1 += vaddvq_s32 (p1 ) * (2 * (qh [ib + 0 ] >> 12 ) + 1 );
9605- sumi2 += vaddvq_s32 (p2 ) * (2 * (qh [ib + 1 ] >> 12 ) + 1 );
9605+ const int ls1 = 2 * ((qh [ib + 0 ] >> 12 ) & 7 ) + 1 ;
9606+ const int ls2 = 2 * ((qh [ib + 1 ] >> 12 ) & 7 ) + 1 ;
9607+ sumi1 += vaddvq_s32 (p1 ) * ls1 ;
9608+ sumi2 += vaddvq_s32 (p2 ) * ls2 ;
9609+ sumi3 += (y [i ].bsums [2 * ib + 0 ] + y [i ].bsums [2 * ib + 1 ]) * ls1 * (qh [ib + 0 ] & 0x8000 ? -1 : 1 )
9610+ + (y [i ].bsums [2 * ib + 2 ] + y [i ].bsums [2 * ib + 3 ]) * ls2 * (qh [ib + 1 ] & 0x8000 ? -1 : 1 );
96069611
96079612 }
96089613
9609- sumf += y [i ].d * GGML_FP16_TO_FP32 (x [i ].d ) * (sumi1 + sumi2 );
9614+ sumf += y [i ].d * GGML_FP16_TO_FP32 (x [i ].d ) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3 );
96109615 }
96119616
96129617 * s = sumf ;
96139618
96149619#elif defined __AVX2__
96159620
96169621 __m256 accum = _mm256_setzero_ps ();
9622+ float accum1 = 0 ;
96179623 for (int i = 0 ; i < nb ; ++ i ) {
96189624
96199625 const int8_t * q8 = y [i ].qs ;
96209626 const uint8_t * qs = x [i ].qs ;
96219627 const uint16_t * qh = x [i ].qh ;
96229628
96239629 __m256i sumi = _mm256_setzero_si256 ();
9630+ int sumi1 = 0 ;
96249631 for (int ib = 0 ; ib < QK_K /32 ; ib += 2 ) {
96259632 const __m256i q1b_1 = _mm256_set_epi64x (iq1s_grid [qs [3 ] | ((qh [ib + 0 ] >> 1 ) & 0x700 )], iq1s_grid [qs [2 ] | ((qh [ib + 0 ] << 2 ) & 0x700 )],
96269633 iq1s_grid [qs [1 ] | ((qh [ib + 0 ] << 5 ) & 0x700 )], iq1s_grid [qs [0 ] | ((qh [ib + 0 ] << 8 ) & 0x700 )]);
@@ -9632,17 +9639,23 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
96329639
96339640 const __m256i dot1 = mul_add_epi8 (q1b_1 , q8b_1 );
96349641 const __m256i dot2 = mul_add_epi8 (q1b_2 , q8b_2 );
9635- const __m256i p1 = _mm256_madd_epi16 (dot1 , _mm256_set1_epi16 (2 * (qh [ib + 0 ] >> 12 ) + 1 ));
9636- const __m256i p2 = _mm256_madd_epi16 (dot2 , _mm256_set1_epi16 (2 * (qh [ib + 1 ] >> 12 ) + 1 ));
9642+ const int16_t ls1 = 2 * ((qh [ib + 0 ] >> 12 ) & 7 ) + 1 ;
9643+ const int16_t ls2 = 2 * ((qh [ib + 1 ] >> 12 ) & 7 ) + 1 ;
9644+ const __m256i p1 = _mm256_madd_epi16 (dot1 , _mm256_set1_epi16 (ls1 ));
9645+ const __m256i p2 = _mm256_madd_epi16 (dot2 , _mm256_set1_epi16 (ls2 ));
96379646
96389647 sumi = _mm256_add_epi32 (sumi , _mm256_add_epi32 (p1 , p2 ));
9648+ sumi1 += (y [i ].bsums [2 * ib + 0 ] + y [i ].bsums [2 * ib + 1 ]) * (qh [ib + 0 ] & 0x8000 ? -1 : 1 ) * ls1
9649+ + (y [i ].bsums [2 * ib + 2 ] + y [i ].bsums [2 * ib + 3 ]) * (qh [ib + 1 ] & 0x8000 ? -1 : 1 ) * ls2 ;
96399650 }
96409651
9641- accum = _mm256_fmadd_ps (_mm256_set1_ps (y [i ].d * GGML_FP16_TO_FP32 (x [i ].d )), _mm256_cvtepi32_ps (sumi ), accum );
9652+ const float d = y [i ].d * GGML_FP16_TO_FP32 (x [i ].d );
9653+ accum = _mm256_fmadd_ps (_mm256_set1_ps (d ), _mm256_cvtepi32_ps (sumi ), accum );
9654+ accum1 += d * sumi1 ;
96429655
96439656 }
96449657
9645- * s = hsum_float_8 (accum );
9658+ * s = hsum_float_8 (accum ) + IQ1S_DELTA * accum1 ;
96469659
96479660#else
96489661
@@ -9653,9 +9666,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
96539666 const uint8_t * qs = x [i ].qs ;
96549667 const uint16_t * qh = x [i ].qh ;
96559668
9656- int sumi = 0 ;
9669+ int sumi = 0 , sumi1 = 0 ;
96579670 for (int ib = 0 ; ib < QK_K /32 ; ++ ib ) {
9658- const int ls = 2 * (qh [ib ] >> 12 ) + 1 ;
9671+ const int ls = 2 * ((qh [ib ] >> 12 ) & 7 ) + 1 ;
9672+ const int delta = qh [ib ] & 0x8000 ? -1 : 1 ;
96599673 int lsum = 0 ;
96609674 for (int l = 0 ; l < 4 ; ++ l ) {
96619675 const int8_t * grid = (const int8_t * )(iq1s_grid + (qs [l ] | (((qh [ib ] >> 3 * l ) & 7 ) << 8 )));
@@ -9664,11 +9678,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
96649678 }
96659679 q8 += 8 ;
96669680 }
9667- sumi += ls * lsum ;
9681+ sumi += ls * lsum ;
9682+ sumi1 += ls * delta * (y [i ].bsums [2 * ib + 0 ] + y [i ].bsums [2 * ib + 1 ]);
96689683 qs += 4 ;
96699684 }
96709685
9671- sumf += GGML_FP16_TO_FP32 (x [i ].d ) * y [i ].d * sumi ;
9686+ sumf += GGML_FP16_TO_FP32 (x [i ].d ) * y [i ].d * ( sumi + IQ1S_DELTA * sumi1 ) ;
96729687 }
96739688
96749689 * s = sumf ;
@@ -11438,7 +11453,7 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
1143811453}
1143911454
1144011455static int iq1_find_best_neighbour2 (const uint16_t * restrict neighbours , const uint64_t * restrict grid ,
11441- const float * restrict xval , const float * restrict weight , float scale , int8_t * restrict L , int ngrid ) {
11456+ const float * restrict xval , const float * restrict weight , float scale , const float * restrict xg , int8_t * restrict L , int ngrid ) {
1144211457 int num_neighbors = neighbours [0 ];
1144311458 GGML_ASSERT (num_neighbors > 0 );
1144411459 float best_score = FLT_MAX ;
@@ -11447,7 +11462,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
1144711462 const int8_t * pg = (const int8_t * )(grid + neighbours [j ]);
1144811463 float d2 = 0 ;
1144911464 for (int i = 0 ; i < 8 ; ++ i ) {
11450- float q = (pg [i ] - 3 )/2 ;
11465+ float q = xg [ (pg [i ] - 1 )/2 ] ;
1145111466 float w = weight [i ];
1145211467 float diff = scale * q - xval [i ];
1145311468 d2 += w * diff * diff ;
@@ -11463,7 +11478,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
1146311478 float d2 = 0 ;
1146411479 for (int j = 0 ; j < 8 ; ++ j ) {
1146511480 float w = weight [j ];
11466- float q = (grid_i [j ] - 3 )/2 ;
11481+ float q = xg [ (grid_i [j ] - 1 )/2 ] ;
1146711482 float diff = scale * q - xval [i ];
1146811483 d2 += w * diff * diff ;
1146911484 }
@@ -11480,7 +11495,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
1148011495 const int8_t * pg = (const int8_t * )(grid + neighbours [j ]);
1148111496 float sumqx = 0 , sumq2 = 0 ;
1148211497 for (int i = 0 ; i < 8 ; ++ i ) {
11483- float q = (pg [i ] - 3 )/2 ;
11498+ float q = xg [ (pg [i ] - 1 )/2 ] ;
1148411499 float w = weight [i ];
1148511500 sumqx += w * q * xval [i ];
1148611501 sumq2 += w * q * q ;
@@ -11519,6 +11534,9 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
1151911534
1152011535 block_iq1_s * y = vy ;
1152111536
11537+ const float x_p [3 ] = {-1 + IQ1S_DELTA , IQ1S_DELTA , 1 + IQ1S_DELTA };
11538+ const float x_m [3 ] = {-1 - IQ1S_DELTA , - IQ1S_DELTA , 1 - IQ1S_DELTA };
11539+
1152211540 float scales [QK_K /IQ1S_BLOCK_SIZE ];
1152311541 float weight [IQ1S_BLOCK_SIZE ];
1152411542 int8_t L [IQ1S_BLOCK_SIZE ];
@@ -11527,6 +11545,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
1152711545 float pairs [2 * IQ1S_BLOCK_SIZE ];
1152811546 int * idx = (int * )(pairs + 1 );
1152911547 uint16_t index [IQ1S_BLOCK_SIZE /8 ];
11548+ int8_t shifts [QK_K /IQ1S_BLOCK_SIZE ];
1153011549
1153111550 for (int ibl = 0 ; ibl < nbl ; ++ ibl ) {
1153211551
@@ -11572,33 +11591,41 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
1157211591 }
1157311592 }
1157411593 float best_score = 0 , scale = max ;
11575- int besti1 = 0 , besti2 = 0 ;
11594+ int besti1 = -1 , besti2 = -1 , best_shift = 0 ;
1157611595 for (int i1 = 0 ; i1 <= IQ1S_BLOCK_SIZE ; ++ i1 ) {
1157711596 for (int i2 = i1 ; i2 <= IQ1S_BLOCK_SIZE ; ++ i2 ) {
11578- float sumqx = - (sumx [i1 ] - sumx [0 ]) + (sumx [IQ1S_BLOCK_SIZE ] - sumx [i2 ]);
11579- float sumq2 = (sumw [i1 ] - sumw [0 ]) + (sumw [IQ1S_BLOCK_SIZE ] - sumw [i2 ]);
11597+ float sumqx = (sumx [i1 ] - sumx [0 ])* x_p [0 ] + (sumx [i2 ] - sumx [i1 ])* x_p [1 ] + (sumx [IQ1S_BLOCK_SIZE ] - sumx [i2 ])* x_p [2 ];
11598+ float sumq2 = (sumw [i1 ] - sumw [0 ])* x_p [0 ]* x_p [0 ] + (sumw [i2 ] - sumw [i1 ])* x_p [1 ]* x_p [1 ] + (sumw [IQ1S_BLOCK_SIZE ] - sumw [i2 ])* x_p [2 ]* x_p [2 ];
11599+ if (sumq2 > 0 && sumqx * sumqx > best_score * sumq2 ) {
11600+ scale = sumqx /sumq2 ; best_score = scale * sumqx ;
11601+ besti1 = i1 ; besti2 = i2 ; best_shift = 1 ;
11602+ }
11603+ sumqx = (sumx [i1 ] - sumx [0 ])* x_m [0 ] + (sumx [i2 ] - sumx [i1 ])* x_m [1 ] + (sumx [IQ1S_BLOCK_SIZE ] - sumx [i2 ])* x_m [2 ];
11604+ sumq2 = (sumw [i1 ] - sumw [0 ])* x_m [0 ]* x_m [0 ] + (sumw [i2 ] - sumw [i1 ])* x_m [1 ]* x_m [1 ] + (sumw [IQ1S_BLOCK_SIZE ] - sumw [i2 ])* x_m [2 ]* x_m [2 ];
1158011605 if (sumq2 > 0 && sumqx * sumqx > best_score * sumq2 ) {
1158111606 scale = sumqx /sumq2 ; best_score = scale * sumqx ;
11582- besti1 = i1 ; besti2 = i2 ;
11607+ besti1 = i1 ; besti2 = i2 ; best_shift = -1 ;
1158311608 }
1158411609 }
1158511610 }
11611+ GGML_ASSERT (besti1 >= 0 && besti2 >= 0 && best_shift != 0 );
1158611612 for (int j = 0 ; j < besti1 ; ++ j ) L [idx [2 * j ]] = 0 ;
1158711613 for (int j = besti1 ; j < besti2 ; ++ j ) L [idx [2 * j ]] = 1 ;
1158811614 for (int j = besti2 ; j < IQ1S_BLOCK_SIZE ; ++ j ) L [idx [2 * j ]] = 2 ;
1158911615 if (scale < 0 ) {
1159011616 for (int j = 0 ; j < IQ1S_BLOCK_SIZE ; ++ j ) L [j ] = 2 - L [j ];
11591- scale = - scale ;
11617+ scale = - scale ; best_shift = - best_shift ;
1159211618 }
1159311619 bool all_on_grid = true;
11620+ const float * xx = best_shift == 1 ? x_p : x_m ;
1159411621 for (int k = 0 ; k < IQ1S_BLOCK_SIZE /8 ; ++ k ) {
1159511622 uint16_t u = 0 ;
1159611623 for (int j = 0 ; j < 8 ; ++ j ) u |= (L [8 * k + j ] << 2 * j );
1159711624 int grid_index = kmap_q2xs [u ];
1159811625 if (grid_index < 0 ) {
1159911626 all_on_grid = false;
1160011627 const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs [u ] - 1 ;
11601- grid_index = iq1_find_best_neighbour2 (neighbours , kgrid_q2xs , xb + 8 * k , weight + 8 * k , scale , L + 8 * k , NGRID_IQ1S );
11628+ grid_index = iq1_find_best_neighbour2 (neighbours , kgrid_q2xs , xb + 8 * k , weight + 8 * k , scale , xx , L + 8 * k , NGRID_IQ1S );
1160211629 GGML_ASSERT (grid_index >= 0 );
1160311630 }
1160411631 index [k ] = grid_index ;
@@ -11609,7 +11636,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
1160911636 const int8_t * pg = (const int8_t * )(kgrid_q2xs + index [k ]);
1161011637 for (int j = 0 ; j < 8 ; ++ j ) {
1161111638 float w = weight [8 * k + j ];
11612- float q = (pg [j ] - 3 )/2 ;
11639+ float q = xx [ (pg [j ] - 1 )/2 ] ;
1161311640 sumqx += w * q * xb [8 * k + j ];
1161411641 sumq2 += w * q * q ;
1161511642 }
@@ -11624,6 +11651,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
1162411651 y [ibl ].qh [ib ] = h ;
1162511652 GGML_ASSERT (scale >= 0 );
1162611653 scales [ib ] = scale ;
11654+ shifts [ib ] = best_shift ;
1162711655 max_scale = MAX (max_scale , scale );
1162811656 }
1162911657
@@ -11632,12 +11660,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
1163211660 continue ;
1163311661 }
1163411662
11635- float d = max_scale /31 ;
11663+ float d = max_scale /15 ;
1163611664 y [ibl ].d = GGML_FP32_TO_FP16 (d * 1.125f ); // 1.125f is another fudge factor. Don't ask me why it is needed.
1163711665 float id = 1 /d ;
1163811666 for (int ib = 0 ; ib < QK_K /IQ1S_BLOCK_SIZE ; ++ ib ) {
1163911667 int l = nearest_int (0.5f * (id * scales [ib ]- 1 ));
11640- l = MAX (0 , MIN (15 , l ));
11668+ l = MAX (0 , MIN (7 , l ));
11669+ if (shifts [ib ] == -1 ) l |= 8 ;
1164111670 y [ibl ].qh [ib ] |= (l << 12 );
1164211671 }
1164311672 }
0 commit comments