From 8c8e37fe5e38de93c85124f6439ce83e53e83b63 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 24 Oct 2025 10:11:08 -0600 Subject: [PATCH 1/3] feat(eval-callback): Use -vb to set tensor print width and number of elements Branch: Mamba2SSD Signed-off-by: Gabe Goodhart --- examples/eval-callback/eval-callback.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index cefa39a57c886..da26dfbb316c1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -6,9 +6,17 @@ #include #include +#include <sstream> #include #include +// verbosity flag set via the params.verbosity CLI flag. This is used for two +// things: +// 1. If > 0, tensors are printed with 8 digits of precision instead of 4 +// 2. If > 1, all tensor values are printed instead of the pretty-printed +// partial output +static int verbosity = 0; + /** * This the arbitrary data which will be passed to each callback. * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. @@ -61,6 +69,10 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * } static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { + std::stringstream ss; + const int float_digits = verbosity > 0 ? 8 : 4; + ss << "%12." 
<< float_digits << "f"; + const auto float_fmt = ss.str(); GGML_ASSERT(n > 0); float sum = 0; for (int64_t i3 = 0; i3 < ne[3]; i3++) { @@ -93,7 +105,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne i0 = ne[0] - n; } const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - LOG("%12.4f", v); + LOG(float_fmt.c_str(), v); if (i0 < ne[0] - 1) LOG(", "); } LOG("],\n"); @@ -153,8 +165,9 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } if (!ggml_is_quantized(t->type)) { + const int print_width = verbosity > 1 ? INT_MAX : 3; uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); + ggml_print_tensor(data, t->type, t->ne, t->nb, print_width); } return true; @@ -192,6 +205,9 @@ int main(int argc, char ** argv) { common_init(); + // set verbosity for printing + verbosity = params.verbosity; + llama_backend_init(); llama_numa_init(params.numa); From 1379a5f144ee9bb26e41037c44fcd71a631f6c8d Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 24 Oct 2025 13:33:09 -0600 Subject: [PATCH 2/3] fix: Fix INT_MAX to use numeric_limits for better compiler compat Branch: Mamba2SSD Signed-off-by: Gabe Goodhart --- examples/eval-callback/eval-callback.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index da26dfbb316c1..5c58110384aa6 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -9,6 +9,7 @@ #include #include #include +#include <limits> // verbosity flag set via the params.verbosity CLI flag. This is used for two // things: @@ -165,7 +166,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } if (!ggml_is_quantized(t->type)) { - const int print_width = verbosity > 1 ? INT_MAX : 3; + const int print_width = verbosity > 1 ? 
std::numeric_limits<int>::max() : 3; uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); ggml_print_tensor(data, t->type, t->ne, t->nb, print_width); } From d9fdc3d63b664e0ce4cc0bbf2afc68c9f0afdb1f Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 5 Nov 2025 09:49:49 -0700 Subject: [PATCH 3/3] fix(eval-callback): Map --verbose (INT_MAX) back to -lv 1 for print width Branch: EvalCallbackVerbosity Signed-off-by: Gabe Goodhart --- examples/eval-callback/eval-callback.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 5c58110384aa6..627227ca93b16 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -166,7 +166,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } if (!ggml_is_quantized(t->type)) { - const int print_width = verbosity > 1 ? std::numeric_limits<int>::max() : 3; + // The `--verbose` flag will set verbosity to INT_MAX. We want that to + // be the equivalent of `-lv 1` since it will be the most common command + // used and full-width printing is extremely verbose. + const int print_width = (verbosity > 1 && verbosity < std::numeric_limits<int>::max()) ? std::numeric_limits<int>::max() : 3; uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); ggml_print_tensor(data, t->type, t->ne, t->nb, print_width); }