From 8c8e37fe5e38de93c85124f6439ce83e53e83b63 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Fri, 24 Oct 2025 10:11:08 -0600
Subject: [PATCH 1/3] feat(eval-callback): Use -vb to set tensor print width
 and number of elements

Branch: Mamba2SSD

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 examples/eval-callback/eval-callback.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index cefa39a57c886..da26dfbb316c1 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -6,9 +6,17 @@
 
 #include <cstdio>
 #include <string>
+#include <sstream>
 #include <vector>
 #include <numeric>
 
+// verbosity flag set via the params.verbosity CLI flag. This is used for two
+// things:
+// 1. If > 0, tensors are printed with 8 digits of precision instead of 4
+// 2. If > 1, all tensor values are printed instead of the pretty-printed
+//      partial output
+static int verbosity = 0;
+
 /**
  * This the arbitrary data which will be passed to each callback.
  * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
@@ -61,6 +69,10 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t *
 }
 
 static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+    std::stringstream ss;
+    const int float_digits = verbosity > 0 ? 8 : 4;
+    ss << "%12." << float_digits << "f";
+    const auto float_fmt = ss.str();
     GGML_ASSERT(n > 0);
     float sum = 0;
     for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -93,7 +105,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                         i0 = ne[0] - n;
                     }
                     const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    LOG("%12.4f", v);
+                    LOG(float_fmt.c_str(), v);
                     if (i0 < ne[0] - 1) LOG(", ");
                 }
                 LOG("],\n");
@@ -153,8 +165,9 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
     }
 
     if (!ggml_is_quantized(t->type)) {
+        const int print_width = verbosity > 1 ? INT_MAX : 3;
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
-        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+        ggml_print_tensor(data, t->type, t->ne, t->nb, print_width);
     }
 
     return true;
@@ -192,6 +205,9 @@ int main(int argc, char ** argv) {
 
     common_init();
 
+    // set verbosity for printing
+    verbosity = params.verbosity;
+
     llama_backend_init();
     llama_numa_init(params.numa);
 

From 1379a5f144ee9bb26e41037c44fcd71a631f6c8d Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Fri, 24 Oct 2025 13:33:09 -0600
Subject: [PATCH 2/3] fix: Fix INT_MAX to use numeric_limits for better
 compiler compat

Branch: Mamba2SSD

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 examples/eval-callback/eval-callback.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index da26dfbb316c1..5c58110384aa6 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -9,6 +9,7 @@
 #include <sstream>
 #include <vector>
 #include <numeric>
+#include <limits>
 
 // verbosity flag set via the params.verbosity CLI flag. This is used for two
 // things:
@@ -165,7 +166,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
     }
 
     if (!ggml_is_quantized(t->type)) {
-        const int print_width = verbosity > 1 ? INT_MAX : 3;
+        const int print_width = verbosity > 1 ? std::numeric_limits<int>::max() : 3;
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
         ggml_print_tensor(data, t->type, t->ne, t->nb, print_width);
     }

From d9fdc3d63b664e0ce4cc0bbf2afc68c9f0afdb1f Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Wed, 5 Nov 2025 09:49:49 -0700
Subject: [PATCH 3/3] fix(eval-callback): Map --verbose (INT_MAX) back to -lv 1
 for print width

Branch: EvalCallbackVerbosity

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 examples/eval-callback/eval-callback.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 5c58110384aa6..627227ca93b16 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -166,7 +166,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
     }
 
     if (!ggml_is_quantized(t->type)) {
-        const int print_width = verbosity > 1 ? std::numeric_limits<int>::max() : 3;
+        // The `--verbose` flag will set verbosity to INT_MAX. We want that to
+        // be the equivalent of `-lv 1` since it will be the most common command
+        // used and full-width printing is extremely verbose.
+        const int print_width = (verbosity > 1 && verbosity < std::numeric_limits<int>::max()) ? std::numeric_limits<int>::max() : 3;
         uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
         ggml_print_tensor(data, t->type, t->ne, t->nb, print_width);
     }