From 70f568ad17423290928bb0814a916f1ce16bce4c Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 19:31:23 +0800
Subject: [PATCH 01/11] common: introduce auto sampling params from metadata

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 common/arg.cpp    |  9 +++++++++
 common/common.cpp | 41 +++++++++++++++++++++++++++++++++++++++++
 common/common.h   | 13 +++++++++++++
 3 files changed, 63 insertions(+)

diff --git a/common/arg.cpp b/common/arg.cpp
index a570810281499..97d896ea314c0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1261,6 +1261,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.sampling.temp = std::stof(value);
             params.sampling.temp = std::max(params.sampling.temp, 0.0f);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_TEMP;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1268,6 +1269,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
         [](common_params & params, int value) {
             params.sampling.top_k = value;
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_TOP_K;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1275,6 +1277,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
         [](common_params & params, const std::string & value) {
             params.sampling.top_p = std::stof(value);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_TOP_P;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1282,6 +1285,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
         [](common_params & params, const std::string & value) {
             params.sampling.min_p = std::stof(value);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_MIN_P;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1321,6 +1325,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             params.sampling.penalty_last_n = value;
             params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_PENALTY_LAST_N;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1328,6 +1333,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
         [](common_params & params, const std::string & value) {
             params.sampling.penalty_repeat = std::stof(value);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_PENALTY_REPEAT;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1425,6 +1431,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
         [](common_params & params, int value) {
             params.sampling.mirostat = value;
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1432,6 +1439,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
         [](common_params & params, const std::string & value) {
             params.sampling.mirostat_eta = std::stof(value);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT_ETA;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1439,6 +1447,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
         [](common_params & params, const std::string & value) {
             params.sampling.mirostat_tau = std::stof(value);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT_TAU;
         }
     ).set_sparam());
     add_opt(common_arg(
diff --git a/common/common.cpp b/common/common.cpp
index a8d709ab1d050..51071fb1fc967 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -946,6 +946,45 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
 // Model utils
 //
 
+static inline void common_init_sampler_from_model(
+    const llama_model * model,
+    common_params_sampling & sparams) {
+    // copy "general.sampler.*" values from the model metadata into sparams, skipping fields whose sampling_mask bit is set
+    const uint16_t mask = sparams.sampling_mask;
+
+    auto get_int32 = [&](const char * key, int32_t & dst, uint16_t user_override) {
+        if (mask & user_override) return; // this field's bit is set in sampling_mask - keep the current value
+
+        char buf[64] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) { // dst is left untouched on a non-positive result
+            char * end = nullptr;
+            int32_t v = strtol(buf, &end, 10); // NOTE(review): strtol returns long; narrowed to int32_t without a range check
+            if (end && end != buf) dst = v; // accept the value only if strtol consumed at least one character
+        }
+    };
+
+    auto get_float = [&](const char * key, float & dst, uint16_t user_override) {
+        if (mask & user_override) return; // same rule as get_int32: a set bit keeps the current value
+
+        char buf[128] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) { // dst is left untouched on a non-positive result
+            char * end = nullptr;
+            float v = strtof(buf, &end);
+            if (end && end != buf) dst = v; // accept the value only if strtof consumed at least one character
+        }
+    };
+    // each call pairs a metadata key with the sparams field it fills and the mask bit that protects that field
+    get_int32("general.sampler.top_k",          sparams.top_k,          common_params_sampling::SAMPLING_MASK_BITS_TOP_K);
+    get_float("general.sampler.top_p",          sparams.top_p,          common_params_sampling::SAMPLING_MASK_BITS_TOP_P);
+    get_float("general.sampler.min_p",          sparams.min_p,          common_params_sampling::SAMPLING_MASK_BITS_MIN_P);
+    get_float("general.sampler.temp",           sparams.temp,           common_params_sampling::SAMPLING_MASK_BITS_TEMP);
+    get_int32("general.sampler.penalty_last_n", sparams.penalty_last_n, common_params_sampling::SAMPLING_MASK_BITS_PENALTY_LAST_N);
+    get_float("general.sampler.penalty_repeat", sparams.penalty_repeat, common_params_sampling::SAMPLING_MASK_BITS_PENALTY_REPEAT);
+    get_int32("general.sampler.mirostat",       sparams.mirostat,       common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT);
+    get_float("general.sampler.mirostat_tau",   sparams.mirostat_tau,   common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT_TAU);
+    get_float("general.sampler.mirostat_eta",   sparams.mirostat_eta,   common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT_ETA);
+}
+
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
@@ -957,6 +996,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    common_init_sampler_from_model(model, params.sampling);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto cparams = common_context_params_to_llama(params);
diff --git a/common/common.h b/common/common.h
index 8540725aaa476..c0d5d86127b81 100644
--- a/common/common.h
+++ b/common/common.h
@@ -165,6 +165,19 @@ struct common_params_sampling {
     bool    no_perf            = false; // disable performance metrics
     bool    timing_per_token   = false;
 
+    uint16_t sampling_mask = 0; // bitfield to track user-specified samplers
+    enum sampling_mask_bits : uint16_t { // one flag per sampling field; flags are OR-ed into sampling_mask
+        SAMPLING_MASK_BITS_TOP_K          = 1 << 0, // top_k
+        SAMPLING_MASK_BITS_TOP_P          = 1 << 1, // top_p
+        SAMPLING_MASK_BITS_MIN_P          = 1 << 2, // min_p
+        SAMPLING_MASK_BITS_TEMP           = 1 << 3, // temp
+        SAMPLING_MASK_BITS_PENALTY_LAST_N = 1 << 4, // penalty_last_n
+        SAMPLING_MASK_BITS_PENALTY_REPEAT = 1 << 5, // penalty_repeat
+        SAMPLING_MASK_BITS_MIROSTAT       = 1 << 6, // mirostat
+        SAMPLING_MASK_BITS_MIROSTAT_TAU   = 1 << 7, // mirostat_tau
+        SAMPLING_MASK_BITS_MIROSTAT_ETA   = 1 << 8, // mirostat_eta (highest bit in use; fits the 16-bit mask)
+    };
+
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
 
 

From 7de014e4e107ed8d429271cdfa96bc83bbda266a Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 20:10:25 +0800
Subject: [PATCH 02/11] gguf-py: introduce new kv for conversion scripts

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 gguf-py/gguf/constants.py   | 11 ++++++++++
 gguf-py/gguf/gguf_writer.py | 27 ++++++++++++++++++++++++
 gguf-py/gguf/metadata.py    | 41 +++++++++++++++++++++++++++++++++++++
 src/llama-arch.cpp          | 35 +++++++++++++++++++------------
 src/llama-arch.h            |  9 ++++++++
 5 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 6b4b6c5ab075d..b1b22581d9533 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -25,6 +25,17 @@ class General:
         ALIGNMENT                  = "general.alignment"
         FILE_TYPE                  = "general.file_type"
 
+        # Recommended Sampler Parameters
+        SAMPLER_TOP_K             = "general.sampler.top_k"
+        SAMPLER_TOP_P             = "general.sampler.top_p"
+        SAMPLER_MIN_P             = "general.sampler.min_p"
+        SAMPLER_TEMP              = "general.sampler.temp"
+        SAMPLER_PENALTY_LAST_N    = "general.sampler.penalty_last_n"
+        SAMPLER_PENALTY_REPEAT    = "general.sampler.penalty_repeat"
+        SAMPLER_MIROSTAT          = "general.sampler.mirostat"
+        SAMPLER_MIROSTAT_TAU      = "general.sampler.mirostat_tau"
+        SAMPLER_MIROSTAT_ETA      = "general.sampler.mirostat_eta"
+
         # Authorship Metadata
         NAME                       = "general.name"
         AUTHOR                     = "general.author"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a051daeeb1341..23567281d1503 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -496,6 +496,33 @@ def add_custom_alignment(self, alignment: int) -> None:
     def add_file_type(self, ftype: int) -> None:
         self.add_uint32(Keys.General.FILE_TYPE, ftype)
 
+    def add_sampler_top_k(self, top_k: int) -> None:
+        self.add_int32(Keys.General.SAMPLER_TOP_K, top_k)
+
+    def add_sampler_top_p(self, top_p: float) -> None:
+        self.add_float32(Keys.General.SAMPLER_TOP_P, top_p)
+
+    def add_sampler_min_p(self, min_p: float) -> None:
+        self.add_float32(Keys.General.SAMPLER_MIN_P, min_p)
+
+    def add_sampler_temp(self, temp: float) -> None:
+        self.add_float32(Keys.General.SAMPLER_TEMP, temp)
+
+    def add_sampler_penalty_last_n(self, penalty_last_n: int) -> None:
+        self.add_int32(Keys.General.SAMPLER_PENALTY_LAST_N, penalty_last_n)
+
+    def add_sampler_penalty_repeat(self, penalty_repeat: float) -> None:
+        self.add_float32(Keys.General.SAMPLER_PENALTY_REPEAT, penalty_repeat)
+
+    def add_sampler_mirostat(self, mirostat: int) -> None:
+        self.add_int32(Keys.General.SAMPLER_MIROSTAT, mirostat)
+
+    def add_sampler_mirostat_tau(self, mirostat_tau: float) -> None:
+        self.add_float32(Keys.General.SAMPLER_MIROSTAT_TAU, mirostat_tau)
+
+    def add_sampler_mirostat_eta(self, mirostat_eta: float) -> None:
+        self.add_float32(Keys.General.SAMPLER_MIROSTAT_ETA, mirostat_eta)
+
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)
 
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index 67efedbdbc564..249eddbc9c763 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -17,6 +17,17 @@
 
 @dataclass
 class Metadata:
+    # Recommended Sampler Parameters to be written to GGUF KV Store
+    sampler_top_k: Optional[int] = None
+    sampler_top_p: Optional[float] = None
+    sampler_min_p: Optional[float] = None
+    sampler_temp: Optional[float] = None
+    sampler_penalty_last_n: Optional[int] = None
+    sampler_penalty_repeat: Optional[float] = None
+    sampler_mirostat: Optional[int] = None
+    sampler_mirostat_tau: Optional[float] = None
+    sampler_mirostat_eta: Optional[float] = None
+
     # Authorship Metadata to be written to GGUF KV Store
     name: Optional[str] = None
     author: Optional[str] = None
@@ -63,6 +74,16 @@ def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Pat
         # This is based on LLM_KV_NAMES mapping in llama.cpp
         metadata_override = Metadata.load_metadata_override(metadata_override_path)
 
+        metadata.sampler_top_k          = metadata_override.get(Keys.General.SAMPLER_TOP_K,   metadata.sampler_top_k)
+        metadata.sampler_top_p          = metadata_override.get(Keys.General.SAMPLER_TOP_P,   metadata.sampler_top_p)
+        metadata.sampler_min_p          = metadata_override.get(Keys.General.SAMPLER_MIN_P,   metadata.sampler_min_p)
+        metadata.sampler_temp           = metadata_override.get(Keys.General.SAMPLER_TEMP,    metadata.sampler_temp)
+        metadata.sampler_penalty_last_n = metadata_override.get(Keys.General.SAMPLER_PENALTY_LAST_N, metadata.sampler_penalty_last_n)
+        metadata.sampler_penalty_repeat = metadata_override.get(Keys.General.SAMPLER_PENALTY_REPEAT,     metadata.sampler_penalty_repeat)
+        metadata.sampler_mirostat       = metadata_override.get(Keys.General.SAMPLER_MIROSTAT,         metadata.sampler_mirostat)
+        metadata.sampler_mirostat_tau   = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_TAU,    metadata.sampler_mirostat_tau)
+        metadata.sampler_mirostat_eta   = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_ETA,    metadata.sampler_mirostat_eta)
+
         metadata.name            = metadata_override.get(Keys.General.NAME,            metadata.name)
         metadata.author          = metadata_override.get(Keys.General.AUTHOR,          metadata.author)
         metadata.version         = metadata_override.get(Keys.General.VERSION,         metadata.version)
@@ -546,6 +567,26 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
 
     def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
         assert self.name is not None
+
+        if self.sampler_top_k is not None:
+            gguf_writer.add_sampler_top_k(self.sampler_top_k)
+        if self.sampler_top_p is not None:
+            gguf_writer.add_sampler_top_p(self.sampler_top_p)
+        if self.sampler_min_p is not None:
+            gguf_writer.add_sampler_min_p(self.sampler_min_p)
+        if self.sampler_temp is not None:
+            gguf_writer.add_sampler_temp(self.sampler_temp)
+        if self.sampler_penalty_last_n is not None:
+            gguf_writer.add_sampler_penalty_last_n(self.sampler_penalty_last_n)
+        if self.sampler_penalty_repeat is not None:
+            gguf_writer.add_sampler_penalty_repeat(self.sampler_penalty_repeat)
+        if self.sampler_mirostat is not None:
+            gguf_writer.add_sampler_mirostat(self.sampler_mirostat)
+        if self.sampler_mirostat_tau is not None:
+            gguf_writer.add_sampler_mirostat_tau(self.sampler_mirostat_tau)
+        if self.sampler_mirostat_eta is not None:
+            gguf_writer.add_sampler_mirostat_eta(self.sampler_mirostat_eta)
+
         gguf_writer.add_name(self.name)
 
         if self.author is not None:
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b7642b568dffb..2c30068712f74 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -112,19 +112,28 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_TYPE,                 "general.type"                          },
-    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"                  },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"          },
-    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"                     },
-    { LLM_KV_GENERAL_FILE_TYPE,            "general.file_type"                     },
-    { LLM_KV_GENERAL_NAME,                 "general.name"                          },
-    { LLM_KV_GENERAL_AUTHOR,               "general.author"                        },
-    { LLM_KV_GENERAL_VERSION,              "general.version"                       },
-    { LLM_KV_GENERAL_URL,                  "general.url"                           },
-    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"                   },
-    { LLM_KV_GENERAL_LICENSE,              "general.license"                       },
-    { LLM_KV_GENERAL_SOURCE_URL,           "general.source.url"                    },
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source.huggingface.repository" },
+    { LLM_KV_GENERAL_TYPE,                   "general.type"                          },
+    { LLM_KV_GENERAL_ARCHITECTURE,           "general.architecture"                  },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION,   "general.quantization_version"          },
+    { LLM_KV_GENERAL_ALIGNMENT,              "general.alignment"                     },
+    { LLM_KV_GENERAL_FILE_TYPE,              "general.file_type"                     },
+    { LLM_KV_GENERAL_SAMPLER_TOP_K,          "general.sampler.top_k"                 },
+    { LLM_KV_GENERAL_SAMPLER_TOP_P,          "general.sampler.top_p"                 },
+    { LLM_KV_GENERAL_SAMPLER_MIN_P,          "general.sampler.min_p"                 },
+    { LLM_KV_GENERAL_SAMPLER_TEMP,           "general.sampler.temp"                  }, // same key gguf-py writes (Keys.General.SAMPLER_TEMP)
+    { LLM_KV_GENERAL_SAMPLER_PENALTY_LAST_N, "general.sampler.penalty_last_n"        },
+    { LLM_KV_GENERAL_SAMPLER_PENALTY_REPEAT, "general.sampler.penalty_repeat"        },
+    { LLM_KV_GENERAL_SAMPLER_MIROSTAT,       "general.sampler.mirostat"              },
+    { LLM_KV_GENERAL_SAMPLER_MIROSTAT_TAU,   "general.sampler.mirostat_tau"          },
+    { LLM_KV_GENERAL_SAMPLER_MIROSTAT_ETA,   "general.sampler.mirostat_eta"          },
+    { LLM_KV_GENERAL_NAME,                   "general.name"                          },
+    { LLM_KV_GENERAL_AUTHOR,                 "general.author"                        },
+    { LLM_KV_GENERAL_VERSION,                "general.version"                       },
+    { LLM_KV_GENERAL_URL,                    "general.url"                           },
+    { LLM_KV_GENERAL_DESCRIPTION,            "general.description"                   },
+    { LLM_KV_GENERAL_LICENSE,                "general.license"                       },
+    { LLM_KV_GENERAL_SOURCE_URL,             "general.source.url"                    },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,         "general.source.huggingface.repository" },
 
     { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
     { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index a769dd1e85741..8ae8eed093e86 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -121,6 +121,15 @@ enum llm_kv {
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_FILE_TYPE,
+    LLM_KV_GENERAL_SAMPLER_TOP_K,
+    LLM_KV_GENERAL_SAMPLER_TOP_P,
+    LLM_KV_GENERAL_SAMPLER_MIN_P,
+    LLM_KV_GENERAL_SAMPLER_TEMP,
+    LLM_KV_GENERAL_SAMPLER_PENALTY_LAST_N,
+    LLM_KV_GENERAL_SAMPLER_PENALTY_REPEAT,
+    LLM_KV_GENERAL_SAMPLER_MIROSTAT,
+    LLM_KV_GENERAL_SAMPLER_MIROSTAT_TAU,
+    LLM_KV_GENERAL_SAMPLER_MIROSTAT_ETA,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,

From c41bb285831f1aa0bde2d92663a59c954ec21428 Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 20:39:24 +0800
Subject: [PATCH 03/11] gguf-py: fix formatting

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 gguf-py/gguf/metadata.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index 249eddbc9c763..b9c9697c8086c 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -74,15 +74,15 @@ def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Pat
         # This is based on LLM_KV_NAMES mapping in llama.cpp
         metadata_override = Metadata.load_metadata_override(metadata_override_path)
 
-        metadata.sampler_top_k          = metadata_override.get(Keys.General.SAMPLER_TOP_K,   metadata.sampler_top_k)
-        metadata.sampler_top_p          = metadata_override.get(Keys.General.SAMPLER_TOP_P,   metadata.sampler_top_p)
-        metadata.sampler_min_p          = metadata_override.get(Keys.General.SAMPLER_MIN_P,   metadata.sampler_min_p)
-        metadata.sampler_temp           = metadata_override.get(Keys.General.SAMPLER_TEMP,    metadata.sampler_temp)
+        metadata.sampler_top_k          = metadata_override.get(Keys.General.SAMPLER_TOP_K,          metadata.sampler_top_k)
+        metadata.sampler_top_p          = metadata_override.get(Keys.General.SAMPLER_TOP_P,          metadata.sampler_top_p)
+        metadata.sampler_min_p          = metadata_override.get(Keys.General.SAMPLER_MIN_P,          metadata.sampler_min_p)
+        metadata.sampler_temp           = metadata_override.get(Keys.General.SAMPLER_TEMP,           metadata.sampler_temp)
         metadata.sampler_penalty_last_n = metadata_override.get(Keys.General.SAMPLER_PENALTY_LAST_N, metadata.sampler_penalty_last_n)
-        metadata.sampler_penalty_repeat = metadata_override.get(Keys.General.SAMPLER_PENALTY_REPEAT,     metadata.sampler_penalty_repeat)
-        metadata.sampler_mirostat       = metadata_override.get(Keys.General.SAMPLER_MIROSTAT,         metadata.sampler_mirostat)
-        metadata.sampler_mirostat_tau   = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_TAU,    metadata.sampler_mirostat_tau)
-        metadata.sampler_mirostat_eta   = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_ETA,    metadata.sampler_mirostat_eta)
+        metadata.sampler_penalty_repeat = metadata_override.get(Keys.General.SAMPLER_PENALTY_REPEAT, metadata.sampler_penalty_repeat)
+        metadata.sampler_mirostat       = metadata_override.get(Keys.General.SAMPLER_MIROSTAT,       metadata.sampler_mirostat)
+        metadata.sampler_mirostat_tau   = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_TAU,   metadata.sampler_mirostat_tau)
+        metadata.sampler_mirostat_eta   = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_ETA,   metadata.sampler_mirostat_eta)
 
         metadata.name            = metadata_override.get(Keys.General.NAME,            metadata.name)
         metadata.author          = metadata_override.get(Keys.General.AUTHOR,          metadata.author)

From caa7a039f16da11f43dd9c486755a95a9de93de7 Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 20:40:29 +0800
Subject: [PATCH 04/11] gguf-py: fix more formatting issues

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 gguf-py/gguf/constants.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index b1b22581d9533..4e8e3a15763fe 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -26,15 +26,15 @@ class General:
         FILE_TYPE                  = "general.file_type"
 
         # Recommended Sampler Parameters
-        SAMPLER_TOP_K             = "general.sampler.top_k"
-        SAMPLER_TOP_P             = "general.sampler.top_p"
-        SAMPLER_MIN_P             = "general.sampler.min_p"
-        SAMPLER_TEMP              = "general.sampler.temp"
-        SAMPLER_PENALTY_LAST_N    = "general.sampler.penalty_last_n"
-        SAMPLER_PENALTY_REPEAT    = "general.sampler.penalty_repeat"
-        SAMPLER_MIROSTAT          = "general.sampler.mirostat"
-        SAMPLER_MIROSTAT_TAU      = "general.sampler.mirostat_tau"
-        SAMPLER_MIROSTAT_ETA      = "general.sampler.mirostat_eta"
+        SAMPLER_TOP_K              = "general.sampler.top_k"
+        SAMPLER_TOP_P              = "general.sampler.top_p"
+        SAMPLER_MIN_P              = "general.sampler.min_p"
+        SAMPLER_TEMP               = "general.sampler.temp"
+        SAMPLER_PENALTY_LAST_N     = "general.sampler.penalty_last_n"
+        SAMPLER_PENALTY_REPEAT     = "general.sampler.penalty_repeat"
+        SAMPLER_MIROSTAT           = "general.sampler.mirostat"
+        SAMPLER_MIROSTAT_TAU       = "general.sampler.mirostat_tau"
+        SAMPLER_MIROSTAT_ETA       = "general.sampler.mirostat_eta"
 
         # Authorship Metadata
         NAME                       = "general.name"

From 44addcebd90673c33c10e66210f87a1ca7e3f2a4 Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 22:21:18 +0800
Subject: [PATCH 05/11] gguf-py: introduce support for reading from
 generation_config.json

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 gguf-py/gguf/metadata.py       |  48 +++++++++++++++
 gguf-py/tests/test_metadata.py | 103 +++++++++++++++++++++++++++++++++
 2 files changed, 151 insertions(+)

diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index b9c9697c8086c..72ca4b93b78df 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -65,11 +65,42 @@ def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Pat
 
         model_card = Metadata.load_model_card(model_path)
         hf_params = Metadata.load_hf_parameters(model_path)
+        gen_config = Metadata.load_generation_config(model_path)
         # TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
 
         # heuristics
         metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
 
+        if gen_config:
+            # Standard generation_config.json parameters
+            if metadata.sampler_top_k is None and "top_k" in gen_config:
+                metadata.sampler_top_k = int(gen_config["top_k"])
+
+            if metadata.sampler_top_p is None and "top_p" in gen_config:
+                metadata.sampler_top_p = float(gen_config["top_p"])
+
+            if metadata.sampler_min_p is None and "min_p" in gen_config:
+                metadata.sampler_min_p = float(gen_config["min_p"])
+
+            if metadata.sampler_temp is None and "temperature" in gen_config:
+                metadata.sampler_temp = float(gen_config["temperature"])
+
+            # Non-standard generation_config.json parameters
+            if metadata.sampler_penalty_last_n is None and "penalty_last_n" in gen_config:
+                metadata.sampler_penalty_last_n = int(gen_config["penalty_last_n"])
+
+            if metadata.sampler_penalty_repeat is None and "penalty_repeat" in gen_config:
+                metadata.sampler_penalty_repeat = float(gen_config["penalty_repeat"])
+
+            if metadata.sampler_mirostat is None and "mirostat" in gen_config:
+                metadata.sampler_mirostat = int(gen_config["mirostat"])
+
+            if metadata.sampler_mirostat_tau is None and "mirostat_tau" in gen_config:
+                metadata.sampler_mirostat_tau = float(gen_config["mirostat_tau"])
+
+            if metadata.sampler_mirostat_eta is None and "mirostat_eta" in gen_config:
+                metadata.sampler_mirostat_eta = float(gen_config["mirostat_eta"])
+
         # Metadata Override File Provided
         # This is based on LLM_KV_NAMES mapping in llama.cpp
         metadata_override = Metadata.load_metadata_override(metadata_override_path)
@@ -193,6 +224,23 @@ def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
         with open(config_path, "r", encoding="utf-8") as f:
             return json.load(f)
 
+    @staticmethod
+    def load_generation_config(model_path: Optional[Path] = None) -> dict[str, Any]:
+        if model_path is None or not model_path.is_dir():
+            return {}
+        # generation_config.json is optional: a missing, unreadable or malformed file yields an empty dict
+        generation_config_path = model_path / "generation_config.json"
+
+        if not generation_config_path.is_file():
+            return {}
+
+        try:
+            with open(generation_config_path, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except (json.JSONDecodeError, IOError):
+            # not all models have valid generation_config.json
+            return {}
+
     @staticmethod
     def id_to_title(string):
         # Convert capitalization into title form unless acronym or version number
diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py
index 40d484f4eaa9d..93e14807cbb23 100755
--- a/gguf-py/tests/test_metadata.py
+++ b/gguf-py/tests/test_metadata.py
@@ -233,6 +233,109 @@ def test_apply_metadata_heuristic_from_model_dir(self):
         expect = gguf.Metadata(name='Hermes 2 Pro Llama 3 8b DPO', finetune='DPO', basename='hermes-2-pro-llama-3', size_label='8B')
         self.assertEqual(got, expect)
 
+    def test_load_generation_config(self):
+        import tempfile
+        import json
+
+        # Test with a valid generation_config.json
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmpdir_path = Path(tmpdir)
+            gen_config_path = tmpdir_path / "generation_config.json"
+
+            # Create a sample generation_config.json
+            gen_config_data = {
+                "temperature": 0.7,
+                "top_k": 50,
+                "top_p": 0.95,
+                "repetition_penalty": 1.1,
+                "do_sample": True,
+                "max_length": 2048
+            }
+
+            with open(gen_config_path, "w") as f:
+                json.dump(gen_config_data, f)
+
+            # Test loading the file
+            result = gguf.Metadata.load_generation_config(tmpdir_path)
+            self.assertEqual(result, gen_config_data)
+
+        # Test with missing file
+        with tempfile.TemporaryDirectory() as tmpdir:
+            result = gguf.Metadata.load_generation_config(Path(tmpdir))
+            self.assertEqual(result, {})
+
+        # Test with None path
+        result = gguf.Metadata.load_generation_config(None)
+        self.assertEqual(result, {})
+
+    def test_metadata_load_with_generation_config(self):
+        import tempfile
+        import json
+
+        # Test that generation_config values are properly loaded into metadata
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmpdir_path = Path(tmpdir)
+            gen_config_path = tmpdir_path / "generation_config.json"
+
+            # Create a sample generation_config.json with sampling parameters
+            gen_config_data = {
+                "temperature": 0.8,
+                "top_k": 40,
+                "top_p": 0.9,
+                "min_p": 0.05,
+                "repetition_penalty": 1.15,
+            }
+
+            with open(gen_config_path, "w") as f:
+                json.dump(gen_config_data, f)
+
+            # Load metadata with generation config
+            metadata = gguf.Metadata.load(model_path=tmpdir_path)
+
+            # Verify sampling parameters were loaded
+            self.assertEqual(metadata.sampler_temp, 0.8)
+            self.assertEqual(metadata.sampler_top_k, 40)
+            self.assertEqual(metadata.sampler_top_p, 0.9)
+            self.assertEqual(metadata.sampler_min_p, 0.05)
+            self.assertEqual(metadata.sampler_penalty_repeat, 1.15)
+
+    def test_metadata_override_precedence(self):
+        import tempfile
+        import json
+
+        # Test that metadata_override takes precedence over generation_config
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmpdir_path = Path(tmpdir)
+            gen_config_path = tmpdir_path / "generation_config.json"
+            metadata_override_path = tmpdir_path / "metadata.json"
+
+            # Create generation_config.json
+            gen_config_data = {
+                "temperature": 0.7,
+                "top_k": 50,
+            }
+            with open(gen_config_path, "w") as f:
+                json.dump(gen_config_data, f)
+
+            # Create metadata.json that overrides temperature
+            metadata_override_data = {
+                "general.sampler.temp": 0.5,
+            }
+            with open(metadata_override_path, "w") as f:
+                json.dump(metadata_override_data, f)
+
+            # Load metadata with both files present
+            metadata = gguf.Metadata.load(
+                metadata_override_path=metadata_override_path,
+                model_path=tmpdir_path
+            )
+
+            # Verify that metadata_override takes precedence for temperature
+            self.assertEqual(metadata.sampler_temp, 0.5)
+            # Verify that generation_config value is used for top_k
+            self.assertEqual(metadata.sampler_top_k, 50)
+
 
 if __name__ == "__main__":
     unittest.main()
+

From 0f8d637cbd3d8834905a97e4bc30d14200bfbe08 Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 22:33:18 +0800
Subject: [PATCH 06/11] gguf-py: simplified gen_config loading

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 gguf-py/gguf/metadata.py | 37 +++++++++----------------------------
 1 file changed, 9 insertions(+), 28 deletions(-)

diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index 72ca4b93b78df..ca6a710065960 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -72,34 +72,15 @@ def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Pat
         metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
 
         if gen_config:
-            # Standard generation_config.json parameters
-            if metadata.sampler_top_k is None and "top_k" in gen_config:
-                metadata.sampler_top_k = int(gen_config["top_k"])
-
-            if metadata.sampler_top_p is None and "top_p" in gen_config:
-                metadata.sampler_top_p = float(gen_config["top_p"])
-
-            if metadata.sampler_min_p is None and "min_p" in gen_config:
-                metadata.sampler_min_p = float(gen_config["min_p"])
-
-            if metadata.sampler_temp is None and "temperature" in gen_config:
-                metadata.sampler_temp = float(gen_config["temperature"])
-
-            # Non-standard generation_config.json parameters
-            if metadata.sampler_penalty_last_n is None and "penalty_last_n" in gen_config:
-                metadata.sampler_penalty_last_n = int(gen_config["penalty_last_n"])
-
-            if metadata.sampler_penalty_repeat is None and "penalty_repeat" in gen_config:
-                metadata.sampler_penalty_repeat = float(gen_config["penalty_repeat"])
-
-            if metadata.sampler_mirostat is None and "mirostat" in gen_config:
-                metadata.sampler_mirostat = int(gen_config["mirostat"])
-
-            if metadata.sampler_mirostat_tau is None and "mirostat_tau" in gen_config:
-                metadata.sampler_mirostat_tau = float(gen_config["mirostat_tau"])
-
-            if metadata.sampler_mirostat_eta is None and "mirostat_eta" in gen_config:
-                metadata.sampler_mirostat_eta = float(gen_config["mirostat_eta"])
+            metadata.sampler_top_k          = gen_config.get("top_k", metadata.sampler_top_k)
+            metadata.sampler_top_p          = gen_config.get("top_p", metadata.sampler_top_p)
+            metadata.sampler_min_p          = gen_config.get("min_p", metadata.sampler_min_p)
+            metadata.sampler_temp           = gen_config.get("temperature", metadata.sampler_temp)
+            metadata.sampler_penalty_last_n = gen_config.get("penalty_last_n", metadata.sampler_penalty_last_n)
+            metadata.sampler_penalty_repeat = gen_config.get("penalty_repeat", metadata.sampler_penalty_repeat)
+            metadata.sampler_mirostat       = gen_config.get("mirostat", metadata.sampler_mirostat)
+            metadata.sampler_mirostat_tau   = gen_config.get("mirostat_tau", metadata.sampler_mirostat_tau)
+            metadata.sampler_mirostat_eta   = gen_config.get("mirostat_eta", metadata.sampler_mirostat_eta)
 
         # Metadata Override File Provided
         # This is based on LLM_KV_NAMES mapping in llama.cpp

From 6cf39000a945415385186d789b837ebda122d12c Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 22:42:04 +0800
Subject: [PATCH 07/11] llama: add support for xtc sampler

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 common/arg.cpp              |  2 ++
 common/common.cpp           | 20 ++++++++--------
 common/common.h             | 20 ++++++++--------
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  6 +++++
 gguf-py/gguf/metadata.py    | 46 ++++++++++++++++++++++---------------
 src/llama-arch.cpp          | 18 ++++++++-------
 src/llama-arch.h            |  2 ++
 8 files changed, 72 insertions(+), 44 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 97d896ea314c0..9ce9df59741b3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1300,6 +1300,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
         [](common_params & params, const std::string & value) {
             params.sampling.xtc_probability = std::stof(value);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_XTC_PROBABILITY;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1307,6 +1308,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
         [](common_params & params, const std::string & value) {
             params.sampling.xtc_threshold = std::stof(value);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_XTC_THRESHOLD;
         }
     ).set_sparam());
     add_opt(common_arg(
diff --git a/common/common.cpp b/common/common.cpp
index 51071fb1fc967..0f3a1a6c5461e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -974,15 +974,17 @@ static inline void common_init_sampler_from_model(
         }
     };
 
-    get_int32("general.sampler.top_k",          sparams.top_k,          common_params_sampling::SAMPLING_MASK_BITS_TOP_K);
-    get_float("general.sampler.top_p",          sparams.top_p,          common_params_sampling::SAMPLING_MASK_BITS_TOP_P);
-    get_float("general.sampler.min_p",          sparams.min_p,          common_params_sampling::SAMPLING_MASK_BITS_MIN_P);
-    get_float("general.sampler.temp",           sparams.temp,           common_params_sampling::SAMPLING_MASK_BITS_TEMP);
-    get_int32("general.sampler.penalty_last_n", sparams.penalty_last_n, common_params_sampling::SAMPLING_MASK_BITS_PENALTY_LAST_N);
-    get_float("general.sampler.penalty_repeat", sparams.penalty_repeat, common_params_sampling::SAMPLING_MASK_BITS_PENALTY_REPEAT);
-    get_int32("general.sampler.mirostat",       sparams.mirostat,       common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT);
-    get_float("general.sampler.mirostat_tau",   sparams.mirostat_tau,   common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT_TAU);
-    get_float("general.sampler.mirostat_eta",   sparams.mirostat_eta,   common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT_ETA);
+    get_int32("general.sampler.top_k",           sparams.top_k,           common_params_sampling::SAMPLING_MASK_BITS_TOP_K);
+    get_float("general.sampler.top_p",           sparams.top_p,           common_params_sampling::SAMPLING_MASK_BITS_TOP_P);
+    get_float("general.sampler.min_p",           sparams.min_p,           common_params_sampling::SAMPLING_MASK_BITS_MIN_P);
+    get_float("general.sampler.xtc_probability", sparams.xtc_probability, common_params_sampling::SAMPLING_MASK_BITS_XTC_PROBABILITY);
+    get_float("general.sampler.xtc_threshold",   sparams.xtc_threshold,   common_params_sampling::SAMPLING_MASK_BITS_XTC_THRESHOLD);
+    get_float("general.sampler.temp",            sparams.temp,            common_params_sampling::SAMPLING_MASK_BITS_TEMP);
+    get_int32("general.sampler.penalty_last_n",  sparams.penalty_last_n,  common_params_sampling::SAMPLING_MASK_BITS_PENALTY_LAST_N);
+    get_float("general.sampler.penalty_repeat",  sparams.penalty_repeat,  common_params_sampling::SAMPLING_MASK_BITS_PENALTY_REPEAT);
+    get_int32("general.sampler.mirostat",        sparams.mirostat,        common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT);
+    get_float("general.sampler.mirostat_tau",    sparams.mirostat_tau,    common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT_TAU);
+    get_float("general.sampler.mirostat_eta",    sparams.mirostat_eta,    common_params_sampling::SAMPLING_MASK_BITS_MIROSTAT_ETA);
 }
 
 struct common_init_result common_init_from_params(common_params & params) {
diff --git a/common/common.h b/common/common.h
index c0d5d86127b81..5081361394ddd 100644
--- a/common/common.h
+++ b/common/common.h
@@ -167,15 +167,17 @@ struct common_params_sampling {
 
     uint16_t sampling_mask = 0; // bitfield to track user-specified samplers
     enum sampling_mask_bits : uint16_t {
-        SAMPLING_MASK_BITS_TOP_K          = 1 << 0,
-        SAMPLING_MASK_BITS_TOP_P          = 1 << 1,
-        SAMPLING_MASK_BITS_MIN_P          = 1 << 2,
-        SAMPLING_MASK_BITS_TEMP           = 1 << 3,
-        SAMPLING_MASK_BITS_PENALTY_LAST_N = 1 << 4,
-        SAMPLING_MASK_BITS_PENALTY_REPEAT = 1 << 5,
-        SAMPLING_MASK_BITS_MIROSTAT       = 1 << 6,
-        SAMPLING_MASK_BITS_MIROSTAT_TAU   = 1 << 7,
-        SAMPLING_MASK_BITS_MIROSTAT_ETA   = 1 << 8,
+        SAMPLING_MASK_BITS_TOP_K           = 1 << 0,
+        SAMPLING_MASK_BITS_TOP_P           = 1 << 1,
+        SAMPLING_MASK_BITS_MIN_P           = 1 << 2,
+        SAMPLING_MASK_BITS_XTC_PROBABILITY = 1 << 3,
+        SAMPLING_MASK_BITS_XTC_THRESHOLD   = 1 << 4,
+        SAMPLING_MASK_BITS_TEMP            = 1 << 5,
+        SAMPLING_MASK_BITS_PENALTY_LAST_N  = 1 << 6,
+        SAMPLING_MASK_BITS_PENALTY_REPEAT  = 1 << 7,
+        SAMPLING_MASK_BITS_MIROSTAT        = 1 << 8,
+        SAMPLING_MASK_BITS_MIROSTAT_TAU    = 1 << 9,
+        SAMPLING_MASK_BITS_MIROSTAT_ETA    = 1 << 10,
     };
 
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 4e8e3a15763fe..9af3c20b9124e 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -29,6 +29,8 @@ class General:
         SAMPLER_TOP_K              = "general.sampler.top_k"
         SAMPLER_TOP_P              = "general.sampler.top_p"
         SAMPLER_MIN_P              = "general.sampler.min_p"
+        SAMPLER_XTC_PROBABILITY    = "general.sampler.xtc_probability"
+        SAMPLER_XTC_THRESHOLD      = "general.sampler.xtc_threshold"
         SAMPLER_TEMP               = "general.sampler.temp"
         SAMPLER_PENALTY_LAST_N     = "general.sampler.penalty_last_n"
         SAMPLER_PENALTY_REPEAT     = "general.sampler.penalty_repeat"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 23567281d1503..22ba27ed6130c 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -505,6 +505,12 @@ def add_sampler_top_p(self, top_p: float) -> None:
     def add_sampler_min_p(self, min_p: float) -> None:
         self.add_float32(Keys.General.SAMPLER_MIN_P, min_p)
 
+    def add_sampler_xtc_probability(self, xtc_probability: float) -> None:
+        self.add_float32(Keys.General.SAMPLER_XTC_PROBABILITY, xtc_probability)
+
+    def add_sampler_xtc_threshold(self, xtc_threshold: float) -> None:
+        self.add_float32(Keys.General.SAMPLER_XTC_THRESHOLD, xtc_threshold)
+
     def add_sampler_temp(self, temp: float) -> None:
         self.add_float32(Keys.General.SAMPLER_TEMP, temp)
 
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index ca6a710065960..f7ed17bababa7 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -21,6 +21,8 @@ class Metadata:
     sampler_top_k: Optional[int] = None
     sampler_top_p: Optional[float] = None
     sampler_min_p: Optional[float] = None
+    sampler_xtc_probability: Optional[float] = None
+    sampler_xtc_threshold: Optional[float] = None
     sampler_temp: Optional[float] = None
     sampler_penalty_last_n: Optional[int] = None
     sampler_penalty_repeat: Optional[float] = None
@@ -72,29 +74,33 @@ def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Pat
         metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
 
         if gen_config:
-            metadata.sampler_top_k          = gen_config.get("top_k", metadata.sampler_top_k)
-            metadata.sampler_top_p          = gen_config.get("top_p", metadata.sampler_top_p)
-            metadata.sampler_min_p          = gen_config.get("min_p", metadata.sampler_min_p)
-            metadata.sampler_temp           = gen_config.get("temperature", metadata.sampler_temp)
-            metadata.sampler_penalty_last_n = gen_config.get("penalty_last_n", metadata.sampler_penalty_last_n)
-            metadata.sampler_penalty_repeat = gen_config.get("penalty_repeat", metadata.sampler_penalty_repeat)
-            metadata.sampler_mirostat       = gen_config.get("mirostat", metadata.sampler_mirostat)
-            metadata.sampler_mirostat_tau   = gen_config.get("mirostat_tau", metadata.sampler_mirostat_tau)
-            metadata.sampler_mirostat_eta   = gen_config.get("mirostat_eta", metadata.sampler_mirostat_eta)
+            metadata.sampler_top_k           = gen_config.get("top_k", metadata.sampler_top_k)
+            metadata.sampler_top_p           = gen_config.get("top_p", metadata.sampler_top_p)
+            metadata.sampler_min_p           = gen_config.get("min_p", metadata.sampler_min_p)
+            metadata.sampler_xtc_probability = gen_config.get("xtc_probability", metadata.sampler_xtc_probability)
+            metadata.sampler_xtc_threshold   = gen_config.get("xtc_threshold", metadata.sampler_xtc_threshold)
+            metadata.sampler_temp            = gen_config.get("temperature", metadata.sampler_temp)
+            metadata.sampler_penalty_last_n  = gen_config.get("penalty_last_n", metadata.sampler_penalty_last_n)
+            metadata.sampler_penalty_repeat  = gen_config.get("penalty_repeat", metadata.sampler_penalty_repeat)
+            metadata.sampler_mirostat        = gen_config.get("mirostat", metadata.sampler_mirostat)
+            metadata.sampler_mirostat_tau    = gen_config.get("mirostat_tau", metadata.sampler_mirostat_tau)
+            metadata.sampler_mirostat_eta    = gen_config.get("mirostat_eta", metadata.sampler_mirostat_eta)
 
         # Metadata Override File Provided
         # This is based on LLM_KV_NAMES mapping in llama.cpp
         metadata_override = Metadata.load_metadata_override(metadata_override_path)
 
-        metadata.sampler_top_k          = metadata_override.get(Keys.General.SAMPLER_TOP_K,          metadata.sampler_top_k)
-        metadata.sampler_top_p          = metadata_override.get(Keys.General.SAMPLER_TOP_P,          metadata.sampler_top_p)
-        metadata.sampler_min_p          = metadata_override.get(Keys.General.SAMPLER_MIN_P,          metadata.sampler_min_p)
-        metadata.sampler_temp           = metadata_override.get(Keys.General.SAMPLER_TEMP,           metadata.sampler_temp)
-        metadata.sampler_penalty_last_n = metadata_override.get(Keys.General.SAMPLER_PENALTY_LAST_N, metadata.sampler_penalty_last_n)
-        metadata.sampler_penalty_repeat = metadata_override.get(Keys.General.SAMPLER_PENALTY_REPEAT, metadata.sampler_penalty_repeat)
-        metadata.sampler_mirostat       = metadata_override.get(Keys.General.SAMPLER_MIROSTAT,       metadata.sampler_mirostat)
-        metadata.sampler_mirostat_tau   = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_TAU,   metadata.sampler_mirostat_tau)
-        metadata.sampler_mirostat_eta   = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_ETA,   metadata.sampler_mirostat_eta)
+        metadata.sampler_top_k           = metadata_override.get(Keys.General.SAMPLER_TOP_K,           metadata.sampler_top_k)
+        metadata.sampler_top_p           = metadata_override.get(Keys.General.SAMPLER_TOP_P,           metadata.sampler_top_p)
+        metadata.sampler_min_p           = metadata_override.get(Keys.General.SAMPLER_MIN_P,           metadata.sampler_min_p)
+        metadata.sampler_xtc_probability = metadata_override.get(Keys.General.SAMPLER_XTC_PROBABILITY, metadata.sampler_xtc_probability)
+        metadata.sampler_xtc_threshold   = metadata_override.get(Keys.General.SAMPLER_XTC_THRESHOLD,   metadata.sampler_xtc_threshold)
+        metadata.sampler_temp            = metadata_override.get(Keys.General.SAMPLER_TEMP,            metadata.sampler_temp)
+        metadata.sampler_penalty_last_n  = metadata_override.get(Keys.General.SAMPLER_PENALTY_LAST_N,  metadata.sampler_penalty_last_n)
+        metadata.sampler_penalty_repeat  = metadata_override.get(Keys.General.SAMPLER_PENALTY_REPEAT,  metadata.sampler_penalty_repeat)
+        metadata.sampler_mirostat        = metadata_override.get(Keys.General.SAMPLER_MIROSTAT,        metadata.sampler_mirostat)
+        metadata.sampler_mirostat_tau    = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_TAU,    metadata.sampler_mirostat_tau)
+        metadata.sampler_mirostat_eta    = metadata_override.get(Keys.General.SAMPLER_MIROSTAT_ETA,    metadata.sampler_mirostat_eta)
 
         metadata.name            = metadata_override.get(Keys.General.NAME,            metadata.name)
         metadata.author          = metadata_override.get(Keys.General.AUTHOR,          metadata.author)
@@ -603,6 +609,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
             gguf_writer.add_sampler_top_p(self.sampler_top_p)
         if self.sampler_min_p is not None:
             gguf_writer.add_sampler_min_p(self.sampler_min_p)
+        if self.sampler_xtc_probability is not None:
+            gguf_writer.add_sampler_xtc_probability(self.sampler_xtc_probability)
+        if self.sampler_xtc_threshold is not None:
+            gguf_writer.add_sampler_xtc_threshold(self.sampler_xtc_threshold)
         if self.sampler_temp is not None:
             gguf_writer.add_sampler_temp(self.sampler_temp)
         if self.sampler_penalty_last_n is not None:
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 2c30068712f74..dd5b7ae808c91 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -112,14 +112,16 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_TYPE,                   "general.type"                          },
-    { LLM_KV_GENERAL_ARCHITECTURE,           "general.architecture"                  },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION,   "general.quantization_version"          },
-    { LLM_KV_GENERAL_ALIGNMENT,              "general.alignment"                     },
-    { LLM_KV_GENERAL_FILE_TYPE,              "general.file_type"                     },
-    { LLM_KV_GENERAL_SAMPLER_TOP_K,          "general.sampler.top_k"                 },
-    { LLM_KV_GENERAL_SAMPLER_TOP_P,          "general.sampler.top_p"                 },
-    { LLM_KV_GENERAL_SAMPLER_MIN_P,          "general.sampler.min_p"                 },
+    { LLM_KV_GENERAL_TYPE,                    "general.type"                         },
+    { LLM_KV_GENERAL_ARCHITECTURE,            "general.architecture"                 },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION,    "general.quantization_version"         },
+    { LLM_KV_GENERAL_ALIGNMENT,               "general.alignment"                    },
+    { LLM_KV_GENERAL_FILE_TYPE,               "general.file_type"                    },
+    { LLM_KV_GENERAL_SAMPLER_TOP_K,           "general.sampler.top_k"                },
+    { LLM_KV_GENERAL_SAMPLER_TOP_P,           "general.sampler.top_p"                },
+    { LLM_KV_GENERAL_SAMPLER_MIN_P,           "general.sampler.min_p"                },
+    { LLM_KV_GENERAL_SAMPLER_XTC_PROBABILITY, "general.sampler.xtc_probability"      },
+    { LLM_KV_GENERAL_SAMPLER_XTC_THRESHOLD,   "general.sampler.xtc_threshold"        },
     { LLM_KV_GENERAL_SAMPLER_TEMP,           "general.sampler.temperature"           },
     { LLM_KV_GENERAL_SAMPLER_PENALTY_LAST_N, "general.sampler.penalty_last_n"        },
     { LLM_KV_GENERAL_SAMPLER_PENALTY_REPEAT, "general.sampler.penalty_repeat"        },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 8ae8eed093e86..6ed2731e1d4c6 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -124,6 +124,8 @@ enum llm_kv {
     LLM_KV_GENERAL_SAMPLER_TOP_K,
     LLM_KV_GENERAL_SAMPLER_TOP_P,
     LLM_KV_GENERAL_SAMPLER_MIN_P,
+    LLM_KV_GENERAL_SAMPLER_XTC_PROBABILITY,
+    LLM_KV_GENERAL_SAMPLER_XTC_THRESHOLD,
     LLM_KV_GENERAL_SAMPLER_TEMP,
     LLM_KV_GENERAL_SAMPLER_PENALTY_LAST_N,
     LLM_KV_GENERAL_SAMPLER_PENALTY_REPEAT,

From c8845ff5de4f00aba9fbd87f6f6229a67d6f9703 Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 22:42:43 +0800
Subject: [PATCH 08/11] chore: formatting

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 src/llama-arch.cpp | 48 +++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index dd5b7ae808c91..a48d42442edad 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -112,30 +112,30 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_TYPE,                    "general.type"                         },
-    { LLM_KV_GENERAL_ARCHITECTURE,            "general.architecture"                 },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION,    "general.quantization_version"         },
-    { LLM_KV_GENERAL_ALIGNMENT,               "general.alignment"                    },
-    { LLM_KV_GENERAL_FILE_TYPE,               "general.file_type"                    },
-    { LLM_KV_GENERAL_SAMPLER_TOP_K,           "general.sampler.top_k"                },
-    { LLM_KV_GENERAL_SAMPLER_TOP_P,           "general.sampler.top_p"                },
-    { LLM_KV_GENERAL_SAMPLER_MIN_P,           "general.sampler.min_p"                },
-    { LLM_KV_GENERAL_SAMPLER_XTC_PROBABILITY, "general.sampler.xtc_probability"      },
-    { LLM_KV_GENERAL_SAMPLER_XTC_THRESHOLD,   "general.sampler.xtc_threshold"        },
-    { LLM_KV_GENERAL_SAMPLER_TEMP,           "general.sampler.temperature"           },
-    { LLM_KV_GENERAL_SAMPLER_PENALTY_LAST_N, "general.sampler.penalty_last_n"        },
-    { LLM_KV_GENERAL_SAMPLER_PENALTY_REPEAT, "general.sampler.penalty_repeat"        },
-    { LLM_KV_GENERAL_SAMPLER_MIROSTAT,       "general.sampler.mirostat"              },
-    { LLM_KV_GENERAL_SAMPLER_MIROSTAT_TAU,   "general.sampler.mirostat_tau"          },
-    { LLM_KV_GENERAL_SAMPLER_MIROSTAT_ETA,   "general.sampler.mirostat_eta"          },
-    { LLM_KV_GENERAL_NAME,                   "general.name"                          },
-    { LLM_KV_GENERAL_AUTHOR,                 "general.author"                        },
-    { LLM_KV_GENERAL_VERSION,                "general.version"                       },
-    { LLM_KV_GENERAL_URL,                    "general.url"                           },
-    { LLM_KV_GENERAL_DESCRIPTION,            "general.description"                   },
-    { LLM_KV_GENERAL_LICENSE,                "general.license"                       },
-    { LLM_KV_GENERAL_SOURCE_URL,             "general.source.url"                    },
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,         "general.source.huggingface.repository" },
+    { LLM_KV_GENERAL_TYPE,                    "general.type"                          },
+    { LLM_KV_GENERAL_ARCHITECTURE,            "general.architecture"                  },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION,    "general.quantization_version"          },
+    { LLM_KV_GENERAL_ALIGNMENT,               "general.alignment"                     },
+    { LLM_KV_GENERAL_FILE_TYPE,               "general.file_type"                     },
+    { LLM_KV_GENERAL_SAMPLER_TOP_K,           "general.sampler.top_k"                 },
+    { LLM_KV_GENERAL_SAMPLER_TOP_P,           "general.sampler.top_p"                 },
+    { LLM_KV_GENERAL_SAMPLER_MIN_P,           "general.sampler.min_p"                 },
+    { LLM_KV_GENERAL_SAMPLER_XTC_PROBABILITY, "general.sampler.xtc_probability"       },
+    { LLM_KV_GENERAL_SAMPLER_XTC_THRESHOLD,   "general.sampler.xtc_threshold"         },
+    { LLM_KV_GENERAL_SAMPLER_TEMP,            "general.sampler.temp"                  },
+    { LLM_KV_GENERAL_SAMPLER_PENALTY_LAST_N,  "general.sampler.penalty_last_n"        },
+    { LLM_KV_GENERAL_SAMPLER_PENALTY_REPEAT,  "general.sampler.penalty_repeat"        },
+    { LLM_KV_GENERAL_SAMPLER_MIROSTAT,        "general.sampler.mirostat"              },
+    { LLM_KV_GENERAL_SAMPLER_MIROSTAT_TAU,    "general.sampler.mirostat_tau"          },
+    { LLM_KV_GENERAL_SAMPLER_MIROSTAT_ETA,    "general.sampler.mirostat_eta"          },
+    { LLM_KV_GENERAL_NAME,                    "general.name"                          },
+    { LLM_KV_GENERAL_AUTHOR,                  "general.author"                        },
+    { LLM_KV_GENERAL_VERSION,                 "general.version"                       },
+    { LLM_KV_GENERAL_URL,                     "general.url"                           },
+    { LLM_KV_GENERAL_DESCRIPTION,             "general.description"                   },
+    { LLM_KV_GENERAL_LICENSE,                 "general.license"                       },
+    { LLM_KV_GENERAL_SOURCE_URL,              "general.source.url"                    },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,          "general.source.huggingface.repository" },
 
     { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
     { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },

From 33ddb2742c388b8d53a9a85080b8ba405125dcca Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 23:20:40 +0800
Subject: [PATCH 09/11] common: introduce support for general.sampler.sequence

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 common/arg.cpp              |  1 +
 common/common.cpp           | 12 ++++++++++++
 common/common.h             | 23 ++++++++++++-----------
 gguf-py/gguf/constants.py   |  1 +
 gguf-py/gguf/gguf_writer.py |  3 +++
 gguf-py/gguf/metadata.py    | 25 +++++++++++++++----------
 src/llama-arch.cpp          |  1 +
 src/llama-arch.h            |  1 +
 8 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 9ce9df59741b3..c7a85a33c0129 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1232,6 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             const auto sampler_names = string_split<std::string>(value, ';');
             params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
+            params.sampling.sampling_mask |= common_params_sampling::SAMPLING_MASK_BITS_SAMPLERS;
         }
     ).set_sparam());
     add_opt(common_arg(
diff --git a/common/common.cpp b/common/common.cpp
index 0f3a1a6c5461e..0250024a4e497 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -8,6 +8,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -974,6 +975,17 @@ static inline void common_init_sampler_from_model(
         }
     };
 
+    // Sampler sequence: use the model's ';'-separated "general.sampler.sequence" unless the user already chose samplers
+    if (!(mask & common_params_sampling::SAMPLING_MASK_BITS_SAMPLERS)) {
+        char buf[512] = {0};
+        if (llama_model_meta_val_str(model, "general.sampler.sequence", buf, sizeof(buf)) > 0) {
+            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
+            if (!sampler_names.empty()) {
+                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            }
+        }
+    }
+
     get_int32("general.sampler.top_k",           sparams.top_k,           common_params_sampling::SAMPLING_MASK_BITS_TOP_K);
     get_float("general.sampler.top_p",           sparams.top_p,           common_params_sampling::SAMPLING_MASK_BITS_TOP_P);
     get_float("general.sampler.min_p",           sparams.min_p,           common_params_sampling::SAMPLING_MASK_BITS_MIN_P);
diff --git a/common/common.h b/common/common.h
index 5081361394ddd..19a708b2452c2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -167,17 +167,18 @@ struct common_params_sampling {
 
     uint16_t sampling_mask = 0; // bitfield to track user-specified samplers
     enum sampling_mask_bits : uint16_t {
-        SAMPLING_MASK_BITS_TOP_K           = 1 << 0,
-        SAMPLING_MASK_BITS_TOP_P           = 1 << 1,
-        SAMPLING_MASK_BITS_MIN_P           = 1 << 2,
-        SAMPLING_MASK_BITS_XTC_PROBABILITY = 1 << 3,
-        SAMPLING_MASK_BITS_XTC_THRESHOLD   = 1 << 4,
-        SAMPLING_MASK_BITS_TEMP            = 1 << 5,
-        SAMPLING_MASK_BITS_PENALTY_LAST_N  = 1 << 6,
-        SAMPLING_MASK_BITS_PENALTY_REPEAT  = 1 << 7,
-        SAMPLING_MASK_BITS_MIROSTAT        = 1 << 8,
-        SAMPLING_MASK_BITS_MIROSTAT_TAU    = 1 << 9,
-        SAMPLING_MASK_BITS_MIROSTAT_ETA    = 1 << 10,
+        SAMPLING_MASK_BITS_SAMPLERS        = 1 << 0,
+        SAMPLING_MASK_BITS_TOP_K           = 1 << 1,
+        SAMPLING_MASK_BITS_TOP_P           = 1 << 2,
+        SAMPLING_MASK_BITS_MIN_P           = 1 << 3,
+        SAMPLING_MASK_BITS_XTC_PROBABILITY = 1 << 4,
+        SAMPLING_MASK_BITS_XTC_THRESHOLD   = 1 << 5,
+        SAMPLING_MASK_BITS_TEMP            = 1 << 6,
+        SAMPLING_MASK_BITS_PENALTY_LAST_N  = 1 << 7,
+        SAMPLING_MASK_BITS_PENALTY_REPEAT  = 1 << 8,
+        SAMPLING_MASK_BITS_MIROSTAT        = 1 << 9,
+        SAMPLING_MASK_BITS_MIROSTAT_TAU    = 1 << 10,
+        SAMPLING_MASK_BITS_MIROSTAT_ETA    = 1 << 11,
     };
 
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 9af3c20b9124e..be374c678cf30 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -26,6 +26,7 @@ class General:
         FILE_TYPE                  = "general.file_type"
 
         # Recommended Sampler Parameters
+        SAMPLER_SEQUENCE           = "general.sampler.sequence"
         SAMPLER_TOP_K              = "general.sampler.top_k"
         SAMPLER_TOP_P              = "general.sampler.top_p"
         SAMPLER_MIN_P              = "general.sampler.min_p"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 22ba27ed6130c..067c062ba76c3 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -496,6 +496,9 @@ def add_custom_alignment(self, alignment: int) -> None:
     def add_file_type(self, ftype: int) -> None:
         self.add_uint32(Keys.General.FILE_TYPE, ftype)
 
+    def add_sampler_sequence(self, sequence: str) -> None:
+        self.add_string(Keys.General.SAMPLER_SEQUENCE, sequence)
+
     def add_sampler_top_k(self, top_k: int) -> None:
         self.add_int32(Keys.General.SAMPLER_TOP_K, top_k)
 
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index f7ed17bababa7..b783d9a82bad8 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -18,6 +18,7 @@
 @dataclass
 class Metadata:
     # Recommended Sampler Parameters to be written to GGUF KV Store
+    sampler_sequence: Optional[str] = None
     sampler_top_k: Optional[int] = None
     sampler_top_p: Optional[float] = None
     sampler_min_p: Optional[float] = None
@@ -74,22 +75,24 @@ def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Pat
         metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
 
         if gen_config:
-            metadata.sampler_top_k           = gen_config.get("top_k", metadata.sampler_top_k)
-            metadata.sampler_top_p           = gen_config.get("top_p", metadata.sampler_top_p)
-            metadata.sampler_min_p           = gen_config.get("min_p", metadata.sampler_min_p)
+            metadata.sampler_sequence        = gen_config.get("sequence",        metadata.sampler_sequence)
+            metadata.sampler_top_k           = gen_config.get("top_k",           metadata.sampler_top_k)
+            metadata.sampler_top_p           = gen_config.get("top_p",           metadata.sampler_top_p)
+            metadata.sampler_min_p           = gen_config.get("min_p",           metadata.sampler_min_p)
             metadata.sampler_xtc_probability = gen_config.get("xtc_probability", metadata.sampler_xtc_probability)
-            metadata.sampler_xtc_threshold   = gen_config.get("xtc_threshold", metadata.sampler_xtc_threshold)
-            metadata.sampler_temp            = gen_config.get("temperature", metadata.sampler_temp)
-            metadata.sampler_penalty_last_n  = gen_config.get("penalty_last_n", metadata.sampler_penalty_last_n)
-            metadata.sampler_penalty_repeat  = gen_config.get("penalty_repeat", metadata.sampler_penalty_repeat)
-            metadata.sampler_mirostat        = gen_config.get("mirostat", metadata.sampler_mirostat)
-            metadata.sampler_mirostat_tau    = gen_config.get("mirostat_tau", metadata.sampler_mirostat_tau)
-            metadata.sampler_mirostat_eta    = gen_config.get("mirostat_eta", metadata.sampler_mirostat_eta)
+            metadata.sampler_xtc_threshold   = gen_config.get("xtc_threshold",   metadata.sampler_xtc_threshold)
+            metadata.sampler_temp            = gen_config.get("temperature",     metadata.sampler_temp)
+            metadata.sampler_penalty_last_n  = gen_config.get("penalty_last_n",  metadata.sampler_penalty_last_n)
+            metadata.sampler_penalty_repeat  = gen_config.get("penalty_repeat",  metadata.sampler_penalty_repeat)
+            metadata.sampler_mirostat        = gen_config.get("mirostat",        metadata.sampler_mirostat)
+            metadata.sampler_mirostat_tau    = gen_config.get("mirostat_tau",    metadata.sampler_mirostat_tau)
+            metadata.sampler_mirostat_eta    = gen_config.get("mirostat_eta",    metadata.sampler_mirostat_eta)
 
         # Metadata Override File Provided
         # This is based on LLM_KV_NAMES mapping in llama.cpp
         metadata_override = Metadata.load_metadata_override(metadata_override_path)
 
+        metadata.sampler_sequence        = metadata_override.get(Keys.General.SAMPLER_SEQUENCE,        metadata.sampler_sequence)
         metadata.sampler_top_k           = metadata_override.get(Keys.General.SAMPLER_TOP_K,           metadata.sampler_top_k)
         metadata.sampler_top_p           = metadata_override.get(Keys.General.SAMPLER_TOP_P,           metadata.sampler_top_p)
         metadata.sampler_min_p           = metadata_override.get(Keys.General.SAMPLER_MIN_P,           metadata.sampler_min_p)
@@ -603,6 +606,8 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
     def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
         assert self.name is not None
 
+        if self.sampler_sequence is not None:
+            gguf_writer.add_sampler_sequence(self.sampler_sequence)
         if self.sampler_top_k is not None:
             gguf_writer.add_sampler_top_k(self.sampler_top_k)
         if self.sampler_top_p is not None:
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a48d42442edad..423ddf7d41bde 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -117,6 +117,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_QUANTIZATION_VERSION,    "general.quantization_version"          },
     { LLM_KV_GENERAL_ALIGNMENT,               "general.alignment"                     },
     { LLM_KV_GENERAL_FILE_TYPE,               "general.file_type"                     },
+    { LLM_KV_GENERAL_SAMPLER_SEQUENCE,        "general.sampler.sequence"              },
     { LLM_KV_GENERAL_SAMPLER_TOP_K,           "general.sampler.top_k"                 },
     { LLM_KV_GENERAL_SAMPLER_TOP_P,           "general.sampler.top_p"                 },
     { LLM_KV_GENERAL_SAMPLER_MIN_P,           "general.sampler.min_p"                 },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 6ed2731e1d4c6..2f868263a9f17 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -121,6 +121,7 @@ enum llm_kv {
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_FILE_TYPE,
+    LLM_KV_GENERAL_SAMPLER_SEQUENCE,
     LLM_KV_GENERAL_SAMPLER_TOP_K,
     LLM_KV_GENERAL_SAMPLER_TOP_P,
     LLM_KV_GENERAL_SAMPLER_MIN_P,

From fd3fa3a4776215625279561f1e4cf739db0f5625 Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Sun, 9 Nov 2025 23:32:42 +0800
Subject: [PATCH 10/11] gguf-py: revert test_metadata.py

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 gguf-py/tests/test_metadata.py | 103 ---------------------------------
 1 file changed, 103 deletions(-)

diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py
index 93e14807cbb23..40d484f4eaa9d 100755
--- a/gguf-py/tests/test_metadata.py
+++ b/gguf-py/tests/test_metadata.py
@@ -233,109 +233,6 @@ def test_apply_metadata_heuristic_from_model_dir(self):
         expect = gguf.Metadata(name='Hermes 2 Pro Llama 3 8b DPO', finetune='DPO', basename='hermes-2-pro-llama-3', size_label='8B')
         self.assertEqual(got, expect)
 
-    def test_load_generation_config(self):
-        import tempfile
-        import json
-
-        # Test with a valid generation_config.json
-        with tempfile.TemporaryDirectory() as tmpdir:
-            tmpdir_path = Path(tmpdir)
-            gen_config_path = tmpdir_path / "generation_config.json"
-
-            # Create a sample generation_config.json
-            gen_config_data = {
-                "temperature": 0.7,
-                "top_k": 50,
-                "top_p": 0.95,
-                "repetition_penalty": 1.1,
-                "do_sample": True,
-                "max_length": 2048
-            }
-
-            with open(gen_config_path, "w") as f:
-                json.dump(gen_config_data, f)
-
-            # Test loading the file
-            result = gguf.Metadata.load_generation_config(tmpdir_path)
-            self.assertEqual(result, gen_config_data)
-
-        # Test with missing file
-        with tempfile.TemporaryDirectory() as tmpdir:
-            result = gguf.Metadata.load_generation_config(Path(tmpdir))
-            self.assertEqual(result, {})
-
-        # Test with None path
-        result = gguf.Metadata.load_generation_config(None)
-        self.assertEqual(result, {})
-
-    def test_metadata_load_with_generation_config(self):
-        import tempfile
-        import json
-
-        # Test that generation_config values are properly loaded into metadata
-        with tempfile.TemporaryDirectory() as tmpdir:
-            tmpdir_path = Path(tmpdir)
-            gen_config_path = tmpdir_path / "generation_config.json"
-
-            # Create a sample generation_config.json with sampling parameters
-            gen_config_data = {
-                "temperature": 0.8,
-                "top_k": 40,
-                "top_p": 0.9,
-                "min_p": 0.05,
-                "repetition_penalty": 1.15,
-            }
-
-            with open(gen_config_path, "w") as f:
-                json.dump(gen_config_data, f)
-
-            # Load metadata with generation config
-            metadata = gguf.Metadata.load(model_path=tmpdir_path)
-
-            # Verify sampling parameters were loaded
-            self.assertEqual(metadata.sampler_temp, 0.8)
-            self.assertEqual(metadata.sampler_top_k, 40)
-            self.assertEqual(metadata.sampler_top_p, 0.9)
-            self.assertEqual(metadata.sampler_min_p, 0.05)
-            self.assertEqual(metadata.sampler_penalty_repeat, 1.15)
-
-    def test_metadata_override_precedence(self):
-        import tempfile
-        import json
-
-        # Test that metadata_override takes precedence over generation_config
-        with tempfile.TemporaryDirectory() as tmpdir:
-            tmpdir_path = Path(tmpdir)
-            gen_config_path = tmpdir_path / "generation_config.json"
-            metadata_override_path = tmpdir_path / "metadata.json"
-
-            # Create generation_config.json
-            gen_config_data = {
-                "temperature": 0.7,
-                "top_k": 50,
-            }
-            with open(gen_config_path, "w") as f:
-                json.dump(gen_config_data, f)
-
-            # Create metadata.json that overrides temperature
-            metadata_override_data = {
-                "general.sampler.temp": 0.5,
-            }
-            with open(metadata_override_path, "w") as f:
-                json.dump(metadata_override_data, f)
-
-            # Load metadata with both files present
-            metadata = gguf.Metadata.load(
-                metadata_override_path=metadata_override_path,
-                model_path=tmpdir_path
-            )
-
-            # Verify that metadata_override takes precedence for temperature
-            self.assertEqual(metadata.sampler_temp, 0.5)
-            # Verify that generation_config value is used for top_k
-            self.assertEqual(metadata.sampler_top_k, 50)
-
 
 if __name__ == "__main__":
     unittest.main()
-

From fc91c1025f66e4cd4f6c8f795e3af6e5807a0e96 Mon Sep 17 00:00:00 2001
From: Aaron Teo <aaron.teo1@ibm.com>
Date: Mon, 10 Nov 2025 01:18:05 +0800
Subject: [PATCH 11/11] gguf-py: fix linting

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---
 gguf-py/gguf/metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index b783d9a82bad8..2244c072fde3b 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -227,7 +227,7 @@ def load_generation_config(model_path: Optional[Path] = None) -> dict[str, Any]:
         try:
             with open(generation_config_path, "r", encoding="utf-8") as f:
                 return json.load(f)
-        except (json.JSONDecodeError, IOError) as e:
+        except (json.JSONDecodeError, IOError):
             # not all models have valid generation_config.json
             return {}