@@ -76,13 +76,13 @@ llama_context::llama_context(
     }
 
     if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
-        cparams.causal_attn = hparams.causal_attn;
+        cparams.attn_type = hparams.causal_attn ? LLAMA_ATTENTION_TYPE_CAUSAL : LLAMA_ATTENTION_TYPE_NON_CAUSAL;
     } else {
-        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+        cparams.attn_type = params.attention_type;
     }
 
     // with causal attention, the batch size is limited by the context size
-    cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
+    cparams.n_batch = cparams.use_past_tokens() ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
 
     // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
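The `cparams.use_past_tokens()` helper used here is not defined in this excerpt; presumably it lives alongside `llama_cparams` and reports whether the configured attention type attends to cached past tokens. A minimal sketch, assuming it simply maps the causal case of the enum to a bool:

    // sketch only, not part of this diff: assumes only causal attention
    // needs tokens from previous positions in the KV cache
    bool llama_cparams::use_past_tokens() const {
        return attn_type == LLAMA_ATTENTION_TYPE_CAUSAL;
    }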
@@ -102,7 +102,7 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n",   __func__, n_ctx_per_seq);
     LLAMA_LOG_INFO("%s: n_batch       = %u\n",   __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
-    LLAMA_LOG_INFO("%s: causal_attn   = %d\n",   __func__, cparams.causal_attn);
+    LLAMA_LOG_INFO("%s: attn_type     = %d\n",   __func__, cparams.attn_type);
     LLAMA_LOG_INFO("%s: flash_attn    = %d\n",   __func__, cparams.flash_attn);
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
@@ -966,10 +966,10 @@ void llama_context::set_embeddings(bool value) {
     cparams.embeddings = value;
 }
 
-void llama_context::set_causal_attn(bool value) {
+void llama_context::set_attn_type(enum llama_attention_type value) {
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
 
-    cparams.causal_attn = value;
+    cparams.attn_type = value;
 }
 
 void llama_context::set_warmup(bool value) {
@@ -1074,12 +1074,12 @@ int llama_context::encode(llama_batch & inp_batch) {
     ggml_backend_sched_reset(sched.get());
     ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-    const auto causal_attn_org = cparams.causal_attn;
+    const auto attn_type_org = cparams.attn_type;
 
     // always use non-causal attention for encoder graphs
     // TODO: this is a tmp solution until we have a proper way to support enc-dec models
     // ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
-    cparams.causal_attn = false;
+    cparams.attn_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL;
 
     auto * gf = graph_init();
     auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);
@@ -1088,7 +1088,7 @@ int llama_context::encode(llama_batch & inp_batch) {
 
     res->set_inputs(&ubatch);
 
-    cparams.causal_attn = causal_attn_org;
+    cparams.attn_type = attn_type_org;
 
     const auto compute_status = graph_compute(gf, n_tokens > 1);
     switch (compute_status) {
@@ -1242,7 +1242,7 @@ int llama_context::decode(llama_batch & inp_batch) {
 
     GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
-    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
+    GGML_ASSERT((!cparams.use_past_tokens() || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
 
     if (t_compute_start_us == 0) {
         t_compute_start_us = ggml_time_us();
@@ -1495,7 +1495,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     // synchronize();
 
     // decide if we need to defrag the kv cache
-    if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
+    if (cparams.use_past_tokens() && cparams.defrag_thold > 0.0f) {
         // - do not defrag small contexts (i.e. < 2048 tokens)
        // - count the padding towards the number of used tokens
         const float fragmentation = kv_self->n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self->used + kv_self->get_padding(cparams))/float(kv_self->n)) : 0.0f;
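For illustration (the numbers are hypothetical): with kv_self->n = 4096 cells, kv_self->used = 1024 and a padding of 32, the estimate is 1.0f - (1024 + 32)/4096 ≈ 0.74, which exceeds a threshold such as defrag_thold = 0.1, so a defrag would be scheduled; any cache smaller than 2048 cells reports 0.0f and is never defragmented.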
@@ -2410,8 +2410,12 @@ void llama_set_embeddings(llama_context * ctx, bool embeddings) {
     ctx->set_embeddings(embeddings);
 }
 
+void llama_set_attn_type(llama_context * ctx, llama_attention_type type) {
+    ctx->set_attn_type(type);
+}
+
 void llama_set_causal_attn(llama_context * ctx, bool causal_attn) {
-    ctx->set_causal_attn(causal_attn);
+    ctx->set_attn_type(causal_attn ? LLAMA_ATTENTION_TYPE_CAUSAL : LLAMA_ATTENTION_TYPE_NON_CAUSAL);
 }
 
 void llama_set_warmup(llama_context * ctx, bool warmup) {
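Assuming the PR also declares `llama_set_attn_type` in `llama.h` (the header change is not shown in this excerpt), a caller could switch attention modes on an existing context without going through the old bool setter. A minimal sketch with placeholder names:

    // sketch: run an embedding pass with non-causal attention,
    // then restore causal attention for normal decoding
    void run_embedding_pass(llama_context * ctx, llama_batch batch) {
        llama_set_attn_type(ctx, LLAMA_ATTENTION_TYPE_NON_CAUSAL);
        llama_encode(ctx, batch); // or llama_decode(), depending on the model

        llama_set_attn_type(ctx, LLAMA_ATTENTION_TYPE_CAUSAL);
        // the existing llama_set_causal_attn(ctx, true) keeps working and
        // now forwards to set_attn_type(), as shown above
    }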