@@ -32,6 +32,309 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t
     return relative_bucket;
 }
 
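+// note: fn_build_graph_worst is expected to build a worst-case graph for a given ubatch; it is only used below to size the scheduler's compute buffers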
+llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function<ggml_cgraph *(llama_context &, const llama_ubatch &)> fn_build_graph_worst) :
+    model(model),
+    t_start_us(model.t_start_us),
+    t_load_us (model.t_load_us) {
+
+    const auto & hparams = model.hparams;
+
+    cparams.n_seq_max        = std::max(1u, params.n_seq_max);
+    cparams.n_threads        = params.n_threads;
+    cparams.n_threads_batch  = params.n_threads_batch;
+    cparams.yarn_ext_factor  = params.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor;
+    cparams.yarn_beta_fast   = params.yarn_beta_fast;
+    cparams.yarn_beta_slow   = params.yarn_beta_slow;
+    cparams.defrag_thold     = params.defrag_thold;
+    cparams.embeddings       = params.embeddings;
+    cparams.offload_kqv      = params.offload_kqv;
+    cparams.flash_attn       = params.flash_attn;
+    cparams.no_perf          = params.no_perf;
+    cparams.pooling_type     = params.pooling_type;
+
+    cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
+    cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
+    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+
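+    // round the context size up to a multiple of the KV cache padding returned by get_ctx_padding()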
+    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams));
+
+    // with causal attention, the batch size is limited by the context size
+    cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
+
+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
+
+    cparams.n_ubatch         = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
+    cparams.n_ctx_orig_yarn  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
+                               hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
+                                                              hparams.n_ctx_train;
+
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+
+    auto rope_scaling_type = params.rope_scaling_type;
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
+        rope_scaling_type = hparams.rope_scaling_type_train;
+    }
+
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
+        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
+    }
+
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
+    }
+
+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
+
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+        if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+            cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+        } else {
+            cparams.pooling_type = hparams.pooling_type;
+        }
+    }
+
+    if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
+        cparams.causal_attn = hparams.causal_attn;
+    } else {
+        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+    }
+
+    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
+    LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
+    LLAMA_LOG_INFO("%s: n_ctx         = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n",   __func__, n_ctx_per_seq);
+    LLAMA_LOG_INFO("%s: n_batch       = %u\n",   __func__, cparams.n_batch);
+    LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn    = %d\n",   __func__, cparams.flash_attn);
+    LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
+
+    if (n_ctx_per_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, n_ctx_per_seq, hparams.n_ctx_train);
+    }
+
+    logits_all = params.logits_all;
+
+    // build worst-case graph for encoder if a model contains encoder
+    is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder()
+
+    uint32_t kv_size = cparams.n_ctx;
+    ggml_type type_k = params.type_k;
+    ggml_type type_v = params.type_v;
+
+    // Mamba only needs a constant number of KV cache cells per sequence
+    if (llama_model_is_recurrent(&model)) {
+        // Mamba needs at least as many KV cells as there are sequences kept at any time
+        kv_size = std::max((uint32_t) 1, params.n_seq_max);
+        // it's probably best to keep as much precision as possible for the states
+        type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
+        type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
+    }
+
+    GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
+
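+    // vocab-only contexts do not run any computation, so backends, KV cache, output buffer and scheduler are skipped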
+    if (!hparams.vocab_only) {
+        // GPU backends
+        for (auto * dev : model.devices) {
+            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                throw std::runtime_error("failed to initialize backend");
+            }
+            backends.emplace_back(backend);
+        }
+
+        // add ACCEL backends (such as BLAS)
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+                ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                    throw std::runtime_error("failed to initialize backend");
+                }
+                backends.emplace_back(backend);
+            }
+        }
+
+        // add CPU backend
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        if (backend_cpu == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+            throw std::runtime_error("failed to initialize CPU backend");
+        }
+        backends.emplace_back(backend_cpu);
+
+        // create a list of the set_n_threads functions in the backends
+        for (auto & backend : backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
+            ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+            if (reg) {
+                auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+                if (ggml_backend_set_n_threads_fn) {
+                    set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
+                }
+            }
+        }
+
+        llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data);
+
+        if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) {
+            LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
+            throw std::runtime_error("failed to initialize self-attention cache");
+        }
+
+        {
+            const size_t memory_size_k = kv_self.size_k_bytes();
+            const size_t memory_size_v = kv_self.size_v_bytes();
+
+            LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+                    (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                    ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+                    ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
+        }
+
+        // graph outputs buffer
+        {
+            // resized during inference when a batch uses more outputs
+            if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) {
+                LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
+                throw std::runtime_error("failed to reserve initial output buffer");
+            }
+
+            LLAMA_LOG_INFO("%s: %10s  output buffer size = %8.2f MiB\n", __func__,
+                    ggml_backend_buffer_name    (buf_output.get()),
+                    ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0);
+        }
+
+        // scheduler and compute buffers
+        {
+            // buffer types used for the compute buffer of each backend
+            std::vector<ggml_backend_buffer_type_t> backend_buft;
+            std::vector<ggml_backend_t>             backend_ptrs;
+            for (auto & backend : backends) {
+                auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+                auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
+                    // use the host buffer of the first device CPU for faster transfer of the intermediate state
+                    auto * dev = model.devices[0];
+                    auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+                    if (host_buft) {
+                        buft = host_buft;
+                    }
+                }
+                backend_buft.push_back(buft);
+                backend_ptrs.push_back(backend.get());
+            }
+
+            const size_t max_nodes = model.max_nodes();
+
+            // buffer used to store the computation graph and the tensor meta data
+            buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+
+            // TODO: move these checks to ggml_backend_sched
+            // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
+            bool pipeline_parallel =
+                model.n_devices() > 1 &&
+                model.params.n_gpu_layers > (int) model.hparams.n_layer &&
+                model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+                params.offload_kqv;
+
+            // pipeline parallelism requires support for async compute and events in all devices
+            if (pipeline_parallel) {
+                for (auto & backend : backends) {
+                    auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                    if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
+                        // ignore CPU backend
+                        continue;
+                    }
+                    auto * dev = ggml_backend_get_device(backend.get());
+                    ggml_backend_dev_props props;
+                    ggml_backend_dev_get_props(dev, &props);
+                    if (!props.caps.async || !props.caps.events) {
+                        // device does not support async compute or events
+                        pipeline_parallel = false;
+                        break;
+                    }
+                }
+            }
+
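+            // create the scheduler spanning all backends; its compute buffers are allocated below from the worst-case graphs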
+            sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
+
+            if (pipeline_parallel) {
+                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
+            }
+
+            // initialize scheduler with the worst-case graph
+            uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+            uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+            llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+
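+            // "pp" = prompt processing (largest possible ubatch), "tg" = token generation (single-token ubatch)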
+            llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr };
+            ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp);
+
+            // reserve pp graph first so that buffers are only allocated once
+            ggml_backend_sched_reserve(sched.get(), gf_pp);
+            int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
+            int n_nodes_pp  = ggml_graph_n_nodes(gf_pp);
+
+            // reserve with tg graph to get the number of splits and nodes
+            llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr };
+            ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg);
+            ggml_backend_sched_reserve(sched.get(), gf_tg);
+            int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
+            int n_nodes_tg  = ggml_graph_n_nodes(gf_tg);
+
+            // reserve again with pp graph to avoid ggml-alloc reallocations during inference
+            gf_pp = fn_build_graph_worst(*this, ubatch_pp);
+            if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
+                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+                throw std::runtime_error("failed to allocate compute buffers");
+            }
+
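+            // report the compute buffer allocated on each backend (buffers of size <= 1 byte are skipped)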
+            for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+                ggml_backend_t backend = backend_ptrs[i];
+                ggml_backend_buffer_type_t buft = backend_buft[i];
+                size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+                if (size > 1) {
+                    LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                            ggml_backend_buft_name(buft),
+                            size / 1024.0 / 1024.0);
+                }
+            }
+
+            if (n_nodes_pp == n_nodes_tg) {
+                LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, n_nodes_pp);
+            } else {
+                LLAMA_LOG_INFO("%s: graph nodes  = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
+            }
+            if (n_splits_pp == n_splits_tg) {
+                LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
+            } else {
+                LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
+            }
+        }
+    }
+
+}
+
 struct llama_batch_manager : public llama_batch_manager_i {
     llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) {
         const auto & hparams = lctx.model.hparams;
@@ -81,7 +384,7 @@ struct llama_batch_manager : public llama_batch_manager_i {
 
         // non-causal masks do not use the KV cache
         if (hparams.causal_attn) {
-            llama_kv_self_update(&lctx);
+            lctx.kv_self_update();
 
             // if we have enough unused cells before the current head ->
             //   better to start searching from the beginning of the cache, hoping to fill it
@@ -106,6 +409,8 @@ struct llama_batch_manager : public llama_batch_manager_i {
             }
         }
 
+        // printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
+
         return true;
     }
 