Skip to content

Commit 2689c9e

Browse files
Move SDPAToPA-related functionality to runtime (#2937)
## Description Move the SDPAToPA-related functionality into the transformation itself, so that code belonging to a single component is no longer split across separate components. This helps make the transformation more isolated and testable. Ticket: [CVS-165769](https://jira.devtools.intel.com/browse/CVS-165769) Signed-off-by: Andrii Staikov <andrii.staikov@intel.com>
1 parent eabc2de commit 2689c9e

File tree

9 files changed

+23
-90
lines changed

9 files changed

+23
-90
lines changed

.github/workflows/coverity.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ jobs:
4343
with:
4444
platform: ubuntu22
4545
commit_packages_to_provide: wheels
46-
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
46+
revision: latest_available_commit
4747
# Set specific revision and uncomment to use OV from its PR build:
4848
# branch_name: master
4949
# event_name: pull_request

.github/workflows/linux.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ jobs:
9393
with:
9494
platform: ubuntu22
9595
commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
96-
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
96+
revision: latest_available_commit
9797
# Set specific revision and uncomment to use OV from its PR build:
9898
# branch_name: master
9999
# event_name: pull_request

.github/workflows/mac.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ jobs:
8585
platform: macos_14_7
8686
arch: 'arm64'
8787
commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
88-
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
88+
revision: latest_available_commit
8989
# Set specific revision and uncomment to use OV from its PR build:
9090
# branch_name: master
9191
# event_name: pull_request

.github/workflows/manylinux_2_28.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,10 @@ jobs:
9393
with:
9494
platform: almalinux8
9595
commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
96-
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
96+
revision: latest_available_commit
97+
# Set specific revision and uncomment to use OV from its PR build:
98+
# branch_name: master
99+
# event_name: pull_request
97100

98101
- name: Clone docker tag from OpenVINO repo
99102
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1

.github/workflows/windows.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ jobs:
8989
with:
9090
platform: windows
9191
commit_packages_to_provide: wheels,openvino_node_npm_package.zip
92-
revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
92+
revision: latest_available_commit
9393
# Set specific revision and uncomment to use OV from its PR build:
9494
# branch_name: master
9595
# event_name: pull_request

src/cpp/src/continuous_batching/paged_attention_transformations.cpp

Lines changed: 0 additions & 68 deletions
This file was deleted.

src/cpp/src/continuous_batching/paged_attention_transformations.hpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,6 @@ namespace genai {
1313

1414
namespace utils {
1515

16-
/** Applies transformations to the ov::Model to enable paged attention inference.
17-
* @param model Pointer to the ov::Model representing one of the supported LLM architectures.
18-
* @param device_config Configuration struct for inferencing device specifics.
19-
* @param per_layer_cache_control If true, then the transformations will enable per-layer control of KV cache blocks, allowing to specify
20-
* different sets of KV cache blocks for different attention layers. If false, then the KV cache block structure will be identical across all
21-
* decoder layers.
22-
* @param allow_cache_rotation If true, then the transformations will enable additional per-layer inputs to perform re-rotation of specific
23-
* blocks (in a RoPE fashion) before the inference step.
24-
* @param allow_xattention If true, then the transformations will enable additional per-layer inputs to control the XAttention block-sparse
25-
* attention optimization.
26-
*/
27-
void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, bool per_layer_cache_control = false, bool allow_cache_rotation = false, bool allow_xattention = false);
28-
2916
void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);
3017

3118
} // namespace utils

src/cpp/src/continuous_batching/pipeline_impl.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#endif
1414

1515
#include "openvino/genai/text_streamer.hpp"
16+
#include "openvino/pass/sdpa_to_paged_attention.hpp"
1617
#include "continuous_batching/pipeline_impl.hpp"
1718
#include "utils.hpp"
1819
#include "continuous_batching/paged_attention_transformations.hpp"
@@ -76,7 +77,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
7677
bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
7778
bool allow_cache_rotation = scheduler_config.cache_eviction_config.apply_rotation;
7879
bool allow_xattention = scheduler_config.use_sparse_attention && scheduler_config.sparse_attention_config.mode == SparseAttentionMode::XATTENTION;
79-
utils::apply_paged_attention_transformations(model, is_need_per_layer_cache_control, allow_cache_rotation, allow_xattention);
80+
bool allow_score_aggregation = true;
81+
ov::pass::SDPAToPagedAttention(is_need_per_layer_cache_control, is_need_per_layer_cache_control, allow_score_aggregation, allow_cache_rotation, allow_xattention).run_on_model(model);
8082
utils::apply_gather_before_matmul_transformation(model);
8183

8284
initialize_pipeline(model, scheduler_config, device, properties);

src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <thread>
55

66
#include "openvino/genai/text_streamer.hpp"
7+
#include "openvino/pass/sdpa_to_paged_attention.hpp"
78
#include "speculative_decoding_impl.hpp"
89
#include "continuous_batching/paged_attention_transformations.hpp"
910
#include "utils.hpp"
@@ -35,9 +36,17 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
3536

3637
auto main_scheduler_config = main_model_desc.scheduler_config;
3738
auto main_device = main_model_desc.device;
38-
39-
utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction);
40-
utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);
39+
bool allow_score_aggregation = true;
40+
bool allow_xattention = false;
41+
42+
ov::pass::SDPAToPagedAttention(main_model_desc.scheduler_config.use_cache_eviction,
43+
main_model_desc.scheduler_config.use_cache_eviction,
44+
allow_score_aggregation,
45+
allow_xattention).run_on_model(main_model);
46+
ov::pass::SDPAToPagedAttention(main_model_desc.scheduler_config.use_cache_eviction,
47+
main_model_desc.scheduler_config.use_cache_eviction,
48+
allow_score_aggregation,
49+
allow_xattention).run_on_model(draft_model);
4150

4251
utils::apply_gather_before_matmul_transformation(main_model);
4352
utils::apply_gather_before_matmul_transformation(draft_model);

0 commit comments

Comments
 (0)