Move SDPAToPA-related functionality to runtime (#2937)

CuriousPanCake · web-flow · commit 2689c9ec9e28 · 2025-11-28T17:08:59.000+04:00
## Description Move the SDPAToPA-related functionality into the transformation itself, so that code which belongs to a single component is kept together rather than split across components. This makes the transformation more isolated and easier to test. Ticket: [CVS-165769](https://jira.devtools.intel.com/browse/CVS-165769) Signed-off-by: Andrii Staikov <andrii.staikov@intel.com>
diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml
@@ -43,7 +43,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
+        revision: latest_available_commit
         # Set specific revision and uncomment to use OV from its PR build:
         # branch_name: master
         # event_name: pull_request
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -93,7 +93,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
-        revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
+        revision: latest_available_commit
         # Set specific revision and uncomment to use OV from its PR build:
         # branch_name: master
         # event_name: pull_request
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
@@ -85,7 +85,7 @@ jobs:
         platform: macos_14_7
         arch: 'arm64'
         commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
-        revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
+        revision: latest_available_commit
         # Set specific revision and uncomment to use OV from its PR build:
         # branch_name: master
         # event_name: pull_request
diff --git a/.github/workflows/manylinux_2_28.yml b/.github/workflows/manylinux_2_28.yml
@@ -93,7 +93,10 @@ jobs:
       with:
         platform: almalinux8
         commit_packages_to_provide: wheels,developer_package.tar.gz,openvino_node_npm_package.tar.gz
-        revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
+        revision: latest_available_commit
+        # Set specific revision and uncomment to use OV from its PR build:
+        # branch_name: master
+        # event_name: pull_request
 
     - name: Clone docker tag from OpenVINO repo
       uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
@@ -89,7 +89,7 @@ jobs:
       with:
         platform: windows
         commit_packages_to_provide: wheels,openvino_node_npm_package.zip
-        revision: fcf7c2964cf460ecfcb039f748d1f4028626d58c
+        revision: latest_available_commit
         # Set specific revision and uncomment to use OV from its PR build:
         # branch_name: master
         # event_name: pull_request
diff --git a/src/cpp/src/continuous_batching/paged_attention_transformations.cpp b/src/cpp/src/continuous_batching/paged_attention_transformations.cpp
diff --git a/src/cpp/src/continuous_batching/paged_attention_transformations.hpp b/src/cpp/src/continuous_batching/paged_attention_transformations.hpp
@@ -13,19 +13,6 @@ namespace genai {
 
 namespace utils {
 
-/** Applies transformations to the ov::Model to enable paged attention inference.
- * @param model Pointer to the ov::Model representing one of the supported LLM architectures.
- * @param device_config Configuration struct for inferencing device specifics.
- * @param per_layer_cache_control If true, then the transformations will enable per-layer control of KV cache blocks, allowing to specify
- * different sets of KV cache blocks for different attention layers. If false, then the KV cache block structure will be identical across all
- * decoder layers.
- * @param allow_cache_rotation If true, then the transformations will enable additional per-layer inputs to perform re-rotation of specific
- * blocks (in a RoPE fashion) before the inference step.
- * @param allow_xattention If true, then the transformations will enable additional per-layer inputs to control the XAttention block-sparse
- * attention optimization.
- */
-void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, bool per_layer_cache_control = false, bool allow_cache_rotation = false, bool allow_xattention = false);
-
 void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);
 
 }  // namespace utils
diff --git a/src/cpp/src/continuous_batching/pipeline_impl.cpp b/src/cpp/src/continuous_batching/pipeline_impl.cpp
@@ -13,6 +13,7 @@
 #endif
 
 #include "openvino/genai/text_streamer.hpp"
+#include "openvino/pass/sdpa_to_paged_attention.hpp"
 #include "continuous_batching/pipeline_impl.hpp"
 #include "utils.hpp"
 #include "continuous_batching/paged_attention_transformations.hpp"
@@ -76,7 +77,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
     bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
     bool allow_cache_rotation = scheduler_config.cache_eviction_config.apply_rotation;
     bool allow_xattention = scheduler_config.use_sparse_attention && scheduler_config.sparse_attention_config.mode == SparseAttentionMode::XATTENTION;
-    utils::apply_paged_attention_transformations(model, is_need_per_layer_cache_control, allow_cache_rotation, allow_xattention);
+    bool allow_score_aggregation = true;
+    ov::pass::SDPAToPagedAttention(is_need_per_layer_cache_control, is_need_per_layer_cache_control, allow_score_aggregation, allow_cache_rotation, allow_xattention).run_on_model(model);
     utils::apply_gather_before_matmul_transformation(model);
 
     initialize_pipeline(model, scheduler_config, device, properties);
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -4,6 +4,7 @@
 #include <thread>
 
 #include "openvino/genai/text_streamer.hpp"
+#include "openvino/pass/sdpa_to_paged_attention.hpp"
 #include "speculative_decoding_impl.hpp"
 #include "continuous_batching/paged_attention_transformations.hpp"
 #include "utils.hpp"
@@ -35,9 +36,17 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
 
     auto main_scheduler_config = main_model_desc.scheduler_config;
     auto main_device = main_model_desc.device;
-
-    utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction);
-    utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);
+    bool allow_score_aggregation = true;
+    bool allow_xattention = false;
+
+    ov::pass::SDPAToPagedAttention(main_model_desc.scheduler_config.use_cache_eviction,
+                                   main_model_desc.scheduler_config.use_cache_eviction,
+                                   allow_score_aggregation,
+                                   allow_xattention).run_on_model(main_model);
+    ov::pass::SDPAToPagedAttention(main_model_desc.scheduler_config.use_cache_eviction,
+                                   main_model_desc.scheduler_config.use_cache_eviction,
+                                   allow_score_aggregation,
+                                   allow_xattention).run_on_model(draft_model);
 
     utils::apply_gather_before_matmul_transformation(main_model);
     utils::apply_gather_before_matmul_transformation(draft_model);