Commit 533eee5

forward and tokenize chooser use the same shape (#3196)
* forward and tokenize chooser use the same shape: concatenation and filtering happen on CPU tensors to avoid dynamic shapes on HPU

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* use HPU set seed

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

---------

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
1 parent 51a0b9d commit 533eee5
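
As a reading aid, here is a minimal sketch (not part of this commit) of the pattern the commit message describes: concatenation, filtering and padding are done on CPU tensors so that the tensors reaching the HPU always have a fixed, bucketed shape, and only the result is copied host-to-device with a non-blocking copy like the _async_h2d_tensor_copy helper added in common.py below. The BUCKET_SIZE value and the pad_to_bucket helper are hypothetical, chosen only for illustration.

import torch

BUCKET_SIZE = 8  # hypothetical bucket size, for illustration only


def pad_to_bucket(lengths: torch.Tensor, bucket_size: int = BUCKET_SIZE) -> torch.Tensor:
    # Pad a CPU tensor up to the next multiple of `bucket_size` so the shape the
    # HPU graph sees stays constant from batch to batch.
    assert lengths.device.type == "cpu"
    padded_len = ((lengths.shape[0] + bucket_size - 1) // bucket_size) * bucket_size
    padded = torch.zeros(padded_len, dtype=lengths.dtype)
    padded[: lengths.shape[0]] = lengths
    return padded


# Concatenation and filtering happen on CPU, where dynamic shapes are cheap...
input_lengths_cpu = torch.cat(
    [torch.tensor([5, 7], dtype=torch.int32), torch.tensor([3], dtype=torch.int32)]
)
input_lengths_cpu = pad_to_bucket(input_lengths_cpu)

# ...and only the fixed-shape result is copied host-to-device, non-blocking
# (requires a Gaudi/HPU device with habana_frameworks installed).
input_lengths_hpu = torch.empty(
    input_lengths_cpu.shape, dtype=input_lengths_cpu.dtype, device="hpu"
)
input_lengths_hpu.copy_(input_lengths_cpu, non_blocking=True)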

File tree

6 files changed: 376 additions, 481 deletions

backends/gaudi/server/text_generation_server/layers/attention/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,7 @@
     HPUPagedAttentionMetadata,
     trim_attn_metadata,
     trim_seqlen_metadata,
+    _async_h2d_tensor_copy,
 )
 
 from .hpu import (
@@ -25,4 +26,5 @@
     "HPUPagedAttentionMetadata",
     "trim_seqlen_metadata",
     "trim_attn_metadata",
+    "_async_h2d_tensor_copy",
 ]
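
Because the helper is now re-exported here, call sites can import it from the attention package together with the other attention utilities. A hedged sketch of such a call site follows; the tensor is a stand-in for whatever the CPU-side code actually builds.

import torch
from text_generation_server.layers.attention import _async_h2d_tensor_copy

# Any metadata tensor assembled on CPU (already padded to a fixed shape) can be
# moved to the HPU without blocking the host:
input_lengths_cpu = torch.tensor([5, 7, 3, 0], dtype=torch.int32)
input_lengths_hpu = _async_h2d_tensor_copy(input_lengths_cpu)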

backends/gaudi/server/text_generation_server/layers/attention/common.py

Lines changed: 9 additions & 27 deletions
@@ -75,42 +75,27 @@ def trim_attn_metadata(metadata: HPUPagedAttentionMetadata) -> object:
 @dataclass
 class Seqlen:
     input_lengths: torch.Tensor
-    cache_lengths: torch.Tensor
-    cu_seqlen_q: Optional[torch.Tensor]
-    cu_seqlen_k: Optional[torch.Tensor]
 
     def __init__(
         self,
         input_lengths,
-        cache_lengths,
-        cu_seqlen_q=None,
     ):
         self.input_lengths = input_lengths
-        self.cache_lengths = cache_lengths
-        device = self.input_lengths.device
-        shape = self.input_lengths.shape
-        if cu_seqlen_q is None:
-            cu_seqlen_q = torch.arange(
-                shape[0] + 1,
-                device=device,
-                dtype=torch.int32,
-            )
-        cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32)
-
-        # cuda graphs don't like this and this is necessary to clamp within mistral
-        # Although FA2 might not want the clamping
-        # cu_seqlen_k[0] = 0
-        total = self.input_lengths + self.cache_lengths
-        torch.cumsum(total, -1, out=cu_seqlen_k[1:])
-
-        self.cu_seqlen_q = cu_seqlen_q
-        self.cu_seqlen_k = cu_seqlen_k
 
     def clamp(self, max):
         # Flash decoding doesn't need to clamp
         return self
 
 
+def _async_h2d_tensor_copy(source, device="hpu"):
+    if source is None:
+        return None
+    assert source.device.type == "cpu", "Source tensor is not present in host memory!"
+    target = torch.empty(source.shape, dtype=source.dtype, device=device)
+    target.copy_(source, non_blocking=True)
+    return target
+
+
 def trim_seqlen_metadata(metadata: Seqlen) -> object:
     # NOTE(kzawora): To anyone working on this in the future:
     # Trimming metadata is required when using HPUGraphs.
@@ -137,9 +122,6 @@ def trim_seqlen_metadata(metadata: Seqlen) -> object:
         "TrimmedSeqlen",
         [
             "input_lengths",
-            "cache_lengths",
-            "cu_seqlen_q",
-            "cu_seqlen_k",
         ],
     )
     return attention_metadata
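
A hedged sketch of how the slimmed-down Seqlen, the new copy helper and trim_seqlen_metadata fit together; the model code touched elsewhere in this commit may wire them differently, and the tensor values here are made up.

import torch
from text_generation_server.layers.attention.common import (
    Seqlen,
    _async_h2d_tensor_copy,
    trim_seqlen_metadata,
)

# input_lengths is assembled and padded on CPU, then copied to the HPU...
input_lengths_cpu = torch.tensor([5, 7, 3, 0], dtype=torch.int32)
seqlen = Seqlen(input_lengths=_async_h2d_tensor_copy(input_lengths_cpu))

# ...and trimmed before use under HPU graphs, as the NOTE in trim_seqlen_metadata explains.
seqlen = trim_seqlen_metadata(seqlen)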
