From c5bdec73d8132ba84a9b9f9d1369e4f39d5c25d0 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Sun, 29 Jun 2025 22:29:42 +0530 Subject: [PATCH 01/10] Init updated with informer --- aeon/networks/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aeon/networks/__init__.py b/aeon/networks/__init__.py index b6dd2d02dd..735dd354f3 100644 --- a/aeon/networks/__init__.py +++ b/aeon/networks/__init__.py @@ -18,6 +18,7 @@ "AEDRNNNetwork", "AEBiGRUNetwork", "DisjointCNNNetwork", + "InformerNetwork", ] from aeon.networks._ae_abgru import AEAttentionBiGRUNetwork from aeon.networks._ae_bgru import AEBiGRUNetwork @@ -31,6 +32,7 @@ from aeon.networks._encoder import EncoderNetwork from aeon.networks._fcn import FCNNetwork from aeon.networks._inception import InceptionNetwork +from aeon.networks._informer import InformerNetwork from aeon.networks._lite import LITENetwork from aeon.networks._mlp import MLPNetwork from aeon.networks._resnet import ResNetNetwork From 18e557d1e39f895159a7b110241e74fbd9e3442b Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 30 Jun 2025 00:48:21 +0530 Subject: [PATCH 02/10] initial stage of informer network added --- aeon/networks/_informer.py | 912 +++++++++++++++++++++++++++++++++++++ 1 file changed, 912 insertions(+) create mode 100644 aeon/networks/_informer.py diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py new file mode 100644 index 0000000000..47d2d85ea4 --- /dev/null +++ b/aeon/networks/_informer.py @@ -0,0 +1,912 @@ +"""Informer Network for time series forecasting.""" + +__maintainer__ = [""] + +from aeon.networks.base import BaseDeepLearningNetwork + + +class InformerNetwork(BaseDeepLearningNetwork): + """ + TensorFlow implementation of the Informer network for time series forecasting. + + The Informer network is a Transformer-based architecture designed for + long sequence time-series forecasting. It uses ProbSparse self-attention + mechanism and distilling operation to reduce computational complexity. + + Parameters + ---------- + enc_in : int, default=7 + Number of encoder input features. + dec_in : int, default=7 + Number of decoder input features. + c_out : int, default=7 + Number of output features. + seq_len : int, default=96 + Input sequence length. + label_len : int, default=48 + Start token length for decoder. + out_len : int, default=24 + Prediction sequence length. + factor : int, default=5 + ProbSparse attention factor. + d_model : int, default=512 + Model dimension. + n_heads : int, default=8 + Number of attention heads. + e_layers : int, default=3 + Number of encoder layers. + d_layers : int, default=2 + Number of decoder layers. + d_ff : int, default=512 + Feed forward network dimension. + dropout : float, default=0.0 + Dropout rate. + attn : str, default='prob' + Attention mechanism type ('prob' or 'full'). + embed : str, default='fixed' + Embedding type. + freq : str, default='h' + Time frequency encoding. + activation : str, default='gelu' + Activation function. + output_attention : bool, default=False + Whether to output attention weights. + distil : bool, default=True + Whether to use distilling operation. + mix : bool, default=True + Whether to use mix attention in decoder. + + References + ---------- + .. [1] Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. + (2021). Informer: Beyond efficient transformer for long sequence + time-series forecasting. In Proceedings of the AAAI conference on + artificial intelligence (Vol. 35, No. 12, pp. 11106-11115). 
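A minimal instantiation sketch for this version of the class (values shown are the defaults; assumes TensorFlow and this branch of aeon are installed):

    from aeon.networks import InformerNetwork

    # 7-channel series, 96-step encoder history, 48-step start token, 24-step horizon
    net = InformerNetwork(enc_in=7, dec_in=7, c_out=7, seq_len=96,
                          label_len=48, out_len=24, attn="prob", distil=True)
    # build_network(...) returns the four Keras inputs
    # [x_enc, x_mark_enc, x_dec, x_mark_dec] together with the
    # (batch, out_len, c_out) prediction tensor sliced from the decoder output.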
+ """ + + _config = { + "python_dependencies": ["tensorflow"], + "python_version": "<3.13", + "structure": "encoder-decoder", + } + + def __init__( + self, + enc_in=7, + dec_in=7, + c_out=7, + seq_len=96, + label_len=48, + out_len=24, + factor=5, + d_model=512, + n_heads=8, + e_layers=3, + d_layers=2, + d_ff=512, + dropout=0.0, + attn="prob", + embed="fixed", + freq="h", + activation="gelu", + output_attention=False, + distil=True, + mix=True, + ): + self.enc_in = enc_in + self.dec_in = dec_in + self.c_out = c_out + self.seq_len = seq_len + self.label_len = label_len + self.out_len = out_len + self.factor = factor + self.d_model = d_model + self.n_heads = n_heads + self.e_layers = e_layers + self.d_layers = d_layers + self.d_ff = d_ff + self.dropout = dropout + self.attn = attn + self.embed = embed + self.freq = freq + self.activation = activation + self.output_attention = output_attention + self.distil = distil + self.mix = mix + + super().__init__() + + def build_network(self, input_shape, **kwargs): + """ + Construct the Informer network and return its input and output layers. + + Parameters + ---------- + input_shape : tuple of shape = (n_timepoints (m), n_channels (d)) + The shape of the data fed into the input layer. + + Returns + ------- + input_layer : keras.layers.Input + The input layer of the network. + output_layer : keras.layers.Layer + The output layer of the network. + """ + import tensorflow as tf + + # Input layers + x_enc = tf.keras.layers.Input( + shape=(self.seq_len, self.enc_in), name="encoder_input" + ) + x_mark_enc = tf.keras.layers.Input(shape=(self.seq_len, 4), name="encoder_mark") + x_dec = tf.keras.layers.Input( + shape=(self.label_len + self.out_len, self.dec_in), name="decoder_input" + ) + x_mark_dec = tf.keras.layers.Input( + shape=(self.label_len + self.out_len, 4), name="decoder_mark" + ) + + # Encoder embedding + enc_embedding = self._data_embedding( + self.enc_in, self.d_model, self.embed, self.freq, self.dropout + ) + enc_out = enc_embedding([x_enc, x_mark_enc]) + + # Encoder + encoder = self._build_encoder() + enc_out, attns = encoder(enc_out) + + # Decoder embedding + dec_embedding = self._data_embedding( + self.dec_in, self.d_model, self.embed, self.freq, self.dropout + ) + dec_out = dec_embedding([x_dec, x_mark_dec]) + + # Decoder + decoder = self._build_decoder() + dec_out = decoder([dec_out, enc_out]) + + # Final projection + projection = tf.keras.layers.Dense(self.c_out, use_bias=True, name="projection") + dec_out = projection(dec_out) + + # Extract prediction sequence + output = tf.keras.layers.Lambda( + lambda x: x[:, -self.out_len :, :], name="prediction_slice" + )(dec_out) + + # Create model inputs list + inputs = [x_enc, x_mark_enc, x_dec, x_mark_dec] + + if self.output_attention: + outputs = [output, attns] + else: + outputs = output + + return inputs, outputs + + def _positional_embedding(self, d_model, max_len=5000): + """Create positional embedding layer.""" + import math + + import numpy as np + import tensorflow as tf + + # Compute the positional encodings once in log space + pe = np.zeros((max_len, d_model), dtype=np.float32) + position = np.arange(0, max_len, dtype=np.float32)[:, np.newaxis] + div_term = np.exp( + np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) + ) + + pe[:, 0::2] = np.sin(position * div_term) + pe[:, 1::2] = np.cos(position * div_term) + pe = pe[np.newaxis, :] # Add batch dimension + + # Create constant tensor + pe_tensor = tf.constant(pe, dtype=tf.float32) + + def positional_function(x): + 
seq_len = tf.shape(x)[1] + return pe_tensor[:, :seq_len, :] + + return positional_function + + def _token_embedding(self, c_in, d_model): + """Create token embedding layer.""" + import tensorflow as tf + + token_conv = tf.keras.layers.Conv1D( + filters=d_model, + kernel_size=3, + padding="same", + kernel_initializer=tf.keras.initializers.HeNormal(), + ) + + def token_function(x): + return token_conv(x) + + return token_function + + def _fixed_embedding(self, c_in, d_model): + """Create fixed embedding layer.""" + import math + + import numpy as np + import tensorflow as tf + + # Create fixed sinusoidal embeddings + w = np.zeros((c_in, d_model), dtype=np.float32) + position = np.arange(0, c_in, dtype=np.float32)[:, np.newaxis] + div_term = np.exp( + np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) + ) + + w[:, 0::2] = np.sin(position * div_term) + w[:, 1::2] = np.cos(position * div_term) + + # Create embedding layer with fixed weights + embedding = tf.keras.layers.Embedding( + input_dim=c_in, + output_dim=d_model, + embeddings_initializer="zeros", + trainable=False, + ) + + def fixed_function(x): + # Initialize weights if not already done + if not embedding.built: + embedding.build((None,)) + embedding.embeddings.assign(w) + return tf.stop_gradient(embedding(x)) + + return fixed_function + + def _temporal_embedding(self, d_model, embed_type, freq): + """Create temporal embedding layer.""" + import tensorflow as tf + + # Define embedding sizes + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + # Choose embedding type + if embed_type == "fixed": + minute_embed = ( + self._fixed_embedding(minute_size, d_model) if freq == "t" else None + ) + hour_embed = self._fixed_embedding(hour_size, d_model) + weekday_embed = self._fixed_embedding(weekday_size, d_model) + day_embed = self._fixed_embedding(day_size, d_model) + month_embed = self._fixed_embedding(month_size, d_model) + else: + minute_embed = ( + tf.keras.layers.Embedding(minute_size, d_model) if freq == "t" else None + ) + hour_embed = tf.keras.layers.Embedding(hour_size, d_model) + weekday_embed = tf.keras.layers.Embedding(weekday_size, d_model) + day_embed = tf.keras.layers.Embedding(day_size, d_model) + month_embed = tf.keras.layers.Embedding(month_size, d_model) + + def temporal_function(x): + x = tf.cast(x, tf.int32) + + minute_x = minute_embed(x[:, :, 4]) if minute_embed is not None else 0.0 + hour_x = hour_embed(x[:, :, 3]) + weekday_x = weekday_embed(x[:, :, 2]) + day_x = day_embed(x[:, :, 1]) + month_x = month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + return temporal_function + + def _time_feature_embedding(self, d_model, embed_type, freq): + """Create time feature embedding layer.""" + import tensorflow as tf + + freq_map = {"h": 4, "t": 5, "s": 6, "m": 1, "a": 1, "w": 2, "d": 3, "b": 3} + d_inp = freq_map[freq] + + embed_layer = tf.keras.layers.Dense(d_model) + + def time_feature_function(x): + return embed_layer(x) + + return time_feature_function + + def _data_embedding(self, c_in, d_model, embed_type, freq, dropout): + """Create data embedding layer.""" + import tensorflow as tf + + # Create embedding components + value_embedding = self._token_embedding(c_in, d_model) + position_embedding = self._positional_embedding(d_model) + + if embed_type != "timeF": + temporal_embedding = self._temporal_embedding(d_model, embed_type, freq) + else: + temporal_embedding = self._time_feature_embedding(d_model, embed_type, freq) + + 
dropout_layer = tf.keras.layers.Dropout(dropout) + + def embedding_function(inputs, training=None): + x, x_mark = inputs + + value_emb = value_embedding(x) + pos_emb = position_embedding(x) + temporal_emb = temporal_embedding(x_mark) + + embeddings = value_emb + pos_emb + temporal_emb + return dropout_layer(embeddings, training=training) + + return embedding_function + + def _build_encoder(self): + """Build the encoder stack with attention layers.""" + import tensorflow as tf + + # Choose attention type + if self.attn == "prob": + Attn = self._prob_attention( + False, self.factor, self.dropout, self.output_attention + ) + else: + Attn = self._full_attention( + False, self.factor, self.dropout, self.output_attention + ) + + # Build encoder layers + encoder_layers = [] + for l in range(self.e_layers): + attention_layer = self._attention_layer( + Attn, self.d_model, self.n_heads, mix=False + ) + encoder_layer = self._encoder_layer( + attention_layer, self.d_model, self.d_ff, self.dropout, self.activation + ) + encoder_layers.append(encoder_layer) + + # Build conv layers for distilling + conv_layers = None + if self.distil: + conv_layers = [] + for l in range(self.e_layers - 1): + conv_layer = self._conv_layer(self.d_model) + conv_layers.append(conv_layer) + + # Normalization layer + norm_layer = tf.keras.layers.LayerNormalization() + + def encoder_function(x, attn_mask=None, training=None): + # x [B, L, D] + attns = [] + + if conv_layers is not None: + # Process with both attention and conv layers + for attn_layer, conv_layer in zip(encoder_layers, conv_layers): + x, attn = attn_layer(x, attn_mask=attn_mask, training=training) + x = conv_layer(x, training=training) + attns.append(attn) + + # Final attention layer + x, attn = encoder_layers[-1](x, attn_mask=attn_mask, training=training) + attns.append(attn) + else: + # Process with only attention layers + for attn_layer in encoder_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, training=training) + attns.append(attn) + + if norm_layer is not None: + x = norm_layer(x, training=training) + + return x, attns + + return encoder_function + + def _build_decoder(self): + """Build the decoder stack with attention layers.""" + import tensorflow as tf + + # Build decoder layers + decoder_layers = [] + for l in range(self.d_layers): + # Self-attention (with mask) + self_attn = ( + self._prob_attention(True, self.factor, self.dropout, False) + if self.attn == "prob" + else self._full_attention(True, self.factor, self.dropout, False) + ) + self_attention_layer = self._attention_layer( + self_attn, self.d_model, self.n_heads, self.mix + ) + + # Cross-attention (without mask) + cross_attn = self._full_attention(False, self.factor, self.dropout, False) + cross_attention_layer = self._attention_layer( + cross_attn, self.d_model, self.n_heads, False + ) + + decoder_layer = self._decoder_layer( + self_attention_layer, + cross_attention_layer, + self.d_model, + self.d_ff, + self.dropout, + self.activation, + ) + decoder_layers.append(decoder_layer) + + # Normalization layer + norm_layer = tf.keras.layers.LayerNormalization() + + def decoder_function(inputs, training=None): + x, cross = inputs + x_mask = None # Can be added as parameter if needed + cross_mask = None # Can be added as parameter if needed + + for layer in decoder_layers: + x = layer( + x, cross, x_mask=x_mask, cross_mask=cross_mask, training=training + ) + + if norm_layer is not None: + x = norm_layer(x, training=training) + + return x + + return decoder_function + + def _prob_attention(self, 
mask_flag, factor, attention_dropout, output_attention): + """Create ProbSparse attention mechanism.""" + from math import sqrt + + import numpy as np + import tensorflow as tf + + dropout_layer = tf.keras.layers.Dropout(attention_dropout) + + def _prob_QK(Q, K, sample_k, n_top): + # Q [B, H, L, D] + B, H, L_K, E = ( + tf.shape(K)[0], + tf.shape(K)[1], + tf.shape(K)[2], + tf.shape(K)[3], + ) + L_Q = tf.shape(Q)[2] + + # calculate the sampled Q_K + K_expand = tf.expand_dims(K, axis=-3) # [B, H, 1, L_K, E] + K_expand = tf.tile(K_expand, [1, 1, L_Q, 1, 1]) # [B, H, L_Q, L_K, E] + + # Generate random indices for sampling + index_sample = tf.random.uniform( + (L_Q, sample_k), maxval=L_K, dtype=tf.int32 + ) + + # Create indices for gathering + batch_indices = tf.range(B)[:, None, None, None, None] + head_indices = tf.range(H)[None, :, None, None, None] + query_indices = tf.range(L_Q)[None, None, :, None, None] + sample_indices = index_sample[None, None, :, :, None] + + # Gather K_sample + gather_indices = tf.concat( + [ + tf.broadcast_to(batch_indices, [B, H, L_Q, sample_k, 1]), + tf.broadcast_to(head_indices, [B, H, L_Q, sample_k, 1]), + tf.broadcast_to(query_indices, [B, H, L_Q, sample_k, 1]), + tf.broadcast_to(sample_indices, [B, H, L_Q, sample_k, 1]), + ], + axis=-1, + ) + + K_sample = tf.gather_nd( + K_expand, gather_indices + ) # [B, H, L_Q, sample_k, E] + + # Calculate Q_K_sample + Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, L_Q, 1, E] + Q_K_sample = tf.matmul( + Q_expanded, K_sample, transpose_b=True + ) # [B, H, L_Q, 1, sample_k] + Q_K_sample = tf.squeeze(Q_K_sample, axis=-2) # [B, H, L_Q, sample_k] + + # find the Top_k query with sparsity measurement + M = tf.reduce_max(Q_K_sample, axis=-1) - tf.reduce_sum( + Q_K_sample, axis=-1 + ) / tf.cast(L_K, tf.float32) + M_top = tf.nn.top_k(M, k=n_top, sorted=False).indices + + # use the reduced Q to calculate Q_K + batch_idx = tf.range(B)[:, None, None] + head_idx = tf.range(H)[None, :, None] + + gather_indices_q = tf.stack( + [ + tf.broadcast_to(batch_idx, tf.shape(M_top)), + tf.broadcast_to(head_idx, tf.shape(M_top)), + M_top, + ], + axis=-1, + ) + + Q_reduce = tf.gather_nd(Q, gather_indices_q) # [B, H, n_top, E] + Q_K = tf.matmul(Q_reduce, K, transpose_b=True) # [B, H, n_top, L_K] + + return Q_K, M_top + + def _get_initial_context(V, L_Q): + B, H, L_V, D = ( + tf.shape(V)[0], + tf.shape(V)[1], + tf.shape(V)[2], + tf.shape(V)[3], + ) + + if not mask_flag: + V_sum = tf.reduce_mean(V, axis=-2) # [B, H, D] + context = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] + context = tf.tile(context, [1, 1, L_Q, 1]) # [B, H, L_Q, D] + else: + # For masked case, L_Q should equal L_V + context = tf.cumsum(V, axis=-2) + + return context + + def _prob_mask(B, H, L, index, scores): + # Create upper triangular mask (excluding diagonal) + L_scores = tf.shape(scores)[-1] + _mask = tf.linalg.band_part(tf.ones((L, L_scores), dtype=tf.bool), 0, -1) + _mask = tf.logical_not(_mask) # Upper triangular without diagonal + + # Expand mask for batch and head dimensions + _mask_ex = tf.tile( + tf.expand_dims(tf.expand_dims(_mask, 0), 0), [B, H, 1, 1] + ) + + # Gather mask at specified indices + batch_idx = tf.range(B)[:, None, None] + head_idx = tf.range(H)[None, :, None] + + gather_indices = tf.stack( + [ + tf.broadcast_to(batch_idx, tf.shape(index)), + tf.broadcast_to(head_idx, tf.shape(index)), + index, + ], + axis=-1, + ) + + indicator = tf.gather_nd(_mask_ex, gather_indices) + return indicator + + def _update_context(context_in, V, scores, index, L_Q, attn_mask): 
+ B, H, L_V, D = ( + tf.shape(V)[0], + tf.shape(V)[1], + tf.shape(V)[2], + tf.shape(V)[3], + ) + + if mask_flag: + attn_mask = _prob_mask(B, H, L_Q, index, scores) + scores = tf.where( + attn_mask, tf.fill(tf.shape(scores), float("-inf")), scores + ) + + attn = tf.nn.softmax(scores, axis=-1) + + # Calculate attention-weighted values + attn_V = tf.matmul(attn, V) # [B, H, n_top, D] + + # Update context_in at specified indices + batch_idx = tf.range(B)[:, None, None] + head_idx = tf.range(H)[None, :, None] + + update_indices = tf.stack( + [ + tf.broadcast_to(batch_idx, tf.shape(index)), + tf.broadcast_to(head_idx, tf.shape(index)), + index, + ], + axis=-1, + ) + + context_in = tf.tensor_scatter_nd_update(context_in, update_indices, attn_V) + + if output_attention: + # Initialize full attention matrix + attns = tf.ones([B, H, L_V, L_V], dtype=attn.dtype) / tf.cast( + L_V, attn.dtype + ) + attns = tf.tensor_scatter_nd_update(attns, update_indices, attn) + return context_in, attns + else: + return context_in, None + + def prob_attention_function( + queries, keys, values, attn_mask=None, training=None + ): + B, L_Q, H, D = ( + tf.shape(queries)[0], + tf.shape(queries)[1], + tf.shape(queries)[2], + tf.shape(queries)[3], + ) + L_K = tf.shape(keys)[1] + + # Transpose to [B, H, L, D] format + queries = tf.transpose(queries, perm=[0, 2, 1, 3]) + keys = tf.transpose(keys, perm=[0, 2, 1, 3]) + values = tf.transpose(values, perm=[0, 2, 1, 3]) + + # Calculate sampling parameters + U_part = int(factor * np.ceil(np.log(L_K))) + u = int(factor * np.ceil(np.log(L_Q))) + + U_part = min(U_part, L_K) + u = min(u, L_Q) + + # Get top-k scores and indices + scores_top, index = _prob_QK(queries, keys, sample_k=U_part, n_top=u) + + # Apply scale factor + scale = 1.0 / sqrt(D) + if scale is not None: + scores_top = scores_top * scale + + # Get initial context and update with top-k queries + context = _get_initial_context(values, L_Q) + context, attn = _update_context( + context, values, scores_top, index, L_Q, attn_mask + ) + + # Transpose back to [B, L, H, D] format + context = tf.transpose(context, perm=[0, 2, 1, 3]) + + return context, attn + + return prob_attention_function + + def _full_attention(self, mask_flag, factor, attention_dropout, output_attention): + """Create full attention mechanism.""" + import numpy as np + import tensorflow as tf + + dropout_layer = tf.keras.layers.Dropout(attention_dropout) + + def _triangular_causal_mask(B, L): + """Create triangular causal mask for attention.""" + mask_shape = [B, 1, L, L] + # Create upper triangular mask (excluding diagonal) + mask = tf.linalg.band_part(tf.ones(mask_shape, dtype=tf.bool), 0, -1) + mask = tf.logical_not(tf.linalg.band_part(mask, 0, 0)) # Remove diagonal + return mask + + def full_attention_function( + queries, keys, values, attn_mask=None, training=None + ): + # Get shapes + B = tf.shape(queries)[0] + L = tf.shape(queries)[1] + H = tf.shape(queries)[2] + E = tf.shape(queries)[3] + S = tf.shape(keys)[1] + D = tf.shape(values)[3] + + # Calculate scale + scale = 1.0 / tf.math.sqrt(tf.cast(E, tf.float32)) + + # Compute attention scores: "blhe,bshe->bhls" + scores = tf.einsum("blhe,bshe->bhls", queries, keys) + + if mask_flag: + if attn_mask is None: + attn_mask = _triangular_causal_mask(B, L) + else: + # If attn_mask is provided, use its mask attribute if it's an object + if hasattr(attn_mask, "mask"): + attn_mask = attn_mask.mask + + # Apply mask by setting masked positions to -inf + scores = tf.where( + attn_mask, + tf.fill(tf.shape(scores), 
tf.constant(-np.inf, dtype=scores.dtype)), + scores, + ) + + # Apply scale and softmax + A = tf.nn.softmax(scale * scores, axis=-1) + + # Apply dropout + A = dropout_layer(A, training=training) + + # Compute output: "bhls,bshd->blhd" + V = tf.einsum("bhls,bshd->blhd", A, values) + + if output_attention: + return V, A + else: + return V, None + + return full_attention_function + + def _attention_layer(self, attention, d_model, n_heads, mix): + """Create attention layer wrapper.""" + import tensorflow as tf + + d_keys = d_model // n_heads + d_values = d_model // n_heads + + # Linear projection layers for Q, K, V + query_dense = tf.keras.layers.Dense(d_model) + key_dense = tf.keras.layers.Dense(d_model) + value_dense = tf.keras.layers.Dense(d_model) + + # Output projection + out_projection = tf.keras.layers.Dense(d_model) + + def attention_layer_function( + queries, keys, values, attn_mask=None, training=None + ): + B, L, _ = tf.shape(queries)[0], tf.shape(queries)[1], tf.shape(queries)[2] + S = tf.shape(keys)[1] + H = n_heads + + # Linear projections in batch from d_model => h x d_k + Q = query_dense(queries) + K = key_dense(keys) + V = value_dense(values) + + # Reshape to (B, L, H, d_k) and transpose to (B, H, L, d_k) + Q = tf.reshape(Q, [B, L, H, d_keys]) + K = tf.reshape(K, [B, S, H, d_keys]) + V = tf.reshape(V, [B, S, H, d_values]) + + Q = tf.transpose(Q, [0, 2, 1, 3]) # (B, H, L, d_k) + K = tf.transpose(K, [0, 2, 1, 3]) # (B, H, S, d_k) + V = tf.transpose(V, [0, 2, 1, 3]) # (B, H, S, d_v) + + # Apply attention function + out, attn = attention(Q, K, V, attn_mask=attn_mask, training=training) + + # Concatenate heads and put through final linear layer + # out shape: (B, H, L, d_v) -> (B, L, H, d_v) -> (B, L, H*d_v) + out = tf.transpose(out, [0, 2, 1, 3]) + out = tf.reshape(out, [B, L, H * d_values]) + + # Apply mix transformation if needed + if mix: + # Reshape to (B, L, H, d_values) then transpose to (B, H, L, d_values) + out = tf.reshape(out, [B, L, H, d_values]) + out = tf.transpose(out, [0, 2, 1, 3]) + out = tf.reshape(out, [B, L, H * d_values]) + + # Final output projection + out = out_projection(out) + + return out, attn + + return attention_layer_function + + def _encoder_layer(self, attention_layer, d_model, d_ff, dropout, activation): + """Create single encoder layer.""" + import tensorflow as tf + + d_ff = d_ff or 4 * d_model + + # Conv1D layers for feed-forward network + conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1) + conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1) + + # Layer normalization + norm1 = tf.keras.layers.LayerNormalization() + norm2 = tf.keras.layers.LayerNormalization() + + # Dropout + dropout_layer = tf.keras.layers.Dropout(dropout) + + # Activation function + if activation == "relu": + activation_fn = tf.nn.relu + else: + activation_fn = tf.nn.gelu + + def encoder_layer_function(x, attn_mask=None, training=None): + # Self-attention with residual connection + new_x, attn = attention_layer( + x, x, x, attn_mask=attn_mask, training=training + ) + x = x + dropout_layer(new_x, training=training) + y = x = norm1(x, training=training) + + # Feed-forward network with residual connection + y = conv1(y) + y = dropout_layer(activation_fn(y), training=training) + y = conv2(y) + y = dropout_layer(y, training=training) + + return norm2(x + y, training=training), attn + + return encoder_layer_function + + def _decoder_layer( + self, self_attention, cross_attention, d_model, d_ff, dropout, activation + ): + """Create single decoder layer.""" + import 
tensorflow as tf + + d_ff = d_ff or 4 * d_model + + # Conv1D layers equivalent to PyTorch's Conv1d + conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1) + conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1) + + # Layer normalization + norm1 = tf.keras.layers.LayerNormalization() + norm2 = tf.keras.layers.LayerNormalization() + norm3 = tf.keras.layers.LayerNormalization() + + # Dropout + dropout_layer = tf.keras.layers.Dropout(dropout) + + # Activation function + if activation == "relu": + activation_fn = tf.nn.relu + else: + activation_fn = tf.nn.gelu + + def decoder_layer_function( + x, cross, x_mask=None, cross_mask=None, training=None + ): + # Self-attention with residual connection + self_attn_out = self_attention( + x, x, x, attn_mask=x_mask, training=training + )[0] + x = x + dropout_layer(self_attn_out, training=training) + x = norm1(x, training=training) + + # Cross-attention with residual connection + cross_attn_out = cross_attention( + x, cross, cross, attn_mask=cross_mask, training=training + )[0] + x = x + dropout_layer(cross_attn_out, training=training) + y = x = norm2(x, training=training) + + # Feed-forward network with residual connection + y = conv1(y) + y = dropout_layer(activation_fn(y), training=training) + y = conv2(y) + y = dropout_layer(y, training=training) + + return norm3(x + y, training=training) + + return decoder_layer_function + + def _conv_layer(self, d_model): + """Create convolution layer for distilling.""" + import tensorflow as tf + + # TensorFlow doesn't have direct circular padding, using 'same' padding + downConv = tf.keras.layers.Conv1D( + filters=d_model, kernel_size=3, padding="same", activation=None + ) + norm = tf.keras.layers.BatchNormalization() + activation = tf.keras.layers.ELU() + maxPool = tf.keras.layers.MaxPool1D(pool_size=3, strides=2, padding="same") + + def conv_layer_function(x, training=None): + # x shape: [B, L, D] -> Conv1D expects [B, L, C] + x = downConv(x) + x = norm(x, training=training) + x = activation(x) + x = maxPool(x) + return x + + return conv_layer_function From 40a58e4c642a5f84f6b7602b87b7831128ab8b05 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:05:22 +0530 Subject: [PATCH 03/10] informer made according to aeon standard --- aeon/networks/_informer.py | 1357 ++++++++++++-------------- aeon/networks/tests/test_informer.py | 221 +++++ aeon/utils/networks/attention.py | 350 +++++++ 3 files changed, 1210 insertions(+), 718 deletions(-) create mode 100644 aeon/networks/tests/test_informer.py create mode 100644 aeon/utils/networks/attention.py diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index 47d2d85ea4..dfbf4b1bfc 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -2,7 +2,17 @@ __maintainer__ = [""] +from typing import Optional + from aeon.networks.base import BaseDeepLearningNetwork +from aeon.utils.networks.attention import ( + AttentionLayer, + KerasProbAttention, +) +from aeon.utils.validation._dependencies import _check_soft_dependencies + +if _check_soft_dependencies(["tensorflow"], severity="none"): + import tensorflow as tf class InformerNetwork(BaseDeepLearningNetwork): @@ -15,12 +25,6 @@ class InformerNetwork(BaseDeepLearningNetwork): Parameters ---------- - enc_in : int, default=7 - Number of encoder input features. - dec_in : int, default=7 - Number of decoder input features. - c_out : int, default=7 - Number of output features. seq_len : int, default=96 Input sequence length. 
label_len : int, default=48 @@ -43,14 +47,8 @@ class InformerNetwork(BaseDeepLearningNetwork): Dropout rate. attn : str, default='prob' Attention mechanism type ('prob' or 'full'). - embed : str, default='fixed' - Embedding type. - freq : str, default='h' - Time frequency encoding. activation : str, default='gelu' Activation function. - output_attention : bool, default=False - Whether to output attention weights. distil : bool, default=True Whether to use distilling operation. mix : bool, default=True @@ -59,9 +57,9 @@ class InformerNetwork(BaseDeepLearningNetwork): References ---------- .. [1] Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. - (2021). Informer: Beyond efficient transformer for long sequence - time-series forecasting. In Proceedings of the AAAI conference on - artificial intelligence (Vol. 35, No. 12, pp. 11106-11115). + (2021). Informer: Beyond efficient transformer for long sequence + time-series forecasting. In Proceedings of the AAAI conference on + artificial intelligence (Vol. 35, No. 12, pp. 11106-11115). """ _config = { @@ -72,30 +70,21 @@ class InformerNetwork(BaseDeepLearningNetwork): def __init__( self, - enc_in=7, - dec_in=7, - c_out=7, - seq_len=96, - label_len=48, - out_len=24, - factor=5, - d_model=512, - n_heads=8, - e_layers=3, - d_layers=2, - d_ff=512, - dropout=0.0, - attn="prob", - embed="fixed", - freq="h", - activation="gelu", - output_attention=False, - distil=True, - mix=True, + seq_len: int = 96, + label_len: int = 48, + out_len: int = 24, + factor: int = 5, + d_model: int = 512, + n_heads: int = 8, + e_layers: int = 3, + d_layers: int = 2, + d_ff: int = 512, + dropout: float = 0.0, + attn: str = "prob", + activation: str = "gelu", + distil: bool = True, + mix: bool = True, ): - self.enc_in = enc_in - self.dec_in = dec_in - self.c_out = c_out self.seq_len = seq_len self.label_len = label_len self.out_len = out_len @@ -107,806 +96,738 @@ def __init__( self.d_ff = d_ff self.dropout = dropout self.attn = attn - self.embed = embed - self.freq = freq self.activation = activation - self.output_attention = output_attention self.distil = distil self.mix = mix super().__init__() - def build_network(self, input_shape, **kwargs): + def _token_embedding( + self, input_tensor: tf.Tensor, c_in: int, d_model: int + ) -> tf.Tensor: """ - Construct the Informer network and return its input and output layers. + Token embedding layer using 1D convolution with causal padding. Parameters ---------- - input_shape : tuple of shape = (n_timepoints (m), n_channels (d)) - The shape of the data fed into the input layer. + input_tensor : tf.Tensor + Input tensor to be processed. + c_in : int + Number of input channels. + d_model : int + Dimension of the model (number of output filters). Returns ------- - input_layer : keras.layers.Input - The input layer of the network. - output_layer : keras.layers.Layer - The output layer of the network. + tf.Tensor + Output tensor after token embedding transformation. 
""" import tensorflow as tf - # Input layers - x_enc = tf.keras.layers.Input( - shape=(self.seq_len, self.enc_in), name="encoder_input" - ) - x_mark_enc = tf.keras.layers.Input(shape=(self.seq_len, 4), name="encoder_mark") - x_dec = tf.keras.layers.Input( - shape=(self.label_len + self.out_len, self.dec_in), name="decoder_input" - ) - x_mark_dec = tf.keras.layers.Input( - shape=(self.label_len + self.out_len, 4), name="decoder_mark" - ) - - # Encoder embedding - enc_embedding = self._data_embedding( - self.enc_in, self.d_model, self.embed, self.freq, self.dropout - ) - enc_out = enc_embedding([x_enc, x_mark_enc]) - - # Encoder - encoder = self._build_encoder() - enc_out, attns = encoder(enc_out) - - # Decoder embedding - dec_embedding = self._data_embedding( - self.dec_in, self.d_model, self.embed, self.freq, self.dropout - ) - dec_out = dec_embedding([x_dec, x_mark_dec]) - - # Decoder - decoder = self._build_decoder() - dec_out = decoder([dec_out, enc_out]) - - # Final projection - projection = tf.keras.layers.Dense(self.c_out, use_bias=True, name="projection") - dec_out = projection(dec_out) + x = tf.keras.layers.Conv1D( + filters=d_model, kernel_size=3, padding="causal", activation="linear" + )(input_tensor) + x = tf.keras.layers.LeakyReLU()(x) + return x - # Extract prediction sequence - output = tf.keras.layers.Lambda( - lambda x: x[:, -self.out_len :, :], name="prediction_slice" - )(dec_out) - - # Create model inputs list - inputs = [x_enc, x_mark_enc, x_dec, x_mark_dec] - - if self.output_attention: - outputs = [output, attns] - else: - outputs = output + def _positional_embedding( + self, input_tensor: tf.Tensor, d_model: int, max_len: int = 5000 + ) -> tf.Tensor: + """ + Positional embedding layer that computes positional encodings. - return inputs, outputs + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor to get positional embeddings for. + d_model : int + Dimension of the model. + max_len : int, optional + Maximum length of the sequence, by default 5000 - def _positional_embedding(self, d_model, max_len=5000): - """Create positional embedding layer.""" + Returns + ------- + tf.Tensor + Positional encoding tensor matching input tensor's sequence length. 
+ """ import math import numpy as np import tensorflow as tf - # Compute the positional encodings once in log space + # Compute the positional encodings pe = np.zeros((max_len, d_model), dtype=np.float32) - position = np.arange(0, max_len, dtype=np.float32)[:, np.newaxis] + position = np.expand_dims(np.arange(0, max_len, dtype=np.float32), 1) div_term = np.exp( np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) ) pe[:, 0::2] = np.sin(position * div_term) pe[:, 1::2] = np.cos(position * div_term) - pe = pe[np.newaxis, :] # Add batch dimension - - # Create constant tensor - pe_tensor = tf.constant(pe, dtype=tf.float32) - - def positional_function(x): - seq_len = tf.shape(x)[1] - return pe_tensor[:, :seq_len, :] - - return positional_function - - def _token_embedding(self, c_in, d_model): - """Create token embedding layer.""" - import tensorflow as tf - - token_conv = tf.keras.layers.Conv1D( - filters=d_model, - kernel_size=3, - padding="same", - kernel_initializer=tf.keras.initializers.HeNormal(), - ) - - def token_function(x): - return token_conv(x) - - return token_function - - def _fixed_embedding(self, c_in, d_model): - """Create fixed embedding layer.""" - import math - import numpy as np - import tensorflow as tf + # Convert to tensor and add batch dimension + pe_tensor = tf.expand_dims(tf.convert_to_tensor(pe), 0) - # Create fixed sinusoidal embeddings - w = np.zeros((c_in, d_model), dtype=np.float32) - position = np.arange(0, c_in, dtype=np.float32)[:, np.newaxis] - div_term = np.exp( - np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) - ) + # Return positional embeddings for the input tensor's sequence length + return pe_tensor[:, : input_tensor.shape[1]] - w[:, 0::2] = np.sin(position * div_term) - w[:, 1::2] = np.cos(position * div_term) - - # Create embedding layer with fixed weights - embedding = tf.keras.layers.Embedding( - input_dim=c_in, - output_dim=d_model, - embeddings_initializer="zeros", - trainable=False, - ) - - def fixed_function(x): - # Initialize weights if not already done - if not embedding.built: - embedding.build((None,)) - embedding.embeddings.assign(w) - return tf.stop_gradient(embedding(x)) + def _data_embedding( + self, + input_tensor: tf.Tensor, + c_in: int, + d_model: int, + dropout: float = 0.1, + max_len: int = 5000, + ) -> tf.Tensor: + """ + Combine token and positional embeddings for the input tensor. - return fixed_function + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor to be processed. + c_in : int + Number of input channels. + d_model : int + Dimension of the model (number of output filters). + dropout : float, optional + Dropout rate, by default 0.1 + max_len : int, optional + Maximum length of the sequence for positional embedding - def _temporal_embedding(self, d_model, embed_type, freq): - """Create temporal embedding layer.""" + Returns + ------- + tf.Tensor + Output tensor after data embedding transformation. 
+ """ import tensorflow as tf - # Define embedding sizes - minute_size = 4 - hour_size = 24 - weekday_size = 7 - day_size = 32 - month_size = 13 - - # Choose embedding type - if embed_type == "fixed": - minute_embed = ( - self._fixed_embedding(minute_size, d_model) if freq == "t" else None - ) - hour_embed = self._fixed_embedding(hour_size, d_model) - weekday_embed = self._fixed_embedding(weekday_size, d_model) - day_embed = self._fixed_embedding(day_size, d_model) - month_embed = self._fixed_embedding(month_size, d_model) - else: - minute_embed = ( - tf.keras.layers.Embedding(minute_size, d_model) if freq == "t" else None - ) - hour_embed = tf.keras.layers.Embedding(hour_size, d_model) - weekday_embed = tf.keras.layers.Embedding(weekday_size, d_model) - day_embed = tf.keras.layers.Embedding(day_size, d_model) - month_embed = tf.keras.layers.Embedding(month_size, d_model) - - def temporal_function(x): - x = tf.cast(x, tf.int32) - - minute_x = minute_embed(x[:, :, 4]) if minute_embed is not None else 0.0 - hour_x = hour_embed(x[:, :, 3]) - weekday_x = weekday_embed(x[:, :, 2]) - day_x = day_embed(x[:, :, 1]) - month_x = month_embed(x[:, :, 0]) - - return hour_x + weekday_x + day_x + month_x + minute_x + # Get token embeddings + token_emb = self._token_embedding(input_tensor, c_in, d_model) - return temporal_function + # Get positional embeddings + pos_emb = self._positional_embedding(input_tensor, d_model, max_len) - def _time_feature_embedding(self, d_model, embed_type, freq): - """Create time feature embedding layer.""" - import tensorflow as tf + # Combine embeddings + x = token_emb + pos_emb - freq_map = {"h": 4, "t": 5, "s": 6, "m": 1, "a": 1, "w": 2, "d": 3, "b": 3} - d_inp = freq_map[freq] + # Apply dropout + x = tf.keras.layers.Dropout(dropout)(x) - embed_layer = tf.keras.layers.Dense(d_model) + return x - def time_feature_function(x): - return embed_layer(x) + def _conv_layer(self, input_tensor: tf.Tensor, c_in: int) -> tf.Tensor: + """ + Convolutional layer with batch normalization, ELU, and max pooling. - return time_feature_function + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor to be processed. + c_in : int + Number of input channels (filters for the convolution). - def _data_embedding(self, c_in, d_model, embed_type, freq, dropout): - """Create data embedding layer.""" + Returns + ------- + tf.Tensor + Output tensor after convolution and pooling operations. 
+ """ import tensorflow as tf - # Create embedding components - value_embedding = self._token_embedding(c_in, d_model) - position_embedding = self._positional_embedding(d_model) - - if embed_type != "timeF": - temporal_embedding = self._temporal_embedding(d_model, embed_type, freq) - else: - temporal_embedding = self._time_feature_embedding(d_model, embed_type, freq) - - dropout_layer = tf.keras.layers.Dropout(dropout) + # Apply 1D convolution with causal padding + x = tf.keras.layers.Conv1D(filters=c_in, kernel_size=3, padding="causal")( + input_tensor + ) - def embedding_function(inputs, training=None): - x, x_mark = inputs + # Apply batch normalization + x = tf.keras.layers.BatchNormalization()(x) - value_emb = value_embedding(x) - pos_emb = position_embedding(x) - temporal_emb = temporal_embedding(x_mark) + # Apply ELU activation + x = tf.keras.layers.ELU()(x) - embeddings = value_emb + pos_emb + temporal_emb - return dropout_layer(embeddings, training=training) + # Apply max pooling for downsampling + x = tf.keras.layers.MaxPool1D(pool_size=3, strides=2)(x) - return embedding_function + return x - def _build_encoder(self): - """Build the encoder stack with attention layers.""" - import tensorflow as tf + def _attention_out( + self, + input_tensor: tf.Tensor, + attention_type: str, + mask_flag: bool, + d_model: int, + n_heads: int, + factor: int = 5, + dropout: float = 0.1, + attn_mask: Optional[tf.Tensor] = None, + ) -> tf.Tensor: + """ + Attention output layer applying either ProbAttention or FullAttention. - # Choose attention type - if self.attn == "prob": - Attn = self._prob_attention( - False, self.factor, self.dropout, self.output_attention - ) - else: - Attn = self._full_attention( - False, self.factor, self.dropout, self.output_attention - ) + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor for attention computation. + attention_type : str + Type of attention mechanism ('prob' or 'full'). + mask_flag : bool + Whether to use attention masking. + d_model : int + Model dimension. + n_heads : int + Number of attention heads. 
+ factor : int, optional + Attention factor for ProbSparse attention, by default 5 + dropout : float, optional + Dropout rate, by default 0.1 + attn_mask : tf.Tensor, optional + Attention mask tensor, by default None - # Build encoder layers - encoder_layers = [] - for l in range(self.e_layers): - attention_layer = self._attention_layer( - Attn, self.d_model, self.n_heads, mix=False - ) - encoder_layer = self._encoder_layer( - attention_layer, self.d_model, self.d_ff, self.dropout, self.activation - ) - encoder_layers.append(encoder_layer) - - # Build conv layers for distilling - conv_layers = None - if self.distil: - conv_layers = [] - for l in range(self.e_layers - 1): - conv_layer = self._conv_layer(self.d_model) - conv_layers.append(conv_layer) - - # Normalization layer - norm_layer = tf.keras.layers.LayerNormalization() - - def encoder_function(x, attn_mask=None, training=None): - # x [B, L, D] - attns = [] - - if conv_layers is not None: - # Process with both attention and conv layers - for attn_layer, conv_layer in zip(encoder_layers, conv_layers): - x, attn = attn_layer(x, attn_mask=attn_mask, training=training) - x = conv_layer(x, training=training) - attns.append(attn) - - # Final attention layer - x, attn = encoder_layers[-1](x, attn_mask=attn_mask, training=training) - attns.append(attn) - else: - # Process with only attention layers - for attn_layer in encoder_layers: - x, attn = attn_layer(x, attn_mask=attn_mask, training=training) - attns.append(attn) - - if norm_layer is not None: - x = norm_layer(x, training=training) - - return x, attns - - return encoder_function - - def _build_decoder(self): - """Build the decoder stack with attention layers.""" + Returns + ------- + tf.Tensor + Output tensor after attention computation. + """ import tensorflow as tf - # Build decoder layers - decoder_layers = [] - for l in range(self.d_layers): - # Self-attention (with mask) - self_attn = ( - self._prob_attention(True, self.factor, self.dropout, False) - if self.attn == "prob" - else self._full_attention(True, self.factor, self.dropout, False) - ) - self_attention_layer = self._attention_layer( - self_attn, self.d_model, self.n_heads, self.mix + if attention_type == "prob": + prob_attention = KerasProbAttention( + mask_flag=mask_flag, + factor=factor, + attention_dropout=dropout, ) - # Cross-attention (without mask) - cross_attn = self._full_attention(False, self.factor, self.dropout, False) - cross_attention_layer = self._attention_layer( - cross_attn, self.d_model, self.n_heads, False - ) + output = AttentionLayer( + attention=prob_attention, + d_model=d_model, + n_heads=n_heads, + d_keys=d_model // n_heads, # 512 // 8 = 64 + d_values=d_model // n_heads, # 512 // 8 = 64 + )(input_tensor, attn_mask=attn_mask) - decoder_layer = self._decoder_layer( - self_attention_layer, - cross_attention_layer, - self.d_model, - self.d_ff, - self.dropout, - self.activation, + else: + queries, keys, values = input_tensor + output = tf.keras.layers.MultiHeadAttention( + num_heads=n_heads, # 8 + key_dim=d_model // n_heads, # 512 // 8 = 64 + value_dim=d_model // n_heads, # 512 // 8 = 64 + dropout=dropout, + use_bias=True, + )( + query=queries, # (32, 20, 512) + key=keys, # (32, 20, 512) + value=values, # (32, 20, 512) + attention_mask=attn_mask, + use_causal_mask=mask_flag, ) - decoder_layers.append(decoder_layer) - - # Normalization layer - norm_layer = tf.keras.layers.LayerNormalization() - - def decoder_function(inputs, training=None): - x, cross = inputs - x_mask = None # Can be added as parameter if 
needed - cross_mask = None # Can be added as parameter if needed - - for layer in decoder_layers: - x = layer( - x, cross, x_mask=x_mask, cross_mask=cross_mask, training=training - ) - - if norm_layer is not None: - x = norm_layer(x, training=training) - return x + return output - return decoder_function + def _encoder_layer( + self, + input_tensor: tf.Tensor, + d_model: int, + d_ff: Optional[int] = None, + dropout: float = 0.1, + activation: str = "relu", + attn_mask: Optional[tf.Tensor] = None, + attention_type: str = "prob", + mask_flag: bool = True, + n_heads: int = 8, + factor: int = 5, + ) -> tf.Tensor: + """ + Apply encoder layer with multi-head attention and feed-forward network. - def _prob_attention(self, mask_flag, factor, attention_dropout, output_attention): - """Create ProbSparse attention mechanism.""" - from math import sqrt + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor of shape [B, L, D] where B is batch size, + L is sequence length, D is model dimension. + d_model : int + Model dimension (must match input tensor's last dimension). + d_ff : int, optional + Feed-forward network dimension + dropout : float, optional + Dropout rate, by default 0.1 + activation : str, optional + Activation function ('relu' or 'gelu'), by default "relu" + attn_mask : tf.Tensor, optional + Attention mask tensor, by default None - import numpy as np + Returns + ------- + tf.Tensor + Output tensor after encoder layer processing. + """ import tensorflow as tf - dropout_layer = tf.keras.layers.Dropout(attention_dropout) - - def _prob_QK(Q, K, sample_k, n_top): - # Q [B, H, L, D] - B, H, L_K, E = ( - tf.shape(K)[0], - tf.shape(K)[1], - tf.shape(K)[2], - tf.shape(K)[3], - ) - L_Q = tf.shape(Q)[2] - - # calculate the sampled Q_K - K_expand = tf.expand_dims(K, axis=-3) # [B, H, 1, L_K, E] - K_expand = tf.tile(K_expand, [1, 1, L_Q, 1, 1]) # [B, H, L_Q, L_K, E] - - # Generate random indices for sampling - index_sample = tf.random.uniform( - (L_Q, sample_k), maxval=L_K, dtype=tf.int32 - ) + # Set default d_ff if not provided + if d_ff is None: + d_ff = 4 * d_model + + # Self-attention using the _attention_out function with parameters + attn_output = self._attention_out( + input_tensor=[input_tensor, input_tensor, input_tensor], + attention_type=attention_type, + mask_flag=mask_flag, + d_model=d_model, + n_heads=n_heads, + factor=factor, + dropout=dropout, + attn_mask=attn_mask, + ) - # Create indices for gathering - batch_indices = tf.range(B)[:, None, None, None, None] - head_indices = tf.range(H)[None, :, None, None, None] - query_indices = tf.range(L_Q)[None, None, :, None, None] - sample_indices = index_sample[None, None, :, :, None] - - # Gather K_sample - gather_indices = tf.concat( - [ - tf.broadcast_to(batch_indices, [B, H, L_Q, sample_k, 1]), - tf.broadcast_to(head_indices, [B, H, L_Q, sample_k, 1]), - tf.broadcast_to(query_indices, [B, H, L_Q, sample_k, 1]), - tf.broadcast_to(sample_indices, [B, H, L_Q, sample_k, 1]), - ], - axis=-1, - ) + # Apply dropout and residual connection + x = input_tensor + tf.keras.layers.Dropout(dropout)(attn_output) - K_sample = tf.gather_nd( - K_expand, gather_indices - ) # [B, H, L_Q, sample_k, E] - - # Calculate Q_K_sample - Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, L_Q, 1, E] - Q_K_sample = tf.matmul( - Q_expanded, K_sample, transpose_b=True - ) # [B, H, L_Q, 1, sample_k] - Q_K_sample = tf.squeeze(Q_K_sample, axis=-2) # [B, H, L_Q, sample_k] - - # find the Top_k query with sparsity measurement - M = tf.reduce_max(Q_K_sample, axis=-1) - 
tf.reduce_sum( - Q_K_sample, axis=-1 - ) / tf.cast(L_K, tf.float32) - M_top = tf.nn.top_k(M, k=n_top, sorted=False).indices - - # use the reduced Q to calculate Q_K - batch_idx = tf.range(B)[:, None, None] - head_idx = tf.range(H)[None, :, None] - - gather_indices_q = tf.stack( - [ - tf.broadcast_to(batch_idx, tf.shape(M_top)), - tf.broadcast_to(head_idx, tf.shape(M_top)), - M_top, - ], - axis=-1, - ) + # First layer normalization + x = tf.keras.layers.LayerNormalization()(x) - Q_reduce = tf.gather_nd(Q, gather_indices_q) # [B, H, n_top, E] - Q_K = tf.matmul(Q_reduce, K, transpose_b=True) # [B, H, n_top, L_K] + # Store for second residual connection + residual = x - return Q_K, M_top + # Feed-forward network + # First 1D convolution (expansion) + y = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)(x) - def _get_initial_context(V, L_Q): - B, H, L_V, D = ( - tf.shape(V)[0], - tf.shape(V)[1], - tf.shape(V)[2], - tf.shape(V)[3], - ) + # Apply activation function + if activation == "relu": + y = tf.keras.layers.ReLU()(y) + else: # gelu + y = tf.keras.layers.Activation("gelu")(y) - if not mask_flag: - V_sum = tf.reduce_mean(V, axis=-2) # [B, H, D] - context = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] - context = tf.tile(context, [1, 1, L_Q, 1]) # [B, H, L_Q, D] - else: - # For masked case, L_Q should equal L_V - context = tf.cumsum(V, axis=-2) - - return context - - def _prob_mask(B, H, L, index, scores): - # Create upper triangular mask (excluding diagonal) - L_scores = tf.shape(scores)[-1] - _mask = tf.linalg.band_part(tf.ones((L, L_scores), dtype=tf.bool), 0, -1) - _mask = tf.logical_not(_mask) # Upper triangular without diagonal - - # Expand mask for batch and head dimensions - _mask_ex = tf.tile( - tf.expand_dims(tf.expand_dims(_mask, 0), 0), [B, H, 1, 1] - ) + # Apply dropout + y = tf.keras.layers.Dropout(dropout)(y) - # Gather mask at specified indices - batch_idx = tf.range(B)[:, None, None] - head_idx = tf.range(H)[None, :, None] - - gather_indices = tf.stack( - [ - tf.broadcast_to(batch_idx, tf.shape(index)), - tf.broadcast_to(head_idx, tf.shape(index)), - index, - ], - axis=-1, - ) + # Second 1D convolution (compression back to d_model) + y = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)(y) - indicator = tf.gather_nd(_mask_ex, gather_indices) - return indicator + # Apply dropout + y = tf.keras.layers.Dropout(dropout)(y) - def _update_context(context_in, V, scores, index, L_Q, attn_mask): - B, H, L_V, D = ( - tf.shape(V)[0], - tf.shape(V)[1], - tf.shape(V)[2], - tf.shape(V)[3], - ) + # Second residual connection and layer normalization + output = tf.keras.layers.LayerNormalization()(residual + y) - if mask_flag: - attn_mask = _prob_mask(B, H, L_Q, index, scores) - scores = tf.where( - attn_mask, tf.fill(tf.shape(scores), float("-inf")), scores - ) + return output - attn = tf.nn.softmax(scores, axis=-1) - - # Calculate attention-weighted values - attn_V = tf.matmul(attn, V) # [B, H, n_top, D] - - # Update context_in at specified indices - batch_idx = tf.range(B)[:, None, None] - head_idx = tf.range(H)[None, :, None] + def _encoder( + self, + input_tensor: tf.Tensor, + e_layers: int, + d_model: int, + d_ff: Optional[int] = None, + dropout: float = 0.1, + activation: str = "relu", + attn_mask: Optional[tf.Tensor] = None, + attention_type: str = "prob", + mask_flag: bool = True, + n_heads: int = 8, + factor: int = 5, + use_conv_layers: bool = False, + c_in: Optional[int] = None, + use_norm: bool = True, + ) -> tf.Tensor: + """ + Apply encoder stack with multiple encoder 
layers and optional conv layers. - update_indices = tf.stack( - [ - tf.broadcast_to(batch_idx, tf.shape(index)), - tf.broadcast_to(head_idx, tf.shape(index)), - index, - ], - axis=-1, - ) + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor of shape [B, L, D] + e_layers : int + Number of encoder layers to stack. + d_model : int + Model dimension (must match input tensor's last dimension). + d_ff : int, optional + Feed-forward network dimension + dropout : float, optional + Dropout rate, by default 0.1 + activation : str, optional + Activation function ('relu' or 'gelu'), by default "relu" + attn_mask : tf.Tensor, optional + Attention mask tensor, by default None + attention_type : str, optional + Type of attention mechanism ('prob' or 'full') + mask_flag : bool, optional + Whether to use attention masking, by default True + n_heads : int, optional + Number of attention heads, by default 8 + factor : int, optional + Attention factor for ProbSparse attention, by default 5 + use_conv_layers : bool, optional + Whether to use convolutional layers between encoder layers + c_in : int, optional + Number of input channels for convolutional layers + use_norm : bool, optional + Whether to apply final layer normalization, by default True - context_in = tf.tensor_scatter_nd_update(context_in, update_indices, attn_V) + Returns + ------- + tf.Tensor + Output tensor after encoder stack processing. + """ + import tensorflow as tf - if output_attention: - # Initialize full attention matrix - attns = tf.ones([B, H, L_V, L_V], dtype=attn.dtype) / tf.cast( - L_V, attn.dtype + # Set default values + if c_in is None: + c_in = d_model + + x = input_tensor + + # Apply encoder layers with optional convolutional layers + if use_conv_layers: + # Apply paired encoder and conv layers + for _ in range(e_layers - 1): + # Apply encoder layer + x = self._encoder_layer( + input_tensor=x, + d_model=d_model, + d_ff=d_ff, + dropout=dropout, + activation=activation, + attn_mask=attn_mask, + attention_type=attention_type, + mask_flag=mask_flag, + n_heads=n_heads, + factor=factor, ) - attns = tf.tensor_scatter_nd_update(attns, update_indices, attn) - return context_in, attns - else: - return context_in, None - - def prob_attention_function( - queries, keys, values, attn_mask=None, training=None - ): - B, L_Q, H, D = ( - tf.shape(queries)[0], - tf.shape(queries)[1], - tf.shape(queries)[2], - tf.shape(queries)[3], - ) - L_K = tf.shape(keys)[1] - - # Transpose to [B, H, L, D] format - queries = tf.transpose(queries, perm=[0, 2, 1, 3]) - keys = tf.transpose(keys, perm=[0, 2, 1, 3]) - values = tf.transpose(values, perm=[0, 2, 1, 3]) - - # Calculate sampling parameters - U_part = int(factor * np.ceil(np.log(L_K))) - u = int(factor * np.ceil(np.log(L_Q))) - - U_part = min(U_part, L_K) - u = min(u, L_Q) - - # Get top-k scores and indices - scores_top, index = _prob_QK(queries, keys, sample_k=U_part, n_top=u) - # Apply scale factor - scale = 1.0 / sqrt(D) - if scale is not None: - scores_top = scores_top * scale + # Apply convolutional layer for downsampling + x = self._conv_layer( + input_tensor=x, + c_in=c_in, + ) - # Get initial context and update with top-k queries - context = _get_initial_context(values, L_Q) - context, attn = _update_context( - context, values, scores_top, index, L_Q, attn_mask + # Apply final encoder layer (without conv layer) + x = self._encoder_layer( + input_tensor=x, + d_model=d_model, + d_ff=d_ff, + dropout=dropout, + activation=activation, + attn_mask=attn_mask, + attention_type=attention_type, 
+ mask_flag=mask_flag, + n_heads=n_heads, + factor=factor, ) - # Transpose back to [B, L, H, D] format - context = tf.transpose(context, perm=[0, 2, 1, 3]) - - return context, attn - - return prob_attention_function - - def _full_attention(self, mask_flag, factor, attention_dropout, output_attention): - """Create full attention mechanism.""" - import numpy as np - import tensorflow as tf - - dropout_layer = tf.keras.layers.Dropout(attention_dropout) - - def _triangular_causal_mask(B, L): - """Create triangular causal mask for attention.""" - mask_shape = [B, 1, L, L] - # Create upper triangular mask (excluding diagonal) - mask = tf.linalg.band_part(tf.ones(mask_shape, dtype=tf.bool), 0, -1) - mask = tf.logical_not(tf.linalg.band_part(mask, 0, 0)) # Remove diagonal - return mask - - def full_attention_function( - queries, keys, values, attn_mask=None, training=None - ): - # Get shapes - B = tf.shape(queries)[0] - L = tf.shape(queries)[1] - H = tf.shape(queries)[2] - E = tf.shape(queries)[3] - S = tf.shape(keys)[1] - D = tf.shape(values)[3] - - # Calculate scale - scale = 1.0 / tf.math.sqrt(tf.cast(E, tf.float32)) - - # Compute attention scores: "blhe,bshe->bhls" - scores = tf.einsum("blhe,bshe->bhls", queries, keys) - - if mask_flag: - if attn_mask is None: - attn_mask = _triangular_causal_mask(B, L) - else: - # If attn_mask is provided, use its mask attribute if it's an object - if hasattr(attn_mask, "mask"): - attn_mask = attn_mask.mask - - # Apply mask by setting masked positions to -inf - scores = tf.where( - attn_mask, - tf.fill(tf.shape(scores), tf.constant(-np.inf, dtype=scores.dtype)), - scores, + else: + # Apply only encoder layers without convolutional layers + for _ in range(e_layers): + x = self._encoder_layer( + input_tensor=x, + d_model=d_model, + d_ff=d_ff, + dropout=dropout, + activation=activation, + attn_mask=attn_mask, + attention_type=attention_type, + mask_flag=mask_flag, + n_heads=n_heads, + factor=factor, ) - # Apply scale and softmax - A = tf.nn.softmax(scale * scores, axis=-1) + # Apply optional final layer normalization + if use_norm: + x = tf.keras.layers.LayerNormalization()(x) - # Apply dropout - A = dropout_layer(A, training=training) + return x - # Compute output: "bhls,bshd->blhd" - V = tf.einsum("bhls,bshd->blhd", A, values) - - if output_attention: - return V, A - else: - return V, None + def _decoder_layer( + self, + input_tensor: tf.Tensor, + cross_tensor: tf.Tensor, + d_model: int, + d_ff: Optional[int] = None, + dropout: float = 0.1, + activation: str = "relu", + x_mask: Optional[tf.Tensor] = None, + cross_mask: Optional[tf.Tensor] = None, + self_attention_type: str = "prob", + cross_attention_type: str = "prob", + mask_flag: bool = True, + n_heads: int = 8, + factor: int = 5, + ) -> tf.Tensor: + """ + Apply decoder layer with self-attention, cross-attention, and FFN. - return full_attention_function + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor of shape [B, L, D] + cross_tensor : tf.Tensor + Cross-attention input tensor (encoder output) of shape [B, L_enc, D] + d_model : int + Model dimension (must match input tensor's last dimension). 
+ d_ff : int, optional + Feed-forward network dimension + dropout : float, optional + Dropout rate, by default 0.1 + activation : str, optional + Activation function ('relu' or 'gelu'), by default "relu" + x_mask : tf.Tensor, optional + Self-attention mask tensor, by default None + cross_mask : tf.Tensor, optional + Cross-attention mask tensor, by default None + self_attention_type : str, optional + Type of self-attention mechanism ('prob' or 'full') + cross_attention_type : str, optional + Type of cross-attention mechanism ('prob' or 'full') + mask_flag : bool, optional + Whether to use attention masking, by default True + n_heads : int, optional + Number of attention heads, by default 8 + factor : int, optional + Attention factor for ProbSparse attention, by default 5 - def _attention_layer(self, attention, d_model, n_heads, mix): - """Create attention layer wrapper.""" + Returns + ------- + tf.Tensor + Output tensor after decoder layer processing with same shape. + """ import tensorflow as tf - d_keys = d_model // n_heads - d_values = d_model // n_heads + # Set default d_ff if not provided + if d_ff is None: + d_ff = 4 * d_model + + # Self-attention block + self_attn_output = self._attention_out( + input_tensor=[input_tensor, input_tensor, input_tensor], + attention_type=self_attention_type, + mask_flag=mask_flag, + d_model=d_model, + n_heads=n_heads, + factor=factor, + dropout=dropout, + attn_mask=x_mask, + ) - # Linear projection layers for Q, K, V - query_dense = tf.keras.layers.Dense(d_model) - key_dense = tf.keras.layers.Dense(d_model) - value_dense = tf.keras.layers.Dense(d_model) + # Apply dropout and first residual connection + x = input_tensor + tf.keras.layers.Dropout(dropout)(self_attn_output) + + # First layer normalization + x = tf.keras.layers.LayerNormalization()(x) + + # Cross-attention block + cross_attn_output = self._attention_out( + input_tensor=[x, cross_tensor, cross_tensor], + attention_type=cross_attention_type, + mask_flag=mask_flag, + d_model=d_model, + n_heads=n_heads, + factor=factor, + dropout=dropout, + attn_mask=cross_mask, + ) - # Output projection - out_projection = tf.keras.layers.Dense(d_model) + # Apply dropout and second residual connection + x = x + tf.keras.layers.Dropout(dropout)(cross_attn_output) - def attention_layer_function( - queries, keys, values, attn_mask=None, training=None - ): - B, L, _ = tf.shape(queries)[0], tf.shape(queries)[1], tf.shape(queries)[2] - S = tf.shape(keys)[1] - H = n_heads + # Second layer normalization + x = tf.keras.layers.LayerNormalization()(x) - # Linear projections in batch from d_model => h x d_k - Q = query_dense(queries) - K = key_dense(keys) - V = value_dense(values) + # Store for third residual connection + residual = x - # Reshape to (B, L, H, d_k) and transpose to (B, H, L, d_k) - Q = tf.reshape(Q, [B, L, H, d_keys]) - K = tf.reshape(K, [B, S, H, d_keys]) - V = tf.reshape(V, [B, S, H, d_values]) + # Feed-forward network + # First 1D convolution (expansion) + y = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)(x) - Q = tf.transpose(Q, [0, 2, 1, 3]) # (B, H, L, d_k) - K = tf.transpose(K, [0, 2, 1, 3]) # (B, H, S, d_k) - V = tf.transpose(V, [0, 2, 1, 3]) # (B, H, S, d_v) + # Apply activation function + if activation == "relu": + y = tf.keras.layers.ReLU()(y) + else: # gelu + y = tf.keras.layers.Activation("gelu")(y) - # Apply attention function - out, attn = attention(Q, K, V, attn_mask=attn_mask, training=training) + # Apply dropout + y = tf.keras.layers.Dropout(dropout)(y) - # Concatenate heads and put 
through final linear layer - # out shape: (B, H, L, d_v) -> (B, L, H, d_v) -> (B, L, H*d_v) - out = tf.transpose(out, [0, 2, 1, 3]) - out = tf.reshape(out, [B, L, H * d_values]) + # Second 1D convolution (compression back to d_model) + y = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)(y) - # Apply mix transformation if needed - if mix: - # Reshape to (B, L, H, d_values) then transpose to (B, H, L, d_values) - out = tf.reshape(out, [B, L, H, d_values]) - out = tf.transpose(out, [0, 2, 1, 3]) - out = tf.reshape(out, [B, L, H * d_values]) + # Apply dropout + y = tf.keras.layers.Dropout(dropout)(y) - # Final output projection - out = out_projection(out) + # Third residual connection and final layer normalization + output = tf.keras.layers.LayerNormalization()(residual + y) - return out, attn + return output - return attention_layer_function + def _decoder( + self, + input_tensor: tf.Tensor, + cross_tensor: tf.Tensor, + d_layers: int, + d_model: int, + d_ff: Optional[int] = None, + dropout: float = 0.1, + activation: str = "relu", + x_mask: Optional[tf.Tensor] = None, + cross_mask: Optional[tf.Tensor] = None, + self_attention_type: str = "prob", + cross_attention_type: str = "prob", + mask_flag: bool = True, + n_heads: int = 8, + factor: int = 5, + use_norm: bool = True, + ) -> tf.Tensor: + """ + Apply decoder stack with multiple decoder layers and optional normalization. - def _encoder_layer(self, attention_layer, d_model, d_ff, dropout, activation): - """Create single encoder layer.""" + Parameters + ---------- + input_tensor : tf.Tensor + Decoder input tensor of shape [B, L_dec, D] + cross_tensor : tf.Tensor + Cross-attention input tensor (encoder output) of shape [B, L_enc, D] + d_layers : int + Number of decoder layers to stack. + d_model : int + Model dimension (must match input tensor's last dimension). + d_ff : int, optional + Feed-forward network dimension + dropout : float, optional + Dropout rate, by default 0.1 + activation : str, optional + Activation function ('relu' or 'gelu'), by default "relu" + x_mask : tf.Tensor, optional + Self-attention mask tensor for decoder, by default None + cross_mask : tf.Tensor, optional + Cross-attention mask tensor, by default None + self_attention_type : str, optional + Type of self-attention mechanism ('prob' or 'full') + cross_attention_type : str, optional + Type of cross-attention mechanism ('prob' or 'full') + mask_flag : bool, optional + Whether to use attention masking, by default True + n_heads : int, optional + Number of attention heads, by default 8 + factor : int, optional + Attention factor for ProbSparse attention, by default 5 + use_norm : bool, optional + Whether to apply final layer normalization, by default True + + Returns + ------- + tf.Tensor + Output tensor after decoder stack processing. 
+ """ import tensorflow as tf - d_ff = d_ff or 4 * d_model + x = input_tensor + + # Apply multiple decoder layers + for _ in range(d_layers): + x = self._decoder_layer( + input_tensor=x, + cross_tensor=cross_tensor, + d_model=d_model, + d_ff=d_ff, + dropout=dropout, + activation=activation, + x_mask=x_mask, + cross_mask=cross_mask, + self_attention_type=self_attention_type, + cross_attention_type=cross_attention_type, + mask_flag=mask_flag, + n_heads=n_heads, + factor=factor, + ) - # Conv1D layers for feed-forward network - conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1) - conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1) + # Apply optional final layer normalization + if use_norm: + x = tf.keras.layers.LayerNormalization()(x) - # Layer normalization - norm1 = tf.keras.layers.LayerNormalization() - norm2 = tf.keras.layers.LayerNormalization() + return x - # Dropout - dropout_layer = tf.keras.layers.Dropout(dropout) + def build_network( + self, input_shape: tuple[int, int], **kwargs + ) -> tuple[list[tf.Tensor], tf.Tensor]: + """Build the complete Informer architecture for time series forecasting.""" + import tensorflow as tf - # Activation function - if activation == "relu": - activation_fn = tf.nn.relu - else: - activation_fn = tf.nn.gelu + # Get input dimensions + n_timepoints, n_channels = input_shape - def encoder_layer_function(x, attn_mask=None, training=None): - # Self-attention with residual connection - new_x, attn = attention_layer( - x, x, x, attn_mask=attn_mask, training=training - ) - x = x + dropout_layer(new_x, training=training) - y = x = norm1(x, training=training) + # hardcode batch_size for now + batch_size = 32 - # Feed-forward network with residual connection - y = conv1(y) - y = dropout_layer(activation_fn(y), training=training) - y = conv2(y) - y = dropout_layer(y, training=training) + # Create input layers for encoder and decoder + encoder_input = tf.keras.layers.Input( + shape=(self.seq_len, n_channels), + name="encoder_input", + batch_size=batch_size, + ) - return norm2(x + y, training=training), attn + decoder_input = tf.keras.layers.Input( + shape=(self.label_len + self.out_len, n_channels), + name="decoder_input", + batch_size=batch_size, + ) - return encoder_layer_function + # Encoder embedding + enc_embedded = self._data_embedding( + input_tensor=encoder_input, + c_in=n_channels, + d_model=self.d_model, + dropout=self.dropout, + max_len=self.seq_len, + ) - def _decoder_layer( - self, self_attention, cross_attention, d_model, d_ff, dropout, activation - ): - """Create single decoder layer.""" - import tensorflow as tf + # Encoder processing + enc_output = self._encoder( + input_tensor=enc_embedded, + e_layers=self.e_layers, + d_model=self.d_model, + d_ff=self.d_ff, + dropout=self.dropout, + activation=self.activation, + attention_type=self.attn, + mask_flag=False, + n_heads=self.n_heads, + factor=self.factor, + use_conv_layers=self.distil, + c_in=self.d_model, + use_norm=True, + ) - d_ff = d_ff or 4 * d_model + # Decoder embedding + dec_embedded = self._data_embedding( + input_tensor=decoder_input, + c_in=n_channels, + d_model=self.d_model, + dropout=self.dropout, + max_len=self.label_len + self.out_len, + ) - # Conv1D layers equivalent to PyTorch's Conv1d - conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1) - conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1) + # Decoder processing + dec_output = self._decoder( + input_tensor=dec_embedded, + cross_tensor=enc_output, + d_layers=self.d_layers, + d_model=self.d_model, 
+ d_ff=self.d_ff, + dropout=self.dropout, + activation=self.activation, + self_attention_type=self.attn, + cross_attention_type="full", + mask_flag=self.mix, + n_heads=self.n_heads, + factor=self.factor, + use_norm=True, + ) - # Layer normalization - norm1 = tf.keras.layers.LayerNormalization() - norm2 = tf.keras.layers.LayerNormalization() - norm3 = tf.keras.layers.LayerNormalization() + # Final projection to output dimension + output = tf.keras.layers.Dense(n_channels, name="output_projection")(dec_output) - # Dropout - dropout_layer = tf.keras.layers.Dropout(dropout) + # Extract only the prediction part (last out_len timesteps) + output = output[:, -self.out_len :, :] - # Activation function - if activation == "relu": - activation_fn = tf.nn.relu - else: - activation_fn = tf.nn.gelu - - def decoder_layer_function( - x, cross, x_mask=None, cross_mask=None, training=None - ): - # Self-attention with residual connection - self_attn_out = self_attention( - x, x, x, attn_mask=x_mask, training=training - )[0] - x = x + dropout_layer(self_attn_out, training=training) - x = norm1(x, training=training) - - # Cross-attention with residual connection - cross_attn_out = cross_attention( - x, cross, cross, attn_mask=cross_mask, training=training - )[0] - x = x + dropout_layer(cross_attn_out, training=training) - y = x = norm2(x, training=training) - - # Feed-forward network with residual connection - y = conv1(y) - y = dropout_layer(activation_fn(y), training=training) - y = conv2(y) - y = dropout_layer(y, training=training) - - return norm3(x + y, training=training) - - return decoder_layer_function - - def _conv_layer(self, d_model): - """Create convolution layer for distilling.""" - import tensorflow as tf + # Create the model with both encoder and decoder inputs + inputs = [encoder_input, decoder_input] - # TensorFlow doesn't have direct circular padding, using 'same' padding - downConv = tf.keras.layers.Conv1D( - filters=d_model, kernel_size=3, padding="same", activation=None - ) - norm = tf.keras.layers.BatchNormalization() - activation = tf.keras.layers.ELU() - maxPool = tf.keras.layers.MaxPool1D(pool_size=3, strides=2, padding="same") - - def conv_layer_function(x, training=None): - # x shape: [B, L, D] -> Conv1D expects [B, L, C] - x = downConv(x) - x = norm(x, training=training) - x = activation(x) - x = maxPool(x) - return x - - return conv_layer_function + return inputs, output diff --git a/aeon/networks/tests/test_informer.py b/aeon/networks/tests/test_informer.py new file mode 100644 index 0000000000..9d5be59351 --- /dev/null +++ b/aeon/networks/tests/test_informer.py @@ -0,0 +1,221 @@ +"""Tests for the Informer Network Model.""" + +import random + +import pytest + +from aeon.networks import InformerNetwork +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +@pytest.mark.parametrize( + "seq_len,label_len,out_len,d_model,n_heads,e_layers,d_layers", + [ + (96, 48, 24, 512, 8, 3, 2), + (48, 24, 12, 256, 4, 2, 1), + (120, 60, 30, 128, 2, 1, 1), + (72, 36, 18, 64, 1, 2, 2), + ], +) +def test_informer_network_init( + seq_len, + label_len, + out_len, + d_model, + n_heads, + e_layers, + d_layers, +): + """Test whether InformerNetwork initializes correctly for various parameters.""" + informer = InformerNetwork( + seq_len=seq_len, + label_len=label_len, + out_len=out_len, + d_model=d_model, + n_heads=n_heads, + 
e_layers=e_layers, + d_layers=d_layers, + factor=random.choice([3, 5, 7]), + dropout=random.choice([0.0, 0.1, 0.2]), + attn=random.choice(["prob", "full"]), + activation=random.choice(["relu", "gelu"]), + ) + + inputs, outputs = informer.build_network((seq_len + label_len, 5)) + assert inputs is not None + assert outputs is not None + assert len(inputs) == 2 # encoder_input and decoder_input + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +@pytest.mark.parametrize( + "attn,activation", + [("prob", "relu"), ("full", "gelu"), ("prob", "gelu"), ("full", "relu")], +) +def test_informer_network_attention_activation(attn, activation): + """Test InformerNetwork with different attention and activation.""" + informer = InformerNetwork( + seq_len=96, + label_len=48, + out_len=24, + d_model=128, + n_heads=4, + e_layers=2, + d_layers=1, + attn=attn, + activation=activation, + ) + + inputs, outputs = informer.build_network((144, 3)) + assert inputs is not None + assert outputs is not None + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +@pytest.mark.parametrize( + "distil,mix,factor", + [(True, True, 5), (False, False, 3), (True, False, 7), (False, True, 2)], +) +def test_informer_network_distil_mix_factor(distil, mix, factor): + """Test whether InformerNetwork works with different configurations.""" + informer = InformerNetwork( + seq_len=48, + label_len=24, + out_len=12, + d_model=64, + n_heads=2, + e_layers=1, + d_layers=1, + distil=distil, + mix=mix, + factor=factor, + ) + + inputs, outputs = informer.build_network((72, 2)) + assert inputs is not None + assert outputs is not None + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +def test_informer_network_output_shape(): + """Test whether InformerNetwork produces correct output shapes.""" + seq_len = 96 + label_len = 48 + out_len = 24 + n_channels = 5 + # batch_size = 32 + + informer = InformerNetwork( + seq_len=seq_len, + label_len=label_len, + out_len=out_len, + d_model=128, + n_heads=4, + e_layers=2, + d_layers=1, + ) + + inputs, outputs = informer.build_network((seq_len + label_len, n_channels)) + + # Create a TensorFlow model to test actual shapes + if _check_soft_dependencies(["tensorflow"], severity="none"): + # keras_model = tf.keras.Model(inputs=inputs, outputs=outputs) + + # Test input shapes + encoder_input_shape = inputs[0].shape + decoder_input_shape = inputs[1].shape + + assert encoder_input_shape[1] == seq_len # sequence length + assert encoder_input_shape[2] == n_channels # number of channels + assert decoder_input_shape[1] == label_len + out_len # decoder sequence length + assert decoder_input_shape[2] == n_channels # number of channels + + # Test output shape + output_shape = outputs.shape + assert output_shape[1] == out_len # prediction length + assert output_shape[2] == n_channels # number of channels + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +def test_informer_network_default_parameters(): + """Test whether InformerNetwork works with default parameters.""" + informer = InformerNetwork() + + inputs, outputs = informer.build_network((120, 1)) + assert inputs is not None + assert outputs is not None + + # Check default values + assert 
informer.seq_len == 96 + assert informer.label_len == 48 + assert informer.out_len == 24 + assert informer.d_model == 512 + assert informer.n_heads == 8 + assert informer.e_layers == 3 + assert informer.d_layers == 2 + assert informer.attn == "prob" + assert informer.activation == "gelu" + assert informer.distil + assert informer.mix + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +def test_informer_network_parameter_validation(): + """Test whether InformerNetwork handles edge case parameters correctly.""" + # Test minimum viable configuration + informer = InformerNetwork( + seq_len=12, + label_len=6, + out_len=3, + d_model=32, + n_heads=1, + e_layers=1, + d_layers=1, + factor=1, + dropout=0.0, + ) + + inputs, outputs = informer.build_network((18, 1)) + assert inputs is not None + assert outputs is not None + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +def test_informer_network_different_channels(): + """Test whether InformerNetwork works with different numbers of input channels.""" + for n_channels in [1, 3, 5, 10]: + informer = InformerNetwork( + seq_len=48, + label_len=24, + out_len=12, + d_model=64, + n_heads=2, + e_layers=1, + d_layers=1, + ) + + inputs, outputs = informer.build_network((72, n_channels)) + assert inputs is not None + assert outputs is not None diff --git a/aeon/utils/networks/attention.py b/aeon/utils/networks/attention.py new file mode 100644 index 0000000000..025a625641 --- /dev/null +++ b/aeon/utils/networks/attention.py @@ -0,0 +1,350 @@ +"""Full Attention, ProbSparseAttention and Attention Layer.""" + +from aeon.utils.validation._dependencies import _check_soft_dependencies + +if _check_soft_dependencies(["tensorflow"], severity="none"): + import numpy as np + import tensorflow as tf + from tensorflow.keras.layers import Dropout, Layer + + +class KerasProbAttention(Layer): + """Keras implementation of ProbSparse Attention mechanism for Informer.""" + + def __init__( + self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, **kwargs + ): + """Initialize KerasProbAttention layer.""" + super().__init__(**kwargs) + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.attention_dropout = attention_dropout + self.dropout = Dropout(attention_dropout) + + def build(self, input_shape): + """Build the layer.""" + super().build(input_shape) + + def compute_output_shape(self, input_shape): + """Compute output shape for the layer.""" + # Return the same shape as queries input + return input_shape[0] # queries shape + + def compute_output_spec(self, input_spec): + """Compute output spec for the layer.""" + return input_spec[0] # Return queries spec + + def _prob_QK(self, Q, K, sample_k, n_top): + """Compute probabilistic QK with fixed dimension handling.""" + B, H, L, _ = tf.shape(Q)[0], tf.shape(Q)[1], tf.shape(Q)[2], tf.shape(Q)[3] + S = tf.shape(K)[2] + + # Ensure sample_k doesn't exceed available dimensions + sample_k = tf.minimum(sample_k, L) + n_top = tf.minimum(n_top, S) # Ensure n_top doesn't exceed sequence length + + # Expand K for sampling + K_expand = tf.expand_dims(K, axis=2) # [B, H, 1, L, E] + K_expand = tf.tile(K_expand, [1, 1, S, 1, 1]) # [B, H, S, L, E] + + # Generate random indices - ensure they're within bounds + indx_q_seq = tf.random.uniform([S], maxval=L, dtype=tf.int32) + indx_k_seq = tf.random.uniform([sample_k], 
maxval=L, dtype=tf.int32) + + # Gather operations for sampling + indices_s = tf.range(S) + K_sample = tf.gather(K_expand, indices_s, axis=2) + K_sample = tf.gather(K_sample, indx_q_seq, axis=2) + K_sample = tf.gather(K_sample, indx_k_seq, axis=3) + + # Matrix multiplication for Q_K_sample + Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, S, 1, E] + K_sample_transposed = tf.transpose(K_sample, perm=[0, 1, 2, 4, 3]) + Q_K_sample = tf.squeeze(tf.matmul(Q_expanded, K_sample_transposed), axis=-2) + + # Sparsity measurement calculation + M_max = tf.reduce_max(Q_K_sample, axis=-1) + M_mean = tf.reduce_sum(Q_K_sample, axis=-1) / tf.cast(sample_k, tf.float32) + M = M_max - M_mean + + # Top-k selection with dynamic k + actual_k = tf.minimum(n_top, tf.shape(M)[-1]) + _, M_top = tf.nn.top_k(M, k=actual_k, sorted=False) + + # Create indices for gather_nd + batch_range = tf.range(B) + head_range = tf.range(H) + batch_indices = tf.tile( + tf.expand_dims(tf.expand_dims(batch_range, 1), 2), [1, H, actual_k] + ) + + head_indices = tf.tile( + tf.expand_dims(tf.expand_dims(head_range, 0), 2), [B, 1, actual_k] + ) + + # Stack indices for gather_nd + idx = tf.stack([batch_indices, head_indices, M_top], axis=-1) + + # Reduce Q and calculate final Q_K + Q_reduce = tf.gather_nd(Q, idx) + K_transposed = tf.transpose(K, perm=[0, 1, 3, 2]) + Q_K = tf.matmul(Q_reduce, K_transposed) + + return Q_K, M_top + + def _get_initial_context(self, V, L_Q): + """Get initial context using Keras-compatible operations.""" + if not self.mask_flag: + # Sum reduction and broadcasting + V_sum = tf.reduce_sum(V, axis=-2) # [B, H, D] + V_sum_expanded = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] + context = tf.tile(V_sum_expanded, [1, 1, L_Q, 1]) # [B, H, L_Q, D] + else: + # Cumulative sum for masked attention + context = tf.cumsum(V, axis=-2) + + return context + + def _create_prob_mask(self, B, H, L, index, scores): + """Create probability mask for tf.where compatibility.""" + # Create base mask with ones + _mask = tf.ones((L, tf.shape(scores)[-1]), dtype=tf.float32) + + # Create upper triangular matrix (including diagonal) + mask_a = tf.linalg.band_part( + _mask, 0, -1 + ) # Upper triangular matrix of 0s and 1s + + # Create diagonal matrix + mask_b = tf.linalg.band_part(_mask, 0, 0) # Diagonal matrix of 0s and 1s + + # Subtract diagonal from upper triangular to get strict upper triangular + _mask = tf.cast(mask_a - mask_b, dtype=tf.float32) + + # Broadcast to [B, H, L, scores.shape[-1]] + _mask_ex = tf.broadcast_to(_mask, [B, H, L, tf.shape(scores)[-1]]) + + # Create indexing tensors + batch_indices = tf.range(B)[:, None, None] + head_indices = tf.range(H)[None, :, None] + + # Extract indicator using advanced indexing + indicator = tf.gather_nd( + _mask_ex, + tf.stack( + [ + tf.broadcast_to(batch_indices, tf.shape(index)), + tf.broadcast_to(head_indices, tf.shape(index)), + index, + ], + axis=-1, + ), + ) + + # Reshape to match scores shape + prob_mask_float = tf.reshape(indicator, tf.shape(scores)) + + # **KEY FIX**: Convert to boolean tensor + prob_mask_bool = tf.cast(prob_mask_float, tf.bool) + + return prob_mask_bool + + def _update_context(self, context_in, V, scores, index, L_Q): + """Update context using Keras-compatible operations.""" + if self.mask_flag: + # Apply simple masking + attn_mask = self._create_prob_mask( + tf.shape(V)[0], tf.shape(V)[1], L_Q, index, scores + ) + + # Apply mask with large negative value + large_neg = -1e9 + mask_value = tf.where(attn_mask, 0.0, large_neg) + scores = scores + mask_value + + # 
Softmax activation + attn = tf.nn.softmax(scores, axis=-1) + attn = self.dropout(attn) + + # Create indices for scatter update + B, H = tf.shape(V)[0], tf.shape(V)[1] + index_shape = tf.shape(index)[-1] + + batch_indices = tf.tile( + tf.expand_dims(tf.expand_dims(tf.range(B), 1), 2), [1, H, index_shape] + ) + + head_indices = tf.tile( + tf.expand_dims(tf.expand_dims(tf.range(H), 0), 2), [B, 1, index_shape] + ) + + idx = tf.stack([batch_indices, head_indices, index], axis=-1) + + # Matrix multiplication and scatter update + attn_V = tf.matmul(attn, V) + context_updated = tf.tensor_scatter_nd_update(context_in, idx, attn_V) + + return context_updated + + def call(self, inputs, attention_mask=None, training=None): + """Run forward pass with fixed tensor operations.""" + queries, keys, values = inputs + + # Get shapes + # B = tf.shape(queries)[0] + L = tf.shape(queries)[1] # sequence length + # H = tf.shape(queries)[2] # number of heads + D = tf.shape(queries)[3] # dimension per head + S = tf.shape(keys)[1] # source sequence length + + # Reshape tensors - transpose to [B, H, L, D] + queries = tf.transpose(queries, perm=[0, 2, 1, 3]) # [B, H, L, D] + keys = tf.transpose(keys, perm=[0, 2, 1, 3]) # [B, H, S, D] + values = tf.transpose(values, perm=[0, 2, 1, 3]) # [B, H, S, D] + + # Calculate sampling parameters with bounds checking + # Use tf.py_function to handle numpy operations safely + def safe_log_calc(seq_len, factor): + if hasattr(seq_len, "numpy"): + return int(factor * np.ceil(np.log(max(seq_len.numpy(), 2)))) + else: + return int(factor * np.ceil(np.log(20))) # fallback + + U = tf.py_function( + func=lambda: safe_log_calc(S, self.factor), inp=[], Tout=tf.int32 + ) + + u = tf.py_function( + func=lambda: safe_log_calc(L, self.factor), inp=[], Tout=tf.int32 + ) + + # Ensure U and u are within reasonable bounds + U = tf.minimum(U, S) # Can't select more than available + u = tf.minimum(u, L) + + # Probabilistic QK computation + scores_top, index = self._prob_QK(queries, keys, u, U) + + # Apply scale factor + scale = self.scale or (1.0 / tf.sqrt(tf.cast(D, tf.float32))) + scores_top = scores_top * scale + + # Get initial context + context = self._get_initial_context(values, L) + + # Update context with selected queries + context = self._update_context(context, values, scores_top, index, L) + + # Transpose back to original format [B, L, H, D] + context = tf.transpose(context, perm=[0, 2, 1, 3]) + + return context + + def get_config(self): + """Return the config of the layer.""" + config = super().get_config() + config.update( + { + "mask_flag": self.mask_flag, + "factor": self.factor, + "scale": self.scale, + "attention_dropout": self.attention_dropout, + } + ) + return config + + @classmethod + def from_config(cls, config): + """Create layer from config.""" + return cls(**config) + + +class AttentionLayer(Layer): + """Keras multi-head attention layer using a custom attention mechanism.""" + + def __init__( + self, attention, d_model, n_heads, d_keys=None, d_values=None, **kwargs + ): + super().__init__(**kwargs) + self.d_keys = d_keys or (d_model // n_heads) + self.d_values = d_values or (d_model // n_heads) + self.d_model = d_model + self.n_heads = n_heads + + # Store the attention mechanism + self.inner_attention = attention + + # Projection layers + self.query_projection = tf.keras.layers.Dense( + self.d_keys * n_heads, name="query_proj" + ) + + self.key_projection = tf.keras.layers.Dense( + self.d_keys * n_heads, name="key_proj" + ) + + self.value_projection = tf.keras.layers.Dense( + 
self.d_values * n_heads, name="value_proj" + ) + + self.out_projection = tf.keras.layers.Dense(d_model, name="output_proj") + + def build(self, input_shape): + """Build the layer.""" + # Build the projection layers + super().build(input_shape) + + def compute_output_shape(self, input_shape): + """Compute output shape for the layer.""" + # Output shape is same as queries input shape but with d_model as last dimension + batch_size, seq_length, _ = input_shape[0] + return (batch_size, seq_length, self.d_model) + + def call(self, inputs, attn_mask=None, training=None): + """Run forward pass for the attention layer.""" + queries, keys, values = inputs + + # Get batch size and sequence lengths dynamically + B = tf.shape(queries)[0] + L = tf.shape(queries)[1] # target sequence length + S = tf.shape(keys)[1] # source sequence length + H = self.n_heads + + # Apply projections + queries_proj = self.query_projection(queries) # [B, L, d_keys * n_heads] + keys_proj = self.key_projection(keys) # [B, S, d_keys * n_heads] + values_proj = self.value_projection(values) # [B, S, d_values * n_heads] + + # Reshape to multi-head format: [B, L/S, H, d_keys/d_values] + queries_reshaped = tf.reshape(queries_proj, (B, L, H, self.d_keys)) + keys_reshaped = tf.reshape(keys_proj, (B, S, H, self.d_keys)) + values_reshaped = tf.reshape(values_proj, (B, S, H, self.d_values)) + + # Apply inner attention mechanism + attention_output = self.inner_attention( + [queries_reshaped, keys_reshaped, values_reshaped], + attention_mask=attn_mask, + training=training, + ) + + # Reshape attention output back to [B, L, H * d_values] + attention_flattened = tf.reshape(attention_output, (B, L, H * self.d_values)) + + # Final output projection + output = self.out_projection(attention_flattened) + + return output + + def get_config(self): + """Return the config of the layer.""" + config = super().get_config() + config.update( + { + "d_model": self.d_model, + "n_heads": self.n_heads, + "d_keys": self.d_keys, + "d_values": self.d_values, + } + ) + return config From 3fb1ef31263cc36d1c2a7c8a57b2bb57832d8532 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:13:27 +0530 Subject: [PATCH 04/10] attention changed --- aeon/utils/networks/attention.py | 653 +++++++++++++++---------------- 1 file changed, 326 insertions(+), 327 deletions(-) diff --git a/aeon/utils/networks/attention.py b/aeon/utils/networks/attention.py index 025a625641..1a724c35ec 100644 --- a/aeon/utils/networks/attention.py +++ b/aeon/utils/networks/attention.py @@ -7,344 +7,343 @@ import tensorflow as tf from tensorflow.keras.layers import Dropout, Layer + class KerasProbAttention(Layer): + """Keras implementation of ProbSparse Attention mechanism for Informer.""" + + def __init__( + self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, **kwargs + ): + """Initialize KerasProbAttention layer.""" + super().__init__(**kwargs) + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.attention_dropout = attention_dropout + self.dropout = Dropout(attention_dropout) + + def build(self, input_shape): + """Build the layer.""" + super().build(input_shape) + + def compute_output_shape(self, input_shape): + """Compute output shape for the layer.""" + # Return the same shape as queries input + return input_shape[0] # queries shape + + def compute_output_spec(self, input_spec): + """Compute output spec for the layer.""" + return input_spec[0] # Return queries spec + + def _prob_QK(self, Q, K, sample_k, n_top): + """Compute probabilistic QK 
with fixed dimension handling.""" + B, H, L, _ = tf.shape(Q)[0], tf.shape(Q)[1], tf.shape(Q)[2], tf.shape(Q)[3] + S = tf.shape(K)[2] + + # Ensure sample_k doesn't exceed available dimensions + sample_k = tf.minimum(sample_k, L) + n_top = tf.minimum(n_top, S) # Ensure n_top doesn't exceed sequence length + + # Expand K for sampling + K_expand = tf.expand_dims(K, axis=2) # [B, H, 1, L, E] + K_expand = tf.tile(K_expand, [1, 1, S, 1, 1]) # [B, H, S, L, E] + + # Generate random indices - ensure they're within bounds + indx_q_seq = tf.random.uniform([S], maxval=L, dtype=tf.int32) + indx_k_seq = tf.random.uniform([sample_k], maxval=L, dtype=tf.int32) + + # Gather operations for sampling + indices_s = tf.range(S) + K_sample = tf.gather(K_expand, indices_s, axis=2) + K_sample = tf.gather(K_sample, indx_q_seq, axis=2) + K_sample = tf.gather(K_sample, indx_k_seq, axis=3) + + # Matrix multiplication for Q_K_sample + Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, S, 1, E] + K_sample_transposed = tf.transpose(K_sample, perm=[0, 1, 2, 4, 3]) + Q_K_sample = tf.squeeze(tf.matmul(Q_expanded, K_sample_transposed), axis=-2) + + # Sparsity measurement calculation + M_max = tf.reduce_max(Q_K_sample, axis=-1) + M_mean = tf.reduce_sum(Q_K_sample, axis=-1) / tf.cast(sample_k, tf.float32) + M = M_max - M_mean + + # Top-k selection with dynamic k + actual_k = tf.minimum(n_top, tf.shape(M)[-1]) + _, M_top = tf.nn.top_k(M, k=actual_k, sorted=False) + + # Create indices for gather_nd + batch_range = tf.range(B) + head_range = tf.range(H) + batch_indices = tf.tile( + tf.expand_dims(tf.expand_dims(batch_range, 1), 2), [1, H, actual_k] + ) -class KerasProbAttention(Layer): - """Keras implementation of ProbSparse Attention mechanism for Informer.""" - - def __init__( - self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, **kwargs - ): - """Initialize KerasProbAttention layer.""" - super().__init__(**kwargs) - self.factor = factor - self.scale = scale - self.mask_flag = mask_flag - self.attention_dropout = attention_dropout - self.dropout = Dropout(attention_dropout) - - def build(self, input_shape): - """Build the layer.""" - super().build(input_shape) - - def compute_output_shape(self, input_shape): - """Compute output shape for the layer.""" - # Return the same shape as queries input - return input_shape[0] # queries shape - - def compute_output_spec(self, input_spec): - """Compute output spec for the layer.""" - return input_spec[0] # Return queries spec - - def _prob_QK(self, Q, K, sample_k, n_top): - """Compute probabilistic QK with fixed dimension handling.""" - B, H, L, _ = tf.shape(Q)[0], tf.shape(Q)[1], tf.shape(Q)[2], tf.shape(Q)[3] - S = tf.shape(K)[2] - - # Ensure sample_k doesn't exceed available dimensions - sample_k = tf.minimum(sample_k, L) - n_top = tf.minimum(n_top, S) # Ensure n_top doesn't exceed sequence length - - # Expand K for sampling - K_expand = tf.expand_dims(K, axis=2) # [B, H, 1, L, E] - K_expand = tf.tile(K_expand, [1, 1, S, 1, 1]) # [B, H, S, L, E] - - # Generate random indices - ensure they're within bounds - indx_q_seq = tf.random.uniform([S], maxval=L, dtype=tf.int32) - indx_k_seq = tf.random.uniform([sample_k], maxval=L, dtype=tf.int32) - - # Gather operations for sampling - indices_s = tf.range(S) - K_sample = tf.gather(K_expand, indices_s, axis=2) - K_sample = tf.gather(K_sample, indx_q_seq, axis=2) - K_sample = tf.gather(K_sample, indx_k_seq, axis=3) - - # Matrix multiplication for Q_K_sample - Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, S, 1, E] - 
K_sample_transposed = tf.transpose(K_sample, perm=[0, 1, 2, 4, 3]) - Q_K_sample = tf.squeeze(tf.matmul(Q_expanded, K_sample_transposed), axis=-2) - - # Sparsity measurement calculation - M_max = tf.reduce_max(Q_K_sample, axis=-1) - M_mean = tf.reduce_sum(Q_K_sample, axis=-1) / tf.cast(sample_k, tf.float32) - M = M_max - M_mean - - # Top-k selection with dynamic k - actual_k = tf.minimum(n_top, tf.shape(M)[-1]) - _, M_top = tf.nn.top_k(M, k=actual_k, sorted=False) - - # Create indices for gather_nd - batch_range = tf.range(B) - head_range = tf.range(H) - batch_indices = tf.tile( - tf.expand_dims(tf.expand_dims(batch_range, 1), 2), [1, H, actual_k] - ) - - head_indices = tf.tile( - tf.expand_dims(tf.expand_dims(head_range, 0), 2), [B, 1, actual_k] - ) - - # Stack indices for gather_nd - idx = tf.stack([batch_indices, head_indices, M_top], axis=-1) - - # Reduce Q and calculate final Q_K - Q_reduce = tf.gather_nd(Q, idx) - K_transposed = tf.transpose(K, perm=[0, 1, 3, 2]) - Q_K = tf.matmul(Q_reduce, K_transposed) - - return Q_K, M_top - - def _get_initial_context(self, V, L_Q): - """Get initial context using Keras-compatible operations.""" - if not self.mask_flag: - # Sum reduction and broadcasting - V_sum = tf.reduce_sum(V, axis=-2) # [B, H, D] - V_sum_expanded = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] - context = tf.tile(V_sum_expanded, [1, 1, L_Q, 1]) # [B, H, L_Q, D] - else: - # Cumulative sum for masked attention - context = tf.cumsum(V, axis=-2) - - return context - - def _create_prob_mask(self, B, H, L, index, scores): - """Create probability mask for tf.where compatibility.""" - # Create base mask with ones - _mask = tf.ones((L, tf.shape(scores)[-1]), dtype=tf.float32) - - # Create upper triangular matrix (including diagonal) - mask_a = tf.linalg.band_part( - _mask, 0, -1 - ) # Upper triangular matrix of 0s and 1s - - # Create diagonal matrix - mask_b = tf.linalg.band_part(_mask, 0, 0) # Diagonal matrix of 0s and 1s - - # Subtract diagonal from upper triangular to get strict upper triangular - _mask = tf.cast(mask_a - mask_b, dtype=tf.float32) - - # Broadcast to [B, H, L, scores.shape[-1]] - _mask_ex = tf.broadcast_to(_mask, [B, H, L, tf.shape(scores)[-1]]) - - # Create indexing tensors - batch_indices = tf.range(B)[:, None, None] - head_indices = tf.range(H)[None, :, None] - - # Extract indicator using advanced indexing - indicator = tf.gather_nd( - _mask_ex, - tf.stack( - [ - tf.broadcast_to(batch_indices, tf.shape(index)), - tf.broadcast_to(head_indices, tf.shape(index)), - index, - ], - axis=-1, - ), - ) - - # Reshape to match scores shape - prob_mask_float = tf.reshape(indicator, tf.shape(scores)) - - # **KEY FIX**: Convert to boolean tensor - prob_mask_bool = tf.cast(prob_mask_float, tf.bool) - - return prob_mask_bool - - def _update_context(self, context_in, V, scores, index, L_Q): - """Update context using Keras-compatible operations.""" - if self.mask_flag: - # Apply simple masking - attn_mask = self._create_prob_mask( - tf.shape(V)[0], tf.shape(V)[1], L_Q, index, scores + head_indices = tf.tile( + tf.expand_dims(tf.expand_dims(head_range, 0), 2), [B, 1, actual_k] ) - # Apply mask with large negative value - large_neg = -1e9 - mask_value = tf.where(attn_mask, 0.0, large_neg) - scores = scores + mask_value + # Stack indices for gather_nd + idx = tf.stack([batch_indices, head_indices, M_top], axis=-1) - # Softmax activation - attn = tf.nn.softmax(scores, axis=-1) - attn = self.dropout(attn) + # Reduce Q and calculate final Q_K + Q_reduce = tf.gather_nd(Q, idx) + 
K_transposed = tf.transpose(K, perm=[0, 1, 3, 2]) + Q_K = tf.matmul(Q_reduce, K_transposed) - # Create indices for scatter update - B, H = tf.shape(V)[0], tf.shape(V)[1] - index_shape = tf.shape(index)[-1] + return Q_K, M_top - batch_indices = tf.tile( - tf.expand_dims(tf.expand_dims(tf.range(B), 1), 2), [1, H, index_shape] - ) + def _get_initial_context(self, V, L_Q): + """Get initial context using Keras-compatible operations.""" + if not self.mask_flag: + # Sum reduction and broadcasting + V_sum = tf.reduce_sum(V, axis=-2) # [B, H, D] + V_sum_expanded = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] + context = tf.tile(V_sum_expanded, [1, 1, L_Q, 1]) # [B, H, L_Q, D] + else: + # Cumulative sum for masked attention + context = tf.cumsum(V, axis=-2) + + return context + + def _create_prob_mask(self, B, H, L, index, scores): + """Create probability mask for tf.where compatibility.""" + # Create base mask with ones + _mask = tf.ones((L, tf.shape(scores)[-1]), dtype=tf.float32) + + # Create upper triangular matrix (including diagonal) + mask_a = tf.linalg.band_part( + _mask, 0, -1 + ) # Upper triangular matrix of 0s and 1s + + # Create diagonal matrix + mask_b = tf.linalg.band_part(_mask, 0, 0) # Diagonal matrix of 0s and 1s + + # Subtract diagonal from upper triangular to get strict upper triangular + _mask = tf.cast(mask_a - mask_b, dtype=tf.float32) + + # Broadcast to [B, H, L, scores.shape[-1]] + _mask_ex = tf.broadcast_to(_mask, [B, H, L, tf.shape(scores)[-1]]) + + # Create indexing tensors + batch_indices = tf.range(B)[:, None, None] + head_indices = tf.range(H)[None, :, None] + + # Extract indicator using advanced indexing + indicator = tf.gather_nd( + _mask_ex, + tf.stack( + [ + tf.broadcast_to(batch_indices, tf.shape(index)), + tf.broadcast_to(head_indices, tf.shape(index)), + index, + ], + axis=-1, + ), + ) - head_indices = tf.tile( - tf.expand_dims(tf.expand_dims(tf.range(H), 0), 2), [B, 1, index_shape] - ) + # Reshape to match scores shape + prob_mask_float = tf.reshape(indicator, tf.shape(scores)) - idx = tf.stack([batch_indices, head_indices, index], axis=-1) + # **KEY FIX**: Convert to boolean tensor + prob_mask_bool = tf.cast(prob_mask_float, tf.bool) - # Matrix multiplication and scatter update - attn_V = tf.matmul(attn, V) - context_updated = tf.tensor_scatter_nd_update(context_in, idx, attn_V) + return prob_mask_bool - return context_updated + def _update_context(self, context_in, V, scores, index, L_Q): + """Update context using Keras-compatible operations.""" + if self.mask_flag: + # Apply simple masking + attn_mask = self._create_prob_mask( + tf.shape(V)[0], tf.shape(V)[1], L_Q, index, scores + ) - def call(self, inputs, attention_mask=None, training=None): - """Run forward pass with fixed tensor operations.""" - queries, keys, values = inputs + # Apply mask with large negative value + large_neg = -1e9 + mask_value = tf.where(attn_mask, 0.0, large_neg) + scores = scores + mask_value - # Get shapes - # B = tf.shape(queries)[0] - L = tf.shape(queries)[1] # sequence length - # H = tf.shape(queries)[2] # number of heads - D = tf.shape(queries)[3] # dimension per head - S = tf.shape(keys)[1] # source sequence length + # Softmax activation + attn = tf.nn.softmax(scores, axis=-1) + attn = self.dropout(attn) - # Reshape tensors - transpose to [B, H, L, D] - queries = tf.transpose(queries, perm=[0, 2, 1, 3]) # [B, H, L, D] - keys = tf.transpose(keys, perm=[0, 2, 1, 3]) # [B, H, S, D] - values = tf.transpose(values, perm=[0, 2, 1, 3]) # [B, H, S, D] + # Create indices for scatter 
update + B, H = tf.shape(V)[0], tf.shape(V)[1] + index_shape = tf.shape(index)[-1] - # Calculate sampling parameters with bounds checking - # Use tf.py_function to handle numpy operations safely - def safe_log_calc(seq_len, factor): - if hasattr(seq_len, "numpy"): - return int(factor * np.ceil(np.log(max(seq_len.numpy(), 2)))) - else: - return int(factor * np.ceil(np.log(20))) # fallback - - U = tf.py_function( - func=lambda: safe_log_calc(S, self.factor), inp=[], Tout=tf.int32 - ) - - u = tf.py_function( - func=lambda: safe_log_calc(L, self.factor), inp=[], Tout=tf.int32 - ) - - # Ensure U and u are within reasonable bounds - U = tf.minimum(U, S) # Can't select more than available - u = tf.minimum(u, L) - - # Probabilistic QK computation - scores_top, index = self._prob_QK(queries, keys, u, U) - - # Apply scale factor - scale = self.scale or (1.0 / tf.sqrt(tf.cast(D, tf.float32))) - scores_top = scores_top * scale - - # Get initial context - context = self._get_initial_context(values, L) - - # Update context with selected queries - context = self._update_context(context, values, scores_top, index, L) - - # Transpose back to original format [B, L, H, D] - context = tf.transpose(context, perm=[0, 2, 1, 3]) - - return context - - def get_config(self): - """Return the config of the layer.""" - config = super().get_config() - config.update( - { - "mask_flag": self.mask_flag, - "factor": self.factor, - "scale": self.scale, - "attention_dropout": self.attention_dropout, - } - ) - return config - - @classmethod - def from_config(cls, config): - """Create layer from config.""" - return cls(**config) - - -class AttentionLayer(Layer): - """Keras multi-head attention layer using a custom attention mechanism.""" - - def __init__( - self, attention, d_model, n_heads, d_keys=None, d_values=None, **kwargs - ): - super().__init__(**kwargs) - self.d_keys = d_keys or (d_model // n_heads) - self.d_values = d_values or (d_model // n_heads) - self.d_model = d_model - self.n_heads = n_heads - - # Store the attention mechanism - self.inner_attention = attention - - # Projection layers - self.query_projection = tf.keras.layers.Dense( - self.d_keys * n_heads, name="query_proj" - ) - - self.key_projection = tf.keras.layers.Dense( - self.d_keys * n_heads, name="key_proj" - ) - - self.value_projection = tf.keras.layers.Dense( - self.d_values * n_heads, name="value_proj" - ) - - self.out_projection = tf.keras.layers.Dense(d_model, name="output_proj") - - def build(self, input_shape): - """Build the layer.""" - # Build the projection layers - super().build(input_shape) - - def compute_output_shape(self, input_shape): - """Compute output shape for the layer.""" - # Output shape is same as queries input shape but with d_model as last dimension - batch_size, seq_length, _ = input_shape[0] - return (batch_size, seq_length, self.d_model) - - def call(self, inputs, attn_mask=None, training=None): - """Run forward pass for the attention layer.""" - queries, keys, values = inputs - - # Get batch size and sequence lengths dynamically - B = tf.shape(queries)[0] - L = tf.shape(queries)[1] # target sequence length - S = tf.shape(keys)[1] # source sequence length - H = self.n_heads - - # Apply projections - queries_proj = self.query_projection(queries) # [B, L, d_keys * n_heads] - keys_proj = self.key_projection(keys) # [B, S, d_keys * n_heads] - values_proj = self.value_projection(values) # [B, S, d_values * n_heads] - - # Reshape to multi-head format: [B, L/S, H, d_keys/d_values] - queries_reshaped = tf.reshape(queries_proj, (B, 
L, H, self.d_keys)) - keys_reshaped = tf.reshape(keys_proj, (B, S, H, self.d_keys)) - values_reshaped = tf.reshape(values_proj, (B, S, H, self.d_values)) - - # Apply inner attention mechanism - attention_output = self.inner_attention( - [queries_reshaped, keys_reshaped, values_reshaped], - attention_mask=attn_mask, - training=training, - ) - - # Reshape attention output back to [B, L, H * d_values] - attention_flattened = tf.reshape(attention_output, (B, L, H * self.d_values)) - - # Final output projection - output = self.out_projection(attention_flattened) - - return output - - def get_config(self): - """Return the config of the layer.""" - config = super().get_config() - config.update( - { - "d_model": self.d_model, - "n_heads": self.n_heads, - "d_keys": self.d_keys, - "d_values": self.d_values, - } - ) - return config + batch_indices = tf.tile( + tf.expand_dims(tf.expand_dims(tf.range(B), 1), 2), [1, H, index_shape] + ) + + head_indices = tf.tile( + tf.expand_dims(tf.expand_dims(tf.range(H), 0), 2), [B, 1, index_shape] + ) + + idx = tf.stack([batch_indices, head_indices, index], axis=-1) + + # Matrix multiplication and scatter update + attn_V = tf.matmul(attn, V) + context_updated = tf.tensor_scatter_nd_update(context_in, idx, attn_V) + + return context_updated + + def call(self, inputs, attention_mask=None, training=None): + """Run forward pass with fixed tensor operations.""" + queries, keys, values = inputs + + # Get shapes + # B = tf.shape(queries)[0] + L = tf.shape(queries)[1] # sequence length + # H = tf.shape(queries)[2] # number of heads + D = tf.shape(queries)[3] # dimension per head + S = tf.shape(keys)[1] # source sequence length + + # Reshape tensors - transpose to [B, H, L, D] + queries = tf.transpose(queries, perm=[0, 2, 1, 3]) # [B, H, L, D] + keys = tf.transpose(keys, perm=[0, 2, 1, 3]) # [B, H, S, D] + values = tf.transpose(values, perm=[0, 2, 1, 3]) # [B, H, S, D] + + # Calculate sampling parameters with bounds checking + # Use tf.py_function to handle numpy operations safely + def safe_log_calc(seq_len, factor): + if hasattr(seq_len, "numpy"): + return int(factor * np.ceil(np.log(max(seq_len.numpy(), 2)))) + else: + return int(factor * np.ceil(np.log(20))) # fallback + + U = tf.py_function( + func=lambda: safe_log_calc(S, self.factor), inp=[], Tout=tf.int32 + ) + + u = tf.py_function( + func=lambda: safe_log_calc(L, self.factor), inp=[], Tout=tf.int32 + ) + + # Ensure U and u are within reasonable bounds + U = tf.minimum(U, S) # Can't select more than available + u = tf.minimum(u, L) + + # Probabilistic QK computation + scores_top, index = self._prob_QK(queries, keys, u, U) + + # Apply scale factor + scale = self.scale or (1.0 / tf.sqrt(tf.cast(D, tf.float32))) + scores_top = scores_top * scale + + # Get initial context + context = self._get_initial_context(values, L) + + # Update context with selected queries + context = self._update_context(context, values, scores_top, index, L) + + # Transpose back to original format [B, L, H, D] + context = tf.transpose(context, perm=[0, 2, 1, 3]) + + return context + + def get_config(self): + """Return the config of the layer.""" + config = super().get_config() + config.update( + { + "mask_flag": self.mask_flag, + "factor": self.factor, + "scale": self.scale, + "attention_dropout": self.attention_dropout, + } + ) + return config + + @classmethod + def from_config(cls, config): + """Create layer from config.""" + return cls(**config) + + class AttentionLayer(Layer): + """Keras multi-head attention layer using a custom attention 
mechanism.""" + + def __init__( + self, attention, d_model, n_heads, d_keys=None, d_values=None, **kwargs + ): + super().__init__(**kwargs) + self.d_keys = d_keys or (d_model // n_heads) + self.d_values = d_values or (d_model // n_heads) + self.d_model = d_model + self.n_heads = n_heads + + # Store the attention mechanism + self.inner_attention = attention + + # Projection layers + self.query_projection = tf.keras.layers.Dense( + self.d_keys * n_heads, name="query_proj" + ) + + self.key_projection = tf.keras.layers.Dense( + self.d_keys * n_heads, name="key_proj" + ) + + self.value_projection = tf.keras.layers.Dense( + self.d_values * n_heads, name="value_proj" + ) + + self.out_projection = tf.keras.layers.Dense(d_model, name="output_proj") + + def build(self, input_shape): + """Build the layer.""" + # Build the projection layers + super().build(input_shape) + + def compute_output_shape(self, input_shape): + """Compute output shape for the layer.""" + batch_size, seq_length, _ = input_shape[0] + return (batch_size, seq_length, self.d_model) + + def call(self, inputs, attn_mask=None, training=None): + """Run forward pass for the attention layer.""" + queries, keys, values = inputs + + # Get batch size and sequence lengths dynamically + B = tf.shape(queries)[0] + L = tf.shape(queries)[1] # target sequence length + S = tf.shape(keys)[1] # source sequence length + H = self.n_heads + + # Apply projections + queries_proj = self.query_projection(queries) # [B, L, d_keys * n_heads] + keys_proj = self.key_projection(keys) # [B, S, d_keys * n_heads] + values_proj = self.value_projection(values) # [B, S, d_values * n_heads] + + # Reshape to multi-head format: [B, L/S, H, d_keys/d_values] + queries_reshaped = tf.reshape(queries_proj, (B, L, H, self.d_keys)) + keys_reshaped = tf.reshape(keys_proj, (B, S, H, self.d_keys)) + values_reshaped = tf.reshape(values_proj, (B, S, H, self.d_values)) + + # Apply inner attention mechanism + attention_output = self.inner_attention( + [queries_reshaped, keys_reshaped, values_reshaped], + attention_mask=attn_mask, + training=training, + ) + + # Reshape attention output back to [B, L, H * d_values] + attention_flattened = tf.reshape( + attention_output, (B, L, H * self.d_values) + ) + + # Final output projection + output = self.out_projection(attention_flattened) + + return output + + def get_config(self): + """Return the config of the layer.""" + config = super().get_config() + config.update( + { + "d_model": self.d_model, + "n_heads": self.n_heads, + "d_keys": self.d_keys, + "d_values": self.d_values, + } + ) + return config From b77165a38d01aefab5e3bb26d51ba35116e44716 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:18:58 +0530 Subject: [PATCH 05/10] attention layers made as serializable aeon package --- aeon/utils/networks/attention.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aeon/utils/networks/attention.py b/aeon/utils/networks/attention.py index 1a724c35ec..5201bc5134 100644 --- a/aeon/utils/networks/attention.py +++ b/aeon/utils/networks/attention.py @@ -7,6 +7,7 @@ import tensorflow as tf from tensorflow.keras.layers import Dropout, Layer + @tf.keras.utils.register_keras_serializable(package="aeon") class KerasProbAttention(Layer): """Keras implementation of ProbSparse Attention mechanism for Informer.""" @@ -258,6 +259,7 @@ def from_config(cls, config): """Create layer from config.""" return cls(**config) + @tf.keras.utils.register_keras_serializable(package="aeon") class AttentionLayer(Layer): """Keras multi-head attention 
layer using a custom attention mechanism.""" From c9b46637fec5fd04f6bbb2c212c844ee9c928305 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:20:47 +0530 Subject: [PATCH 06/10] informer net --- aeon/networks/_informer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index dfbf4b1bfc..b30b594c62 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -65,7 +65,7 @@ class InformerNetwork(BaseDeepLearningNetwork): _config = { "python_dependencies": ["tensorflow"], "python_version": "<3.13", - "structure": "encoder-decoder", + "structure": "auto-encoder", } def __init__( From 043da23490651a13e0083c94db684446c58f6764 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:39:41 +0530 Subject: [PATCH 07/10] core import check --- aeon/networks/_informer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index b30b594c62..99e64fcf77 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -5,15 +5,16 @@ from typing import Optional from aeon.networks.base import BaseDeepLearningNetwork -from aeon.utils.networks.attention import ( - AttentionLayer, - KerasProbAttention, -) from aeon.utils.validation._dependencies import _check_soft_dependencies if _check_soft_dependencies(["tensorflow"], severity="none"): import tensorflow as tf + from aeon.utils.networks.attention import ( + AttentionLayer, + KerasProbAttention, + ) + class InformerNetwork(BaseDeepLearningNetwork): """ From e18f73590338b509195e7e7d8d419155f328a924 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:42:23 +0530 Subject: [PATCH 08/10] check tf dep --- aeon/networks/_informer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index 99e64fcf77..00d79aaff5 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -8,7 +8,6 @@ from aeon.utils.validation._dependencies import _check_soft_dependencies if _check_soft_dependencies(["tensorflow"], severity="none"): - import tensorflow as tf from aeon.utils.networks.attention import ( AttentionLayer, @@ -69,6 +68,8 @@ class InformerNetwork(BaseDeepLearningNetwork): "structure": "auto-encoder", } + import tensorflow as tf + def __init__( self, seq_len: int = 96, From 2830627aaaab2acd598f4d4099453a815552ceea Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Tue, 22 Jul 2025 01:19:57 +0530 Subject: [PATCH 09/10] tests corrected --- aeon/networks/_informer.py | 230 +++++++++++++---------- aeon/networks/tests/test_all_networks.py | 5 + aeon/networks/tests/test_informer.py | 44 ----- 3 files changed, 134 insertions(+), 145 deletions(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index 00d79aaff5..f5b81c6ac3 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -2,7 +2,6 @@ __maintainer__ = [""] -from typing import Optional from aeon.networks.base import BaseDeepLearningNetwork from aeon.utils.validation._dependencies import _check_soft_dependencies @@ -65,11 +64,9 @@ class InformerNetwork(BaseDeepLearningNetwork): _config = { "python_dependencies": ["tensorflow"], "python_version": "<3.13", - "structure": "auto-encoder", + "structure": "transformer", } - import tensorflow as tf - def __init__( self, seq_len: int = 96, @@ -104,9 +101,7 @@ def __init__( super().__init__() - def _token_embedding( - self, input_tensor: 
tf.Tensor, c_in: int, d_model: int - ) -> tf.Tensor: + def _token_embedding(self, input_tensor, c_in, d_model): """ Token embedding layer using 1D convolution with causal padding. @@ -132,9 +127,7 @@ def _token_embedding( x = tf.keras.layers.LeakyReLU()(x) return x - def _positional_embedding( - self, input_tensor: tf.Tensor, d_model: int, max_len: int = 5000 - ) -> tf.Tensor: + def _positional_embedding(self, input_tensor, d_model, max_len=5000): """ Positional embedding layer that computes positional encodings. @@ -175,12 +168,12 @@ def _positional_embedding( def _data_embedding( self, - input_tensor: tf.Tensor, - c_in: int, - d_model: int, - dropout: float = 0.1, - max_len: int = 5000, - ) -> tf.Tensor: + input_tensor, + c_in, + d_model, + dropout=0.1, + max_len=5000, + ): """ Combine token and positional embeddings for the input tensor. @@ -218,7 +211,7 @@ def _data_embedding( return x - def _conv_layer(self, input_tensor: tf.Tensor, c_in: int) -> tf.Tensor: + def _conv_layer(self, input_tensor, c_in): """ Convolutional layer with batch normalization, ELU, and max pooling. @@ -254,15 +247,15 @@ def _conv_layer(self, input_tensor: tf.Tensor, c_in: int) -> tf.Tensor: def _attention_out( self, - input_tensor: tf.Tensor, - attention_type: str, - mask_flag: bool, - d_model: int, - n_heads: int, - factor: int = 5, - dropout: float = 0.1, - attn_mask: Optional[tf.Tensor] = None, - ) -> tf.Tensor: + input_tensor, + attention_type, + mask_flag, + d_model, + n_heads, + factor=5, + dropout=0.1, + attn_mask=None, + ): """ Attention output layer applying either ProbAttention or FullAttention. @@ -327,17 +320,17 @@ def _attention_out( def _encoder_layer( self, - input_tensor: tf.Tensor, - d_model: int, - d_ff: Optional[int] = None, - dropout: float = 0.1, - activation: str = "relu", - attn_mask: Optional[tf.Tensor] = None, - attention_type: str = "prob", - mask_flag: bool = True, - n_heads: int = 8, - factor: int = 5, - ) -> tf.Tensor: + input_tensor, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + attn_mask=None, + attention_type="prob", + mask_flag=True, + n_heads=8, + factor=5, + ): """ Apply encoder layer with multi-head attention and feed-forward network. @@ -415,21 +408,21 @@ def _encoder_layer( def _encoder( self, - input_tensor: tf.Tensor, - e_layers: int, - d_model: int, - d_ff: Optional[int] = None, - dropout: float = 0.1, - activation: str = "relu", - attn_mask: Optional[tf.Tensor] = None, - attention_type: str = "prob", - mask_flag: bool = True, - n_heads: int = 8, - factor: int = 5, - use_conv_layers: bool = False, - c_in: Optional[int] = None, - use_norm: bool = True, - ) -> tf.Tensor: + input_tensor, + e_layers, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + attn_mask=None, + attention_type="prob", + mask_flag=True, + n_heads=8, + factor=5, + use_conv_layers=False, + c_in=None, + use_norm=True, + ): """ Apply encoder stack with multiple encoder layers and optional conv layers. 
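# A minimal standalone sketch (illustrative helper, not part of this module) of the
# ProbSparse sampling sizes used by the attention code above: for query length L_Q and
# key length L_K, roughly factor * ceil(ln(L)) keys are sampled and the same number of
# "active" queries are kept, which is what reduces the attention cost from O(L^2)
# toward O(L log L). Names below (prob_sparse_sizes, U_part, u) are illustrative only.
import numpy as np

def prob_sparse_sizes(L_Q, L_K, factor=5):
    U_part = min(int(factor * np.ceil(np.log(L_K))), L_K)  # number of sampled keys
    u = min(int(factor * np.ceil(np.log(L_Q))), L_Q)       # number of retained "top" queries
    return U_part, u

print(prob_sparse_sizes(96, 96))  # with the default seq_len=96 and factor=5 -> (25, 25)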
@@ -539,20 +532,20 @@ def _encoder( def _decoder_layer( self, - input_tensor: tf.Tensor, - cross_tensor: tf.Tensor, - d_model: int, - d_ff: Optional[int] = None, - dropout: float = 0.1, - activation: str = "relu", - x_mask: Optional[tf.Tensor] = None, - cross_mask: Optional[tf.Tensor] = None, - self_attention_type: str = "prob", - cross_attention_type: str = "prob", - mask_flag: bool = True, - n_heads: int = 8, - factor: int = 5, - ) -> tf.Tensor: + input_tensor, + cross_tensor, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + x_mask=None, + cross_mask=None, + self_attention_type="prob", + cross_attention_type="prob", + mask_flag=True, + n_heads=8, + factor=5, + ): """ Apply decoder layer with self-attention, cross-attention, and FFN. @@ -661,22 +654,22 @@ def _decoder_layer( def _decoder( self, - input_tensor: tf.Tensor, - cross_tensor: tf.Tensor, - d_layers: int, - d_model: int, - d_ff: Optional[int] = None, - dropout: float = 0.1, - activation: str = "relu", - x_mask: Optional[tf.Tensor] = None, - cross_mask: Optional[tf.Tensor] = None, - self_attention_type: str = "prob", - cross_attention_type: str = "prob", - mask_flag: bool = True, - n_heads: int = 8, - factor: int = 5, - use_norm: bool = True, - ) -> tf.Tensor: + input_tensor, + cross_tensor, + d_layers, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + x_mask=None, + cross_mask=None, + self_attention_type="prob", + cross_attention_type="prob", + mask_flag=True, + n_heads=8, + factor=5, + use_norm=True, + ): """ Apply decoder stack with multiple decoder layers and optional normalization. @@ -746,29 +739,67 @@ def _decoder( return x - def build_network( - self, input_shape: tuple[int, int], **kwargs - ) -> tuple[list[tf.Tensor], tf.Tensor]: + def _preprocess_time_series(self, data, seq_len, label_len, pred_len): + """ + Preprocess time series data of shape (None, n_timepoints, n_channels). 
+ + Parameters + ---------- + data : tf.Tensor + Input tensor of shape (None, n_timepoints, n_channels) + seq_len : int + Encoder input sequence length + label_len : int + Known decoder input length + pred_len : int + Prediction length + + Returns + ------- + tuple + (x_enc, x_dec) where: + - x_enc: Encoder input tensor of shape (None, seq_len, n_channels) + - x_dec: Decoder input tensor of shape (None, label_len + pred_len + , n_channels) + """ + import tensorflow as tf + + # Get tensor dimensions - handle None batch dimension + batch_size, n_timepoints, n_channels = data.shape + + # Encoder input: first seq_len timepoints + x_enc = data[:, :seq_len, :] # (None, seq_len, n_channels) + + # Decoder input construction + x_dec_known = data[ + :, seq_len - label_len : seq_len, : + ] # (None, label_len, n_channels) + + # Unknown part: zeros for prediction horizon + x_dec_pred = data[:, :pred_len, :] + + # Concatenate known and prediction parts + x_dec = tf.keras.layers.Concatenate(axis=1)([x_dec_known, x_dec_pred]) + + return x_enc, x_dec + + def build_network(self, input_shape, **kwargs): """Build the complete Informer architecture for time series forecasting.""" import tensorflow as tf # Get input dimensions n_timepoints, n_channels = input_shape - # hardcode batch_size for now - batch_size = 32 - - # Create input layers for encoder and decoder - encoder_input = tf.keras.layers.Input( - shape=(self.seq_len, n_channels), - name="encoder_input", - batch_size=batch_size, + input_data = tf.keras.layers.Input( + shape=input_shape, + name="time_series_input", ) - decoder_input = tf.keras.layers.Input( - shape=(self.label_len + self.out_len, n_channels), - name="decoder_input", - batch_size=batch_size, + encoder_input, decoder_input = self._preprocess_time_series( + data=input_data, + seq_len=self.seq_len, + label_len=self.label_len, + pred_len=self.out_len, ) # Encoder embedding @@ -829,7 +860,4 @@ def build_network( # Extract only the prediction part (last out_len timesteps) output = output[:, -self.out_len :, :] - # Create the model with both encoder and decoder inputs - inputs = [encoder_input, decoder_input] - - return inputs, output + return input_data, output diff --git a/aeon/networks/tests/test_all_networks.py b/aeon/networks/tests/test_all_networks.py index 9ca85474fb..924e1f2623 100644 --- a/aeon/networks/tests/test_all_networks.py +++ b/aeon/networks/tests/test_all_networks.py @@ -75,6 +75,11 @@ def test_all_networks_params(network): f"{network.__name__} not to be tested (AE networks have their own tests)." ) + if network._config["structure"] == "transformer": + pytest.skip( + f"{network.__name__} not to be tested (transformers have their own tests)." 
+ ) + if not ( _check_soft_dependencies( network._config["python_dependencies"], severity="none" diff --git a/aeon/networks/tests/test_informer.py b/aeon/networks/tests/test_informer.py index 9d5be59351..9e472f77f8 100644 --- a/aeon/networks/tests/test_informer.py +++ b/aeon/networks/tests/test_informer.py @@ -48,7 +48,6 @@ def test_informer_network_init( inputs, outputs = informer.build_network((seq_len + label_len, 5)) assert inputs is not None assert outputs is not None - assert len(inputs) == 2 # encoder_input and decoder_input @pytest.mark.skipif( @@ -106,49 +105,6 @@ def test_informer_network_distil_mix_factor(distil, mix, factor): assert outputs is not None -@pytest.mark.skipif( - not _check_soft_dependencies(["tensorflow"], severity="none"), - reason="Tensorflow soft dependency unavailable.", -) -def test_informer_network_output_shape(): - """Test whether InformerNetwork produces correct output shapes.""" - seq_len = 96 - label_len = 48 - out_len = 24 - n_channels = 5 - # batch_size = 32 - - informer = InformerNetwork( - seq_len=seq_len, - label_len=label_len, - out_len=out_len, - d_model=128, - n_heads=4, - e_layers=2, - d_layers=1, - ) - - inputs, outputs = informer.build_network((seq_len + label_len, n_channels)) - - # Create a TensorFlow model to test actual shapes - if _check_soft_dependencies(["tensorflow"], severity="none"): - # keras_model = tf.keras.Model(inputs=inputs, outputs=outputs) - - # Test input shapes - encoder_input_shape = inputs[0].shape - decoder_input_shape = inputs[1].shape - - assert encoder_input_shape[1] == seq_len # sequence length - assert encoder_input_shape[2] == n_channels # number of channels - assert decoder_input_shape[1] == label_len + out_len # decoder sequence length - assert decoder_input_shape[2] == n_channels # number of channels - - # Test output shape - output_shape = outputs.shape - assert output_shape[1] == out_len # prediction length - assert output_shape[2] == n_channels # number of channels - - @pytest.mark.skipif( not _check_soft_dependencies(["tensorflow"], severity="none"), reason="Tensorflow soft dependency unavailable.", From d86771f80a98a0fb41571b421f536cccd3c2dd94 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Tue, 22 Jul 2025 01:56:23 +0530 Subject: [PATCH 10/10] informer updated --- aeon/networks/_informer.py | 284 ++++++++++++++------------- aeon/networks/tests/test_informer.py | 115 +++++------ 2 files changed, 203 insertions(+), 196 deletions(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index f5b81c6ac3..f02c058afa 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -24,27 +24,27 @@ class InformerNetwork(BaseDeepLearningNetwork): Parameters ---------- - seq_len : int, default=96 - Input sequence length. - label_len : int, default=48 + encoder_input_len : int, default=96 + Encoder input sequence length. + decoder_input_len : int, default=48 Start token length for decoder. - out_len : int, default=24 + prediction_horizon : int, default=24 Prediction sequence length. factor : int, default=5 ProbSparse attention factor. - d_model : int, default=512 + model_dimension : int, default=512 Model dimension. - n_heads : int, default=8 + num_attention_heads : int, default=8 Number of attention heads. - e_layers : int, default=3 + encoder_layers : int, default=3 Number of encoder layers. - d_layers : int, default=2 + decoder_layers : int, default=2 Number of decoder layers. - d_ff : int, default=512 + feedforward_dim : int, default=512 Feed forward network dimension. 
dropout : float, default=0.0 Dropout rate. - attn : str, default='prob' + attention_type : str, default='prob' Attention mechanism type ('prob' or 'full'). activation : str, default='gelu' Activation function. @@ -69,39 +69,39 @@ class InformerNetwork(BaseDeepLearningNetwork): def __init__( self, - seq_len: int = 96, - label_len: int = 48, - out_len: int = 24, + encoder_input_len: int = 96, + decoder_input_len: int = 48, + prediction_horizon: int = 24, factor: int = 5, - d_model: int = 512, - n_heads: int = 8, - e_layers: int = 3, - d_layers: int = 2, - d_ff: int = 512, + model_dimension: int = 512, + num_attention_heads: int = 8, + encoder_layers: int = 3, + decoder_layers: int = 2, + feedforward_dim: int = 512, dropout: float = 0.0, - attn: str = "prob", + attention_type: str = "prob", activation: str = "gelu", distil: bool = True, mix: bool = True, ): - self.seq_len = seq_len - self.label_len = label_len - self.out_len = out_len + self.encoder_input_len = encoder_input_len + self.decoder_input_len = decoder_input_len + self.prediction_horizon = prediction_horizon self.factor = factor - self.d_model = d_model - self.n_heads = n_heads - self.e_layers = e_layers - self.d_layers = d_layers - self.d_ff = d_ff + self.model_dimension = model_dimension + self.num_attention_heads = num_attention_heads + self.encoder_layers = encoder_layers + self.decoder_layers = decoder_layers + self.feedforward_dim = feedforward_dim self.dropout = dropout - self.attn = attn + self.attention_type = attention_type self.activation = activation self.distil = distil self.mix = mix super().__init__() - def _token_embedding(self, input_tensor, c_in, d_model): + def _token_embedding(self, input_tensor, c_in, model_dimension): """ Token embedding layer using 1D convolution with causal padding. @@ -111,7 +111,7 @@ def _token_embedding(self, input_tensor, c_in, d_model): Input tensor to be processed. c_in : int Number of input channels. - d_model : int + model_dimension : int Dimension of the model (number of output filters). Returns @@ -122,12 +122,15 @@ def _token_embedding(self, input_tensor, c_in, d_model): import tensorflow as tf x = tf.keras.layers.Conv1D( - filters=d_model, kernel_size=3, padding="causal", activation="linear" + filters=model_dimension, + kernel_size=3, + padding="causal", + activation="linear", )(input_tensor) x = tf.keras.layers.LeakyReLU()(x) return x - def _positional_embedding(self, input_tensor, d_model, max_len=5000): + def _positional_embedding(self, input_tensor, model_dimension, max_len=5000): """ Positional embedding layer that computes positional encodings. @@ -135,7 +138,7 @@ def _positional_embedding(self, input_tensor, d_model, max_len=5000): ---------- input_tensor : tf.Tensor Input tensor to get positional embeddings for. - d_model : int + model_dimension : int Dimension of the model. 
max_len : int, optional Maximum length of the sequence, by default 5000 @@ -151,10 +154,11 @@ def _positional_embedding(self, input_tensor, d_model, max_len=5000): import tensorflow as tf # Compute the positional encodings - pe = np.zeros((max_len, d_model), dtype=np.float32) + pe = np.zeros((max_len, model_dimension), dtype=np.float32) position = np.expand_dims(np.arange(0, max_len, dtype=np.float32), 1) div_term = np.exp( - np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) + np.arange(0, model_dimension, 2, dtype=np.float32) + * -(math.log(10000.0) / model_dimension) ) pe[:, 0::2] = np.sin(position * div_term) @@ -170,7 +174,7 @@ def _data_embedding( self, input_tensor, c_in, - d_model, + model_dimension, dropout=0.1, max_len=5000, ): @@ -183,7 +187,7 @@ def _data_embedding( Input tensor to be processed. c_in : int Number of input channels. - d_model : int + model_dimension : int Dimension of the model (number of output filters). dropout : float, optional Dropout rate, by default 0.1 @@ -198,10 +202,10 @@ def _data_embedding( import tensorflow as tf # Get token embeddings - token_emb = self._token_embedding(input_tensor, c_in, d_model) + token_emb = self._token_embedding(input_tensor, c_in, model_dimension) # Get positional embeddings - pos_emb = self._positional_embedding(input_tensor, d_model, max_len) + pos_emb = self._positional_embedding(input_tensor, model_dimension, max_len) # Combine embeddings x = token_emb + pos_emb @@ -250,8 +254,8 @@ def _attention_out( input_tensor, attention_type, mask_flag, - d_model, - n_heads, + model_dimension, + num_attention_heads, factor=5, dropout=0.1, attn_mask=None, @@ -267,9 +271,9 @@ def _attention_out( Type of attention mechanism ('prob' or 'full'). mask_flag : bool Whether to use attention masking. - d_model : int + model_dimension : int Model dimension. - n_heads : int + num_attention_heads : int Number of attention heads. factor : int, optional Attention factor for ProbSparse attention, by default 5 @@ -294,18 +298,18 @@ def _attention_out( output = AttentionLayer( attention=prob_attention, - d_model=d_model, - n_heads=n_heads, - d_keys=d_model // n_heads, # 512 // 8 = 64 - d_values=d_model // n_heads, # 512 // 8 = 64 + d_model=model_dimension, + n_heads=num_attention_heads, + d_keys=model_dimension // num_attention_heads, # 512 // 8 = 64 + d_values=model_dimension // num_attention_heads, # 512 // 8 = 64 )(input_tensor, attn_mask=attn_mask) else: queries, keys, values = input_tensor output = tf.keras.layers.MultiHeadAttention( - num_heads=n_heads, # 8 - key_dim=d_model // n_heads, # 512 // 8 = 64 - value_dim=d_model // n_heads, # 512 // 8 = 64 + num_heads=num_attention_heads, # 8 + key_dim=model_dimension // num_attention_heads, # 512 // 8 = 64 + value_dim=model_dimension // num_attention_heads, # 512 // 8 = 64 dropout=dropout, use_bias=True, )( @@ -321,14 +325,14 @@ def _attention_out( def _encoder_layer( self, input_tensor, - d_model, - d_ff=None, + model_dimension, + feedforward_dim=None, dropout=0.1, activation="relu", attn_mask=None, attention_type="prob", mask_flag=True, - n_heads=8, + num_attention_heads=8, factor=5, ): """ @@ -339,9 +343,9 @@ def _encoder_layer( input_tensor : tf.Tensor Input tensor of shape [B, L, D] where B is batch size, L is sequence length, D is model dimension. - d_model : int + model_dimension : int Model dimension (must match input tensor's last dimension). 
- d_ff : int, optional + feedforward_dim : int, optional Feed-forward network dimension dropout : float, optional Dropout rate, by default 0.1 @@ -357,17 +361,17 @@ def _encoder_layer( """ import tensorflow as tf - # Set default d_ff if not provided - if d_ff is None: - d_ff = 4 * d_model + # Set default feedforward_dim if not provided + if feedforward_dim is None: + feedforward_dim = 4 * model_dimension # Self-attention using the _attention_out function with parameters attn_output = self._attention_out( input_tensor=[input_tensor, input_tensor, input_tensor], attention_type=attention_type, mask_flag=mask_flag, - d_model=d_model, - n_heads=n_heads, + model_dimension=model_dimension, + num_attention_heads=num_attention_heads, factor=factor, dropout=dropout, attn_mask=attn_mask, @@ -384,7 +388,7 @@ def _encoder_layer( # Feed-forward network # First 1D convolution (expansion) - y = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)(x) + y = tf.keras.layers.Conv1D(filters=feedforward_dim, kernel_size=1)(x) # Apply activation function if activation == "relu": @@ -396,7 +400,7 @@ def _encoder_layer( y = tf.keras.layers.Dropout(dropout)(y) # Second 1D convolution (compression back to d_model) - y = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)(y) + y = tf.keras.layers.Conv1D(filters=model_dimension, kernel_size=1)(y) # Apply dropout y = tf.keras.layers.Dropout(dropout)(y) @@ -409,15 +413,15 @@ def _encoder_layer( def _encoder( self, input_tensor, - e_layers, - d_model, - d_ff=None, + encoder_layers, + model_dimension, + feedforward_dim=None, dropout=0.1, activation="relu", attn_mask=None, attention_type="prob", mask_flag=True, - n_heads=8, + num_attention_heads=8, factor=5, use_conv_layers=False, c_in=None, @@ -430,11 +434,11 @@ def _encoder( ---------- input_tensor : tf.Tensor Input tensor of shape [B, L, D] - e_layers : int + encoder_layers : int Number of encoder layers to stack. - d_model : int + model_dimension : int Model dimension (must match input tensor's last dimension). 
- d_ff : int, optional + feedforward_dim : int, optional Feed-forward network dimension dropout : float, optional Dropout rate, by default 0.1 @@ -446,7 +450,7 @@ def _encoder( Type of attention mechanism ('prob' or 'full') mask_flag : bool, optional Whether to use attention masking, by default True - n_heads : int, optional + num_attention_heads : int, optional Number of attention heads, by default 8 factor : int, optional Attention factor for ProbSparse attention, by default 5 @@ -466,25 +470,25 @@ def _encoder( # Set default values if c_in is None: - c_in = d_model + c_in = model_dimension x = input_tensor # Apply encoder layers with optional convolutional layers if use_conv_layers: # Apply paired encoder and conv layers - for _ in range(e_layers - 1): + for _ in range(encoder_layers - 1): # Apply encoder layer x = self._encoder_layer( input_tensor=x, - d_model=d_model, - d_ff=d_ff, + model_dimension=model_dimension, + feedforward_dim=feedforward_dim, dropout=dropout, activation=activation, attn_mask=attn_mask, attention_type=attention_type, mask_flag=mask_flag, - n_heads=n_heads, + num_attention_heads=num_attention_heads, factor=factor, ) @@ -497,30 +501,30 @@ def _encoder( # Apply final encoder layer (without conv layer) x = self._encoder_layer( input_tensor=x, - d_model=d_model, - d_ff=d_ff, + model_dimension=model_dimension, + feedforward_dim=feedforward_dim, dropout=dropout, activation=activation, attn_mask=attn_mask, attention_type=attention_type, mask_flag=mask_flag, - n_heads=n_heads, + num_attention_heads=num_attention_heads, factor=factor, ) else: # Apply only encoder layers without convolutional layers - for _ in range(e_layers): + for _ in range(encoder_layers): x = self._encoder_layer( input_tensor=x, - d_model=d_model, - d_ff=d_ff, + model_dimension=model_dimension, + feedforward_dim=feedforward_dim, dropout=dropout, activation=activation, attn_mask=attn_mask, attention_type=attention_type, mask_flag=mask_flag, - n_heads=n_heads, + num_attention_heads=num_attention_heads, factor=factor, ) @@ -534,8 +538,8 @@ def _decoder_layer( self, input_tensor, cross_tensor, - d_model, - d_ff=None, + model_dimension, + feedforward_dim=None, dropout=0.1, activation="relu", x_mask=None, @@ -543,7 +547,7 @@ def _decoder_layer( self_attention_type="prob", cross_attention_type="prob", mask_flag=True, - n_heads=8, + num_attention_heads=8, factor=5, ): """ @@ -555,9 +559,9 @@ def _decoder_layer( Input tensor of shape [B, L, D] cross_tensor : tf.Tensor Cross-attention input tensor (encoder output) of shape [B, L_enc, D] - d_model : int + model_dimension : int Model dimension (must match input tensor's last dimension). 
- d_ff : int, optional + feedforward_dim : int, optional Feed-forward network dimension dropout : float, optional Dropout rate, by default 0.1 @@ -573,7 +577,7 @@ def _decoder_layer( Type of cross-attention mechanism ('prob' or 'full') mask_flag : bool, optional Whether to use attention masking, by default True - n_heads : int, optional + num_attention_heads : int, optional Number of attention heads, by default 8 factor : int, optional Attention factor for ProbSparse attention, by default 5 @@ -585,17 +589,17 @@ def _decoder_layer( """ import tensorflow as tf - # Set default d_ff if not provided - if d_ff is None: - d_ff = 4 * d_model + # Set default feedforward_dim if not provided + if feedforward_dim is None: + feedforward_dim = 4 * model_dimension # Self-attention block self_attn_output = self._attention_out( input_tensor=[input_tensor, input_tensor, input_tensor], attention_type=self_attention_type, mask_flag=mask_flag, - d_model=d_model, - n_heads=n_heads, + model_dimension=model_dimension, + num_attention_heads=num_attention_heads, factor=factor, dropout=dropout, attn_mask=x_mask, @@ -612,8 +616,8 @@ def _decoder_layer( input_tensor=[x, cross_tensor, cross_tensor], attention_type=cross_attention_type, mask_flag=mask_flag, - d_model=d_model, - n_heads=n_heads, + model_dimension=model_dimension, + num_attention_heads=num_attention_heads, factor=factor, dropout=dropout, attn_mask=cross_mask, @@ -630,7 +634,7 @@ def _decoder_layer( # Feed-forward network # First 1D convolution (expansion) - y = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)(x) + y = tf.keras.layers.Conv1D(filters=feedforward_dim, kernel_size=1)(x) # Apply activation function if activation == "relu": @@ -642,7 +646,7 @@ def _decoder_layer( y = tf.keras.layers.Dropout(dropout)(y) # Second 1D convolution (compression back to d_model) - y = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)(y) + y = tf.keras.layers.Conv1D(filters=model_dimension, kernel_size=1)(y) # Apply dropout y = tf.keras.layers.Dropout(dropout)(y) @@ -656,9 +660,9 @@ def _decoder( self, input_tensor, cross_tensor, - d_layers, - d_model, - d_ff=None, + decoder_layers, + model_dimension, + feedforward_dim=None, dropout=0.1, activation="relu", x_mask=None, @@ -666,7 +670,7 @@ def _decoder( self_attention_type="prob", cross_attention_type="prob", mask_flag=True, - n_heads=8, + num_attention_heads=8, factor=5, use_norm=True, ): @@ -679,11 +683,11 @@ def _decoder( Decoder input tensor of shape [B, L_dec, D] cross_tensor : tf.Tensor Cross-attention input tensor (encoder output) of shape [B, L_enc, D] - d_layers : int + decoder_layers : int Number of decoder layers to stack. - d_model : int + model_dimension : int Model dimension (must match input tensor's last dimension). 
- d_ff : int, optional + feedforward_dim : int, optional Feed-forward network dimension dropout : float, optional Dropout rate, by default 0.1 @@ -699,7 +703,7 @@ def _decoder( Type of cross-attention mechanism ('prob' or 'full') mask_flag : bool, optional Whether to use attention masking, by default True - n_heads : int, optional + num_attention_heads : int, optional Number of attention heads, by default 8 factor : int, optional Attention factor for ProbSparse attention, by default 5 @@ -716,12 +720,12 @@ def _decoder( x = input_tensor # Apply multiple decoder layers - for _ in range(d_layers): + for _ in range(decoder_layers): x = self._decoder_layer( input_tensor=x, cross_tensor=cross_tensor, - d_model=d_model, - d_ff=d_ff, + model_dimension=model_dimension, + feedforward_dim=feedforward_dim, dropout=dropout, activation=activation, x_mask=x_mask, @@ -729,7 +733,7 @@ def _decoder( self_attention_type=self_attention_type, cross_attention_type=cross_attention_type, mask_flag=mask_flag, - n_heads=n_heads, + num_attention_heads=num_attention_heads, factor=factor, ) @@ -739,7 +743,9 @@ def _decoder( return x - def _preprocess_time_series(self, data, seq_len, label_len, pred_len): + def _preprocess_time_series( + self, data, encoder_input_len, decoder_input_len, prediction_horizon + ): """ Preprocess time series data of shape (None, n_timepoints, n_channels). @@ -747,20 +753,20 @@ def _preprocess_time_series(self, data, seq_len, label_len, pred_len): ---------- data : tf.Tensor Input tensor of shape (None, n_timepoints, n_channels) - seq_len : int + encoder_input_len : int Encoder input sequence length - label_len : int + decoder_input_len : int Known decoder input length - pred_len : int + prediction_horizon : int Prediction length Returns ------- tuple (x_enc, x_dec) where: - - x_enc: Encoder input tensor of shape (None, seq_len, n_channels) - - x_dec: Decoder input tensor of shape (None, label_len + pred_len - , n_channels) + - x_enc: Encoder input tensor of shape (None, encoder_input_len, n_channels) + - x_dec: Decoder input tensor of shape (None, + decoder_input_len + prediction_horizon, n_channels) """ import tensorflow as tf @@ -768,15 +774,15 @@ def _preprocess_time_series(self, data, seq_len, label_len, pred_len): batch_size, n_timepoints, n_channels = data.shape # Encoder input: first seq_len timepoints - x_enc = data[:, :seq_len, :] # (None, seq_len, n_channels) + x_enc = data[:, :encoder_input_len, :] # (None, encoder_input_len, n_channels) # Decoder input construction x_dec_known = data[ - :, seq_len - label_len : seq_len, : - ] # (None, label_len, n_channels) + :, encoder_input_len - decoder_input_len : encoder_input_len, : + ] # (None, decoder_input_len, n_channels) # Unknown part: zeros for prediction horizon - x_dec_pred = data[:, :pred_len, :] + x_dec_pred = data[:, :prediction_horizon, :] # Concatenate known and prediction parts x_dec = tf.keras.layers.Concatenate(axis=1)([x_dec_known, x_dec_pred]) @@ -797,34 +803,34 @@ def build_network(self, input_shape, **kwargs): encoder_input, decoder_input = self._preprocess_time_series( data=input_data, - seq_len=self.seq_len, - label_len=self.label_len, - pred_len=self.out_len, + encoder_input_len=self.encoder_input_len, + decoder_input_len=self.decoder_input_len, + prediction_horizon=self.prediction_horizon, ) # Encoder embedding enc_embedded = self._data_embedding( input_tensor=encoder_input, c_in=n_channels, - d_model=self.d_model, + model_dimension=self.model_dimension, dropout=self.dropout, - max_len=self.seq_len, + 
max_len=self.encoder_input_len, ) # Encoder processing enc_output = self._encoder( input_tensor=enc_embedded, - e_layers=self.e_layers, - d_model=self.d_model, - d_ff=self.d_ff, + encoder_layers=self.encoder_layers, + model_dimension=self.model_dimension, + feedforward_dim=self.feedforward_dim, dropout=self.dropout, activation=self.activation, - attention_type=self.attn, + attention_type=self.attention_type, mask_flag=False, - n_heads=self.n_heads, + num_attention_heads=self.num_attention_heads, factor=self.factor, use_conv_layers=self.distil, - c_in=self.d_model, + c_in=self.model_dimension, use_norm=True, ) @@ -832,24 +838,24 @@ def build_network(self, input_shape, **kwargs): dec_embedded = self._data_embedding( input_tensor=decoder_input, c_in=n_channels, - d_model=self.d_model, + model_dimension=self.model_dimension, dropout=self.dropout, - max_len=self.label_len + self.out_len, + max_len=self.decoder_input_len + self.prediction_horizon, ) # Decoder processing dec_output = self._decoder( input_tensor=dec_embedded, cross_tensor=enc_output, - d_layers=self.d_layers, - d_model=self.d_model, - d_ff=self.d_ff, + decoder_layers=self.decoder_layers, + model_dimension=self.model_dimension, + feedforward_dim=self.feedforward_dim, dropout=self.dropout, activation=self.activation, - self_attention_type=self.attn, + self_attention_type=self.attention_type, cross_attention_type="full", mask_flag=self.mix, - n_heads=self.n_heads, + num_attention_heads=self.num_attention_heads, factor=self.factor, use_norm=True, ) @@ -858,6 +864,6 @@ def build_network(self, input_shape, **kwargs): output = tf.keras.layers.Dense(n_channels, name="output_projection")(dec_output) # Extract only the prediction part (last out_len timesteps) - output = output[:, -self.out_len :, :] + output = output[:, -self.prediction_horizon :, :] return input_data, output diff --git a/aeon/networks/tests/test_informer.py b/aeon/networks/tests/test_informer.py index 9e472f77f8..8c59fa30d5 100644 --- a/aeon/networks/tests/test_informer.py +++ b/aeon/networks/tests/test_informer.py @@ -13,7 +13,9 @@ reason="Tensorflow soft dependency unavailable.", ) @pytest.mark.parametrize( - "seq_len,label_len,out_len,d_model,n_heads,e_layers,d_layers", + "encoder_input_len,decoder_input_len," + "prediction_horizon,model_dimension,num_attention_heads," + "encoder_layers,decoder_layers", [ (96, 48, 24, 512, 8, 3, 2), (48, 24, 12, 256, 4, 2, 1), @@ -22,30 +24,30 @@ ], ) def test_informer_network_init( - seq_len, - label_len, - out_len, - d_model, - n_heads, - e_layers, - d_layers, + encoder_input_len, + decoder_input_len, + prediction_horizon, + model_dimension, + num_attention_heads, + encoder_layers, + decoder_layers, ): """Test whether InformerNetwork initializes correctly for various parameters.""" informer = InformerNetwork( - seq_len=seq_len, - label_len=label_len, - out_len=out_len, - d_model=d_model, - n_heads=n_heads, - e_layers=e_layers, - d_layers=d_layers, + encoder_input_len=encoder_input_len, + decoder_input_len=decoder_input_len, + prediction_horizon=prediction_horizon, + model_dimension=model_dimension, + num_attention_heads=num_attention_heads, + encoder_layers=encoder_layers, + decoder_layers=decoder_layers, factor=random.choice([3, 5, 7]), dropout=random.choice([0.0, 0.1, 0.2]), - attn=random.choice(["prob", "full"]), + attention_type=random.choice(["prob", "full"]), activation=random.choice(["relu", "gelu"]), ) - inputs, outputs = informer.build_network((seq_len + label_len, 5)) + inputs, outputs = 
informer.build_network((encoder_input_len + decoder_input_len, 5)) assert inputs is not None assert outputs is not None @@ -55,20 +57,20 @@ def test_informer_network_init( reason="Tensorflow soft dependency unavailable.", ) @pytest.mark.parametrize( - "attn,activation", + "attention_type,activation", [("prob", "relu"), ("full", "gelu"), ("prob", "gelu"), ("full", "relu")], ) -def test_informer_network_attention_activation(attn, activation): +def test_informer_network_attention_activation(attention_type, activation): """Test InformerNetwork with different attention and activation.""" informer = InformerNetwork( - seq_len=96, - label_len=48, - out_len=24, - d_model=128, - n_heads=4, - e_layers=2, - d_layers=1, - attn=attn, + encoder_input_len=96, + decoder_input_len=48, + prediction_horizon=24, + model_dimension=128, + num_attention_heads=4, + encoder_layers=2, + decoder_layers=1, + attention_type=attention_type, activation=activation, ) @@ -88,13 +90,13 @@ def test_informer_network_attention_activation(attn, activation): def test_informer_network_distil_mix_factor(distil, mix, factor): """Test whether InformerNetwork works with different configurations.""" informer = InformerNetwork( - seq_len=48, - label_len=24, - out_len=12, - d_model=64, - n_heads=2, - e_layers=1, - d_layers=1, + encoder_input_len=48, + decoder_input_len=24, + prediction_horizon=12, + model_dimension=64, + num_attention_heads=2, + encoder_layers=1, + decoder_layers=1, distil=distil, mix=mix, factor=factor, @@ -118,14 +120,14 @@ def test_informer_network_default_parameters(): assert outputs is not None # Check default values - assert informer.seq_len == 96 - assert informer.label_len == 48 - assert informer.out_len == 24 - assert informer.d_model == 512 - assert informer.n_heads == 8 - assert informer.e_layers == 3 - assert informer.d_layers == 2 - assert informer.attn == "prob" + assert informer.encoder_input_len == 96 + assert informer.decoder_input_len == 48 + assert informer.prediction_horizon == 24 + assert informer.model_dimension == 512 + assert informer.num_attention_heads == 8 + assert informer.encoder_layers == 3 + assert informer.decoder_layers == 2 + assert informer.attention_type == "prob" assert informer.activation == "gelu" assert informer.distil assert informer.mix @@ -137,15 +139,14 @@ def test_informer_network_default_parameters(): ) def test_informer_network_parameter_validation(): """Test whether InformerNetwork handles edge case parameters correctly.""" - # Test minimum viable configuration informer = InformerNetwork( - seq_len=12, - label_len=6, - out_len=3, - d_model=32, - n_heads=1, - e_layers=1, - d_layers=1, + encoder_input_len=12, + decoder_input_len=6, + prediction_horizon=3, + model_dimension=32, + num_attention_heads=1, + encoder_layers=1, + decoder_layers=1, factor=1, dropout=0.0, ) @@ -163,13 +164,13 @@ def test_informer_network_different_channels(): """Test whether InformerNetwork works with different numbers of input channels.""" for n_channels in [1, 3, 5, 10]: informer = InformerNetwork( - seq_len=48, - label_len=24, - out_len=12, - d_model=64, - n_heads=2, - e_layers=1, - d_layers=1, + encoder_input_len=48, + decoder_input_len=24, + prediction_horizon=12, + model_dimension=64, + num_attention_heads=2, + encoder_layers=1, + decoder_layers=1, ) inputs, outputs = informer.build_network((72, n_channels))
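With the constructor arguments renamed in [PATCH 10/10], a minimal usage sketch mirroring the updated tests might look as follows; the dimensions are toy values and the snippet assumes TensorFlow plus an aeon install that includes this branch.

from aeon.networks import InformerNetwork
import tensorflow as tf

net = InformerNetwork(
    encoder_input_len=48,    # encoder reads the first 48 timepoints
    decoder_input_len=24,    # decoder is primed with the 24 steps before the forecast
    prediction_horizon=12,   # and predicts the next 12
    model_dimension=64,
    num_attention_heads=2,
    encoder_layers=1,
    decoder_layers=1,
)

# A single (n_timepoints, n_channels) shape is passed; the encoder and decoder
# windows are sliced internally by _preprocess_time_series.
inputs, outputs = net.build_network((72, 5))
print(outputs.shape)  # (None, 12, 5): prediction_horizon steps, n_channels features

# The returned tensors can be assembled into a trainable model in the usual way.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

Since build_network now returns a single input tensor, callers no longer construct separate encoder and decoder inputs: the encoder window is the first encoder_input_len steps, and the decoder window is the decoder_input_len steps preceding the forecast followed by a prediction_horizon-length placeholder segment.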