From c5bdec73d8132ba84a9b9f9d1369e4f39d5c25d0 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Sun, 29 Jun 2025 22:29:42 +0530 Subject: [PATCH 01/10] Init updated with informer --- aeon/networks/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aeon/networks/__init__.py b/aeon/networks/__init__.py index b6dd2d02dd..735dd354f3 100644 --- a/aeon/networks/__init__.py +++ b/aeon/networks/__init__.py @@ -18,6 +18,7 @@ "AEDRNNNetwork", "AEBiGRUNetwork", "DisjointCNNNetwork", + "InformerNetwork", ] from aeon.networks._ae_abgru import AEAttentionBiGRUNetwork from aeon.networks._ae_bgru import AEBiGRUNetwork @@ -31,6 +32,7 @@ from aeon.networks._encoder import EncoderNetwork from aeon.networks._fcn import FCNNetwork from aeon.networks._inception import InceptionNetwork +from aeon.networks._informer import InformerNetwork from aeon.networks._lite import LITENetwork from aeon.networks._mlp import MLPNetwork from aeon.networks._resnet import ResNetNetwork From 18e557d1e39f895159a7b110241e74fbd9e3442b Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 30 Jun 2025 00:48:21 +0530 Subject: [PATCH 02/10] initial stage of informer network added --- aeon/networks/_informer.py | 912 +++++++++++++++++++++++++++++++++++++ 1 file changed, 912 insertions(+) create mode 100644 aeon/networks/_informer.py diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py new file mode 100644 index 0000000000..47d2d85ea4 --- /dev/null +++ b/aeon/networks/_informer.py @@ -0,0 +1,912 @@ +"""Informer Network for time series forecasting.""" + +__maintainer__ = [""] + +from aeon.networks.base import BaseDeepLearningNetwork + + +class InformerNetwork(BaseDeepLearningNetwork): + """ + TensorFlow implementation of the Informer network for time series forecasting. + + The Informer network is a Transformer-based architecture designed for + long sequence time-series forecasting. It uses ProbSparse self-attention + mechanism and distilling operation to reduce computational complexity. + + Parameters + ---------- + enc_in : int, default=7 + Number of encoder input features. + dec_in : int, default=7 + Number of decoder input features. + c_out : int, default=7 + Number of output features. + seq_len : int, default=96 + Input sequence length. + label_len : int, default=48 + Start token length for decoder. + out_len : int, default=24 + Prediction sequence length. + factor : int, default=5 + ProbSparse attention factor. + d_model : int, default=512 + Model dimension. + n_heads : int, default=8 + Number of attention heads. + e_layers : int, default=3 + Number of encoder layers. + d_layers : int, default=2 + Number of decoder layers. + d_ff : int, default=512 + Feed forward network dimension. + dropout : float, default=0.0 + Dropout rate. + attn : str, default='prob' + Attention mechanism type ('prob' or 'full'). + embed : str, default='fixed' + Embedding type. + freq : str, default='h' + Time frequency encoding. + activation : str, default='gelu' + Activation function. + output_attention : bool, default=False + Whether to output attention weights. + distil : bool, default=True + Whether to use distilling operation. + mix : bool, default=True + Whether to use mix attention in decoder. + + References + ---------- + .. [1] Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. + (2021). Informer: Beyond efficient transformer for long sequence + time-series forecasting. In Proceedings of the AAAI conference on + artificial intelligence (Vol. 35, No. 12, pp. 11106-11115). 
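A minimal instantiation sketch for this version of the class (values shown are the defaults; assumes TensorFlow and this branch of aeon are installed):

    from aeon.networks import InformerNetwork

    # 7-channel series, 96-step encoder history, 48-step start token, 24-step horizon
    net = InformerNetwork(enc_in=7, dec_in=7, c_out=7, seq_len=96,
                          label_len=48, out_len=24, attn="prob", distil=True)
    # build_network(...) returns the four Keras inputs
    # [x_enc, x_mark_enc, x_dec, x_mark_dec] together with the
    # (batch, out_len, c_out) prediction tensor sliced from the decoder output.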
+ """ + + _config = { + "python_dependencies": ["tensorflow"], + "python_version": "<3.13", + "structure": "encoder-decoder", + } + + def __init__( + self, + enc_in=7, + dec_in=7, + c_out=7, + seq_len=96, + label_len=48, + out_len=24, + factor=5, + d_model=512, + n_heads=8, + e_layers=3, + d_layers=2, + d_ff=512, + dropout=0.0, + attn="prob", + embed="fixed", + freq="h", + activation="gelu", + output_attention=False, + distil=True, + mix=True, + ): + self.enc_in = enc_in + self.dec_in = dec_in + self.c_out = c_out + self.seq_len = seq_len + self.label_len = label_len + self.out_len = out_len + self.factor = factor + self.d_model = d_model + self.n_heads = n_heads + self.e_layers = e_layers + self.d_layers = d_layers + self.d_ff = d_ff + self.dropout = dropout + self.attn = attn + self.embed = embed + self.freq = freq + self.activation = activation + self.output_attention = output_attention + self.distil = distil + self.mix = mix + + super().__init__() + + def build_network(self, input_shape, **kwargs): + """ + Construct the Informer network and return its input and output layers. + + Parameters + ---------- + input_shape : tuple of shape = (n_timepoints (m), n_channels (d)) + The shape of the data fed into the input layer. + + Returns + ------- + input_layer : keras.layers.Input + The input layer of the network. + output_layer : keras.layers.Layer + The output layer of the network. + """ + import tensorflow as tf + + # Input layers + x_enc = tf.keras.layers.Input( + shape=(self.seq_len, self.enc_in), name="encoder_input" + ) + x_mark_enc = tf.keras.layers.Input(shape=(self.seq_len, 4), name="encoder_mark") + x_dec = tf.keras.layers.Input( + shape=(self.label_len + self.out_len, self.dec_in), name="decoder_input" + ) + x_mark_dec = tf.keras.layers.Input( + shape=(self.label_len + self.out_len, 4), name="decoder_mark" + ) + + # Encoder embedding + enc_embedding = self._data_embedding( + self.enc_in, self.d_model, self.embed, self.freq, self.dropout + ) + enc_out = enc_embedding([x_enc, x_mark_enc]) + + # Encoder + encoder = self._build_encoder() + enc_out, attns = encoder(enc_out) + + # Decoder embedding + dec_embedding = self._data_embedding( + self.dec_in, self.d_model, self.embed, self.freq, self.dropout + ) + dec_out = dec_embedding([x_dec, x_mark_dec]) + + # Decoder + decoder = self._build_decoder() + dec_out = decoder([dec_out, enc_out]) + + # Final projection + projection = tf.keras.layers.Dense(self.c_out, use_bias=True, name="projection") + dec_out = projection(dec_out) + + # Extract prediction sequence + output = tf.keras.layers.Lambda( + lambda x: x[:, -self.out_len :, :], name="prediction_slice" + )(dec_out) + + # Create model inputs list + inputs = [x_enc, x_mark_enc, x_dec, x_mark_dec] + + if self.output_attention: + outputs = [output, attns] + else: + outputs = output + + return inputs, outputs + + def _positional_embedding(self, d_model, max_len=5000): + """Create positional embedding layer.""" + import math + + import numpy as np + import tensorflow as tf + + # Compute the positional encodings once in log space + pe = np.zeros((max_len, d_model), dtype=np.float32) + position = np.arange(0, max_len, dtype=np.float32)[:, np.newaxis] + div_term = np.exp( + np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) + ) + + pe[:, 0::2] = np.sin(position * div_term) + pe[:, 1::2] = np.cos(position * div_term) + pe = pe[np.newaxis, :] # Add batch dimension + + # Create constant tensor + pe_tensor = tf.constant(pe, dtype=tf.float32) + + def positional_function(x): + 
seq_len = tf.shape(x)[1] + return pe_tensor[:, :seq_len, :] + + return positional_function + + def _token_embedding(self, c_in, d_model): + """Create token embedding layer.""" + import tensorflow as tf + + token_conv = tf.keras.layers.Conv1D( + filters=d_model, + kernel_size=3, + padding="same", + kernel_initializer=tf.keras.initializers.HeNormal(), + ) + + def token_function(x): + return token_conv(x) + + return token_function + + def _fixed_embedding(self, c_in, d_model): + """Create fixed embedding layer.""" + import math + + import numpy as np + import tensorflow as tf + + # Create fixed sinusoidal embeddings + w = np.zeros((c_in, d_model), dtype=np.float32) + position = np.arange(0, c_in, dtype=np.float32)[:, np.newaxis] + div_term = np.exp( + np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) + ) + + w[:, 0::2] = np.sin(position * div_term) + w[:, 1::2] = np.cos(position * div_term) + + # Create embedding layer with fixed weights + embedding = tf.keras.layers.Embedding( + input_dim=c_in, + output_dim=d_model, + embeddings_initializer="zeros", + trainable=False, + ) + + def fixed_function(x): + # Initialize weights if not already done + if not embedding.built: + embedding.build((None,)) + embedding.embeddings.assign(w) + return tf.stop_gradient(embedding(x)) + + return fixed_function + + def _temporal_embedding(self, d_model, embed_type, freq): + """Create temporal embedding layer.""" + import tensorflow as tf + + # Define embedding sizes + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + # Choose embedding type + if embed_type == "fixed": + minute_embed = ( + self._fixed_embedding(minute_size, d_model) if freq == "t" else None + ) + hour_embed = self._fixed_embedding(hour_size, d_model) + weekday_embed = self._fixed_embedding(weekday_size, d_model) + day_embed = self._fixed_embedding(day_size, d_model) + month_embed = self._fixed_embedding(month_size, d_model) + else: + minute_embed = ( + tf.keras.layers.Embedding(minute_size, d_model) if freq == "t" else None + ) + hour_embed = tf.keras.layers.Embedding(hour_size, d_model) + weekday_embed = tf.keras.layers.Embedding(weekday_size, d_model) + day_embed = tf.keras.layers.Embedding(day_size, d_model) + month_embed = tf.keras.layers.Embedding(month_size, d_model) + + def temporal_function(x): + x = tf.cast(x, tf.int32) + + minute_x = minute_embed(x[:, :, 4]) if minute_embed is not None else 0.0 + hour_x = hour_embed(x[:, :, 3]) + weekday_x = weekday_embed(x[:, :, 2]) + day_x = day_embed(x[:, :, 1]) + month_x = month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + return temporal_function + + def _time_feature_embedding(self, d_model, embed_type, freq): + """Create time feature embedding layer.""" + import tensorflow as tf + + freq_map = {"h": 4, "t": 5, "s": 6, "m": 1, "a": 1, "w": 2, "d": 3, "b": 3} + d_inp = freq_map[freq] + + embed_layer = tf.keras.layers.Dense(d_model) + + def time_feature_function(x): + return embed_layer(x) + + return time_feature_function + + def _data_embedding(self, c_in, d_model, embed_type, freq, dropout): + """Create data embedding layer.""" + import tensorflow as tf + + # Create embedding components + value_embedding = self._token_embedding(c_in, d_model) + position_embedding = self._positional_embedding(d_model) + + if embed_type != "timeF": + temporal_embedding = self._temporal_embedding(d_model, embed_type, freq) + else: + temporal_embedding = self._time_feature_embedding(d_model, embed_type, freq) + + 
dropout_layer = tf.keras.layers.Dropout(dropout) + + def embedding_function(inputs, training=None): + x, x_mark = inputs + + value_emb = value_embedding(x) + pos_emb = position_embedding(x) + temporal_emb = temporal_embedding(x_mark) + + embeddings = value_emb + pos_emb + temporal_emb + return dropout_layer(embeddings, training=training) + + return embedding_function + + def _build_encoder(self): + """Build the encoder stack with attention layers.""" + import tensorflow as tf + + # Choose attention type + if self.attn == "prob": + Attn = self._prob_attention( + False, self.factor, self.dropout, self.output_attention + ) + else: + Attn = self._full_attention( + False, self.factor, self.dropout, self.output_attention + ) + + # Build encoder layers + encoder_layers = [] + for l in range(self.e_layers): + attention_layer = self._attention_layer( + Attn, self.d_model, self.n_heads, mix=False + ) + encoder_layer = self._encoder_layer( + attention_layer, self.d_model, self.d_ff, self.dropout, self.activation + ) + encoder_layers.append(encoder_layer) + + # Build conv layers for distilling + conv_layers = None + if self.distil: + conv_layers = [] + for l in range(self.e_layers - 1): + conv_layer = self._conv_layer(self.d_model) + conv_layers.append(conv_layer) + + # Normalization layer + norm_layer = tf.keras.layers.LayerNormalization() + + def encoder_function(x, attn_mask=None, training=None): + # x [B, L, D] + attns = [] + + if conv_layers is not None: + # Process with both attention and conv layers + for attn_layer, conv_layer in zip(encoder_layers, conv_layers): + x, attn = attn_layer(x, attn_mask=attn_mask, training=training) + x = conv_layer(x, training=training) + attns.append(attn) + + # Final attention layer + x, attn = encoder_layers[-1](x, attn_mask=attn_mask, training=training) + attns.append(attn) + else: + # Process with only attention layers + for attn_layer in encoder_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, training=training) + attns.append(attn) + + if norm_layer is not None: + x = norm_layer(x, training=training) + + return x, attns + + return encoder_function + + def _build_decoder(self): + """Build the decoder stack with attention layers.""" + import tensorflow as tf + + # Build decoder layers + decoder_layers = [] + for l in range(self.d_layers): + # Self-attention (with mask) + self_attn = ( + self._prob_attention(True, self.factor, self.dropout, False) + if self.attn == "prob" + else self._full_attention(True, self.factor, self.dropout, False) + ) + self_attention_layer = self._attention_layer( + self_attn, self.d_model, self.n_heads, self.mix + ) + + # Cross-attention (without mask) + cross_attn = self._full_attention(False, self.factor, self.dropout, False) + cross_attention_layer = self._attention_layer( + cross_attn, self.d_model, self.n_heads, False + ) + + decoder_layer = self._decoder_layer( + self_attention_layer, + cross_attention_layer, + self.d_model, + self.d_ff, + self.dropout, + self.activation, + ) + decoder_layers.append(decoder_layer) + + # Normalization layer + norm_layer = tf.keras.layers.LayerNormalization() + + def decoder_function(inputs, training=None): + x, cross = inputs + x_mask = None # Can be added as parameter if needed + cross_mask = None # Can be added as parameter if needed + + for layer in decoder_layers: + x = layer( + x, cross, x_mask=x_mask, cross_mask=cross_mask, training=training + ) + + if norm_layer is not None: + x = norm_layer(x, training=training) + + return x + + return decoder_function + + def _prob_attention(self, 
mask_flag, factor, attention_dropout, output_attention): + """Create ProbSparse attention mechanism.""" + from math import sqrt + + import numpy as np + import tensorflow as tf + + dropout_layer = tf.keras.layers.Dropout(attention_dropout) + + def _prob_QK(Q, K, sample_k, n_top): + # Q [B, H, L, D] + B, H, L_K, E = ( + tf.shape(K)[0], + tf.shape(K)[1], + tf.shape(K)[2], + tf.shape(K)[3], + ) + L_Q = tf.shape(Q)[2] + + # calculate the sampled Q_K + K_expand = tf.expand_dims(K, axis=-3) # [B, H, 1, L_K, E] + K_expand = tf.tile(K_expand, [1, 1, L_Q, 1, 1]) # [B, H, L_Q, L_K, E] + + # Generate random indices for sampling + index_sample = tf.random.uniform( + (L_Q, sample_k), maxval=L_K, dtype=tf.int32 + ) + + # Create indices for gathering + batch_indices = tf.range(B)[:, None, None, None, None] + head_indices = tf.range(H)[None, :, None, None, None] + query_indices = tf.range(L_Q)[None, None, :, None, None] + sample_indices = index_sample[None, None, :, :, None] + + # Gather K_sample + gather_indices = tf.concat( + [ + tf.broadcast_to(batch_indices, [B, H, L_Q, sample_k, 1]), + tf.broadcast_to(head_indices, [B, H, L_Q, sample_k, 1]), + tf.broadcast_to(query_indices, [B, H, L_Q, sample_k, 1]), + tf.broadcast_to(sample_indices, [B, H, L_Q, sample_k, 1]), + ], + axis=-1, + ) + + K_sample = tf.gather_nd( + K_expand, gather_indices + ) # [B, H, L_Q, sample_k, E] + + # Calculate Q_K_sample + Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, L_Q, 1, E] + Q_K_sample = tf.matmul( + Q_expanded, K_sample, transpose_b=True + ) # [B, H, L_Q, 1, sample_k] + Q_K_sample = tf.squeeze(Q_K_sample, axis=-2) # [B, H, L_Q, sample_k] + + # find the Top_k query with sparsity measurement + M = tf.reduce_max(Q_K_sample, axis=-1) - tf.reduce_sum( + Q_K_sample, axis=-1 + ) / tf.cast(L_K, tf.float32) + M_top = tf.nn.top_k(M, k=n_top, sorted=False).indices + + # use the reduced Q to calculate Q_K + batch_idx = tf.range(B)[:, None, None] + head_idx = tf.range(H)[None, :, None] + + gather_indices_q = tf.stack( + [ + tf.broadcast_to(batch_idx, tf.shape(M_top)), + tf.broadcast_to(head_idx, tf.shape(M_top)), + M_top, + ], + axis=-1, + ) + + Q_reduce = tf.gather_nd(Q, gather_indices_q) # [B, H, n_top, E] + Q_K = tf.matmul(Q_reduce, K, transpose_b=True) # [B, H, n_top, L_K] + + return Q_K, M_top + + def _get_initial_context(V, L_Q): + B, H, L_V, D = ( + tf.shape(V)[0], + tf.shape(V)[1], + tf.shape(V)[2], + tf.shape(V)[3], + ) + + if not mask_flag: + V_sum = tf.reduce_mean(V, axis=-2) # [B, H, D] + context = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] + context = tf.tile(context, [1, 1, L_Q, 1]) # [B, H, L_Q, D] + else: + # For masked case, L_Q should equal L_V + context = tf.cumsum(V, axis=-2) + + return context + + def _prob_mask(B, H, L, index, scores): + # Create upper triangular mask (excluding diagonal) + L_scores = tf.shape(scores)[-1] + _mask = tf.linalg.band_part(tf.ones((L, L_scores), dtype=tf.bool), 0, -1) + _mask = tf.logical_not(_mask) # Upper triangular without diagonal + + # Expand mask for batch and head dimensions + _mask_ex = tf.tile( + tf.expand_dims(tf.expand_dims(_mask, 0), 0), [B, H, 1, 1] + ) + + # Gather mask at specified indices + batch_idx = tf.range(B)[:, None, None] + head_idx = tf.range(H)[None, :, None] + + gather_indices = tf.stack( + [ + tf.broadcast_to(batch_idx, tf.shape(index)), + tf.broadcast_to(head_idx, tf.shape(index)), + index, + ], + axis=-1, + ) + + indicator = tf.gather_nd(_mask_ex, gather_indices) + return indicator + + def _update_context(context_in, V, scores, index, L_Q, attn_mask): 
+ B, H, L_V, D = ( + tf.shape(V)[0], + tf.shape(V)[1], + tf.shape(V)[2], + tf.shape(V)[3], + ) + + if mask_flag: + attn_mask = _prob_mask(B, H, L_Q, index, scores) + scores = tf.where( + attn_mask, tf.fill(tf.shape(scores), float("-inf")), scores + ) + + attn = tf.nn.softmax(scores, axis=-1) + + # Calculate attention-weighted values + attn_V = tf.matmul(attn, V) # [B, H, n_top, D] + + # Update context_in at specified indices + batch_idx = tf.range(B)[:, None, None] + head_idx = tf.range(H)[None, :, None] + + update_indices = tf.stack( + [ + tf.broadcast_to(batch_idx, tf.shape(index)), + tf.broadcast_to(head_idx, tf.shape(index)), + index, + ], + axis=-1, + ) + + context_in = tf.tensor_scatter_nd_update(context_in, update_indices, attn_V) + + if output_attention: + # Initialize full attention matrix + attns = tf.ones([B, H, L_V, L_V], dtype=attn.dtype) / tf.cast( + L_V, attn.dtype + ) + attns = tf.tensor_scatter_nd_update(attns, update_indices, attn) + return context_in, attns + else: + return context_in, None + + def prob_attention_function( + queries, keys, values, attn_mask=None, training=None + ): + B, L_Q, H, D = ( + tf.shape(queries)[0], + tf.shape(queries)[1], + tf.shape(queries)[2], + tf.shape(queries)[3], + ) + L_K = tf.shape(keys)[1] + + # Transpose to [B, H, L, D] format + queries = tf.transpose(queries, perm=[0, 2, 1, 3]) + keys = tf.transpose(keys, perm=[0, 2, 1, 3]) + values = tf.transpose(values, perm=[0, 2, 1, 3]) + + # Calculate sampling parameters + U_part = int(factor * np.ceil(np.log(L_K))) + u = int(factor * np.ceil(np.log(L_Q))) + + U_part = min(U_part, L_K) + u = min(u, L_Q) + + # Get top-k scores and indices + scores_top, index = _prob_QK(queries, keys, sample_k=U_part, n_top=u) + + # Apply scale factor + scale = 1.0 / sqrt(D) + if scale is not None: + scores_top = scores_top * scale + + # Get initial context and update with top-k queries + context = _get_initial_context(values, L_Q) + context, attn = _update_context( + context, values, scores_top, index, L_Q, attn_mask + ) + + # Transpose back to [B, L, H, D] format + context = tf.transpose(context, perm=[0, 2, 1, 3]) + + return context, attn + + return prob_attention_function + + def _full_attention(self, mask_flag, factor, attention_dropout, output_attention): + """Create full attention mechanism.""" + import numpy as np + import tensorflow as tf + + dropout_layer = tf.keras.layers.Dropout(attention_dropout) + + def _triangular_causal_mask(B, L): + """Create triangular causal mask for attention.""" + mask_shape = [B, 1, L, L] + # Create upper triangular mask (excluding diagonal) + mask = tf.linalg.band_part(tf.ones(mask_shape, dtype=tf.bool), 0, -1) + mask = tf.logical_not(tf.linalg.band_part(mask, 0, 0)) # Remove diagonal + return mask + + def full_attention_function( + queries, keys, values, attn_mask=None, training=None + ): + # Get shapes + B = tf.shape(queries)[0] + L = tf.shape(queries)[1] + H = tf.shape(queries)[2] + E = tf.shape(queries)[3] + S = tf.shape(keys)[1] + D = tf.shape(values)[3] + + # Calculate scale + scale = 1.0 / tf.math.sqrt(tf.cast(E, tf.float32)) + + # Compute attention scores: "blhe,bshe->bhls" + scores = tf.einsum("blhe,bshe->bhls", queries, keys) + + if mask_flag: + if attn_mask is None: + attn_mask = _triangular_causal_mask(B, L) + else: + # If attn_mask is provided, use its mask attribute if it's an object + if hasattr(attn_mask, "mask"): + attn_mask = attn_mask.mask + + # Apply mask by setting masked positions to -inf + scores = tf.where( + attn_mask, + tf.fill(tf.shape(scores), 
tf.constant(-np.inf, dtype=scores.dtype)), + scores, + ) + + # Apply scale and softmax + A = tf.nn.softmax(scale * scores, axis=-1) + + # Apply dropout + A = dropout_layer(A, training=training) + + # Compute output: "bhls,bshd->blhd" + V = tf.einsum("bhls,bshd->blhd", A, values) + + if output_attention: + return V, A + else: + return V, None + + return full_attention_function + + def _attention_layer(self, attention, d_model, n_heads, mix): + """Create attention layer wrapper.""" + import tensorflow as tf + + d_keys = d_model // n_heads + d_values = d_model // n_heads + + # Linear projection layers for Q, K, V + query_dense = tf.keras.layers.Dense(d_model) + key_dense = tf.keras.layers.Dense(d_model) + value_dense = tf.keras.layers.Dense(d_model) + + # Output projection + out_projection = tf.keras.layers.Dense(d_model) + + def attention_layer_function( + queries, keys, values, attn_mask=None, training=None + ): + B, L, _ = tf.shape(queries)[0], tf.shape(queries)[1], tf.shape(queries)[2] + S = tf.shape(keys)[1] + H = n_heads + + # Linear projections in batch from d_model => h x d_k + Q = query_dense(queries) + K = key_dense(keys) + V = value_dense(values) + + # Reshape to (B, L, H, d_k) and transpose to (B, H, L, d_k) + Q = tf.reshape(Q, [B, L, H, d_keys]) + K = tf.reshape(K, [B, S, H, d_keys]) + V = tf.reshape(V, [B, S, H, d_values]) + + Q = tf.transpose(Q, [0, 2, 1, 3]) # (B, H, L, d_k) + K = tf.transpose(K, [0, 2, 1, 3]) # (B, H, S, d_k) + V = tf.transpose(V, [0, 2, 1, 3]) # (B, H, S, d_v) + + # Apply attention function + out, attn = attention(Q, K, V, attn_mask=attn_mask, training=training) + + # Concatenate heads and put through final linear layer + # out shape: (B, H, L, d_v) -> (B, L, H, d_v) -> (B, L, H*d_v) + out = tf.transpose(out, [0, 2, 1, 3]) + out = tf.reshape(out, [B, L, H * d_values]) + + # Apply mix transformation if needed + if mix: + # Reshape to (B, L, H, d_values) then transpose to (B, H, L, d_values) + out = tf.reshape(out, [B, L, H, d_values]) + out = tf.transpose(out, [0, 2, 1, 3]) + out = tf.reshape(out, [B, L, H * d_values]) + + # Final output projection + out = out_projection(out) + + return out, attn + + return attention_layer_function + + def _encoder_layer(self, attention_layer, d_model, d_ff, dropout, activation): + """Create single encoder layer.""" + import tensorflow as tf + + d_ff = d_ff or 4 * d_model + + # Conv1D layers for feed-forward network + conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1) + conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1) + + # Layer normalization + norm1 = tf.keras.layers.LayerNormalization() + norm2 = tf.keras.layers.LayerNormalization() + + # Dropout + dropout_layer = tf.keras.layers.Dropout(dropout) + + # Activation function + if activation == "relu": + activation_fn = tf.nn.relu + else: + activation_fn = tf.nn.gelu + + def encoder_layer_function(x, attn_mask=None, training=None): + # Self-attention with residual connection + new_x, attn = attention_layer( + x, x, x, attn_mask=attn_mask, training=training + ) + x = x + dropout_layer(new_x, training=training) + y = x = norm1(x, training=training) + + # Feed-forward network with residual connection + y = conv1(y) + y = dropout_layer(activation_fn(y), training=training) + y = conv2(y) + y = dropout_layer(y, training=training) + + return norm2(x + y, training=training), attn + + return encoder_layer_function + + def _decoder_layer( + self, self_attention, cross_attention, d_model, d_ff, dropout, activation + ): + """Create single decoder layer.""" + import 
tensorflow as tf + + d_ff = d_ff or 4 * d_model + + # Conv1D layers equivalent to PyTorch's Conv1d + conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1) + conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1) + + # Layer normalization + norm1 = tf.keras.layers.LayerNormalization() + norm2 = tf.keras.layers.LayerNormalization() + norm3 = tf.keras.layers.LayerNormalization() + + # Dropout + dropout_layer = tf.keras.layers.Dropout(dropout) + + # Activation function + if activation == "relu": + activation_fn = tf.nn.relu + else: + activation_fn = tf.nn.gelu + + def decoder_layer_function( + x, cross, x_mask=None, cross_mask=None, training=None + ): + # Self-attention with residual connection + self_attn_out = self_attention( + x, x, x, attn_mask=x_mask, training=training + )[0] + x = x + dropout_layer(self_attn_out, training=training) + x = norm1(x, training=training) + + # Cross-attention with residual connection + cross_attn_out = cross_attention( + x, cross, cross, attn_mask=cross_mask, training=training + )[0] + x = x + dropout_layer(cross_attn_out, training=training) + y = x = norm2(x, training=training) + + # Feed-forward network with residual connection + y = conv1(y) + y = dropout_layer(activation_fn(y), training=training) + y = conv2(y) + y = dropout_layer(y, training=training) + + return norm3(x + y, training=training) + + return decoder_layer_function + + def _conv_layer(self, d_model): + """Create convolution layer for distilling.""" + import tensorflow as tf + + # TensorFlow doesn't have direct circular padding, using 'same' padding + downConv = tf.keras.layers.Conv1D( + filters=d_model, kernel_size=3, padding="same", activation=None + ) + norm = tf.keras.layers.BatchNormalization() + activation = tf.keras.layers.ELU() + maxPool = tf.keras.layers.MaxPool1D(pool_size=3, strides=2, padding="same") + + def conv_layer_function(x, training=None): + # x shape: [B, L, D] -> Conv1D expects [B, L, C] + x = downConv(x) + x = norm(x, training=training) + x = activation(x) + x = maxPool(x) + return x + + return conv_layer_function From 40a58e4c642a5f84f6b7602b87b7831128ab8b05 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:05:22 +0530 Subject: [PATCH 03/10] informer made according to aeon standard --- aeon/networks/_informer.py | 1357 ++++++++++++-------------- aeon/networks/tests/test_informer.py | 221 +++++ aeon/utils/networks/attention.py | 350 +++++++ 3 files changed, 1210 insertions(+), 718 deletions(-) create mode 100644 aeon/networks/tests/test_informer.py create mode 100644 aeon/utils/networks/attention.py diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index 47d2d85ea4..dfbf4b1bfc 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -2,7 +2,17 @@ __maintainer__ = [""] +from typing import Optional + from aeon.networks.base import BaseDeepLearningNetwork +from aeon.utils.networks.attention import ( + AttentionLayer, + KerasProbAttention, +) +from aeon.utils.validation._dependencies import _check_soft_dependencies + +if _check_soft_dependencies(["tensorflow"], severity="none"): + import tensorflow as tf class InformerNetwork(BaseDeepLearningNetwork): @@ -15,12 +25,6 @@ class InformerNetwork(BaseDeepLearningNetwork): Parameters ---------- - enc_in : int, default=7 - Number of encoder input features. - dec_in : int, default=7 - Number of decoder input features. - c_out : int, default=7 - Number of output features. seq_len : int, default=96 Input sequence length. 
label_len : int, default=48 @@ -43,14 +47,8 @@ class InformerNetwork(BaseDeepLearningNetwork): Dropout rate. attn : str, default='prob' Attention mechanism type ('prob' or 'full'). - embed : str, default='fixed' - Embedding type. - freq : str, default='h' - Time frequency encoding. activation : str, default='gelu' Activation function. - output_attention : bool, default=False - Whether to output attention weights. distil : bool, default=True Whether to use distilling operation. mix : bool, default=True @@ -59,9 +57,9 @@ class InformerNetwork(BaseDeepLearningNetwork): References ---------- .. [1] Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. - (2021). Informer: Beyond efficient transformer for long sequence - time-series forecasting. In Proceedings of the AAAI conference on - artificial intelligence (Vol. 35, No. 12, pp. 11106-11115). + (2021). Informer: Beyond efficient transformer for long sequence + time-series forecasting. In Proceedings of the AAAI conference on + artificial intelligence (Vol. 35, No. 12, pp. 11106-11115). """ _config = { @@ -72,30 +70,21 @@ class InformerNetwork(BaseDeepLearningNetwork): def __init__( self, - enc_in=7, - dec_in=7, - c_out=7, - seq_len=96, - label_len=48, - out_len=24, - factor=5, - d_model=512, - n_heads=8, - e_layers=3, - d_layers=2, - d_ff=512, - dropout=0.0, - attn="prob", - embed="fixed", - freq="h", - activation="gelu", - output_attention=False, - distil=True, - mix=True, + seq_len: int = 96, + label_len: int = 48, + out_len: int = 24, + factor: int = 5, + d_model: int = 512, + n_heads: int = 8, + e_layers: int = 3, + d_layers: int = 2, + d_ff: int = 512, + dropout: float = 0.0, + attn: str = "prob", + activation: str = "gelu", + distil: bool = True, + mix: bool = True, ): - self.enc_in = enc_in - self.dec_in = dec_in - self.c_out = c_out self.seq_len = seq_len self.label_len = label_len self.out_len = out_len @@ -107,806 +96,738 @@ def __init__( self.d_ff = d_ff self.dropout = dropout self.attn = attn - self.embed = embed - self.freq = freq self.activation = activation - self.output_attention = output_attention self.distil = distil self.mix = mix super().__init__() - def build_network(self, input_shape, **kwargs): + def _token_embedding( + self, input_tensor: tf.Tensor, c_in: int, d_model: int + ) -> tf.Tensor: """ - Construct the Informer network and return its input and output layers. + Token embedding layer using 1D convolution with causal padding. Parameters ---------- - input_shape : tuple of shape = (n_timepoints (m), n_channels (d)) - The shape of the data fed into the input layer. + input_tensor : tf.Tensor + Input tensor to be processed. + c_in : int + Number of input channels. + d_model : int + Dimension of the model (number of output filters). Returns ------- - input_layer : keras.layers.Input - The input layer of the network. - output_layer : keras.layers.Layer - The output layer of the network. + tf.Tensor + Output tensor after token embedding transformation. 
""" import tensorflow as tf - # Input layers - x_enc = tf.keras.layers.Input( - shape=(self.seq_len, self.enc_in), name="encoder_input" - ) - x_mark_enc = tf.keras.layers.Input(shape=(self.seq_len, 4), name="encoder_mark") - x_dec = tf.keras.layers.Input( - shape=(self.label_len + self.out_len, self.dec_in), name="decoder_input" - ) - x_mark_dec = tf.keras.layers.Input( - shape=(self.label_len + self.out_len, 4), name="decoder_mark" - ) - - # Encoder embedding - enc_embedding = self._data_embedding( - self.enc_in, self.d_model, self.embed, self.freq, self.dropout - ) - enc_out = enc_embedding([x_enc, x_mark_enc]) - - # Encoder - encoder = self._build_encoder() - enc_out, attns = encoder(enc_out) - - # Decoder embedding - dec_embedding = self._data_embedding( - self.dec_in, self.d_model, self.embed, self.freq, self.dropout - ) - dec_out = dec_embedding([x_dec, x_mark_dec]) - - # Decoder - decoder = self._build_decoder() - dec_out = decoder([dec_out, enc_out]) - - # Final projection - projection = tf.keras.layers.Dense(self.c_out, use_bias=True, name="projection") - dec_out = projection(dec_out) + x = tf.keras.layers.Conv1D( + filters=d_model, kernel_size=3, padding="causal", activation="linear" + )(input_tensor) + x = tf.keras.layers.LeakyReLU()(x) + return x - # Extract prediction sequence - output = tf.keras.layers.Lambda( - lambda x: x[:, -self.out_len :, :], name="prediction_slice" - )(dec_out) - - # Create model inputs list - inputs = [x_enc, x_mark_enc, x_dec, x_mark_dec] - - if self.output_attention: - outputs = [output, attns] - else: - outputs = output + def _positional_embedding( + self, input_tensor: tf.Tensor, d_model: int, max_len: int = 5000 + ) -> tf.Tensor: + """ + Positional embedding layer that computes positional encodings. - return inputs, outputs + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor to get positional embeddings for. + d_model : int + Dimension of the model. + max_len : int, optional + Maximum length of the sequence, by default 5000 - def _positional_embedding(self, d_model, max_len=5000): - """Create positional embedding layer.""" + Returns + ------- + tf.Tensor + Positional encoding tensor matching input tensor's sequence length. 
+ """ import math import numpy as np import tensorflow as tf - # Compute the positional encodings once in log space + # Compute the positional encodings pe = np.zeros((max_len, d_model), dtype=np.float32) - position = np.arange(0, max_len, dtype=np.float32)[:, np.newaxis] + position = np.expand_dims(np.arange(0, max_len, dtype=np.float32), 1) div_term = np.exp( np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) ) pe[:, 0::2] = np.sin(position * div_term) pe[:, 1::2] = np.cos(position * div_term) - pe = pe[np.newaxis, :] # Add batch dimension - - # Create constant tensor - pe_tensor = tf.constant(pe, dtype=tf.float32) - - def positional_function(x): - seq_len = tf.shape(x)[1] - return pe_tensor[:, :seq_len, :] - - return positional_function - - def _token_embedding(self, c_in, d_model): - """Create token embedding layer.""" - import tensorflow as tf - - token_conv = tf.keras.layers.Conv1D( - filters=d_model, - kernel_size=3, - padding="same", - kernel_initializer=tf.keras.initializers.HeNormal(), - ) - - def token_function(x): - return token_conv(x) - - return token_function - - def _fixed_embedding(self, c_in, d_model): - """Create fixed embedding layer.""" - import math - import numpy as np - import tensorflow as tf + # Convert to tensor and add batch dimension + pe_tensor = tf.expand_dims(tf.convert_to_tensor(pe), 0) - # Create fixed sinusoidal embeddings - w = np.zeros((c_in, d_model), dtype=np.float32) - position = np.arange(0, c_in, dtype=np.float32)[:, np.newaxis] - div_term = np.exp( - np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) - ) + # Return positional embeddings for the input tensor's sequence length + return pe_tensor[:, : input_tensor.shape[1]] - w[:, 0::2] = np.sin(position * div_term) - w[:, 1::2] = np.cos(position * div_term) - - # Create embedding layer with fixed weights - embedding = tf.keras.layers.Embedding( - input_dim=c_in, - output_dim=d_model, - embeddings_initializer="zeros", - trainable=False, - ) - - def fixed_function(x): - # Initialize weights if not already done - if not embedding.built: - embedding.build((None,)) - embedding.embeddings.assign(w) - return tf.stop_gradient(embedding(x)) + def _data_embedding( + self, + input_tensor: tf.Tensor, + c_in: int, + d_model: int, + dropout: float = 0.1, + max_len: int = 5000, + ) -> tf.Tensor: + """ + Combine token and positional embeddings for the input tensor. - return fixed_function + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor to be processed. + c_in : int + Number of input channels. + d_model : int + Dimension of the model (number of output filters). + dropout : float, optional + Dropout rate, by default 0.1 + max_len : int, optional + Maximum length of the sequence for positional embedding - def _temporal_embedding(self, d_model, embed_type, freq): - """Create temporal embedding layer.""" + Returns + ------- + tf.Tensor + Output tensor after data embedding transformation. 
+ """ import tensorflow as tf - # Define embedding sizes - minute_size = 4 - hour_size = 24 - weekday_size = 7 - day_size = 32 - month_size = 13 - - # Choose embedding type - if embed_type == "fixed": - minute_embed = ( - self._fixed_embedding(minute_size, d_model) if freq == "t" else None - ) - hour_embed = self._fixed_embedding(hour_size, d_model) - weekday_embed = self._fixed_embedding(weekday_size, d_model) - day_embed = self._fixed_embedding(day_size, d_model) - month_embed = self._fixed_embedding(month_size, d_model) - else: - minute_embed = ( - tf.keras.layers.Embedding(minute_size, d_model) if freq == "t" else None - ) - hour_embed = tf.keras.layers.Embedding(hour_size, d_model) - weekday_embed = tf.keras.layers.Embedding(weekday_size, d_model) - day_embed = tf.keras.layers.Embedding(day_size, d_model) - month_embed = tf.keras.layers.Embedding(month_size, d_model) - - def temporal_function(x): - x = tf.cast(x, tf.int32) - - minute_x = minute_embed(x[:, :, 4]) if minute_embed is not None else 0.0 - hour_x = hour_embed(x[:, :, 3]) - weekday_x = weekday_embed(x[:, :, 2]) - day_x = day_embed(x[:, :, 1]) - month_x = month_embed(x[:, :, 0]) - - return hour_x + weekday_x + day_x + month_x + minute_x + # Get token embeddings + token_emb = self._token_embedding(input_tensor, c_in, d_model) - return temporal_function + # Get positional embeddings + pos_emb = self._positional_embedding(input_tensor, d_model, max_len) - def _time_feature_embedding(self, d_model, embed_type, freq): - """Create time feature embedding layer.""" - import tensorflow as tf + # Combine embeddings + x = token_emb + pos_emb - freq_map = {"h": 4, "t": 5, "s": 6, "m": 1, "a": 1, "w": 2, "d": 3, "b": 3} - d_inp = freq_map[freq] + # Apply dropout + x = tf.keras.layers.Dropout(dropout)(x) - embed_layer = tf.keras.layers.Dense(d_model) + return x - def time_feature_function(x): - return embed_layer(x) + def _conv_layer(self, input_tensor: tf.Tensor, c_in: int) -> tf.Tensor: + """ + Convolutional layer with batch normalization, ELU, and max pooling. - return time_feature_function + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor to be processed. + c_in : int + Number of input channels (filters for the convolution). - def _data_embedding(self, c_in, d_model, embed_type, freq, dropout): - """Create data embedding layer.""" + Returns + ------- + tf.Tensor + Output tensor after convolution and pooling operations. 
+ """ import tensorflow as tf - # Create embedding components - value_embedding = self._token_embedding(c_in, d_model) - position_embedding = self._positional_embedding(d_model) - - if embed_type != "timeF": - temporal_embedding = self._temporal_embedding(d_model, embed_type, freq) - else: - temporal_embedding = self._time_feature_embedding(d_model, embed_type, freq) - - dropout_layer = tf.keras.layers.Dropout(dropout) + # Apply 1D convolution with causal padding + x = tf.keras.layers.Conv1D(filters=c_in, kernel_size=3, padding="causal")( + input_tensor + ) - def embedding_function(inputs, training=None): - x, x_mark = inputs + # Apply batch normalization + x = tf.keras.layers.BatchNormalization()(x) - value_emb = value_embedding(x) - pos_emb = position_embedding(x) - temporal_emb = temporal_embedding(x_mark) + # Apply ELU activation + x = tf.keras.layers.ELU()(x) - embeddings = value_emb + pos_emb + temporal_emb - return dropout_layer(embeddings, training=training) + # Apply max pooling for downsampling + x = tf.keras.layers.MaxPool1D(pool_size=3, strides=2)(x) - return embedding_function + return x - def _build_encoder(self): - """Build the encoder stack with attention layers.""" - import tensorflow as tf + def _attention_out( + self, + input_tensor: tf.Tensor, + attention_type: str, + mask_flag: bool, + d_model: int, + n_heads: int, + factor: int = 5, + dropout: float = 0.1, + attn_mask: Optional[tf.Tensor] = None, + ) -> tf.Tensor: + """ + Attention output layer applying either ProbAttention or FullAttention. - # Choose attention type - if self.attn == "prob": - Attn = self._prob_attention( - False, self.factor, self.dropout, self.output_attention - ) - else: - Attn = self._full_attention( - False, self.factor, self.dropout, self.output_attention - ) + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor for attention computation. + attention_type : str + Type of attention mechanism ('prob' or 'full'). + mask_flag : bool + Whether to use attention masking. + d_model : int + Model dimension. + n_heads : int + Number of attention heads. 
+ factor : int, optional + Attention factor for ProbSparse attention, by default 5 + dropout : float, optional + Dropout rate, by default 0.1 + attn_mask : tf.Tensor, optional + Attention mask tensor, by default None - # Build encoder layers - encoder_layers = [] - for l in range(self.e_layers): - attention_layer = self._attention_layer( - Attn, self.d_model, self.n_heads, mix=False - ) - encoder_layer = self._encoder_layer( - attention_layer, self.d_model, self.d_ff, self.dropout, self.activation - ) - encoder_layers.append(encoder_layer) - - # Build conv layers for distilling - conv_layers = None - if self.distil: - conv_layers = [] - for l in range(self.e_layers - 1): - conv_layer = self._conv_layer(self.d_model) - conv_layers.append(conv_layer) - - # Normalization layer - norm_layer = tf.keras.layers.LayerNormalization() - - def encoder_function(x, attn_mask=None, training=None): - # x [B, L, D] - attns = [] - - if conv_layers is not None: - # Process with both attention and conv layers - for attn_layer, conv_layer in zip(encoder_layers, conv_layers): - x, attn = attn_layer(x, attn_mask=attn_mask, training=training) - x = conv_layer(x, training=training) - attns.append(attn) - - # Final attention layer - x, attn = encoder_layers[-1](x, attn_mask=attn_mask, training=training) - attns.append(attn) - else: - # Process with only attention layers - for attn_layer in encoder_layers: - x, attn = attn_layer(x, attn_mask=attn_mask, training=training) - attns.append(attn) - - if norm_layer is not None: - x = norm_layer(x, training=training) - - return x, attns - - return encoder_function - - def _build_decoder(self): - """Build the decoder stack with attention layers.""" + Returns + ------- + tf.Tensor + Output tensor after attention computation. + """ import tensorflow as tf - # Build decoder layers - decoder_layers = [] - for l in range(self.d_layers): - # Self-attention (with mask) - self_attn = ( - self._prob_attention(True, self.factor, self.dropout, False) - if self.attn == "prob" - else self._full_attention(True, self.factor, self.dropout, False) - ) - self_attention_layer = self._attention_layer( - self_attn, self.d_model, self.n_heads, self.mix + if attention_type == "prob": + prob_attention = KerasProbAttention( + mask_flag=mask_flag, + factor=factor, + attention_dropout=dropout, ) - # Cross-attention (without mask) - cross_attn = self._full_attention(False, self.factor, self.dropout, False) - cross_attention_layer = self._attention_layer( - cross_attn, self.d_model, self.n_heads, False - ) + output = AttentionLayer( + attention=prob_attention, + d_model=d_model, + n_heads=n_heads, + d_keys=d_model // n_heads, # 512 // 8 = 64 + d_values=d_model // n_heads, # 512 // 8 = 64 + )(input_tensor, attn_mask=attn_mask) - decoder_layer = self._decoder_layer( - self_attention_layer, - cross_attention_layer, - self.d_model, - self.d_ff, - self.dropout, - self.activation, + else: + queries, keys, values = input_tensor + output = tf.keras.layers.MultiHeadAttention( + num_heads=n_heads, # 8 + key_dim=d_model // n_heads, # 512 // 8 = 64 + value_dim=d_model // n_heads, # 512 // 8 = 64 + dropout=dropout, + use_bias=True, + )( + query=queries, # (32, 20, 512) + key=keys, # (32, 20, 512) + value=values, # (32, 20, 512) + attention_mask=attn_mask, + use_causal_mask=mask_flag, ) - decoder_layers.append(decoder_layer) - - # Normalization layer - norm_layer = tf.keras.layers.LayerNormalization() - - def decoder_function(inputs, training=None): - x, cross = inputs - x_mask = None # Can be added as parameter if 
needed - cross_mask = None # Can be added as parameter if needed - - for layer in decoder_layers: - x = layer( - x, cross, x_mask=x_mask, cross_mask=cross_mask, training=training - ) - - if norm_layer is not None: - x = norm_layer(x, training=training) - return x + return output - return decoder_function + def _encoder_layer( + self, + input_tensor: tf.Tensor, + d_model: int, + d_ff: Optional[int] = None, + dropout: float = 0.1, + activation: str = "relu", + attn_mask: Optional[tf.Tensor] = None, + attention_type: str = "prob", + mask_flag: bool = True, + n_heads: int = 8, + factor: int = 5, + ) -> tf.Tensor: + """ + Apply encoder layer with multi-head attention and feed-forward network. - def _prob_attention(self, mask_flag, factor, attention_dropout, output_attention): - """Create ProbSparse attention mechanism.""" - from math import sqrt + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor of shape [B, L, D] where B is batch size, + L is sequence length, D is model dimension. + d_model : int + Model dimension (must match input tensor's last dimension). + d_ff : int, optional + Feed-forward network dimension + dropout : float, optional + Dropout rate, by default 0.1 + activation : str, optional + Activation function ('relu' or 'gelu'), by default "relu" + attn_mask : tf.Tensor, optional + Attention mask tensor, by default None - import numpy as np + Returns + ------- + tf.Tensor + Output tensor after encoder layer processing. + """ import tensorflow as tf - dropout_layer = tf.keras.layers.Dropout(attention_dropout) - - def _prob_QK(Q, K, sample_k, n_top): - # Q [B, H, L, D] - B, H, L_K, E = ( - tf.shape(K)[0], - tf.shape(K)[1], - tf.shape(K)[2], - tf.shape(K)[3], - ) - L_Q = tf.shape(Q)[2] - - # calculate the sampled Q_K - K_expand = tf.expand_dims(K, axis=-3) # [B, H, 1, L_K, E] - K_expand = tf.tile(K_expand, [1, 1, L_Q, 1, 1]) # [B, H, L_Q, L_K, E] - - # Generate random indices for sampling - index_sample = tf.random.uniform( - (L_Q, sample_k), maxval=L_K, dtype=tf.int32 - ) + # Set default d_ff if not provided + if d_ff is None: + d_ff = 4 * d_model + + # Self-attention using the _attention_out function with parameters + attn_output = self._attention_out( + input_tensor=[input_tensor, input_tensor, input_tensor], + attention_type=attention_type, + mask_flag=mask_flag, + d_model=d_model, + n_heads=n_heads, + factor=factor, + dropout=dropout, + attn_mask=attn_mask, + ) - # Create indices for gathering - batch_indices = tf.range(B)[:, None, None, None, None] - head_indices = tf.range(H)[None, :, None, None, None] - query_indices = tf.range(L_Q)[None, None, :, None, None] - sample_indices = index_sample[None, None, :, :, None] - - # Gather K_sample - gather_indices = tf.concat( - [ - tf.broadcast_to(batch_indices, [B, H, L_Q, sample_k, 1]), - tf.broadcast_to(head_indices, [B, H, L_Q, sample_k, 1]), - tf.broadcast_to(query_indices, [B, H, L_Q, sample_k, 1]), - tf.broadcast_to(sample_indices, [B, H, L_Q, sample_k, 1]), - ], - axis=-1, - ) + # Apply dropout and residual connection + x = input_tensor + tf.keras.layers.Dropout(dropout)(attn_output) - K_sample = tf.gather_nd( - K_expand, gather_indices - ) # [B, H, L_Q, sample_k, E] - - # Calculate Q_K_sample - Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, L_Q, 1, E] - Q_K_sample = tf.matmul( - Q_expanded, K_sample, transpose_b=True - ) # [B, H, L_Q, 1, sample_k] - Q_K_sample = tf.squeeze(Q_K_sample, axis=-2) # [B, H, L_Q, sample_k] - - # find the Top_k query with sparsity measurement - M = tf.reduce_max(Q_K_sample, axis=-1) - 
tf.reduce_sum( - Q_K_sample, axis=-1 - ) / tf.cast(L_K, tf.float32) - M_top = tf.nn.top_k(M, k=n_top, sorted=False).indices - - # use the reduced Q to calculate Q_K - batch_idx = tf.range(B)[:, None, None] - head_idx = tf.range(H)[None, :, None] - - gather_indices_q = tf.stack( - [ - tf.broadcast_to(batch_idx, tf.shape(M_top)), - tf.broadcast_to(head_idx, tf.shape(M_top)), - M_top, - ], - axis=-1, - ) + # First layer normalization + x = tf.keras.layers.LayerNormalization()(x) - Q_reduce = tf.gather_nd(Q, gather_indices_q) # [B, H, n_top, E] - Q_K = tf.matmul(Q_reduce, K, transpose_b=True) # [B, H, n_top, L_K] + # Store for second residual connection + residual = x - return Q_K, M_top + # Feed-forward network + # First 1D convolution (expansion) + y = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)(x) - def _get_initial_context(V, L_Q): - B, H, L_V, D = ( - tf.shape(V)[0], - tf.shape(V)[1], - tf.shape(V)[2], - tf.shape(V)[3], - ) + # Apply activation function + if activation == "relu": + y = tf.keras.layers.ReLU()(y) + else: # gelu + y = tf.keras.layers.Activation("gelu")(y) - if not mask_flag: - V_sum = tf.reduce_mean(V, axis=-2) # [B, H, D] - context = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] - context = tf.tile(context, [1, 1, L_Q, 1]) # [B, H, L_Q, D] - else: - # For masked case, L_Q should equal L_V - context = tf.cumsum(V, axis=-2) - - return context - - def _prob_mask(B, H, L, index, scores): - # Create upper triangular mask (excluding diagonal) - L_scores = tf.shape(scores)[-1] - _mask = tf.linalg.band_part(tf.ones((L, L_scores), dtype=tf.bool), 0, -1) - _mask = tf.logical_not(_mask) # Upper triangular without diagonal - - # Expand mask for batch and head dimensions - _mask_ex = tf.tile( - tf.expand_dims(tf.expand_dims(_mask, 0), 0), [B, H, 1, 1] - ) + # Apply dropout + y = tf.keras.layers.Dropout(dropout)(y) - # Gather mask at specified indices - batch_idx = tf.range(B)[:, None, None] - head_idx = tf.range(H)[None, :, None] - - gather_indices = tf.stack( - [ - tf.broadcast_to(batch_idx, tf.shape(index)), - tf.broadcast_to(head_idx, tf.shape(index)), - index, - ], - axis=-1, - ) + # Second 1D convolution (compression back to d_model) + y = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)(y) - indicator = tf.gather_nd(_mask_ex, gather_indices) - return indicator + # Apply dropout + y = tf.keras.layers.Dropout(dropout)(y) - def _update_context(context_in, V, scores, index, L_Q, attn_mask): - B, H, L_V, D = ( - tf.shape(V)[0], - tf.shape(V)[1], - tf.shape(V)[2], - tf.shape(V)[3], - ) + # Second residual connection and layer normalization + output = tf.keras.layers.LayerNormalization()(residual + y) - if mask_flag: - attn_mask = _prob_mask(B, H, L_Q, index, scores) - scores = tf.where( - attn_mask, tf.fill(tf.shape(scores), float("-inf")), scores - ) + return output - attn = tf.nn.softmax(scores, axis=-1) - - # Calculate attention-weighted values - attn_V = tf.matmul(attn, V) # [B, H, n_top, D] - - # Update context_in at specified indices - batch_idx = tf.range(B)[:, None, None] - head_idx = tf.range(H)[None, :, None] + def _encoder( + self, + input_tensor: tf.Tensor, + e_layers: int, + d_model: int, + d_ff: Optional[int] = None, + dropout: float = 0.1, + activation: str = "relu", + attn_mask: Optional[tf.Tensor] = None, + attention_type: str = "prob", + mask_flag: bool = True, + n_heads: int = 8, + factor: int = 5, + use_conv_layers: bool = False, + c_in: Optional[int] = None, + use_norm: bool = True, + ) -> tf.Tensor: + """ + Apply encoder stack with multiple encoder 
layers and optional conv layers. - update_indices = tf.stack( - [ - tf.broadcast_to(batch_idx, tf.shape(index)), - tf.broadcast_to(head_idx, tf.shape(index)), - index, - ], - axis=-1, - ) + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor of shape [B, L, D] + e_layers : int + Number of encoder layers to stack. + d_model : int + Model dimension (must match input tensor's last dimension). + d_ff : int, optional + Feed-forward network dimension + dropout : float, optional + Dropout rate, by default 0.1 + activation : str, optional + Activation function ('relu' or 'gelu'), by default "relu" + attn_mask : tf.Tensor, optional + Attention mask tensor, by default None + attention_type : str, optional + Type of attention mechanism ('prob' or 'full') + mask_flag : bool, optional + Whether to use attention masking, by default True + n_heads : int, optional + Number of attention heads, by default 8 + factor : int, optional + Attention factor for ProbSparse attention, by default 5 + use_conv_layers : bool, optional + Whether to use convolutional layers between encoder layers + c_in : int, optional + Number of input channels for convolutional layers + use_norm : bool, optional + Whether to apply final layer normalization, by default True - context_in = tf.tensor_scatter_nd_update(context_in, update_indices, attn_V) + Returns + ------- + tf.Tensor + Output tensor after encoder stack processing. + """ + import tensorflow as tf - if output_attention: - # Initialize full attention matrix - attns = tf.ones([B, H, L_V, L_V], dtype=attn.dtype) / tf.cast( - L_V, attn.dtype + # Set default values + if c_in is None: + c_in = d_model + + x = input_tensor + + # Apply encoder layers with optional convolutional layers + if use_conv_layers: + # Apply paired encoder and conv layers + for _ in range(e_layers - 1): + # Apply encoder layer + x = self._encoder_layer( + input_tensor=x, + d_model=d_model, + d_ff=d_ff, + dropout=dropout, + activation=activation, + attn_mask=attn_mask, + attention_type=attention_type, + mask_flag=mask_flag, + n_heads=n_heads, + factor=factor, ) - attns = tf.tensor_scatter_nd_update(attns, update_indices, attn) - return context_in, attns - else: - return context_in, None - - def prob_attention_function( - queries, keys, values, attn_mask=None, training=None - ): - B, L_Q, H, D = ( - tf.shape(queries)[0], - tf.shape(queries)[1], - tf.shape(queries)[2], - tf.shape(queries)[3], - ) - L_K = tf.shape(keys)[1] - - # Transpose to [B, H, L, D] format - queries = tf.transpose(queries, perm=[0, 2, 1, 3]) - keys = tf.transpose(keys, perm=[0, 2, 1, 3]) - values = tf.transpose(values, perm=[0, 2, 1, 3]) - - # Calculate sampling parameters - U_part = int(factor * np.ceil(np.log(L_K))) - u = int(factor * np.ceil(np.log(L_Q))) - - U_part = min(U_part, L_K) - u = min(u, L_Q) - - # Get top-k scores and indices - scores_top, index = _prob_QK(queries, keys, sample_k=U_part, n_top=u) - # Apply scale factor - scale = 1.0 / sqrt(D) - if scale is not None: - scores_top = scores_top * scale + # Apply convolutional layer for downsampling + x = self._conv_layer( + input_tensor=x, + c_in=c_in, + ) - # Get initial context and update with top-k queries - context = _get_initial_context(values, L_Q) - context, attn = _update_context( - context, values, scores_top, index, L_Q, attn_mask + # Apply final encoder layer (without conv layer) + x = self._encoder_layer( + input_tensor=x, + d_model=d_model, + d_ff=d_ff, + dropout=dropout, + activation=activation, + attn_mask=attn_mask, + attention_type=attention_type, 
+ mask_flag=mask_flag, + n_heads=n_heads, + factor=factor, ) - # Transpose back to [B, L, H, D] format - context = tf.transpose(context, perm=[0, 2, 1, 3]) - - return context, attn - - return prob_attention_function - - def _full_attention(self, mask_flag, factor, attention_dropout, output_attention): - """Create full attention mechanism.""" - import numpy as np - import tensorflow as tf - - dropout_layer = tf.keras.layers.Dropout(attention_dropout) - - def _triangular_causal_mask(B, L): - """Create triangular causal mask for attention.""" - mask_shape = [B, 1, L, L] - # Create upper triangular mask (excluding diagonal) - mask = tf.linalg.band_part(tf.ones(mask_shape, dtype=tf.bool), 0, -1) - mask = tf.logical_not(tf.linalg.band_part(mask, 0, 0)) # Remove diagonal - return mask - - def full_attention_function( - queries, keys, values, attn_mask=None, training=None - ): - # Get shapes - B = tf.shape(queries)[0] - L = tf.shape(queries)[1] - H = tf.shape(queries)[2] - E = tf.shape(queries)[3] - S = tf.shape(keys)[1] - D = tf.shape(values)[3] - - # Calculate scale - scale = 1.0 / tf.math.sqrt(tf.cast(E, tf.float32)) - - # Compute attention scores: "blhe,bshe->bhls" - scores = tf.einsum("blhe,bshe->bhls", queries, keys) - - if mask_flag: - if attn_mask is None: - attn_mask = _triangular_causal_mask(B, L) - else: - # If attn_mask is provided, use its mask attribute if it's an object - if hasattr(attn_mask, "mask"): - attn_mask = attn_mask.mask - - # Apply mask by setting masked positions to -inf - scores = tf.where( - attn_mask, - tf.fill(tf.shape(scores), tf.constant(-np.inf, dtype=scores.dtype)), - scores, + else: + # Apply only encoder layers without convolutional layers + for _ in range(e_layers): + x = self._encoder_layer( + input_tensor=x, + d_model=d_model, + d_ff=d_ff, + dropout=dropout, + activation=activation, + attn_mask=attn_mask, + attention_type=attention_type, + mask_flag=mask_flag, + n_heads=n_heads, + factor=factor, ) - # Apply scale and softmax - A = tf.nn.softmax(scale * scores, axis=-1) + # Apply optional final layer normalization + if use_norm: + x = tf.keras.layers.LayerNormalization()(x) - # Apply dropout - A = dropout_layer(A, training=training) + return x - # Compute output: "bhls,bshd->blhd" - V = tf.einsum("bhls,bshd->blhd", A, values) - - if output_attention: - return V, A - else: - return V, None + def _decoder_layer( + self, + input_tensor: tf.Tensor, + cross_tensor: tf.Tensor, + d_model: int, + d_ff: Optional[int] = None, + dropout: float = 0.1, + activation: str = "relu", + x_mask: Optional[tf.Tensor] = None, + cross_mask: Optional[tf.Tensor] = None, + self_attention_type: str = "prob", + cross_attention_type: str = "prob", + mask_flag: bool = True, + n_heads: int = 8, + factor: int = 5, + ) -> tf.Tensor: + """ + Apply decoder layer with self-attention, cross-attention, and FFN. - return full_attention_function + Parameters + ---------- + input_tensor : tf.Tensor + Input tensor of shape [B, L, D] + cross_tensor : tf.Tensor + Cross-attention input tensor (encoder output) of shape [B, L_enc, D] + d_model : int + Model dimension (must match input tensor's last dimension). 
+ d_ff : int, optional + Feed-forward network dimension + dropout : float, optional + Dropout rate, by default 0.1 + activation : str, optional + Activation function ('relu' or 'gelu'), by default "relu" + x_mask : tf.Tensor, optional + Self-attention mask tensor, by default None + cross_mask : tf.Tensor, optional + Cross-attention mask tensor, by default None + self_attention_type : str, optional + Type of self-attention mechanism ('prob' or 'full') + cross_attention_type : str, optional + Type of cross-attention mechanism ('prob' or 'full') + mask_flag : bool, optional + Whether to use attention masking, by default True + n_heads : int, optional + Number of attention heads, by default 8 + factor : int, optional + Attention factor for ProbSparse attention, by default 5 - def _attention_layer(self, attention, d_model, n_heads, mix): - """Create attention layer wrapper.""" + Returns + ------- + tf.Tensor + Output tensor after decoder layer processing with same shape. + """ import tensorflow as tf - d_keys = d_model // n_heads - d_values = d_model // n_heads + # Set default d_ff if not provided + if d_ff is None: + d_ff = 4 * d_model + + # Self-attention block + self_attn_output = self._attention_out( + input_tensor=[input_tensor, input_tensor, input_tensor], + attention_type=self_attention_type, + mask_flag=mask_flag, + d_model=d_model, + n_heads=n_heads, + factor=factor, + dropout=dropout, + attn_mask=x_mask, + ) - # Linear projection layers for Q, K, V - query_dense = tf.keras.layers.Dense(d_model) - key_dense = tf.keras.layers.Dense(d_model) - value_dense = tf.keras.layers.Dense(d_model) + # Apply dropout and first residual connection + x = input_tensor + tf.keras.layers.Dropout(dropout)(self_attn_output) + + # First layer normalization + x = tf.keras.layers.LayerNormalization()(x) + + # Cross-attention block + cross_attn_output = self._attention_out( + input_tensor=[x, cross_tensor, cross_tensor], + attention_type=cross_attention_type, + mask_flag=mask_flag, + d_model=d_model, + n_heads=n_heads, + factor=factor, + dropout=dropout, + attn_mask=cross_mask, + ) - # Output projection - out_projection = tf.keras.layers.Dense(d_model) + # Apply dropout and second residual connection + x = x + tf.keras.layers.Dropout(dropout)(cross_attn_output) - def attention_layer_function( - queries, keys, values, attn_mask=None, training=None - ): - B, L, _ = tf.shape(queries)[0], tf.shape(queries)[1], tf.shape(queries)[2] - S = tf.shape(keys)[1] - H = n_heads + # Second layer normalization + x = tf.keras.layers.LayerNormalization()(x) - # Linear projections in batch from d_model => h x d_k - Q = query_dense(queries) - K = key_dense(keys) - V = value_dense(values) + # Store for third residual connection + residual = x - # Reshape to (B, L, H, d_k) and transpose to (B, H, L, d_k) - Q = tf.reshape(Q, [B, L, H, d_keys]) - K = tf.reshape(K, [B, S, H, d_keys]) - V = tf.reshape(V, [B, S, H, d_values]) + # Feed-forward network + # First 1D convolution (expansion) + y = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)(x) - Q = tf.transpose(Q, [0, 2, 1, 3]) # (B, H, L, d_k) - K = tf.transpose(K, [0, 2, 1, 3]) # (B, H, S, d_k) - V = tf.transpose(V, [0, 2, 1, 3]) # (B, H, S, d_v) + # Apply activation function + if activation == "relu": + y = tf.keras.layers.ReLU()(y) + else: # gelu + y = tf.keras.layers.Activation("gelu")(y) - # Apply attention function - out, attn = attention(Q, K, V, attn_mask=attn_mask, training=training) + # Apply dropout + y = tf.keras.layers.Dropout(dropout)(y) - # Concatenate heads and put 
through final linear layer - # out shape: (B, H, L, d_v) -> (B, L, H, d_v) -> (B, L, H*d_v) - out = tf.transpose(out, [0, 2, 1, 3]) - out = tf.reshape(out, [B, L, H * d_values]) + # Second 1D convolution (compression back to d_model) + y = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)(y) - # Apply mix transformation if needed - if mix: - # Reshape to (B, L, H, d_values) then transpose to (B, H, L, d_values) - out = tf.reshape(out, [B, L, H, d_values]) - out = tf.transpose(out, [0, 2, 1, 3]) - out = tf.reshape(out, [B, L, H * d_values]) + # Apply dropout + y = tf.keras.layers.Dropout(dropout)(y) - # Final output projection - out = out_projection(out) + # Third residual connection and final layer normalization + output = tf.keras.layers.LayerNormalization()(residual + y) - return out, attn + return output - return attention_layer_function + def _decoder( + self, + input_tensor: tf.Tensor, + cross_tensor: tf.Tensor, + d_layers: int, + d_model: int, + d_ff: Optional[int] = None, + dropout: float = 0.1, + activation: str = "relu", + x_mask: Optional[tf.Tensor] = None, + cross_mask: Optional[tf.Tensor] = None, + self_attention_type: str = "prob", + cross_attention_type: str = "prob", + mask_flag: bool = True, + n_heads: int = 8, + factor: int = 5, + use_norm: bool = True, + ) -> tf.Tensor: + """ + Apply decoder stack with multiple decoder layers and optional normalization. - def _encoder_layer(self, attention_layer, d_model, d_ff, dropout, activation): - """Create single encoder layer.""" + Parameters + ---------- + input_tensor : tf.Tensor + Decoder input tensor of shape [B, L_dec, D] + cross_tensor : tf.Tensor + Cross-attention input tensor (encoder output) of shape [B, L_enc, D] + d_layers : int + Number of decoder layers to stack. + d_model : int + Model dimension (must match input tensor's last dimension). + d_ff : int, optional + Feed-forward network dimension + dropout : float, optional + Dropout rate, by default 0.1 + activation : str, optional + Activation function ('relu' or 'gelu'), by default "relu" + x_mask : tf.Tensor, optional + Self-attention mask tensor for decoder, by default None + cross_mask : tf.Tensor, optional + Cross-attention mask tensor, by default None + self_attention_type : str, optional + Type of self-attention mechanism ('prob' or 'full') + cross_attention_type : str, optional + Type of cross-attention mechanism ('prob' or 'full') + mask_flag : bool, optional + Whether to use attention masking, by default True + n_heads : int, optional + Number of attention heads, by default 8 + factor : int, optional + Attention factor for ProbSparse attention, by default 5 + use_norm : bool, optional + Whether to apply final layer normalization, by default True + + Returns + ------- + tf.Tensor + Output tensor after decoder stack processing. 
+ """ import tensorflow as tf - d_ff = d_ff or 4 * d_model + x = input_tensor + + # Apply multiple decoder layers + for _ in range(d_layers): + x = self._decoder_layer( + input_tensor=x, + cross_tensor=cross_tensor, + d_model=d_model, + d_ff=d_ff, + dropout=dropout, + activation=activation, + x_mask=x_mask, + cross_mask=cross_mask, + self_attention_type=self_attention_type, + cross_attention_type=cross_attention_type, + mask_flag=mask_flag, + n_heads=n_heads, + factor=factor, + ) - # Conv1D layers for feed-forward network - conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1) - conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1) + # Apply optional final layer normalization + if use_norm: + x = tf.keras.layers.LayerNormalization()(x) - # Layer normalization - norm1 = tf.keras.layers.LayerNormalization() - norm2 = tf.keras.layers.LayerNormalization() + return x - # Dropout - dropout_layer = tf.keras.layers.Dropout(dropout) + def build_network( + self, input_shape: tuple[int, int], **kwargs + ) -> tuple[list[tf.Tensor], tf.Tensor]: + """Build the complete Informer architecture for time series forecasting.""" + import tensorflow as tf - # Activation function - if activation == "relu": - activation_fn = tf.nn.relu - else: - activation_fn = tf.nn.gelu + # Get input dimensions + n_timepoints, n_channels = input_shape - def encoder_layer_function(x, attn_mask=None, training=None): - # Self-attention with residual connection - new_x, attn = attention_layer( - x, x, x, attn_mask=attn_mask, training=training - ) - x = x + dropout_layer(new_x, training=training) - y = x = norm1(x, training=training) + # hardcode batch_size for now + batch_size = 32 - # Feed-forward network with residual connection - y = conv1(y) - y = dropout_layer(activation_fn(y), training=training) - y = conv2(y) - y = dropout_layer(y, training=training) + # Create input layers for encoder and decoder + encoder_input = tf.keras.layers.Input( + shape=(self.seq_len, n_channels), + name="encoder_input", + batch_size=batch_size, + ) - return norm2(x + y, training=training), attn + decoder_input = tf.keras.layers.Input( + shape=(self.label_len + self.out_len, n_channels), + name="decoder_input", + batch_size=batch_size, + ) - return encoder_layer_function + # Encoder embedding + enc_embedded = self._data_embedding( + input_tensor=encoder_input, + c_in=n_channels, + d_model=self.d_model, + dropout=self.dropout, + max_len=self.seq_len, + ) - def _decoder_layer( - self, self_attention, cross_attention, d_model, d_ff, dropout, activation - ): - """Create single decoder layer.""" - import tensorflow as tf + # Encoder processing + enc_output = self._encoder( + input_tensor=enc_embedded, + e_layers=self.e_layers, + d_model=self.d_model, + d_ff=self.d_ff, + dropout=self.dropout, + activation=self.activation, + attention_type=self.attn, + mask_flag=False, + n_heads=self.n_heads, + factor=self.factor, + use_conv_layers=self.distil, + c_in=self.d_model, + use_norm=True, + ) - d_ff = d_ff or 4 * d_model + # Decoder embedding + dec_embedded = self._data_embedding( + input_tensor=decoder_input, + c_in=n_channels, + d_model=self.d_model, + dropout=self.dropout, + max_len=self.label_len + self.out_len, + ) - # Conv1D layers equivalent to PyTorch's Conv1d - conv1 = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1) - conv2 = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1) + # Decoder processing + dec_output = self._decoder( + input_tensor=dec_embedded, + cross_tensor=enc_output, + d_layers=self.d_layers, + d_model=self.d_model, 
+ d_ff=self.d_ff, + dropout=self.dropout, + activation=self.activation, + self_attention_type=self.attn, + cross_attention_type="full", + mask_flag=self.mix, + n_heads=self.n_heads, + factor=self.factor, + use_norm=True, + ) - # Layer normalization - norm1 = tf.keras.layers.LayerNormalization() - norm2 = tf.keras.layers.LayerNormalization() - norm3 = tf.keras.layers.LayerNormalization() + # Final projection to output dimension + output = tf.keras.layers.Dense(n_channels, name="output_projection")(dec_output) - # Dropout - dropout_layer = tf.keras.layers.Dropout(dropout) + # Extract only the prediction part (last out_len timesteps) + output = output[:, -self.out_len :, :] - # Activation function - if activation == "relu": - activation_fn = tf.nn.relu - else: - activation_fn = tf.nn.gelu - - def decoder_layer_function( - x, cross, x_mask=None, cross_mask=None, training=None - ): - # Self-attention with residual connection - self_attn_out = self_attention( - x, x, x, attn_mask=x_mask, training=training - )[0] - x = x + dropout_layer(self_attn_out, training=training) - x = norm1(x, training=training) - - # Cross-attention with residual connection - cross_attn_out = cross_attention( - x, cross, cross, attn_mask=cross_mask, training=training - )[0] - x = x + dropout_layer(cross_attn_out, training=training) - y = x = norm2(x, training=training) - - # Feed-forward network with residual connection - y = conv1(y) - y = dropout_layer(activation_fn(y), training=training) - y = conv2(y) - y = dropout_layer(y, training=training) - - return norm3(x + y, training=training) - - return decoder_layer_function - - def _conv_layer(self, d_model): - """Create convolution layer for distilling.""" - import tensorflow as tf + # Create the model with both encoder and decoder inputs + inputs = [encoder_input, decoder_input] - # TensorFlow doesn't have direct circular padding, using 'same' padding - downConv = tf.keras.layers.Conv1D( - filters=d_model, kernel_size=3, padding="same", activation=None - ) - norm = tf.keras.layers.BatchNormalization() - activation = tf.keras.layers.ELU() - maxPool = tf.keras.layers.MaxPool1D(pool_size=3, strides=2, padding="same") - - def conv_layer_function(x, training=None): - # x shape: [B, L, D] -> Conv1D expects [B, L, C] - x = downConv(x) - x = norm(x, training=training) - x = activation(x) - x = maxPool(x) - return x - - return conv_layer_function + return inputs, output diff --git a/aeon/networks/tests/test_informer.py b/aeon/networks/tests/test_informer.py new file mode 100644 index 0000000000..9d5be59351 --- /dev/null +++ b/aeon/networks/tests/test_informer.py @@ -0,0 +1,221 @@ +"""Tests for the Informer Network Model.""" + +import random + +import pytest + +from aeon.networks import InformerNetwork +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +@pytest.mark.parametrize( + "seq_len,label_len,out_len,d_model,n_heads,e_layers,d_layers", + [ + (96, 48, 24, 512, 8, 3, 2), + (48, 24, 12, 256, 4, 2, 1), + (120, 60, 30, 128, 2, 1, 1), + (72, 36, 18, 64, 1, 2, 2), + ], +) +def test_informer_network_init( + seq_len, + label_len, + out_len, + d_model, + n_heads, + e_layers, + d_layers, +): + """Test whether InformerNetwork initializes correctly for various parameters.""" + informer = InformerNetwork( + seq_len=seq_len, + label_len=label_len, + out_len=out_len, + d_model=d_model, + n_heads=n_heads, + 
e_layers=e_layers, + d_layers=d_layers, + factor=random.choice([3, 5, 7]), + dropout=random.choice([0.0, 0.1, 0.2]), + attn=random.choice(["prob", "full"]), + activation=random.choice(["relu", "gelu"]), + ) + + inputs, outputs = informer.build_network((seq_len + label_len, 5)) + assert inputs is not None + assert outputs is not None + assert len(inputs) == 2 # encoder_input and decoder_input + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +@pytest.mark.parametrize( + "attn,activation", + [("prob", "relu"), ("full", "gelu"), ("prob", "gelu"), ("full", "relu")], +) +def test_informer_network_attention_activation(attn, activation): + """Test InformerNetwork with different attention and activation.""" + informer = InformerNetwork( + seq_len=96, + label_len=48, + out_len=24, + d_model=128, + n_heads=4, + e_layers=2, + d_layers=1, + attn=attn, + activation=activation, + ) + + inputs, outputs = informer.build_network((144, 3)) + assert inputs is not None + assert outputs is not None + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +@pytest.mark.parametrize( + "distil,mix,factor", + [(True, True, 5), (False, False, 3), (True, False, 7), (False, True, 2)], +) +def test_informer_network_distil_mix_factor(distil, mix, factor): + """Test whether InformerNetwork works with different configurations.""" + informer = InformerNetwork( + seq_len=48, + label_len=24, + out_len=12, + d_model=64, + n_heads=2, + e_layers=1, + d_layers=1, + distil=distil, + mix=mix, + factor=factor, + ) + + inputs, outputs = informer.build_network((72, 2)) + assert inputs is not None + assert outputs is not None + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +def test_informer_network_output_shape(): + """Test whether InformerNetwork produces correct output shapes.""" + seq_len = 96 + label_len = 48 + out_len = 24 + n_channels = 5 + # batch_size = 32 + + informer = InformerNetwork( + seq_len=seq_len, + label_len=label_len, + out_len=out_len, + d_model=128, + n_heads=4, + e_layers=2, + d_layers=1, + ) + + inputs, outputs = informer.build_network((seq_len + label_len, n_channels)) + + # Create a TensorFlow model to test actual shapes + if _check_soft_dependencies(["tensorflow"], severity="none"): + # keras_model = tf.keras.Model(inputs=inputs, outputs=outputs) + + # Test input shapes + encoder_input_shape = inputs[0].shape + decoder_input_shape = inputs[1].shape + + assert encoder_input_shape[1] == seq_len # sequence length + assert encoder_input_shape[2] == n_channels # number of channels + assert decoder_input_shape[1] == label_len + out_len # decoder sequence length + assert decoder_input_shape[2] == n_channels # number of channels + + # Test output shape + output_shape = outputs.shape + assert output_shape[1] == out_len # prediction length + assert output_shape[2] == n_channels # number of channels + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +def test_informer_network_default_parameters(): + """Test whether InformerNetwork works with default parameters.""" + informer = InformerNetwork() + + inputs, outputs = informer.build_network((120, 1)) + assert inputs is not None + assert outputs is not None + + # Check default values + assert 
informer.seq_len == 96 + assert informer.label_len == 48 + assert informer.out_len == 24 + assert informer.d_model == 512 + assert informer.n_heads == 8 + assert informer.e_layers == 3 + assert informer.d_layers == 2 + assert informer.attn == "prob" + assert informer.activation == "gelu" + assert informer.distil + assert informer.mix + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +def test_informer_network_parameter_validation(): + """Test whether InformerNetwork handles edge case parameters correctly.""" + # Test minimum viable configuration + informer = InformerNetwork( + seq_len=12, + label_len=6, + out_len=3, + d_model=32, + n_heads=1, + e_layers=1, + d_layers=1, + factor=1, + dropout=0.0, + ) + + inputs, outputs = informer.build_network((18, 1)) + assert inputs is not None + assert outputs is not None + + +@pytest.mark.skipif( + not _check_soft_dependencies(["tensorflow"], severity="none"), + reason="Tensorflow soft dependency unavailable.", +) +def test_informer_network_different_channels(): + """Test whether InformerNetwork works with different numbers of input channels.""" + for n_channels in [1, 3, 5, 10]: + informer = InformerNetwork( + seq_len=48, + label_len=24, + out_len=12, + d_model=64, + n_heads=2, + e_layers=1, + d_layers=1, + ) + + inputs, outputs = informer.build_network((72, n_channels)) + assert inputs is not None + assert outputs is not None diff --git a/aeon/utils/networks/attention.py b/aeon/utils/networks/attention.py new file mode 100644 index 0000000000..025a625641 --- /dev/null +++ b/aeon/utils/networks/attention.py @@ -0,0 +1,350 @@ +"""Full Attention, ProbSparseAttention and Attention Layer.""" + +from aeon.utils.validation._dependencies import _check_soft_dependencies + +if _check_soft_dependencies(["tensorflow"], severity="none"): + import numpy as np + import tensorflow as tf + from tensorflow.keras.layers import Dropout, Layer + + +class KerasProbAttention(Layer): + """Keras implementation of ProbSparse Attention mechanism for Informer.""" + + def __init__( + self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, **kwargs + ): + """Initialize KerasProbAttention layer.""" + super().__init__(**kwargs) + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.attention_dropout = attention_dropout + self.dropout = Dropout(attention_dropout) + + def build(self, input_shape): + """Build the layer.""" + super().build(input_shape) + + def compute_output_shape(self, input_shape): + """Compute output shape for the layer.""" + # Return the same shape as queries input + return input_shape[0] # queries shape + + def compute_output_spec(self, input_spec): + """Compute output spec for the layer.""" + return input_spec[0] # Return queries spec + + def _prob_QK(self, Q, K, sample_k, n_top): + """Compute probabilistic QK with fixed dimension handling.""" + B, H, L, _ = tf.shape(Q)[0], tf.shape(Q)[1], tf.shape(Q)[2], tf.shape(Q)[3] + S = tf.shape(K)[2] + + # Ensure sample_k doesn't exceed available dimensions + sample_k = tf.minimum(sample_k, L) + n_top = tf.minimum(n_top, S) # Ensure n_top doesn't exceed sequence length + + # Expand K for sampling + K_expand = tf.expand_dims(K, axis=2) # [B, H, 1, L, E] + K_expand = tf.tile(K_expand, [1, 1, S, 1, 1]) # [B, H, S, L, E] + + # Generate random indices - ensure they're within bounds + indx_q_seq = tf.random.uniform([S], maxval=L, dtype=tf.int32) + indx_k_seq = tf.random.uniform([sample_k], 
maxval=L, dtype=tf.int32) + + # Gather operations for sampling + indices_s = tf.range(S) + K_sample = tf.gather(K_expand, indices_s, axis=2) + K_sample = tf.gather(K_sample, indx_q_seq, axis=2) + K_sample = tf.gather(K_sample, indx_k_seq, axis=3) + + # Matrix multiplication for Q_K_sample + Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, S, 1, E] + K_sample_transposed = tf.transpose(K_sample, perm=[0, 1, 2, 4, 3]) + Q_K_sample = tf.squeeze(tf.matmul(Q_expanded, K_sample_transposed), axis=-2) + + # Sparsity measurement calculation + M_max = tf.reduce_max(Q_K_sample, axis=-1) + M_mean = tf.reduce_sum(Q_K_sample, axis=-1) / tf.cast(sample_k, tf.float32) + M = M_max - M_mean + + # Top-k selection with dynamic k + actual_k = tf.minimum(n_top, tf.shape(M)[-1]) + _, M_top = tf.nn.top_k(M, k=actual_k, sorted=False) + + # Create indices for gather_nd + batch_range = tf.range(B) + head_range = tf.range(H) + batch_indices = tf.tile( + tf.expand_dims(tf.expand_dims(batch_range, 1), 2), [1, H, actual_k] + ) + + head_indices = tf.tile( + tf.expand_dims(tf.expand_dims(head_range, 0), 2), [B, 1, actual_k] + ) + + # Stack indices for gather_nd + idx = tf.stack([batch_indices, head_indices, M_top], axis=-1) + + # Reduce Q and calculate final Q_K + Q_reduce = tf.gather_nd(Q, idx) + K_transposed = tf.transpose(K, perm=[0, 1, 3, 2]) + Q_K = tf.matmul(Q_reduce, K_transposed) + + return Q_K, M_top + + def _get_initial_context(self, V, L_Q): + """Get initial context using Keras-compatible operations.""" + if not self.mask_flag: + # Sum reduction and broadcasting + V_sum = tf.reduce_sum(V, axis=-2) # [B, H, D] + V_sum_expanded = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] + context = tf.tile(V_sum_expanded, [1, 1, L_Q, 1]) # [B, H, L_Q, D] + else: + # Cumulative sum for masked attention + context = tf.cumsum(V, axis=-2) + + return context + + def _create_prob_mask(self, B, H, L, index, scores): + """Create probability mask for tf.where compatibility.""" + # Create base mask with ones + _mask = tf.ones((L, tf.shape(scores)[-1]), dtype=tf.float32) + + # Create upper triangular matrix (including diagonal) + mask_a = tf.linalg.band_part( + _mask, 0, -1 + ) # Upper triangular matrix of 0s and 1s + + # Create diagonal matrix + mask_b = tf.linalg.band_part(_mask, 0, 0) # Diagonal matrix of 0s and 1s + + # Subtract diagonal from upper triangular to get strict upper triangular + _mask = tf.cast(mask_a - mask_b, dtype=tf.float32) + + # Broadcast to [B, H, L, scores.shape[-1]] + _mask_ex = tf.broadcast_to(_mask, [B, H, L, tf.shape(scores)[-1]]) + + # Create indexing tensors + batch_indices = tf.range(B)[:, None, None] + head_indices = tf.range(H)[None, :, None] + + # Extract indicator using advanced indexing + indicator = tf.gather_nd( + _mask_ex, + tf.stack( + [ + tf.broadcast_to(batch_indices, tf.shape(index)), + tf.broadcast_to(head_indices, tf.shape(index)), + index, + ], + axis=-1, + ), + ) + + # Reshape to match scores shape + prob_mask_float = tf.reshape(indicator, tf.shape(scores)) + + # **KEY FIX**: Convert to boolean tensor + prob_mask_bool = tf.cast(prob_mask_float, tf.bool) + + return prob_mask_bool + + def _update_context(self, context_in, V, scores, index, L_Q): + """Update context using Keras-compatible operations.""" + if self.mask_flag: + # Apply simple masking + attn_mask = self._create_prob_mask( + tf.shape(V)[0], tf.shape(V)[1], L_Q, index, scores + ) + + # Apply mask with large negative value + large_neg = -1e9 + mask_value = tf.where(attn_mask, 0.0, large_neg) + scores = scores + mask_value + + # 
Softmax activation + attn = tf.nn.softmax(scores, axis=-1) + attn = self.dropout(attn) + + # Create indices for scatter update + B, H = tf.shape(V)[0], tf.shape(V)[1] + index_shape = tf.shape(index)[-1] + + batch_indices = tf.tile( + tf.expand_dims(tf.expand_dims(tf.range(B), 1), 2), [1, H, index_shape] + ) + + head_indices = tf.tile( + tf.expand_dims(tf.expand_dims(tf.range(H), 0), 2), [B, 1, index_shape] + ) + + idx = tf.stack([batch_indices, head_indices, index], axis=-1) + + # Matrix multiplication and scatter update + attn_V = tf.matmul(attn, V) + context_updated = tf.tensor_scatter_nd_update(context_in, idx, attn_V) + + return context_updated + + def call(self, inputs, attention_mask=None, training=None): + """Run forward pass with fixed tensor operations.""" + queries, keys, values = inputs + + # Get shapes + # B = tf.shape(queries)[0] + L = tf.shape(queries)[1] # sequence length + # H = tf.shape(queries)[2] # number of heads + D = tf.shape(queries)[3] # dimension per head + S = tf.shape(keys)[1] # source sequence length + + # Reshape tensors - transpose to [B, H, L, D] + queries = tf.transpose(queries, perm=[0, 2, 1, 3]) # [B, H, L, D] + keys = tf.transpose(keys, perm=[0, 2, 1, 3]) # [B, H, S, D] + values = tf.transpose(values, perm=[0, 2, 1, 3]) # [B, H, S, D] + + # Calculate sampling parameters with bounds checking + # Use tf.py_function to handle numpy operations safely + def safe_log_calc(seq_len, factor): + if hasattr(seq_len, "numpy"): + return int(factor * np.ceil(np.log(max(seq_len.numpy(), 2)))) + else: + return int(factor * np.ceil(np.log(20))) # fallback + + U = tf.py_function( + func=lambda: safe_log_calc(S, self.factor), inp=[], Tout=tf.int32 + ) + + u = tf.py_function( + func=lambda: safe_log_calc(L, self.factor), inp=[], Tout=tf.int32 + ) + + # Ensure U and u are within reasonable bounds + U = tf.minimum(U, S) # Can't select more than available + u = tf.minimum(u, L) + + # Probabilistic QK computation + scores_top, index = self._prob_QK(queries, keys, u, U) + + # Apply scale factor + scale = self.scale or (1.0 / tf.sqrt(tf.cast(D, tf.float32))) + scores_top = scores_top * scale + + # Get initial context + context = self._get_initial_context(values, L) + + # Update context with selected queries + context = self._update_context(context, values, scores_top, index, L) + + # Transpose back to original format [B, L, H, D] + context = tf.transpose(context, perm=[0, 2, 1, 3]) + + return context + + def get_config(self): + """Return the config of the layer.""" + config = super().get_config() + config.update( + { + "mask_flag": self.mask_flag, + "factor": self.factor, + "scale": self.scale, + "attention_dropout": self.attention_dropout, + } + ) + return config + + @classmethod + def from_config(cls, config): + """Create layer from config.""" + return cls(**config) + + +class AttentionLayer(Layer): + """Keras multi-head attention layer using a custom attention mechanism.""" + + def __init__( + self, attention, d_model, n_heads, d_keys=None, d_values=None, **kwargs + ): + super().__init__(**kwargs) + self.d_keys = d_keys or (d_model // n_heads) + self.d_values = d_values or (d_model // n_heads) + self.d_model = d_model + self.n_heads = n_heads + + # Store the attention mechanism + self.inner_attention = attention + + # Projection layers + self.query_projection = tf.keras.layers.Dense( + self.d_keys * n_heads, name="query_proj" + ) + + self.key_projection = tf.keras.layers.Dense( + self.d_keys * n_heads, name="key_proj" + ) + + self.value_projection = tf.keras.layers.Dense( + 
self.d_values * n_heads, name="value_proj" + ) + + self.out_projection = tf.keras.layers.Dense(d_model, name="output_proj") + + def build(self, input_shape): + """Build the layer.""" + # Build the projection layers + super().build(input_shape) + + def compute_output_shape(self, input_shape): + """Compute output shape for the layer.""" + # Output shape is same as queries input shape but with d_model as last dimension + batch_size, seq_length, _ = input_shape[0] + return (batch_size, seq_length, self.d_model) + + def call(self, inputs, attn_mask=None, training=None): + """Run forward pass for the attention layer.""" + queries, keys, values = inputs + + # Get batch size and sequence lengths dynamically + B = tf.shape(queries)[0] + L = tf.shape(queries)[1] # target sequence length + S = tf.shape(keys)[1] # source sequence length + H = self.n_heads + + # Apply projections + queries_proj = self.query_projection(queries) # [B, L, d_keys * n_heads] + keys_proj = self.key_projection(keys) # [B, S, d_keys * n_heads] + values_proj = self.value_projection(values) # [B, S, d_values * n_heads] + + # Reshape to multi-head format: [B, L/S, H, d_keys/d_values] + queries_reshaped = tf.reshape(queries_proj, (B, L, H, self.d_keys)) + keys_reshaped = tf.reshape(keys_proj, (B, S, H, self.d_keys)) + values_reshaped = tf.reshape(values_proj, (B, S, H, self.d_values)) + + # Apply inner attention mechanism + attention_output = self.inner_attention( + [queries_reshaped, keys_reshaped, values_reshaped], + attention_mask=attn_mask, + training=training, + ) + + # Reshape attention output back to [B, L, H * d_values] + attention_flattened = tf.reshape(attention_output, (B, L, H * self.d_values)) + + # Final output projection + output = self.out_projection(attention_flattened) + + return output + + def get_config(self): + """Return the config of the layer.""" + config = super().get_config() + config.update( + { + "d_model": self.d_model, + "n_heads": self.n_heads, + "d_keys": self.d_keys, + "d_values": self.d_values, + } + ) + return config From 3fb1ef31263cc36d1c2a7c8a57b2bb57832d8532 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:13:27 +0530 Subject: [PATCH 04/10] attention changed --- aeon/utils/networks/attention.py | 653 +++++++++++++++---------------- 1 file changed, 326 insertions(+), 327 deletions(-) diff --git a/aeon/utils/networks/attention.py b/aeon/utils/networks/attention.py index 025a625641..1a724c35ec 100644 --- a/aeon/utils/networks/attention.py +++ b/aeon/utils/networks/attention.py @@ -7,344 +7,343 @@ import tensorflow as tf from tensorflow.keras.layers import Dropout, Layer + class KerasProbAttention(Layer): + """Keras implementation of ProbSparse Attention mechanism for Informer.""" + + def __init__( + self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, **kwargs + ): + """Initialize KerasProbAttention layer.""" + super().__init__(**kwargs) + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.attention_dropout = attention_dropout + self.dropout = Dropout(attention_dropout) + + def build(self, input_shape): + """Build the layer.""" + super().build(input_shape) + + def compute_output_shape(self, input_shape): + """Compute output shape for the layer.""" + # Return the same shape as queries input + return input_shape[0] # queries shape + + def compute_output_spec(self, input_spec): + """Compute output spec for the layer.""" + return input_spec[0] # Return queries spec + + def _prob_QK(self, Q, K, sample_k, n_top): + """Compute probabilistic QK 
with fixed dimension handling.""" + B, H, L, _ = tf.shape(Q)[0], tf.shape(Q)[1], tf.shape(Q)[2], tf.shape(Q)[3] + S = tf.shape(K)[2] + + # Ensure sample_k doesn't exceed available dimensions + sample_k = tf.minimum(sample_k, L) + n_top = tf.minimum(n_top, S) # Ensure n_top doesn't exceed sequence length + + # Expand K for sampling + K_expand = tf.expand_dims(K, axis=2) # [B, H, 1, L, E] + K_expand = tf.tile(K_expand, [1, 1, S, 1, 1]) # [B, H, S, L, E] + + # Generate random indices - ensure they're within bounds + indx_q_seq = tf.random.uniform([S], maxval=L, dtype=tf.int32) + indx_k_seq = tf.random.uniform([sample_k], maxval=L, dtype=tf.int32) + + # Gather operations for sampling + indices_s = tf.range(S) + K_sample = tf.gather(K_expand, indices_s, axis=2) + K_sample = tf.gather(K_sample, indx_q_seq, axis=2) + K_sample = tf.gather(K_sample, indx_k_seq, axis=3) + + # Matrix multiplication for Q_K_sample + Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, S, 1, E] + K_sample_transposed = tf.transpose(K_sample, perm=[0, 1, 2, 4, 3]) + Q_K_sample = tf.squeeze(tf.matmul(Q_expanded, K_sample_transposed), axis=-2) + + # Sparsity measurement calculation + M_max = tf.reduce_max(Q_K_sample, axis=-1) + M_mean = tf.reduce_sum(Q_K_sample, axis=-1) / tf.cast(sample_k, tf.float32) + M = M_max - M_mean + + # Top-k selection with dynamic k + actual_k = tf.minimum(n_top, tf.shape(M)[-1]) + _, M_top = tf.nn.top_k(M, k=actual_k, sorted=False) + + # Create indices for gather_nd + batch_range = tf.range(B) + head_range = tf.range(H) + batch_indices = tf.tile( + tf.expand_dims(tf.expand_dims(batch_range, 1), 2), [1, H, actual_k] + ) -class KerasProbAttention(Layer): - """Keras implementation of ProbSparse Attention mechanism for Informer.""" - - def __init__( - self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, **kwargs - ): - """Initialize KerasProbAttention layer.""" - super().__init__(**kwargs) - self.factor = factor - self.scale = scale - self.mask_flag = mask_flag - self.attention_dropout = attention_dropout - self.dropout = Dropout(attention_dropout) - - def build(self, input_shape): - """Build the layer.""" - super().build(input_shape) - - def compute_output_shape(self, input_shape): - """Compute output shape for the layer.""" - # Return the same shape as queries input - return input_shape[0] # queries shape - - def compute_output_spec(self, input_spec): - """Compute output spec for the layer.""" - return input_spec[0] # Return queries spec - - def _prob_QK(self, Q, K, sample_k, n_top): - """Compute probabilistic QK with fixed dimension handling.""" - B, H, L, _ = tf.shape(Q)[0], tf.shape(Q)[1], tf.shape(Q)[2], tf.shape(Q)[3] - S = tf.shape(K)[2] - - # Ensure sample_k doesn't exceed available dimensions - sample_k = tf.minimum(sample_k, L) - n_top = tf.minimum(n_top, S) # Ensure n_top doesn't exceed sequence length - - # Expand K for sampling - K_expand = tf.expand_dims(K, axis=2) # [B, H, 1, L, E] - K_expand = tf.tile(K_expand, [1, 1, S, 1, 1]) # [B, H, S, L, E] - - # Generate random indices - ensure they're within bounds - indx_q_seq = tf.random.uniform([S], maxval=L, dtype=tf.int32) - indx_k_seq = tf.random.uniform([sample_k], maxval=L, dtype=tf.int32) - - # Gather operations for sampling - indices_s = tf.range(S) - K_sample = tf.gather(K_expand, indices_s, axis=2) - K_sample = tf.gather(K_sample, indx_q_seq, axis=2) - K_sample = tf.gather(K_sample, indx_k_seq, axis=3) - - # Matrix multiplication for Q_K_sample - Q_expanded = tf.expand_dims(Q, axis=-2) # [B, H, S, 1, E] - 
K_sample_transposed = tf.transpose(K_sample, perm=[0, 1, 2, 4, 3]) - Q_K_sample = tf.squeeze(tf.matmul(Q_expanded, K_sample_transposed), axis=-2) - - # Sparsity measurement calculation - M_max = tf.reduce_max(Q_K_sample, axis=-1) - M_mean = tf.reduce_sum(Q_K_sample, axis=-1) / tf.cast(sample_k, tf.float32) - M = M_max - M_mean - - # Top-k selection with dynamic k - actual_k = tf.minimum(n_top, tf.shape(M)[-1]) - _, M_top = tf.nn.top_k(M, k=actual_k, sorted=False) - - # Create indices for gather_nd - batch_range = tf.range(B) - head_range = tf.range(H) - batch_indices = tf.tile( - tf.expand_dims(tf.expand_dims(batch_range, 1), 2), [1, H, actual_k] - ) - - head_indices = tf.tile( - tf.expand_dims(tf.expand_dims(head_range, 0), 2), [B, 1, actual_k] - ) - - # Stack indices for gather_nd - idx = tf.stack([batch_indices, head_indices, M_top], axis=-1) - - # Reduce Q and calculate final Q_K - Q_reduce = tf.gather_nd(Q, idx) - K_transposed = tf.transpose(K, perm=[0, 1, 3, 2]) - Q_K = tf.matmul(Q_reduce, K_transposed) - - return Q_K, M_top - - def _get_initial_context(self, V, L_Q): - """Get initial context using Keras-compatible operations.""" - if not self.mask_flag: - # Sum reduction and broadcasting - V_sum = tf.reduce_sum(V, axis=-2) # [B, H, D] - V_sum_expanded = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] - context = tf.tile(V_sum_expanded, [1, 1, L_Q, 1]) # [B, H, L_Q, D] - else: - # Cumulative sum for masked attention - context = tf.cumsum(V, axis=-2) - - return context - - def _create_prob_mask(self, B, H, L, index, scores): - """Create probability mask for tf.where compatibility.""" - # Create base mask with ones - _mask = tf.ones((L, tf.shape(scores)[-1]), dtype=tf.float32) - - # Create upper triangular matrix (including diagonal) - mask_a = tf.linalg.band_part( - _mask, 0, -1 - ) # Upper triangular matrix of 0s and 1s - - # Create diagonal matrix - mask_b = tf.linalg.band_part(_mask, 0, 0) # Diagonal matrix of 0s and 1s - - # Subtract diagonal from upper triangular to get strict upper triangular - _mask = tf.cast(mask_a - mask_b, dtype=tf.float32) - - # Broadcast to [B, H, L, scores.shape[-1]] - _mask_ex = tf.broadcast_to(_mask, [B, H, L, tf.shape(scores)[-1]]) - - # Create indexing tensors - batch_indices = tf.range(B)[:, None, None] - head_indices = tf.range(H)[None, :, None] - - # Extract indicator using advanced indexing - indicator = tf.gather_nd( - _mask_ex, - tf.stack( - [ - tf.broadcast_to(batch_indices, tf.shape(index)), - tf.broadcast_to(head_indices, tf.shape(index)), - index, - ], - axis=-1, - ), - ) - - # Reshape to match scores shape - prob_mask_float = tf.reshape(indicator, tf.shape(scores)) - - # **KEY FIX**: Convert to boolean tensor - prob_mask_bool = tf.cast(prob_mask_float, tf.bool) - - return prob_mask_bool - - def _update_context(self, context_in, V, scores, index, L_Q): - """Update context using Keras-compatible operations.""" - if self.mask_flag: - # Apply simple masking - attn_mask = self._create_prob_mask( - tf.shape(V)[0], tf.shape(V)[1], L_Q, index, scores + head_indices = tf.tile( + tf.expand_dims(tf.expand_dims(head_range, 0), 2), [B, 1, actual_k] ) - # Apply mask with large negative value - large_neg = -1e9 - mask_value = tf.where(attn_mask, 0.0, large_neg) - scores = scores + mask_value + # Stack indices for gather_nd + idx = tf.stack([batch_indices, head_indices, M_top], axis=-1) - # Softmax activation - attn = tf.nn.softmax(scores, axis=-1) - attn = self.dropout(attn) + # Reduce Q and calculate final Q_K + Q_reduce = tf.gather_nd(Q, idx) + 
K_transposed = tf.transpose(K, perm=[0, 1, 3, 2]) + Q_K = tf.matmul(Q_reduce, K_transposed) - # Create indices for scatter update - B, H = tf.shape(V)[0], tf.shape(V)[1] - index_shape = tf.shape(index)[-1] + return Q_K, M_top - batch_indices = tf.tile( - tf.expand_dims(tf.expand_dims(tf.range(B), 1), 2), [1, H, index_shape] - ) + def _get_initial_context(self, V, L_Q): + """Get initial context using Keras-compatible operations.""" + if not self.mask_flag: + # Sum reduction and broadcasting + V_sum = tf.reduce_sum(V, axis=-2) # [B, H, D] + V_sum_expanded = tf.expand_dims(V_sum, axis=-2) # [B, H, 1, D] + context = tf.tile(V_sum_expanded, [1, 1, L_Q, 1]) # [B, H, L_Q, D] + else: + # Cumulative sum for masked attention + context = tf.cumsum(V, axis=-2) + + return context + + def _create_prob_mask(self, B, H, L, index, scores): + """Create probability mask for tf.where compatibility.""" + # Create base mask with ones + _mask = tf.ones((L, tf.shape(scores)[-1]), dtype=tf.float32) + + # Create upper triangular matrix (including diagonal) + mask_a = tf.linalg.band_part( + _mask, 0, -1 + ) # Upper triangular matrix of 0s and 1s + + # Create diagonal matrix + mask_b = tf.linalg.band_part(_mask, 0, 0) # Diagonal matrix of 0s and 1s + + # Subtract diagonal from upper triangular to get strict upper triangular + _mask = tf.cast(mask_a - mask_b, dtype=tf.float32) + + # Broadcast to [B, H, L, scores.shape[-1]] + _mask_ex = tf.broadcast_to(_mask, [B, H, L, tf.shape(scores)[-1]]) + + # Create indexing tensors + batch_indices = tf.range(B)[:, None, None] + head_indices = tf.range(H)[None, :, None] + + # Extract indicator using advanced indexing + indicator = tf.gather_nd( + _mask_ex, + tf.stack( + [ + tf.broadcast_to(batch_indices, tf.shape(index)), + tf.broadcast_to(head_indices, tf.shape(index)), + index, + ], + axis=-1, + ), + ) - head_indices = tf.tile( - tf.expand_dims(tf.expand_dims(tf.range(H), 0), 2), [B, 1, index_shape] - ) + # Reshape to match scores shape + prob_mask_float = tf.reshape(indicator, tf.shape(scores)) - idx = tf.stack([batch_indices, head_indices, index], axis=-1) + # **KEY FIX**: Convert to boolean tensor + prob_mask_bool = tf.cast(prob_mask_float, tf.bool) - # Matrix multiplication and scatter update - attn_V = tf.matmul(attn, V) - context_updated = tf.tensor_scatter_nd_update(context_in, idx, attn_V) + return prob_mask_bool - return context_updated + def _update_context(self, context_in, V, scores, index, L_Q): + """Update context using Keras-compatible operations.""" + if self.mask_flag: + # Apply simple masking + attn_mask = self._create_prob_mask( + tf.shape(V)[0], tf.shape(V)[1], L_Q, index, scores + ) - def call(self, inputs, attention_mask=None, training=None): - """Run forward pass with fixed tensor operations.""" - queries, keys, values = inputs + # Apply mask with large negative value + large_neg = -1e9 + mask_value = tf.where(attn_mask, 0.0, large_neg) + scores = scores + mask_value - # Get shapes - # B = tf.shape(queries)[0] - L = tf.shape(queries)[1] # sequence length - # H = tf.shape(queries)[2] # number of heads - D = tf.shape(queries)[3] # dimension per head - S = tf.shape(keys)[1] # source sequence length + # Softmax activation + attn = tf.nn.softmax(scores, axis=-1) + attn = self.dropout(attn) - # Reshape tensors - transpose to [B, H, L, D] - queries = tf.transpose(queries, perm=[0, 2, 1, 3]) # [B, H, L, D] - keys = tf.transpose(keys, perm=[0, 2, 1, 3]) # [B, H, S, D] - values = tf.transpose(values, perm=[0, 2, 1, 3]) # [B, H, S, D] + # Create indices for scatter 
update + B, H = tf.shape(V)[0], tf.shape(V)[1] + index_shape = tf.shape(index)[-1] - # Calculate sampling parameters with bounds checking - # Use tf.py_function to handle numpy operations safely - def safe_log_calc(seq_len, factor): - if hasattr(seq_len, "numpy"): - return int(factor * np.ceil(np.log(max(seq_len.numpy(), 2)))) - else: - return int(factor * np.ceil(np.log(20))) # fallback - - U = tf.py_function( - func=lambda: safe_log_calc(S, self.factor), inp=[], Tout=tf.int32 - ) - - u = tf.py_function( - func=lambda: safe_log_calc(L, self.factor), inp=[], Tout=tf.int32 - ) - - # Ensure U and u are within reasonable bounds - U = tf.minimum(U, S) # Can't select more than available - u = tf.minimum(u, L) - - # Probabilistic QK computation - scores_top, index = self._prob_QK(queries, keys, u, U) - - # Apply scale factor - scale = self.scale or (1.0 / tf.sqrt(tf.cast(D, tf.float32))) - scores_top = scores_top * scale - - # Get initial context - context = self._get_initial_context(values, L) - - # Update context with selected queries - context = self._update_context(context, values, scores_top, index, L) - - # Transpose back to original format [B, L, H, D] - context = tf.transpose(context, perm=[0, 2, 1, 3]) - - return context - - def get_config(self): - """Return the config of the layer.""" - config = super().get_config() - config.update( - { - "mask_flag": self.mask_flag, - "factor": self.factor, - "scale": self.scale, - "attention_dropout": self.attention_dropout, - } - ) - return config - - @classmethod - def from_config(cls, config): - """Create layer from config.""" - return cls(**config) - - -class AttentionLayer(Layer): - """Keras multi-head attention layer using a custom attention mechanism.""" - - def __init__( - self, attention, d_model, n_heads, d_keys=None, d_values=None, **kwargs - ): - super().__init__(**kwargs) - self.d_keys = d_keys or (d_model // n_heads) - self.d_values = d_values or (d_model // n_heads) - self.d_model = d_model - self.n_heads = n_heads - - # Store the attention mechanism - self.inner_attention = attention - - # Projection layers - self.query_projection = tf.keras.layers.Dense( - self.d_keys * n_heads, name="query_proj" - ) - - self.key_projection = tf.keras.layers.Dense( - self.d_keys * n_heads, name="key_proj" - ) - - self.value_projection = tf.keras.layers.Dense( - self.d_values * n_heads, name="value_proj" - ) - - self.out_projection = tf.keras.layers.Dense(d_model, name="output_proj") - - def build(self, input_shape): - """Build the layer.""" - # Build the projection layers - super().build(input_shape) - - def compute_output_shape(self, input_shape): - """Compute output shape for the layer.""" - # Output shape is same as queries input shape but with d_model as last dimension - batch_size, seq_length, _ = input_shape[0] - return (batch_size, seq_length, self.d_model) - - def call(self, inputs, attn_mask=None, training=None): - """Run forward pass for the attention layer.""" - queries, keys, values = inputs - - # Get batch size and sequence lengths dynamically - B = tf.shape(queries)[0] - L = tf.shape(queries)[1] # target sequence length - S = tf.shape(keys)[1] # source sequence length - H = self.n_heads - - # Apply projections - queries_proj = self.query_projection(queries) # [B, L, d_keys * n_heads] - keys_proj = self.key_projection(keys) # [B, S, d_keys * n_heads] - values_proj = self.value_projection(values) # [B, S, d_values * n_heads] - - # Reshape to multi-head format: [B, L/S, H, d_keys/d_values] - queries_reshaped = tf.reshape(queries_proj, (B, 
L, H, self.d_keys)) - keys_reshaped = tf.reshape(keys_proj, (B, S, H, self.d_keys)) - values_reshaped = tf.reshape(values_proj, (B, S, H, self.d_values)) - - # Apply inner attention mechanism - attention_output = self.inner_attention( - [queries_reshaped, keys_reshaped, values_reshaped], - attention_mask=attn_mask, - training=training, - ) - - # Reshape attention output back to [B, L, H * d_values] - attention_flattened = tf.reshape(attention_output, (B, L, H * self.d_values)) - - # Final output projection - output = self.out_projection(attention_flattened) - - return output - - def get_config(self): - """Return the config of the layer.""" - config = super().get_config() - config.update( - { - "d_model": self.d_model, - "n_heads": self.n_heads, - "d_keys": self.d_keys, - "d_values": self.d_values, - } - ) - return config + batch_indices = tf.tile( + tf.expand_dims(tf.expand_dims(tf.range(B), 1), 2), [1, H, index_shape] + ) + + head_indices = tf.tile( + tf.expand_dims(tf.expand_dims(tf.range(H), 0), 2), [B, 1, index_shape] + ) + + idx = tf.stack([batch_indices, head_indices, index], axis=-1) + + # Matrix multiplication and scatter update + attn_V = tf.matmul(attn, V) + context_updated = tf.tensor_scatter_nd_update(context_in, idx, attn_V) + + return context_updated + + def call(self, inputs, attention_mask=None, training=None): + """Run forward pass with fixed tensor operations.""" + queries, keys, values = inputs + + # Get shapes + # B = tf.shape(queries)[0] + L = tf.shape(queries)[1] # sequence length + # H = tf.shape(queries)[2] # number of heads + D = tf.shape(queries)[3] # dimension per head + S = tf.shape(keys)[1] # source sequence length + + # Reshape tensors - transpose to [B, H, L, D] + queries = tf.transpose(queries, perm=[0, 2, 1, 3]) # [B, H, L, D] + keys = tf.transpose(keys, perm=[0, 2, 1, 3]) # [B, H, S, D] + values = tf.transpose(values, perm=[0, 2, 1, 3]) # [B, H, S, D] + + # Calculate sampling parameters with bounds checking + # Use tf.py_function to handle numpy operations safely + def safe_log_calc(seq_len, factor): + if hasattr(seq_len, "numpy"): + return int(factor * np.ceil(np.log(max(seq_len.numpy(), 2)))) + else: + return int(factor * np.ceil(np.log(20))) # fallback + + U = tf.py_function( + func=lambda: safe_log_calc(S, self.factor), inp=[], Tout=tf.int32 + ) + + u = tf.py_function( + func=lambda: safe_log_calc(L, self.factor), inp=[], Tout=tf.int32 + ) + + # Ensure U and u are within reasonable bounds + U = tf.minimum(U, S) # Can't select more than available + u = tf.minimum(u, L) + + # Probabilistic QK computation + scores_top, index = self._prob_QK(queries, keys, u, U) + + # Apply scale factor + scale = self.scale or (1.0 / tf.sqrt(tf.cast(D, tf.float32))) + scores_top = scores_top * scale + + # Get initial context + context = self._get_initial_context(values, L) + + # Update context with selected queries + context = self._update_context(context, values, scores_top, index, L) + + # Transpose back to original format [B, L, H, D] + context = tf.transpose(context, perm=[0, 2, 1, 3]) + + return context + + def get_config(self): + """Return the config of the layer.""" + config = super().get_config() + config.update( + { + "mask_flag": self.mask_flag, + "factor": self.factor, + "scale": self.scale, + "attention_dropout": self.attention_dropout, + } + ) + return config + + @classmethod + def from_config(cls, config): + """Create layer from config.""" + return cls(**config) + + class AttentionLayer(Layer): + """Keras multi-head attention layer using a custom attention 
mechanism.""" + + def __init__( + self, attention, d_model, n_heads, d_keys=None, d_values=None, **kwargs + ): + super().__init__(**kwargs) + self.d_keys = d_keys or (d_model // n_heads) + self.d_values = d_values or (d_model // n_heads) + self.d_model = d_model + self.n_heads = n_heads + + # Store the attention mechanism + self.inner_attention = attention + + # Projection layers + self.query_projection = tf.keras.layers.Dense( + self.d_keys * n_heads, name="query_proj" + ) + + self.key_projection = tf.keras.layers.Dense( + self.d_keys * n_heads, name="key_proj" + ) + + self.value_projection = tf.keras.layers.Dense( + self.d_values * n_heads, name="value_proj" + ) + + self.out_projection = tf.keras.layers.Dense(d_model, name="output_proj") + + def build(self, input_shape): + """Build the layer.""" + # Build the projection layers + super().build(input_shape) + + def compute_output_shape(self, input_shape): + """Compute output shape for the layer.""" + batch_size, seq_length, _ = input_shape[0] + return (batch_size, seq_length, self.d_model) + + def call(self, inputs, attn_mask=None, training=None): + """Run forward pass for the attention layer.""" + queries, keys, values = inputs + + # Get batch size and sequence lengths dynamically + B = tf.shape(queries)[0] + L = tf.shape(queries)[1] # target sequence length + S = tf.shape(keys)[1] # source sequence length + H = self.n_heads + + # Apply projections + queries_proj = self.query_projection(queries) # [B, L, d_keys * n_heads] + keys_proj = self.key_projection(keys) # [B, S, d_keys * n_heads] + values_proj = self.value_projection(values) # [B, S, d_values * n_heads] + + # Reshape to multi-head format: [B, L/S, H, d_keys/d_values] + queries_reshaped = tf.reshape(queries_proj, (B, L, H, self.d_keys)) + keys_reshaped = tf.reshape(keys_proj, (B, S, H, self.d_keys)) + values_reshaped = tf.reshape(values_proj, (B, S, H, self.d_values)) + + # Apply inner attention mechanism + attention_output = self.inner_attention( + [queries_reshaped, keys_reshaped, values_reshaped], + attention_mask=attn_mask, + training=training, + ) + + # Reshape attention output back to [B, L, H * d_values] + attention_flattened = tf.reshape( + attention_output, (B, L, H * self.d_values) + ) + + # Final output projection + output = self.out_projection(attention_flattened) + + return output + + def get_config(self): + """Return the config of the layer.""" + config = super().get_config() + config.update( + { + "d_model": self.d_model, + "n_heads": self.n_heads, + "d_keys": self.d_keys, + "d_values": self.d_values, + } + ) + return config From b77165a38d01aefab5e3bb26d51ba35116e44716 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:18:58 +0530 Subject: [PATCH 05/10] attention layers made as serializable aeon package --- aeon/utils/networks/attention.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aeon/utils/networks/attention.py b/aeon/utils/networks/attention.py index 1a724c35ec..5201bc5134 100644 --- a/aeon/utils/networks/attention.py +++ b/aeon/utils/networks/attention.py @@ -7,6 +7,7 @@ import tensorflow as tf from tensorflow.keras.layers import Dropout, Layer + @tf.keras.utils.register_keras_serializable(package="aeon") class KerasProbAttention(Layer): """Keras implementation of ProbSparse Attention mechanism for Informer.""" @@ -258,6 +259,7 @@ def from_config(cls, config): """Create layer from config.""" return cls(**config) + @tf.keras.utils.register_keras_serializable(package="aeon") class AttentionLayer(Layer): """Keras multi-head attention 
layer using a custom attention mechanism.""" From c9b46637fec5fd04f6bbb2c212c844ee9c928305 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:20:47 +0530 Subject: [PATCH 06/10] informer net --- aeon/networks/_informer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index dfbf4b1bfc..b30b594c62 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -65,7 +65,7 @@ class InformerNetwork(BaseDeepLearningNetwork): _config = { "python_dependencies": ["tensorflow"], "python_version": "<3.13", - "structure": "encoder-decoder", + "structure": "auto-encoder", } def __init__( From 043da23490651a13e0083c94db684446c58f6764 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:39:41 +0530 Subject: [PATCH 07/10] core import check --- aeon/networks/_informer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index b30b594c62..99e64fcf77 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -5,15 +5,16 @@ from typing import Optional from aeon.networks.base import BaseDeepLearningNetwork -from aeon.utils.networks.attention import ( - AttentionLayer, - KerasProbAttention, -) from aeon.utils.validation._dependencies import _check_soft_dependencies if _check_soft_dependencies(["tensorflow"], severity="none"): import tensorflow as tf + from aeon.utils.networks.attention import ( + AttentionLayer, + KerasProbAttention, + ) + class InformerNetwork(BaseDeepLearningNetwork): """ From e18f73590338b509195e7e7d8d419155f328a924 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Mon, 21 Jul 2025 18:42:23 +0530 Subject: [PATCH 08/10] check tf dep --- aeon/networks/_informer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index 99e64fcf77..00d79aaff5 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -8,7 +8,6 @@ from aeon.utils.validation._dependencies import _check_soft_dependencies if _check_soft_dependencies(["tensorflow"], severity="none"): - import tensorflow as tf from aeon.utils.networks.attention import ( AttentionLayer, @@ -69,6 +68,8 @@ class InformerNetwork(BaseDeepLearningNetwork): "structure": "auto-encoder", } + import tensorflow as tf + def __init__( self, seq_len: int = 96, From 2830627aaaab2acd598f4d4099453a815552ceea Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Tue, 22 Jul 2025 01:19:57 +0530 Subject: [PATCH 09/10] tests corrected --- aeon/networks/_informer.py | 230 +++++++++++++---------- aeon/networks/tests/test_all_networks.py | 5 + aeon/networks/tests/test_informer.py | 44 ----- 3 files changed, 134 insertions(+), 145 deletions(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index 00d79aaff5..f5b81c6ac3 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -2,7 +2,6 @@ __maintainer__ = [""] -from typing import Optional from aeon.networks.base import BaseDeepLearningNetwork from aeon.utils.validation._dependencies import _check_soft_dependencies @@ -65,11 +64,9 @@ class InformerNetwork(BaseDeepLearningNetwork): _config = { "python_dependencies": ["tensorflow"], "python_version": "<3.13", - "structure": "auto-encoder", + "structure": "transformer", } - import tensorflow as tf - def __init__( self, seq_len: int = 96, @@ -104,9 +101,7 @@ def __init__( super().__init__() - def _token_embedding( - self, input_tensor: 
tf.Tensor, c_in: int, d_model: int - ) -> tf.Tensor: + def _token_embedding(self, input_tensor, c_in, d_model): """ Token embedding layer using 1D convolution with causal padding. @@ -132,9 +127,7 @@ def _token_embedding( x = tf.keras.layers.LeakyReLU()(x) return x - def _positional_embedding( - self, input_tensor: tf.Tensor, d_model: int, max_len: int = 5000 - ) -> tf.Tensor: + def _positional_embedding(self, input_tensor, d_model, max_len=5000): """ Positional embedding layer that computes positional encodings. @@ -175,12 +168,12 @@ def _positional_embedding( def _data_embedding( self, - input_tensor: tf.Tensor, - c_in: int, - d_model: int, - dropout: float = 0.1, - max_len: int = 5000, - ) -> tf.Tensor: + input_tensor, + c_in, + d_model, + dropout=0.1, + max_len=5000, + ): """ Combine token and positional embeddings for the input tensor. @@ -218,7 +211,7 @@ def _data_embedding( return x - def _conv_layer(self, input_tensor: tf.Tensor, c_in: int) -> tf.Tensor: + def _conv_layer(self, input_tensor, c_in): """ Convolutional layer with batch normalization, ELU, and max pooling. @@ -254,15 +247,15 @@ def _conv_layer(self, input_tensor: tf.Tensor, c_in: int) -> tf.Tensor: def _attention_out( self, - input_tensor: tf.Tensor, - attention_type: str, - mask_flag: bool, - d_model: int, - n_heads: int, - factor: int = 5, - dropout: float = 0.1, - attn_mask: Optional[tf.Tensor] = None, - ) -> tf.Tensor: + input_tensor, + attention_type, + mask_flag, + d_model, + n_heads, + factor=5, + dropout=0.1, + attn_mask=None, + ): """ Attention output layer applying either ProbAttention or FullAttention. @@ -327,17 +320,17 @@ def _attention_out( def _encoder_layer( self, - input_tensor: tf.Tensor, - d_model: int, - d_ff: Optional[int] = None, - dropout: float = 0.1, - activation: str = "relu", - attn_mask: Optional[tf.Tensor] = None, - attention_type: str = "prob", - mask_flag: bool = True, - n_heads: int = 8, - factor: int = 5, - ) -> tf.Tensor: + input_tensor, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + attn_mask=None, + attention_type="prob", + mask_flag=True, + n_heads=8, + factor=5, + ): """ Apply encoder layer with multi-head attention and feed-forward network. @@ -415,21 +408,21 @@ def _encoder_layer( def _encoder( self, - input_tensor: tf.Tensor, - e_layers: int, - d_model: int, - d_ff: Optional[int] = None, - dropout: float = 0.1, - activation: str = "relu", - attn_mask: Optional[tf.Tensor] = None, - attention_type: str = "prob", - mask_flag: bool = True, - n_heads: int = 8, - factor: int = 5, - use_conv_layers: bool = False, - c_in: Optional[int] = None, - use_norm: bool = True, - ) -> tf.Tensor: + input_tensor, + e_layers, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + attn_mask=None, + attention_type="prob", + mask_flag=True, + n_heads=8, + factor=5, + use_conv_layers=False, + c_in=None, + use_norm=True, + ): """ Apply encoder stack with multiple encoder layers and optional conv layers. 
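# A minimal standalone sketch (illustrative helper, not part of this module) of the
# ProbSparse sampling sizes used by the attention code above: for query length L_Q and
# key length L_K, roughly factor * ceil(ln(L)) keys are sampled and the same number of
# "active" queries are kept, which is what reduces the attention cost from O(L^2)
# toward O(L log L). Names below (prob_sparse_sizes, U_part, u) are illustrative only.
import numpy as np

def prob_sparse_sizes(L_Q, L_K, factor=5):
    U_part = min(int(factor * np.ceil(np.log(L_K))), L_K)  # number of sampled keys
    u = min(int(factor * np.ceil(np.log(L_Q))), L_Q)       # number of retained "top" queries
    return U_part, u

print(prob_sparse_sizes(96, 96))  # with the default seq_len=96 and factor=5 -> (25, 25)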
@@ -539,20 +532,20 @@ def _encoder( def _decoder_layer( self, - input_tensor: tf.Tensor, - cross_tensor: tf.Tensor, - d_model: int, - d_ff: Optional[int] = None, - dropout: float = 0.1, - activation: str = "relu", - x_mask: Optional[tf.Tensor] = None, - cross_mask: Optional[tf.Tensor] = None, - self_attention_type: str = "prob", - cross_attention_type: str = "prob", - mask_flag: bool = True, - n_heads: int = 8, - factor: int = 5, - ) -> tf.Tensor: + input_tensor, + cross_tensor, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + x_mask=None, + cross_mask=None, + self_attention_type="prob", + cross_attention_type="prob", + mask_flag=True, + n_heads=8, + factor=5, + ): """ Apply decoder layer with self-attention, cross-attention, and FFN. @@ -661,22 +654,22 @@ def _decoder_layer( def _decoder( self, - input_tensor: tf.Tensor, - cross_tensor: tf.Tensor, - d_layers: int, - d_model: int, - d_ff: Optional[int] = None, - dropout: float = 0.1, - activation: str = "relu", - x_mask: Optional[tf.Tensor] = None, - cross_mask: Optional[tf.Tensor] = None, - self_attention_type: str = "prob", - cross_attention_type: str = "prob", - mask_flag: bool = True, - n_heads: int = 8, - factor: int = 5, - use_norm: bool = True, - ) -> tf.Tensor: + input_tensor, + cross_tensor, + d_layers, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + x_mask=None, + cross_mask=None, + self_attention_type="prob", + cross_attention_type="prob", + mask_flag=True, + n_heads=8, + factor=5, + use_norm=True, + ): """ Apply decoder stack with multiple decoder layers and optional normalization. @@ -746,29 +739,67 @@ def _decoder( return x - def build_network( - self, input_shape: tuple[int, int], **kwargs - ) -> tuple[list[tf.Tensor], tf.Tensor]: + def _preprocess_time_series(self, data, seq_len, label_len, pred_len): + """ + Preprocess time series data of shape (None, n_timepoints, n_channels). 
+ + Parameters + ---------- + data : tf.Tensor + Input tensor of shape (None, n_timepoints, n_channels) + seq_len : int + Encoder input sequence length + label_len : int + Known decoder input length + pred_len : int + Prediction length + + Returns + ------- + tuple + (x_enc, x_dec) where: + - x_enc: Encoder input tensor of shape (None, seq_len, n_channels) + - x_dec: Decoder input tensor of shape (None, label_len + pred_len + , n_channels) + """ + import tensorflow as tf + + # Get tensor dimensions - handle None batch dimension + batch_size, n_timepoints, n_channels = data.shape + + # Encoder input: first seq_len timepoints + x_enc = data[:, :seq_len, :] # (None, seq_len, n_channels) + + # Decoder input construction + x_dec_known = data[ + :, seq_len - label_len : seq_len, : + ] # (None, label_len, n_channels) + + # Unknown part: zeros for prediction horizon + x_dec_pred = data[:, :pred_len, :] + + # Concatenate known and prediction parts + x_dec = tf.keras.layers.Concatenate(axis=1)([x_dec_known, x_dec_pred]) + + return x_enc, x_dec + + def build_network(self, input_shape, **kwargs): """Build the complete Informer architecture for time series forecasting.""" import tensorflow as tf # Get input dimensions n_timepoints, n_channels = input_shape - # hardcode batch_size for now - batch_size = 32 - - # Create input layers for encoder and decoder - encoder_input = tf.keras.layers.Input( - shape=(self.seq_len, n_channels), - name="encoder_input", - batch_size=batch_size, + input_data = tf.keras.layers.Input( + shape=input_shape, + name="time_series_input", ) - decoder_input = tf.keras.layers.Input( - shape=(self.label_len + self.out_len, n_channels), - name="decoder_input", - batch_size=batch_size, + encoder_input, decoder_input = self._preprocess_time_series( + data=input_data, + seq_len=self.seq_len, + label_len=self.label_len, + pred_len=self.out_len, ) # Encoder embedding @@ -829,7 +860,4 @@ def build_network( # Extract only the prediction part (last out_len timesteps) output = output[:, -self.out_len :, :] - # Create the model with both encoder and decoder inputs - inputs = [encoder_input, decoder_input] - - return inputs, output + return input_data, output diff --git a/aeon/networks/tests/test_all_networks.py b/aeon/networks/tests/test_all_networks.py index 9ca85474fb..924e1f2623 100644 --- a/aeon/networks/tests/test_all_networks.py +++ b/aeon/networks/tests/test_all_networks.py @@ -75,6 +75,11 @@ def test_all_networks_params(network): f"{network.__name__} not to be tested (AE networks have their own tests)." ) + if network._config["structure"] == "transformer": + pytest.skip( + f"{network.__name__} not to be tested (transformers have their own tests)." 
+ ) + if not ( _check_soft_dependencies( network._config["python_dependencies"], severity="none" diff --git a/aeon/networks/tests/test_informer.py b/aeon/networks/tests/test_informer.py index 9d5be59351..9e472f77f8 100644 --- a/aeon/networks/tests/test_informer.py +++ b/aeon/networks/tests/test_informer.py @@ -48,7 +48,6 @@ def test_informer_network_init( inputs, outputs = informer.build_network((seq_len + label_len, 5)) assert inputs is not None assert outputs is not None - assert len(inputs) == 2 # encoder_input and decoder_input @pytest.mark.skipif( @@ -106,49 +105,6 @@ def test_informer_network_distil_mix_factor(distil, mix, factor): assert outputs is not None -@pytest.mark.skipif( - not _check_soft_dependencies(["tensorflow"], severity="none"), - reason="Tensorflow soft dependency unavailable.", -) -def test_informer_network_output_shape(): - """Test whether InformerNetwork produces correct output shapes.""" - seq_len = 96 - label_len = 48 - out_len = 24 - n_channels = 5 - # batch_size = 32 - - informer = InformerNetwork( - seq_len=seq_len, - label_len=label_len, - out_len=out_len, - d_model=128, - n_heads=4, - e_layers=2, - d_layers=1, - ) - - inputs, outputs = informer.build_network((seq_len + label_len, n_channels)) - - # Create a TensorFlow model to test actual shapes - if _check_soft_dependencies(["tensorflow"], severity="none"): - # keras_model = tf.keras.Model(inputs=inputs, outputs=outputs) - - # Test input shapes - encoder_input_shape = inputs[0].shape - decoder_input_shape = inputs[1].shape - - assert encoder_input_shape[1] == seq_len # sequence length - assert encoder_input_shape[2] == n_channels # number of channels - assert decoder_input_shape[1] == label_len + out_len # decoder sequence length - assert decoder_input_shape[2] == n_channels # number of channels - - # Test output shape - output_shape = outputs.shape - assert output_shape[1] == out_len # prediction length - assert output_shape[2] == n_channels # number of channels - - @pytest.mark.skipif( not _check_soft_dependencies(["tensorflow"], severity="none"), reason="Tensorflow soft dependency unavailable.", From d86771f80a98a0fb41571b421f536cccd3c2dd94 Mon Sep 17 00:00:00 2001 From: lucifer4073 Date: Tue, 22 Jul 2025 01:56:23 +0530 Subject: [PATCH 10/10] informer updated --- aeon/networks/_informer.py | 284 ++++++++++++++------------- aeon/networks/tests/test_informer.py | 115 +++++------ 2 files changed, 203 insertions(+), 196 deletions(-) diff --git a/aeon/networks/_informer.py b/aeon/networks/_informer.py index f5b81c6ac3..f02c058afa 100644 --- a/aeon/networks/_informer.py +++ b/aeon/networks/_informer.py @@ -24,27 +24,27 @@ class InformerNetwork(BaseDeepLearningNetwork): Parameters ---------- - seq_len : int, default=96 - Input sequence length. - label_len : int, default=48 + encoder_input_len : int, default=96 + Encoder input sequence length. + decoder_input_len : int, default=48 Start token length for decoder. - out_len : int, default=24 + prediction_horizon : int, default=24 Prediction sequence length. factor : int, default=5 ProbSparse attention factor. - d_model : int, default=512 + model_dimension : int, default=512 Model dimension. - n_heads : int, default=8 + num_attention_heads : int, default=8 Number of attention heads. - e_layers : int, default=3 + encoder_layers : int, default=3 Number of encoder layers. - d_layers : int, default=2 + decoder_layers : int, default=2 Number of decoder layers. - d_ff : int, default=512 + feedforward_dim : int, default=512 Feed forward network dimension. 
dropout : float, default=0.0 Dropout rate. - attn : str, default='prob' + attention_type : str, default='prob' Attention mechanism type ('prob' or 'full'). activation : str, default='gelu' Activation function. @@ -69,39 +69,39 @@ class InformerNetwork(BaseDeepLearningNetwork): def __init__( self, - seq_len: int = 96, - label_len: int = 48, - out_len: int = 24, + encoder_input_len: int = 96, + decoder_input_len: int = 48, + prediction_horizon: int = 24, factor: int = 5, - d_model: int = 512, - n_heads: int = 8, - e_layers: int = 3, - d_layers: int = 2, - d_ff: int = 512, + model_dimension: int = 512, + num_attention_heads: int = 8, + encoder_layers: int = 3, + decoder_layers: int = 2, + feedforward_dim: int = 512, dropout: float = 0.0, - attn: str = "prob", + attention_type: str = "prob", activation: str = "gelu", distil: bool = True, mix: bool = True, ): - self.seq_len = seq_len - self.label_len = label_len - self.out_len = out_len + self.encoder_input_len = encoder_input_len + self.decoder_input_len = decoder_input_len + self.prediction_horizon = prediction_horizon self.factor = factor - self.d_model = d_model - self.n_heads = n_heads - self.e_layers = e_layers - self.d_layers = d_layers - self.d_ff = d_ff + self.model_dimension = model_dimension + self.num_attention_heads = num_attention_heads + self.encoder_layers = encoder_layers + self.decoder_layers = decoder_layers + self.feedforward_dim = feedforward_dim self.dropout = dropout - self.attn = attn + self.attention_type = attention_type self.activation = activation self.distil = distil self.mix = mix super().__init__() - def _token_embedding(self, input_tensor, c_in, d_model): + def _token_embedding(self, input_tensor, c_in, model_dimension): """ Token embedding layer using 1D convolution with causal padding. @@ -111,7 +111,7 @@ def _token_embedding(self, input_tensor, c_in, d_model): Input tensor to be processed. c_in : int Number of input channels. - d_model : int + model_dimension : int Dimension of the model (number of output filters). Returns @@ -122,12 +122,15 @@ def _token_embedding(self, input_tensor, c_in, d_model): import tensorflow as tf x = tf.keras.layers.Conv1D( - filters=d_model, kernel_size=3, padding="causal", activation="linear" + filters=model_dimension, + kernel_size=3, + padding="causal", + activation="linear", )(input_tensor) x = tf.keras.layers.LeakyReLU()(x) return x - def _positional_embedding(self, input_tensor, d_model, max_len=5000): + def _positional_embedding(self, input_tensor, model_dimension, max_len=5000): """ Positional embedding layer that computes positional encodings. @@ -135,7 +138,7 @@ def _positional_embedding(self, input_tensor, d_model, max_len=5000): ---------- input_tensor : tf.Tensor Input tensor to get positional embeddings for. - d_model : int + model_dimension : int Dimension of the model. 
max_len : int, optional Maximum length of the sequence, by default 5000 @@ -151,10 +154,11 @@ def _positional_embedding(self, input_tensor, d_model, max_len=5000): import tensorflow as tf # Compute the positional encodings - pe = np.zeros((max_len, d_model), dtype=np.float32) + pe = np.zeros((max_len, model_dimension), dtype=np.float32) position = np.expand_dims(np.arange(0, max_len, dtype=np.float32), 1) div_term = np.exp( - np.arange(0, d_model, 2, dtype=np.float32) * -(math.log(10000.0) / d_model) + np.arange(0, model_dimension, 2, dtype=np.float32) + * -(math.log(10000.0) / model_dimension) ) pe[:, 0::2] = np.sin(position * div_term) @@ -170,7 +174,7 @@ def _data_embedding( self, input_tensor, c_in, - d_model, + model_dimension, dropout=0.1, max_len=5000, ): @@ -183,7 +187,7 @@ def _data_embedding( Input tensor to be processed. c_in : int Number of input channels. - d_model : int + model_dimension : int Dimension of the model (number of output filters). dropout : float, optional Dropout rate, by default 0.1 @@ -198,10 +202,10 @@ def _data_embedding( import tensorflow as tf # Get token embeddings - token_emb = self._token_embedding(input_tensor, c_in, d_model) + token_emb = self._token_embedding(input_tensor, c_in, model_dimension) # Get positional embeddings - pos_emb = self._positional_embedding(input_tensor, d_model, max_len) + pos_emb = self._positional_embedding(input_tensor, model_dimension, max_len) # Combine embeddings x = token_emb + pos_emb @@ -250,8 +254,8 @@ def _attention_out( input_tensor, attention_type, mask_flag, - d_model, - n_heads, + model_dimension, + num_attention_heads, factor=5, dropout=0.1, attn_mask=None, @@ -267,9 +271,9 @@ def _attention_out( Type of attention mechanism ('prob' or 'full'). mask_flag : bool Whether to use attention masking. - d_model : int + model_dimension : int Model dimension. - n_heads : int + num_attention_heads : int Number of attention heads. factor : int, optional Attention factor for ProbSparse attention, by default 5 @@ -294,18 +298,18 @@ def _attention_out( output = AttentionLayer( attention=prob_attention, - d_model=d_model, - n_heads=n_heads, - d_keys=d_model // n_heads, # 512 // 8 = 64 - d_values=d_model // n_heads, # 512 // 8 = 64 + d_model=model_dimension, + n_heads=num_attention_heads, + d_keys=model_dimension // num_attention_heads, # 512 // 8 = 64 + d_values=model_dimension // num_attention_heads, # 512 // 8 = 64 )(input_tensor, attn_mask=attn_mask) else: queries, keys, values = input_tensor output = tf.keras.layers.MultiHeadAttention( - num_heads=n_heads, # 8 - key_dim=d_model // n_heads, # 512 // 8 = 64 - value_dim=d_model // n_heads, # 512 // 8 = 64 + num_heads=num_attention_heads, # 8 + key_dim=model_dimension // num_attention_heads, # 512 // 8 = 64 + value_dim=model_dimension // num_attention_heads, # 512 // 8 = 64 dropout=dropout, use_bias=True, )( @@ -321,14 +325,14 @@ def _attention_out( def _encoder_layer( self, input_tensor, - d_model, - d_ff=None, + model_dimension, + feedforward_dim=None, dropout=0.1, activation="relu", attn_mask=None, attention_type="prob", mask_flag=True, - n_heads=8, + num_attention_heads=8, factor=5, ): """ @@ -339,9 +343,9 @@ def _encoder_layer( input_tensor : tf.Tensor Input tensor of shape [B, L, D] where B is batch size, L is sequence length, D is model dimension. - d_model : int + model_dimension : int Model dimension (must match input tensor's last dimension). 
- d_ff : int, optional + feedforward_dim : int, optional Feed-forward network dimension dropout : float, optional Dropout rate, by default 0.1 @@ -357,17 +361,17 @@ def _encoder_layer( """ import tensorflow as tf - # Set default d_ff if not provided - if d_ff is None: - d_ff = 4 * d_model + # Set default feedforward_dim if not provided + if feedforward_dim is None: + feedforward_dim = 4 * model_dimension # Self-attention using the _attention_out function with parameters attn_output = self._attention_out( input_tensor=[input_tensor, input_tensor, input_tensor], attention_type=attention_type, mask_flag=mask_flag, - d_model=d_model, - n_heads=n_heads, + model_dimension=model_dimension, + num_attention_heads=num_attention_heads, factor=factor, dropout=dropout, attn_mask=attn_mask, @@ -384,7 +388,7 @@ def _encoder_layer( # Feed-forward network # First 1D convolution (expansion) - y = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)(x) + y = tf.keras.layers.Conv1D(filters=feedforward_dim, kernel_size=1)(x) # Apply activation function if activation == "relu": @@ -396,7 +400,7 @@ def _encoder_layer( y = tf.keras.layers.Dropout(dropout)(y) # Second 1D convolution (compression back to d_model) - y = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)(y) + y = tf.keras.layers.Conv1D(filters=model_dimension, kernel_size=1)(y) # Apply dropout y = tf.keras.layers.Dropout(dropout)(y) @@ -409,15 +413,15 @@ def _encoder_layer( def _encoder( self, input_tensor, - e_layers, - d_model, - d_ff=None, + encoder_layers, + model_dimension, + feedforward_dim=None, dropout=0.1, activation="relu", attn_mask=None, attention_type="prob", mask_flag=True, - n_heads=8, + num_attention_heads=8, factor=5, use_conv_layers=False, c_in=None, @@ -430,11 +434,11 @@ def _encoder( ---------- input_tensor : tf.Tensor Input tensor of shape [B, L, D] - e_layers : int + encoder_layers : int Number of encoder layers to stack. - d_model : int + model_dimension : int Model dimension (must match input tensor's last dimension). 
- d_ff : int, optional + feedforward_dim : int, optional Feed-forward network dimension dropout : float, optional Dropout rate, by default 0.1 @@ -446,7 +450,7 @@ def _encoder( Type of attention mechanism ('prob' or 'full') mask_flag : bool, optional Whether to use attention masking, by default True - n_heads : int, optional + num_attention_heads : int, optional Number of attention heads, by default 8 factor : int, optional Attention factor for ProbSparse attention, by default 5 @@ -466,25 +470,25 @@ def _encoder( # Set default values if c_in is None: - c_in = d_model + c_in = model_dimension x = input_tensor # Apply encoder layers with optional convolutional layers if use_conv_layers: # Apply paired encoder and conv layers - for _ in range(e_layers - 1): + for _ in range(encoder_layers - 1): # Apply encoder layer x = self._encoder_layer( input_tensor=x, - d_model=d_model, - d_ff=d_ff, + model_dimension=model_dimension, + feedforward_dim=feedforward_dim, dropout=dropout, activation=activation, attn_mask=attn_mask, attention_type=attention_type, mask_flag=mask_flag, - n_heads=n_heads, + num_attention_heads=num_attention_heads, factor=factor, ) @@ -497,30 +501,30 @@ def _encoder( # Apply final encoder layer (without conv layer) x = self._encoder_layer( input_tensor=x, - d_model=d_model, - d_ff=d_ff, + model_dimension=model_dimension, + feedforward_dim=feedforward_dim, dropout=dropout, activation=activation, attn_mask=attn_mask, attention_type=attention_type, mask_flag=mask_flag, - n_heads=n_heads, + num_attention_heads=num_attention_heads, factor=factor, ) else: # Apply only encoder layers without convolutional layers - for _ in range(e_layers): + for _ in range(encoder_layers): x = self._encoder_layer( input_tensor=x, - d_model=d_model, - d_ff=d_ff, + model_dimension=model_dimension, + feedforward_dim=feedforward_dim, dropout=dropout, activation=activation, attn_mask=attn_mask, attention_type=attention_type, mask_flag=mask_flag, - n_heads=n_heads, + num_attention_heads=num_attention_heads, factor=factor, ) @@ -534,8 +538,8 @@ def _decoder_layer( self, input_tensor, cross_tensor, - d_model, - d_ff=None, + model_dimension, + feedforward_dim=None, dropout=0.1, activation="relu", x_mask=None, @@ -543,7 +547,7 @@ def _decoder_layer( self_attention_type="prob", cross_attention_type="prob", mask_flag=True, - n_heads=8, + num_attention_heads=8, factor=5, ): """ @@ -555,9 +559,9 @@ def _decoder_layer( Input tensor of shape [B, L, D] cross_tensor : tf.Tensor Cross-attention input tensor (encoder output) of shape [B, L_enc, D] - d_model : int + model_dimension : int Model dimension (must match input tensor's last dimension). 
- d_ff : int, optional + feedforward_dim : int, optional Feed-forward network dimension dropout : float, optional Dropout rate, by default 0.1 @@ -573,7 +577,7 @@ def _decoder_layer( Type of cross-attention mechanism ('prob' or 'full') mask_flag : bool, optional Whether to use attention masking, by default True - n_heads : int, optional + num_attention_heads : int, optional Number of attention heads, by default 8 factor : int, optional Attention factor for ProbSparse attention, by default 5 @@ -585,17 +589,17 @@ def _decoder_layer( """ import tensorflow as tf - # Set default d_ff if not provided - if d_ff is None: - d_ff = 4 * d_model + # Set default feedforward_dim if not provided + if feedforward_dim is None: + feedforward_dim = 4 * model_dimension # Self-attention block self_attn_output = self._attention_out( input_tensor=[input_tensor, input_tensor, input_tensor], attention_type=self_attention_type, mask_flag=mask_flag, - d_model=d_model, - n_heads=n_heads, + model_dimension=model_dimension, + num_attention_heads=num_attention_heads, factor=factor, dropout=dropout, attn_mask=x_mask, @@ -612,8 +616,8 @@ def _decoder_layer( input_tensor=[x, cross_tensor, cross_tensor], attention_type=cross_attention_type, mask_flag=mask_flag, - d_model=d_model, - n_heads=n_heads, + model_dimension=model_dimension, + num_attention_heads=num_attention_heads, factor=factor, dropout=dropout, attn_mask=cross_mask, @@ -630,7 +634,7 @@ def _decoder_layer( # Feed-forward network # First 1D convolution (expansion) - y = tf.keras.layers.Conv1D(filters=d_ff, kernel_size=1)(x) + y = tf.keras.layers.Conv1D(filters=feedforward_dim, kernel_size=1)(x) # Apply activation function if activation == "relu": @@ -642,7 +646,7 @@ def _decoder_layer( y = tf.keras.layers.Dropout(dropout)(y) # Second 1D convolution (compression back to d_model) - y = tf.keras.layers.Conv1D(filters=d_model, kernel_size=1)(y) + y = tf.keras.layers.Conv1D(filters=model_dimension, kernel_size=1)(y) # Apply dropout y = tf.keras.layers.Dropout(dropout)(y) @@ -656,9 +660,9 @@ def _decoder( self, input_tensor, cross_tensor, - d_layers, - d_model, - d_ff=None, + decoder_layers, + model_dimension, + feedforward_dim=None, dropout=0.1, activation="relu", x_mask=None, @@ -666,7 +670,7 @@ def _decoder( self_attention_type="prob", cross_attention_type="prob", mask_flag=True, - n_heads=8, + num_attention_heads=8, factor=5, use_norm=True, ): @@ -679,11 +683,11 @@ def _decoder( Decoder input tensor of shape [B, L_dec, D] cross_tensor : tf.Tensor Cross-attention input tensor (encoder output) of shape [B, L_enc, D] - d_layers : int + decoder_layers : int Number of decoder layers to stack. - d_model : int + model_dimension : int Model dimension (must match input tensor's last dimension). 
- d_ff : int, optional + feedforward_dim : int, optional Feed-forward network dimension dropout : float, optional Dropout rate, by default 0.1 @@ -699,7 +703,7 @@ def _decoder( Type of cross-attention mechanism ('prob' or 'full') mask_flag : bool, optional Whether to use attention masking, by default True - n_heads : int, optional + num_attention_heads : int, optional Number of attention heads, by default 8 factor : int, optional Attention factor for ProbSparse attention, by default 5 @@ -716,12 +720,12 @@ def _decoder( x = input_tensor # Apply multiple decoder layers - for _ in range(d_layers): + for _ in range(decoder_layers): x = self._decoder_layer( input_tensor=x, cross_tensor=cross_tensor, - d_model=d_model, - d_ff=d_ff, + model_dimension=model_dimension, + feedforward_dim=feedforward_dim, dropout=dropout, activation=activation, x_mask=x_mask, @@ -729,7 +733,7 @@ def _decoder( self_attention_type=self_attention_type, cross_attention_type=cross_attention_type, mask_flag=mask_flag, - n_heads=n_heads, + num_attention_heads=num_attention_heads, factor=factor, ) @@ -739,7 +743,9 @@ def _decoder( return x - def _preprocess_time_series(self, data, seq_len, label_len, pred_len): + def _preprocess_time_series( + self, data, encoder_input_len, decoder_input_len, prediction_horizon + ): """ Preprocess time series data of shape (None, n_timepoints, n_channels). @@ -747,20 +753,20 @@ def _preprocess_time_series(self, data, seq_len, label_len, pred_len): ---------- data : tf.Tensor Input tensor of shape (None, n_timepoints, n_channels) - seq_len : int + encoder_input_len : int Encoder input sequence length - label_len : int + decoder_input_len : int Known decoder input length - pred_len : int + prediction_horizon : int Prediction length Returns ------- tuple (x_enc, x_dec) where: - - x_enc: Encoder input tensor of shape (None, seq_len, n_channels) - - x_dec: Decoder input tensor of shape (None, label_len + pred_len - , n_channels) + - x_enc: Encoder input tensor of shape (None, encoder_input_len, n_channels) + - x_dec: Decoder input tensor of shape (None, + decoder_input_len + prediction_horizon, n_channels) """ import tensorflow as tf @@ -768,15 +774,15 @@ def _preprocess_time_series(self, data, seq_len, label_len, pred_len): batch_size, n_timepoints, n_channels = data.shape # Encoder input: first seq_len timepoints - x_enc = data[:, :seq_len, :] # (None, seq_len, n_channels) + x_enc = data[:, :encoder_input_len, :] # (None, encoder_input_len, n_channels) # Decoder input construction x_dec_known = data[ - :, seq_len - label_len : seq_len, : - ] # (None, label_len, n_channels) + :, encoder_input_len - decoder_input_len : encoder_input_len, : + ] # (None, decoder_input_len, n_channels) # Unknown part: zeros for prediction horizon - x_dec_pred = data[:, :pred_len, :] + x_dec_pred = data[:, :prediction_horizon, :] # Concatenate known and prediction parts x_dec = tf.keras.layers.Concatenate(axis=1)([x_dec_known, x_dec_pred]) @@ -797,34 +803,34 @@ def build_network(self, input_shape, **kwargs): encoder_input, decoder_input = self._preprocess_time_series( data=input_data, - seq_len=self.seq_len, - label_len=self.label_len, - pred_len=self.out_len, + encoder_input_len=self.encoder_input_len, + decoder_input_len=self.decoder_input_len, + prediction_horizon=self.prediction_horizon, ) # Encoder embedding enc_embedded = self._data_embedding( input_tensor=encoder_input, c_in=n_channels, - d_model=self.d_model, + model_dimension=self.model_dimension, dropout=self.dropout, - max_len=self.seq_len, + 
max_len=self.encoder_input_len, ) # Encoder processing enc_output = self._encoder( input_tensor=enc_embedded, - e_layers=self.e_layers, - d_model=self.d_model, - d_ff=self.d_ff, + encoder_layers=self.encoder_layers, + model_dimension=self.model_dimension, + feedforward_dim=self.feedforward_dim, dropout=self.dropout, activation=self.activation, - attention_type=self.attn, + attention_type=self.attention_type, mask_flag=False, - n_heads=self.n_heads, + num_attention_heads=self.num_attention_heads, factor=self.factor, use_conv_layers=self.distil, - c_in=self.d_model, + c_in=self.model_dimension, use_norm=True, ) @@ -832,24 +838,24 @@ def build_network(self, input_shape, **kwargs): dec_embedded = self._data_embedding( input_tensor=decoder_input, c_in=n_channels, - d_model=self.d_model, + model_dimension=self.model_dimension, dropout=self.dropout, - max_len=self.label_len + self.out_len, + max_len=self.decoder_input_len + self.prediction_horizon, ) # Decoder processing dec_output = self._decoder( input_tensor=dec_embedded, cross_tensor=enc_output, - d_layers=self.d_layers, - d_model=self.d_model, - d_ff=self.d_ff, + decoder_layers=self.decoder_layers, + model_dimension=self.model_dimension, + feedforward_dim=self.feedforward_dim, dropout=self.dropout, activation=self.activation, - self_attention_type=self.attn, + self_attention_type=self.attention_type, cross_attention_type="full", mask_flag=self.mix, - n_heads=self.n_heads, + num_attention_heads=self.num_attention_heads, factor=self.factor, use_norm=True, ) @@ -858,6 +864,6 @@ def build_network(self, input_shape, **kwargs): output = tf.keras.layers.Dense(n_channels, name="output_projection")(dec_output) # Extract only the prediction part (last out_len timesteps) - output = output[:, -self.out_len :, :] + output = output[:, -self.prediction_horizon :, :] return input_data, output diff --git a/aeon/networks/tests/test_informer.py b/aeon/networks/tests/test_informer.py index 9e472f77f8..8c59fa30d5 100644 --- a/aeon/networks/tests/test_informer.py +++ b/aeon/networks/tests/test_informer.py @@ -13,7 +13,9 @@ reason="Tensorflow soft dependency unavailable.", ) @pytest.mark.parametrize( - "seq_len,label_len,out_len,d_model,n_heads,e_layers,d_layers", + "encoder_input_len,decoder_input_len," + "prediction_horizon,model_dimension,num_attention_heads," + "encoder_layers,decoder_layers", [ (96, 48, 24, 512, 8, 3, 2), (48, 24, 12, 256, 4, 2, 1), @@ -22,30 +24,30 @@ ], ) def test_informer_network_init( - seq_len, - label_len, - out_len, - d_model, - n_heads, - e_layers, - d_layers, + encoder_input_len, + decoder_input_len, + prediction_horizon, + model_dimension, + num_attention_heads, + encoder_layers, + decoder_layers, ): """Test whether InformerNetwork initializes correctly for various parameters.""" informer = InformerNetwork( - seq_len=seq_len, - label_len=label_len, - out_len=out_len, - d_model=d_model, - n_heads=n_heads, - e_layers=e_layers, - d_layers=d_layers, + encoder_input_len=encoder_input_len, + decoder_input_len=decoder_input_len, + prediction_horizon=prediction_horizon, + model_dimension=model_dimension, + num_attention_heads=num_attention_heads, + encoder_layers=encoder_layers, + decoder_layers=decoder_layers, factor=random.choice([3, 5, 7]), dropout=random.choice([0.0, 0.1, 0.2]), - attn=random.choice(["prob", "full"]), + attention_type=random.choice(["prob", "full"]), activation=random.choice(["relu", "gelu"]), ) - inputs, outputs = informer.build_network((seq_len + label_len, 5)) + inputs, outputs = 
informer.build_network((encoder_input_len + decoder_input_len, 5)) assert inputs is not None assert outputs is not None @@ -55,20 +57,20 @@ def test_informer_network_init( reason="Tensorflow soft dependency unavailable.", ) @pytest.mark.parametrize( - "attn,activation", + "attention_type,activation", [("prob", "relu"), ("full", "gelu"), ("prob", "gelu"), ("full", "relu")], ) -def test_informer_network_attention_activation(attn, activation): +def test_informer_network_attention_activation(attention_type, activation): """Test InformerNetwork with different attention and activation.""" informer = InformerNetwork( - seq_len=96, - label_len=48, - out_len=24, - d_model=128, - n_heads=4, - e_layers=2, - d_layers=1, - attn=attn, + encoder_input_len=96, + decoder_input_len=48, + prediction_horizon=24, + model_dimension=128, + num_attention_heads=4, + encoder_layers=2, + decoder_layers=1, + attention_type=attention_type, activation=activation, ) @@ -88,13 +90,13 @@ def test_informer_network_attention_activation(attn, activation): def test_informer_network_distil_mix_factor(distil, mix, factor): """Test whether InformerNetwork works with different configurations.""" informer = InformerNetwork( - seq_len=48, - label_len=24, - out_len=12, - d_model=64, - n_heads=2, - e_layers=1, - d_layers=1, + encoder_input_len=48, + decoder_input_len=24, + prediction_horizon=12, + model_dimension=64, + num_attention_heads=2, + encoder_layers=1, + decoder_layers=1, distil=distil, mix=mix, factor=factor, @@ -118,14 +120,14 @@ def test_informer_network_default_parameters(): assert outputs is not None # Check default values - assert informer.seq_len == 96 - assert informer.label_len == 48 - assert informer.out_len == 24 - assert informer.d_model == 512 - assert informer.n_heads == 8 - assert informer.e_layers == 3 - assert informer.d_layers == 2 - assert informer.attn == "prob" + assert informer.encoder_input_len == 96 + assert informer.decoder_input_len == 48 + assert informer.prediction_horizon == 24 + assert informer.model_dimension == 512 + assert informer.num_attention_heads == 8 + assert informer.encoder_layers == 3 + assert informer.decoder_layers == 2 + assert informer.attention_type == "prob" assert informer.activation == "gelu" assert informer.distil assert informer.mix @@ -137,15 +139,14 @@ def test_informer_network_default_parameters(): ) def test_informer_network_parameter_validation(): """Test whether InformerNetwork handles edge case parameters correctly.""" - # Test minimum viable configuration informer = InformerNetwork( - seq_len=12, - label_len=6, - out_len=3, - d_model=32, - n_heads=1, - e_layers=1, - d_layers=1, + encoder_input_len=12, + decoder_input_len=6, + prediction_horizon=3, + model_dimension=32, + num_attention_heads=1, + encoder_layers=1, + decoder_layers=1, factor=1, dropout=0.0, ) @@ -163,13 +164,13 @@ def test_informer_network_different_channels(): """Test whether InformerNetwork works with different numbers of input channels.""" for n_channels in [1, 3, 5, 10]: informer = InformerNetwork( - seq_len=48, - label_len=24, - out_len=12, - d_model=64, - n_heads=2, - e_layers=1, - d_layers=1, + encoder_input_len=48, + decoder_input_len=24, + prediction_horizon=12, + model_dimension=64, + num_attention_heads=2, + encoder_layers=1, + decoder_layers=1, ) inputs, outputs = informer.build_network((72, n_channels))
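With the constructor arguments renamed in [PATCH 10/10], a minimal usage sketch mirroring the updated tests might look as follows; the dimensions are toy values and the snippet assumes TensorFlow plus an aeon install that includes this branch.

from aeon.networks import InformerNetwork
import tensorflow as tf

net = InformerNetwork(
    encoder_input_len=48,    # encoder reads the first 48 timepoints
    decoder_input_len=24,    # decoder is primed with the 24 steps before the forecast
    prediction_horizon=12,   # and predicts the next 12
    model_dimension=64,
    num_attention_heads=2,
    encoder_layers=1,
    decoder_layers=1,
)

# A single (n_timepoints, n_channels) shape is passed; the encoder and decoder
# windows are sliced internally by _preprocess_time_series.
inputs, outputs = net.build_network((72, 5))
print(outputs.shape)  # (None, 12, 5): prediction_horizon steps, n_channels features

# The returned tensors can be assembled into a trainable model in the usual way.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

Since build_network now returns a single input tensor, callers no longer construct separate encoder and decoder inputs: the encoder window is the first encoder_input_len steps, and the decoder window is the decoder_input_len steps preceding the forecast followed by a prediction_horizon-length placeholder segment.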