From 5c9cb68695d7ceb4099338d7729e23d1aed6b292 Mon Sep 17 00:00:00 2001 From: casinca <47400729+casinca@users.noreply.github.com> Date: Sat, 8 Nov 2025 20:28:29 +0100 Subject: [PATCH] fix(GatedDeltaNet): Init param A from log of a uniform distrib --- ch04/08_deltanet/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ch04/08_deltanet/README.md b/ch04/08_deltanet/README.md index ca50fe5a9..257e15ffb 100644 --- a/ch04/08_deltanet/README.md +++ b/ch04/08_deltanet/README.md @@ -166,7 +166,8 @@ class GatedDeltaNet(nn.Module): # A_log + W_alpha(x) + dt_bias self.W_alpha = nn.Linear(d_in, num_heads, bias=False) self.dt_bias = nn.Parameter(torch.ones(num_heads)) - self.A_log = nn.Parameter(torch.zeros(num_heads)) + A_init = torch.empty(num_heads).uniform_(1, 16) + self.A_log = nn.Parameter(torch.log(A_init)) # We could implement this as # W_alpha = nn.Linear(d_in, num_heads, bias=True) # but the bias is separate for interpretability and