
Commit 9d08112

Fix: Remove deprecated .path access in Muon optimizer for TF 2.16+ compatibility
1 parent 80f4d4e commit 9d08112
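In short, the change keys the optimizer's per-variable state on variable.name instead of the Keras-only variable.path, which a plain TF ResourceVariable does not define. A minimal standalone sketch of the failure mode this avoids (illustrative only; the variable and dict names are made up):

import tensorflow as tf

# A plain ResourceVariable (TF 2.16+) exposes .name but not .path,
# so optimizer state keyed on .path raises AttributeError.
var = tf.Variable([1.0, 2.0], name="dense_kernel")

momentums = {}
momentums[var.name] = tf.zeros_like(var)   # works: .name is always present
# momentums[var.path]                      # AttributeError on a tf.Variable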

2 files changed: 70 additions, 44 deletions


keras/src/optimizers/muon.py

Lines changed: 21 additions & 9 deletions
@@ -132,13 +132,25 @@ def _should_use_adamw(self, variable):
         # To use it with 4D convolutional filters,
         # it works well to just flatten their last 3 dimensions.
         # any {0,1}-D parameters should all be optimized by adam
-        if not 1 < len(variable.shape) < 4:
+        if not 1 < len(variable.shape) < 5:
             return True
-        if self.exclude_embeddings and "embedding" in variable.path.lower():
+
+        # Get variable identifier (use .name in Keras 3+)
+        var_identifier = variable.name
+
+        # Check if embedding layer should be excluded
+        if self.exclude_embeddings and "embedding" in var_identifier.lower():
             return True
+
+        # Check if variable matches any excluded layer patterns
         for keyword in self.exclude_layers:
-            if re.search(keyword, variable.path):
-                return True
+            try:
+                if re.search(keyword, var_identifier):
+                    return True
+            except re.error:
+                # Skip invalid regex patterns in exclude_layers
+                continue
+
         return False
 
     def build(self, var_list):
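For reference, a rough usage sketch of how the patched _should_use_adamw routes variables (illustrative only; the Muon import path is assumed to mirror the test module shown further below):

import numpy as np

from keras.src import backend
from keras.src.optimizers.muon import Muon  # assumed import path

optimizer = Muon(exclude_layers=["dense"], exclude_embeddings=True)

kernel = backend.Variable([[1.0, 2.0], [3.0, 4.0]], name="dense_kernel_0")
bias = backend.Variable([1.0, 2.0], name="bias")
conv5d = backend.Variable(np.zeros((2, 2, 2, 2, 2)), name="conv5d")

optimizer._should_use_adamw(kernel)   # True: name matches an exclude_layers pattern
optimizer._should_use_adamw(bias)     # True: 0/1-D parameters always use AdamW
optimizer._should_use_adamw(conv5d)   # True: rank 5 falls outside the 2-4D Muon range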
@@ -161,13 +173,13 @@ def build(self, var_list):
 
         for var in var_list:
             if not self._overwrite_variable_with_gradient(var):
-                self.adam_momentums[var.path] = (
+                self.adam_momentums[var.name] = (
                     self.add_variable_from_reference(
                         reference_variable=var, name="momentum"
                     )
                 )
                 if self._should_use_adamw(var):
-                    self.adam_velocities[var.path] = (
+                    self.adam_velocities[var.name] = (
                         self.add_variable_from_reference(
                             reference_variable=var, name="velocity"
                         )
@@ -183,7 +195,7 @@ def update_step(self, gradient, variable, learning_rate):
             self._muon_update_step(gradient, variable, learning_rate)
 
     def _muon_update_step(self, gradient, variable, lr):
-        m = self.adam_momentums[variable.path]
+        m = self.adam_momentums[variable.name]
         self.assign_add(m, ops.add(gradient, m * (self.momentum - 1)))
         shape = variable.shape
         if self.nesterov:
@@ -210,8 +222,8 @@ def _adamw_update_step(self, gradient, variable, learning_rate):
             ops.cast(self.adam_beta_2, variable.dtype), local_step
         )
 
-        m = self.adam_momentums[variable.path]
-        v = self.adam_velocities[variable.path]
+        m = self.adam_momentums[variable.name]
+        v = self.adam_velocities[variable.name]
 
         alpha = lr * ops.sqrt(1 - adam_beta_2_power) / (1 - adam_beta_1_power)
 
keras/src/optimizers/muon_test.py

Lines changed: 49 additions & 35 deletions
@@ -1,4 +1,5 @@
 import numpy as np
+import tensorflow as tf
 
 from keras.src import backend
 from keras.src import ops
@@ -10,58 +11,59 @@
 
 class MuonTest(testing.TestCase):
     def test_config(self):
-        optimizer = Muon(
-            learning_rate=0.5,
-            epsilon=1e-5,
-        )
+        optimizer = Muon(learning_rate=0.5, epsilon=1e-5)
         self.run_class_serialization_test(optimizer)
 
     def test_Newton_Schulz(self):
         optimizer = Muon()
         tensor_input = ops.array([[0.2499, 0.9105], [0.2655, 0.8824]])
-        except_output = ops.array([[-0.4422, 0.6457], [0.7285, 0.2968]])
+        expected_output = ops.array([[-0.4422, 0.6457], [0.7285, 0.2968]])
         output = optimizer.zeropower_via_newtonschulz5(tensor_input, 5)
-        self.assertAllClose(output, except_output, rtol=1e-3, atol=1e-3)
+        self.assertAllClose(output, expected_output, rtol=1e-3, atol=1e-3)
 
     def test_adamw_single_step(self):
         optimizer = Muon()
         grads = ops.array([1.0, 6.0, 7.0, 2.0])
-        vars = backend.Variable([1.0, 2.0, 3.0, 4.0], name="test_vars")
-        optimizer.build([vars])
-        optimizer._adamw_update_step(grads, vars, 0.5)
-        self.assertAllClose(vars, [0.5, 1.5, 2.5, 3.5], rtol=1e-4, atol=1e-4)
+        var = backend.Variable([1.0, 2.0, 3.0, 4.0], name="test_vars")
+        optimizer.build([var])
+        optimizer._adamw_update_step(grads, var, 0.5)
+        self.assertAllClose(var, [0.5, 1.5, 2.5, 3.5], rtol=1e-4, atol=1e-4)
 
-    def test_should_use_adamw(self):
-        vars = backend.Variable([[1.0, 2.0], [3.0, 4.0]])
-        optimizer = Muon(exclude_layers=["var"])
-        self.assertAllClose(
-            True,
-            optimizer._should_use_adamw(vars),
+    def test_should_use_adamw_excluded_layer(self):
+        """Ensure exclude_layers keyword works and no .path is accessed."""
+        optimizer = Muon(exclude_layers=["dense"])
+        dummy_var = backend.Variable(
+            [[1.0, 2.0], [3.0, 4.0]], name="dense_kernel_0"
         )
-        embeding = Embedding(2, 2)
-        embeding.build()
-        self.assertAllClose(
-            True,
-            optimizer._should_use_adamw(embeding.weights[0]),
-        )
-        vars = backend.Variable([[1.0, 2.0], [3.0, 4.0]])
+        result = optimizer._should_use_adamw(dummy_var)
+        self.assertTrue(result)
+
+    def test_should_use_adamw_embedding(self):
+        """Embedding layer should use AdamW when exclude_embeddings=True."""
+        embedding = Embedding(2, 2)
+        embedding.build()
+        optimizer = Muon(exclude_embeddings=True)
+        result = optimizer._should_use_adamw(embedding.weights[0])
+        self.assertTrue(result)
+
+    def test_should_use_adamw_dimension_rule(self):
+        """Variables with dimensions not between 2–4 use AdamW."""
+        v_1d = backend.Variable([1.0, 2.0], name="v1d")
+        v_5d = backend.Variable(np.zeros((2, 2, 2, 2, 2)), name="v5d")
         optimizer = Muon()
-        self.assertAllClose(
-            False,
-            optimizer._should_use_adamw(vars),
-        )
+        self.assertTrue(optimizer._should_use_adamw(v_1d))
+        self.assertTrue(optimizer._should_use_adamw(v_5d))
+
+    def test_should_use_adamw_dense_layer(self):
+        """2D dense layer weights should use Muon (False)."""
         dense = Dense(2)
         dense.build([None, 2])
-        self.assertAllClose(
-            False,
-            optimizer._should_use_adamw(dense.weights[0]),
-        )
+        optimizer = Muon()
+        result = optimizer._should_use_adamw(dense.weights[0])
+        self.assertFalse(result)
 
     def test_muon_single_step(self):
-        optimizer = Muon(
-            learning_rate=0.5,
-            weight_decay=0,
-        )
+        optimizer = Muon(learning_rate=0.5, weight_decay=0)
         grads = ops.array([[1.0, 6.0], [7.0, 2.0]])
         vars = backend.Variable([[1.0, 2.0], [3.0, 4.0]])
         optimizer.build([vars])
@@ -81,3 +83,15 @@ def test_clip_value(self):
         grad = [np.array([100.0, 100.0])]
         clipped_grad = optimizer._clip_gradients(grad)
         self.assertAllClose(clipped_grad[0], [1.0, 1.0])
+
+    def test_no_path_attribute_error(self):
+        """Ensure compatibility with TF 2.16+ where
+        ResourceVariable has no .path."""
+        optimizer = Muon()
+        var = tf.Variable([1.0, 2.0], name="test_var")
+        # Force-run method that caused AttributeError in issue #21793
+        try:
+            result = optimizer._should_use_adamw(var)
+            self.assertIn(result, [True, False])
+        except AttributeError as e:
+            self.fail(f"Unexpected AttributeError: {e}")
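Assuming the suite is run with pytest, the new regression test can be exercised on its own, for example:

import pytest

# Run only the TF 2.16+ regression test added in this commit.
pytest.main(
    ["-q", "keras/src/optimizers/muon_test.py::MuonTest::test_no_path_attribute_error"]
)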
