From 10c87180d05a55138d396af278986b134a3098f1 Mon Sep 17 00:00:00 2001
From: Yashwant Bezawada <yashwant_b@me.com>
Date: Fri, 7 Nov 2025 20:42:44 -0600
Subject: [PATCH 1/4] Fix Qwen3Moe load balancing loss calculation during
 inference

Fixes generation bug in Qwen3Moe models that calculate load_balancing_loss
during evaluation/generation. Load balancing loss should only be calculated
during training, not during inference.

Fixes #42100
---
 src/transformers/models/qwen3_moe/modeling_qwen3_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
index ff0855c223ee..d549d56e2c94 100644
--- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
@@ -668,7 +668,7 @@ def forward(
             loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
 
         aux_loss = None
-        if output_router_logits:
+        if output_router_logits and self.training:
             aux_loss = load_balancing_loss_func(
                 outputs.router_logits,
                 self.num_experts,

From 14dba9e4d944403a147759f58873fb657513d21e Mon Sep 17 00:00:00 2001
From: Yashwant Bezawada <yashwant_b@me.com>
Date: Sat, 8 Nov 2025 13:51:53 -0600
Subject: [PATCH 2/4] fix: Apply load balancing loss fix to modular qwen3_moe
 source

The previous commit only modified the auto-generated modeling_qwen3_moe.py file.
This commit applies the same fix to the modular source file (modular_qwen3_moe.py)
which is the canonical source that generates modeling_qwen3_moe.py.

Changes:
- Add 'and self.training' check before calculating load_balancing_loss
- Ensures consistency between modular source and generated file
- Prevents CI failures from modular file mismatch
---
 src/transformers/models/qwen3_moe/modular_qwen3_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
index 87a4bbfa9625..453758f55dfc 100644
--- a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
@@ -180,7 +180,7 @@ def forward(
             loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
 
         aux_loss = None
-        if output_router_logits:
+        if output_router_logits and self.training:
             aux_loss = load_balancing_loss_func(
                 outputs.router_logits,
                 self.num_experts,

From ff3663371321d1115210589ccd6e0d1bcfa7878f Mon Sep 17 00:00:00 2001
From: Yashwant Bezawada <yashwant_b@me.com>
Date: Sat, 8 Nov 2025 14:00:42 -0600
Subject: [PATCH 3/4] docs: Add CLAUDE.md with guidelines for AI assistants

Add comprehensive documentation for AI assistants working on transformers:

Critical Guidelines Added:
1. Auto-Generated Files Detection
   - Mandatory pre-edit checklist
   - How to identify modular source files
   - Warning signs and examples
   - Proper workflow for modular architecture

2. PR Branch Management
   - When and how to keep branches up-to-date
   - Automated update workflow scripts
   - Conflict resolution procedures
   - Red flags indicating stale branches
   - Best practice timing table

3. Common Mistakes Prevention
   - Editing generated files instead of source files
   - Letting PR branches fall behind base branch
   - Not reading file headers before editing
   - Rushing implementation without understanding architecture

This documentation helps prevent:
- CI failures from modular file mismatches
- Merge conflicts from outdated branches
- Review delays from improper workflows
- Technical debt from architectural misunderstandings
---
 CLAUDE.md | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 251 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 000000000000..3feac5f7426d
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,251 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code when working with code in the transformers repository.
+
+## ⚠️ CRITICAL: Auto-Generated Files
+
+**BEFORE editing ANY file in this repository, ALWAYS check if it's auto-generated.**
+
+### Pre-Edit Checklist (MANDATORY):
+
+1. **Read the file header** (first 50 lines) for warnings like:
+   - "This file was automatically generated"
+   - "Do NOT edit this file manually"
+   - "Generated from [source_file]"
+
+2. **Check for modular source files**:
+   ```bash
+   # If editing a modeling file, check if a modular version exists
+   ls src/transformers/models/[model_name]/modular_*.py
+   ```
+
+3. **If file is auto-generated**:
+   - ❌ DO NOT edit the generated file
+   - ✅ INSTEAD, edit the source file mentioned in the header
+   - ✅ Run the generation script if needed (usually mentioned in header or Makefile)
+
+### Common Auto-Generated Files:
+
+- **Modeling files**: Many `modeling_*.py` files are generated from `modular_*.py` files
+  - Example: `modeling_qwen3_moe.py` is generated from `modular_qwen3_moe.py`
+  - Always edit the `modular_*.py` file, not the generated one
+  - CI will fail if modular and generated files don't match
+
+- **Generation script**: `utils/check_modular_conversion.py --fix_and_overwrite`
+  - Regenerates all modeling files from their modular sources
+  - CI enforces that generated files match their modular sources
+
+### Example Warning Signs:
+
+```python
+# 🚨 THIS MEANS DO NOT EDIT THIS FILE 🚨
+# This file was automatically generated from src/
+# transformers/models/qwen3_moe/modular_qwen3_moe.py.
+#      Do NOT edit this file manually as any edits will be
+#      overwritten by the generation of
+#      the file from the modular. If any change should be
+# done, please apply the change to the
+#      modular qwen3_moe.py file directly. One
+# of our CI enforces this.
+```
+
+## Modular Architecture
+
+### What is it?
+
+The transformers library uses a "modular" system where:
+- Core model logic is written in `modular_*.py` files
+- These files are processed to generate the final `modeling_*.py` files
+- This allows for code reuse and consistency across similar models
+
+### How to work with modular files:
+
+1. **Find the modular source**:
+   ```bash
+   # If you need to edit modeling_foo.py, look for:
+   ls src/transformers/models/foo/modular_foo.py
+   ```
+
+2. **Edit the modular file**:
+   - Make your changes to `modular_foo.py`
+   - NOT to `modeling_foo.py`
+
+3. **Regenerate (if needed)**:
+   ```bash
+   # The generation usually happens automatically in CI
+   # But you can run it locally:
+   make check-modular-conversion
+   # or
+   python utils/check_modular_conversion.py --fix_and_overwrite
+   ```
+
+4. **Verify both files are updated**:
+   ```bash
+   git status  # Should show both modular and modeling files changed
+   ```
+
+## Contributing Workflow
+
+### Before Making Changes:
+
+1. Read `CONTRIBUTING.md` thoroughly
+2. Check if files are auto-generated (see above)
+3. Look for existing patterns in similar models
+4. Check CI requirements in `.github/workflows/`
+
+### Before Committing:
+
+1. ✅ Verify you edited the correct files (source, not generated)
+2. ✅ Run relevant tests
+3. ✅ Check that modular files and generated files both changed
+4. ✅ Read your diff carefully
+
+### Common Mistakes to Avoid:
+
+- ❌ Editing auto-generated files instead of source files
+- ❌ Not reading file headers before editing
+- ❌ Assuming file structure without checking documentation
+- ❌ Rushing to implement without understanding the architecture
+- ❌ Letting PR branches fall behind the base branch
+
+## ⚠️ CRITICAL: Keep PR Branches Up-to-Date
+
+**ALWAYS keep your PR branch current with the base branch to avoid conflicts and ensure CI passes.**
+
+### Why This Matters:
+
+1. **CI may fail** on outdated branches even if your code is correct
+2. **Merge conflicts** become harder to resolve the longer you wait
+3. **Review delays** - maintainers may wait for you to update before reviewing
+4. **Your changes may conflict** with recent updates to the codebase
+
+### When to Update Your Branch:
+
+✅ **ALWAYS update in these situations:**
+- When you see "This branch is out-of-date with the base branch" on GitHub
+- Before pushing new commits to an existing PR
+- After receiving review feedback that requires code changes
+- Daily for long-running PRs (if base branch is active)
+- Before requesting re-review from maintainers
+
+### How to Update Your Branch:
+
+```bash
+# Method 1: Merge upstream changes (RECOMMENDED for most cases)
+git fetch upstream main
+git merge upstream/main --no-edit
+git push origin <your-branch-name>
+
+# Method 2: Rebase (use if you need clean history, but be careful)
+git fetch upstream main
+git rebase upstream/main
+git push origin <your-branch-name> --force-with-lease
+
+# Quick check if your branch is behind:
+git fetch upstream main
+git log HEAD..upstream/main --oneline  # Shows commits you're missing
+```
+
+### Automated Branch Update Workflow:
+
+**Before EVERY push to a PR branch, run this checklist:**
+
+```bash
+# 1. Check if upstream has new commits
+git fetch upstream main
+
+# 2. See how far behind you are
+git log HEAD..upstream/main --oneline
+
+# 3. If there are new commits, merge them
+if [[ $(git log HEAD..upstream/main --oneline | wc -l) -gt 0 ]]; then
+    echo "⚠️  Branch is behind, updating..."
+    git merge upstream/main --no-edit
+fi
+
+# 4. Now push your changes
+git push origin <your-branch-name>
+```
+
+### Handling "Branch is out-of-date" Warnings:
+
+When GitHub shows: **"This branch is out-of-date with the base branch. It's N commits behind"**
+
+**Immediate action required:**
+1. Don't ignore it - update immediately
+2. Fetch and merge latest changes from base branch
+3. Resolve any conflicts if they appear
+4. Push the merge commit
+5. Verify CI passes on the updated branch
+
+### Merge Conflicts During Update:
+
+If you get conflicts when updating:
+
+```bash
+# 1. Conflicts will be marked in affected files
+git status  # Shows files with conflicts
+
+# 2. Open each conflicted file and resolve manually
+#    Look for <<<<<<< HEAD and >>>>>>> sections
+
+# 3. After resolving conflicts in all files:
+git add <conflicted-files>
+git commit  # Complete the merge
+
+# 4. Push the resolved merge
+git push origin <your-branch-name>
+```
+
+### Prevention Strategy:
+
+**Set up these habits to avoid falling behind:**
+
+1. **Daily check** for active PRs:
+   ```bash
+   gh pr status  # Shows status of your PRs
+   ```
+
+2. **Before starting work** on an existing PR:
+   ```bash
+   git fetch upstream main && git merge upstream/main --no-edit
+   ```
+
+3. **Before requesting review**:
+   ```bash
+   # Ensure branch is current
+   git fetch upstream main
+   git merge upstream/main --no-edit
+   git push origin <branch-name>
+   ```
+
+4. **Set up notifications**:
+   - Enable GitHub notifications for your PRs
+   - Watch for CI failures that might indicate branch is stale
+
+### Red Flags That Indicate Branch Needs Update:
+
+🚨 **Update immediately if you see:**
+- "This branch is out-of-date with the base branch"
+- CI failures on checks that passed before
+- Merge conflict warnings from GitHub
+- "Changes requested" review with note about conflicts
+- Your PR shows as "N commits behind" (any number > 0)
+
+### Best Practice Timing:
+
+| Situation | When to Update |
+|-----------|---------------|
+| Fresh PR just created | ✅ Already current, no action needed |
+| PR open for 1-2 days | ✅ Check daily, update if behind |
+| PR open for 1+ week | ⚠️ Update immediately, likely very behind |
+| After receiving review | ✅ Update before addressing comments |
+| Before pushing new commits | ✅ Update first, then push changes |
+| CI failing on old code | ⚠️ Update immediately, may fix issues |
+
+## Additional Resources
+
+- Main Contributing Guide: `CONTRIBUTING.md`
+- Modular Transformers Docs: `docs/source/en/modular_transformers.md`
+- Model Converter: `utils/modular_model_converter.py`
+- Conversion Checker: `utils/check_modular_conversion.py`

From d5779a23104950f07256dc32b020b098a297bd3b Mon Sep 17 00:00:00 2001
From: Yashwant Bezawada <yashwant_b@me.com>
Date: Sat, 8 Nov 2025 14:37:37 -0600
Subject: [PATCH 4/4] fix: Correct aux_loss calculation for Qwen3MoE

The previous fix was too restrictive - it prevented aux_loss from being
calculated during eval mode, which broke the test_load_balancing_loss test.

Correct behavior:
- Calculate aux_loss whenever output_router_logits=True (for monitoring)
- Only add aux_loss to the total loss during training (labels + self.training)

This matches the Mixtral implementation pattern and fixes the CircleCI test failure.
---
 src/transformers/models/qwen3_moe/modeling_qwen3_moe.py | 4 ++--
 src/transformers/models/qwen3_moe/modular_qwen3_moe.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
index d549d56e2c94..da962cf58b86 100644
--- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
@@ -668,14 +668,14 @@ def forward(
             loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
 
         aux_loss = None
-        if output_router_logits and self.training:
+        if output_router_logits:
             aux_loss = load_balancing_loss_func(
                 outputs.router_logits,
                 self.num_experts,
                 self.num_experts_per_tok,
                 attention_mask,
             )
-            if labels is not None:
+            if labels is not None and self.training:
                 loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
 
         return MoeCausalLMOutputWithPast(
diff --git a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
index 453758f55dfc..7aa70482944d 100644
--- a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
@@ -180,14 +180,14 @@ def forward(
             loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
 
         aux_loss = None
-        if output_router_logits and self.training:
+        if output_router_logits:
             aux_loss = load_balancing_loss_func(
                 outputs.router_logits,
                 self.num_experts,
                 self.num_experts_per_tok,
                 attention_mask,
             )
-            if labels is not None:
+            if labels is not None and self.training:
                 loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
 
         return MoeCausalLMOutputWithPast(