From 10c87180d05a55138d396af278986b134a3098f1 Mon Sep 17 00:00:00 2001 From: Yashwant Bezawada Date: Fri, 7 Nov 2025 20:42:44 -0600 Subject: [PATCH 1/4] Fix Qwen3Moe load balancing loss calculation during inference Fixes generation bug in Qwen3Moe models that calculate load_balancing_loss during evaluation/generation. Load balancing loss should only be calculated during training, not during inference. Fixes #42100 --- src/transformers/models/qwen3_moe/modeling_qwen3_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py index ff0855c223ee..d549d56e2c94 100644 --- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -668,7 +668,7 @@ def forward( loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) aux_loss = None - if output_router_logits: + if output_router_logits and self.training: aux_loss = load_balancing_loss_func( outputs.router_logits, self.num_experts, From 14dba9e4d944403a147759f58873fb657513d21e Mon Sep 17 00:00:00 2001 From: Yashwant Bezawada Date: Sat, 8 Nov 2025 13:51:53 -0600 Subject: [PATCH 2/4] fix: Apply load balancing loss fix to modular qwen3_moe source The previous commit only modified the auto-generated modeling_qwen3_moe.py file. This commit applies the same fix to the modular source file (modular_qwen3_moe.py) which is the canonical source that generates modeling_qwen3_moe.py. Changes: - Add 'and self.training' check before calculating load_balancing_loss - Ensures consistency between modular source and generated file - Prevents CI failures from modular file mismatch --- src/transformers/models/qwen3_moe/modular_qwen3_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py index 87a4bbfa9625..453758f55dfc 100644 --- a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py @@ -180,7 +180,7 @@ def forward( loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) aux_loss = None - if output_router_logits: + if output_router_logits and self.training: aux_loss = load_balancing_loss_func( outputs.router_logits, self.num_experts, From ff3663371321d1115210589ccd6e0d1bcfa7878f Mon Sep 17 00:00:00 2001 From: Yashwant Bezawada Date: Sat, 8 Nov 2025 14:00:42 -0600 Subject: [PATCH 3/4] docs: Add CLAUDE.md with guidelines for AI assistants Add comprehensive documentation for AI assistants working on transformers: Critical Guidelines Added: 1. Auto-Generated Files Detection - Mandatory pre-edit checklist - How to identify modular source files - Warning signs and examples - Proper workflow for modular architecture 2. PR Branch Management - When and how to keep branches up-to-date - Automated update workflow scripts - Conflict resolution procedures - Red flags indicating stale branches - Best practice timing table 3. Common Mistakes Prevention - Editing generated files instead of source files - Letting PR branches fall behind base branch - Not reading file headers before editing - Rushing implementation without understanding architecture This documentation helps prevent: - CI failures from modular file mismatches - Merge conflicts from outdated branches - Review delays from improper workflows - Technical debt from architectural misunderstandings --- CLAUDE.md | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000000..3feac5f7426d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,251 @@ +# CLAUDE.md + +This file provides guidance to Claude Code when working with code in the transformers repository. + +## ⚠️ CRITICAL: Auto-Generated Files + +**BEFORE editing ANY file in this repository, ALWAYS check if it's auto-generated.** + +### Pre-Edit Checklist (MANDATORY): + +1. **Read the file header** (first 50 lines) for warnings like: + - "This file was automatically generated" + - "Do NOT edit this file manually" + - "Generated from [source_file]" + +2. **Check for modular source files**: + ```bash + # If editing a modeling file, check if a modular version exists + ls src/transformers/models/[model_name]/modular_*.py + ``` + +3. **If file is auto-generated**: + - ❌ DO NOT edit the generated file + - ✅ INSTEAD, edit the source file mentioned in the header + - ✅ Run the generation script if needed (usually mentioned in header or Makefile) + +### Common Auto-Generated Files: + +- **Modeling files**: Many `modeling_*.py` files are generated from `modular_*.py` files + - Example: `modeling_qwen3_moe.py` is generated from `modular_qwen3_moe.py` + - Always edit the `modular_*.py` file, not the generated one + - CI will fail if modular and generated files don't match + +- **Generation script**: `utils/check_modular_conversion.py --fix_and_overwrite` + - Regenerates all modeling files from their modular sources + - CI enforces that generated files match their modular sources + +### Example Warning Signs: + +```python +# 🚨 THIS MEANS DO NOT EDIT THIS FILE 🚨 +# This file was automatically generated from src/ +# transformers/models/qwen3_moe/modular_qwen3_moe.py. +# Do NOT edit this file manually as any edits will be +# overwritten by the generation of +# the file from the modular. If any change should be +# done, please apply the change to the +# modular qwen3_moe.py file directly. One +# of our CI enforces this. +``` + +## Modular Architecture + +### What is it? + +The transformers library uses a "modular" system where: +- Core model logic is written in `modular_*.py` files +- These files are processed to generate the final `modeling_*.py` files +- This allows for code reuse and consistency across similar models + +### How to work with modular files: + +1. **Find the modular source**: + ```bash + # If you need to edit modeling_foo.py, look for: + ls src/transformers/models/foo/modular_foo.py + ``` + +2. **Edit the modular file**: + - Make your changes to `modular_foo.py` + - NOT to `modeling_foo.py` + +3. **Regenerate (if needed)**: + ```bash + # The generation usually happens automatically in CI + # But you can run it locally: + make check-modular-conversion + # or + python utils/check_modular_conversion.py --fix_and_overwrite + ``` + +4. **Verify both files are updated**: + ```bash + git status # Should show both modular and modeling files changed + ``` + +## Contributing Workflow + +### Before Making Changes: + +1. Read `CONTRIBUTING.md` thoroughly +2. Check if files are auto-generated (see above) +3. Look for existing patterns in similar models +4. Check CI requirements in `.github/workflows/` + +### Before Committing: + +1. ✅ Verify you edited the correct files (source, not generated) +2. ✅ Run relevant tests +3. ✅ Check that modular files and generated files both changed +4. ✅ Read your diff carefully + +### Common Mistakes to Avoid: + +- ❌ Editing auto-generated files instead of source files +- ❌ Not reading file headers before editing +- ❌ Assuming file structure without checking documentation +- ❌ Rushing to implement without understanding the architecture +- ❌ Letting PR branches fall behind the base branch + +## ⚠️ CRITICAL: Keep PR Branches Up-to-Date + +**ALWAYS keep your PR branch current with the base branch to avoid conflicts and ensure CI passes.** + +### Why This Matters: + +1. **CI may fail** on outdated branches even if your code is correct +2. **Merge conflicts** become harder to resolve the longer you wait +3. **Review delays** - maintainers may wait for you to update before reviewing +4. **Your changes may conflict** with recent updates to the codebase + +### When to Update Your Branch: + +✅ **ALWAYS update in these situations:** +- When you see "This branch is out-of-date with the base branch" on GitHub +- Before pushing new commits to an existing PR +- After receiving review feedback that requires code changes +- Daily for long-running PRs (if base branch is active) +- Before requesting re-review from maintainers + +### How to Update Your Branch: + +```bash +# Method 1: Merge upstream changes (RECOMMENDED for most cases) +git fetch upstream main +git merge upstream/main --no-edit +git push origin + +# Method 2: Rebase (use if you need clean history, but be careful) +git fetch upstream main +git rebase upstream/main +git push origin --force-with-lease + +# Quick check if your branch is behind: +git fetch upstream main +git log HEAD..upstream/main --oneline # Shows commits you're missing +``` + +### Automated Branch Update Workflow: + +**Before EVERY push to a PR branch, run this checklist:** + +```bash +# 1. Check if upstream has new commits +git fetch upstream main + +# 2. See how far behind you are +git log HEAD..upstream/main --oneline + +# 3. If there are new commits, merge them +if [[ $(git log HEAD..upstream/main --oneline | wc -l) -gt 0 ]]; then + echo "⚠️ Branch is behind, updating..." + git merge upstream/main --no-edit +fi + +# 4. Now push your changes +git push origin +``` + +### Handling "Branch is out-of-date" Warnings: + +When GitHub shows: **"This branch is out-of-date with the base branch. It's N commits behind"** + +**Immediate action required:** +1. Don't ignore it - update immediately +2. Fetch and merge latest changes from base branch +3. Resolve any conflicts if they appear +4. Push the merge commit +5. Verify CI passes on the updated branch + +### Merge Conflicts During Update: + +If you get conflicts when updating: + +```bash +# 1. Conflicts will be marked in affected files +git status # Shows files with conflicts + +# 2. Open each conflicted file and resolve manually +# Look for <<<<<<< HEAD and >>>>>>> sections + +# 3. After resolving conflicts in all files: +git add +git commit # Complete the merge + +# 4. Push the resolved merge +git push origin +``` + +### Prevention Strategy: + +**Set up these habits to avoid falling behind:** + +1. **Daily check** for active PRs: + ```bash + gh pr status # Shows status of your PRs + ``` + +2. **Before starting work** on an existing PR: + ```bash + git fetch upstream main && git merge upstream/main --no-edit + ``` + +3. **Before requesting review**: + ```bash + # Ensure branch is current + git fetch upstream main + git merge upstream/main --no-edit + git push origin + ``` + +4. **Set up notifications**: + - Enable GitHub notifications for your PRs + - Watch for CI failures that might indicate branch is stale + +### Red Flags That Indicate Branch Needs Update: + +🚨 **Update immediately if you see:** +- "This branch is out-of-date with the base branch" +- CI failures on checks that passed before +- Merge conflict warnings from GitHub +- "Changes requested" review with note about conflicts +- Your PR shows as "N commits behind" (any number > 0) + +### Best Practice Timing: + +| Situation | When to Update | +|-----------|---------------| +| Fresh PR just created | ✅ Already current, no action needed | +| PR open for 1-2 days | ✅ Check daily, update if behind | +| PR open for 1+ week | ⚠️ Update immediately, likely very behind | +| After receiving review | ✅ Update before addressing comments | +| Before pushing new commits | ✅ Update first, then push changes | +| CI failing on old code | ⚠️ Update immediately, may fix issues | + +## Additional Resources + +- Main Contributing Guide: `CONTRIBUTING.md` +- Modular Transformers Docs: `docs/source/en/modular_transformers.md` +- Model Converter: `utils/modular_model_converter.py` +- Conversion Checker: `utils/check_modular_conversion.py` From d5779a23104950f07256dc32b020b098a297bd3b Mon Sep 17 00:00:00 2001 From: Yashwant Bezawada Date: Sat, 8 Nov 2025 14:37:37 -0600 Subject: [PATCH 4/4] fix: Correct aux_loss calculation for Qwen3MoE The previous fix was too restrictive - it prevented aux_loss from being calculated during eval mode, which broke the test_load_balancing_loss test. Correct behavior: - Calculate aux_loss whenever output_router_logits=True (for monitoring) - Only add aux_loss to the total loss during training (labels + self.training) This matches the Mixtral implementation pattern and fixes the CircleCI test failure. --- src/transformers/models/qwen3_moe/modeling_qwen3_moe.py | 4 ++-- src/transformers/models/qwen3_moe/modular_qwen3_moe.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py index d549d56e2c94..da962cf58b86 100644 --- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -668,14 +668,14 @@ def forward( loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) aux_loss = None - if output_router_logits and self.training: + if output_router_logits: aux_loss = load_balancing_loss_func( outputs.router_logits, self.num_experts, self.num_experts_per_tok, attention_mask, ) - if labels is not None: + if labels is not None and self.training: loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device return MoeCausalLMOutputWithPast( diff --git a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py index 453758f55dfc..7aa70482944d 100644 --- a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py @@ -180,14 +180,14 @@ def forward( loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) aux_loss = None - if output_router_logits and self.training: + if output_router_logits: aux_loss = load_balancing_loss_func( outputs.router_logits, self.num_experts, self.num_experts_per_tok, attention_mask, ) - if labels is not None: + if labels is not None and self.training: loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device return MoeCausalLMOutputWithPast(