"""
Debugging utilities for PyTorch Connectomics.

This module implements:
- NaN/Inf detection in activations
- NaN/Inf detection in parameters and gradients
- Forward hook management for intermediate layer inspection
- Debug statistics collection and reporting
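
Typical usage (a minimal sketch; `model` stands for any nn.Module under
inspection and `example_input` for a matching input tensor):

    dbg = DebugManager(model)
    dbg.enable_nan_hooks()       # attach forward hooks to the model's layers
    _ = model(example_input)     # stops in pdb at the first NaN-producing layer
    dbg.print_hook_summary()     # report which layers produced NaN/Inf
    dbg.disable_nan_hooks()      # detach the hooks when done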
"""

from __future__ import annotations
from typing import Dict, Any, Optional, Tuple

import torch
import torch.nn as nn

from ..utils.debug_hooks import NaNDetectionHookManager


class DebugManager:
    """
    Manager for debugging operations, including NaN detection.

    This class handles:
    - Forward hooks for NaN/Inf detection in layer outputs
    - Parameter and gradient inspection
    - Statistics collection and reporting
    - Interactive debugging support (pdb integration)

    Args:
        model: PyTorch model to debug (nn.Module)
    """

    def __init__(self, model: nn.Module):
        self.model = model
        self._hook_manager: Optional[NaNDetectionHookManager] = None

    def enable_nan_hooks(
        self,
        debug_on_nan: bool = True,
        verbose: bool = False,
        layer_types: Optional[Tuple[type, ...]] = None,
    ) -> NaNDetectionHookManager:
        """
        Enable forward hooks to detect NaN/Inf in intermediate layer outputs.

        Attaches hooks to the model's layers that check each layer's output
        for NaN/Inf during the forward pass. When a bad value is detected,
        the hook prints diagnostics and optionally enters the debugger.

        Useful for debugging in pdb:
            (Pdb) pl_module.enable_nan_hooks()
            (Pdb) outputs = pl_module(batch['image'])
            # Will stop at the first layer producing NaN

        Args:
            debug_on_nan: If True, enter pdb when NaN is detected (default: True)
            verbose: If True, print stats for every layer (slow; default: False)
            layer_types: Tuple of layer types to hook (default: all common layers)

        Returns:
            The NaNDetectionHookManager instance managing the attached hooks
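
        Example (a sketch; `dbg` is a DebugManager and the layer classes are
        illustrative choices, not a required set):

            import torch.nn as nn
            # Only hook convolution and normalization layers
            dbg.enable_nan_hooks(layer_types=(nn.Conv3d, nn.BatchNorm3d))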
        """
        if self._hook_manager is not None:
            print("⚠️ Hooks already enabled. Call disable_nan_hooks() first.")
            return self._hook_manager

        self._hook_manager = NaNDetectionHookManager(
            model=self.model,
            debug_on_nan=debug_on_nan,
            verbose=verbose,
            collect_stats=True,
            layer_types=layer_types,
        )

        return self._hook_manager

    def disable_nan_hooks(self):
        """
        Disable the forward hooks for NaN detection.

        Removes all hooks that were attached by enable_nan_hooks().
        """
        if self._hook_manager is not None:
            self._hook_manager.remove_hooks()
            self._hook_manager = None
        else:
            print("⚠️ No hooks to remove.")

    def get_hook_stats(self) -> Optional[Dict[str, Dict[str, Any]]]:
        """
        Get statistics collected by the NaN detection hooks.

        Returns:
            Dictionary mapping layer names to their statistics, or None if
            hooks are not enabled
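
        Example (a sketch; the per-layer keys are whatever
        NaNDetectionHookManager.get_stats() collects):

            stats = dbg.get_hook_stats()
            if stats is not None:
                for layer_name, layer_stats in stats.items():
                    print(layer_name, layer_stats)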
        """
        if self._hook_manager is not None:
            return self._hook_manager.get_stats()
        else:
            print("⚠️ Hooks not enabled. Call enable_nan_hooks() first.")
            return None

    def print_hook_summary(self):
        """
        Print a summary of the NaN detection hook statistics.

        Shows which layers detected NaN/Inf and how many times.
        """
        if self._hook_manager is not None:
            self._hook_manager.print_summary()
        else:
            print("⚠️ Hooks not enabled. Call enable_nan_hooks() first.")

    def check_for_nan(self, check_grads: bool = True, verbose: bool = True) -> Dict[str, Any]:
        """
        Check model parameters and gradients for NaN/Inf values.

        Useful when debugging in pdb. Call as: pl_module.check_for_nan()

        Args:
            check_grads: If True, also check gradients (default: True)
            verbose: If True, print details for each offending tensor (default: True)

        Returns:
            Dictionary with the offending (name, shape) pairs per category,
            plus 'has_nan' and 'has_inf' flags
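
        Example (a sketch, e.g. from a pdb prompt after a suspicious loss):

            info = self.check_for_nan(verbose=False)
            if info['has_nan'] or info['has_inf']:
                # Names of the offending parameters and gradients
                bad = (info['nan_params'] + info['inf_params']
                       + info['nan_grads'] + info['inf_grads'])
                print([name for name, shape in bad])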
        """
        nan_params = []
        inf_params = []
        nan_grads = []
        inf_grads = []

        for name, param in self.model.named_parameters():
            # Check parameters
            if torch.isnan(param).any():
                nan_params.append((name, param.shape))
                if verbose:
                    print(f"⚠️ NaN in parameter: {name}, shape={param.shape}")
            if torch.isinf(param).any():
                inf_params.append((name, param.shape))
                if verbose:
                    print(f"⚠️ Inf in parameter: {name}, shape={param.shape}")

            # Check gradients
            if check_grads and param.grad is not None:
                if torch.isnan(param.grad).any():
                    nan_grads.append((name, param.grad.shape))
                    if verbose:
                        print(f"⚠️ NaN in gradient: {name}, shape={param.grad.shape}")
                if torch.isinf(param.grad).any():
                    inf_grads.append((name, param.grad.shape))
                    if verbose:
                        print(f"⚠️ Inf in gradient: {name}, shape={param.grad.shape}")

        result = {
            'nan_params': nan_params,
            'inf_params': inf_params,
            'nan_grads': nan_grads,
            'inf_grads': inf_grads,
            'has_nan': len(nan_params) > 0 or len(nan_grads) > 0,
            'has_inf': len(inf_params) > 0 or len(inf_grads) > 0,
        }

        if verbose:
            if not result['has_nan'] and not result['has_inf']:
                print("✅ No NaN/Inf found in parameters or gradients")
            else:
                print("\n📊 Summary:")
                print(f"  NaN parameters: {len(nan_params)}")
                print(f"  Inf parameters: {len(inf_params)}")
                print(f"  NaN gradients: {len(nan_grads)}")
                print(f"  Inf gradients: {len(inf_grads)}")

        return result