
Commit 4ff865c

A bit of docstring and comment consistency cleanup, remove some debug code
1 parent dac2ec6 commit 4ff865c

6 files changed: +44 -18 lines changed

timm/data/naflex_dataset.py

Lines changed: 10 additions & 12 deletions
@@ -1,15 +1,16 @@
-"""
-Dynamic Sequence Length Datasets for Variable Resolution Image Processing
+""" Dynamic Sequence Length Datasets for Variable Resolution Image Processing
 
 Implements two dataset wrappers:
-1. DynamicSeqMapDataset - Map-style dataset that returns batches with variable sequence lengths
-2. DynamicSeqIterDataset - Iterable dataset that yields batches with variable sequence lengths
+1. NaFlexMapDatasetWrapper - Map-style dataset that returns batches with variable sequence lengths
+TODO: 2. NaFlexIterableDatasetWrapper - Iterable dataset that yields batches with variable sequence lengths
 
 Both support:
 - Pre-initialized transforms for efficiency
 - Distributed training
 - Multiple workers
 - Variable batch sizes based on sequence length
+
+Hacked together by / Copyright 2025, Ross Wightman, Hugging Face
 """
 
 import math
@@ -20,12 +21,10 @@
 
 import torch
 from torch.utils.data import Dataset, IterableDataset, DataLoader
-from torchvision import transforms
 from PIL import Image
 
-
-from .naflex_transforms import Patchify, patchify_image
-from ..layers import to_2tuple
+from .naflex_transforms import Patchify
+from timm.layers import to_2tuple
 
 
 def calculate_naflex_batch_size(
@@ -203,7 +202,7 @@ class NaFlexMapDatasetWrapper(IterableDataset):
     Yields batches with variable sequence lengths. It calculates a canonical
     batch schedule (sequence length, batch size pairs) once based on the
     total dataset size (padded for distribution). Each epoch, it shuffles
-    the *order* of this canonical schedule and the dataset indices.
+    the order of this canonical schedule and the dataset indices.
     This ensures a consistent number of batches and samples per epoch
     across all ranks. Handles distributed training and multiple workers.
     """
@@ -292,13 +291,13 @@ def __init__(
 
         self.mixup_fn = mixup_fn
 
-        # --- Canonical Schedule Calculation (Done Once) ---
+        # Canonical Schedule Calculation (Done Once)
         self._canonical_batch_schedule: List[Tuple[int, int]] = []
         self._num_batches_per_rank: int = 0
         self._padded_samples_per_rank: int = 0
         self._create_canonical_schedule()  # Calculate schedule based on padded size
 
-        # --- Per-Epoch State ---
+        # Per-Epoch State
         # Stores (seq_len, list_of_indices) for the current epoch, specific to this rank
         self._epoch_batches: List[Tuple[int, List[int]]] = []
         self._prepare_epoch_batches(self.epoch)  # setup for initial epoch
@@ -420,7 +419,6 @@ def _prepare_epoch_batches(self, epoch: int):
         if len(indices_for_ranks) != padded_total_len:
             raise RuntimeError(f"Internal Error: Padded index list length {len(indices_for_ranks)} does not match expected {padded_total_len}")
 
-
         # 3. Select indices for the current rank
         if self.distributed and self.world_size > 1:
             indices_this_rank = indices_for_ranks[self.rank::self.world_size]
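
The docstring touched above describes the wrapper's core trick: a canonical (seq_len, batch_size) schedule computed once from the padded dataset size, with only the schedule order and the sample indices reshuffled each epoch, so every rank sees the same number of batches and samples. A minimal self-contained sketch of that idea follows; function and parameter names are illustrative, not timm's internals.

import random
from typing import List, Sequence, Tuple

def make_canonical_schedule(
        padded_samples_per_rank: int,
        seq_lens: Sequence[int] = (128, 256, 576, 1024),
        max_tokens_per_batch: int = 4096,
) -> List[Tuple[int, int]]:
    # Build a fixed list of (seq_len, batch_size) pairs covering every
    # padded sample exactly once; batch size shrinks as sequence length
    # grows so the token count per batch stays roughly constant.
    schedule = []
    remaining = padded_samples_per_rank
    i = 0
    while remaining > 0:
        seq_len = seq_lens[i % len(seq_lens)]
        batch_size = min(max(1, max_tokens_per_batch // seq_len), remaining)
        schedule.append((seq_len, batch_size))
        remaining -= batch_size
        i += 1
    return schedule

# Per epoch: shuffle only the *order* of the canonical schedule, seeding
# with the epoch so all ranks agree on batch count and sizes.
schedule = make_canonical_schedule(10_000)
random.Random(0).shuffle(schedule)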

timm/data/naflex_loader.py

Lines changed: 11 additions & 0 deletions
@@ -1,3 +1,14 @@
+"""NaFlex data loader for dynamic sequence length training.
+
+This module provides a specialized data loader for Vision Transformer models that supports:
+- Dynamic sequence length sampling during training for improved efficiency
+- Variable patch size training with probabilistic selection
+- Patch-level random erasing augmentation
+- Efficient GPU prefetching with normalization
+
+Hacked together by / Copyright 2025, Ross Wightman, Hugging Face
+"""
+
 import math
 from contextlib import suppress
 from functools import partial
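
Of the features the new docstring lists, "variable patch size training with probabilistic selection" is the least self-explanatory. One simple way to realize it is to draw a patch size per batch from a weighted candidate list; the sizes and weights below are made-up examples, not timm's defaults.

import random

# Hypothetical candidate patch sizes and their selection probabilities.
PATCH_SIZES = (12, 16, 24)
PATCH_SIZE_WEIGHTS = (0.25, 0.5, 0.25)

def sample_patch_size(rng: random.Random) -> int:
    # One draw per batch; the model must support resizing its patch
    # embedding (FlexiViT-style) for this to be usable.
    return rng.choices(PATCH_SIZES, weights=PATCH_SIZE_WEIGHTS, k=1)[0]

rng = random.Random(42)
print([sample_patch_size(rng) for _ in range(8)])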

timm/data/naflex_mixup.py

Lines changed: 1 addition & 2 deletions
@@ -11,6 +11,7 @@
 all augmentation hyper‑parameters in one place, making it easy to plug into
 different dataset wrappers.
 
+Hacked together by / Copyright 2025, Ross Wightman, Hugging Face
 """
 import math
 import random
@@ -113,7 +114,6 @@ def mix_batch_variable_size(
 
             corrected_lam = 1.0 - cut_area / float(dest_area)
             lam_list[i] = corrected_lam
-            #print(i, 'Doing cutmix', yl_i, xl_i, yl_j, xl_j, ch, cw, lam_raw, corrected_lam)
         else:
             # Mixup: blend the entire overlap region
             patch_i = xi[:, top_i:top_i + oh, left_i:left_i + ow]
@@ -125,7 +125,6 @@
 
             corrected_lam = (dest_area - overlap_area) / dest_area + lam_raw * overlap_area / dest_area
             lam_list[i] = corrected_lam
-            #print(i, 'Doing mixup', top_i, left_i, top_j, left_j, (oh, ow), (hi, wi), (hj, wj), lam_raw, corrected_lam)
 
     return mixed_imgs, lam_list, pair_to
 
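
The deleted #print lines sat next to the lambda corrections, which deserve a worked example: because variable-size images rarely admit the exact requested cut or a full-image blend, the effective mixing ratio is recomputed from the actual areas. The numbers below are arbitrary illustrations of the two formulas visible in the diff.

# CutMix branch: lam reflects how much of the destination survives the cut.
dest_area = 224 * 224            # 50176
cut_area = 96 * 96               # 9216
corrected_lam = 1.0 - cut_area / float(dest_area)   # ~0.816

# Mixup branch: only the overlap region is blended, so the effective lam
# interpolates between 1.0 (no overlap) and lam_raw (full overlap).
lam_raw = 0.3
overlap_area = 160 * 160         # 25600
corrected_lam = (dest_area - overlap_area) / dest_area + lam_raw * overlap_area / dest_area
# = 0.4898 + 0.3 * 0.5102 ~ 0.643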

timm/data/naflex_random_erasing.py

Lines changed: 13 additions & 0 deletions
@@ -1,3 +1,16 @@
+"""Patch-level random erasing augmentation for NaFlex Vision Transformers.
+
+This module implements random erasing specifically designed for patchified images,
+operating at the patch granularity rather than pixel level. It supports two modes:
+- 'patch': Randomly erases individual patches (speckle-like noise)
+- 'region': Erases contiguous rectangular regions of patches (similar to original RandomErasing)
+
+The implementation is coordinate-aware, respecting valid patch boundaries and supporting
+variable patch sizes in NaFlex training.
+
+Hacked together by / Copyright 2025, Ross Wightman, Hugging Face
+"""
+
 import random
 import math
 from typing import Optional, Union, Tuple
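
The 'patch' mode the new docstring describes operates on already-patchified inputs. A minimal sketch of that mode, assuming a (num_patches, patch_dim) tensor and a validity mask for non-padding patches; names and API here are illustrative, not timm's.

import torch

def erase_random_patches(
        patches: torch.Tensor,      # (num_patches, patch_dim), patchified image
        valid: torch.Tensor,        # (num_patches,) bool mask of non-padding patches
        erase_prob: float = 0.5,
        erase_frac: float = 0.1,
) -> torch.Tensor:
    # Independently replace a random subset of valid patches with noise,
    # producing speckle-like corruption at patch granularity.
    if torch.rand(()) > erase_prob:
        return patches
    valid_idx = valid.nonzero(as_tuple=True)[0]
    n_erase = max(1, int(erase_frac * valid_idx.numel()))
    chosen = valid_idx[torch.randperm(valid_idx.numel())[:n_erase]]
    patches[chosen] = torch.randn_like(patches[chosen])  # noise fill
    return patches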

timm/data/naflex_transforms.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@
 - FlexiViT: https://arxiv.org/abs/2212.08013
 
 Enables variable resolution/aspect ratio image handling with efficient patching.
+
+Hacked together by / Copyright 2025, Ross Wightman, Hugging Face
 """
 
 import math
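
For readers new to this module, the operation everything builds on is patchifying an image of arbitrary resolution and aspect ratio into flattened patch tokens plus their grid coordinates. A rough sketch of that idea, not the actual Patchify transform:

import torch

def simple_patchify(img: torch.Tensor, patch_size: int = 16):
    # Crop to a multiple of patch_size, then return flattened patches
    # plus their (row, col) grid coordinates.
    c, h, w = img.shape
    gh, gw = h // patch_size, w // patch_size
    img = img[:, :gh * patch_size, :gw * patch_size]
    # (C, gh, P, gw, P) -> (gh*gw, P*P*C)
    patches = (
        img.reshape(c, gh, patch_size, gw, patch_size)
           .permute(1, 3, 2, 4, 0)
           .reshape(gh * gw, patch_size * patch_size * c)
    )
    coords = torch.stack(torch.meshgrid(
        torch.arange(gh), torch.arange(gw), indexing='ij'), dim=-1).reshape(-1, 2)
    return patches, coords

patches, coords = simple_patchify(torch.randn(3, 224, 320))
assert patches.shape == (14 * 20, 16 * 16 * 3)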

timm/models/naflexvit.py

Lines changed: 7 additions & 4 deletions
@@ -1,16 +1,19 @@
-""" Vision Transformer (New)
+""" NaFlex Vision Transformer
 
 An improved version of the Vision Transformer with:
 1. Encapsulated embedding and position encoding in a single module
 2. Support for linear patch embedding on pre-patchified inputs
-3. Support for NaFlex functionality (NaViT + FlexiViT)
+3. Support for NaFlex variable aspect, variable resolution
+4. Support for FlexiViT variable patch size
+5. Support for NaViT fractional/factorized position embedding
 
-Based on:
+Based on ideas from:
 - Original Vision Transformer: https://arxiv.org/abs/2010.11929
 - FlexiViT: https://arxiv.org/abs/2212.08013
 - NaViT: https://arxiv.org/abs/2307.06304
+- NaFlex (SigLip-2): https://arxiv.org/abs/2502.14786
 
-Copyright 2025
+Hacked together by / Copyright 2025, Ross Wightman, Hugging Face
 """
 
 import logging
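
For context on how the retitled module is consumed, a hedged usage sketch: the model name below is hypothetical, so check timm's model registry for the actual naflexvit entries and supported kwargs.

import timm
import torch

# Model name is an assumption for illustration; verify against
# timm.list_models('naflexvit*') on your installed version.
model = timm.create_model('naflexvit_base_patch16', pretrained=False, num_classes=10)
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))  # standard NCHW input path
print(out.shape)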
