Merge pull request #167 from BoyuShen2004/master

donglaiw · web-flow · commit 1014d8dac0d1 · 2025-11-08T22:58:27.000-05:00
fix inference sliding_window bug
diff --git a/.latest_timestamp b/.latest_timestamp
@@ -0,0 +1 @@
+20251108_212916
diff --git a/connectomics/config/hydra_config.py b/connectomics/config/hydra_config.py
@@ -514,6 +514,7 @@ class CheckpointConfig:
     """Model checkpointing configuration."""
 
     monitor: str = "train_loss_total_epoch"
+    dirpath: Optional[str] = None
     mode: str = "min"
     save_top_k: int = 1
     save_last: bool = True
@@ -831,6 +832,7 @@ class InferenceDataConfig:
     test_transpose: List[int] = field(
         default_factory=list
     )  # Axis permutation for test data (e.g., [2,1,0] for xyz->zyx)
+    output_path: Optional[str] = None  # Optional explicit directory for inference outputs
     output_name: str = (
         "predictions.h5"  # Output filename (auto-pathed to inference/{checkpoint}/{output_name})
     )
@@ -845,6 +847,7 @@ class SlidingWindowConfig:
 
     window_size: Optional[List[int]] = None
     sw_batch_size: Optional[int] = None  # If None, will use system.inference.batch_size
+    overlap: Optional[Any] = None  # Overlap between window passes (float or sequence)
     stride: Optional[List[int]] = None  # Explicit stride for controlling window movement
     blending: str = "gaussian"  # 'gaussian' or 'constant' - blending mode for overlapping patches
     sigma_scale: float = (
diff --git a/install.py b/install.py
@@ -431,6 +431,17 @@ def install_pytorch_connectomics(
         print_success(f"Core packages installed: {', '.join(to_install)}")
     else:
         print_success("All core packages already installed")
+    print_info("Ensuring numpy and h5py are installed from conda-forge (force reinstall)...")
+    code, _, stderr = run_command(
+        f"conda install -n {env_name} -c conda-forge numpy h5py -y --force-reinstall",
+        check=False,
+    )
+    if code != 0:
+        print_warning("conda reinstall of numpy/h5py failed; please verify the environment manually")
+        if stderr.strip():
+            print_warning(stderr.strip())
+    else:
+        print_success("numpy and h5py verified via conda-forge")
 
     # Group 2: Optional scientific packages (nice to have, but slow to install)
     optional_packages = ["scipy", "scikit-learn", "scikit-image", "opencv"]
diff --git a/scripts/main.py b/scripts/main.py
@@ -182,17 +182,24 @@ def setup_config(args) -> Config:
     config_name = config_path.stem  # Get filename without extension
     output_folder = f"outputs/{config_name}/"
 
-    # Update checkpoint dirpath to use the new output folder
-    cfg.monitor.checkpoint.dirpath = f"{output_folder}checkpoints/"
+    # Update checkpoint dirpath only if not provided by the user
+    if not getattr(cfg.monitor.checkpoint, "dirpath", None):
+        cfg.monitor.checkpoint.dirpath = str(Path(output_folder) / "checkpoints")
+    else:
+        cfg.monitor.checkpoint.dirpath = str(Path(cfg.monitor.checkpoint.dirpath))
 
-    # Update inference output path to use the new output folder
-    cfg.inference.data.output_path = f"{output_folder}results/"
+    # Update inference output path only if not provided by the user
+    if not getattr(cfg.inference.data, "output_path", None):
+        cfg.inference.data.output_path = str(Path(output_folder) / "results")
+    else:
+        cfg.inference.data.output_path = str(Path(cfg.inference.data.output_path))
 
     # Note: We handle timestamping manually in main() to create run directories
     # Set this to False to prevent PyTorch Lightning from adding its own timestamp
     cfg.monitor.checkpoint.use_timestamp = False
 
-    print(f"📁 Output folder set to: {output_folder}")
+    print(f"📁 Checkpoints base directory: {cfg.monitor.checkpoint.dirpath}")
+    print(f"📂 Inference output directory: {cfg.inference.data.output_path}")
 
     # Apply CLI overrides
     if args.overrides:
@@ -1111,8 +1118,9 @@ def main():
     # Subsequent invocations (with LOCAL_RANK set) reuse the existing timestamp.
     if args.mode == "train":
         # Extract output folder from checkpoint dirpath (remove /checkpoints suffix)
-        checkpoint_dirpath = cfg.monitor.checkpoint.dirpath
-        output_base = Path(checkpoint_dirpath).parent  # This gives us outputs/experiment_name/
+        checkpoint_dir = Path(cfg.monitor.checkpoint.dirpath)
+        checkpoint_subdir = checkpoint_dir.name or "checkpoints"
+        output_base = checkpoint_dir.parent  # Base directory containing timestamped runs
 
         # Check if this is a DDP re-launch (LOCAL_RANK is set by PyTorch Lightning)
         import os
@@ -1129,10 +1137,11 @@ def main():
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             run_dir = output_base / timestamp
 
-            # Update checkpoint dirpath to use the timestamped directory
-            cfg.monitor.checkpoint.dirpath = str(run_dir / "checkpoints")
+            # Update checkpoint dirpath to use the timestamped directory (preserve leaf name)
+            checkpoint_path = run_dir / checkpoint_subdir
+            cfg.monitor.checkpoint.dirpath = str(checkpoint_path)
 
-            run_dir.mkdir(parents=True, exist_ok=True)
+            checkpoint_path.mkdir(parents=True, exist_ok=True)
             print(f"📁 Run directory: {run_dir}")
 
             # Save config to run directory
@@ -1156,7 +1165,8 @@ def main():
             if timestamp_file.exists():
                 timestamp = timestamp_file.read_text().strip()
                 run_dir = output_base / timestamp
-                cfg.monitor.checkpoint.dirpath = str(run_dir / "checkpoints")
+                checkpoint_path = run_dir / checkpoint_subdir
+                cfg.monitor.checkpoint.dirpath = str(checkpoint_path)
                 print(f"📁 [DDP Rank {local_rank}] Using run directory: {run_dir}")
             else:
                 raise RuntimeError(
diff --git a/tutorials/monai2d_worm.yaml b/tutorials/monai2d_worm.yaml
@@ -62,8 +62,8 @@ data:
   do_2d: true                     # Enable 2D data processing (extract 2D slices from 3D volumes)
   
   # Volume configuration
-  train_image: /projects/weilab/shenb/PyTC/datasets/Dataset001_worm_image96/imagesTr/*.tif
-  train_label: /projects/weilab/shenb/PyTC/datasets/Dataset001_worm_image96/labelsTr/*.tif
+  train_image: /orcd/data/edboyden/002/shenb/PyTC/datasets/Dataset001_worm_image96/imagesTr/*.tif
+  train_label: /orcd/data/edboyden/002/shenb/PyTC/datasets/Dataset001_worm_image96/labelsTr/*.tif
   train_resolution: [5, 5]        # Lucchi EM: 5nm isotropic resolution
   use_preloaded_cache: true            # Load volumes into memory for fast training
 
@@ -158,7 +158,7 @@ monitor:
     save_top_k: 1
     save_last: true
     save_every_n_epochs: 10
-    dirpath: checkpoints/  # Will be dynamically set to outputs/{yaml_filename}/YYYYMMDD_HHMMSS/checkpoints/
+    dirpath: outputs/monai2d_worm/checkpoints/  # Will be dynamically set to outputs/{yaml_filename}/YYYYMMDD_HHMMSS/checkpoints/
     # checkpoint_filename: auto-generated from monitor metric (epoch={epoch:03d}-{monitor}={value:.4f})
     use_timestamp: true       # Enable timestamped subdirectories (YYYYMMDD_HHMMSS)
 
@@ -177,7 +177,7 @@ monitor:
 inference:
   data:
     do_2d: true                     # Enable 2D data processing for inference
-    test_image: /projects/weilab/shenb/PyTC/datasets/Dataset001_worm_image96/imagesTs/*.tif
+    test_image: /orcd/data/edboyden/002/shenb/wormbehavior/13/*.tif
     test_label:
     test_resolution: [5, 5]
     output_path: outputs/monai2d_worm/results/