From 977f278d769ca5be6201b7cafe6591247145c7e4 Mon Sep 17 00:00:00 2001
From: Tanisha
Date: Tue, 4 Nov 2025 15:35:20 +0000
Subject: [PATCH 1/6] Adding support to load checkpoints from epoch

Signed-off-by: Tanisha
---
 QEfficient/finetune/utils/train_utils.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index e9e1320de..00476eef3 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -123,8 +123,13 @@ def train(
             break
 
         if train_config.use_peft and train_config.from_peft_checkpoint:
-            intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
-            intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
+            try:
+                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
+                intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
+            except Exception:
+                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1]) -1
+                intermediate_step=0
+
             if epoch < intermediate_epoch:
                 logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
                 continue
@@ -154,6 +159,12 @@ def train(
                 # resume training from a particular checkpoint, assuming the dataset is not shuffled
                 if train_config.use_peft and train_config.from_peft_checkpoint:
                     # to bring the count of train_step in sync with where it left off
+                    if intermediate_step == 0 and epoch == intermediate_epoch:
+                        logger.log_rank_zero(
+                            f"Skipping epoch {epoch + 1}, since fine tuning has already completed for it."
+                        )
+                        break
+
                     if epoch == intermediate_epoch and step == 0:
                         logger.log_rank_zero(
                             f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it."

From 9ce2d13f616830485c4962f99a0e7a8ab456628e Mon Sep 17 00:00:00 2001
From: Tanisha
Date: Tue, 4 Nov 2025 15:39:55 +0000
Subject: [PATCH 2/6] Adding support to load checkpoints from epoch

Signed-off-by: Tanisha
---
 QEfficient/finetune/utils/train_utils.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index 00476eef3..824b94723 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -123,12 +123,12 @@ def train(
             break
 
         if train_config.use_peft and train_config.from_peft_checkpoint:
-            try: 
+            try:
                 intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
                 intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
             except Exception:
-                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1]) -1
-                intermediate_step=0
+                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1]) - 1
+                intermediate_step = 0
 
             if epoch < intermediate_epoch:
                 logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
                 continue
@@ -160,9 +160,7 @@ def train(
                 # resume training from a particular checkpoint, assuming the dataset is not shuffled
                 if train_config.use_peft and train_config.from_peft_checkpoint:
                     # to bring the count of train_step in sync with where it left off
                     if intermediate_step == 0 and epoch == intermediate_epoch:
-                        logger.log_rank_zero(
-                            f"Skipping epoch {epoch + 1}, since fine tuning has already completed for it."
-                        )
+                        logger.log_rank_zero(f"Skipping epoch {epoch + 1}, since fine tuning has already completed for it.")
                         break
 
                     if epoch == intermediate_epoch and step == 0:

From 4fb76eba6f26579bc44f30c01d43ea30e10085ed Mon Sep 17 00:00:00 2001
From: Tanisha
Date: Wed, 5 Nov 2025 04:23:36 +0000
Subject: [PATCH 3/6] Adding support to load checkpoints from epoch

Signed-off-by: Tanisha
---
 QEfficient/finetune/utils/train_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index 824b94723..494b6bde6 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -133,6 +133,9 @@ def train(
             if epoch < intermediate_epoch:
                 logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
                 continue
+            if intermediate_step == 0 and epoch == intermediate_epoch:
+                logger.log_rank_zero(f"Skipping epoch {epoch + 1}, since fine tuning has already completed for it.")
+                continue
 
         logger.log_rank_zero(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
         if max_steps_reached:
@@ -159,9 +162,6 @@ def train(
                 # resume training from a particular checkpoint, assuming the dataset is not shuffled
                 if train_config.use_peft and train_config.from_peft_checkpoint:
                     # to bring the count of train_step in sync with where it left off
-                    if intermediate_step == 0 and epoch == intermediate_epoch:
-                        logger.log_rank_zero(f"Skipping epoch {epoch + 1}, since fine tuning has already completed for it.")
-                        break
 
                     if epoch == intermediate_epoch and step == 0:
                         logger.log_rank_zero(

From 5443ac6b537b0d31c298cc006d28e3ef9910a919 Mon Sep 17 00:00:00 2001
From: Tanisha
Date: Wed, 5 Nov 2025 04:55:16 +0000
Subject: [PATCH 4/6] Adding support to load checkpoints from epoch

Signed-off-by: Tanisha
---
 QEfficient/finetune/utils/train_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index 494b6bde6..6113f2b03 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -126,7 +126,7 @@ def train(
             try:
                 intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
                 intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
-            except Exception:
+            except (IndexError, ValueError):
                 intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1]) - 1
                 intermediate_step = 0
 
             if epoch < intermediate_epoch:

From eea687cebdfcadedb87f284d1052b0a8497041aa Mon Sep 17 00:00:00 2001
From: Tanisha
Date: Mon, 10 Nov 2025 09:34:56 +0000
Subject: [PATCH 5/6] [QEff. Finetune]: Adding support to load checkpoints from epoch

Signed-off-by: Tanisha
---
 QEfficient/finetune/utils/train_utils.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index 6113f2b03..169b34a45 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -123,11 +123,12 @@ def train(
             break
 
         if train_config.use_peft and train_config.from_peft_checkpoint:
+            path = train_config.from_peft_checkpoint.rstrip("/")
             try:
-                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
-                intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
+                intermediate_epoch = int(path.split("/")[-2].split("_")[-1]) - 1
+                intermediate_step = int(path.split("/")[-1].split("_")[-1])
             except (IndexError, ValueError):
-                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1]) - 1
+                intermediate_epoch = int(path.split("/")[-1].split("_")[-1])-1
                 intermediate_step = 0
 
             if epoch < intermediate_epoch:
@@ -373,8 +374,8 @@ def train(
                     eval_loss,
                     eval_step_metric,
                     eval_metric,
-                )
+            )
-    avg_epoch_time = sum(epoch_times) / len(epoch_times)
+    avg_epoch_time = sum(epoch_times) / len(epoch_times) if len(epoch_times) > 0 else 0
     avg_checkpoint_time = sum(checkpoint_times) / len(checkpoint_times) if len(checkpoint_times) > 0 else 0
 
     results["last_epoch_train_loss"] = train_epoch_loss.cpu()

From 3d7ce0a0eea8a3ae656b742924d601df927fb0b8 Mon Sep 17 00:00:00 2001
From: Tanisha
Date: Mon, 10 Nov 2025 09:38:40 +0000
Subject: [PATCH 6/6] [QEff. Finetune]: Adding support to load checkpoints from epoch

Signed-off-by: Tanisha
---
 QEfficient/finetune/utils/train_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index 169b34a45..45b995124 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -128,7 +128,7 @@ def train(
                 intermediate_epoch = int(path.split("/")[-2].split("_")[-1]) - 1
                 intermediate_step = int(path.split("/")[-1].split("_")[-1])
             except (IndexError, ValueError):
-                intermediate_epoch = int(path.split("/")[-1].split("_")[-1])-1
+                intermediate_epoch = int(path.split("/")[-1].split("_")[-1]) - 1
                 intermediate_step = 0
 
             if epoch < intermediate_epoch:
@@ -374,7 +374,7 @@ def train(
                     eval_loss,
                     eval_step_metric,
                     eval_metric,
-            )
+                )
     avg_epoch_time = sum(epoch_times) / len(epoch_times) if len(epoch_times) > 0 else 0
     avg_checkpoint_time = sum(checkpoint_times) / len(checkpoint_times) if len(checkpoint_times) > 0 else 0
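
Taken together, the series lands on this resume behavior: a step-level checkpoint path such as .../epoch_3/step_500 yields both an epoch offset and a step offset, while an epoch-level path such as .../epoch_3 means that epoch finished completely, so training skips it and resumes at step 0 of the next one. Below is a minimal standalone sketch of the parsing logic the patches converge on; the helper name parse_peft_checkpoint_path is hypothetical (the real code inlines this in train() in QEfficient/finetune/utils/train_utils.py), but the body mirrors patch 6:

def parse_peft_checkpoint_path(checkpoint_path: str) -> tuple:
    """Recover (intermediate_epoch, intermediate_step) from a PEFT checkpoint path.

    Hypothetical helper illustrating the logic of patches 1-6.
    Supports both checkpoint layouts:
      .../epoch_3/step_500 -> epoch index 2, resume after step 500
      .../epoch_3          -> epoch index 2 fully trained, resume at step 0
    """
    path = checkpoint_path.rstrip("/")
    try:
        # Step-level layout: the parent directory names the epoch,
        # the leaf directory names the step.
        intermediate_epoch = int(path.split("/")[-2].split("_")[-1]) - 1
        intermediate_step = int(path.split("/")[-1].split("_")[-1])
    except (IndexError, ValueError):
        # Epoch-level layout: the leaf directory itself is epoch_N and that
        # epoch is complete, so resume from step 0 of the following epoch.
        intermediate_epoch = int(path.split("/")[-1].split("_")[-1]) - 1
        intermediate_step = 0
    return intermediate_epoch, intermediate_step


if __name__ == "__main__":
    assert parse_peft_checkpoint_path("out/epoch_3/step_500") == (2, 500)
    assert parse_peft_checkpoint_path("out/epoch_3/") == (2, 0)

Narrowing the handler to (IndexError, ValueError), as patch 4 does, keeps the fallback honest: a path that matches neither layout (e.g. out/final) still raises from inside the except branch instead of silently resuming training at the wrong position.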