File tree: 2 files changed, +3 -3 lines changed
Original file line number | Diff line number | Diff line change
99N_NODE=4
1010N_GPU_PER_NODE=8
1111
12- # You need to export $RANK , $MASTER_ADDR, $MASTER_PORT automatically for each Node.
12+ # You need to export $MACHINE_RANK, $MASTER_ADDR, and $MASTER_PORT automatically on each node.
1313
1414# config path
1515CONFIG=" configs/xxx_train_config.json"
@@ -37,7 +37,7 @@ accelerate launch \
3737 --mixed_precision ' bf16' \
3838 --dynamo_backend ' no' \
3939 --same_network \
40- --machine_rank $RANK \
40+ --machine_rank $MACHINE_RANK \
4141 --main_process_ip $MASTER_ADDR \
4242 --main_process_port $MASTER_PORT \
4343 --rdzv_backend ' static' \
Original file line number Diff line number Diff line change @@ -431,7 +431,7 @@ def accelerate_train(self):
431431 # Training Loop!
432432 for epoch in range (starting_epoch , self .args .num_train_epochs ):
433433 # set_epoch
434- self .train_dataloader .set_epoch (epoch )
434+ # self.train_dataloader.set_epoch(epoch)
435435
436436 # if we early stop by some ckpts not converging
437437 if self .args .early_stopping and stall_num == self .args .early_stopping_stall_num :
You can’t perform that action at this time.
0 commit comments