switch to samples_count in logging of llama2_70b_lora (#725)

itayhubara · web-flow · commit cbb109dea6ea · 2024-04-04T08:43:13.000-07:00
* Switch to `samples_count` as the metadata key in llama2_70b_lora logging

* Fix the global batch size (GBS) logging issue and update the README
diff --git a/llama2_70b_lora/scripts/mlperf_logging_utils.py b/llama2_70b_lora/scripts/mlperf_logging_utils.py
@@ -123,9 +123,7 @@ def on_train_begin(self, args, state, control, **kwargs):
         )
         self.mllogger.event(
             key=constants.GLOBAL_BATCH_SIZE,
-            value=args.per_device_train_batch_size
-            * args.gradient_accumulation_steps
-            * os.getenv("WORLD_SIZE", 1),
+            value=self.gbs,
         )
         self.mllogger.event(
             key=constants.TRAIN_SAMPLES,
@@ -168,25 +166,25 @@ def on_step_begin(
             self.mllogger.event(
                 "train_loss",
                 value=state.log_history[-1]["loss"],
-                metadata={"step_num": state.log_history[-1]["step"]},
+                metadata={"samples_count": state.log_history[-1]["step"]*self.gbs},
             )
             control.should_log = True
 
         if state.global_step % (state.eval_steps) == 0 and state.global_step > 0:
             self.mllogger.end(
                 constants.BLOCK_STOP,
                 value="",
-                metadata={"step_num": state.log_history[-1]["step"]},
+                metadata={"samples_count": state.log_history[-1]["step"]*self.gbs},
             )
             self.mllogger.event(
                 constants.EVAL_ACCURACY,
                 value=state.log_history[-1]["eval_loss"],
-                metadata={"samples_num": state.log_history[-1]["step"]*self.gbs},
+                metadata={"samples_count": state.log_history[-1]["step"]*self.gbs},
             )
             self.mllogger.start(
                 constants.BLOCK_START,
                 value="",
-                metadata={"step_num": state.log_history[-1]["step"]},
+                metadata={"samples_count": state.log_history[-1]["step"]},
             )            
             control.should_log = True
         eval_loss_list = [
@@ -198,7 +196,7 @@ def on_step_begin(
                 constants.RUN_STOP,
                 value=eval_loss_list[-1],
                 metadata={
-                    "samples_num": state.log_history[-1]["step"]*self.gbs,
+                    "samples_count": state.log_history[-1]["step"]*self.gbs,
                     "status": "success",
                 },
             )
@@ -207,7 +205,7 @@ def on_step_begin(
             self.mllogger.end(
                 constants.RUN_STOP,
                 value=eval_loss_list[-1],
-                metadata={"step_num": state.log_history[-1]["step"], "status": "fail"},
+                metadata={"samples_count": state.log_history[-1]["step"]*self.gbs, "status": "fail"},
             )
 
         return control