Skip to content

Commit cbb109d

Browse files
authored
switch to samples_count in logging of llama2_70b_lora (#725)
* switch to samples_count in logging of llama2_70b_lora
* fix logging issue with GBS and update readme
1 parent 09373f2 commit cbb109d

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

llama2_70b_lora/scripts/mlperf_logging_utils.py

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -123,9 +123,7 @@ def on_train_begin(self, args, state, control, **kwargs):
123123
)
124124
self.mllogger.event(
125125
key=constants.GLOBAL_BATCH_SIZE,
126-
value=args.per_device_train_batch_size
127-
* args.gradient_accumulation_steps
128-
* os.getenv("WORLD_SIZE", 1),
126+
value=self.gbs,
129127
)
130128
self.mllogger.event(
131129
key=constants.TRAIN_SAMPLES,
@@ -168,25 +166,25 @@ def on_step_begin(
168166
self.mllogger.event(
169167
"train_loss",
170168
value=state.log_history[-1]["loss"],
171-
metadata={"step_num": state.log_history[-1]["step"]},
169+
metadata={"samples_count": state.log_history[-1]["step"]*self.gbs},
172170
)
173171
control.should_log = True
174172

175173
if state.global_step % (state.eval_steps) == 0 and state.global_step > 0:
176174
self.mllogger.end(
177175
constants.BLOCK_STOP,
178176
value="",
179-
metadata={"step_num": state.log_history[-1]["step"]},
177+
metadata={"samples_count": state.log_history[-1]["step"]*self.gbs},
180178
)
181179
self.mllogger.event(
182180
constants.EVAL_ACCURACY,
183181
value=state.log_history[-1]["eval_loss"],
184-
metadata={"samples_num": state.log_history[-1]["step"]*self.gbs},
182+
metadata={"samples_count": state.log_history[-1]["step"]*self.gbs},
185183
)
186184
self.mllogger.start(
187185
constants.BLOCK_START,
188186
value="",
189-
metadata={"step_num": state.log_history[-1]["step"]},
187+
metadata={"samples_count": state.log_history[-1]["step"]},
190188
)
191189
control.should_log = True
192190
eval_loss_list = [
@@ -198,7 +196,7 @@ def on_step_begin(
198196
constants.RUN_STOP,
199197
value=eval_loss_list[-1],
200198
metadata={
201-
"samples_num": state.log_history[-1]["step"]*self.gbs,
199+
"samples_count": state.log_history[-1]["step"]*self.gbs,
202200
"status": "success",
203201
},
204202
)
@@ -207,7 +205,7 @@ def on_step_begin(
207205
self.mllogger.end(
208206
constants.RUN_STOP,
209207
value=eval_loss_list[-1],
210-
metadata={"step_num": state.log_history[-1]["step"], "status": "fail"},
208+
metadata={"samples_count": state.log_history[-1]["step"]*self.gbs, "status": "fail"},
211209
)
212210

213211
return control

0 commit comments

Comments
 (0)