Skip to content
This repository was archived by the owner on Feb 25, 2022. It is now read-only.

Commit 8875b3a

Browse files
authored
Update initial heartbeat timeout to function as a grace period
1 parent bb00ad0 commit 8875b3a

File tree

1 file changed

+7
-4
lines changed

1 file changed

+7
-4
lines changed

run_experiment.py

+7 -4
Original file line number | Diff line number | Diff line change
@@ -173,15 +173,21 @@ def main(_run):
173 173
curr_step = {}
174 174
seen_predictions = set()
175 175

176+
heartbeat_timeout = args.initial_heartbeat_timeout * 2
176 177
while True:
177-
heartbeat_timeout = args.initial_heartbeat_timeout
178 178
last_tb_log_time = time.time()
179+
start_time = time.time()
179 180
q = queue.Queue()
180 181
trainthd = threading.Thread(target=train_thread, args=(args, args.tpu, _run._id, q))
181 182
trainthd.start()
182 183

183 184
while trainthd.is_alive():
184 185
time.sleep(60)
186+
187+
if start_time + args.initial_heartbeat_timeout < time.time():
188+
# after initial args.initial_heartbeat_timeout grace period, now we want to set the timeout threshold much lower
189+
heartbeat_timeout = args.heartbeat_timeout
190+
185 191
print('Polling tensorboard for metrics...')
186 192
data = get_run_data(tensorboard_port)
187 193
for k in data.keys():
@@ -195,9 +201,6 @@ def main(_run):
195 201

196 202
# found something new, so logging!
197 203
last_tb_log_time = time.time()
198-
199-
# after logging for the first time, how we want to set the timeout threshold much lower
200-
heartbeat_timeout = args.heartbeat_timeout
201 204

202 205
curr_step[k] = step
203 206

0 commit comments

Comments
 (0)