Open
Description
python -m train --cfg configs/config_h3d_stage2.yaml --nodebug
Epoch 0/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:47 • 0:00:00 1.67it/s 2025-03-27 12:53:34,870 Epoch 0:
Epoch 1/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:49 • 0:00:00 1.68it/s 2025-03-27 12:59:41,727 Epoch 1: loss_total 4.618e+00
Epoch 2/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:47 • 0:00:00 1.68it/s 2025-03-27 13:06:26,242 Epoch 2: loss_total 3.436e+00
Epoch 3/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:46 • 0:00:00 1.67it/s 2025-03-27 13:13:14,351 Epoch 3: loss_total 3.022e+00
Epoch 4/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:48 • 0:00:00 1.68it/s 2025-03-27 13:20:02,474 Epoch 4: loss_total 2.830e+00
Epoch 5/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:47 • 0:00:00 1.67it/s 2025-03-27 13:26:48,871 Epoch 5: loss_total 2.671e+00
Epoch 6/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:46 • 0:00:00 1.68it/s 2025-03-27 13:33:35,438 Epoch 6: loss_total 2.500e+00
Epoch 7/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:48 • 0:00:00 1.68it/s 2025-03-27 13:40:22,982 Epoch 7: loss_total 2.334e+00
Epoch 8/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:46 • 0:00:00 1.68it/s 2025-03-27 13:47:10,023 Epoch 8: loss_total 2.172e+00
Epoch 9/299 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 578/578 0:05:46 • 0:00:00 1.68it/s
[rank2]: Traceback (most recent call last):
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/runpy.py", line 196, in _run_module_as_main
[rank2]: return _run_code(code, main_globals, None,
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/runpy.py", line 86, in _run_code
[rank2]: exec(code, run_globals)
[rank2]: File "/data2/xxx/MotionGPT/train.py", line 94, in <module>
[rank2]: main()
[rank2]: File "/data2/xxx/MotionGPT/train.py", line 85, in main
[rank2]: trainer.fit(model, datamodule=datamodule)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
[rank2]: call._call_and_handle_interrupt(
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
[rank2]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
[rank2]: return function(*args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
[rank2]: self._run(model, ckpt_path=ckpt_path)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
[rank2]: results = self._run_stage()
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage
[rank2]: self.fit_loop.run()
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run
[rank2]: self.advance()
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance
[rank2]: self.epoch_loop.run(self._data_fetcher)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 151, in run
[rank2]: self.on_advance_end(data_fetcher)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 370, in on_advance_end
[rank2]: self.val_loop.run()
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
[rank2]: return loop_run(self, *args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
[rank2]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
[rank2]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
[rank2]: output = fn(*args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
[rank2]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
[rank2]: wrapper_output = wrapper_module(*args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank2]: return self._call_impl(*args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank2]: return forward_call(*args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1636, in forward
[rank2]: else self._run_ddp_forward(*inputs, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1454, in _run_ddp_forward
[rank2]: return self.module(*inputs, **kwargs) # type: ignore[index]
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank2]: return self._call_impl(*args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank2]: return forward_call(*args, **kwargs)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
[rank2]: out = method(*_args, **_kwargs)
[rank2]: File "/data2/xxx/MotionGPT/mGPT/models/base.py", line 28, in validation_step
[rank2]: return self.allsplit_step("val", batch, batch_idx)
[rank2]: File "/data2/xxx/MotionGPT/mGPT/models/mgpt.py", line 393, in allsplit_step
[rank2]: rs_set = self.val_t2m_forward(batch)
[rank2]: File "/home/xxx/anaconda3/envs/wsdm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank2]: return func(*args, **kwargs)
[rank2]: File "/data2/xxx/MotionGPT/mGPT/models/mgpt.py", line 166, in val_t2m_forward
[rank2]: outputs = self.lm.generate_conditional(texts,
[rank2]: File "/data2/xxx/MotionGPT/mGPT/archs/mgpt_lm.py", line 345, in generate_conditional
[rank2]: outputs_tokens, cleaned_text = self.generate_direct(inputs,
[rank2]: File "/data2/xxx/MotionGPT/mGPT/archs/mgpt_lm.py", line 274, in generate_direct
[rank2]: outputs_tokens, cleaned_text = self.motion_string_to_token(
[rank2]: File "/data2/xxx/MotionGPT/mGPT/archs/mgpt_lm.py", line 416, in motion_string_to_token
[rank2]: token_list = [
[rank2]: File "/data2/xxx/MotionGPT/mGPT/archs/mgpt_lm.py", line 417, in <listcomp>
[rank2]: int(i.split('_')[-1].replace('>', ''))
[rank2]: ValueError: invalid literal for int() with base 10: '427='
[rank: 2] Child process with PID 75745 terminated with code 1. Forcefully terminating all other processes to avoid zombies 🧟
已杀死 (Killed)
Metadata
Metadata
Assignees
Labels
No labels