I encountered a problem during training at Epoch 0 (71%|█████████▏ | 5415/7667 [53:48<22:22, 1.68it/s, loss=0.0182, v_num=vzmh, ema_decay=0.998, train/mse_loss=0.00714, train/id_loss=0.180, train/total_loss=0.0161]). The traceback is as follows:
Traceback (most recent call last): File "src/train.py", line 149, in main metric_dict, _ = train(cfg) File "src/train.py", line 92, in train trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path")) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit self._call_and_handle_interrupt( File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 662, in _call_and_handle_interrupt self.strategy.reconciliate_processes(traceback.format_exc()) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 451, in reconciliate_processes raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0 Traceback (most recent call last): File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 648, in _call_and_handle_interrupt return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch return function(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 737, in _fit_impl results = self._run(model, ckpt_path=self.ckpt_path) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1168, in _run results = self._run_stage() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1254, in _run_stage return self._run_train() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1285, in _run_train 
self.fit_loop.run() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 270, in advance self._outputs = self.epoch_loop.run(self._data_fetcher) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 203, in advance batch_output = self.batch_loop.run(kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 87, in advance outputs = self.optimizer_loop.run(optimizers, kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 201, in advance result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position]) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 248, in _run_optimization self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 358, in _optimizer_step self.trainer._call_lightning_module_hook( File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1552, in 
_call_lightning_module_hook output = fn(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1666, in optimizer_step optimizer.step(closure=optimizer_closure) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 286, in optimizer_step optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 216, in optimizer_step return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step closure_result = closure() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 146, in __call__ self._result = self.closure(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 132, in closure step_output = self._step_fn() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 407, in _training_step training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values()) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1706, in _call_strategy_hook output = fn(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 349, in 
training_step return self.model(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward output = self._run_ddp_forward(*inputs, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index] File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 79, in forward output = self.module.training_step(*inputs, **kwargs) File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/trainer.py", line 191, in training_step # cv2.imwrite('/mckim/temp/temp3.png',tensor_to_numpy(batch['image'].cpu()[10])) # this is in rgb. so wrong color saved File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/trainer.py", line 156, in shared_step if self.hparams.losses.identity_consistency_loss_lambda > 0 or \ File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/losses/consistency_loss.py", line 76, in calc_identity_consistency_loss x0_pred = calculate_x0_from_eps(eps, noisy_images, timesteps, scheduler) File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/losses/consistency_loss.py", line 9, in calculate_x0_from_eps alpha_prod_t = scheduler.alphas_cumprod[timesteps.cpu()].clone() IndexError: index 1015283389 is out of bounds for dimension 0 with size 1000
I encountered a problem during training at Epoch 0 (71%|█████████▏ | 5415/7667 [53:48<22:22, 1.68it/s, loss=0.0182, v_num=vzmh, ema_decay=0.998, train/mse_loss=0.00714, train/id_loss=0.180, train/total_loss=0.0161]). The traceback is as follows:
Traceback (most recent call last): File "src/train.py", line 149, in main metric_dict, _ = train(cfg) File "src/train.py", line 92, in train trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path")) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit self._call_and_handle_interrupt( File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 662, in _call_and_handle_interrupt self.strategy.reconciliate_processes(traceback.format_exc()) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 451, in reconciliate_processes raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0 Traceback (most recent call last): File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 648, in _call_and_handle_interrupt return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch return function(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 737, in _fit_impl results = self._run(model, ckpt_path=self.ckpt_path) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1168, in _run results = self._run_stage() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1254, in _run_stage return self._run_train() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1285, in _run_train 
self.fit_loop.run() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 270, in advance self._outputs = self.epoch_loop.run(self._data_fetcher) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 203, in advance batch_output = self.batch_loop.run(kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 87, in advance outputs = self.optimizer_loop.run(optimizers, kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 201, in advance result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position]) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 248, in _run_optimization self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 358, in _optimizer_step self.trainer._call_lightning_module_hook( File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1552, in 
_call_lightning_module_hook output = fn(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1666, in optimizer_step optimizer.step(closure=optimizer_closure) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 286, in optimizer_step optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 216, in optimizer_step return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step closure_result = closure() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 146, in __call__ self._result = self.closure(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 132, in closure step_output = self._step_fn() File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 407, in _training_step training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values()) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1706, in _call_strategy_hook output = fn(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 349, in 
training_step return self.model(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward output = self._run_ddp_forward(*inputs, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index] File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 79, in forward output = self.module.training_step(*inputs, **kwargs) File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/trainer.py", line 191, in training_step # cv2.imwrite('/mckim/temp/temp3.png',tensor_to_numpy(batch['image'].cpu()[10])) # this is in rgb. so wrong color saved File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/trainer.py", line 156, in shared_step if self.hparams.losses.identity_consistency_loss_lambda > 0 or \ File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/losses/consistency_loss.py", line 76, in calc_identity_consistency_loss x0_pred = calculate_x0_from_eps(eps, noisy_images, timesteps, scheduler) File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/losses/consistency_loss.py", line 9, in calculate_x0_from_eps alpha_prod_t = scheduler.alphas_cumprod[timesteps.cpu()].clone() IndexError: index 1015283389 is out of bounds for dimension 0 with size 1000