Skip to content

IndexError: index 1015283389 is out of bounds for dimension 0 with size 1000 #19

@Tokymin

Description

@Tokymin

I encountered a problem during training at Epoch 0 (progress: 71%|█████████▏ | 5415/7667 [53:48<22:22, 1.68it/s, loss=0.0182, v_num=vzmh, ema_decay=0.998, train/mse_loss=0.00714, train/id_loss=0.180, train/total_loss=0.0161]). The traceback is as follows:

Traceback (most recent call last):
  File "src/train.py", line 149, in main
    metric_dict, _ = train(cfg)
  File "src/train.py", line 92, in train
    trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit
    self._call_and_handle_interrupt(
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 662, in _call_and_handle_interrupt
    self.strategy.reconciliate_processes(traceback.format_exc())
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 451, in reconciliate_processes
    raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0
Traceback (most recent call last):
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 648, in _call_and_handle_interrupt
    return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
    return function(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 737, in _fit_impl
    results = self._run(model, ckpt_path=self.ckpt_path)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1168, in _run
    results = self._run_stage()
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1254, in _run_stage
    return self._run_train()
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1285, in _run_train
    self.fit_loop.run()
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 270, in advance
    self._outputs = self.epoch_loop.run(self._data_fetcher)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 203, in advance
    batch_output = self.batch_loop.run(kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 87, in advance
    outputs = self.optimizer_loop.run(optimizers, kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
    self.advance(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 201, in advance
    result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 248, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 358, in _optimizer_step
    self.trainer._call_lightning_module_hook(
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1552, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1666, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 286, in optimizer_step
    optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 216, in optimizer_step
    return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step
    closure_result = closure()
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 146, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 132, in closure
    step_output = self._step_fn()
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 407, in _training_step
    training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1706, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 349, in training_step
    return self.model(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
    output = self._run_ddp_forward(*inputs, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
    return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/toky/anaconda3/envs/face_ano/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 79, in forward
    output = self.module.training_step(*inputs, **kwargs)
  File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/trainer.py", line 191, in training_step
    # cv2.imwrite('/mckim/temp/temp3.png',tensor_to_numpy(batch['image'].cpu()[10])) # this is in rgb. so wrong color saved
  File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/trainer.py", line 156, in shared_step
    if self.hparams.losses.identity_consistency_loss_lambda > 0 or \
  File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/losses/consistency_loss.py", line 76, in calc_identity_consistency_loss
    x0_pred = calculate_x0_from_eps(eps, noisy_images, timesteps, scheduler)
  File "/mnt/disk1/toky/Project/Face_Anonymity/dcface/dcface/src/losses/consistency_loss.py", line 9, in calculate_x0_from_eps
    alpha_prod_t = scheduler.alphas_cumprod[timesteps.cpu()].clone()
IndexError: index 1015283389 is out of bounds for dimension 0 with size 1000

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions