-
Notifications
You must be signed in to change notification settings - Fork 146
Open
Description
I was plotting some rollout visualizations on the viscoelastic_instability dataset and noticed that there is a repeated frame in the sequence, which leads to an offset between the rollout prediction and the ground truth. See the following figure, where the duplication happens at timesteps 7 and 8.
Based on this, I checked the whole dataset and found a total of 128 cases across the 257 trajectories contained in the viscoelastic dataset. I've plotted a number of frames around these instances, and I think the duplicate frames can simply be removed. Still, this seems to be a rather critical issue given its impact on the rollout evaluation, and also on training.
JSON of duplicates
{
"data_dir": "viscoelastic_instability/data",
"files_checked": 21,
"trajectories_checked": 257,
"total_duplicates": 128,
"duplicates": [
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_AH.hdf5",
"trajectory": 9,
"frame_idx": 0,
"n_timesteps": 20
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 0,
"frame_idx": 28,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 0,
"frame_idx": 54,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 1,
"frame_idx": 20,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 1,
"frame_idx": 42,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 2,
"frame_idx": 15,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 2,
"frame_idx": 51,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 3,
"frame_idx": 5,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 3,
"frame_idx": 39,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 4,
"frame_idx": 17,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 5,
"frame_idx": 17,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 5,
"frame_idx": 33,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 6,
"frame_idx": 28,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 7,
"frame_idx": 11,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 7,
"frame_idx": 35,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 8,
"frame_idx": 2,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 8,
"frame_idx": 31,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 9,
"frame_idx": 24,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 9,
"frame_idx": 25,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 10,
"frame_idx": 4,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 10,
"frame_idx": 30,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 10,
"frame_idx": 58,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 11,
"frame_idx": 23,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 11,
"frame_idx": 51,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 12,
"frame_idx": 4,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 12,
"frame_idx": 47,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 13,
"frame_idx": 16,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 13,
"frame_idx": 40,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 14,
"frame_idx": 10,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 15,
"frame_idx": 10,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 15,
"frame_idx": 56,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 16,
"frame_idx": 57,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 17,
"frame_idx": 26,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 18,
"frame_idx": 14,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 19,
"frame_idx": 50,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 21,
"frame_idx": 47,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 22,
"frame_idx": 1,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 23,
"frame_idx": 22,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 24,
"frame_idx": 19,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 24,
"frame_idx": 54,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 25,
"frame_idx": 14,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 25,
"frame_idx": 43,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 26,
"frame_idx": 20,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 27,
"frame_idx": 29,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 28,
"frame_idx": 11,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 29,
"frame_idx": 5,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 30,
"frame_idx": 0,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 30,
"frame_idx": 41,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 31,
"frame_idx": 25,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 31,
"frame_idx": 46,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_CAR.hdf5",
"trajectory": 32,
"frame_idx": 23,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 1,
"frame_idx": 12,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 2,
"frame_idx": 33,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 3,
"frame_idx": 5,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 3,
"frame_idx": 45,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 4,
"frame_idx": 20,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 5,
"frame_idx": 35,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 5,
"frame_idx": 55,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 6,
"frame_idx": 7,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 6,
"frame_idx": 18,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 6,
"frame_idx": 27,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 6,
"frame_idx": 38,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 6,
"frame_idx": 49,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 6,
"frame_idx": 58,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 7,
"frame_idx": 11,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 7,
"frame_idx": 21,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 7,
"frame_idx": 32,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 7,
"frame_idx": 38,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 7,
"frame_idx": 50,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 8,
"frame_idx": 4,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 8,
"frame_idx": 15,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 8,
"frame_idx": 34,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 8,
"frame_idx": 53,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 9,
"frame_idx": 23,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 9,
"frame_idx": 41,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 10,
"frame_idx": 0,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 10,
"frame_idx": 18,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 10,
"frame_idx": 47,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 11,
"frame_idx": 8,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 11,
"frame_idx": 38,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 11,
"frame_idx": 57,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 12,
"frame_idx": 19,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 12,
"frame_idx": 35,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 13,
"frame_idx": 21,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 13,
"frame_idx": 32,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 14,
"frame_idx": 55,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 15,
"frame_idx": 16,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 16,
"frame_idx": 28,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 18,
"frame_idx": 53,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 19,
"frame_idx": 52,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 20,
"frame_idx": 52,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 21,
"frame_idx": 52,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 22,
"frame_idx": 55,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 23,
"frame_idx": 37,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 24,
"frame_idx": 34,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 25,
"frame_idx": 19,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 25,
"frame_idx": 55,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 26,
"frame_idx": 33,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 27,
"frame_idx": 11,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_EIT.hdf5",
"trajectory": 27,
"frame_idx": 38,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_transtochaos_EIT_L.hdf5",
"trajectory": 20,
"frame_idx": 12,
"n_timesteps": 20
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_transtochaos_EIT_L.hdf5",
"trajectory": 26,
"frame_idx": 14,
"n_timesteps": 20
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_transtochaos_EIT_SAR.hdf5",
"trajectory": 26,
"frame_idx": 17,
"n_timesteps": 20
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_transtochaos_EIT_SAR.hdf5",
"trajectory": 28,
"frame_idx": 9,
"n_timesteps": 20
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_transtononchaos_EIT_L.hdf5",
"trajectory": 25,
"frame_idx": 7,
"n_timesteps": 20
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_transtononchaos_EIT_SAR.hdf5",
"trajectory": 0,
"frame_idx": 8,
"n_timesteps": 20
},
{
"file": "viscoelastic_instability/data/train/viscoelastic_instability_transtononchaos_EIT_SAR.hdf5",
"trajectory": 24,
"frame_idx": 3,
"n_timesteps": 20
},
{
"file": "viscoelastic_instability/data/valid/viscoelastic_instability_CAR.hdf5",
"trajectory": 0,
"frame_idx": 20,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/valid/viscoelastic_instability_CAR.hdf5",
"trajectory": 1,
"frame_idx": 50,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/valid/viscoelastic_instability_CAR.hdf5",
"trajectory": 2,
"frame_idx": 44,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/valid/viscoelastic_instability_EIT.hdf5",
"trajectory": 0,
"frame_idx": 50,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/valid/viscoelastic_instability_EIT.hdf5",
"trajectory": 1,
"frame_idx": 12,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/valid/viscoelastic_instability_EIT.hdf5",
"trajectory": 1,
"frame_idx": 39,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/valid/viscoelastic_instability_EIT.hdf5",
"trajectory": 1,
"frame_idx": 57,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_CAR.hdf5",
"trajectory": 0,
"frame_idx": 12,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_CAR.hdf5",
"trajectory": 0,
"frame_idx": 50,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_CAR.hdf5",
"trajectory": 1,
"frame_idx": 18,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_CAR.hdf5",
"trajectory": 1,
"frame_idx": 36,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_CAR.hdf5",
"trajectory": 2,
"frame_idx": 26,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_EIT.hdf5",
"trajectory": 0,
"frame_idx": 6,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_EIT.hdf5",
"trajectory": 0,
"frame_idx": 18,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_EIT.hdf5",
"trajectory": 0,
"frame_idx": 39,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_EIT.hdf5",
"trajectory": 0,
"frame_idx": 40,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_EIT.hdf5",
"trajectory": 0,
"frame_idx": 56,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_EIT.hdf5",
"trajectory": 1,
"frame_idx": 34,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_EIT.hdf5",
"trajectory": 2,
"frame_idx": 23,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_EIT.hdf5",
"trajectory": 2,
"frame_idx": 57,
"n_timesteps": 60
},
{
"file": "viscoelastic_instability/data/test/viscoelastic_instability_transtochaos_EIT_SAR.hdf5",
"trajectory": 1,
"frame_idx": 4,
"n_timesteps": 20
}
]
}
Duplication finding script
(the script was written by Claude Code)
#!/usr/bin/env python
"""Scan viscoelastic_instability dataset for duplicate consecutive frames."""
import argparse
import json
import os
import h5py
import numpy as np
def check_file_for_duplicates(filepath: str) -> list[dict]:
    """Find exactly repeated consecutive frames in one HDF5 file.

    Only the ``t0_fields/pressure`` dataset is inspected. Its first two axes
    are treated as (trajectory, timestep); all remaining axes together form
    one frame. Frame ``t`` is reported when it is element-wise equal to frame
    ``t + 1`` of the same trajectory.

    Args:
        filepath: Path of the HDF5 file to scan.

    Returns:
        One dict per repeated pair, ordered by trajectory and then by frame,
        with keys ``file``, ``trajectory``, ``frame_idx`` (index of the first
        frame of the pair) and ``n_timesteps``.
    """
    duplicates = []
    with h5py.File(filepath, "r") as f:
        # Keep a handle on the on-disk dataset and read one trajectory at a
        # time, so peak memory is a single trajectory rather than the whole
        # dataset.
        pressure = f["t0_fields/pressure"]
        n_trajectories, n_timesteps = pressure.shape[:2]

        for traj_idx in range(n_trajectories):
            traj_data = pressure[traj_idx]  # (n_timesteps, *frame_shape)
            frame_axes = tuple(range(1, traj_data.ndim))

            # Compare frames directly instead of testing a difference against
            # zero: equality also catches identical frames holding +/-inf
            # (whose difference is NaN) and cannot wrap for unsigned data.
            # Frames containing NaN never compare equal, so they are not
            # reported.
            same_as_next = np.all(
                traj_data[1:] == traj_data[:-1], axis=frame_axes
            )

            for t in np.flatnonzero(same_as_next):
                duplicates.append(
                    {
                        "file": filepath,
                        "trajectory": int(traj_idx),
                        "frame_idx": int(t),
                        "n_timesteps": int(n_timesteps),
                    }
                )
    return duplicates
def main():
    """Scan the requested dataset splits and report duplicate frames.

    Command-line options:
        --data-dir: base directory containing one sub-directory per split.
        --output:   path of the JSON report to write.
        --splits:   names of the split sub-directories to scan.

    Every ``.hdf5`` file found directly inside each split directory is passed
    to ``check_file_for_duplicates``. The combined result is written as JSON
    to ``--output`` and a per-file summary is printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description="Scan viscoelastic_instability dataset for duplicate frames"
    )
    parser.add_argument(
        "--data-dir",
        default="viscoelastic_instability/data",
        help="Base data directory",
    )
    parser.add_argument(
        "--output",
        default="duplicates.json",
        help="Output JSON file path",
    )
    parser.add_argument(
        "--splits",
        nargs="+",
        default=["train", "valid", "test"],
        help="Splits to check",
    )
    args = parser.parse_args()

    all_duplicates = []
    files_checked = 0
    trajectories_checked = 0

    for split in args.splits:
        split_path = os.path.join(args.data_dir, split)
        if not os.path.exists(split_path):
            print(f"Warning: {split_path} does not exist, skipping")
            continue

        # Sorted so the scan order, and therefore the report, is reproducible.
        hdf5_files = sorted(
            name for name in os.listdir(split_path) if name.endswith(".hdf5")
        )
        for fname in hdf5_files:
            filepath = os.path.join(split_path, fname)
            print(f"Checking {split}/{fname}...", end=" ", flush=True)

            duplicates = check_file_for_duplicates(filepath)
            all_duplicates.extend(duplicates)
            files_checked += 1

            # Count trajectories: prefer the file-level attribute and fall
            # back to the leading dimension of the pressure dataset.
            with h5py.File(filepath, "r") as f:
                n_traj = int(
                    f.attrs.get("n_trajectories", f["t0_fields/pressure"].shape[0])
                )
            trajectories_checked += n_traj

            if duplicates:
                print(f"FOUND {len(duplicates)} duplicates!")
            else:
                print("OK")

    # Write results
    result = {
        "data_dir": args.data_dir,
        "files_checked": files_checked,
        "trajectories_checked": trajectories_checked,
        "total_duplicates": len(all_duplicates),
        "duplicates": all_duplicates,
    }
    with open(args.output, "w") as f:
        json.dump(result, f, indent=2)

    print()
    print("=" * 50)
    print(f"Files checked: {files_checked}")
    print(f"Trajectories checked: {trajectories_checked}")
    print(f"Total duplicates found: {len(all_duplicates)}")
    print(f"Results saved to: {args.output}")

    if all_duplicates:
        print()
        print("Duplicate summary:")
        # Group by path relative to the data directory rather than by bare
        # file name: the same file name occurs in several splits, and keying
        # on the base name alone would merge their entries.
        by_file = {}
        for dup in all_duplicates:
            rel_name = os.path.relpath(dup["file"], args.data_dir)
            by_file.setdefault(rel_name, []).append(dup)

        for fname, dups in sorted(by_file.items()):
            print(f" {fname}: {len(dups)} duplicates")
            for d in dups[:5]:  # Show first 5
                print(f" - trajectory {d['trajectory']}, frame {d['frame_idx']}")
            if len(dups) > 5:
                print(f" ... and {len(dups) - 5} more")
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Metadata
Assignees
Labels
No labels