-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_validation_multi_gpu.sh
More file actions
executable file
·204 lines (181 loc) · 6.13 KB
/
run_validation_multi_gpu.sh
File metadata and controls
executable file
·204 lines (181 loc) · 6.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/bin/bash
# Multi-GPU Validation Script
# This script launches multiple instances of validate_on_dataset.py across different GPUs
# Built-in configuration. Every value below is a plain string that the
# argument parser may later overwrite with a user-supplied one.

# How many worker processes to start (GPU indices 0 .. TOTAL_GPUS-1).
TOTAL_GPUS="8"

# Run mode and offload switch forwarded to the validation script.
RUN_TYPE='lora'
OFFLOAD='True'

# LoRA weights file and the timestep value forwarded with it.
TIMESTEP_SKIPPING_LORA_PATH='/home/swhong/workspace/diffusion_inversion/train_result/flux_train_random_access_5_steps/checkpoint-3000/pytorch_lora_weights.safetensors'
TIMESTEP_TO_SKIP_TO='0.31'

# Input data directory and output results directory.
GENERATION_DATA_DIR='/data/inversion_data/diffusionDB_enhanced_360k_validation_data'
VALIDATED_RESULT_DIR='/home/swhong/workspace/diffusion_inversion/diffusionDB_enhanced_360k_validation_result'
# Parse command line arguments

#######################################
# Abort with an error when an option that needs a value is the last argument.
# Without this guard `shift 2` fails without shifting anything, and the
# parsing loop would never terminate.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
# Outputs:   error message on stderr
# Returns:   0 when a value is present; exits the script with 1 otherwise
#######################################
require_option_value() {
  if (( $2 < 2 )); then
    echo "Error: $1 requires a value" >&2
    echo "Use --help for usage information" >&2
    exit 1
  fi
}

#######################################
# Walk the given arguments and overwrite the matching configuration variables.
# Globals:   TOTAL_GPUS, TIMESTEP_SKIPPING_LORA_PATH, TIMESTEP_TO_SKIP_TO,
#            GENERATION_DATA_DIR, VALIDATED_RESULT_DIR, RUN_TYPE, OFFLOAD
#            (written)
# Arguments: the script's command line ("$@")
# Outputs:   help text on stdout for -h/--help; errors on stderr
# Returns:   0 on success; exits 0 after printing help, 1 on a bad option
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --total_gpus)
        require_option_value "$1" "$#"
        TOTAL_GPUS="$2"
        shift 2
        ;;
      --timestep_skipping_lora_path)
        require_option_value "$1" "$#"
        TIMESTEP_SKIPPING_LORA_PATH="$2"
        shift 2
        ;;
      --timestep_to_skip_to)
        require_option_value "$1" "$#"
        TIMESTEP_TO_SKIP_TO="$2"
        shift 2
        ;;
      --generation_data_dir)
        require_option_value "$1" "$#"
        GENERATION_DATA_DIR="$2"
        shift 2
        ;;
      --validated_result_dir)
        require_option_value "$1" "$#"
        VALIDATED_RESULT_DIR="$2"
        shift 2
        ;;
      --run_type)
        require_option_value "$1" "$#"
        RUN_TYPE="$2"
        shift 2
        ;;
      --offload)
        require_option_value "$1" "$#"
        OFFLOAD="$2"
        shift 2
        ;;
      -h|--help)
        echo "Usage: $0 [OPTIONS]"
        echo ""
        echo "Options:"
        echo " --total_gpus NUM Number of GPUs to use (default: 8)"
        echo " --timestep_skipping_lora_path PATH Path to LoRA weights"
        echo " --timestep_to_skip_to FLOAT Timestep to skip to (default: 0.31)"
        echo " --generation_data_dir PATH Input data directory"
        echo " --validated_result_dir PATH Output results directory"
        echo " --run_type TYPE Run type: lora, baseline, or straight_timestep_skipping (default: lora)"
        # The script's built-in OFFLOAD default is True, so the help says so.
        echo " --offload BOOL Whether to offload model (default: True)"
        echo " -h, --help Show this help message"
        echo ""
        echo "Example:"
        echo " $0 --total_gpus 4 --run_type baseline"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Validate arguments
# The regex admits digits only. The 10# prefix then forces base-10 evaluation,
# so a zero-padded value such as "08" is compared as 8 instead of raising an
# "invalid octal" arithmetic error that would let the bad value slip through.
if [[ ! "$TOTAL_GPUS" =~ ^[0-9]+$ ]] || (( 10#$TOTAL_GPUS < 1 )); then
  echo "Error: --total_gpus must be a positive integer" >&2
  exit 1
fi
# Store the canonical decimal form so later arithmetic on TOTAL_GPUS cannot
# reinterpret leading zeros as octal.
TOTAL_GPUS=$((10#$TOTAL_GPUS))

if [[ ! "$RUN_TYPE" =~ ^(lora|baseline|straight_timestep_skipping)$ ]]; then
  echo "Error: --run_type must be one of: lora, baseline, straight_timestep_skipping" >&2
  exit 1
fi

# Check if the validation script exists
# The path is relative, so it is resolved against the current working directory.
SCRIPT_PATH="src/validate_on_dataset.py"
if [[ ! -f "$SCRIPT_PATH" ]]; then
  echo "Error: Cannot find validation script at $SCRIPT_PATH" >&2
  exit 1
fi
# Create output directory if it doesn't exist
# Stop here when it cannot be created, before any worker process is started.
# `--` keeps a directory name that begins with "-" from being read as an option.
if ! mkdir -p -- "$VALIDATED_RESULT_DIR"; then
  echo "Error: Cannot create output directory $VALIDATED_RESULT_DIR" >&2
  exit 1
fi
# Print the effective configuration, followed by one blank line, so the
# settings in use are visible before any worker is started.
cat <<EOF
========================================
Multi-GPU Validation Script
========================================
Total GPUs: $TOTAL_GPUS
Run Type: $RUN_TYPE
LoRA Path: $TIMESTEP_SKIPPING_LORA_PATH
Timestep to Skip: $TIMESTEP_TO_SKIP_TO
Input Directory: $GENERATION_DATA_DIR
Output Directory: $VALIDATED_RESULT_DIR
Offload: $OFFLOAD
========================================

EOF
# Array to store process IDs
pids=()

#######################################
# Start one validation worker in the background.
# Every value is passed quoted, so an empty or space-containing setting still
# reaches the Python script as exactly one argument.
# Globals:   SCRIPT_PATH, TOTAL_GPUS, TIMESTEP_SKIPPING_LORA_PATH,
#            TIMESTEP_TO_SKIP_TO, GENERATION_DATA_DIR, VALIDATED_RESULT_DIR,
#            RUN_TYPE, OFFLOAD (read)
# Arguments: $1 - GPU index handled by this worker
# Outputs:   the worker's stdout and stderr go to gpu_<index>_validation.log
#            in the current directory
# Returns:   0; the worker's PID is available to the caller as $!
#######################################
launch_validation_worker() {
  local gpu_index=$1
  python "$SCRIPT_PATH" \
    --gpu_index "$gpu_index" \
    --total_gpus "$TOTAL_GPUS" \
    --timestep_skipping_lora_path "$TIMESTEP_SKIPPING_LORA_PATH" \
    --timestep_to_skip_to "$TIMESTEP_TO_SKIP_TO" \
    --generation_data_dir "$GENERATION_DATA_DIR" \
    --validated_result_dir "$VALIDATED_RESULT_DIR" \
    --run_type "$RUN_TYPE" \
    --offload "$OFFLOAD" \
    > "gpu_${gpu_index}_validation.log" 2>&1 &
}

# Launch validation processes for each GPU
for ((gpu_index = 0; gpu_index < TOTAL_GPUS; gpu_index++)); do
  echo "Launching validation on GPU $gpu_index..."
  # Set CUDA_VISIBLE_DEVICES to restrict each process to its assigned GPU
  # (left disabled; each worker is told its GPU through --gpu_index instead)
  # export CUDA_VISIBLE_DEVICES=$gpu_index
  # Launch the validation script in the background
  launch_validation_worker "$gpu_index"
  # Store the process ID
  pids+=("$!")
  # Small delay to avoid overwhelming the system
  sleep 2
done
# Summarise the launch: worker count, their PIDs (space-joined), where the
# logs go, and a notice that monitoring starts next. Each argument is printed
# on its own line; the empty ones produce the blank separator lines.
printf '%s\n' \
  "" \
  "All $TOTAL_GPUS validation processes launched!" \
  "Process IDs: ${pids[*]}" \
  "Logs are being written to gpu_X_validation.log files" \
  "" \
  "Monitoring progress..."
# Function to check if all processes are still running
#######################################
# Globals:   pids (read) - PIDs of the launched worker processes
# Arguments: none
# Returns:   0 when every PID in pids still accepts signal 0 (also when pids
#            is empty); 1 as soon as one PID no longer does
#######################################
all_running() {
  # Keep the loop variable local so a caller's own `pid` is not overwritten.
  local pid
  for pid in "${pids[@]}"; do
    # kill -0 sends no signal; it only tests whether the process can be signalled.
    kill -0 "$pid" 2>/dev/null || return 1
  done
  return 0
}
# Monitor processes

#######################################
# Count how many launched workers are still alive.
# Globals:   pids (read)
# Arguments: none
# Outputs:   the number of PIDs in pids that still accept signal 0, on stdout
# Returns:   0
#######################################
count_running_workers() {
  local pid running=0
  for pid in "${pids[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      running=$((running + 1))
    fi
  done
  printf '%s\n' "$running"
}

# Keep reporting until the LAST worker has exited. Stopping at the first exit
# would end the progress display while the other workers are still running.
start_time=$(date +%s)
while running=$(count_running_workers) && (( running > 0 )); do
  current_time=$(date +%s)
  elapsed=$((current_time - start_time))
  hours=$((elapsed / 3600))
  minutes=$(((elapsed % 3600) / 60))
  seconds=$((elapsed % 60))
  # \r rewrites the same terminal line on every refresh.
  echo -ne "\rElapsed time: ${hours}h ${minutes}m ${seconds}s - ${running}/${#pids[@]} processes running..."
  sleep 10
done
echo ""
echo ""
echo "========================================"
echo "Validation Complete!"
echo "========================================"

#######################################
# Wait for every launched worker and print one status line per worker.
# Globals:   pids (read) - worker PIDs; the array index is used as GPU number
# Arguments: none
# Outputs:   "GPU <n>: SUCCESS" or "GPU <n>: FAILED (exit code: <rc>)" on stdout
# Returns:   0 when every worker exited with 0, 1 otherwise
#######################################
report_worker_results() {
  local i exit_code any_failed=0
  for i in "${!pids[@]}"; do
    # wait blocks until the worker has finished and yields its exit status.
    wait "${pids[$i]}"
    exit_code=$?
    if [[ $exit_code -eq 0 ]]; then
      echo "GPU $i: SUCCESS"
    else
      echo "GPU $i: FAILED (exit code: $exit_code)"
      any_failed=1
    fi
  done
  return "$any_failed"
}

# Check exit status of each process
all_success=true
report_worker_results || all_success=false

echo ""
if $all_success; then
  echo "🎉 All validation processes completed successfully!"
  echo "Results saved to: $VALIDATED_RESULT_DIR"
else
  echo "⚠️ Some validation processes failed. Check the log files for details."
fi

echo ""
echo "Log files:"
for ((gpu_index = 0; gpu_index < TOTAL_GPUS; gpu_index++)); do
  if [[ -f "gpu_${gpu_index}_validation.log" ]]; then
    echo " GPU $gpu_index: gpu_${gpu_index}_validation.log"
  fi
done

echo ""
echo "To view a specific GPU's log:"
echo " tail -f gpu_X_validation.log"
echo ""
echo "To clean up log files:"
echo " rm gpu_*_validation.log"

# Report failure through the exit status as well, so calling scripts and
# schedulers can detect it without parsing the text above.
if ! $all_success; then
  exit 1
fi