-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_captions.py
More file actions
54 lines (40 loc) · 1.73 KB
/
split_captions.py
File metadata and controls
54 lines (40 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python3
"""
Script to split a captions text file into chunks (7 by default) and save each chunk to its own file.
"""
import os
import math
def split_captions(
    input_file: str = "diffusionDB_enhanced_150k.txt",
    num_chunks: int = 7,
    output_prefix: str = "src/examples/captions/diffusionDB_enhanced_150k_chunk",
) -> None:
    """
    Split a captions file into consecutive chunks, one output file per chunk.

    Blank lines in the input are dropped and surrounding whitespace is stripped
    from every caption. Each chunk holds ``ceil(total / num_chunks)`` captions,
    so the last chunk(s) may be shorter or empty. Exactly ``num_chunks`` files
    named ``{output_prefix}_{k}.txt`` (k starting at 1) are written.

    Problems with the arguments (missing input file, ``num_chunks`` below 1)
    are reported with a printed error message and an early return.

    Args:
        input_file: Path to input captions file
        num_chunks: Number of chunks to create (must be at least 1)
        output_prefix: Prefix for output files; its parent directory is
            created if it does not exist yet
    """
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found!")
        return

    # Reject a non-positive chunk count up front: zero would divide by zero
    # below, and a negative count would write nothing yet report success.
    if num_chunks < 1:
        print(f"Error: num_chunks must be at least 1, got {num_chunks}!")
        return

    # Read all captions, streaming the file and stripping each line only once.
    with open(input_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        captions = [caption for caption in stripped_lines if caption]

    total_captions = len(captions)
    print(f"Total captions: {total_captions}")

    # Calculate chunk size
    chunk_size = math.ceil(total_captions / num_chunks)
    print(f"Chunk size: {chunk_size}")

    # Make sure the destination directory exists before writing any chunk.
    output_dir = os.path.dirname(output_prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Split and save chunks
    for i in range(num_chunks):
        # Clamp both bounds so trailing chunks past the end are simply empty.
        start_idx = min(i * chunk_size, total_captions)
        end_idx = min(start_idx + chunk_size, total_captions)
        chunk_captions = captions[start_idx:end_idx]

        output_file = f"{output_prefix}_{i+1}.txt"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{caption}\n" for caption in chunk_captions)

        # The range counts non-blank captions (1-based), not raw input lines;
        # an empty chunk has no meaningful range to report.
        if chunk_captions:
            line_range = f"lines {start_idx+1}-{end_idx}"
        else:
            line_range = "no lines"
        print(f"Chunk {i+1}: {len(chunk_captions)} captions saved to '{output_file}' ({line_range})")

    print(f"\nSuccessfully split {total_captions} captions into {num_chunks} chunks!")
if __name__ == "__main__":
    # Script entry point: run the split with the default input file,
    # chunk count and output prefix declared in split_captions' signature.
    split_captions()