The data folders are most likely structured as many subfolders, each with 1000 images. As a result, when a model is small, a single folder does not supply enough images for the computer vision model to fully use the GPU. The models should therefore accept an argument that enables recursive listing of files in subfolders, so that larger lists of images can be built. Below is pseudocode for reference:
import multiprocessing
import os
import time
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from itertools import chain
from pathlib import Path
from typing import List, Union
class FastImageFinder:
    """
    Collect image file paths from large directory trees.

    Directories are listed with ``os.scandir``. In recursive mode the
    immediate subdirectories of the root are split into chunks and scanned in
    worker processes. Files are matched by extension only, case-insensitively.
    Directory symlinks are followed, because ``os.DirEntry.is_dir()`` follows
    symlinks by default.
    """

    def __init__(self):
        # Both cases are listed for readability; matching always lower-cases
        # the file suffix and compares against a lower-cased copy of this set.
        self.image_extensions = {
            '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff',
            '.JPG', '.JPEG', '.PNG', '.GIF', '.BMP', '.TIFF'
        }

    def _lower_extensions(self):
        """Return the configured extensions as a lower-cased set."""
        return {ext.lower() for ext in self.image_extensions}

    @staticmethod
    def _scan_directory(directory, extensions, recursive=True):
        """
        Scan a single directory for image files using os.scandir.

        Args:
            directory: Directory to list.
            extensions: Lower-cased extensions (with leading dot) to accept.
            recursive: Whether to descend into subdirectories.
        Returns:
            List of matching file paths. If listing fails part-way, the error
            is printed and the paths collected so far are returned.
        """
        results = []
        try:
            with os.scandir(directory) as scanner:
                for entry in scanner:
                    if entry.is_file():
                        suffix = os.path.splitext(entry.name)[1].lower()
                        if suffix in extensions:
                            results.append(entry.path)
                    elif recursive and entry.is_dir():
                        results.extend(FastImageFinder._scan_directory(
                            entry.path, extensions, recursive))
        except PermissionError:
            print(f"Permission denied for directory: {directory}")
        except Exception as e:
            # Best effort: one unreadable directory must not abort the scan.
            print(f"Error scanning directory {directory}: {e}")
        return results

    def _process_directory_chunk(self, directories, recursive=True):
        """
        Process a chunk of directories in a single process.

        Args:
            directories: Iterable of directory paths to scan.
            recursive: Whether each directory is scanned recursively.
        Returns:
            List of image paths found in all directories of the chunk.
        """
        lower_extensions = self._lower_extensions()
        all_paths = []
        for directory in directories:
            all_paths.extend(
                self._scan_directory(directory, lower_extensions, recursive))
        return all_paths

    def find_images(self, root_dir: Union[str, Path], recursive=True, chunk_size=100) -> List[str]:
        """
        Find all images in the given directory, optionally including subdirectories.

        Args:
            root_dir: Root directory to start the search. If it is a file, it
                is returned as the single result when its extension matches.
            recursive: Whether to search in subdirectories
            chunk_size: Number of top-level subdirectories handed to a worker
                process per task (must be >= 1)
        Returns:
            Sorted list of image file paths, each path appearing once
        Raises:
            ValueError: If chunk_size is smaller than 1.
        """
        root_dir = Path(root_dir)
        lower_extensions = self._lower_extensions()
        if root_dir.is_file():
            return [str(root_dir)] if root_dir.suffix.lower() in lower_extensions else []
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

        start_time = time.time()
        print(f"Starting image collection from {root_dir}")

        if not recursive:
            # A single directory listing gains nothing from a process pool.
            all_paths = self._scan_directory(
                str(root_dir), lower_extensions, recursive=False)
        else:
            # List the root exactly once: its own images are collected here
            # and its subdirectories are queued for the workers. Queuing the
            # root itself for a recursive scan would revisit every
            # subdirectory and report each image twice.
            all_paths = []
            directories = []
            try:
                with os.scandir(root_dir) as scanner:
                    for entry in scanner:
                        if entry.is_dir():
                            directories.append(entry.path)
                        elif entry.is_file():
                            suffix = os.path.splitext(entry.name)[1].lower()
                            if suffix in lower_extensions:
                                all_paths.append(entry.path)
            except Exception as e:
                print(f"Error accessing root directory: {e}")
                return []

            # Split directories into chunks for parallel processing
            chunks = [directories[i:i + chunk_size]
                      for i in range(0, len(directories), chunk_size)]
            num_cores = multiprocessing.cpu_count()
            print(f"Found {len(directories)} directories, using {num_cores} CPU cores")
            print(f"Processing in {len(chunks)} chunks")

            if len(chunks) <= 1:
                # Zero or one chunk: scan in-process and skip pool start-up.
                for chunk in chunks:
                    all_paths.extend(
                        self._process_directory_chunk(chunk, recursive=True))
            else:
                # Never start more workers than there are chunks to process.
                workers = min(num_cores, len(chunks))
                with ProcessPoolExecutor(max_workers=workers) as executor:
                    process_chunk = partial(self._process_directory_chunk,
                                            recursive=True)
                    for done, chunk_paths in enumerate(
                            executor.map(process_chunk, chunks), start=1):
                        all_paths.extend(chunk_paths)
                        if done % 10 == 0:  # Progress update every 10 chunks
                            print(f"Processed {done}/{len(chunks)} chunks, "
                                  f"found {len(all_paths)} images so far...")

        end_time = time.time()
        print(f"Found {len(all_paths)} images in {end_time - start_time:.2f} seconds")
        return sorted(all_paths)
def get_image_paths_fast(directory: Union[str, Path], recursive=False) -> List[str]:
    """
    Convenience entry point around FastImageFinder.

    Builds a fresh FastImageFinder and returns the result of its
    ``find_images`` call for ``directory``.

    Args:
        directory: File or directory to search.
        recursive: Passed through to ``find_images``; defaults to False here.
    Returns:
        The list of image paths produced by ``find_images``.
    """
    return FastImageFinder().find_images(directory, recursive=recursive)
# Modified ClassifierPerceptionViT class with the new fast image collection
class ClassifierPerceptionViT(BaseClassifier):
    # ... [previous methods remain the same] ...
    def classify(
        self,
        dir_input: Union[str, Path],
        dir_summary_output: Union[str, Path],
        batch_size=32,
        save_format="json csv",
        recursive=False,
        chunk_size=5000
    ) -> List[str]:
        """
        [previous docstring remains the same]

        Arguments added in this sketch:
            recursive: Forwarded to get_image_paths_fast to control whether
                images in subfolders of dir_input are included.
            chunk_size: Not referenced in the part of the method shown here;
                presumably consumed by the elided remainder -- TODO confirm.
        """
        # Create the summary directory (including parents) before any work;
        # skipped entirely when dir_summary_output is falsy.
        if dir_summary_output:
            Path(dir_summary_output).mkdir(parents=True, exist_ok=True)
        # Use the new fast image collection method
        img_paths = get_image_paths_fast(dir_input, recursive=recursive)
        total_images = len(img_paths)
        # Nothing to classify: report it and return an empty result early.
        if total_images == 0:
            print(f"No images found in {dir_input}")
            return []
        print(f"Found {total_images} images {'(recursive search)' if recursive else ''}")
        # [rest of the method remains the same]
The data folders are most likely structured as many subfolders, each with 1000 images. As a result, when a model is small, a single folder does not supply enough images for the computer vision model to fully use the GPU. The models should therefore accept an argument that enables recursive listing of files in subfolders, so that larger lists of images can be built. Below is pseudocode for reference: