Skip to content

Add an optional argument for recursive file listing in computer vision models #92

@koito19960406

Description

@koito19960406

The data folders are most likely structured as many subfolders, each containing 1000 images. Processing one subfolder at a time therefore produces image lists too short to fully use the GPU when the model is small. For that reason, the models should accept an argument that enables recursive listing of files in subfolders, so that larger lists of images can be built. Below is pseudocode for reference:

import multiprocessing
import os
import time
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from itertools import chain
from pathlib import Path
from typing import List, Union

class FastImageFinder:
    """
    Collect image file paths from a directory tree.

    The root directory's own files are scanned in the calling process; when a
    recursive search is requested, each first-level subdirectory is scanned
    recursively, in worker processes if there is more than one chunk of
    subdirectories to handle.

    Attributes:
        image_extensions: File extensions (with leading dot) treated as
            images. Matching is case-insensitive.
    """

    def __init__(self):
        self.image_extensions = {
            '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff',
            '.JPG', '.JPEG', '.PNG', '.GIF', '.BMP', '.TIFF'
        }

    @staticmethod
    def _scan_directory(directory, extensions, recursive=True):
        """
        Return the paths of image files found in ``directory``.

        Args:
            directory: Directory to scan.
            extensions: Lower-cased extensions (with leading dot) to accept.
            recursive: Whether to descend into subdirectories.

        Returns:
            List of matching file paths, in no particular order. A directory
            that cannot be read is reported on stdout and skipped.
        """
        results = []
        # An explicit stack instead of recursion: very deep trees cannot hit
        # the interpreter's recursion limit.
        pending = [directory]
        while pending:
            current = pending.pop()
            try:
                with os.scandir(current) as scanner:
                    for entry in scanner:
                        if entry.is_file():
                            if os.path.splitext(entry.name)[1].lower() in extensions:
                                results.append(entry.path)
                        elif recursive and entry.is_dir():
                            pending.append(entry.path)
            except PermissionError:
                print(f"Permission denied for directory: {current}")
            except OSError as e:
                print(f"Error scanning directory {current}: {e}")
        return results

    def _process_directory_chunk(self, directories, recursive=True):
        """
        Scan every directory in ``directories`` and return the combined paths.

        Args:
            directories: Iterable of directory paths handled as one task.
            recursive: Whether each directory is scanned recursively.

        Returns:
            List of image file paths found across all given directories.
        """
        lower_extensions = {ext.lower() for ext in self.image_extensions}
        all_paths = []
        for directory in directories:
            all_paths.extend(
                self._scan_directory(directory, lower_extensions, recursive))
        return all_paths

    def find_images(self, root_dir: Union[str, Path], recursive=True, chunk_size=100) -> List[str]:
        """
        Find all images in the given directory and, optionally, its subdirectories.

        Args:
            root_dir: Directory to search, or a single file to test.
            recursive: Whether to search in subdirectories.
            chunk_size: Number of first-level subdirectories handed to a
                worker process per task.

        Returns:
            Sorted list of image file paths, each path listed once. An empty
            list is returned when the root directory cannot be read.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1 (only checked
                when ``root_dir`` is not a single file).
        """
        root_dir = Path(root_dir)
        lower_extensions = {ext.lower() for ext in self.image_extensions}
        if root_dir.is_file():
            return [str(root_dir)] if root_dir.suffix.lower() in lower_extensions else []

        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        start_time = time.perf_counter()
        print(f"Starting image collection from {root_dir}")

        # One pass over the root: keep its own images and remember the
        # first-level subdirectories. The root is deliberately NOT scanned
        # recursively as well, otherwise every image below it would be
        # reported twice (once via the root, once via its subdirectory).
        all_paths = []
        directories = []
        try:
            with os.scandir(str(root_dir)) as scanner:
                for entry in scanner:
                    if entry.is_file():
                        if os.path.splitext(entry.name)[1].lower() in lower_extensions:
                            all_paths.append(entry.path)
                    elif recursive and entry.is_dir():
                        directories.append(entry.path)
        except OSError as e:
            print(f"Error accessing root directory: {e}")
            return []

        # Split subdirectories into chunks for parallel processing
        num_cores = multiprocessing.cpu_count()
        chunks = [directories[i:i + chunk_size]
                  for i in range(0, len(directories), chunk_size)]

        # The reported directory count includes the root itself.
        print(f"Found {len(directories) + 1} directories, using {num_cores} CPU cores")
        print(f"Processing in {len(chunks)} chunks")

        if len(chunks) == 1:
            # A single chunk gains nothing from a process pool; avoid the
            # start-up cost and scan it in this process.
            all_paths.extend(
                self._process_directory_chunk(chunks[0], recursive=recursive))
        elif chunks:
            max_workers = min(num_cores, len(chunks))
            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                process_chunk = partial(self._process_directory_chunk,
                                        recursive=recursive)
                for i, chunk_paths in enumerate(executor.map(process_chunk, chunks)):
                    all_paths.extend(chunk_paths)
                    if (i + 1) % 10 == 0:  # Progress update every 10 chunks
                        print(f"Processed {i + 1}/{len(chunks)} chunks, "
                              f"found {len(all_paths)} images so far...")

        elapsed = time.perf_counter() - start_time
        print(f"Found {len(all_paths)} images in {elapsed:.2f} seconds")

        return sorted(all_paths)

def get_image_paths_fast(directory: Union[str, Path], recursive=False) -> List[str]:
    """
    Convenience entry point: list image paths via a fresh FastImageFinder.

    Args:
        directory: Directory (or single file) handed to ``find_images``.
        recursive: Forwarded to ``find_images``; defaults to a
            non-recursive search here.

    Returns:
        Whatever ``FastImageFinder.find_images`` returns for these arguments.
    """
    return FastImageFinder().find_images(directory, recursive=recursive)

# Modified ClassifierPerceptionViT class with the new fast image collection
class ClassifierPerceptionViT(BaseClassifier):
    # ... [previous methods remain the same] ...

    def classify(
        self,
        dir_input: Union[str, Path],
        dir_summary_output: Union[str, Path],
        batch_size=32,
        save_format="json csv",
        recursive=False,
        chunk_size=5000
    ) -> List[str]:
        """
        [previous docstring remains the same]

        Notes on the part of the method shown in this sketch:

        Args:
            dir_input: Passed to ``get_image_paths_fast`` to collect the
                image paths.
            dir_summary_output: When truthy, this directory (including any
                missing parents) is created before the images are collected.
            recursive: Forwarded to ``get_image_paths_fast``; also controls
                the "(recursive search)" suffix of the status message.
            chunk_size: Not referenced in the portion shown here; presumably
                consumed by the elided remainder of the method (TODO confirm).

        Returns:
            An empty list when no images are found. The return value of the
            elided remainder is not visible in this sketch.
        """
        # Only create the summary directory when an output path was given.
        if dir_summary_output:
            Path(dir_summary_output).mkdir(parents=True, exist_ok=True)

        # Use the new fast image collection method
        img_paths = get_image_paths_fast(dir_input, recursive=recursive)
        total_images = len(img_paths)
        
        # Nothing to classify: report it and stop before any further work.
        if total_images == 0:
            print(f"No images found in {dir_input}")
            return []

        print(f"Found {total_images} images {'(recursive search)' if recursive else ''}")

        # [rest of the method remains the same]

Metadata

Metadata

Assignees

Labels

enhancement: New feature or request

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions