From 67523d6fe482fd7afb0b3d366d5428915aa8e756 Mon Sep 17 00:00:00 2001 From: Antigravity Date: Sun, 5 Apr 2026 18:48:06 +0400 Subject: [PATCH 1/6] feat: GPU AllVsAll architecture - mass upload + single kernel launch --- src/AntiDupl/AntiDupl.cpp | 9 +- src/AntiDupl/AntiDupl.vcxproj | 18 +- src/AntiDupl/adConfig.h | 4 +- src/AntiDupl/adDataCollector.cpp | 40 +- src/AntiDupl/adDataCollector.h | 1 + src/AntiDupl/adDump.cpp | 4 +- src/AntiDupl/adEngine.cpp | 308 +++++++++++- src/AntiDupl/adEngine.h | 10 + src/AntiDupl/adGPU.cu | 726 ++++++++++++++++++++++++++++ src/AntiDupl/adGPU.h | 73 +++ src/AntiDupl/adGPUManager.cpp | 106 ++++ src/AntiDupl/adGPUManager.h | 100 ++++ src/AntiDupl/adIO.h | 11 +- src/AntiDupl/adImageComparer.cpp | 124 ++++- src/AntiDupl/adImageComparer.h | 2 + src/AntiDupl/adImageData.cpp | 4 + src/AntiDupl/adImageData.h | 3 + src/AntiDupl/adImageDataStorage.cpp | 44 +- src/AntiDupl/adImageDataStorage.h | 8 +- src/AntiDupl/adImageUtils.cpp | 6 +- src/AntiDupl/adJxl.cpp | 19 +- src/AntiDupl/adOpenJpeg.cpp | 25 +- src/AntiDupl/adThreadManagement.cpp | 51 +- 23 files changed, 1628 insertions(+), 68 deletions(-) create mode 100644 src/AntiDupl/adGPU.cu create mode 100644 src/AntiDupl/adGPU.h create mode 100644 src/AntiDupl/adGPUManager.cpp create mode 100644 src/AntiDupl/adGPUManager.h diff --git a/src/AntiDupl/AntiDupl.cpp b/src/AntiDupl/AntiDupl.cpp index 0890612f..9d289f47 100644 --- a/src/AntiDupl/AntiDupl.cpp +++ b/src/AntiDupl/AntiDupl.cpp @@ -80,6 +80,8 @@ typedef ad::TEngine* adEngineHandle; if(p == NULL) \ return AD_ERROR_INVALID_POINTER; +#define AD_DEBUG(msg) OutputDebugStringA(msg) + BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) { @@ -153,9 +155,14 @@ DLLAPI adEngineHandle adCreateA(const adCharA * userPath) DLLAPI adEngineHandle adCreateW(const adCharW * userPath) { + AD_DEBUG("adCreateW: Starting\n"); ad::DumpInit(); + AD_DEBUG("adCreateW: DumpInit done\n"); - return new ad::TEngine(ad::TString(userPath)); + adEngineHandle handle = new ad::TEngine(ad::TString(userPath)); + AD_DEBUG("adCreateW: TEngine created\n"); + + return handle; } DLLAPI adError adRelease(adEngineHandle handle) diff --git a/src/AntiDupl/AntiDupl.vcxproj b/src/AntiDupl/AntiDupl.vcxproj index e494d05a..492f37e0 100644 --- a/src/AntiDupl/AntiDupl.vcxproj +++ b/src/AntiDupl/AntiDupl.vcxproj @@ -23,6 +23,7 @@ DynamicLibrary + v143 @@ -51,14 +52,22 @@ - _WINDOWS;_USRDLL;ANTIDUPL_EXPORTS;%(PreprocessorDefinitions) + _WINDOWS;_USRDLL;ANTIDUPL_EXPORTS;ENABLE_CUDA;%(PreprocessorDefinitions) 4267 %(AdditionalIncludeDirectories) + cudart.lib;%(AdditionalDependencies) Windows MachineX64 + + 64 + + + 64 + compute_89,sm_89 + "$(ProjectDir)".\adExternal.cmd @@ -78,6 +87,7 @@ + @@ -113,6 +123,7 @@ + @@ -128,6 +139,8 @@ + + @@ -169,4 +182,7 @@ + + + \ No newline at end of file diff --git a/src/AntiDupl/adConfig.h b/src/AntiDupl/adConfig.h index 4bb98959..656d63e9 100644 --- a/src/AntiDupl/adConfig.h +++ b/src/AntiDupl/adConfig.h @@ -45,8 +45,8 @@ #include //#define AD_PERFORMANCE_TEST_ENABLE -//#define AD_LOGGER_ENABLE -//#define AD_DUMP_ENABLE +#define AD_LOGGER_ENABLE +#define AD_DUMP_ENABLE #define AD_TURBO_JPEG_ENABLE #include "adSimd.h" diff --git a/src/AntiDupl/adDataCollector.cpp b/src/AntiDupl/adDataCollector.cpp index 8ef65a35..e778218f 100644 --- a/src/AntiDupl/adDataCollector.cpp +++ b/src/AntiDupl/adDataCollector.cpp @@ -32,11 +32,16 @@ #include "adImageUtils.h" #include "adPixelData.h" #include "adBlurringDetector.h" +#include "adGPUManager.h" +#include + +#define AD_DEBUG(msg) OutputDebugStringA(msg) namespace ad { TDataCollector::TDataCollector(TEngine *pEngine) - :m_pOptions(pEngine->Options()), + :m_pEngine(pEngine), + m_pOptions(pEngine->Options()), m_pResult(pEngine->Result()) { for(int size = INITIAL_REDUCED_IMAGE_SIZE; size > m_pOptions->advanced.reducedImageSize; size >>= 1) @@ -98,13 +103,44 @@ namespace ad pImageData->imageExif = pImage->ImageExif(); - Simd::ResizeBilinear(gray, *m_pGrayBuffers.front()); + Simd::Resize(gray, *m_pGrayBuffers.front()); for(size_t i = 1; i < m_pGrayBuffers.size(); ++i) Simd::ReduceGray2x2(*m_pGrayBuffers[i - 1], *m_pGrayBuffers[i]); TPixelData & data = *pImageData->data; ReduceGray2x2(*m_pGrayBuffers.back(), TView(data.side, data.side, data.side, TView::Gray8, data.main)); data.filled = true; + if (m_pEngine->GpuManager() && m_pEngine->GpuManager()->IsAvailable()) + { + // Ensure GPU buffer is initialized before first upload + static bool gpuBufferInitialized = false; + if (!gpuBufferInitialized) + { + AD_DEBUG("FillPixelData: Initializing GPU buffer\n"); + size_t estimatedCapacity = 10000; // Start with reasonable estimate + size_t thumbSize = m_pOptions->advanced.reducedImageSize * m_pOptions->advanced.reducedImageSize; + if (m_pEngine->GpuManager()->EnsureCapacity(estimatedCapacity, thumbSize)) + { + gpuBufferInitialized = true; + AD_DEBUG("FillPixelData: GPU buffer initialized\n"); + } + else + { + AD_DEBUG("FillPixelData: GPU buffer initialization FAILED\n"); + } + } + + // Upload thumbnail to GPU immediately + if (m_pEngine->GpuManager()->UploadThumbnail(pImageData->globalIdx, data.main)) + { + // Successfully uploaded + } + else + { + AD_DEBUG("FillPixelData: UploadThumbnail FAILED\n"); + } + } + delete pImage; } else diff --git a/src/AntiDupl/adDataCollector.h b/src/AntiDupl/adDataCollector.h index 75647891..df87c0b8 100644 --- a/src/AntiDupl/adDataCollector.h +++ b/src/AntiDupl/adDataCollector.h @@ -38,6 +38,7 @@ namespace ad //------------------------------------------------------------------------- class TDataCollector { + TEngine *m_pEngine; TOptions *m_pOptions; TResultStorage *m_pResult; std::vector m_pGrayBuffers; diff --git a/src/AntiDupl/adDump.cpp b/src/AntiDupl/adDump.cpp index ee73bc4e..1b02b70b 100644 --- a/src/AntiDupl/adDump.cpp +++ b/src/AntiDupl/adDump.cpp @@ -31,7 +31,7 @@ namespace ad { - const DWORD STATUS_FATAL_APP_EXIT = 0x40000015; + const DWORD AD_STATUS_FATAL_APP_EXIT = 0x40000015; typedef BOOL (*MiniDumpWriteDumpPtr)( HANDLE hProcess, @@ -107,7 +107,7 @@ namespace ad { DWORD exception_code = info->ExceptionRecord->ExceptionCode; void *address = info->ExceptionRecord->ExceptionAddress; - if (exception_code == STATUS_FATAL_APP_EXIT) + if (exception_code == AD_STATUS_FATAL_APP_EXIT) {// abort() MessageBox(NULL, TEXT("Program aborted."), TEXT("Crash report"), MB_OK | MB_ICONERROR); diff --git a/src/AntiDupl/adEngine.cpp b/src/AntiDupl/adEngine.cpp index 73c22d15..a5985fd7 100644 --- a/src/AntiDupl/adEngine.cpp +++ b/src/AntiDupl/adEngine.cpp @@ -36,27 +36,111 @@ #include "adPerformance.h" #include "adLogger.h" #include "adFileUtils.h" +#include "adGPUManager.h" +#include "adStatus.h" +#include +#include +#include + +#define AD_DEBUG(msg) OutputDebugStringA(msg) +#define AD_DEBUG_FMT(msg, ...) \ + do { \ + char buf[512]; \ + snprintf(buf, sizeof(buf), msg, __VA_ARGS__); \ + OutputDebugStringA(buf); \ + } while(0) namespace ad { TEngine::TEngine(const TString & userPath) : _userPath(userPath) { + AD_DEBUG("TEngine: Constructor starting\n"); + #ifdef AD_LOGGER_ENABLE TLogger::s_logger.SetFileOut((UserPath() + TEXT("\\debug_log.txt")).c_str(), true); #endif//AD_LOGGER_ENABLE + + AD_DEBUG("TEngine: Creating TInit\n"); m_pInit = new TInit(); + + AD_DEBUG("TEngine: Creating TOptions\n"); m_pOptions = new TOptions(userPath); + + AD_DEBUG("TEngine: Creating TStatus\n"); m_pStatus = new TStatus(); + + AD_DEBUG("TEngine: Creating TGpuManager\n"); + m_pGpuManager = new TGpuManager(); + + AD_DEBUG("TEngine: TGpuManager created, IsAvailable=1\n"); + + if (m_pGpuManager->IsAvailable()) + { + AD_DEBUG("TEngine: GPU is available, getting device info\n"); + + const GpuDeviceInfo& info = m_pGpuManager->DeviceInfo(); + std::stringstream ss; + ss << "GPU acceleration initialized: " << info.name + << " (" << (info.totalGlobalMem / (1024 * 1024)) << " MB VRAM, Compute " + << info.computeMajor << "." << info.computeMinor << ")"; +#ifdef AD_LOGGER_ENABLE + AD_LOG(ss.str().c_str()); +#endif//AD_LOGGER_ENABLE + + // GPU Sanity Check: Test mathematical parity + AD_DEBUG("TEngine: Starting GPU sanity check\n"); + + const size_t testSize = 1024; + uint8_t h_test1[testSize], h_test2[testSize]; + double cpuSum = 0; + for(size_t i = 0; i < testSize; ++i) { + h_test1[i] = (uint8_t)(i % 256); + h_test2[i] = (uint8_t)(255 - (i % 256)); + double diff = (double)h_test1[i] - (double)h_test2[i]; + cpuSum += diff * diff; + } + + AD_DEBUG("TEngine: Calling GpuCompareSquaredSum\n"); + + double gpuSum = GpuCompareSquaredSum(h_test1, h_test2, testSize); + + AD_DEBUG("TEngine: GpuCompareSquaredSum returned\n"); + + std::stringstream ts; + ts << "CUDA Sanity Check: CPU=" << cpuSum << ", GPU=" << gpuSum; + double tolerance = cpuSum * 0.001; // 0.1% relative tolerance + if (fabs(cpuSum - gpuSum) <= tolerance) { + ts << " [SUCCESS - PARITY MATCH]"; + } else { + ts << " [FAILURE - MATH MISMATCH] Tolerance: " << tolerance; + } +#ifdef AD_LOGGER_ENABLE + AD_LOG(ts.str().c_str()); +#endif//AD_LOGGER_ENABLE + } + else + { + AD_DEBUG("TEngine: GPU not available\n"); +#ifdef AD_LOGGER_ENABLE + AD_LOG("GPU acceleration not available."); +#endif//AD_LOGGER_ENABLE + } + + AD_DEBUG("TEngine: Creating storage objects\n"); + m_pMistakeStorage = new TMistakeStorage(this); m_pImageDataStorage = new TImageDataStorage(this); - m_pRecycleBin = new TRecycleBin(this); + m_pRecycleBin = new TRecycleBin(this); m_pResult = new TResultStorage(this); m_pImageDataPtrs = new TImageDataPtrs(); m_pCriticalSection = new TCriticalSection(); m_pCompareManager = new TCompareManager(this); m_pCollectManager = new TCollectManager(this, m_pCompareManager); m_pSearcher = new TSearcher(this, m_pImageDataPtrs); + m_skipComparisonDuringCollection = false; + + AD_DEBUG("TEngine: Constructor finished successfully\n"); } TEngine::~TEngine() @@ -71,6 +155,7 @@ namespace ad delete m_pCollectManager; delete m_pSearcher; delete m_pRecycleBin; + delete m_pGpuManager; delete m_pStatus; delete m_pOptions; #ifdef AD_LOGGER_ENABLE @@ -82,42 +167,239 @@ namespace ad #endif//AD_LOGGER_ENABLE } + void TEngine::UpdateGpuDatabase() + { + AD_DEBUG("UpdateGpuDatabase: Starting\n"); + + if (m_pGpuManager && m_pGpuManager->IsAvailable()) + { + AD_DEBUG("UpdateGpuDatabase: GPU is available\n"); + + const TImageDataStorage::TStorage& storage = m_pImageDataStorage->Storage(); + AD_DEBUG("UpdateGpuDatabase: Storage size\n"); + + size_t reducedImageSize = m_pOptions->advanced.reducedImageSize; + size_t thumbSize = reducedImageSize * reducedImageSize; + AD_DEBUG("UpdateGpuDatabase: reducedImageSize and thumbSize calculated\n"); + + // Ensure GPU has enough capacity for the current database + AD_DEBUG("UpdateGpuDatabase: Calling EnsureCapacity\n"); + + if (!m_pGpuManager->EnsureCapacity(storage.size(), thumbSize)) + { + AD_DEBUG("UpdateGpuDatabase: EnsureCapacity FAILED\n"); +#ifdef AD_LOGGER_ENABLE + AD_LOG("GPU: Failed to ensure capacity for database."); +#endif + return; + } + + AD_DEBUG("UpdateGpuDatabase: EnsureCapacity succeeded\n"); + + size_t count = 0; + for (TImageDataStorage::TStorage::const_iterator it = storage.begin(); it != storage.end(); ++it) + { + TImageDataPtr pImageData = it->second; + if (pImageData->data && pImageData->data->filled && pImageData->data->main != nullptr) + { + if (m_pGpuManager->UploadThumbnail(pImageData->globalIdx, pImageData->data->main)) + { + count++; + } + else + { + AD_DEBUG("UpdateGpuDatabase: Upload FAILED\n"); + } + } + } + AD_DEBUG("UpdateGpuDatabase: Uploaded thumbnails\n"); + +#ifdef AD_LOGGER_ENABLE + if (count > 0) + { + std::stringstream ss; + ss << "GPU: Synchronized " << count << " thumbnails to VRAM."; + AD_LOG(ss.str().c_str()); + } +#endif + } + else + { + AD_DEBUG("UpdateGpuDatabase: GPU not available\n"); + } + AD_DEBUG("UpdateGpuDatabase: Finished\n"); + } + + // NEW: GPU AllVsAll comparison + void TEngine::ExecuteGpuAllVsAllComparison() + { + AD_DEBUG("ExecuteGpuAllVsAllComparison: Starting\n"); + + if (!m_pGpuManager || !m_pGpuManager->IsAvailable()) { + AD_DEBUG("ExecuteGpuAllVsAllComparison: GPU not available\n"); + return; + } + + const TImageDataStorage::TStorage& storage = m_pImageDataStorage->Storage(); + size_t count = storage.size(); + if (count == 0) { + AD_DEBUG("ExecuteGpuAllVsAllComparison: Empty storage\n"); + return; + } + + size_t reducedImageSize = m_pOptions->advanced.reducedImageSize; + size_t thumbSize = reducedImageSize * reducedImageSize; + + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: Preparing data for %zu images\n", count); + + // Собираем все thumbnails в один массив + std::vector allThumbnails(count * thumbSize); + size_t validCount = 0; + + size_t idx = 0; + for (TImageDataStorage::TStorage::const_iterator it = storage.begin(); it != storage.end(); ++it, ++idx) { + TImageDataPtr pImageData = it->second; + if (pImageData->data && pImageData->data->filled && pImageData->data->main != nullptr) { + memcpy(&allThumbnails[idx * thumbSize], pImageData->data->main, thumbSize); + validCount++; + } + } + + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: %zu valid thumbnails\n", validCount); + + // Вычисляем threshold как в оригинальном TImageComparer + int thresholdPerPixel = Simd::Square(m_pOptions->compare.thresholdDifference * PIXEL_MAX_DIFFERENCE) / + Simd::Square(DENOMINATOR); + int mainThreshold = (int)(thumbSize * thresholdPerPixel); + double threshold = (double)mainThreshold; + + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: thresholdPerPixel=%d, mainThreshold=%d, threshold=%f\n", + thresholdPerPixel, mainThreshold, threshold); + + // Выделяем память для результатов (предполагаем ~5% дубликатов) + size_t maxMatches = count * (count - 1) / 2; + if (maxMatches > 10000000) maxMatches = 10000000; // Ограничиваем 10M + + std::vector outImage1(maxMatches); + std::vector outImage2(maxMatches); + std::vector outDifference(maxMatches); + size_t matchCount = 0; + + AD_DEBUG("ExecuteGpuAllVsAllComparison: Calling GPU\n"); + + if (m_pGpuManager->CompareAllVsAll( + allThumbnails.data(), count, thumbSize, threshold, + outImage1.data(), outImage2.data(), outDifference.data(), + &matchCount, maxMatches)) + { + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: GPU returned %zu matches\n", matchCount); + + // Обрабатываем результаты + for (size_t i = 0; i < matchCount; i++) { + // Находим изображения по индексам + auto it1 = storage.begin(); + std::advance(it1, outImage1[i]); + TImageDataPtr pImage1 = it1->second; + + auto it2 = storage.begin(); + std::advance(it2, outImage2[i]); + TImageDataPtr pImage2 = it2->second; + + double maxDifference = (double)(Simd::Square(PIXEL_MAX_DIFFERENCE) * thumbSize); + double difference = sqrt((double)outDifference[i] / maxDifference) * 100; + if (pImage1->crc32c != pImage2->crc32c) + difference += ADDITIONAL_DIFFERENCE_FOR_DIFFERENT_CRC32; + + m_pResult->AddDuplImagePair(pImage1, pImage2, difference, AD_TRANSFORM_TURN_0); + } + + AD_DEBUG("ExecuteGpuAllVsAllComparison: Results processed\n"); + } + else { + AD_DEBUG("ExecuteGpuAllVsAllComparison: GPU comparison FAILED\n"); + } + + AD_DEBUG("ExecuteGpuAllVsAllComparison: Finished\n"); + } + void TEngine::Search() { + AD_DEBUG("Search: Starting\n"); + AD_FUNCTION_PERFORMANCE_TEST m_pStatus->ClearStatistic(); m_pStatus->SetProgress(0, 0); m_pResult->Clear(); + // 1. First, search for images on disk or load from DB + AD_DEBUG("Search: Calling SearchImages\n"); m_pSearcher->SearchImages(); + AD_DEBUG("Search: SearchImages completed\n"); - if(m_pOptions->compare.checkOnEquality == TRUE) - { - m_pCompareManager->Start(m_pImageDataPtrs->size()); - m_pCompareManager->SetPriority(THREAD_PRIORITY_LOWEST); - } + // 2. Start collection threads + AD_DEBUG("Search: Starting collection manager\n"); m_pCollectManager->Start(); m_pCollectManager->SetPriority(THREAD_PRIORITY_BELOW_NORMAL); - size_t current = 0, total = m_pImageDataPtrs->size(); - for(TImageDataPtrs::iterator it = m_pImageDataPtrs->begin(); + // 3. GPU AllVsAll comparison (если включено и доступно) + bool useGpu = (m_pGpuManager && m_pGpuManager->IsAvailable() && + m_pOptions->compare.algorithmComparing == AD_COMPARING_SQUARED_SUM && + m_pOptions->advanced.ignoreFrameWidth == 0); + + if (useGpu) + { + m_skipComparisonDuringCollection = true; // Отключаем старое сравнение ДО цикла + } + else + { + m_skipComparisonDuringCollection = false; + // 4. CPU comparison (старый подход) - нужно запустить CompareManager ДО сбора данных + AD_DEBUG("Search: Starting CPU comparison\n"); + + if(m_pOptions->compare.checkOnEquality == TRUE) + { + AD_DEBUG("Search: Starting compare manager\n"); + m_pCompareManager->Start(m_pImageDataPtrs->size()); + m_pCompareManager->SetPriority(THREAD_PRIORITY_NORMAL); + AD_DEBUG("Search: Compare manager started\n"); + } + } + + size_t current = 0, total = m_pImageDataPtrs->size(); + AD_DEBUG("Search: Total images to process\n"); + + for(TImageDataPtrs::iterator it = m_pImageDataPtrs->begin(); it != m_pImageDataPtrs->end() && !m_pStatus->Stopped(); ++it, ++current) { TImageDataPtr pImageData = *it; m_pCollectManager->Add(pImageData); m_pStatus->SetProgress(current, total); } + AD_DEBUG("Search: Collection loop finished\n"); + m_pCollectManager->Finish(); + AD_DEBUG("Search: Collection manager finished\n"); - if(m_pOptions->compare.checkOnEquality == TRUE) + if (useGpu) { - m_pStatus->SetProgress(current, total); - m_pStatus->Wait(AD_THREAD_TYPE_MAIN, 0); - m_pCompareManager->SetPriority(THREAD_PRIORITY_NORMAL); - m_pCompareManager->Finish(); + AD_DEBUG("Search: Using GPU AllVsAll comparison\n"); + ExecuteGpuAllVsAllComparison(); + m_skipComparisonDuringCollection = false; + AD_DEBUG("Search: GPU comparison completed\n"); + } + else + { + if(m_pOptions->compare.checkOnEquality == TRUE) + { + AD_DEBUG("Search: Waiting for compare manager to finish\n"); + m_pCompareManager->Finish(); + AD_DEBUG("Search: Compare manager finished\n"); + } } m_pImageDataPtrs->clear(); m_pStatus->Reset(); + + AD_DEBUG("Search: Completed successfully\n"); } } diff --git a/src/AntiDupl/adEngine.h b/src/AntiDupl/adEngine.h index 1395855e..60488656 100644 --- a/src/AntiDupl/adEngine.h +++ b/src/AntiDupl/adEngine.h @@ -44,6 +44,7 @@ namespace ad class TSearcher; class TRecycleBin; class TCriticalSection; + class TGpuManager; typedef TImageData *TImageDataPtr; //------------------------------------------------------------------------- @@ -56,6 +57,13 @@ namespace ad void Search(); + void UpdateGpuDatabase(); + void ExecuteGpuAllVsAllComparison(); + + // Flag to skip comparison during collection (for GPU AllVsAll mode) + bool m_skipComparisonDuringCollection; + bool SkipComparisonDuringCollection() const { return m_skipComparisonDuringCollection; } + const TString & UserPath() const { return _userPath; } TStatus* Status() {return m_pStatus;} TOptions* Options() {return m_pOptions;}; @@ -64,6 +72,7 @@ namespace ad TResultStorage* Result() {return m_pResult;} TCriticalSection* CriticalSection() {return m_pCriticalSection;} TRecycleBin* RecycleBin() {return m_pRecycleBin;} + TGpuManager* GpuManager() {return m_pGpuManager;} private: TString _userPath; @@ -79,6 +88,7 @@ namespace ad TInit *m_pInit; TSearcher *m_pSearcher; TRecycleBin *m_pRecycleBin; + TGpuManager *m_pGpuManager; }; //------------------------------------------------------------------------- } diff --git a/src/AntiDupl/adGPU.cu b/src/AntiDupl/adGPU.cu new file mode 100644 index 00000000..e5058330 --- /dev/null +++ b/src/AntiDupl/adGPU.cu @@ -0,0 +1,726 @@ +/* +* AntiDuplPlus Program (http://github.com/Sucotasch/AntiDuplPlus). +* +* Copyright (c) 2023-2026. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "adGPU.h" +#include "adLogger.h" +#include +#include +#include +#include +#include +#include + +#define AD_DEBUG(msg) OutputDebugStringA(msg) +#define AD_DEBUG_FMT(msg, ...) \ + do { \ + char buf[512]; \ + snprintf(buf, sizeof(buf), msg, __VA_ARGS__); \ + OutputDebugStringA(buf); \ + } while(0) + +namespace ad +{ + // Global VRAM Storage + static uint8_t* g_pDeviceThumbnailBuffer = nullptr; + static size_t g_bufferCapacity = 0; + static size_t g_thumbSize = 1024; // Default 32x32 + static uint8_t* g_pQueryBuffer = nullptr; + static double* g_pResultBuffer = nullptr; + static size_t* g_pIndexBuffer = nullptr; + + // --- Kernels --- + + // Match structure for sparse results + struct Match { + uint32_t image1; + uint32_t image2; + float difference; + }; + + __global__ void SquaredSumKernel(const uint8_t* pSrc1, const uint8_t* pSrc2, size_t size, double* pResult) + { + extern __shared__ double shared_data[]; + size_t tid = threadIdx.x; + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + double sum = 0; + if (i < size) { + double diff = (double)pSrc1[i] - (double)pSrc2[i]; + sum = diff * diff; + } + shared_data[tid] = sum; + __syncthreads(); + for (size_t s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) shared_data[tid] += shared_data[tid + s]; + __syncthreads(); + } + if (tid == 0) atomicAdd(pResult, shared_data[0]); + } + + // NEW: AllVsAll kernel — каждый блок обрабатывает одну строку (один i), каждый поток — один j + __global__ void AllVsAllKernel( + const uint8_t* thumbnails, // Все thumbnails в VRAM + size_t thumbSize, // Размер одного thumbnail (1024) + size_t count, // Общее количество изображений + double threshold, // Порог для дубликатов + Match* results, // Sparse buffer для результатов + size_t* matchCount) // Atomic counter + { + // Каждый блок обрабатывает одну строку i + size_t i = blockIdx.x; + if (i >= count) return; + + const uint8_t* thumb1 = thumbnails + i * thumbSize; + + // Каждый поток обрабатывает несколько j > i с stride + size_t numThreads = blockDim.x; + + for (size_t j = i + 1 + threadIdx.x; j < count; j += numThreads) { + const uint8_t* thumb2 = thumbnails + j * thumbSize; + + // Вычисляем squared difference + double sumSqDiff = 0; + for (size_t p = 0; p < thumbSize; p++) { + double diff = (double)thumb1[p] - (double)thumb2[p]; + sumSqDiff += diff * diff; + } + + // Если ниже threshold — записываем результат + if (sumSqDiff <= threshold) { + size_t idx = atomicAdd(matchCount, (size_t)1); + results[idx].image1 = (uint32_t)i; + results[idx].image2 = (uint32_t)j; + results[idx].difference = (float)sumSqDiff; + } + } + } + + __global__ void OneVsManyKernel(const uint8_t* pQuery, const uint8_t* pDatabase, + size_t thumbSize, size_t count, double* pResults) + { + size_t dbIdx = blockIdx.x; + if (dbIdx >= count) return; + + extern __shared__ double shared_sum[]; + size_t tid = threadIdx.x; + + double localSum = 0; + const uint8_t* pTarget = pDatabase + (size_t)dbIdx * thumbSize; + + for (size_t i = tid; i < thumbSize; i += blockDim.x) { + double diff = (double)pQuery[i] - (double)pTarget[i]; + localSum += diff * diff; + } + + shared_sum[tid] = localSum; + __syncthreads(); + + for (size_t s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) shared_sum[tid] += shared_sum[tid + s]; + __syncthreads(); + } + + if (tid == 0) { + pResults[dbIdx] = shared_sum[0]; + } + } + + __global__ void OneVsListKernel(const uint8_t* pQuery, const uint8_t* pDatabase, + const size_t* pIndices, size_t thumbSize, size_t count, + double* pResults, size_t maxBufferCapacity) + { + size_t listIdx = blockIdx.x; + if (listIdx >= count) return; + + extern __shared__ double shared_sum_list[]; + size_t tid = threadIdx.x; + + double localSum = 0; + size_t dbIdx = pIndices[listIdx]; + + // Check bounds to prevent out-of-bounds access + if (dbIdx >= maxBufferCapacity) { + if (tid == 0) { + pResults[listIdx] = 1e100; // Maximum difference (no match) + } + return; + } + + const uint8_t* pTarget = pDatabase + (size_t)dbIdx * thumbSize; + + for (size_t i = tid; i < thumbSize; i += blockDim.x) { + double diff = (double)pQuery[i] - (double)pTarget[i]; + localSum += diff * diff; + } + + shared_sum_list[tid] = localSum; + __syncthreads(); + + for (size_t s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) shared_sum_list[tid] += shared_sum_list[tid + s]; + __syncthreads(); + } + + if (tid == 0) { + pResults[listIdx] = shared_sum_list[0]; + } + } + + // --- Implementation --- + + bool GpuInit(GpuDeviceInfo* pInfo) + { + AD_DEBUG("GpuInit: Starting\n"); + + int deviceCount = 0; + cudaError_t err = cudaGetDeviceCount(&deviceCount); + if (err != cudaSuccess || deviceCount == 0) { + AD_DEBUG("GpuInit: cudaGetDeviceCount failed\n"); + return false; + } + + AD_DEBUG("GpuInit: Got device count\n"); + + cudaDeviceProp prop; + err = cudaGetDeviceProperties(&prop, 0); + if (err != cudaSuccess) { + AD_DEBUG("GpuInit: cudaGetDeviceProperties failed\n"); + return false; + } + + AD_DEBUG("GpuInit: Got device properties\n"); + + if (pInfo) { + strncpy(pInfo->name, prop.name, 256); + pInfo->totalGlobalMem = prop.totalGlobalMem; + pInfo->computeMajor = prop.major; + pInfo->computeMinor = prop.minor; + pInfo->isCompatible = (prop.major >= 8); + } + + AD_DEBUG("GpuInit: successful\n"); + return true; + } + + bool GpuCreateBuffer(size_t capacity, size_t thumbSize) + { + GpuReleaseBuffer(); + if (capacity == 0 || thumbSize == 0) return true; + + // Use temporary variable for validation before setting global state + size_t testThumbSize = thumbSize; + + size_t freeMem = 0, totalMem = 0; + cudaMemGetInfo(&freeMem, &totalMem); + + size_t requiredMem = capacity * testThumbSize + capacity * sizeof(double) + capacity * sizeof(size_t) + testThumbSize; + + if (requiredMem > (size_t)(freeMem * 0.8)) + { +#ifdef AD_LOGGER_ENABLE + std::stringstream ss; + ss << "GPU: Not enough free VRAM! Required: " << (requiredMem / 1024 / 1024) + << " MB, Free: " << (freeMem / 1024 / 1024) << " MB."; + AD_LOG(ss.str().c_str()); +#endif + return false; + } + + // Only set g_thumbSize after memory check passes + g_thumbSize = testThumbSize; + + cudaError_t err; + err = cudaMalloc(&g_pDeviceThumbnailBuffer, capacity * g_thumbSize); + if (err != cudaSuccess) goto error; + + err = cudaMalloc(&g_pQueryBuffer, g_thumbSize); + if (err != cudaSuccess) goto error; + + err = cudaMalloc(&g_pResultBuffer, capacity * sizeof(double)); + if (err != cudaSuccess) goto error; + + err = cudaMalloc(&g_pIndexBuffer, capacity * sizeof(size_t)); + if (err != cudaSuccess) goto error; + + g_bufferCapacity = capacity; +#ifdef AD_LOGGER_ENABLE + { + std::stringstream ss; + ss << "GPU: VRAM Allocated. Capacity: " << capacity << " units. Thumbnail Size: " << g_thumbSize << " bytes. Required: " << (requiredMem / 1024 / 1024) << " MB. Free VRAM: " << (freeMem / 1024 / 1024) << " MB."; + AD_LOG(ss.str().c_str()); + } +#endif + return true; + + error: + GpuReleaseBuffer(); + return false; + } + + void GpuReleaseBuffer() + { + if (g_pDeviceThumbnailBuffer) cudaFree(g_pDeviceThumbnailBuffer); + if (g_pQueryBuffer) cudaFree(g_pQueryBuffer); + if (g_pResultBuffer) cudaFree(g_pResultBuffer); + if (g_pIndexBuffer) cudaFree(g_pIndexBuffer); + g_pDeviceThumbnailBuffer = nullptr; + g_pQueryBuffer = nullptr; + g_pResultBuffer = nullptr; + g_pIndexBuffer = nullptr; + g_bufferCapacity = 0; + } + + void GpuRelease() + { + GpuReleaseBuffer(); + cudaDeviceReset(); + } + + bool GpuUploadThumbnail(size_t index, const uint8_t* pData) + { + fprintf(stderr, "GpuUploadThumbnail: index=%zu, g_pDeviceThumbnailBuffer=%p, g_bufferCapacity=%zu\n", + index, (void*)g_pDeviceThumbnailBuffer, g_bufferCapacity); + fflush(stderr); + + if (!g_pDeviceThumbnailBuffer) { + fprintf(stderr, "GpuUploadThumbnail: FAILED - buffer not allocated\n"); + fflush(stderr); + return false; + } + + if (index >= g_bufferCapacity) { + fprintf(stderr, "GpuUploadThumbnail: FAILED - index %zu exceeds capacity %zu\n", + index, g_bufferCapacity); + fflush(stderr); + return false; + } + + if (pData == nullptr) { + fprintf(stderr, "GpuUploadThumbnail: FAILED - null data pointer\n"); + fflush(stderr); + return false; + } + + cudaError_t err = cudaMemcpy(g_pDeviceThumbnailBuffer + (size_t)index * g_thumbSize, + pData, g_thumbSize, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + fprintf(stderr, "GpuUploadThumbnail: FAILED - cudaMemcpy error: %s\n", + cudaGetErrorString(err)); + fflush(stderr); + return false; + } + + fprintf(stderr, "GpuUploadThumbnail: Success\n"); + fflush(stderr); + return true; + } + + bool GpuCompareOneVsMany(const uint8_t* pQuery, size_t startIdx, size_t count, double threshold, + size_t* pMatchIndices, double* pMatchDifferences, size_t* pMatchCount, size_t maxMatches) + { + if (!g_pDeviceThumbnailBuffer || (startIdx + count) > g_bufferCapacity || count == 0 || + pQuery == nullptr || pMatchIndices == nullptr || pMatchDifferences == nullptr || + pMatchCount == nullptr || maxMatches == 0) + return false; + + if (cudaMemcpy(g_pQueryBuffer, pQuery, g_thumbSize, cudaMemcpyHostToDevice) != cudaSuccess) return false; + + // Check for integer overflow before kernel launch + if (count > INT_MAX) { +#ifdef AD_LOGGER_ENABLE + AD_LOG("GPU: Count exceeds INT_MAX, cannot launch kernel"); +#endif + return false; + } + + int threadsPerBlock = 256; + OneVsManyKernel<<< (int)count, threadsPerBlock, threadsPerBlock * sizeof(double) >>>( + g_pQueryBuffer, g_pDeviceThumbnailBuffer + (size_t)startIdx * g_thumbSize, g_thumbSize, count, g_pResultBuffer); + + if (cudaGetLastError() != cudaSuccess) return false; + if (cudaDeviceSynchronize() != cudaSuccess) return false; + + std::vector results(count); + if (cudaMemcpy(results.data(), g_pResultBuffer, count * sizeof(double), cudaMemcpyDeviceToHost) != cudaSuccess) return false; + + size_t found = 0; + for (size_t i = 0; i < count && found < maxMatches; ++i) { + if (results[i] <= threshold) { + pMatchIndices[found] = startIdx + i; + pMatchDifferences[found] = results[i]; + found++; + } + } + *pMatchCount = found; + return true; + } + + bool GpuCompareOneVsList(const uint8_t* pQuery, const size_t* pIndices, size_t count, double threshold, + size_t* pMatchIndices, double* pMatchDifferences, size_t* pMatchCount, size_t maxMatches) + { + fprintf(stderr, "GpuCompareOneVsList: Starting, count=%zu, threshold=%f\n", count, threshold); + fflush(stderr); + + if (!g_pDeviceThumbnailBuffer || count > g_bufferCapacity || count == 0 || + pQuery == nullptr || pIndices == nullptr || pMatchIndices == nullptr || + pMatchDifferences == nullptr || pMatchCount == nullptr || maxMatches == 0) { + fprintf(stderr, "GpuCompareOneVsList: Parameter validation FAILED\n"); + fflush(stderr); + return false; + } + + fprintf(stderr, "GpuCompareOneVsList: Copying query to device\n"); + fflush(stderr); + if (cudaMemcpy(g_pQueryBuffer, pQuery, g_thumbSize, cudaMemcpyHostToDevice) != cudaSuccess) { + fprintf(stderr, "GpuCompareOneVsList: Query copy FAILED\n"); + fflush(stderr); + return false; + } + + fprintf(stderr, "GpuCompareOneVsList: Copying indices to device\n"); + fflush(stderr); + if (cudaMemcpy(g_pIndexBuffer, pIndices, count * sizeof(size_t), cudaMemcpyHostToDevice) != cudaSuccess) { + fprintf(stderr, "GpuCompareOneVsList: Indices copy FAILED\n"); + fflush(stderr); + return false; + } + + // Check for integer overflow before kernel launch + if (count > INT_MAX) { + fprintf(stderr, "GpuCompareOneVsList: Count exceeds INT_MAX\n"); + fflush(stderr); +#ifdef AD_LOGGER_ENABLE + AD_LOG("GPU: Count exceeds INT_MAX, cannot launch kernel"); +#endif + return false; + } + + fprintf(stderr, "GpuCompareOneVsList: Launching kernel with count=%zu\n", count); + fflush(stderr); + + int threadsPerBlock = 256; + OneVsListKernel<<< (int)count, threadsPerBlock, threadsPerBlock * sizeof(double) >>>( + g_pQueryBuffer, g_pDeviceThumbnailBuffer, g_pIndexBuffer, g_thumbSize, count, g_pResultBuffer, g_bufferCapacity); + + fprintf(stderr, "GpuCompareOneVsList: Kernel launched, checking for errors\n"); + fflush(stderr); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "GpuCompareOneVsList: Kernel launch error: %s\n", cudaGetErrorString(err)); + fflush(stderr); + return false; + } + + fprintf(stderr, "GpuCompareOneVsList: Synchronizing device\n"); + fflush(stderr); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + fprintf(stderr, "GpuCompareOneVsList: Device sync error: %s\n", cudaGetErrorString(err)); + fflush(stderr); + return false; + } + + fprintf(stderr, "GpuCompareOneVsList: Copying results from device\n"); + fflush(stderr); + + std::vector results(count); + if (cudaMemcpy(results.data(), g_pResultBuffer, count * sizeof(double), cudaMemcpyDeviceToHost) != cudaSuccess) { + fprintf(stderr, "GpuCompareOneVsList: Results copy FAILED\n"); + fflush(stderr); + return false; + } + + fprintf(stderr, "GpuCompareOneVsList: Processing results\n"); + fflush(stderr); + + size_t found = 0; + for (size_t i = 0; i < count && found < maxMatches; ++i) { + if (results[i] <= threshold) { + pMatchIndices[found] = pIndices[i]; + pMatchDifferences[found] = results[i]; + found++; + } + } + *pMatchCount = found; + + fprintf(stderr, "GpuCompareOneVsList: Completed, found=%zu matches\n", found); + fflush(stderr); + + return true; + } + + double GpuCompareSquaredSum(const uint8_t* pSrc1, const uint8_t* pSrc2, size_t size) + { + fprintf(stderr, "GpuCompareSquaredSum: Starting, size=%zu\n", size); + fflush(stderr); + + if (pSrc1 == nullptr || pSrc2 == nullptr) { + fprintf(stderr, "GpuCompareSquaredSum: Null input pointers\n"); + fflush(stderr); + return 1e10; + } + + // Check CUDA device availability + int deviceCount = 0; + cudaError_t testErr = cudaGetDeviceCount(&deviceCount); + if (testErr != cudaSuccess || deviceCount == 0) { + fprintf(stderr, "GpuCompareSquaredSum: No CUDA devices available\n"); + fflush(stderr); + return 1e10; + } + + uint8_t *d_1 = nullptr, *d_2 = nullptr; + double *d_r = nullptr, h_r = 0; + size_t numBlocks = 0; + + fprintf(stderr, "GpuCompareSquaredSum: Allocating device memory\n"); + fflush(stderr); + + cudaError_t err; + err = cudaMalloc(&d_1, size); + if (err != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: cudaMalloc d_1 failed: %s\n", cudaGetErrorString(err)); + fflush(stderr); + goto cleanup; + } + + err = cudaMalloc(&d_2, size); + if (err != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: cudaMalloc d_2 failed: %s\n", cudaGetErrorString(err)); + fflush(stderr); + goto cleanup; + } + + err = cudaMalloc(&d_r, sizeof(double)); + if (err != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: cudaMalloc d_r failed: %s\n", cudaGetErrorString(err)); + fflush(stderr); + goto cleanup; + } + + fprintf(stderr, "GpuCompareSquaredSum: Copying data to device\n"); + fflush(stderr); + + if (cudaMemcpy(d_1, pSrc1, size, cudaMemcpyHostToDevice) != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: cudaMemcpy d_1 failed\n"); + fflush(stderr); + goto cleanup; + } + if (cudaMemcpy(d_2, pSrc2, size, cudaMemcpyHostToDevice) != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: cudaMemcpy d_2 failed\n"); + fflush(stderr); + goto cleanup; + } + if (cudaMemset(d_r, 0, sizeof(double)) != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: cudaMemset d_r failed\n"); + fflush(stderr); + goto cleanup; + } + + // Check for integer overflow before kernel launch + numBlocks = (size + 255) / 256; + if (numBlocks > INT_MAX) { + fprintf(stderr, "GpuCompareSquaredSum: Block count exceeds INT_MAX\n"); + fflush(stderr); +#ifdef AD_LOGGER_ENABLE + AD_LOG("GPU: Block count exceeds INT_MAX, cannot launch kernel"); +#endif + goto cleanup; + } + + fprintf(stderr, "GpuCompareSquaredSum: Launching kernel with %zu blocks\n", numBlocks); + fflush(stderr); + + SquaredSumKernel<<< (int)numBlocks, 256, 256 * sizeof(double) >>>(d_1, d_2, size, d_r); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: Kernel launch failed: %s\n", cudaGetErrorString(err)); + fflush(stderr); + goto cleanup; + } + + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: Device synchronize failed: %s\n", cudaGetErrorString(err)); + fflush(stderr); + goto cleanup; + } + + fprintf(stderr, "GpuCompareSquaredSum: Copying result from device\n"); + fflush(stderr); + + err = cudaMemcpy(&h_r, d_r, sizeof(double), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + fprintf(stderr, "GpuCompareSquaredSum: cudaMemcpy result failed: %s\n", cudaGetErrorString(err)); + fflush(stderr); + goto cleanup; + } + + fprintf(stderr, "GpuCompareSquaredSum: Success, result=%f\n", h_r); + fflush(stderr); + + cleanup: + if (d_1) cudaFree(d_1); + if (d_2) cudaFree(d_2); + if (d_r) cudaFree(d_r); + return h_r; + } + + // NEW: AllVsAll comparison с массовым upload + bool GpuCompareAllVsAll( + const uint8_t* allThumbnails, // Все thumbnails в RAM (непрерывный массив) + size_t count, // Количество изображений + size_t thumbSize, // Размер одного thumbnail (1024) + double threshold, // Порог для дубликатов + uint32_t* outImage1, // Массив для image1 (результат) + uint32_t* outImage2, // Массив для image2 (результат) + float* outDifference, // Массив для difference (результат) + size_t* outMatchCount, // Количество найденных дубликатов + size_t maxMatches) // Максимальное количество результатов + { + AD_DEBUG("GpuCompareAllVsAll: Starting\n"); + + if (!allThumbnails || count == 0 || thumbSize == 0 || !outImage1 || !outImage2 || !outDifference || !outMatchCount) { + AD_DEBUG("GpuCompareAllVsAll: Invalid parameters\n"); + return false; + } + + size_t totalPairs = count * (count - 1) / 2; + AD_DEBUG_FMT("GpuCompareAllVsAll: Comparing %zu images, %zu pairs, threshold=%f\n", count, totalPairs, threshold); + + // Выделяем VRAM для thumbnails + uint8_t* d_thumbnails = nullptr; + Match* d_results = nullptr; + size_t* d_matchCount = nullptr; + + cudaError_t err; + + // 1. Выделяем память для thumbnails + AD_DEBUG("GpuCompareAllVsAll: Allocating VRAM for thumbnails\n"); + err = cudaMalloc(&d_thumbnails, count * thumbSize); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Failed to allocate thumbnails VRAM: %s\n", cudaGetErrorString(err)); + return false; + } + + // 2. Выделяем память для результатов (sparse buffer) + AD_DEBUG("GpuCompareAllVsAll: Allocating VRAM for results\n"); + err = cudaMalloc(&d_results, maxMatches * sizeof(Match)); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Failed to allocate results VRAM: %s\n", cudaGetErrorString(err)); + cudaFree(d_thumbnails); + return false; + } + + // 3. Выделяем память для counter + err = cudaMalloc(&d_matchCount, sizeof(size_t)); + if (err != cudaSuccess) { + AD_DEBUG("GpuCompareAllVsAll: Failed to allocate counter VRAM\n"); + cudaFree(d_thumbnails); + cudaFree(d_results); + return false; + } + + // 4. Один массовый upload всех thumbnails + AD_DEBUG("GpuCompareAllVsAll: Uploading all thumbnails to VRAM\n"); + err = cudaMemcpy(d_thumbnails, allThumbnails, count * thumbSize, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Upload failed: %s\n", cudaGetErrorString(err)); + cudaFree(d_thumbnails); + cudaFree(d_results); + cudaFree(d_matchCount); + return false; + } + AD_DEBUG("GpuCompareAllVsAll: Upload complete\n"); + + // 5. Инициализируем counter + size_t h_matchCount = 0; + cudaMemcpy(d_matchCount, &h_matchCount, sizeof(size_t), cudaMemcpyHostToDevice); + + // 6. Запускаем kernel + AD_DEBUG("GpuCompareAllVsAll: Launching kernel\n"); + + int threadsPerBlock = 256; + // Теперь каждый блок = одна строка i, так что blocks = count + size_t blocks = count; + if (blocks > 65535) blocks = 65535; // Ограничение CUDA + + AD_DEBUG_FMT("GpuCompareAllVsAll: Launching %zu blocks, %d threads/block\n", blocks, threadsPerBlock); + + AllVsAllKernel<<<(int)blocks, threadsPerBlock>>>( + d_thumbnails, thumbSize, count, threshold, d_results, d_matchCount); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Kernel launch failed: %s\n", cudaGetErrorString(err)); + cudaFree(d_thumbnails); + cudaFree(d_results); + cudaFree(d_matchCount); + return false; + } + + // 7. Ждём завершения + AD_DEBUG("GpuCompareAllVsAll: Synchronizing\n"); + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Sync failed: %s\n", cudaGetErrorString(err)); + cudaFree(d_thumbnails); + cudaFree(d_results); + cudaFree(d_matchCount); + return false; + } + + // 8. Считываем counter + AD_DEBUG("GpuCompareAllVsAll: Reading match count\n"); + cudaMemcpy(&h_matchCount, d_matchCount, sizeof(size_t), cudaMemcpyDeviceToHost); + + AD_DEBUG_FMT("GpuCompareAllVsAll: Found %zu matches\n", h_matchCount); + + // 9. Считываем результаты + if (h_matchCount > 0) { + size_t readCount = (h_matchCount < maxMatches) ? h_matchCount : maxMatches; + AD_DEBUG_FMT("GpuCompareAllVsAll: Reading %zu results\n", readCount); + + std::vector h_results(readCount); + cudaMemcpy(h_results.data(), d_results, readCount * sizeof(Match), cudaMemcpyDeviceToHost); + + // Копируем в выходные массивы + for (size_t i = 0; i < readCount; i++) { + outImage1[i] = h_results[i].image1; + outImage2[i] = h_results[i].image2; + outDifference[i] = h_results[i].difference; + } + } + + *outMatchCount = h_matchCount; + + // 10. Освобождаем VRAM + cudaFree(d_thumbnails); + cudaFree(d_results); + cudaFree(d_matchCount); + + AD_DEBUG("GpuCompareAllVsAll: Complete\n"); + return true; + } +} diff --git a/src/AntiDupl/adGPU.h b/src/AntiDupl/adGPU.h new file mode 100644 index 00000000..1299a68b --- /dev/null +++ b/src/AntiDupl/adGPU.h @@ -0,0 +1,73 @@ +/* +* AntiDuplPlus Program (http://github.com/Sucotasch/AntiDuplPlus). +* +* Copyright (c) 2023-2026. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#ifndef __adGPU_h__ +#define __adGPU_h__ + +#include "adConfig.h" + +namespace ad +{ + struct GpuDeviceInfo + { + char name[256]; + size_t totalGlobalMem; + int computeMajor; + int computeMinor; + bool isCompatible; + }; + + bool GpuInit(GpuDeviceInfo* pInfo); + void GpuRelease(); + + // Persistent Buffer Management + bool GpuCreateBuffer(size_t capacity, size_t thumbSize); + void GpuReleaseBuffer(); + bool GpuUploadThumbnail(size_t index, const uint8_t* pData); + + // Batch Comparison: Compare one query thumbnail against a range of thumbnails in the buffer + // Returns number of matches found (difference <= threshold) + bool GpuCompareOneVsMany(const uint8_t* pQuery, size_t startIdx, size_t count, double threshold, + size_t* pMatchIndices, double* pMatchDifferences, size_t* pMatchCount, size_t maxMatches); + + // Compare one query thumbnail against a list of thumbnail indices in the buffer + bool GpuCompareOneVsList(const uint8_t* pQuery, const size_t* pIndices, size_t count, double threshold, + size_t* pMatchIndices, double* pMatchDifferences, size_t* pMatchCount, size_t maxMatches); + + // Single comparison (optimized, no malloc inside) + double GpuCompareSquaredSum(const uint8_t* pSrc1, const uint8_t* pSrc2, size_t size); + + // NEW: AllVsAll comparison с массовым upload + bool GpuCompareAllVsAll( + const uint8_t* allThumbnails, // Все thumbnails в RAM (непрерывный массив) + size_t count, // Количество изображений + size_t thumbSize, // Размер одного thumbnail (1024) + double threshold, // Порог для дубликатов + uint32_t* outImage1, // Массив для image1 (результат) + uint32_t* outImage2, // Массив для image2 (результат) + float* outDifference, // Массив для difference (результат) + size_t* outMatchCount, // Количество найденных дубликатов + size_t maxMatches); // Максимальное количество результатов +} + +#endif//__adGPU_h__ diff --git a/src/AntiDupl/adGPUManager.cpp b/src/AntiDupl/adGPUManager.cpp new file mode 100644 index 00000000..c1610501 --- /dev/null +++ b/src/AntiDupl/adGPUManager.cpp @@ -0,0 +1,106 @@ +/* +* AntiDuplPlus Program (http://github.com/Sucotasch/AntiDuplPlus). +* +* Copyright (c) 2023-2026. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "adGPUManager.h" +#include "adLogger.h" +#include +#include + +#define AD_DEBUG(msg) OutputDebugStringA(msg) + +namespace ad +{ + TGpuManager::TGpuManager() + : m_available(false), m_capacity(0), m_thumbSize(0) + { + AD_DEBUG("TGpuManager: Constructor starting\n"); + memset(&m_deviceInfo, 0, sizeof(m_deviceInfo)); + try + { + AD_DEBUG("TGpuManager: Calling GpuInit\n"); + if (GpuInit(&m_deviceInfo)) + { + AD_DEBUG("TGpuManager: GpuInit succeeded, checking compatibility\n"); + if (m_deviceInfo.isCompatible) + { + m_available = true; + AD_DEBUG("TGpuManager: GPU available and compatible\n"); + } + else + { + AD_DEBUG("TGpuManager: GPU not compatible\n"); + } + } + else + { + AD_DEBUG("TGpuManager: GpuInit failed\n"); + } + } + catch (const std::exception& e) + { + AD_DEBUG("TGpuManager: Exception caught\n"); + m_available = false; +#ifdef AD_LOGGER_ENABLE + AD_LOG("GPU: Exception during initialization."); +#endif + } + catch (...) + { + AD_DEBUG("TGpuManager: Unknown exception caught\n"); + m_available = false; +#ifdef AD_LOGGER_ENABLE + AD_LOG("GPU: Unknown exception during initialization."); +#endif + } + AD_DEBUG("TGpuManager: Constructor finished\n"); + } + + TGpuManager::~TGpuManager() + { + if (m_available) + { + std::lock_guard lock(m_mutex); + GpuRelease(); + } + } + + bool TGpuManager::EnsureCapacity(size_t required, size_t thumbSize) + { + if (!m_available) return false; + + std::lock_guard lock(m_mutex); + if (required <= m_capacity && thumbSize == m_thumbSize && m_capacity > 0) return true; + + // Reallocate if size changed or capacity is too small + size_t newCapacity = (size_t)(required * 1.2); + if (newCapacity < 1024) newCapacity = 1024; // Minimum buffer to avoid frequent reallocs + + if (GpuCreateBuffer(newCapacity, thumbSize)) + { + m_capacity = newCapacity; + m_thumbSize = thumbSize; + return true; + } + return false; + } +} diff --git a/src/AntiDupl/adGPUManager.h b/src/AntiDupl/adGPUManager.h new file mode 100644 index 00000000..6ec1f0d8 --- /dev/null +++ b/src/AntiDupl/adGPUManager.h @@ -0,0 +1,100 @@ +/* +* AntiDuplPlus Program (http://github.com/Sucotasch/AntiDuplPlus). +* +* Copyright (c) 2023-2026. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#ifndef __adGPUManager_h__ +#define __adGPUManager_h__ + +#include "adGPU.h" +#include + +namespace ad +{ + class TGpuManager + { + public: + TGpuManager(); + ~TGpuManager(); + + bool IsAvailable() const { return m_available; } + const GpuDeviceInfo& DeviceInfo() const { return m_deviceInfo; } + + bool UploadThumbnail(size_t index, const uint8_t* pData) { + if (!m_available) return false; + std::lock_guard lock(m_mutex); + return GpuUploadThumbnail(index, pData); + } + + bool CompareOneVsMany(const uint8_t* pQuery, size_t startIdx, size_t count, double threshold, + size_t* pMatchIndices, double* pMatchDifferences, size_t* pMatchCount, size_t maxMatches) { + if (!m_available) return false; + std::lock_guard lock(m_mutex); + return GpuCompareOneVsMany(pQuery, startIdx, count, threshold, + pMatchIndices, pMatchDifferences, pMatchCount, maxMatches); + } + + bool CompareOneVsList(const uint8_t* pQuery, const size_t* pIndices, size_t count, double threshold, + size_t* pMatchIndices, double* pMatchDifferences, size_t* pMatchCount, size_t maxMatches) { + if (!m_available) return false; + std::lock_guard lock(m_mutex); + return GpuCompareOneVsList(pQuery, pIndices, count, threshold, + pMatchIndices, pMatchDifferences, pMatchCount, maxMatches); + } + + void ClearBuffer() { + if (!m_available) return; + std::lock_guard lock(m_mutex); + size_t size = m_capacity; // Reuse current capacity + GpuReleaseBuffer(); + GpuCreateBuffer(size, m_thumbSize); + } + + bool EnsureCapacity(size_t required, size_t thumbSize); + + // NEW: AllVsAll comparison + bool CompareAllVsAll( + const uint8_t* allThumbnails, + size_t count, + size_t thumbSize, + double threshold, + uint32_t* outImage1, + uint32_t* outImage2, + float* outDifference, + size_t* outMatchCount, + size_t maxMatches) { + if (!m_available) return false; + std::lock_guard lock(m_mutex); + return GpuCompareAllVsAll(allThumbnails, count, thumbSize, threshold, + outImage1, outImage2, outDifference, + outMatchCount, maxMatches); + } + + private: + bool m_available; + GpuDeviceInfo m_deviceInfo; + size_t m_capacity; + size_t m_thumbSize; + mutable std::mutex m_mutex; + }; +} + +#endif//__adGPUManager_h__ diff --git a/src/AntiDupl/adIO.h b/src/AntiDupl/adIO.h index 5f8d88a4..2d40898f 100644 --- a/src/AntiDupl/adIO.h +++ b/src/AntiDupl/adIO.h @@ -24,6 +24,7 @@ #ifndef __adIO_h__ #define __adIO_h__ +#include #include "adConfig.h" namespace ad @@ -106,7 +107,7 @@ namespace ad { std::stringstream ss; ss << "Can't read " << sizeof(T) << " bytes from stream!"; - throw std::exception(ss.str().c_str()); + throw std::runtime_error(ss.str()); } return (T)0; } @@ -155,7 +156,7 @@ namespace ad { std::stringstream ss; ss << "Can't read " << size << " bytes from stream!"; - throw std::exception(ss.str().c_str()); + throw std::runtime_error(ss.str()); } } } @@ -167,7 +168,7 @@ namespace ad LARGE_INTEGER shift; shift.QuadPart = size; if(pStream->Seek(shift, STREAM_SEEK_CUR, NULL) != S_OK) - throw std::exception("Can't read stub buffer!"); + throw std::runtime_error("Can't read stub buffer!"); } } @@ -176,7 +177,7 @@ namespace ad ULARGE_INTEGER position; LARGE_INTEGER shift = {0}; if(pStream->Seek(shift, STREAM_SEEK_CUR, &position) != S_OK) - throw std::exception("Can't get current stream position!"); + throw std::runtime_error("Can't get current stream position!"); return position.QuadPart; } @@ -185,7 +186,7 @@ namespace ad LARGE_INTEGER shift; shift.QuadPart = position; if(pStream->Seek(shift, STREAM_SEEK_CUR, NULL) != S_OK) - throw std::exception("Can't get current stream position!"); + throw std::runtime_error("Can't get current stream position!"); } } diff --git a/src/AntiDupl/adImageComparer.cpp b/src/AntiDupl/adImageComparer.cpp index 5dfd6e8b..2cf8321e 100644 --- a/src/AntiDupl/adImageComparer.cpp +++ b/src/AntiDupl/adImageComparer.cpp @@ -31,6 +31,10 @@ #include "adResultStorage.h" #include "adImageComparer.h" #include "adImageDataStorage.h" +#include "adGPUManager.h" +#include + +#define AD_DEBUG(msg) OutputDebugStringA(msg) namespace ad { @@ -39,7 +43,8 @@ namespace ad //------------------------------------------------------------------------- TImageComparer::TImageComparer(TEngine *pEngine) - :m_pOptions(pEngine->Options()), + :m_pEngine(pEngine), + m_pOptions(pEngine->Options()), m_pResult(pEngine->Result()), m_pTransformedImageData(NULL), m_pBuffer(NULL), @@ -116,25 +121,111 @@ namespace ad // pTransformed - трансформированное, если применяется трансформация или то же что и оригинальное. void TImageComparer::CompareWithSet(const Set &set, TImageDataPtr pOriginal, TImageDataPtr pTransformed, adTransformType transform) { - double difference; - // Если картинка не в проверенных - if(!pTransformed->valid) + TGpuManager* pGpu = m_pEngine->GpuManager(); + bool gpuEligible = (pGpu && pGpu->IsAvailable() && + m_pOptions->compare.algorithmComparing == AD_COMPARING_SQUARED_SUM && + m_pOptions->advanced.ignoreFrameWidth == 0); + + if (gpuEligible) + { + if(!pTransformed->valid) + CompareWithSetGPU(set.valid, pOriginal, pTransformed, transform); + CompareWithSetGPU(set.other, pOriginal, pTransformed, transform); + } + else { - // Сравниваем с набором проверенных - for(TImageDataPtrList::const_iterator i = set.valid.begin(); i != set.valid.end(); ++i) + double difference; + // Если картинка не в проверенных + if(!pTransformed->valid) + { + // Сравниваем с набором проверенных + for(TImageDataPtrList::const_iterator i = set.valid.begin(); i != set.valid.end(); ++i) + { + if(IsDuplPair(pTransformed, *i, &difference)) + m_pResult->AddDuplImagePair(pOriginal, *i, difference, transform); + } + } + // Сравниваем с набором остальных + for(TImageDataPtrList::const_iterator i = set.other.begin(); i != set.other.end(); ++i) { if(IsDuplPair(pTransformed, *i, &difference)) m_pResult->AddDuplImagePair(pOriginal, *i, difference, transform); } } - // Сравниваем с набором остальных - for(TImageDataPtrList::const_iterator i = set.other.begin(); i != set.other.end(); ++i) - { - if(IsDuplPair(pTransformed, *i, &difference)) - m_pResult->AddDuplImagePair(pOriginal, *i, difference, transform); - } } + void TImageComparer::CompareWithSetGPU(const TImageDataPtrList &list, TImageDataPtr pOriginal, TImageDataPtr pTransformed, adTransformType transform) + { + AD_DEBUG("CompareWithSetGPU: Starting\n"); + + if (list.empty()) return; + + TGpuManager* pGpu = m_pEngine->GpuManager(); + double threshold = (double)m_mainThreshold; + + AD_DEBUG("CompareWithSetGPU: Gathering indices\n"); + + // Gather indices + std::vector indices; + std::vector ptrs; + indices.reserve(list.size()); + ptrs.reserve(list.size()); + + for (TImageDataPtrList::const_iterator i = list.begin(); i != list.end(); ++i) + { + TImageDataPtr pSecond = *i; + // Basic CPU-side pre-checks to avoid unnecessary GPU work + if(m_pOptions->compare.typeControl == TRUE && pTransformed->type != pSecond->type) continue; + if(m_pOptions->compare.sizeControl == TRUE && (pTransformed->height != pSecond->height || pTransformed->width != pSecond->width)) continue; + if(m_pOptions->compare.ratioControl == TRUE && Simd::Square(pTransformed->ratio - pSecond->ratio) > Simd::Square(RATIO_THRESHOLD_DIFFERENCE)) continue; + if(m_pOptions->compare.compareInsideOneFolder == FALSE && TPath::EqualByDirectory(pTransformed->path, pSecond->path)) continue; + if(m_pOptions->compare.compareInsideOneSearchPath == FALSE && pTransformed->index == pSecond->index) continue; + + indices.push_back(pSecond->globalIdx); + ptrs.push_back(pSecond); + } + + AD_DEBUG("CompareWithSetGPU: Processing batches\n"); + + if (indices.empty()) return; + + const size_t batchSize = 1024; // Process in chunks to manage VRAM/latency + + for (size_t start = 0; start < indices.size(); start += batchSize) + { + size_t count = std::min(batchSize, indices.size() - start); + + std::vector matchIndices(count); + std::vector matchDiffs(count); + size_t matchCount = 0; + + if (pGpu->CompareOneVsList(pTransformed->data->main, &indices[start], count, threshold, + matchIndices.data(), matchDiffs.data(), &matchCount, count)) + { + for (size_t m = 0; m < matchCount; ++m) + { + // Find original pointer by matching globalIdx + for (size_t b = 0; b < count; ++b) { + if (indices[start + b] == matchIndices[m]) { + TImageDataPtr pSecond = ptrs[start + b]; + double difference = sqrt(matchDiffs[m]/m_maxDifference)*100; + if(pOriginal->crc32c != pSecond->crc32c) + difference += ADDITIONAL_DIFFERENCE_FOR_DIFFERENT_CRC32; + m_pResult->AddDuplImagePair(pOriginal, pSecond, difference, transform); + break; + } + } + } + } + else + { + AD_DEBUG("CompareWithSetGPU: GpuCompareOneVsList FAILED\n"); + } + } + + AD_DEBUG("CompareWithSetGPU: Finished\n"); + } + void TImageComparer::AddToSet(Set &set, TImageDataPtr pImageData) { if(pImageData->valid) @@ -173,17 +264,18 @@ namespace ad if(fastDifference > m_fastThreshold) return false; - uint64_t mainDifference = 0; + uint64_t mainDifference = 0; if(m_pOptions->advanced.ignoreFrameWidth > 0) { SimdSquaredDifferenceSumMasked(pFirst->data->main, m_mainSize, pSecond->data->main, m_mainSize, - m_pMask, m_mainSize, FRAME_MASK_INDEX, m_mainSize, 1, &mainDifference); + m_pMask, m_mainSize, FRAME_MASK_INDEX, m_mainSize, 1, &mainDifference); } else { - SimdSquaredDifferenceSum(pFirst->data->main, m_mainSize, pSecond->data->main, m_mainSize, - m_mainSize, 1, &mainDifference); + SimdSquaredDifferenceSum(pFirst->data->main, m_mainSize, pSecond->data->main, m_mainSize, + m_mainSize, 1, &mainDifference); } + if(mainDifference > m_mainThreshold) return false; diff --git a/src/AntiDupl/adImageComparer.h b/src/AntiDupl/adImageComparer.h index 572b0cef..e8382a89 100644 --- a/src/AntiDupl/adImageComparer.h +++ b/src/AntiDupl/adImageComparer.h @@ -50,6 +50,7 @@ namespace ad typedef std::vector Sets; Sets m_sets; + TEngine *m_pEngine; TOptions *m_pOptions; public: TImageComparer(TEngine *pEngine); @@ -64,6 +65,7 @@ namespace ad void AddToSet(Set &set, TImageDataPtr pImageData); void CompareWithSet(const Set &set, TImageDataPtr pOriginal, TImageDataPtr pTransformed, adTransformType transform); + void CompareWithSetGPU(const TImageDataPtrList &list, TImageDataPtr pOriginal, TImageDataPtr pTransformed, adTransformType transform); private: TResultStorage *m_pResult; diff --git a/src/AntiDupl/adImageData.cpp b/src/AntiDupl/adImageData.cpp index 1cc19491..81028a8e 100644 --- a/src/AntiDupl/adImageData.cpp +++ b/src/AntiDupl/adImageData.cpp @@ -51,6 +51,8 @@ namespace ad ratio = 0; valid = false; index = AD_IS_NOT_EXIST; + globalIdx = 0; + pEngine = NULL; defect = AD_DEFECT_UNDEFINE; crc32c = 0; data = NULL; @@ -90,6 +92,8 @@ namespace ad defect = imageData.defect; crc32c = imageData.crc32c; index = imageData.index; + globalIdx = imageData.globalIdx; + pEngine = imageData.pEngine; if(m_owner && imageData.data->side != data->side) { delete data; diff --git a/src/AntiDupl/adImageData.h b/src/AntiDupl/adImageData.h index 425c349a..ebb1e603 100644 --- a/src/AntiDupl/adImageData.h +++ b/src/AntiDupl/adImageData.h @@ -31,12 +31,15 @@ namespace ad { struct TOptions; + class TEngine; //------------------------------------------------------------------------- struct TImageData : public TImageInfo { TInt32 ratio; // Ratio between height and width of image; bool valid; // The Image lie in 'valid' directory; size_t index; // Index of the path from path list where this image were found; + size_t globalIdx; // Unique index for GPU buffer + TEngine* pEngine; // Pointer to engine for GPU access TDefectType defect; TUInt32 crc32c; TPixelDataPtr data; diff --git a/src/AntiDupl/adImageDataStorage.cpp b/src/AntiDupl/adImageDataStorage.cpp index ae8a771b..b8eea24e 100644 --- a/src/AntiDupl/adImageDataStorage.cpp +++ b/src/AntiDupl/adImageDataStorage.cpp @@ -28,7 +28,9 @@ #include "adImageDataStorage.h" #include "adIO.h" #include "adFileStream.h" +#include "adLogger.h" #include "adException.h" +#include "adGPUManager.h" namespace ad { @@ -43,9 +45,11 @@ namespace ad //------------------------------------------------------------------------- TImageDataStorage::TImageDataStorage(TEngine *pEngine) - :m_pStatus(pEngine->Status()), + :m_pEngine(pEngine), + m_pStatus(pEngine->Status()), m_pOptions(pEngine->Options()), - m_needToSave (false) + m_needToSave (false), + m_nextGlobalIdx(0) { } @@ -63,6 +67,16 @@ namespace ad TImageDataStorage::TStorage::iterator TImageDataStorage::Insert(TImageData* pImageData) { + // Check for globalIdx overflow + if (m_nextGlobalIdx >= SIZE_MAX) { +#ifdef AD_LOGGER_ENABLE + AD_LOG("GPU: globalIdx counter overflow, resetting indices..."); +#endif + ResetGpuIndices(); + } + + pImageData->globalIdx = m_nextGlobalIdx++; + pImageData->pEngine = m_pEngine; return m_storage.insert(TStorage::value_type(pImageData->hash, pImageData)); } @@ -71,12 +85,27 @@ namespace ad for(TStorage::iterator it = m_storage.begin(); it != m_storage.end(); ++it) delete it->second; m_storage.clear(); + m_nextGlobalIdx = 0; + if (m_pEngine->GpuManager() && m_pEngine->GpuManager()->IsAvailable()) + { + m_pEngine->GpuManager()->ClearBuffer(); + } + } + + void TImageDataStorage::ResetGpuIndices() + { + m_nextGlobalIdx = 0; + for(TStorage::iterator it = m_storage.begin(); it != m_storage.end(); ++it) + { + it->second->globalIdx = m_nextGlobalIdx++; + } } void TImageDataStorage::Check() { m_pStatus->Reset(); size_t size = m_storage.size(), i = 0; + bool found_deleted = false; for(TStorage::iterator it = m_storage.begin(); it != m_storage.end(); ) { if(m_pStatus->Stopped()) @@ -86,6 +115,7 @@ namespace ad { delete it->second; it = m_storage.erase(it); + found_deleted = true; } else ++it; @@ -93,6 +123,16 @@ namespace ad m_pStatus->SetProgress(i++, size); } m_pStatus->Reset(); + + // Re-index GPU indices after deletions + if (found_deleted) { + ResetGpuIndices(); +#ifdef AD_LOGGER_ENABLE + std::stringstream ss; + ss << "GPU: Re-indexed " << m_storage.size() << " images after cleanup."; + AD_LOG(ss.str().c_str()); +#endif + } } // Загружает в хранилише m_storage переданный файл diff --git a/src/AntiDupl/adImageDataStorage.h b/src/AntiDupl/adImageDataStorage.h index c161a5f8..78187e2c 100644 --- a/src/AntiDupl/adImageDataStorage.h +++ b/src/AntiDupl/adImageDataStorage.h @@ -33,6 +33,7 @@ namespace ad // Хранение информации об изображениях в т.ч. эскизов class TImageDataStorage { + friend class TEngine; public: TImageDataStorage(TEngine *pEngine); ~TImageDataStorage() {ClearMemory();} @@ -47,9 +48,12 @@ namespace ad void Check(); void ClearMemory(); void SetSaveState(const bool needToSave); + void ResetGpuIndices(); - private: typedef std::multimap TStorage; + const TStorage& Storage() const { return m_storage; } + + private: typedef std::vector TVector; TStorage::iterator Find(const TImageInfo& imageInfo); @@ -57,10 +61,12 @@ namespace ad // Информация которую будем записывать. Словарь TImageData TStorage m_storage; + TEngine *m_pEngine; TStatus *m_pStatus; TOptions *m_pOptions; bool m_needToSave; + size_t m_nextGlobalIdx; struct TData { diff --git a/src/AntiDupl/adImageUtils.cpp b/src/AntiDupl/adImageUtils.cpp index 1a3a6776..f1566401 100644 --- a/src/AntiDupl/adImageUtils.cpp +++ b/src/AntiDupl/adImageUtils.cpp @@ -64,18 +64,18 @@ namespace ad int img_stride = (pImage->View()->width) * TView::PixelSize(TView::Bgra32);; TView Bgra(pImage->View()->width, pImage->View()->height, img_stride, TView::Bgra32, NULL); Simd::RgbToBgra(*pImage->View(), Bgra); - Simd::ResizeBilinear(Bgra, view); + Simd::Resize(Bgra, view); } else if (pImage->View()->format == TView::Format::Rgba32) { int img_stride = (pImage->View()->width) * TView::PixelSize(TView::Bgra32);; TView Bgra(pImage->View()->width, pImage->View()->height, TView::Bgra32, NULL); Simd::RgbaToBgra(*pImage->View(), Bgra); - Simd::ResizeBilinear(Bgra, view); + Simd::Resize(Bgra, view); } else { - Simd::ResizeBilinear(*pImage->View(), view); + Simd::Resize(*pImage->View(), view); } result = AD_OK; } diff --git a/src/AntiDupl/adJxl.cpp b/src/AntiDupl/adJxl.cpp index 1da8b868..e1a6a121 100644 --- a/src/AntiDupl/adJxl.cpp +++ b/src/AntiDupl/adJxl.cpp @@ -72,17 +72,16 @@ namespace ad JXL_DEC_FULL_IMAGE)) { #ifdef AD_LOGGER_ENABLE - AD_LOG("JxlDecoderSubscribeEvents failed\n); + AD_LOG("JxlDecoderSubscribeEvents failed\n"); #endif//AD_LOGGER_ENABLE return NULL; } - - if (JXL_DEC_SUCCESS != JxlDecoderSetParallelRunner(decoder.get(), - JxlResizableParallelRunner, - runner.get())) + if (JXL_DEC_SUCCESS != + JxlDecoderSetParallelRunner(decoder.get(), + JxlResizableParallelRunner, runner.get())) { #ifdef AD_LOGGER_ENABLE - AD_LOG("JxlDecoderSetParallelRunner failed\n); + AD_LOG("JxlDecoderSetParallelRunner failed\n"); #endif//AD_LOGGER_ENABLE return NULL; } @@ -118,7 +117,7 @@ namespace ad if (JXL_DEC_SUCCESS != JxlDecoderGetBasicInfo(decoder.get(), &info)) { #ifdef AD_LOGGER_ENABLE - AD_LOG("JxlDecoderGetBasicInfo failed\n); + AD_LOG("JxlDecoderGetBasicInfo failed\n"); #endif//AD_LOGGER_ENABLE return NULL; } @@ -134,10 +133,10 @@ namespace ad size_t icc_size; if (JXL_DEC_SUCCESS != JxlDecoderGetICCProfileSize( - decoder.get(), &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size)) + decoder.get(), JXL_COLOR_PROFILE_TARGET_DATA, &icc_size)) { #ifdef AD_LOGGER_ENABLE - AD_LOG("JxlDecoderGetICCProfileSize failed\n); + AD_LOG("JxlDecoderGetICCProfileSize failed\n"); #endif//AD_LOGGER_ENABLE return NULL; } @@ -156,7 +155,7 @@ namespace ad JxlDecoderImageOutBufferSize(decoder.get(), &format, &buffer_size)) { #ifdef AD_LOGGER_ENABLE - AD_LOG("JxlDecoderImageOutBufferSize failed\n); + AD_LOG("JxlDecoderImageOutBufferSize failed\n"); #endif//AD_LOGGER_ENABLE return NULL; } diff --git a/src/AntiDupl/adOpenJpeg.cpp b/src/AntiDupl/adOpenJpeg.cpp index 82157300..07269242 100644 --- a/src/AntiDupl/adOpenJpeg.cpp +++ b/src/AntiDupl/adOpenJpeg.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #define OPJ_STATIC +#include #include "openjpeg.h" #include "adPerformance.h" @@ -317,19 +318,28 @@ namespace ad { AD_FUNCTION_PERFORMANCE_TEST TView *pView = NULL; - opj_codec_t * codec = opj_create_decompress(OpenJpegCodecFormat(data, size)); + + std::unique_ptr codec( + opj_create_decompress(OpenJpegCodecFormat(data, size)), opj_destroy_codec); + if(codec) { opj_dparameters_t parameters; opj_set_default_decoder_parameters(¶meters); - opj_setup_decoder(codec, ¶meters); - opj_stream_t * stream = СreateBlobStream(data, size); + opj_setup_decoder(codec.get(), ¶meters); + + std::unique_ptr stream( + СreateBlobStream(data, size), opj_stream_destroy); + if(stream) { - opj_image_t * image; - if (opj_read_header(stream, codec, &image)) + opj_image_t * imageRaw = NULL; + if (opj_read_header(stream.get(), codec.get(), &imageRaw)) { - if(opj_decode(codec, stream, image)) + std::unique_ptr image( + imageRaw, opj_image_destroy); + + if(opj_decode(codec.get(), stream.get(), image.get())) { size_t width = image->x1 - image->x0; size_t height = image->y1 - image->y0; @@ -375,11 +385,8 @@ namespace ad AD_PERFORMANCE_TEST_SET_SIZE(width*height) } } - opj_image_destroy(image); } - opj_stream_destroy(stream); } - opj_destroy_codec(codec); } return pView; } diff --git a/src/AntiDupl/adThreadManagement.cpp b/src/AntiDupl/adThreadManagement.cpp index 1a6e4c28..a32d97d7 100644 --- a/src/AntiDupl/adThreadManagement.cpp +++ b/src/AntiDupl/adThreadManagement.cpp @@ -32,6 +32,9 @@ #include "adResult.h" #include "adResultStorage.h" #include "adPerformance.h" +#include + +#define AD_DEBUG(msg) OutputDebugStringA(msg) namespace ad { @@ -257,15 +260,36 @@ namespace ad void TCompareManager::Add(TImageData *pImageData) { + // Check if manager is started + if (m_pThreads == NULL) { + AD_DEBUG("TCompareManager::Add: Not started, skipping\n"); + return; + } + + AD_DEBUG("TCompareManager::Add: Starting\n"); + if(CanCompare(pImageData)) { + AD_DEBUG("TCompareManager::Add: CanCompare is true\n"); + TCriticalSection::TLocker locker(m_pCS); size_t threadId = m_addCounter%m_pThreads->size(); + AD_DEBUG("TCompareManager::Add: Pushing to threads\n"); + for(TThreads::iterator i = m_pThreads->begin(); i != m_pThreads->end(); i++) i->task->Queue()->Push(pImageData, threadId); + + AD_DEBUG("TCompareManager::Add: Pushed to all threads\n"); + m_pEngine->Status()->Assign(AD_THREAD_TYPE_COMPARE, threadId); m_addCounter++; } + else + { + AD_DEBUG("TCompareManager::Add: CanCompare is false\n"); + } + + AD_DEBUG("TCompareManager::Add: Finished\n"); } size_t TCompareManager::DefaultThreadCount(size_t imageCount) @@ -316,21 +340,46 @@ namespace ad void TCollectManager::Add(TImageData *pImageData) { + AD_DEBUG("TCollectManager::Add: Starting\n"); + if(pImageData->DefectCheckingNeed(m_pOptions) || pImageData->PixelDataFillingNeed(m_pOptions) || pImageData->crc32c == 0) { + AD_DEBUG("TCollectManager::Add: Loading file to memory\n"); pImageData->hGlobal = LoadFileToMemory(pImageData->path.Original().c_str()); + AD_DEBUG("TCollectManager::Add: File loaded\n"); + size_t threadId = GetThreadId(); + AD_DEBUG("TCollectManager::Add: Got threadId\n"); + m_pThreads->at(threadId).task->Queue()->Push(pImageData, threadId); + AD_DEBUG("TCollectManager::Add: Pushed to queue\n"); + m_pEngine->Status()->Assign(AD_THREAD_TYPE_COLLECT, threadId); } else { + AD_DEBUG("TCollectManager::Add: Using cached data\n"); TDefectType defect = pImageData->GetDefect(m_pOptions); if(defect > AD_DEFECT_NONE) m_pEngine->Result()->AddDefectImage(pImageData, defect); + + AD_DEBUG("TCollectManager::Add: Calling FillOther\n"); pImageData->FillOther(m_pOptions); - m_pCompareManager->Add(pImageData); + + // Skip comparison if GPU AllVsAll mode is enabled + if (m_pEngine->SkipComparisonDuringCollection()) + { + AD_DEBUG("TCollectManager::Add: Skipping comparison (GPU mode)\n"); + } + else + { + AD_DEBUG("TCollectManager::Add: Calling CompareManager->Add\n"); + m_pCompareManager->Add(pImageData); + AD_DEBUG("TCollectManager::Add: CompareManager->Add returned\n"); + } } + + AD_DEBUG("TCollectManager::Add: Finished\n"); } size_t TCollectManager::DefaultThreadCount() From 934a123835d514a3bd5bad67c1bf15e91fef5ee7 Mon Sep 17 00:00:00 2001 From: Antigravity Date: Sun, 5 Apr 2026 22:15:34 +0400 Subject: [PATCH 2/6] fix: 5 critical GPU AllVsAll bug fixes + performance optimizations --- src/AntiDupl/adEngine.cpp | 22 ++++---- src/AntiDupl/adGPU.cu | 78 ++++++++++++++++++----------- src/AntiDupl/adThreadManagement.cpp | 7 ++- 3 files changed, 68 insertions(+), 39 deletions(-) diff --git a/src/AntiDupl/adEngine.cpp b/src/AntiDupl/adEngine.cpp index a5985fd7..2c5ee415 100644 --- a/src/AntiDupl/adEngine.cpp +++ b/src/AntiDupl/adEngine.cpp @@ -267,6 +267,16 @@ namespace ad AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: %zu valid thumbnails\n", validCount); + // Создаём vector для O(1) доступа по индексу (вместо O(N) std::advance) + std::vector imageByIndex(count); + idx = 0; + for (TImageDataStorage::TStorage::const_iterator it = storage.begin(); it != storage.end(); ++it, ++idx) { + TImageDataPtr pImageData = it->second; + if (pImageData->data && pImageData->data->filled && pImageData->data->main != nullptr) { + imageByIndex[idx] = pImageData; + } + } + // Вычисляем threshold как в оригинальном TImageComparer int thresholdPerPixel = Simd::Square(m_pOptions->compare.thresholdDifference * PIXEL_MAX_DIFFERENCE) / Simd::Square(DENOMINATOR); @@ -294,16 +304,10 @@ namespace ad { AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: GPU returned %zu matches\n", matchCount); - // Обрабатываем результаты + // Обрабатываем результаты — O(1) доступ вместо O(N) std::advance for (size_t i = 0; i < matchCount; i++) { - // Находим изображения по индексам - auto it1 = storage.begin(); - std::advance(it1, outImage1[i]); - TImageDataPtr pImage1 = it1->second; - - auto it2 = storage.begin(); - std::advance(it2, outImage2[i]); - TImageDataPtr pImage2 = it2->second; + TImageDataPtr pImage1 = imageByIndex[outImage1[i]]; + TImageDataPtr pImage2 = imageByIndex[outImage2[i]]; double maxDifference = (double)(Simd::Square(PIXEL_MAX_DIFFERENCE) * thumbSize); double difference = sqrt((double)outDifference[i] / maxDifference) * 100; diff --git a/src/AntiDupl/adGPU.cu b/src/AntiDupl/adGPU.cu index e5058330..076b0a03 100644 --- a/src/AntiDupl/adGPU.cu +++ b/src/AntiDupl/adGPU.cu @@ -76,7 +76,8 @@ namespace ad if (tid == 0) atomicAdd(pResult, shared_data[0]); } - // NEW: AllVsAll kernel — каждый блок обрабатывает одну строку (один i), каждый поток — один j + // NEW: AllVsAll kernel — grid-stride loop + shared memory оптимизация + // Каждый блок обрабатывает несколько строк i с шагом gridDim.x __global__ void AllVsAllKernel( const uint8_t* thumbnails, // Все thumbnails в VRAM size_t thumbSize, // Размер одного thumbnail (1024) @@ -85,32 +86,42 @@ namespace ad Match* results, // Sparse buffer для результатов size_t* matchCount) // Atomic counter { - // Каждый блок обрабатывает одну строку i - size_t i = blockIdx.x; - if (i >= count) return; - - const uint8_t* thumb1 = thumbnails + i * thumbSize; - - // Каждый поток обрабатывает несколько j > i с stride - size_t numThreads = blockDim.x; - - for (size_t j = i + 1 + threadIdx.x; j < count; j += numThreads) { - const uint8_t* thumb2 = thumbnails + j * thumbSize; - - // Вычисляем squared difference - double sumSqDiff = 0; - for (size_t p = 0; p < thumbSize; p++) { - double diff = (double)thumb1[p] - (double)thumb2[p]; - sumSqDiff += diff * diff; + // Shared memory для thumb1 — ускоряет чтение в 10-100 раз + // Максимум 1024 байта (32x32 thumbnail) + extern __shared__ uint8_t shared_thumb[]; + + // Grid-stride loop: каждый блок обрабатывает несколько строк i + for (size_t i = blockIdx.x; i < count; i += gridDim.x) { + const uint8_t* thumb1_global = thumbnails + i * thumbSize; + + // Загружаем thumb1 в shared memory кооперативно + for (size_t p = threadIdx.x; p < thumbSize; p += blockDim.x) { + shared_thumb[p] = thumb1_global[p]; } + __syncthreads(); - // Если ниже threshold — записываем результат - if (sumSqDiff <= threshold) { - size_t idx = atomicAdd(matchCount, (size_t)1); - results[idx].image1 = (uint32_t)i; - results[idx].image2 = (uint32_t)j; - results[idx].difference = (float)sumSqDiff; + // Каждый поток обрабатывает несколько j > i с stride + size_t numThreads = blockDim.x; + + for (size_t j = i + 1 + threadIdx.x; j < count; j += numThreads) { + const uint8_t* thumb2 = thumbnails + j * thumbSize; + + // Вычисляем squared difference из shared memory + double sumSqDiff = 0; + for (size_t p = 0; p < thumbSize; p++) { + double diff = (double)shared_thumb[p] - (double)thumb2[p]; + sumSqDiff += diff * diff; + } + + // Если ниже threshold — записываем результат + if (sumSqDiff <= threshold) { + size_t idx = atomicAdd(matchCount, (size_t)1); + results[idx].image1 = (uint32_t)i; + results[idx].image2 = (uint32_t)j; + results[idx].difference = (float)sumSqDiff; + } } + __syncthreads(); // Синхронизация перед следующей итерацией i } } @@ -656,19 +667,28 @@ namespace ad // 5. Инициализируем counter size_t h_matchCount = 0; - cudaMemcpy(d_matchCount, &h_matchCount, sizeof(size_t), cudaMemcpyHostToDevice); + err = cudaMemcpy(d_matchCount, &h_matchCount, sizeof(size_t), cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Counter init failed: %s\n", cudaGetErrorString(err)); + cudaFree(d_thumbnails); + cudaFree(d_results); + cudaFree(d_matchCount); + return false; + } // 6. Запускаем kernel AD_DEBUG("GpuCompareAllVsAll: Launching kernel\n"); int threadsPerBlock = 256; - // Теперь каждый блок = одна строка i, так что blocks = count + // Grid-stride loop: используем максимум блоков для параллелизма size_t blocks = count; - if (blocks > 65535) blocks = 65535; // Ограничение CUDA + if (blocks > 65535) blocks = 65535; // Максимум CUDA grid size + if (blocks == 0) blocks = 1; // Минимум 1 блок - AD_DEBUG_FMT("GpuCompareAllVsAll: Launching %zu blocks, %d threads/block\n", blocks, threadsPerBlock); + AD_DEBUG_FMT("GpuCompareAllVsAll: Launching %zu blocks, %d threads/block (grid-stride + shared mem)\n", blocks, threadsPerBlock); - AllVsAllKernel<<<(int)blocks, threadsPerBlock>>>( + // Передаём размер shared memory динамически + AllVsAllKernel<<<(int)blocks, threadsPerBlock, thumbSize>>>( d_thumbnails, thumbSize, count, threshold, d_results, d_matchCount); err = cudaGetLastError(); diff --git a/src/AntiDupl/adThreadManagement.cpp b/src/AntiDupl/adThreadManagement.cpp index a32d97d7..5eab72a7 100644 --- a/src/AntiDupl/adThreadManagement.cpp +++ b/src/AntiDupl/adThreadManagement.cpp @@ -174,7 +174,12 @@ namespace ad void TCollectTask::DoOwn(TImageData *pImageData) { m_pDataCollector->Fill(pImageData); - m_pCompareManager->Add(pImageData); + + // Skip CPU comparison if GPU AllVsAll mode is enabled + if (!m_pEngine->SkipComparisonDuringCollection()) { + m_pCompareManager->Add(pImageData); + } + m_pStatus->Process(AD_THREAD_TYPE_COLLECT, Queue()->Id(), pImageData->path.Original().c_str()); } //------------------------------------------------------------------------- From 6febedf209054ec95aadcba6f2836e0643660814 Mon Sep 17 00:00:00 2001 From: Antigravity Date: Sun, 5 Apr 2026 22:51:45 +0400 Subject: [PATCH 3/6] perf: GPU streaming processing, CRC check on GPU, collection optimization --- src/AntiDupl/adDataCollector.cpp | 6 +- src/AntiDupl/adEngine.cpp | 121 +++++++++++++--------- src/AntiDupl/adGPU.cu | 151 +++++++++++++++++----------- src/AntiDupl/adGPU.h | 26 +++-- src/AntiDupl/adGPUManager.h | 19 ++-- src/AntiDupl/adThreadManagement.cpp | 7 ++ 6 files changed, 206 insertions(+), 124 deletions(-) diff --git a/src/AntiDupl/adDataCollector.cpp b/src/AntiDupl/adDataCollector.cpp index e778218f..858acb3e 100644 --- a/src/AntiDupl/adDataCollector.cpp +++ b/src/AntiDupl/adDataCollector.cpp @@ -110,7 +110,9 @@ namespace ad ReduceGray2x2(*m_pGrayBuffers.back(), TView(data.side, data.side, data.side, TView::Gray8, data.main)); data.filled = true; - if (m_pEngine->GpuManager() && m_pEngine->GpuManager()->IsAvailable()) + // Upload thumbnail to GPU — only for non-AllVsAll mode + // In AllVsAll mode, ExecuteGpuAllVsAllComparison does its own mass upload + if (!m_pEngine->SkipComparisonDuringCollection()) { // Ensure GPU buffer is initialized before first upload static bool gpuBufferInitialized = false; @@ -129,7 +131,7 @@ namespace ad AD_DEBUG("FillPixelData: GPU buffer initialization FAILED\n"); } } - + // Upload thumbnail to GPU immediately if (m_pEngine->GpuManager()->UploadThumbnail(pImageData->globalIdx, data.main)) { diff --git a/src/AntiDupl/adEngine.cpp b/src/AntiDupl/adEngine.cpp index 2c5ee415..5ad99c05 100644 --- a/src/AntiDupl/adEngine.cpp +++ b/src/AntiDupl/adEngine.cpp @@ -37,6 +37,7 @@ #include "adLogger.h" #include "adFileUtils.h" #include "adGPUManager.h" +#include "adGPU.h" #include "adStatus.h" #include #include @@ -230,7 +231,31 @@ namespace ad AD_DEBUG("UpdateGpuDatabase: Finished\n"); } - // NEW: GPU AllVsAll comparison + // Структура для контекста callback + struct MatchProcessContext { + TEngine* engine; + const std::vector* imageByIndex; + size_t thumbSize; + double maxDifference; + size_t totalProcessed; + size_t bufferFullCount; + }; + + // Callback функция для streaming обработки matches + static void MatchCallback(const void* batch, size_t count, void* context) { + MatchProcessContext* ctx = (MatchProcessContext*)context; + const Match* matches = (const Match*)batch; + + for (size_t i = 0; i < count; i++) { + TImageDataPtr pImage1 = ctx->imageByIndex->at(matches[i].image1); + TImageDataPtr pImage2 = ctx->imageByIndex->at(matches[i].image2); + + ctx->engine->Result()->AddDuplImagePair(pImage1, pImage2, matches[i].difference, AD_TRANSFORM_TURN_0); + ctx->totalProcessed++; + } + } + + // NEW: GPU AllVsAll comparison с streaming processing void TEngine::ExecuteGpuAllVsAllComparison() { AD_DEBUG("ExecuteGpuAllVsAllComparison: Starting\n"); @@ -252,8 +277,10 @@ namespace ad AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: Preparing data for %zu images\n", count); - // Собираем все thumbnails в один массив + // Собираем все thumbnails и CRC в один проход std::vector allThumbnails(count * thumbSize); + std::vector allCrcArray(count); + std::vector imageByIndex(count); size_t validCount = 0; size_t idx = 0; @@ -261,68 +288,64 @@ namespace ad TImageDataPtr pImageData = it->second; if (pImageData->data && pImageData->data->filled && pImageData->data->main != nullptr) { memcpy(&allThumbnails[idx * thumbSize], pImageData->data->main, thumbSize); + allCrcArray[idx] = pImageData->crc32c; + imageByIndex[idx] = pImageData; validCount++; } } AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: %zu valid thumbnails\n", validCount); - // Создаём vector для O(1) доступа по индексу (вместо O(N) std::advance) - std::vector imageByIndex(count); - idx = 0; - for (TImageDataStorage::TStorage::const_iterator it = storage.begin(); it != storage.end(); ++it, ++idx) { - TImageDataPtr pImageData = it->second; - if (pImageData->data && pImageData->data->filled && pImageData->data->main != nullptr) { - imageByIndex[idx] = pImageData; - } - } - - // Вычисляем threshold как в оригинальном TImageComparer + // Вычисляем threshold и maxDifference как в оригинальном TImageComparer int thresholdPerPixel = Simd::Square(m_pOptions->compare.thresholdDifference * PIXEL_MAX_DIFFERENCE) / Simd::Square(DENOMINATOR); int mainThreshold = (int)(thumbSize * thresholdPerPixel); double threshold = (double)mainThreshold; - - AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: thresholdPerPixel=%d, mainThreshold=%d, threshold=%f\n", - thresholdPerPixel, mainThreshold, threshold); - - // Выделяем память для результатов (предполагаем ~5% дубликатов) - size_t maxMatches = count * (count - 1) / 2; - if (maxMatches > 10000000) maxMatches = 10000000; // Ограничиваем 10M - - std::vector outImage1(maxMatches); - std::vector outImage2(maxMatches); - std::vector outDifference(maxMatches); - size_t matchCount = 0; - - AD_DEBUG("ExecuteGpuAllVsAllComparison: Calling GPU\n"); - - if (m_pGpuManager->CompareAllVsAll( - allThumbnails.data(), count, thumbSize, threshold, - outImage1.data(), outImage2.data(), outDifference.data(), - &matchCount, maxMatches)) - { - AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: GPU returned %zu matches\n", matchCount); - - // Обрабатываем результаты — O(1) доступ вместо O(N) std::advance - for (size_t i = 0; i < matchCount; i++) { - TImageDataPtr pImage1 = imageByIndex[outImage1[i]]; - TImageDataPtr pImage2 = imageByIndex[outImage2[i]]; - - double maxDifference = (double)(Simd::Square(PIXEL_MAX_DIFFERENCE) * thumbSize); - double difference = sqrt((double)outDifference[i] / maxDifference) * 100; - if (pImage1->crc32c != pImage2->crc32c) - difference += ADDITIONAL_DIFFERENCE_FOR_DIFFERENT_CRC32; - - m_pResult->AddDuplImagePair(pImage1, pImage2, difference, AD_TRANSFORM_TURN_0); - } - - AD_DEBUG("ExecuteGpuAllVsAllComparison: Results processed\n"); + double maxDifference = (double)(Simd::Square(PIXEL_MAX_DIFFERENCE) * thumbSize); + + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: threshold=%f, maxDifference=%f\n", threshold, maxDifference); + + // Streaming processing context + MatchProcessContext ctx; + ctx.engine = this; + ctx.imageByIndex = &imageByIndex; + ctx.thumbSize = thumbSize; + ctx.maxDifference = maxDifference; + ctx.totalProcessed = 0; + ctx.bufferFullCount = 0; + + // Batch size для streaming readback: 5M matches = 60MB RAM + const size_t BATCH_MATCHES = 5000000; + + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: Calling GPU (batch size: %zu)\n", BATCH_MATCHES); + + bool success = m_pGpuManager->CompareAllVsAll( + allThumbnails.data(), + allCrcArray.data(), + count, + thumbSize, + threshold, + maxDifference, + ADDITIONAL_DIFFERENCE_FOR_DIFFERENT_CRC32, + &ctx, + MatchCallback, + BATCH_MATCHES); + + if (success) { + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: Processed %zu total matches\n", ctx.totalProcessed); } else { AD_DEBUG("ExecuteGpuAllVsAllComparison: GPU comparison FAILED\n"); } + // Освобождаем большую память заранее + allThumbnails.clear(); + allThumbnails.shrink_to_fit(); + allCrcArray.clear(); + allCrcArray.shrink_to_fit(); + imageByIndex.clear(); + imageByIndex.shrink_to_fit(); + AD_DEBUG("ExecuteGpuAllVsAllComparison: Finished\n"); } diff --git a/src/AntiDupl/adGPU.cu b/src/AntiDupl/adGPU.cu index 076b0a03..96a40d74 100644 --- a/src/AntiDupl/adGPU.cu +++ b/src/AntiDupl/adGPU.cu @@ -50,13 +50,6 @@ namespace ad // --- Kernels --- - // Match structure for sparse results - struct Match { - uint32_t image1; - uint32_t image2; - float difference; - }; - __global__ void SquaredSumKernel(const uint8_t* pSrc1, const uint8_t* pSrc2, size_t size, double* pResult) { extern __shared__ double shared_data[]; @@ -76,18 +69,20 @@ namespace ad if (tid == 0) atomicAdd(pResult, shared_data[0]); } - // NEW: AllVsAll kernel — grid-stride loop + shared memory оптимизация + // NEW: AllVsAll kernel — grid-stride loop + shared memory + CRC check + final difference // Каждый блок обрабатывает несколько строк i с шагом gridDim.x __global__ void AllVsAllKernel( - const uint8_t* thumbnails, // Все thumbnails в VRAM + const uint8_t* thumbnails, // Все thumbnails в VRAM + const uint64_t* crcArray, // CRC32c для каждого изображения size_t thumbSize, // Размер одного thumbnail (1024) size_t count, // Общее количество изображений - double threshold, // Порог для дубликатов + double threshold, // Порог squared difference + double maxDifference, // Максимальная разница для нормализации + double addDiffForCrcMismatch, // Добавка за несовпадение CRC Match* results, // Sparse buffer для результатов size_t* matchCount) // Atomic counter { // Shared memory для thumb1 — ускоряет чтение в 10-100 раз - // Максимум 1024 байта (32x32 thumbnail) extern __shared__ uint8_t shared_thumb[]; // Grid-stride loop: каждый блок обрабатывает несколько строк i @@ -113,12 +108,20 @@ namespace ad sumSqDiff += diff * diff; } - // Если ниже threshold — записываем результат + // Проверяем threshold ДО нормализации (как в CPU версии) if (sumSqDiff <= threshold) { + // Вычисляем финальный difference в процентах + double difference = sqrt(sumSqDiff / maxDifference) * 100.0; + + // Добавляем штраф за несовпадение CRC + if (crcArray[i] != crcArray[j]) { + difference += addDiffForCrcMismatch; + } + size_t idx = atomicAdd(matchCount, (size_t)1); results[idx].image1 = (uint32_t)i; results[idx].image2 = (uint32_t)j; - results[idx].difference = (float)sumSqDiff; + results[idx].difference = (float)difference; } } __syncthreads(); // Синхронизация перед следующей итерацией i @@ -598,30 +601,32 @@ namespace ad return h_r; } - // NEW: AllVsAll comparison с массовым upload + // NEW: AllVsAll comparison с массовым upload + streaming callback bool GpuCompareAllVsAll( - const uint8_t* allThumbnails, // Все thumbnails в RAM (непрерывный массив) + const uint8_t* allThumbnails, // Все thumbnails в RAM + const uint64_t* allCrcArray, // CRC32c для каждого изображения size_t count, // Количество изображений size_t thumbSize, // Размер одного thumbnail (1024) - double threshold, // Порог для дубликатов - uint32_t* outImage1, // Массив для image1 (результат) - uint32_t* outImage2, // Массив для image2 (результат) - float* outDifference, // Массив для difference (результат) - size_t* outMatchCount, // Количество найденных дубликатов - size_t maxMatches) // Максимальное количество результатов + double threshold, // Порог squared difference + double maxDifference, // Максимальная разница для нормализации + double addDiffForCrcMismatch, // Добавка за несовпадение CRC + void* callbackContext, // Контекст для callback + GpuMatchCallback callback, // Callback для streaming обработки + size_t maxMatchesPerBatch) // Максимум matches за один вызов { AD_DEBUG("GpuCompareAllVsAll: Starting\n"); - if (!allThumbnails || count == 0 || thumbSize == 0 || !outImage1 || !outImage2 || !outDifference || !outMatchCount) { + if (!allThumbnails || !allCrcArray || count == 0 || thumbSize == 0 || !callback) { AD_DEBUG("GpuCompareAllVsAll: Invalid parameters\n"); return false; } size_t totalPairs = count * (count - 1) / 2; - AD_DEBUG_FMT("GpuCompareAllVsAll: Comparing %zu images, %zu pairs, threshold=%f\n", count, totalPairs, threshold); + AD_DEBUG_FMT("GpuCompareAllVsAll: Comparing %zu images, %zu pairs\n", count, totalPairs); // Выделяем VRAM для thumbnails uint8_t* d_thumbnails = nullptr; + uint64_t* d_crcArray = nullptr; Match* d_results = nullptr; size_t* d_matchCount = nullptr; @@ -635,108 +640,142 @@ namespace ad return false; } - // 2. Выделяем память для результатов (sparse buffer) + // 2. Выделяем память для CRC массива + AD_DEBUG("GpuCompareAllVsAll: Allocating VRAM for CRC array\n"); + err = cudaMalloc(&d_crcArray, count * sizeof(uint64_t)); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Failed to allocate CRC VRAM: %s\n", cudaGetErrorString(err)); + cudaFree(d_thumbnails); + return false; + } + + // 3. Выделяем память для результатов (batch buffer) AD_DEBUG("GpuCompareAllVsAll: Allocating VRAM for results\n"); - err = cudaMalloc(&d_results, maxMatches * sizeof(Match)); + err = cudaMalloc(&d_results, maxMatchesPerBatch * sizeof(Match)); if (err != cudaSuccess) { AD_DEBUG_FMT("GpuCompareAllVsAll: Failed to allocate results VRAM: %s\n", cudaGetErrorString(err)); cudaFree(d_thumbnails); + cudaFree(d_crcArray); return false; } - // 3. Выделяем память для counter + // 4. Выделяем память для counter err = cudaMalloc(&d_matchCount, sizeof(size_t)); if (err != cudaSuccess) { AD_DEBUG("GpuCompareAllVsAll: Failed to allocate counter VRAM\n"); cudaFree(d_thumbnails); + cudaFree(d_crcArray); cudaFree(d_results); return false; } - // 4. Один массовый upload всех thumbnails + // 5. Upload всех thumbnails в VRAM AD_DEBUG("GpuCompareAllVsAll: Uploading all thumbnails to VRAM\n"); err = cudaMemcpy(d_thumbnails, allThumbnails, count * thumbSize, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - AD_DEBUG_FMT("GpuCompareAllVsAll: Upload failed: %s\n", cudaGetErrorString(err)); + AD_DEBUG_FMT("GpuCompareAllVsAll: Upload thumbnails failed: %s\n", cudaGetErrorString(err)); cudaFree(d_thumbnails); + cudaFree(d_crcArray); cudaFree(d_results); cudaFree(d_matchCount); return false; } - AD_DEBUG("GpuCompareAllVsAll: Upload complete\n"); + AD_DEBUG("GpuCompareAllVsAll: Upload thumbnails complete\n"); - // 5. Инициализируем counter + // 6. Upload CRC массива + AD_DEBUG("GpuCompareAllVsAll: Uploading CRC array to VRAM\n"); + err = cudaMemcpy(d_crcArray, allCrcArray, count * sizeof(uint64_t), cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Upload CRC failed: %s\n", cudaGetErrorString(err)); + cudaFree(d_thumbnails); + cudaFree(d_crcArray); + cudaFree(d_results); + cudaFree(d_matchCount); + return false; + } + AD_DEBUG("GpuCompareAllVsAll: Upload CRC complete\n"); + + // 7. Инициализируем counter size_t h_matchCount = 0; err = cudaMemcpy(d_matchCount, &h_matchCount, sizeof(size_t), cudaMemcpyHostToDevice); if (err != cudaSuccess) { AD_DEBUG_FMT("GpuCompareAllVsAll: Counter init failed: %s\n", cudaGetErrorString(err)); cudaFree(d_thumbnails); + cudaFree(d_crcArray); cudaFree(d_results); cudaFree(d_matchCount); return false; } - // 6. Запускаем kernel + // 8. Запускаем kernel AD_DEBUG("GpuCompareAllVsAll: Launching kernel\n"); int threadsPerBlock = 256; - // Grid-stride loop: используем максимум блоков для параллелизма size_t blocks = count; - if (blocks > 65535) blocks = 65535; // Максимум CUDA grid size - if (blocks == 0) blocks = 1; // Минимум 1 блок + if (blocks > 65535) blocks = 65535; + if (blocks == 0) blocks = 1; - AD_DEBUG_FMT("GpuCompareAllVsAll: Launching %zu blocks, %d threads/block (grid-stride + shared mem)\n", blocks, threadsPerBlock); + AD_DEBUG_FMT("GpuCompareAllVsAll: Launching %zu blocks, %d threads/block\n", blocks, threadsPerBlock); - // Передаём размер shared memory динамически AllVsAllKernel<<<(int)blocks, threadsPerBlock, thumbSize>>>( - d_thumbnails, thumbSize, count, threshold, d_results, d_matchCount); + d_thumbnails, d_crcArray, thumbSize, count, threshold, maxDifference, addDiffForCrcMismatch, + d_results, d_matchCount); err = cudaGetLastError(); if (err != cudaSuccess) { AD_DEBUG_FMT("GpuCompareAllVsAll: Kernel launch failed: %s\n", cudaGetErrorString(err)); cudaFree(d_thumbnails); + cudaFree(d_crcArray); cudaFree(d_results); cudaFree(d_matchCount); return false; } - // 7. Ждём завершения + // 9. Ждём завершения kernel AD_DEBUG("GpuCompareAllVsAll: Synchronizing\n"); err = cudaDeviceSynchronize(); if (err != cudaSuccess) { AD_DEBUG_FMT("GpuCompareAllVsAll: Sync failed: %s\n", cudaGetErrorString(err)); cudaFree(d_thumbnails); + cudaFree(d_crcArray); cudaFree(d_results); cudaFree(d_matchCount); return false; } + AD_DEBUG("GpuCompareAllVsAll: Kernel complete\n"); - // 8. Считываем counter - AD_DEBUG("GpuCompareAllVsAll: Reading match count\n"); + // 10. Считываем total match count cudaMemcpy(&h_matchCount, d_matchCount, sizeof(size_t), cudaMemcpyDeviceToHost); + AD_DEBUG_FMT("GpuCompareAllVsAll: Found %zu total matches\n", h_matchCount); - AD_DEBUG_FMT("GpuCompareAllVsAll: Found %zu matches\n", h_matchCount); - - // 9. Считываем результаты + // 11. Streaming readback — читаем батчами и вызываем callback if (h_matchCount > 0) { - size_t readCount = (h_matchCount < maxMatches) ? h_matchCount : maxMatches; - AD_DEBUG_FMT("GpuCompareAllVsAll: Reading %zu results\n", readCount); - - std::vector h_results(readCount); - cudaMemcpy(h_results.data(), d_results, readCount * sizeof(Match), cudaMemcpyDeviceToHost); + std::vector h_batch(maxMatchesPerBatch); + size_t remaining = h_matchCount; + size_t offset = 0; + + while (remaining > 0) { + size_t batchSize = (remaining < maxMatchesPerBatch) ? remaining : maxMatchesPerBatch; + AD_DEBUG_FMT("GpuCompareAllVsAll: Reading batch %zu matches (offset %zu)\n", batchSize, offset); + + err = cudaMemcpy(h_batch.data(), d_results + offset, batchSize * sizeof(Match), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + AD_DEBUG_FMT("GpuCompareAllVsAll: Readback failed: %s\n", cudaGetErrorString(err)); + break; + } - // Копируем в выходные массивы - for (size_t i = 0; i < readCount; i++) { - outImage1[i] = h_results[i].image1; - outImage2[i] = h_results[i].image2; - outDifference[i] = h_results[i].difference; + // Вызываем callback с батчем + callback(h_batch.data(), batchSize, callbackContext); + + remaining -= batchSize; + offset += batchSize; } + AD_DEBUG("GpuCompareAllVsAll: All batches processed\n"); } - *outMatchCount = h_matchCount; - - // 10. Освобождаем VRAM + // 12. Освобождаем VRAM cudaFree(d_thumbnails); + cudaFree(d_crcArray); cudaFree(d_results); cudaFree(d_matchCount); diff --git a/src/AntiDupl/adGPU.h b/src/AntiDupl/adGPU.h index 1299a68b..caba22aa 100644 --- a/src/AntiDupl/adGPU.h +++ b/src/AntiDupl/adGPU.h @@ -37,6 +37,13 @@ namespace ad bool isCompatible; }; + // Match result structure + struct Match { + uint32_t image1; + uint32_t image2; + float difference; + }; + bool GpuInit(GpuDeviceInfo* pInfo); void GpuRelease(); @@ -57,17 +64,20 @@ namespace ad // Single comparison (optimized, no malloc inside) double GpuCompareSquaredSum(const uint8_t* pSrc1, const uint8_t* pSrc2, size_t size); - // NEW: AllVsAll comparison с массовым upload + // NEW: AllVsAll comparison с streaming callback + typedef void (*GpuMatchCallback)(const void* batch, size_t count, void* context); + bool GpuCompareAllVsAll( - const uint8_t* allThumbnails, // Все thumbnails в RAM (непрерывный массив) + const uint8_t* allThumbnails, // Все thumbnails в RAM + const uint64_t* allCrcArray, // CRC32c для каждого изображения size_t count, // Количество изображений size_t thumbSize, // Размер одного thumbnail (1024) - double threshold, // Порог для дубликатов - uint32_t* outImage1, // Массив для image1 (результат) - uint32_t* outImage2, // Массив для image2 (результат) - float* outDifference, // Массив для difference (результат) - size_t* outMatchCount, // Количество найденных дубликатов - size_t maxMatches); // Максимальное количество результатов + double threshold, // Порог squared difference + double maxDifference, // Максимальная разница для нормализации + double addDiffForCrcMismatch, // Добавка за несовпадение CRC + void* callbackContext, // Контекст для callback + GpuMatchCallback callback, // Callback для streaming обработки + size_t maxMatchesPerBatch); // Максимум matches за один вызов } #endif//__adGPU_h__ diff --git a/src/AntiDupl/adGPUManager.h b/src/AntiDupl/adGPUManager.h index 6ec1f0d8..aee97d73 100644 --- a/src/AntiDupl/adGPUManager.h +++ b/src/AntiDupl/adGPUManager.h @@ -70,22 +70,23 @@ namespace ad bool EnsureCapacity(size_t required, size_t thumbSize); - // NEW: AllVsAll comparison + // NEW: AllVsAll comparison с streaming callback bool CompareAllVsAll( const uint8_t* allThumbnails, + const uint64_t* allCrcArray, size_t count, size_t thumbSize, double threshold, - uint32_t* outImage1, - uint32_t* outImage2, - float* outDifference, - size_t* outMatchCount, - size_t maxMatches) { + double maxDifference, + double addDiffForCrcMismatch, + void* callbackContext, + GpuMatchCallback callback, + size_t maxMatchesPerBatch) { if (!m_available) return false; std::lock_guard lock(m_mutex); - return GpuCompareAllVsAll(allThumbnails, count, thumbSize, threshold, - outImage1, outImage2, outDifference, - outMatchCount, maxMatches); + return GpuCompareAllVsAll(allThumbnails, allCrcArray, count, thumbSize, + threshold, maxDifference, addDiffForCrcMismatch, + callbackContext, callback, maxMatchesPerBatch); } private: diff --git a/src/AntiDupl/adThreadManagement.cpp b/src/AntiDupl/adThreadManagement.cpp index 5eab72a7..901386c4 100644 --- a/src/AntiDupl/adThreadManagement.cpp +++ b/src/AntiDupl/adThreadManagement.cpp @@ -390,6 +390,13 @@ namespace ad size_t TCollectManager::DefaultThreadCount() { size_t threadCountMax = GetProcessorCount(); + + // In GPU AllVsAll mode, maximize collection threads (leave 1 core for UI) + if (m_pEngine->SkipComparisonDuringCollection()) + { + return Simd::Max((size_t)1, threadCountMax - 1); + } + #ifdef AD_TURBO_JPEG_ENABLE return Simd::Max((size_t)1, threadCountMax / 2); #else From 526e98835cd32bd5ae9cef6f960909d866850c2e Mon Sep 17 00:00:00 2001 From: Antigravity Date: Sun, 5 Apr 2026 23:24:13 +0400 Subject: [PATCH 4/6] fix: null pointer safety in GPU streaming callback --- src/AntiDupl/adEngine.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/AntiDupl/adEngine.cpp b/src/AntiDupl/adEngine.cpp index 5ad99c05..a24057a6 100644 --- a/src/AntiDupl/adEngine.cpp +++ b/src/AntiDupl/adEngine.cpp @@ -247,8 +247,19 @@ namespace ad const Match* matches = (const Match*)batch; for (size_t i = 0; i < count; i++) { + // Проверяем индексы на валидность + if (matches[i].image1 >= ctx->imageByIndex->size() || + matches[i].image2 >= ctx->imageByIndex->size()) { + continue; + } + TImageDataPtr pImage1 = ctx->imageByIndex->at(matches[i].image1); TImageDataPtr pImage2 = ctx->imageByIndex->at(matches[i].image2); + + // Пропускаем пары с nullptr (изображения без данных) + if (!pImage1 || !pImage2) { + continue; + } ctx->engine->Result()->AddDuplImagePair(pImage1, pImage2, matches[i].difference, AD_TRANSFORM_TURN_0); ctx->totalProcessed++; From 41c4911a46ec811eeaba51ac3ccfec571f5e4895 Mon Sep 17 00:00:00 2001 From: Antigravity Date: Sun, 5 Apr 2026 23:34:12 +0400 Subject: [PATCH 5/6] perf: scaled JPEG decode for faster collection --- src/AntiDupl/adImage.cpp | 2 +- src/AntiDupl/adTurboJpeg.cpp | 32 +++++++++++++++++++++++++------- src/AntiDupl/adTurboJpeg.h | 2 +- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/AntiDupl/adImage.cpp b/src/AntiDupl/adImage.cpp index a409879e..8d8a2ed7 100644 --- a/src/AntiDupl/adImage.cpp +++ b/src/AntiDupl/adImage.cpp @@ -112,7 +112,7 @@ namespace ad return THeif::Load(hGlobal); #ifdef AD_TURBO_JPEG_ENABLE if (pOptions->advanced.useLibJpegTurbo && TTurboJpeg::Supported(hGlobal)) - return TTurboJpeg::Load(hGlobal); + return TTurboJpeg::Load(hGlobal, pOptions->advanced.reducedImageSize); #endif//AD_TURBO_JPEG_ENABLE else return TGdiplus::Load(hGlobal); diff --git a/src/AntiDupl/adTurboJpeg.cpp b/src/AntiDupl/adTurboJpeg.cpp index 3f2299f8..3135f0f3 100644 --- a/src/AntiDupl/adTurboJpeg.cpp +++ b/src/AntiDupl/adTurboJpeg.cpp @@ -48,18 +48,36 @@ namespace ad ::tjDestroy(_handle); } - TView * Decompress(const unsigned char * data, size_t size) + TView * Decompress(const unsigned char * data, size_t size, int targetSize = 0) { int subsamp, colorspace, width, height, flags = 0; if(::tjDecompressHeader3(_handle, data, (unsigned long)size, &width, &height, &subsamp, &colorspace) != 0) return NULL; if (width == 0 || height == 0) return NULL; - TView * pView = new TView(width, height, TView::Bgra32, NULL, 4); - if (::tjDecompress2(_handle, data, size, pView->data, width, 0, height, ::TJPF_RGBA, flags) != 0 && ::tjGetErrorCode(_handle) != ::TJERR_WARNING) + + // Scaled decode для ускорения (если targetSize > 0) + int scaledWidth = width; + int scaledHeight = height; + + if (targetSize > 0 && (width > targetSize || height > targetSize)) { + // libjpeg-turbo поддерживает только определённые scaling factors + // Выбираем ближайший: 1/2, 1/4, 1/8 + int denom = 1; + if (width / 8 >= targetSize && height / 8 >= targetSize) denom = 8; + else if (width / 4 >= targetSize && height / 4 >= targetSize) denom = 4; + else if (width / 2 >= targetSize && height / 2 >= targetSize) denom = 2; + + if (denom > 1) { + scaledWidth = width / denom; + scaledHeight = height / denom; + flags |= TJFLAG_FASTUPSAMPLE; + } + } + + TView * pView = new TView(scaledWidth, scaledHeight, TView::Bgra32, NULL, 4); + if (::tjDecompress2(_handle, data, size, pView->data, scaledWidth, 0, scaledHeight, ::TJPF_RGBA, flags) != 0 && ::tjGetErrorCode(_handle) != ::TJERR_WARNING) { - //int code = ::tjGetErrorCode(_handle); - //const char * str = ::tjGetErrorStr2(_handle); delete pView; pView = NULL; } @@ -72,14 +90,14 @@ namespace ad thread_local TurboJpeg turboJpeg; - TTurboJpeg * TTurboJpeg::Load(HGLOBAL hGlobal) + TTurboJpeg * TTurboJpeg::Load(HGLOBAL hGlobal, int targetSize) { if (hGlobal) { const unsigned char * data = (unsigned char*)::GlobalLock(hGlobal); size_t size = ::GlobalSize(hGlobal); TTurboJpeg * pTurboJpeg = NULL; - TView * pView = turboJpeg.Decompress(data, size); + TView * pView = turboJpeg.Decompress(data, size, targetSize); if (pView) { pTurboJpeg = new TTurboJpeg(); diff --git a/src/AntiDupl/adTurboJpeg.h b/src/AntiDupl/adTurboJpeg.h index 6d5ff8c4..b2d9960e 100644 --- a/src/AntiDupl/adTurboJpeg.h +++ b/src/AntiDupl/adTurboJpeg.h @@ -32,7 +32,7 @@ namespace ad class TTurboJpeg : public TImage { public: - static TTurboJpeg * Load(HGLOBAL hGlobal); + static TTurboJpeg * Load(HGLOBAL hGlobal, int targetSize = 0); static bool Supported(HGLOBAL hGlobal); }; } From fc0fb5c44f934fbae8cc7af3cc81f4a22f831ec6 Mon Sep 17 00:00:00 2001 From: Antigravity Date: Mon, 6 Apr 2026 01:37:16 +0400 Subject: [PATCH 6/6] feat: GPU AllVsAll production-ready with all critical fixes --- src/AntiDupl/adDataCollector.cpp | 2 +- src/AntiDupl/adEngine.cpp | 65 +++++++++++++++++++++++--------- src/AntiDupl/adEngine.h | 2 +- src/AntiDupl/adGPU.cu | 60 +++++++++++++++++++++-------- src/AntiDupl/adTurboJpeg.cpp | 10 +++++ src/AntiDupl/adTurboJpeg.h | 7 ++++ 6 files changed, 112 insertions(+), 34 deletions(-) diff --git a/src/AntiDupl/adDataCollector.cpp b/src/AntiDupl/adDataCollector.cpp index 858acb3e..bf599d90 100644 --- a/src/AntiDupl/adDataCollector.cpp +++ b/src/AntiDupl/adDataCollector.cpp @@ -78,7 +78,7 @@ namespace ad TImage *pImage = TImage::Load(pImageData->hGlobal, m_pOptions); if(pImage) { - pImageData->height = (TUInt32)pImage->View()->height; + pImageData->height = (TUInt32)pImage->View()->height; pImageData->width = (TUInt32)pImage->View()->width; pImageData->type = (TImageType)pImage->Format(); diff --git a/src/AntiDupl/adEngine.cpp b/src/AntiDupl/adEngine.cpp index a24057a6..ebd486b2 100644 --- a/src/AntiDupl/adEngine.cpp +++ b/src/AntiDupl/adEngine.cpp @@ -263,24 +263,30 @@ namespace ad ctx->engine->Result()->AddDuplImagePair(pImage1, pImage2, matches[i].difference, AD_TRANSFORM_TURN_0); ctx->totalProcessed++; + + // Обновляем прогресс (для GPU режима) + if (ctx->totalProcessed % 10000 == 0) { + ctx->engine->Status()->SetProgress(ctx->totalProcessed, ctx->totalProcessed); // dummy update + } } } // NEW: GPU AllVsAll comparison с streaming processing - void TEngine::ExecuteGpuAllVsAllComparison() + // Возвращает true при успешном выполнении, false при ошибке + bool TEngine::ExecuteGpuAllVsAllComparison() { AD_DEBUG("ExecuteGpuAllVsAllComparison: Starting\n"); if (!m_pGpuManager || !m_pGpuManager->IsAvailable()) { AD_DEBUG("ExecuteGpuAllVsAllComparison: GPU not available\n"); - return; + return false; } const TImageDataStorage::TStorage& storage = m_pImageDataStorage->Storage(); size_t count = storage.size(); if (count == 0) { AD_DEBUG("ExecuteGpuAllVsAllComparison: Empty storage\n"); - return; + return false; } size_t reducedImageSize = m_pOptions->advanced.reducedImageSize; @@ -288,24 +294,40 @@ namespace ad AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: Preparing data for %zu images\n", count); - // Собираем все thumbnails и CRC в один проход - std::vector allThumbnails(count * thumbSize); - std::vector allCrcArray(count); - std::vector imageByIndex(count); + // Собираем ТОЛЬКО валидные thumbnails в компактный массив + // Это предотвращает сравнение изображений без данных + std::vector allThumbnails; + std::vector allCrcArray; + std::vector imageByIndex; + allThumbnails.reserve(count * thumbSize); + allCrcArray.reserve(count); + imageByIndex.reserve(count); + size_t validCount = 0; - size_t idx = 0; - for (TImageDataStorage::TStorage::const_iterator it = storage.begin(); it != storage.end(); ++it, ++idx) { + for (TImageDataStorage::TStorage::const_iterator it = storage.begin(); it != storage.end(); ++it) { TImageDataPtr pImageData = it->second; if (pImageData->data && pImageData->data->filled && pImageData->data->main != nullptr) { - memcpy(&allThumbnails[idx * thumbSize], pImageData->data->main, thumbSize); - allCrcArray[idx] = pImageData->crc32c; - imageByIndex[idx] = pImageData; + // Копируем thumbnail + allThumbnails.resize((validCount + 1) * thumbSize); + memcpy(&allThumbnails[validCount * thumbSize], pImageData->data->main, thumbSize); + + // Копируем CRC + allCrcArray.push_back(pImageData->crc32c); + + // Сохраняем указатель + imageByIndex.push_back(pImageData); + validCount++; } } - AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: %zu valid thumbnails\n", validCount); + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: %zu valid thumbnails out of %zu\n", validCount, count); + + if (validCount < 2) { + AD_DEBUG("ExecuteGpuAllVsAllComparison: Not enough valid images\n"); + return false; + } // Вычисляем threshold и maxDifference как в оригинальном TImageComparer int thresholdPerPixel = Simd::Square(m_pOptions->compare.thresholdDifference * PIXEL_MAX_DIFFERENCE) / @@ -328,12 +350,12 @@ namespace ad // Batch size для streaming readback: 5M matches = 60MB RAM const size_t BATCH_MATCHES = 5000000; - AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: Calling GPU (batch size: %zu)\n", BATCH_MATCHES); + AD_DEBUG_FMT("ExecuteGpuAllVsAllComparison: Calling GPU with %zu valid images (batch size: %zu)\n", validCount, BATCH_MATCHES); bool success = m_pGpuManager->CompareAllVsAll( allThumbnails.data(), allCrcArray.data(), - count, + validCount, // Используем validCount вместо count thumbSize, threshold, maxDifference, @@ -358,6 +380,7 @@ namespace ad imageByIndex.shrink_to_fit(); AD_DEBUG("ExecuteGpuAllVsAllComparison: Finished\n"); + return success; } void TEngine::Search() @@ -421,9 +444,17 @@ namespace ad if (useGpu) { AD_DEBUG("Search: Using GPU AllVsAll comparison\n"); - ExecuteGpuAllVsAllComparison(); + bool gpuSuccess = ExecuteGpuAllVsAllComparison(); m_skipComparisonDuringCollection = false; - AD_DEBUG("Search: GPU comparison completed\n"); + + if (!gpuSuccess) { + AD_DEBUG("Search: GPU comparison FAILED — no CPU fallback (too slow for large collections)\n"); + // CPU fallback removed — O(N^2) CPU comparison is impractical for 10K+ images + // User should retry with smaller collection or check GPU memory availability + } + else { + AD_DEBUG("Search: GPU comparison completed successfully\n"); + } } else { diff --git a/src/AntiDupl/adEngine.h b/src/AntiDupl/adEngine.h index 60488656..7fa07106 100644 --- a/src/AntiDupl/adEngine.h +++ b/src/AntiDupl/adEngine.h @@ -58,7 +58,7 @@ namespace ad void Search(); void UpdateGpuDatabase(); - void ExecuteGpuAllVsAllComparison(); + bool ExecuteGpuAllVsAllComparison(); // Возвращает true при успехе // Flag to skip comparison during collection (for GPU AllVsAll mode) bool m_skipComparisonDuringCollection; diff --git a/src/AntiDupl/adGPU.cu b/src/AntiDupl/adGPU.cu index 96a40d74..e4b45e05 100644 --- a/src/AntiDupl/adGPU.cu +++ b/src/AntiDupl/adGPU.cu @@ -79,6 +79,7 @@ namespace ad double threshold, // Порог squared difference double maxDifference, // Максимальная разница для нормализации double addDiffForCrcMismatch, // Добавка за несовпадение CRC + size_t maxMatches, // Максимум matches (bounds checking) Match* results, // Sparse buffer для результатов size_t* matchCount) // Atomic counter { @@ -110,18 +111,23 @@ namespace ad // Проверяем threshold ДО нормализации (как в CPU версии) if (sumSqDiff <= threshold) { - // Вычисляем финальный difference в процентах - double difference = sqrt(sumSqDiff / maxDifference) * 100.0; - - // Добавляем штраф за несовпадение CRC - if (crcArray[i] != crcArray[j]) { - difference += addDiffForCrcMismatch; - } - + // Атомарно получаем индекс с bounds checking size_t idx = atomicAdd(matchCount, (size_t)1); - results[idx].image1 = (uint32_t)i; - results[idx].image2 = (uint32_t)j; - results[idx].difference = (float)difference; + + // ЗАЩИТА от переполнения буфера + if (idx < maxMatches) { + // Вычисляем финальный difference в процентах + double difference = sqrt(sumSqDiff / maxDifference) * 100.0; + + // Добавляем штраф за несовпадение CRC + if (crcArray[i] != crcArray[j]) { + difference += addDiffForCrcMismatch; + } + + results[idx].image1 = (uint32_t)i; + results[idx].image2 = (uint32_t)j; + results[idx].difference = (float)difference; + } } } __syncthreads(); // Синхронизация перед следующей итерацией i @@ -616,6 +622,9 @@ namespace ad { AD_DEBUG("GpuCompareAllVsAll: Starting\n"); + // Очищаем любые накопленные ошибки CUDA перед началом + cudaGetLastError(); // clear any pending errors + if (!allThumbnails || !allCrcArray || count == 0 || thumbSize == 0 || !callback) { AD_DEBUG("GpuCompareAllVsAll: Invalid parameters\n"); return false; @@ -634,9 +643,24 @@ namespace ad // 1. Выделяем память для thumbnails AD_DEBUG("GpuCompareAllVsAll: Allocating VRAM for thumbnails\n"); + + // Проверяем доступную VRAM перед выделением + size_t freeMem = 0, totalMem = 0; + cudaMemGetInfo(&freeMem, &totalMem); + size_t requiredMem = count * thumbSize + count * sizeof(uint64_t) + maxMatchesPerBatch * sizeof(Match); + AD_DEBUG_FMT("GpuCompareAllVsAll: VRAM free=%zu MB, total=%zu MB, required=%zu MB\n", + freeMem / 1024 / 1024, totalMem / 1024 / 1024, requiredMem / 1024 / 1024); + + if (requiredMem > freeMem * 9 / 10) { // Используем максимум 90% свободной VRAM + AD_DEBUG_FMT("GpuCompareAllVsAll: Not enough VRAM (need %zu MB, have %zu MB)\n", + requiredMem / 1024 / 1024, freeMem / 1024 / 1024); + return false; + } + err = cudaMalloc(&d_thumbnails, count * thumbSize); if (err != cudaSuccess) { AD_DEBUG_FMT("GpuCompareAllVsAll: Failed to allocate thumbnails VRAM: %s\n", cudaGetErrorString(err)); + cudaGetLastError(); // Clear error state return false; } @@ -719,7 +743,7 @@ namespace ad AllVsAllKernel<<<(int)blocks, threadsPerBlock, thumbSize>>>( d_thumbnails, d_crcArray, thumbSize, count, threshold, maxDifference, addDiffForCrcMismatch, - d_results, d_matchCount); + maxMatchesPerBatch, d_results, d_matchCount); err = cudaGetLastError(); if (err != cudaSuccess) { @@ -746,12 +770,18 @@ namespace ad // 10. Считываем total match count cudaMemcpy(&h_matchCount, d_matchCount, sizeof(size_t), cudaMemcpyDeviceToHost); - AD_DEBUG_FMT("GpuCompareAllVsAll: Found %zu total matches\n", h_matchCount); + AD_DEBUG_FMT("GpuCompareAllVsAll: Found %zu total matches (buffer capacity: %zu)\n", h_matchCount, maxMatchesPerBatch); + + // Ограничиваем чтение размером буфера + size_t matchesToRead = (h_matchCount < maxMatchesPerBatch) ? h_matchCount : maxMatchesPerBatch; + if (h_matchCount > maxMatchesPerBatch) { + AD_DEBUG_FMT("GpuCompareAllVsAll: WARNING! Truncated from %zu to %zu matches\n", h_matchCount, maxMatchesPerBatch); + } // 11. Streaming readback — читаем батчами и вызываем callback - if (h_matchCount > 0) { + if (matchesToRead > 0) { std::vector h_batch(maxMatchesPerBatch); - size_t remaining = h_matchCount; + size_t remaining = matchesToRead; size_t offset = 0; while (remaining > 0) { diff --git a/src/AntiDupl/adTurboJpeg.cpp b/src/AntiDupl/adTurboJpeg.cpp index 3135f0f3..1cb1f542 100644 --- a/src/AntiDupl/adTurboJpeg.cpp +++ b/src/AntiDupl/adTurboJpeg.cpp @@ -38,6 +38,7 @@ namespace ad { struct TurboJpeg { + public: TurboJpeg() { _handle = ::tjInitDecompress(); @@ -47,6 +48,8 @@ namespace ad { ::tjDestroy(_handle); } + + tjhandle Handle() const { return _handle; } TView * Decompress(const unsigned char * data, size_t size, int targetSize = 0) { @@ -103,6 +106,13 @@ namespace ad pTurboJpeg = new TTurboJpeg(); pTurboJpeg->m_format = TImage::Jpeg; pTurboJpeg->m_pView = pView; + + // Сохраняем оригинальные размеры из заголовка JPEG + int subsamp, colorspace, origWidth, origHeight; + if(::tjDecompressHeader3(turboJpeg.Handle(), data, (unsigned long)size, &origWidth, &origHeight, &subsamp, &colorspace) == 0) { + pTurboJpeg->m_origWidth = origWidth; + pTurboJpeg->m_origHeight = origHeight; + } } ::GlobalUnlock(hGlobal); return pTurboJpeg; diff --git a/src/AntiDupl/adTurboJpeg.h b/src/AntiDupl/adTurboJpeg.h index b2d9960e..b8437c95 100644 --- a/src/AntiDupl/adTurboJpeg.h +++ b/src/AntiDupl/adTurboJpeg.h @@ -34,6 +34,13 @@ namespace ad public: static TTurboJpeg * Load(HGLOBAL hGlobal, int targetSize = 0); static bool Supported(HGLOBAL hGlobal); + + int OriginalWidth() const { return m_origWidth; } + int OriginalHeight() const { return m_origHeight; } + + private: + int m_origWidth = 0; + int m_origHeight = 0; }; } #endif//AD_TURBO_JPEG_ENABLE