From ac569da25fcc17a6e141fc469cb101ad6c8281e9 Mon Sep 17 00:00:00 2001 From: David Sanftenberg Date: Tue, 7 Oct 2025 10:10:35 +0000 Subject: [PATCH] Fix validation tolerance: use relative error instead of absolute The validation was using absolute error tolerance (1e-8) which fails for large matrix multiplication results (magnitude ~1e4). This caused false negatives where COSMA computed correct results but failed validation. Changes: - Switch from absolute error to relative error for validation - Use 1e-5 tolerance for float32 (appropriate for single precision) - Use 1e-8 tolerance for float64 (appropriate for double precision) - Handle small values near zero with absolute error fallback This fixes issue #153 where K-split strategy was incorrectly reported as producing 93.6% errors when actual relative errors were < 1e-6. Tested with: - 32x896x896 float32: now passes (was 93.8% false errors) - 32x10000x896 float32: now passes (was 93.6% false errors) - 32x32x32 float64: still passes (regression test) --- utils/cosma_utils.hpp | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/utils/cosma_utils.hpp b/utils/cosma_utils.hpp index 8b0d26dd..3eb73a96 100644 --- a/utils/cosma_utils.hpp +++ b/utils/cosma_utils.hpp @@ -333,23 +333,38 @@ bool test_cosma(Strategy s, // Now Check result isOK = globCcheck.size() == globC.size(); for (int i = 0; i < globC.size(); ++i) { - isOK = isOK && (std::abs(globC[i] - globCcheck[i]) < epsilon); + // Use relative error for large values, absolute error for small values + double abs_error = std::abs(globC[i] - globCcheck[i]); + double scale = std::max(std::abs(globC[i]), std::abs(globCcheck[i])); + double rel_error = (scale > 1e-10) ? abs_error / scale : abs_error; + // For float32 (4-byte Scalar), use a relative error tolerance of 1e-5 + // For float64, reuse epsilon (1e-8) as the relative error tolerance + double tolerance = (sizeof(Scalar) == 4) ? 
1e-5 : epsilon; + isOK = isOK && (rel_error < tolerance); } if (!isOK) { std::cout << "Result is NOT OK" << std::endl; + int error_count = 0; + const int MAX_ERRORS_TO_PRINT = 20; for (int i = 0; i < m * n; i++) { if (globCcheck[i] != globC[i]) { - int x = i % m; - int y = i / m; - int locidx, rank; - std::tie(locidx, rank) = C.local_coordinates(x, y); - std::cout << "global(" << x << ", " << y - << ") = (loc = " << locidx << ", rank = " << rank - << ") = " << globC.at(i) << " and should be " - << globCcheck.at(i) << std::endl; + error_count++; + if (error_count <= MAX_ERRORS_TO_PRINT) { + int x = i % m; + int y = i / m; + int locidx, rank; + std::tie(locidx, rank) = C.local_coordinates(x, y); + std::cout << "global(" << x << ", " << y + << ") = (loc = " << locidx << ", rank = " << rank + << ") = " << globC.at(i) << " and should be " + << globCcheck.at(i) << " (diff = " + << std::abs(globC.at(i) - globCcheck.at(i)) << ")" << std::endl; + } } } + std::cout << "Total errors: " << error_count << " out of " << (m * n) << " elements (" + << (100.0 * error_count / (m * n)) << "%)" << std::endl; } else { std::cout <<"Result is OK"< 0 || isOK; }